def crawler(n):
    """ this is the worker routine, the heart of this solution

    the job is performed by the following steps:
    1. take an url from the queue
    2. make a request to this url
    3. mark it as visited
    4. check whether the response is ok to be parsed
    5. if the url corresponds to a product page, then extract data from it
    6. extract more urls from the current page and add them to the queue

    this is repeated continuously until the queue is empty
    """
    while True:
        logger.info(
            'links: [%d] pending, [%d] discovered, [%d] visited',
            queue.qsize(), len(discovered), len(visited)
        )
        url = queue.get()
        logger.info('crawler [%d] took [%s] from queue', n, url)
        response = requests.get(url, verify=False)  # no SSL validation
        visited.append(url)
        if response.status_code == requests.codes.ok:
            soup = Soup(response.content)
            if is_valid_product_page(url, response):
                data = extract_product_data(url, soup)
                csv.write(CSV_FORMAT % data)
            discover_links(url, soup)
        else:
            logger.warning('response not ok for [%s]' % url)
        queue.task_done()
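The crawler relies on module-level state and helpers defined elsewhere in the original program (queue, visited, discovered, logger, Soup, is_valid_product_page, extract_product_data, and a file handle named csv). A minimal sketch of the wiring it assumes could look like the following; the seed URL, pool size, and output file name are assumptions, not part of the original:

# hypothetical scaffolding for the crawler above
import threading
from queue import Queue

queue = Queue()
visited = []        # URLs already fetched
discovered = set()  # URLs seen so far, so links are queued only once
csv = open('products.csv', 'w')  # the example writes rows to a plain file named csv

queue.put('https://example.com/')  # seed URL (assumed)
for n in range(4):                 # pool size is an arbitrary choice
    threading.Thread(target=crawler, args=(n,), daemon=True).start()
queue.join()  # returns once task_done() has been called for every URL
csv.close()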
Example #2
def spawner(queue):
    """Take items off the queue and hand each one to an http_getter greenlet."""
    while True:
        try:
            item = queue.get()
        except hub.LoopExit:
            # the queue is empty and nothing can ever refill it, so stop
            logger.error("exit getter spawner...")
            return
        queue.task_done()
        gs.append(gevent.spawn(http_getter, item))
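In gevent, queue.get() raises gevent.hub.LoopExit when it would block forever, i.e. the queue is empty and no other greenlet can ever put to it; the spawner uses that as its exit signal. A hedged sketch of the surrounding setup (the http_getter stub, logger, and URLs here are illustrative, not from the original):

import logging
import gevent
from gevent import hub  # spawner catches hub.LoopExit
from gevent.queue import JoinableQueue

logging.basicConfig()
logger = logging.getLogger(__name__)  # spawner logs through this

gs = []  # greenlets spawned by spawner, one per queue item

def http_getter(url):
    # stand-in for the real worker defined elsewhere in the original
    print('GET %s' % url)

queue = JoinableQueue()
for i in range(3):
    queue.put('http://example.com/%d' % i)  # illustrative URLs

g = gevent.spawn(spawner, queue)
queue.join()        # every item has been handed to an http_getter
gevent.joinall(gs)  # wait for the fetches themselves to finish
g.kill()            # spawner is now blocked in queue.get(); stop it cleanly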
Example #4
    def worker(self, thread_id, queue):  # pylint: disable=unused-argument
        while True:
            try:
                spot_instance_request = queue.get()
                self.process_spot_instance_request(spot_instance_request)
            except Exception:
                # keep the thread alive: log the failure and move on
                self._logger.exception(
                    'Exception while processing spot instance request')
            finally:
                queue.task_done()
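worker is a method, so it needs an owning object that supplies process_spot_instance_request and a _logger. A self-contained, hypothetical sketch of that wiring (the Processor class and request payload are assumptions, not from the original):

import logging
import threading
from queue import Queue

class Processor(object):
    """Hypothetical owner for a worker() method like the one above."""

    def __init__(self):
        self._logger = logging.getLogger('spot')

    def process_spot_instance_request(self, request):
        self._logger.info('processing %s', request)

    def worker(self, thread_id, queue):  # same shape as the example
        while True:
            try:
                self.process_spot_instance_request(queue.get())
            except Exception:
                self._logger.exception('error processing request')
            finally:
                queue.task_done()

logging.basicConfig(level=logging.INFO)
queue = Queue()
p = Processor()
for thread_id in range(2):  # pool size is an arbitrary choice
    threading.Thread(target=p.worker, args=(thread_id, queue),
                     daemon=True).start()
queue.put({'request_id': 'sir-123'})  # illustrative payload
queue.join()  # returns once every request has been marked task_done()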
Example #5
def converter(queue):
    LOGGER.debug('converter started')
    while True:
        data = queue.get()
        LOGGER.debug('new data for conversion')
        if data == StopIteration:
            # the StopIteration class itself is used as a shutdown sentinel
            queue.task_done()
            break
        LOGGER.debug('flv file: %s', path.abspath(data['source_file'].name))
        LOGGER.debug('target file: %s', data['target_file'])
        ffmpeg_args = [
            'ffmpeg', '-i',
            path.abspath(data['source_file'].name), '-vn', '-acodec',
            data['acodec'], '-aq', data['quality'], '-y', data['target_file']
        ]

        p = subprocess.Popen(ffmpeg_args,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        # put both pipe ends into non-blocking mode so reads cooperate
        # with gevent instead of blocking the whole event loop
        fcntl.fcntl(p.stdin, fcntl.F_SETFL, O_NONBLOCK)
        fcntl.fcntl(p.stdout, fcntl.F_SETFL, O_NONBLOCK)
        p.stdin.close()

        output = b''  # ffmpeg's stdout is bytes

        while True:
            try:
                chunk = p.stdout.read(4096)
                if not chunk:
                    break
                output += chunk
            except IOError as ex:
                # EAGAIN means no output is ready yet; anything else is fatal
                if ex.errno != errno.EAGAIN:
                    raise
            socket.wait_read(p.stdout.fileno())  # gevent: wait until readable

        p.stdout.close()

        data['source_file'].close()
        LOGGER.debug('conversion done')
        queue.task_done()
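The converter expects each queue item to be a dict with an open source_file, a target_file path, and acodec/quality settings for ffmpeg, and treats the StopIteration class as its shutdown sentinel; it also assumes the imports from the original module (subprocess, fcntl, errno, os.path, O_NONBLOCK, and gevent's socket). Feeding it might look like this, with illustrative file names and codec values and ffmpeg on PATH:

import gevent
from gevent.queue import JoinableQueue

queue = JoinableQueue()
g = gevent.spawn(converter, queue)

queue.put({
    'source_file': open('clip.flv', 'rb'),  # illustrative input
    'target_file': 'clip.mp3',
    'acodec': 'libmp3lame',                 # any ffmpeg audio codec
    'quality': '2',                         # passed to ffmpeg -aq
})
queue.put(StopIteration)  # sentinel: makes converter leave its loop
queue.join()              # both items marked task_done()
g.join()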