Example #1 (score: 0)
File: base.py — Project: sn-donbenjamin/grab
    def run(self):
        """
        Main method. All work is done here.

        Runs the spider's main event loop:

        * pull tasks from the task queue (or from ``pending_tasks``)
          and hand them to the network transport, routing through the
          cache pipeline when one is configured;
        * drain finished network results into
          ``self.network_result_queue`` for the parser pipeline;
        * drain parser results and dispatch them to their handlers;
        * shut everything down cleanly when the spider goes idle or is
          interrupted with ^C.
        """
        # Only ``Queue`` is used below; the previously imported but
        # unused ``Process`` and ``Event`` names were dropped.
        if self.mp_mode:
            from multiprocessing import Queue
        else:
            # Thread-backed drop-in replacement when multiprocessing
            # mode is disabled.
            from multiprocessing.dummy import Queue

        self.timer.start('total')

        # Pick the network transport implementation.
        # NOTE(review): an unknown ``transport_name`` silently leaves
        # ``self.transport`` unset -- presumably validated upstream;
        # confirm.
        if self.transport_name == 'multicurl':
            self.transport = MulticurlTransport(self, self.thread_number)
        elif self.transport_name == 'threaded':
            self.transport = ThreadedTransport(self, self.thread_number)

        if self.http_api_port:
            http_api_proc = self.start_api_thread()
        else:
            http_api_proc = None

        self.parser_result_queue = Queue()
        self.parser_pipeline = ParserPipeline(
            bot=self,
            mp_mode=self.mp_mode,
            pool_size=self.parser_pool_size,
            shutdown_event=self.shutdown_event,
            network_result_queue=self.network_result_queue,
            parser_result_queue=self.parser_result_queue,
            requests_per_process=self.parser_requests_per_process,
        )
        # Backpressure limit: never hold more than twice the number of
        # network threads (minimum 10) of unparsed network results.
        network_result_queue_limit = max(10, self.thread_number * 2)

        try:
            # Run custom things defined by this specific spider
            # By default it does nothing
            self.prepare()

            # Setup task queue if it has not been configured yet
            if self.task_queue is None:
                self.setup_queue()

            # Initiate task generator. Only in main process!
            with self.timer.log_time('task_generator'):
                self.start_task_generators()

            # Tasks returned by the cache pipeline while the transport
            # was saturated; retried with priority on later iterations.
            pending_tasks = deque()
            # Pre-bind ``task`` so the idle-sleep check further down
            # cannot raise NameError on an iteration where no task was
            # loaded (e.g. the transport had no free threads on the
            # very first pass through the loop).
            task = None
            # Work in infinite cycle until `self.work_allowed` flag is True
            while self.work_allowed:
                # Load new task only if:
                # 1) network transport has free threads
                # 2) network result queue is not full
                # 3) cache is disabled OR cache has free resources
                if (self.transport.get_free_threads_number()
                        and (self.network_result_queue.qsize()
                             < network_result_queue_limit)
                        and (self.cache_pipeline is None
                             or self.cache_pipeline.has_free_resources())):
                    if pending_tasks:
                        task = pending_tasks.popleft()
                    else:
                        task = self.get_task_from_queue()
                    if task is None:
                        # If received task is None then
                        # check if spider is ready to be shut down
                        if not pending_tasks and self.is_ready_to_shutdown():
                            # `is_ready_to_shutdown` evaluates state
                            # that is mutated from other threads, so a
                            # single positive answer may be a race
                            # artifact.  Re-check it several times with
                            # tiny sleeps before actually shutting
                            # down.  To reproduce the underlying race,
                            # disable this re-check loop and run:
                            # while ./runtest.py -t test.spider_data; do echo "ok"; done;
                            # and wait a few minutes.
                            really_ready = True
                            for x in range(10):
                                if not self.is_ready_to_shutdown():
                                    really_ready = False
                                    break
                                time.sleep(0.001)
                            if really_ready:
                                self.shutdown_event.set()
                                self.stop()
                                break  # Break from `while self.work_allowed` cycle
                    elif task is True:
                        # ``True`` is the task queue's sentinel for
                        # "only delayed tasks left": sleep briefly if
                        # the network is idle to avoid busy-waiting.
                        if not self.transport.get_active_threads_number():
                            time.sleep(0.01)
                    else:
                        logger_verbose.debug('Got new task from task queue: %s'
                                             % task)
                        task.network_try_count += 1
                        is_valid, reason = self.check_task_limits(task)
                        if is_valid:
                            task_grab = self.setup_grab_for_task(task)
                            if self.cache_pipeline:
                                # Ask the cache first; on a miss the
                                # cache pipeline sends the task back
                                # through its result queue (see the
                                # drain loop below).
                                self.cache_pipeline.input_queue.put(
                                    ('load', (task, task_grab)),
                                )
                            else:
                                self.submit_task_to_transport(task, task_grab)
                        else:
                            self.log_rejected_task(task, reason)
                            handler = task.get_fallback_handler(self)
                            if handler:
                                handler(task)

                with self.timer.log_time('network_transport'):
                    logger_verbose.debug('Asking transport layer to do '
                                         'something')
                    self.transport.process_handlers()

                logger_verbose.debug('Processing network results (if any).')

                # Collect completed network results
                # Each result could be valid or failed
                # Result is dict {ok, grab, grab_config_backup, task, emsg}
                results = [(x, False) for x in
                           self.transport.iterate_results()]
                if self.cache_pipeline:
                    # Drain the cache pipeline's result queue without
                    # blocking.
                    while True:
                        try:
                            action, result = self.cache_pipeline\
                                                 .result_queue.get(False)
                        except queue.Empty:
                            break
                        else:
                            assert action in ('network_result', 'task')
                            if action == 'network_result':
                                # Cache hit: treat it as a completed
                                # network result.
                                results.append((result, True))
                            elif action == 'task':
                                # Cache miss: the task still needs a
                                # real network request.
                                task = result
                                task_grab = self.setup_grab_for_task(task)
                                if (self.transport.get_free_threads_number()
                                        and (self.network_result_queue.qsize()
                                             < network_result_queue_limit)):
                                    self.submit_task_to_transport(task, task_grab)
                                else:
                                    pending_tasks.append(task)

                # Take sleep to avoid millions of iterations per second.
                # 1) If no results from network transport
                # 2) If task queue is empty (or if there are only delayed tasks)
                # 3) If no network activity
                # 4) If parser result queue is empty
                # NOTE(review): ``bool(task) == True`` also matches a
                # real truthy Task object, not only the ``True``
                # sentinel -- presumably ``task is True`` was meant;
                # kept as-is to preserve behaviour.  TODO confirm.
                if (not results
                    and (task is None or bool(task) == True)
                    and not self.transport.get_active_threads_number()
                    and not self.parser_result_queue.qsize()
                    and (self.cache_pipeline is None
                         or (self.cache_pipeline.input_queue.qsize() == 0
                             and self.cache_pipeline.is_idle()
                             and self.cache_pipeline.result_queue.qsize() == 0))
                    ):
                        time.sleep(0.001)

                for result, from_cache in results:
                    # Persist fresh successful responses into the cache.
                    if self.cache_pipeline and not from_cache:
                        if result['ok']:
                            self.cache_pipeline.input_queue.put(
                                ('save', (result['task'], result['grab']))
                            )
                    self.log_network_result_stats(
                        result, from_cache=from_cache)

                    # A result is valid when the task opted out of
                    # validation ('raw') or the response code passes
                    # the spider's check.
                    is_valid = False
                    if result['task'].get('raw'):
                        is_valid = True
                    elif result['ok']:
                        res_code = result['grab'].response.code
                        if self.is_valid_network_response_code(res_code, result['task']):
                            is_valid = True

                    if is_valid:
                        self.network_result_queue.put(result)
                    else:
                        self.log_failed_network_result(result)
                        # Try to do network request one more time
                        # TODO:
                        # Implement valid_try_limit
                        # Use it if request failed not because of network error
                        # But because of content integrity check
                        if self.network_try_limit > 0:
                            result['task'].refresh_cache = True
                            result['task'].setup_grab_config(
                                result['grab_config_backup'])
                            self.add_task(result['task'])
                    if from_cache:
                        self.stat.inc('spider:task-%s-cache' % result['task'].name)
                    self.stat.inc('spider:request')

                # Dispatch everything the parser processes produced so
                # far, without blocking.
                while True:
                    try:
                        p_res, p_task = self.parser_result_queue.get(block=False)
                    except queue.Empty:
                        break
                    else:
                        self.stat.inc('spider:parser-result')
                        self.process_handler_result(p_res, p_task)

                if not self.shutdown_event.is_set():
                    self.parser_pipeline.check_pool_health()

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            logger.info('\nGot ^C signal in process %d. Stopping.'
                        % os.getpid())
            self.interrupted = True
            raise
        finally:
            # This code is executed when the main cycle is broken
            self.timer.stop('total')
            self.stat.print_progress_line()
            self.shutdown()

            # Stop HTTP API process
            if http_api_proc:
                http_api_proc.server.shutdown()
                http_api_proc.join()

            if self.task_queue:
                self.task_queue.clear()

            # Stop parser processes
            self.shutdown_event.set()
            self.parser_pipeline.shutdown()
            logger.debug('Main process [pid=%s]: work done' % os.getpid())
Example #2 (score: 0)
    def run(self):
        """
        Main method. All work is done here.

        Variant of the main loop with an inline (synchronous) response
        cache: pulls tasks from the task queue, serves them from the
        cache when possible, otherwise submits them to the multicurl
        network transport, and pushes valid results into
        ``self.network_result_queue`` for the parser pipeline.
        """
        # The original conditional import of ``Process``, ``Event``
        # and ``Queue`` from ``multiprocessing`` was removed: none of
        # those names is used anywhere in this method.

        self.timer.start('total')
        self.transport = MulticurlTransport(self.thread_number)

        if self.http_api_port:
            http_api_proc = self.start_api_thread()
        else:
            http_api_proc = None

        self.parser_pipeline = ParserPipeline(
            bot=self,
            mp_mode=self.mp_mode,
            pool_size=self.parser_pool_size,
            shutdown_event=self.shutdown_event,
            network_result_queue=self.network_result_queue,
            requests_per_process=self.parser_requests_per_process,
        )
        # Backpressure limit: never hold more than twice the number of
        # network threads (minimum 10) of unparsed network results.
        network_result_queue_limit = max(10, self.thread_number * 2)

        try:
            # Run custom things defined by this specific spider
            # By default it does nothing
            self.prepare()

            # Setup task queue if it has not been configured yet
            if self.task_queue is None:
                self.setup_queue()

            # Initiate task generator. Only in main process!
            with self.timer.log_time('task_generator'):
                self.start_task_generator()

            # Pre-bind ``task`` so the idle-sleep check further down
            # cannot raise NameError on an iteration where no task was
            # loaded (e.g. the transport had no free threads on the
            # very first pass through the loop).
            task = None
            while self.work_allowed:
                with self.timer.log_time('task_generator'):
                    if self.task_generator_enabled:
                        self.process_task_generator()

                result_from_cache = None
                # Load new task only if self.network_result_queue is not full
                if (self.transport.get_free_threads_number()
                        and (self.network_result_queue.qsize() <
                             network_result_queue_limit)):
                    logger_verbose.debug(
                        'Transport and parser have free resources. '
                        'Trying to load new task from task queue.')

                    task = self.get_task_from_queue()

                    # If no task received from task queue
                    # try to query task generator
                    # and then check if spider could be shuted down
                    if task is None:
                        if not self.transport.get_active_threads_number():
                            self.process_task_generator()

                    if task is None:
                        # If no task received from task queue
                        # check if spider could be shut down
                        if self.is_ready_to_shutdown():
                            self.shutdown_event.set()
                            self.stop()
                            break  # Break `while self.work_allowed` cycle
                    elif task is True:
                        # ``True`` is the task queue's sentinel for
                        # "only delayed tasks left": sleep briefly if
                        # the network is idle to avoid busy-waiting.
                        if not self.transport.get_active_threads_number():
                            time.sleep(0.1)
                    else:
                        logger_verbose.debug(
                            'Got new task from task queue: %s' % task)
                        task.network_try_count += 1
                        is_valid, reason = self.check_task_limits(task)
                        if is_valid:
                            grab = self.setup_grab_for_task(task)
                            grab_config_backup = grab.dump_config()

                            # Try to serve the task from the response
                            # cache before touching the network.
                            result_from_cache = None
                            if self.is_task_cacheable(task, grab):
                                result_from_cache = self.load_task_from_cache(
                                    task, grab, grab_config_backup)

                            if result_from_cache:
                                logger_verbose.debug(
                                    'Task data is loaded from the cache. ')
                            else:
                                if self.only_cache:
                                    logger.debug('Skipping network request to '
                                                 '%s' % grab.config['url'])
                                else:
                                    self.process_grab_proxy(task, grab)
                                    self.submit_task_to_transport(
                                        task, grab, grab_config_backup)
                        else:
                            self.log_rejected_task(task, reason)
                            handler = task.get_fallback_handler(self)
                            if handler:
                                handler(task)

                with self.timer.log_time('network_transport'):
                    logger_verbose.debug('Asking transport layer to do '
                                         'something')
                    self.transport.process_handlers()

                logger_verbose.debug('Processing network results (if any).')

                # Collect completed network results
                # Each result could be valid or failed
                # Result is dict {ok, grab, grab_config_backup, task, emsg}
                results = [(x, False)
                           for x in self.transport.iterate_results()]
                if result_from_cache:
                    results.append((result_from_cache, True))

                # Some sleep to avoid thousands of iterations per second.
                # If no results from network transport
                if not results:
                    # If task queue is empty (or if there are only
                    # delayed tasks)
                    # NOTE(review): ``bool(task) == True`` also matches
                    # a real truthy Task object, not only the ``True``
                    # sentinel -- presumably ``task is True`` was
                    # meant; kept as-is to preserve behaviour.
                    if task is None or bool(task) == True:
                        # If no network activity
                        if not self.transport.get_active_threads_number():
                            # If parser result queue is empty
                            if not self.parser_pipeline.has_results():
                                # Just sleep some time, do not kill CPU
                                time.sleep(0.1)

                for result, from_cache in results:
                    # Persist fresh cacheable responses.
                    if not from_cache:
                        if self.is_valid_for_cache(result):
                            with self.timer.log_time('cache'):
                                with self.timer.log_time('cache.write'):
                                    self.cache.save_response(
                                        result['task'].url, result['grab'])
                    self.log_network_result_stats(result,
                                                  from_cache=from_cache)
                    if self.is_valid_network_result(result):
                        # Hand the result over to the parser pipeline.
                        self.network_result_queue.put(result)
                    else:
                        self.log_failed_network_result(result)
                        # Try to do network request one more time
                        if self.network_try_limit > 0:
                            result['task'].refresh_cache = True
                            result['task'].setup_grab_config(
                                result['grab_config_backup'])
                            self.add_task(result['task'])
                    if from_cache:
                        # Use the task attached to the result itself
                        # rather than the loop-local ``task`` variable:
                        # the result is the authoritative source.
                        self.stat.inc('spider:task-%s-cache'
                                      % result['task'].name)
                    self.stat.inc('spider:request')

                # Dispatch everything the parser processes produced so
                # far, without blocking.
                while True:
                    try:
                        p_res, p_task = self.parser_pipeline.get_result()
                    except queue.Empty:
                        break
                    else:
                        self.stat.inc('spider:parser-result')
                        self.process_handler_result(p_res, p_task)

                if not self.shutdown_event.is_set():
                    self.parser_pipeline.check_pool_health()

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            logger.info('\nGot ^C signal in process %d. Stopping.' %
                        os.getpid())
            self.interrupted = True
            raise
        finally:
            # This code is executed when the main cycle is broken
            self.timer.stop('total')
            self.stat.print_progress_line()
            self.shutdown()

            # Stop HTTP API process
            if http_api_proc:
                http_api_proc.server.shutdown()
                http_api_proc.join()

            # Guard against ``task_queue`` never having been set up
            # (e.g. ``prepare()`` raised before ``setup_queue()`` ran);
            # an unguarded ``clear()`` would raise AttributeError here
            # and mask the original exception.
            if self.task_queue:
                self.task_queue.clear()

            # Stop parser processes
            self.shutdown_event.set()
            self.parser_pipeline.shutdown()
            logger.debug('Main process [pid=%s]: work done' % os.getpid())
Example #3 (score: 0)
File: base.py — Project: julia-bikova/grab
    def run(self):
        """
        Main method. All work is done here.

        Synchronous main loop: pulls tasks from the task queue (or the
        task generator), submits them to the multicurl network
        transport and processes completed network results inline via
        ``process_network_result``.
        """

        self.start_timer('total')

        # Network layer: multicurl with ``thread_number`` parallel
        # connections.
        self.transport = MulticurlTransport(self.thread_number)

        try:
            self.setup_default_queue()
            # Spider-specific preparation hook (no-op by default).
            self.prepare()

            self.start_timer('task_generator')
            # In slave mode tasks come from an external source, so the
            # local task generator is not initiated.
            if not self.slave:
                self.init_task_generator()
            self.stop_timer('task_generator')

            while self.work_allowed:
                self.start_timer('task_generator')
                if self.task_generator_enabled:
                    self.process_task_generator()
                self.stop_timer('task_generator')

                free_threads = self.transport.get_free_threads_number()
                if free_threads:
                    logger_verbose.debug(
                        'Transport has free resources (%d). '
                        'Trying to add new task (if exists).' % free_threads)

                    # Try five times to get new task and proces task generator
                    # because slave parser could agressively consume
                    # tasks from task queue
                    for x in six.moves.range(5):
                        task = self.load_new_task()
                        if task is None:
                            # Empty queue: let the generator refill it
                            # while the network is idle, then retry.
                            if not self.transport.active_task_number():
                                self.process_task_generator()
                        elif task is True:
                            # If only delayed tasks in queue
                            break
                        else:
                            # If got some task
                            break

                    if not task:
                        # No task at all: stop once the network is idle
                        # and the task generator cannot produce more.
                        if not self.transport.active_task_number():
                            logger_verbose.debug('Network transport has no '
                                                 'active tasks')
                            if not self.task_generator_enabled:
                                self.stop()
                        else:
                            logger_verbose.debug(
                                'Transport active tasks: %d' %
                                self.transport.active_task_number())
                    elif isinstance(task, NullTask):
                        logger_verbose.debug('Got NullTask')
                        # NullTask may carry an explicit sleep
                        # instruction; honour it only when the network
                        # is idle.
                        if not self.transport.active_task_number():
                            if task.sleep:
                                logger.debug('Got NullTask with sleep '
                                             'instruction. Sleeping for'
                                             ' %.2f seconds' % task.sleep)
                                time.sleep(task.sleep)
                    elif isinstance(task, bool) and (task is True):
                        # ``True`` means only delayed tasks remain.
                        # Take some sleep to not load CPU
                        if not self.transport.active_task_number():
                            time.sleep(0.1)
                    else:
                        logger_verbose.debug(
                            'Got new task from task queue: %s' % task)
                        self.process_task_counters(task)

                        is_valid, reason = self.check_task_limits(task)
                        if not is_valid:
                            logger_verbose.debug('Task %s is rejected due to '
                                                 '%s limit' %
                                                 (task.name, reason))
                            if reason == 'task-try-count':
                                self.add_item('task-count-rejected', task.url)
                            elif reason == 'network-try-count':
                                self.add_item('network-count-rejected',
                                              task.url)
                            else:
                                raise SpiderError('Unknown response from '
                                                  'check_task_limits: %s' %
                                                  reason)
                            # Give the task a chance to handle its own
                            # rejection.
                            handler = task.get_fallback_handler(self)
                            if handler:
                                handler(task)
                        else:
                            self.process_new_task(task)
                            self.transport.process_handlers()

                with self.save_timer('network_transport'):
                    logger_verbose.debug('Asking transport layer to do '
                                         'something')
                    self.transport.process_handlers()

                logger_verbose.debug('Processing network results (if any).')
                # Iterate over network trasport ready results
                # Each result could be valid or failed
                # Result format: {ok, grab, grab_config_backup, task, emsg}

                # print '[transport iterate results - start]'
                for result in self.transport.iterate_results():
                    # Persist the response into the cache when eligible.
                    if self.is_valid_for_cache(result):
                        with self.save_timer('cache'):
                            with self.save_timer('cache.write'):
                                self.cache.save_response(
                                    result['task'].url, result['grab'])

                    # print '[process network results]'
                    self.process_network_result(result)
                    # print '[done]'
                    self.inc_count('request')

                # print '[transport iterate results - end]'

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
            self.interrupted = True
            raise
        finally:
            # This code is executed when main cycles is breaked
            self.stop_timer('total')
            self.shutdown()
Example #4 (score: 0)
File: base.py — Project: sergithon/grab
    def run(self):
        """
        Main method. All work is done here.

        Oldest variant of the main loop: in addition to the task
        queue / multicurl transport cycle it periodically records
        traffic snapshots, optionally dumps spider stats and processes
        controller commands, and supports an "NG" mode that
        coordinates shutdown through ``waiting_shutdown_event`` /
        ``shutdown_event``.
        """

        self.start_timer('total')

        # Network layer: multicurl with ``thread_number`` parallel
        # connections.
        self.transport = MulticurlTransport(self.thread_number)

        try:
            self.setup_default_queue()
            # Spider-specific preparation hook (no-op by default).
            self.prepare()

            self.start_timer('task_generator')
            # In slave or NG mode the task generator is driven
            # externally, so it is not initiated here.
            if not self.slave:
                if not self.ng:
                    self.init_task_generator()
            self.stop_timer('task_generator')

            while self.work_allowed:

                # Every ``snapshot_interval`` seconds record deltas of
                # the traffic/request counters since the previous
                # snapshot, keyed by the current unix timestamp.
                now = int(time.time())
                if now - self.last_snapshot_values[
                        'timestamp'] > self.snapshot_interval:
                    snapshot = {'timestamp': now}
                    for key in ('download-size', 'upload-size',
                                'download-size-with-cache'):
                        snapshot[key] = self.counters[
                            key] - self.last_snapshot_values[key]
                        self.last_snapshot_values[key] = self.counters[key]

                    snapshot['request-count'] = self.counters['request'] -\
                        self.last_snapshot_values['request-count']
                    self.last_snapshot_values['request-count'] = self.counters[
                        'request']
                    self.last_snapshot_values['timestamp'] = now

                    self.snapshots[now] = snapshot
                    self.snapshot_timestamps.append(now)

                    # Optionally append the snapshot to a JSON-lines
                    # file.
                    if self.snapshot_file:
                        with open(self.snapshot_file, 'a') as out:
                            out.write(json.dumps(snapshot) + '\n')

                # FIXIT: REMOVE
                # Run update task handler which
                # updates database object which stores
                # info about current scraping process
                if self.dump_spider_stats:
                    self.dump_spider_stats(self)

                # Process external control commands, if a controller
                # is attached.
                if self.controller.enabled:
                    self.controller.process_commands()

                # NOTE(review): the guard reads ``if not self.ng`` but
                # the comment says "NG" -- presumably the generator is
                # driven externally in NG mode; confirm the intended
                # polarity.
                if not self.ng:
                    # NG
                    self.start_timer('task_generator')
                    # star
                    if self.task_generator_enabled:
                        self.process_task_generator()
                    self.stop_timer('task_generator')

                if self.transport.ready_for_task():
                    logger_verbose.debug('Transport has free resources. '
                                         'Trying to add new task (if exists)')

                    # Try five times to get new task and proces task generator
                    # because slave parser could agressively consume
                    # tasks from task queue
                    # NOTE(review): ``xrange`` is Python 2-only; other
                    # versions of this method use six.moves.range --
                    # confirm the supported Python version.
                    for x in xrange(5):
                        task = self.load_new_task()
                        if task is None:
                            # Empty queue: let the generator refill it
                            # while the network is idle, then retry.
                            if not self.transport.active_task_number():
                                self.process_task_generator()
                        elif task is True:
                            # If only delayed tasks in queue
                            break
                        else:
                            # If got some task
                            break

                    if not task:
                        # No task at all: decide between waiting for a
                        # shutdown signal (NG mode) and stopping
                        # outright.
                        if not self.transport.active_task_number():
                            logger_verbose.debug('Network transport has no '
                                                 'active tasks')
                            # NG
                            if self.ng:
                                self.waiting_shutdown_event.set()
                                if self.shutdown_event.is_set():
                                    logger_verbose.debug('Got shutdown signal')
                                    self.stop()
                                else:
                                    logger_verbose.debug('Shutdown event has'
                                                         ' not been set yet')
                            else:
                                if not self.task_generator_enabled:
                                    self.stop()
                        else:
                            logger_verbose.debug(
                                'Transport active tasks: %d' %
                                self.transport.active_task_number())
                    elif isinstance(task, NullTask):
                        logger_verbose.debug('Got NullTask')
                        # NullTask may carry an explicit sleep
                        # instruction; honour it only when the network
                        # is idle.
                        if not self.transport.active_task_number():
                            if task.sleep:
                                logger.debug('Got NullTask with sleep '
                                             'instruction. Sleeping for'
                                             ' %.2f seconds' % task.sleep)
                                time.sleep(task.sleep)
                    elif isinstance(task, bool) and (task is True):
                        # Only delayed tasks remain; nothing to do now.
                        pass
                    else:
                        # Got a real task: the spider is no longer
                        # waiting for shutdown.
                        if self.ng:
                            if self.waiting_shutdown_event.is_set():
                                self.waiting_shutdown_event.clear()
                        logger_verbose.debug(
                            'Got new task from task queue: %s' % task)
                        self.process_task_counters(task)

                        is_valid, reason = self.check_task_limits(task)
                        if not is_valid:
                            logger_verbose.debug('Task %s is rejected due to '
                                                 '%s limit' %
                                                 (task.name, reason))
                            if reason == 'task-try-count':
                                self.add_item('task-count-rejected', task.url)
                            elif reason == 'network-try-count':
                                self.add_item('network-count-rejected',
                                              task.url)
                            else:
                                raise Exception('Unknown response from '
                                                'check_task_limits: %s' %
                                                reason)
                            # Give the task a chance to handle its own
                            # rejection.
                            handler = task.get_fallback_handler(self)
                            if handler:
                                handler(task)
                            # TODO: not do following line
                            # TODO: middleware: TaskFails
                        else:
                            self.process_new_task(task)
                            self.transport.process_handlers()

                with self.save_timer('network_transport'):
                    logger_verbose.debug('Asking transport layer to do '
                                         'something')
                    self.transport.process_handlers()

                logger_verbose.debug('Processing network results (if any).')
                # Iterate over network trasport ready results
                # Each result could be valid or failed
                # Result format: {ok, grab, grab_config_backup, task, emsg}

                #print '[transport iterate results - start]'
                for result in self.transport.iterate_results():
                    # Persist the response into the cache when eligible.
                    if self.is_valid_for_cache(result):
                        with self.save_timer('cache'):
                            with self.save_timer('cache.write'):
                                self.cache.save_response(
                                    result['task'].url, result['grab'])

                    #print '[process network results]'
                    self.process_network_result(result)
                    #print '[done]'
                    self.inc_count('request')

                #print '[transport iterate results - end]'

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            print('\nGot ^C signal. Stopping.')
            raise
        finally:
            # This code is executed when main cycles is breaked
            self.stop_timer('total')
            self.shutdown()