Example #1
0
 def run_parser(self):
     """
     Main work cycle of a spider process working in parser-mode.

     Repeatedly pulls network results from `network_result_queue`,
     dispatches each one to its task handler and, in parser mode,
     ships a per-task stat snapshot back through `parser_result_queue`.
     The loop exits when the shutdown event is set while the queue is
     empty, or via `break` once `parser_requests_per_process` results
     have been processed.
     """
     # Use Stat instance that does not print any logging messages
     if self.parser_mode:
         self.stat = Stat(logging_period=None)
     self.prepare_parser()
     # Number of results processed by this process; compared against
     # `parser_requests_per_process` to decide when to stop.
     process_request_count = 0
     try:
         recent_task_time = time.time()
         while True:
             try:
                 # Block for at most 0.1 sec waiting for a result.
                 result = self.network_result_queue.get(True, 0.1)
             except queue.Empty:
                 logger_verbose.debug('Network result queue is empty')
                 # Set `waiting_shutdown_event` only after 1 second
                 # of waiting for tasks to avoid
                 # race-condition issues
                 if time.time() - recent_task_time > 1:
                     self.waiting_shutdown_event.set()
                 if self.shutdown_event.is_set():
                     logger_verbose.debug('Got shutdown event')
                     return
             else:
                 process_request_count += 1
                 recent_task_time = time.time()
                 if self.parser_mode:
                     # Start each task with clean counters so the stat
                     # snapshot sent below covers only this task.
                     self.stat.reset()
                 # We are busy again: withdraw the idle signal.
                 if self.waiting_shutdown_event.is_set():
                     self.waiting_shutdown_event.clear()
                 try:
                     handler = self.find_task_handler(result['task'])
                 except NoTaskHandler as ex:
                     # Send the exception (with formatted traceback)
                     # back to the parent instead of crashing here.
                     ex.tb = format_exc()
                     self.parser_result_queue.put((ex, result['task']))
                     self.stat.inc('parser:handler-not-found')
                 else:
                     self.process_network_result_with_handler_mp(
                         result, handler)
                     self.stat.inc('parser:handler-processed')
                 finally:
                     if self.parser_mode:
                         # Per-task stat snapshot for the parent process.
                         data = {
                             'type': 'stat',
                             'counters': self.stat.counters,
                             'collections': self.stat.collections,
                         }
                         self.parser_result_queue.put(
                             (data, result['task']))
                     if self.parser_mode:
                         if self.parser_requests_per_process:
                             # Stop after the per-process request quota
                             # is exhausted (process gets recycled).
                             if process_request_count >= self.parser_requests_per_process:
                                 break
     except Exception as ex:
         logging.error('', exc_info=ex)
         raise
     finally:
         # Always signal on exit so the parent does not wait forever.
         self.waiting_shutdown_event.set()
Example #2
0
 def run_parser(self):
     """
     Main work cycle of a spider process working in parser-mode.

     Non-blocking variant: polls `network_result_queue`, raising the
     `is_parser_idle` event only for the short sleep window while the
     queue is empty. Each result is dispatched to its task handler;
     in parser mode a per-task stat snapshot is sent back through
     `parser_result_queue`. The loop ends on the shutdown event (while
     idle) or after `parser_requests_per_process` results.
     """
     self.is_parser_idle.clear()
     # Use Stat instance that does not print any logging messages
     if self.parser_mode:
         self.stat = Stat(logging_period=None)
     self.prepare_parser()
     # Number of results processed by this process; compared against
     # `parser_requests_per_process` to decide when to stop.
     process_request_count = 0
     try:
         work_permitted = True
         while work_permitted:
             try:
                 result = self.network_result_queue.get(block=False)
             except queue.Empty:
                 # Advertise idleness only for the duration of the
                 # sleep, then withdraw it before polling again.
                 self.is_parser_idle.set()
                 time.sleep(0.1)
                 self.is_parser_idle.clear()
                 logger_verbose.debug('Network result queue is empty')
                 if self.shutdown_event.is_set():
                     logger_verbose.debug('Got shutdown event')
                     return
             else:
                 process_request_count += 1
                 if self.parser_mode:
                     # Start each task with clean counters so the stat
                     # snapshot sent below covers only this task.
                     self.stat.reset()
                 try:
                     handler = self.find_task_handler(result['task'])
                 except NoTaskHandler as ex:
                     # Send the exception (with formatted traceback)
                     # back to the parent instead of crashing here.
                     ex.tb = format_exc()
                     self.parser_result_queue.put((ex, result['task']))
                     self.stat.inc('parser:handler-not-found')
                 else:
                     self.process_network_result_with_handler(
                         result, handler)
                     self.stat.inc('parser:handler-processed')
                 finally:
                     if self.parser_mode:
                         # Per-task stat snapshot for the parent process.
                         data = {
                             'type': 'stat',
                             'counters': self.stat.counters,
                             'collections': self.stat.collections,
                         }
                         self.parser_result_queue.put((data,
                                                       result['task']))
                     if self.parser_mode:
                         if self.parser_requests_per_process:
                             # Stop after the per-process request quota
                             # is exhausted (process gets recycled).
                             if (process_request_count >=
                                     self.parser_requests_per_process):
                                 work_permitted = False
     except Exception as ex:
         logging.error('', exc_info=ex)
         raise
Example #3
0
    def __init__(
            self,
            thread_number=None,
            network_try_limit=None, task_try_limit=None,
            request_pause=NULL,
            priority_mode='random',
            meta=None,
            config=None,
            args=None,
            parser_requests_per_process=10000,
            parser_pool_size=1,
            http_api_port=None,
            network_service='threaded',
            grab_transport='pycurl',
            # Deprecated
            transport=None,
            only_cache=False,
        ):
        """
        Create a Spider instance.

        Arguments:
        * thread_number - number of concurrent network streams
        * network_try_limit - how many times to retry a request after a
            network error; use 0 to disable retries
        * task_try_limit - limit of task execution attempts; not the
            same as network_try_limit: network_try_limit counts the
            automatic retries performed on network timeouts or other
            physical errors, while task_try_limit counts retries
            scheduled manually in the spider business logic
        * priority_mode - either "random" or "const"
        * meta - arbitrary user data
        * args - command line arguments parsed with the
            `setup_arg_parser` method
        """

        self.fatal_error_queue = Queue()
        self.task_queue_parameters = None
        self.http_api_port = http_api_port
        self._started = None
        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport
        self.parser_requests_per_process = parser_requests_per_process
        self.stat = Stat()
        self.task_queue = None
        # Normalize optional container arguments to empty containers.
        self.args = {} if args is None else args
        self.config = config if config is not None else {}
        self.meta = meta or {}

        def config_int(key, fallback):
            # Read an integer option from the config dict.
            return int(self.config.get(key, fallback))

        # Explicit constructor arguments win over config values.
        self.thread_number = (
            thread_number
            or config_int('thread_number', DEFAULT_NETWORK_STREAM_NUMBER))
        self.task_try_limit = (
            task_try_limit
            or config_int('task_try_limit', DEFAULT_TASK_TRY_LIMIT))
        self.network_try_limit = (
            network_try_limit
            or config_int('network_try_limit', DEFAULT_NETWORK_TRY_LIMIT))
        self._grab_config = {}
        if priority_mode in ('random', 'const'):
            self.priority_mode = priority_mode
        else:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        if only_cache:
            raise_feature_is_deprecated('Cache feature')
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')
        # Proxy machinery starts disabled; configured later if needed.
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
        self.parser_pool_size = parser_pool_size
        self.parser_service = ParserService(
            spider=self,
            pool_size=self.parser_pool_size,
        )
        if transport is not None:
            warn('The "transport" argument of Spider constructor is'
                 ' deprecated. Use "network_service" argument.')
            network_service = transport
        assert network_service in ('threaded',)
        if network_service == 'threaded':
            # pylint: disable=no-name-in-module, import-error
            from grab.spider.network_service.threaded import (
                NetworkServiceThreaded
            )
            self.network_service = NetworkServiceThreaded(
                self, self.thread_number
            )
        self.task_dispatcher = TaskDispatcherService(self)
        self.http_api_service = (
            HttpApiService(self) if self.http_api_port else None)
        self.task_generator_service = TaskGeneratorService(
            self.task_generator(), self,
        )
Example #4
0
    def __init__(self, thread_number=None,
                 network_try_limit=None, task_try_limit=None,
                 request_pause=NULL,
                 priority_mode='random',
                 meta=None,
                 only_cache=False,
                 config=None,
                 slave=None,
                 args=None,
                 # New options start here
                 taskq=None,
                 # MP:
                 network_result_queue=None,
                 parser_result_queue=None,
                 is_parser_idle=None,
                 shutdown_event=None,
                 mp_mode=False,
                 parser_pool_size=None,
                 parser_mode=False,
                 parser_requests_per_process=10000,
                 # http api
                 http_api_port=None,
                 transport='multicurl',
                 grab_transport='pycurl',
                 ):
        """
        Arguments:
        * thread-number - Number of concurrent network streams
        * network_try_limit - How many times try to send request
            again if network error was occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task
            this is not the same as network_try_limit
            network try limit limits the number of tries which
            are performed automatically in case of network timeout
            of some other physical error
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * taskq=None,
        * network_result_queue=None,
        """

        # Slave mode was removed; fail fast for old callers.
        if slave is not None:
            # NOTE(review): "SpiderConfigurtionError" here differs from
            # "SpiderConfigurationError" raised below — one spelling is
            # likely a typo; confirm which name the errors module defines.
            raise SpiderConfigurtionError(
                'Slave mode is not supported anymore. '
                'Use `mp_mode=True` option to run multiple HTML'
                ' parser processes.')

        # API:
        self.http_api_port = http_api_port

        assert transport in ('multicurl', 'threaded')
        self.transport_name = transport

        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport

        # MP:
        # In mp_mode real processes are used; otherwise the thread-based
        # drop-in replacements from multiprocessing.dummy.
        self.mp_mode = mp_mode
        if self.mp_mode:
            from multiprocessing import Process, Event, Queue
        else:
            from multiprocessing.dummy import Process, Event, Queue

        if network_result_queue is not None:
            self.network_result_queue = network_result_queue
        else:
            self.network_result_queue = Queue()
        self.parser_result_queue = parser_result_queue
        self.is_parser_idle = is_parser_idle
        if shutdown_event is not None:
            self.shutdown_event = shutdown_event
        else:
            self.shutdown_event = Event()
        # Thread-backed (dummy) mode cannot run multiple parser workers.
        if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
            raise SpiderConfigurationError(
                'Parser pool size could be only 1 in '
                'non-multiprocess mode')
        self.parser_pool_size = parser_pool_size
        self.parser_mode = parser_mode
        self.parser_requests_per_process = parser_requests_per_process

        self.stat = Stat()
        self.timer = Timer()
        self.task_queue = taskq

        if args is None:
            self.args = {}
        else:
            self.args = args

        if config is not None:
            self.config = config
        else:
            self.config = {}

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        # Explicit constructor arguments win over config values.
        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))

        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        self.only_cache = only_cache
        self.cache_pipeline = None
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')

        # Proxy machinery starts disabled; configured later if needed.
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
Example #5
0
 def test_zero_division_error(self):
     """get_speed_line() must not raise when elapsed time is zero."""
     fresh_stat = Stat()
     # Passing the stat's own start time yields a zero-length interval.
     fresh_stat.get_speed_line(fresh_stat.time)