def run_parser(self): """ Main work cycle of spider process working in parser-mode. """ # Use Stat instance that does not print any logging messages if self.parser_mode: self.stat = Stat(logging_period=None) self.prepare_parser() process_request_count = 0 try: recent_task_time = time.time() while True: try: result = self.network_result_queue.get(True, 0.1) except queue.Empty: logger_verbose.debug('Network result queue is empty') # Set `waiting_shutdown_event` only after 1 seconds # of waiting for tasks to avoid # race-condition issues if time.time() - recent_task_time > 1: self.waiting_shutdown_event.set() if self.shutdown_event.is_set(): logger_verbose.debug('Got shutdown event') return else: process_request_count += 1 recent_task_time = time.time() if self.parser_mode: self.stat.reset() if self.waiting_shutdown_event.is_set(): self.waiting_shutdown_event.clear() try: handler = self.find_task_handler(result['task']) except NoTaskHandler as ex: ex.tb = format_exc() self.parser_result_queue.put((ex, result['task'])) self.stat.inc('parser:handler-not-found') else: self.process_network_result_with_handler_mp( result, handler) self.stat.inc('parser:handler-processed') finally: if self.parser_mode: data = { 'type': 'stat', 'counters': self.stat.counters, 'collections': self.stat.collections, } self.parser_result_queue.put( (data, result['task'])) if self.parser_mode: if self.parser_requests_per_process: if process_request_count >= self.parser_requests_per_process: break except Exception as ex: logging.error('', exc_info=ex) raise finally: self.waiting_shutdown_event.set()
def run_parser(self) -> None:
    """
    Main work cycle of spider process working in parser-mode.

    Polls ``self.network_result_queue`` without blocking, dispatches
    each result to the matching task handler and reports outcomes
    through ``self.parser_result_queue``.  Exits when
    ``self.shutdown_event`` is observed while the queue is empty or,
    in parser mode, after ``self.parser_requests_per_process`` results
    have been processed.
    """
    self.is_parser_idle.clear()
    # Use Stat instance that does not print any logging messages
    if self.parser_mode:
        self.stat = Stat(logging_period=None)
    self.prepare_parser()
    process_request_count = 0
    try:
        work_permitted = True
        while work_permitted:
            try:
                result = self.network_result_queue.get(block=False)
            except queue.Empty:
                # Advertise idleness only around the sleep, so the
                # parent never observes this worker as idle while it
                # is actually holding a result.
                self.is_parser_idle.set()
                time.sleep(0.1)
                self.is_parser_idle.clear()
                logger_verbose.debug('Network result queue is empty')
                if self.shutdown_event.is_set():
                    logger_verbose.debug('Got shutdown event')
                    return
            else:
                process_request_count += 1
                if self.parser_mode:
                    # Reset per-result stats; the accumulated counters
                    # are sent below in the `finally` block.
                    self.stat.reset()
                try:
                    handler = self.find_task_handler(result['task'])
                except NoTaskHandler as ex:
                    # Attach the formatted traceback to the exception
                    # before sending it across the queue.
                    ex.tb = format_exc()
                    self.parser_result_queue.put((ex, result['task']))
                    self.stat.inc('parser:handler-not-found')
                else:
                    self.process_network_result_with_handler(
                        result, handler)
                    self.stat.inc('parser:handler-processed')
                finally:
                    # Always report stats for this result, whether
                    # handling succeeded or not.
                    if self.parser_mode:
                        data = {
                            'type': 'stat',
                            'counters': self.stat.counters,
                            'collections': self.stat.collections,
                        }
                        self.parser_result_queue.put((data, result['task']))
                # Stop after the configured number of processed
                # requests (NOTE(review): presumably so the parent can
                # spawn a fresh parser process — confirm).
                if self.parser_mode:
                    if self.parser_requests_per_process:
                        if (process_request_count >= self.parser_requests_per_process):
                            work_permitted = False
    except Exception as ex:
        # Log with full traceback, then re-raise to the caller.
        logging.error('', exc_info=ex)
        raise
def __init__(
        self,
        thread_number=None,
        network_try_limit=None, task_try_limit=None,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        config=None,
        args=None,
        parser_requests_per_process=10000,
        parser_pool_size=1,
        http_api_port=None,
        network_service='threaded',
        grab_transport='pycurl',
        # Deprecated
        transport=None,
        only_cache=False,
):
    """
    Create and wire up a new Spider instance.

    Arguments:

    * thread_number - number of concurrent network streams
    * network_try_limit - how many times a request is re-sent after
      a network error (0 disables retrying)
    * task_try_limit - how many times a task may be scheduled
      manually from the spider business logic; distinct from
      network_try_limit, which counts automatic retries on network
      timeouts and other physical errors
    * priority_mode - either "random" or "const"
    * meta - arbitrary user data
    * args - command line arguments parsed with `setup_arg_parser`
      method
    """
    self.fatal_error_queue = Queue()
    self.task_queue_parameters = None
    self.http_api_port = http_api_port
    self._started = None
    assert grab_transport in ('pycurl', 'urllib3')
    self.grab_transport_name = grab_transport
    self.parser_requests_per_process = parser_requests_per_process
    self.stat = Stat()
    self.task_queue = None
    # Fall back to empty containers when optional mappings are omitted.
    self.args = {} if args is None else args
    self.config = config if config is not None else {}
    self.meta = meta if meta else {}
    # Explicit constructor arguments win over config values, which in
    # turn win over the library defaults.
    self.thread_number = thread_number or int(
        self.config.get('thread_number', DEFAULT_NETWORK_STREAM_NUMBER))
    self.task_try_limit = task_try_limit or int(
        self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT))
    self.network_try_limit = network_try_limit or int(
        self.config.get('network_try_limit', DEFAULT_NETWORK_TRY_LIMIT))
    self._grab_config = {}
    if priority_mode in ('random', 'const'):
        self.priority_mode = priority_mode
    else:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    if only_cache:
        raise_feature_is_deprecated('Cache feature')
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')
    # Proxy-related state starts unset.
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
    self.parser_pool_size = parser_pool_size
    self.parser_service = ParserService(
        spider=self,
        pool_size=self.parser_pool_size,
    )
    # Honour the deprecated `transport` argument by mapping it onto
    # `network_service` before validation.
    if transport is not None:
        warn('The "transport" argument of Spider constructor is'
             ' deprecated. Use "network_service" argument.')
        network_service = transport
    assert network_service in ('threaded',)
    if network_service == 'threaded':
        # pylint: disable=no-name-in-module, import-error
        from grab.spider.network_service.threaded import (
            NetworkServiceThreaded
        )
        self.network_service = NetworkServiceThreaded(
            self, self.thread_number
        )
    self.task_dispatcher = TaskDispatcherService(self)
    self.http_api_service = (
        HttpApiService(self) if self.http_api_port else None
    )
    self.task_generator_service = TaskGeneratorService(
        self.task_generator(), self,
    )
def __init__(self, thread_number=None,
             network_try_limit=None, task_try_limit=None,
             request_pause=NULL,
             priority_mode='random',
             meta=None,
             only_cache=False,
             config=None,
             slave=None,
             args=None,
             # New options start here
             taskq=None,
             # MP:
             network_result_queue=None,
             parser_result_queue=None,
             is_parser_idle=None,
             shutdown_event=None,
             mp_mode=False,
             parser_pool_size=None,
             parser_mode=False,
             parser_requests_per_process=10000,
             # http api
             http_api_port=None,
             transport='multicurl',
             grab_transport='pycurl',
             ):
    """
    Arguments:

    * thread-number - Number of concurrent network streams
    * network_try_limit - How many times try to send request
        again if network error was occurred, use 0
        to disable
    * task_try_limit - Limit of tries to execute some task
        this is not the same as network_try_limit
        network try limit limits the number of tries which
        are performed automatically in case of network timeout
        of some other physical error
        but task_try_limit limits the number of attempts which
        are scheduled manually in the spider business logic
    * priority_mode - could be "random" or "const"
    * meta - arbitrary user data
    * retry_rebuild_user_agent - generate new random user-agent for each
        network request which is performed again due to network error
    * args - command line arguments parsed with `setup_arg_parser` method

    New options:
    * taskq=None,
    * network_result_queue=None,
    """
    if slave is not None:
        # BUGFIX: this raised `SpiderConfigurtionError` (typo'd name),
        # which itself blew up with NameError instead of the intended
        # configuration error.
        raise SpiderConfigurationError(
            'Slave mode is not supported anymore. '
            'Use `mp_mode=True` option to run multiple HTML'
            ' parser processes.')

    # API:
    self.http_api_port = http_api_port

    assert transport in ('multicurl', 'threaded')
    self.transport_name = transport

    assert grab_transport in ('pycurl', 'urllib3')
    self.grab_transport_name = grab_transport

    # MP: use real multiprocessing primitives in mp_mode, thread-backed
    # dummies otherwise.  Only Event and Queue are needed here (the
    # original also imported Process, which was unused in this method).
    self.mp_mode = mp_mode
    if self.mp_mode:
        from multiprocessing import Event, Queue
    else:
        from multiprocessing.dummy import Event, Queue

    if network_result_queue is not None:
        self.network_result_queue = network_result_queue
    else:
        self.network_result_queue = Queue()
    self.parser_result_queue = parser_result_queue
    self.is_parser_idle = is_parser_idle
    if shutdown_event is not None:
        self.shutdown_event = shutdown_event
    else:
        self.shutdown_event = Event()
    # Multiple parser workers require real processes.
    if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
        raise SpiderConfigurationError(
            'Parser pool size could be only 1 in '
            'non-multiprocess mode')
    self.parser_pool_size = parser_pool_size
    self.parser_mode = parser_mode
    self.parser_requests_per_process = parser_requests_per_process

    self.stat = Stat()
    self.timer = Timer()
    self.task_queue = taskq

    # Fall back to empty containers when optional mappings are omitted.
    if args is None:
        self.args = {}
    else:
        self.args = args
    if config is not None:
        self.config = config
    else:
        self.config = {}
    if meta:
        self.meta = meta
    else:
        self.meta = {}

    # Explicit constructor arguments win over config values, which in
    # turn win over the library defaults.
    self.thread_number = (
        thread_number or
        int(self.config.get('thread_number',
                            DEFAULT_NETWORK_STREAM_NUMBER)))
    self.task_try_limit = (
        task_try_limit or
        int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
    self.network_try_limit = (
        network_try_limit or
        int(self.config.get('network_try_limit',
                            DEFAULT_NETWORK_TRY_LIMIT)))
    self._grab_config = {}
    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    else:
        self.priority_mode = priority_mode
    self.only_cache = only_cache
    self.cache_pipeline = None
    self.work_allowed = True
    if request_pause is not NULL:
        warn('Option `request_pause` is deprecated and is not '
             'supported anymore')

    # Proxy-related state starts unset.
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False
    self.interrupted = False
def test_zero_division_error(self):
    """Calling get_speed_line() with the Stat object's own creation
    timestamp (i.e. zero elapsed time) must not raise — the test name
    indicates this guards a ZeroDivisionError regression."""
    tracker = Stat()
    tracker.get_speed_line(tracker.time)