def process_handler_result(self, result, task):
    """
    Route one object produced by a task handler.

    Supported result values:
    * None -- ignored
    * Task instance -- queued for download
    * Data instance -- dispatched to the matching data handler
    * dict {"type": "stat", "counters": ..., "collections": ...}
    * ResponseNotValid-based exception -- task retried bypassing cache
    * any other Exception -- reported via process_handler_error
    """
    if isinstance(result, Task):
        self.add_task(result)
    elif isinstance(result, Data):
        data_handler = self.find_data_handler(result)
        try:
            produced = data_handler(**result.storage)
            # A data handler may yield further results to process
            if produced is not None:
                for item in produced:
                    self.process_handler_result(item, task)
        except Exception as exc:
            self.process_handler_error('data_%s' % result.handler_key,
                                       exc, task)
    elif result is None:
        pass
    elif isinstance(result, ResponseNotValid):
        # Content failed an integrity check: retry with a fresh response
        self.add_task(task.clone(refresh_cache=True))
        error_code = result.__class__.__name__.replace('_', '-')
        self.stat.inc('integrity:%s' % error_code)
    elif isinstance(result, Exception):
        failed_handler = self.find_task_handler(task)
        handler_name = getattr(failed_handler, '__name__', 'NONE')
        self.process_handler_error(handler_name, result, task)
    elif isinstance(result, dict):
        if result.get('type') != 'stat':
            raise SpiderError('Unknown result type: %s' % result)
        for counter_name, amount in result['counters'].items():
            self.stat.inc(counter_name, amount)
        for collection_name, values in result['collections'].items():
            for value in values:
                self.stat.collect(collection_name, value)
    else:
        raise SpiderError('Unknown result type: %s' % result)
def add_task(self, task, queue=None, raise_error=False):
    """
    Put a task into the task queue.

    :param task: Task instance to schedule
    :param queue: explicit queue object; defaults to ``self.task_queue``
    :param raise_error: if True, an invalid task URL raises SpiderError
        instead of being logged
    :returns: True when the task was queued, False when it was rejected
    """
    target_queue = self.task_queue if queue is None else queue
    if target_queue is None:
        raise SpiderMisuseError('You should configure task queue before '
                                'adding tasks. Use `setup_queue` method.')
    # Assign an automatic priority unless the caller set one explicitly
    if task.priority is not None and task.priority_set_explicitly:
        task.priority_set_explicitly = True
    else:
        task.priority = self.generate_task_priority()
        task.priority_set_explicitly = False
    known_schemes = ('http://', 'https://', 'ftp://',
                     'file://', 'feed://')
    if task.url.startswith(known_schemes):
        # TODO: keep original task priority if it was set explicitly
        # WTF the previous comment means?
        target_queue.put(task, priority=task.priority,
                         schedule_time=task.schedule_time)
        return True
    self.stat.collect('task-with-invalid-url', task.url)
    msg = 'Invalid task URL: %s' % task.url
    if raise_error:
        raise SpiderError(msg)
    logger.error(
        '%s\nTraceback:\n%s', msg, ''.join(format_stack()),
    )
    return False
def process_handler_result(self, result, task=None):
    """
    Route one object produced by a task handler.

    Supported values: Task (queued for download), Data (dispatched to
    its data handler), None and NullTask (both ignored). Anything else
    raises SpiderError.
    """
    if isinstance(result, Task):
        self.add_task(result)
    elif isinstance(result, Data):
        data_handler = self.find_data_handler(result)
        try:
            produced = data_handler(**result.storage)
            # A data handler may yield further results to process
            if produced is not None:
                for item in produced:
                    self.process_handler_result(item, task)
        except Exception as exc:
            self.process_handler_error('data_%s' % result.handler_key,
                                       exc, task)
    elif result is None:
        pass
    elif isinstance(result, NullTask):
        pass
    else:
        raise SpiderError('Unknown result type: %s' % result)
def log_rejected_task(self, task, reason):
    """
    Record a task rejected by check_task_limits in runtime statistics.

    :param task: the rejected Task instance
    :param reason: limit name returned by check_task_limits
    :raises SpiderError: if ``reason`` is not a known limit name
    """
    stat_keys = {
        'task-try-count': 'task-count-rejected',
        'network-try-count': 'network-count-rejected',
    }
    stat_key = stat_keys.get(reason)
    if stat_key is None:
        raise SpiderError('Unknown response from '
                          'check_task_limits: %s' % reason)
    self.stat.collect(stat_key, task.url)
def log_rejected_task(self, task, reason):
    """
    Log and record a task rejected by check_task_limits.

    :param task: the rejected Task instance
    :param reason: limit name returned by check_task_limits
    :raises SpiderError: if ``reason`` is not a known limit name
    """
    logger_verbose.debug('Task %s is rejected due to '
                         '%s limit' % (task.name, reason))
    stat_keys = {
        'task-try-count': 'task-count-rejected',
        'network-try-count': 'network-count-rejected',
    }
    stat_key = stat_keys.get(reason)
    if stat_key is None:
        raise SpiderError('Unknown response from '
                          'check_task_limits: %s' % reason)
    self.stat.collect(stat_key, task.url)
def process_handler_result(self, result, task=None):
    """
    Route one object produced by a task handler.

    Supported result values:
    * None -- ignored
    * Task instance -- queued for download
    * Data instance -- dispatched to the matching data handler
    * Exception -- reported via process_handler_error
    * dict {"type": "stat", ...} -- merged into runtime statistics
    """
    if isinstance(result, Task):
        self.add_task(result)
    elif isinstance(result, Data):
        data_handler = self.find_data_handler(result)
        try:
            produced = data_handler(**result.storage)
            # A data handler may yield further results to process
            if produced is not None:
                for item in produced:
                    self.process_handler_result(item, task)
        except Exception as exc:
            self.process_handler_error('data_%s' % result.handler_key,
                                       exc, task)
    elif result is None:
        pass
    elif isinstance(result, Exception):
        failed_handler = self.find_task_handler(task)
        handler_name = getattr(failed_handler, '__name__', 'NONE')
        self.process_handler_error(handler_name, result, task)
    elif isinstance(result, dict):
        if result.get('type') != 'stat':
            raise SpiderError('Unknown result type: %s' % result)
        for counter_name, amount in result['counters'].items():
            self.stat.inc(counter_name, amount)
        for collection_name, values in result['collections'].items():
            for value in values:
                self.stat.collect(collection_name, value)
    else:
        raise SpiderError('Unknown result type: %s' % result)
def add_task(self, task, raise_error=False):
    """
    Add a task to the task queue.

    In parser mode the task is forwarded to the parser result queue
    instead of the local task queue. Relative task URLs are resolved
    against the deprecated ``Spider::base_url`` class attribute when
    it is set.

    :param task: Task instance to schedule
    :param raise_error: re-raise URL resolution errors instead of
        logging them
    :returns: True if the task was accepted, False if it was dropped
    """
    # MP:
    # ***
    if self.parser_mode:
        self.parser_result_queue.put((task, None))
        # Fix: return True so every success path shares the same
        # contract (this path previously returned None implicitly).
        return True
    if self.task_queue is None:
        raise SpiderMisuseError('You should configure task queue before '
                                'adding tasks. Use `setup_queue` method.')
    # Assign an automatic priority unless the caller set a custom one
    if task.priority is None or not task.priority_is_custom:
        task.priority = self.generate_task_priority()
        task.priority_is_custom = False
    else:
        task.priority_is_custom = True
    try:
        if not task.url.startswith(
                ('http://', 'https://', 'ftp://', 'file://', 'feed://')):
            if self.base_url is None:
                msg = 'Could not resolve relative URL because base_url ' \
                      'is not specified. Task: %s, URL: %s' \
                      % (task.name, task.url)
                raise SpiderError(msg)
            else:
                warn('Class attribute `Spider::base_url` is deprecated. '
                     'Use Task objects with absolute URLs')
                task.url = urljoin(self.base_url, task.url)
                # If task has grab_config object then update it too
                if task.grab_config:
                    task.grab_config['url'] = task.url
    except Exception as ex:
        self.stat.collect('task-with-invalid-url', task.url)
        if raise_error:
            raise
        else:
            # Fix: include the error message in the log record
            # (an empty message was logged before).
            logger.error(str(ex), exc_info=ex)
            return False
    # TODO: keep original task priority if it was set explicitly
    self.task_queue.put(task, task.priority,
                        schedule_time=task.schedule_time)
    return True
def add_task(self, task, raise_error=False):
    """
    Add a task to the task queue.

    In parser mode the task is forwarded to the parser result queue
    instead of the local task queue. Tasks with relative URLs are
    rejected.

    :param task: Task instance to schedule
    :param raise_error: raise SpiderError for an invalid task URL
        instead of logging it
    :returns: True if the task was accepted, False if it was dropped
    """
    # MP:
    # ***
    if self.parser_mode:
        self.parser_result_queue.put((task, None))
        # Fix: return True so every success path shares the same
        # contract (this path previously returned None implicitly).
        return True
    if self.task_queue is None:
        raise SpiderMisuseError('You should configure task queue before '
                                'adding tasks. Use `setup_queue` method.')
    # Assign an automatic priority unless the caller set one explicitly
    if task.priority is None or not task.priority_set_explicitly:
        task.priority = self.generate_task_priority()
        task.priority_set_explicitly = False
    else:
        task.priority_set_explicitly = True
    if not task.url.startswith(('http://', 'https://', 'ftp://',
                                'file://', 'feed://')):
        self.stat.collect('task-with-invalid-url', task.url)
        msg = ('It is not allowed to build Task object with '
               'relative URL: %s' % task.url)
        if raise_error:
            raise SpiderError(msg)
        else:
            # Fix: log the message with the current call stack directly,
            # replacing the raise/except dance that only existed to work
            # around http://bugs.python.org/issue23003
            logger.error('%s\nTraceback:\n%s',
                         msg, ''.join(format_stack()))
            return False
    # TODO: keep original task priority if it was set explicitly
    # WTF the previous comment means?
    self.task_queue.put(task, task.priority,
                        schedule_time=task.schedule_time)
    return True
def add_task(self, task, raise_error=False):
    """
    Put a task into the task queue.

    Relative URLs are resolved against ``self.base_url`` when it is
    set; NullTask instances skip URL validation entirely.

    :param task: Task instance to schedule
    :param raise_error: re-raise URL resolution errors instead of
        logging them
    :returns: True if the task was queued, False if it was dropped
    """
    if self.taskq is None:
        raise SpiderMisuseError('You should configure task queue before '
                                'adding tasks. Use `setup_queue` method.')
    # Auto-generate a priority unless the caller supplied a custom one
    if task.priority is not None and task.priority_is_custom:
        task.priority_is_custom = True
    else:
        task.priority = self.generate_task_priority()
        task.priority_is_custom = False
    if not isinstance(task, NullTask):
        try:
            known_schemes = ('http://', 'https://', 'ftp://',
                             'file://', 'feed://')
            if not task.url.startswith(known_schemes):
                if self.base_url is None:
                    msg = 'Could not resolve relative URL because base_url ' \
                          'is not specified. Task: %s, URL: %s' \
                          % (task.name, task.url)
                    raise SpiderError(msg)
                task.url = urljoin(self.base_url, task.url)
                # If task has grab_config object then update it too
                if task.grab_config:
                    task.grab_config['url'] = task.url
        except Exception as err:
            self.add_item('task-with-invalid-url', task.url)
            if raise_error:
                raise
            logger.error('', exc_info=err)
            return False
    # TODO: keep original task priority if it was set explicitly
    self.taskq.put(task, task.priority, schedule_time=task.schedule_time)
    return True
def run(self):
    """
    Main method. All work is done here.

    Runs the whole spider life cycle: sets up the network transport,
    seeds tasks from the task generator, then loops -- pulling tasks
    from the queue, submitting them to the transport, and processing
    ready network results -- until stopped or interrupted.
    """
    self.start_timer('total')
    self.transport = MulticurlTransport(self.thread_number)
    try:
        self.setup_default_queue()
        self.prepare()
        self.start_timer('task_generator')
        # Slave spiders receive tasks from outside; only a master
        # seeds its own task generator
        if not self.slave:
            self.init_task_generator()
        self.stop_timer('task_generator')
        while self.work_allowed:
            self.start_timer('task_generator')
            if self.task_generator_enabled:
                self.process_task_generator()
            self.stop_timer('task_generator')
            free_threads = self.transport.get_free_threads_number()
            if free_threads:
                logger_verbose.debug(
                    'Transport has free resources (%d). '
                    'Trying to add new task (if exists).' % free_threads)
                # Try five times to get new task and process task
                # generator because slave parser could aggressively
                # consume tasks from task queue
                for x in six.moves.range(5):
                    task = self.load_new_task()
                    if task is None:
                        if not self.transport.active_task_number():
                            self.process_task_generator()
                    elif task is True:
                        # If only delayed tasks in queue
                        break
                    else:
                        # If got some task
                        break
                # task is now None (queue empty), True (only delayed
                # tasks), a NullTask, or a real task
                if not task:
                    if not self.transport.active_task_number():
                        logger_verbose.debug('Network transport has no '
                                             'active tasks')
                        # Nothing queued, nothing in flight, nothing
                        # more to generate -> spider is done
                        if not self.task_generator_enabled:
                            self.stop()
                    else:
                        logger_verbose.debug(
                            'Transport active tasks: %d'
                            % self.transport.active_task_number())
                elif isinstance(task, NullTask):
                    logger_verbose.debug('Got NullTask')
                    if not self.transport.active_task_number():
                        if task.sleep:
                            logger.debug('Got NullTask with sleep '
                                         'instruction. Sleeping for'
                                         ' %.2f seconds' % task.sleep)
                            time.sleep(task.sleep)
                elif isinstance(task, bool) and (task is True):
                    # Take some sleep to not load CPU
                    if not self.transport.active_task_number():
                        time.sleep(0.1)
                else:
                    logger_verbose.debug(
                        'Got new task from task queue: %s' % task)
                    self.process_task_counters(task)
                    is_valid, reason = self.check_task_limits(task)
                    if not is_valid:
                        logger_verbose.debug('Task %s is rejected due to '
                                             '%s limit'
                                             % (task.name, reason))
                        if reason == 'task-try-count':
                            self.add_item('task-count-rejected', task.url)
                        elif reason == 'network-try-count':
                            self.add_item('network-count-rejected',
                                          task.url)
                        else:
                            raise SpiderError('Unknown response from '
                                              'check_task_limits: %s'
                                              % reason)
                        # Give the task a last chance via its fallback
                        # handler, if it defines one
                        handler = task.get_fallback_handler(self)
                        if handler:
                            handler(task)
                    else:
                        self.process_new_task(task)
                        self.transport.process_handlers()
            with self.save_timer('network_transport'):
                logger_verbose.debug('Asking transport layer to do '
                                     'something')
                self.transport.process_handlers()
            logger_verbose.debug('Processing network results (if any).')
            # Iterate over network transport ready results
            # Each result could be valid or failed
            # Result format: {ok, grab, grab_config_backup, task, emsg}
            for result in self.transport.iterate_results():
                if self.is_valid_for_cache(result):
                    with self.save_timer('cache'):
                        with self.save_timer('cache.write'):
                            self.cache.save_response(
                                result['task'].url, result['grab'])
                self.process_network_result(result)
                self.inc_count('request')
        logger_verbose.debug('Work done')
    except KeyboardInterrupt:
        print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
        self.interrupted = True
        raise
    finally:
        # This code is executed when the main cycle is broken
        # (normal stop, error, or Ctrl-C)
        self.stop_timer('total')
        self.shutdown()
def process_service_result(self, result, task, meta=None):
    """
    Process result submitted from any service to task dispatcher service.

    Result could be:
    * Task
    * None
    * Task instance
    * ResponseNotValid-based exception
    * Arbitrary exception
    * Network response:
        {ok, ecode, emsg, error_abbr, exc, grab, grab_config_backup}

    Exception can come only from parser_service and it always has
    meta {"from": "parser", "exc_info": <...>}
    """
    if meta is None:
        meta = {}
    if isinstance(result, Task):
        # A plain new task: just schedule it
        self.spider.add_task(result)
    elif result is None:
        pass
    elif isinstance(result, ResponseNotValid):
        # Content integrity check failed: retry the same task and
        # record the failure under an "integrity:" stat key
        self.spider.add_task(task.clone())
        # NOTE(review): exception class names are CamelCase, so
        # replace('_', '-') looks like a no-op here -- confirm intent
        error_code = result.__class__.__name__.replace('_', '-')
        self.spider.stat.inc('integrity:%s' % error_code)
    elif isinstance(result, Exception):
        # Per the docstring, exceptions come only from the parser
        # service with meta["exc_info"] set -- a missing key here
        # would raise KeyError (presumably intentional fail-fast)
        if task:
            handler = self.spider.find_task_handler(task)
            handler_name = getattr(handler, '__name__', 'NONE')
        else:
            handler_name = 'NA'
        self.spider.process_parser_error(
            handler_name, task, meta['exc_info'],
        )
        # Fatal errors additionally abort the whole spider
        if isinstance(result, FatalError):
            self.spider.fatal_error_queue.put(meta['exc_info'])
    elif isinstance(result, dict) and 'grab' in result:
        # TODO: Move to network service
        # starts
        self.spider.log_network_result_stats(result, task)
        # ends
        # A "raw" task accepts any response; otherwise the response
        # code must pass the validity check
        is_valid = False
        if task.get('raw'):
            is_valid = True
        elif result['ok']:
            res_code = result['grab'].doc.code
            is_valid = self.spider.is_valid_network_response_code(
                res_code, task
            )
        if is_valid:
            # Hand the response over to the parser service
            self.spider.parser_service.input_queue.put((result, task))
        else:
            self.spider.log_failed_network_result(result)
            # Try to do network request one more time
            # TODO:
            # Implement valid_try_limit
            # Use it if request failed not because of network error
            # But because of content integrity check
            if self.spider.network_try_limit > 0:
                task.setup_grab_config(
                    result['grab_config_backup'])
                self.spider.add_task(task)
        self.spider.stat.inc('spider:request')
    else:
        raise SpiderError('Unknown result type: %s' % result)