def run(self):
    """
    Main method. All work is done here.

    Runs the spider's main event loop: pulls tasks from the task queue
    (or from the local ``pending_tasks`` backlog), submits them to the
    network transport or to the cache pipeline, collects completed
    network/cache results, pushes valid results to the parser pipeline
    and processes parser results — until ``self.work_allowed`` is
    cleared or the spider is ready to shut down.
    """
    # Choose real processes or their thread-backed dummies depending on
    # multiprocessing mode; only Queue is used directly below.
    if self.mp_mode:
        from multiprocessing import Process, Event, Queue
    else:
        from multiprocessing.dummy import Process, Event, Queue

    self.timer.start('total')

    # Select the network transport implementation.
    # NOTE(review): no else-branch — an unknown transport_name leaves
    # self.transport unset and fails later; confirm upstream validation.
    if self.transport_name == 'multicurl':
        self.transport = MulticurlTransport(self, self.thread_number)
    elif self.transport_name == 'threaded':
        self.transport = ThreadedTransport(self, self.thread_number)

    # Optional HTTP API server running beside the main loop.
    if self.http_api_port:
        http_api_proc = self.start_api_thread()
    else:
        http_api_proc = None

    # Queue that parser workers use to hand results back to this loop.
    self.parser_result_queue = Queue()
    self.parser_pipeline = ParserPipeline(
        bot=self,
        mp_mode=self.mp_mode,
        pool_size=self.parser_pool_size,
        shutdown_event=self.shutdown_event,
        network_result_queue=self.network_result_queue,
        parser_result_queue=self.parser_result_queue,
        requests_per_process=self.parser_requests_per_process,
    )
    # Back-pressure threshold: stop feeding new tasks to the transport
    # while the network result queue holds this many unparsed results.
    network_result_queue_limit = max(10, self.thread_number * 2)

    try:
        # Run custom things defined by this specific spider
        # By defaut it does nothing
        self.prepare()

        # Setup task queue if it has not been configured yet
        if self.task_queue is None:
            self.setup_queue()

        # Initiate task generator. Only in main process!
        with self.timer.log_time('task_generator'):
            self.start_task_generators()

        # Work in infinite cycle untill
        # `self.work_allowed` flag is True
        #shutdown_countdown = 0 # !!!

        # Tasks produced by the cache pipeline that could not be
        # submitted immediately (transport busy / result queue full).
        pending_tasks = deque()
        while self.work_allowed:
            # NOTE(review): free_threads is assigned but never used —
            # the condition below calls get_free_threads_number() again.
            free_threads = self.transport.get_free_threads_number()
            # Load new task only if:
            # 1) network transport has free threads
            # 2) network result queue is not full
            # 3) cache is disabled OR cache has free resources
            if (self.transport.get_free_threads_number()
                    and (self.network_result_queue.qsize()
                         < network_result_queue_limit)
                    and (self.cache_pipeline is None
                         or self.cache_pipeline.has_free_resources())):
                # Prefer the local backlog over the task queue.
                if pending_tasks:
                    task = pending_tasks.popleft()
                else:
                    task = self.get_task_from_queue()
                if task is None:
                    # If received task is None then
                    # check if spider is ready to be shut down
                    if not pending_tasks and self.is_ready_to_shutdown():
                        # I am afraid there is a bug in `is_ready_to_shutdown`
                        # because it tries to evaluate too many things
                        # includig things that are being set from other threads,
                        # so to ensure we are really ready to shutdown I call
                        # is_ready_to_shutdown a few more times.
                        # Without this hack some times really rarely times
                        # the Grab fails to do its job
                        # A good way to see this bug is to disable this hack
                        # and run:
                        # while ./runtest.py -t test.spider_data; do echo "ok"; done;
                        # And wait a few minutes
                        really_ready = True
                        for x in range(10):
                            if not self.is_ready_to_shutdown():
                                really_ready = False
                                break
                            time.sleep(0.001)
                        if really_ready:
                            self.shutdown_event.set()
                            self.stop()
                            break  # Break from `while self.work_allowed` cycle
                elif isinstance(task, bool) and (task is True):
                    # If received task is True
                    # and there is no active network threads then
                    # take some sleep
                    # (True means: queue holds only delayed tasks.)
                    if not self.transport.get_active_threads_number():
                        time.sleep(0.01)
                else:
                    logger_verbose.debug('Got new task from task queue: %s'
                                         % task)
                    task.network_try_count += 1
                    is_valid, reason = self.check_task_limits(task)
                    if is_valid:
                        task_grab = self.setup_grab_for_task(task)
                        if self.cache_pipeline:
                            # Ask the cache worker to try a cache hit
                            # first; it sends back either a result or
                            # the task itself (cache miss).
                            self.cache_pipeline.input_queue.put(
                                ('load', (task, task_grab)),
                            )
                        else:
                            self.submit_task_to_transport(task, task_grab)
                    else:
                        # Task exceeded try limits: log and give the
                        # task's fallback handler a chance.
                        self.log_rejected_task(task, reason)
                        handler = task.get_fallback_handler(self)
                        if handler:
                            handler(task)

            with self.timer.log_time('network_transport'):
                logger_verbose.debug('Asking transport layer to do '
                                     'something')
                self.transport.process_handlers()

            logger_verbose.debug('Processing network results (if any).')

            # Collect completed network results
            # Each result could be valid or failed
            # Result is dict {ok, grab, grab_config_backup, task, emsg}
            results = [(x, False) for x in
                       self.transport.iterate_results()]
            if self.cache_pipeline:
                # Drain everything the cache worker produced so far.
                while True:
                    try:
                        action, result = self.cache_pipeline\
                            .result_queue.get(False)
                    except queue.Empty:
                        break
                    else:
                        assert action in ('network_result', 'task')
                        if action == 'network_result':
                            # Cache hit: treat it like a network result
                            # flagged as coming from cache.
                            results.append((result, True))
                        elif action == 'task':
                            # Cache miss: the task must go to the
                            # network after all.
                            task = result
                            task_grab = self.setup_grab_for_task(task)
                            if (self.transport.get_free_threads_number()
                                    and (self.network_result_queue.qsize()
                                         < network_result_queue_limit)):
                                self.submit_task_to_transport(
                                    task, task_grab)
                            else:
                                # Transport busy — retry on a later
                                # loop iteration.
                                pending_tasks.append(task)

            # Take sleep to avoid millions of iterations per second.
            # 1) If no results from network transport
            # 2) If task queue is empty (or if there are only delayed tasks)
            # 3) If no network activity
            # 4) If parser result queue is empty
            # NOTE(review): `task` may be unbound here on the very first
            # iteration if the load-new-task branch was skipped — confirm.
            if (not results
                    and (task is None or bool(task) == True)
                    and not self.transport.get_active_threads_number()
                    and not self.parser_result_queue.qsize()
                    and (self.cache_pipeline is None
                         or (self.cache_pipeline.input_queue.qsize() == 0
                             and self.cache_pipeline.is_idle()
                             and self.cache_pipeline.result_queue.qsize() == 0))
                    ):
                time.sleep(0.001)

            for result, from_cache in results:
                # Persist successful fresh (non-cache) responses.
                if self.cache_pipeline and not from_cache:
                    if result['ok']:
                        self.cache_pipeline.input_queue.put(
                            ('save', (result['task'], result['grab']))
                        )
                self.log_network_result_stats(
                    result, from_cache=from_cache)
                # A result is valid if the task asked for raw handling,
                # or the request succeeded with an acceptable HTTP code.
                is_valid = False
                if result['task'].get('raw'):
                    is_valid = True
                elif result['ok']:
                    res_code = result['grab'].response.code
                    if self.is_valid_network_response_code(res_code,
                                                           result['task']):
                        is_valid = True
                if is_valid:
                    # Hand the result over to the parser pipeline.
                    self.network_result_queue.put(result)
                else:
                    self.log_failed_network_result(result)
                    # Try to do network request one more time
                    # TODO:
                    # Implement valid_try_limit
                    # Use it if request failed not because of network error
                    # But because of content integrity check
                    if self.network_try_limit > 0:
                        result['task'].refresh_cache = True
                        result['task'].setup_grab_config(
                            result['grab_config_backup'])
                        self.add_task(result['task'])
                if from_cache:
                    self.stat.inc('spider:task-%s-cache' % result['task'].name)
                self.stat.inc('spider:request')

            # Drain parser results produced so far (non-blocking).
            while True:
                try:
                    p_res, p_task = self.parser_result_queue.get(block=False)
                except queue.Empty:
                    break
                else:
                    self.stat.inc('spider:parser-result')
                    self.process_handler_result(p_res, p_task)

            # Restart dead parser workers unless we are shutting down.
            if not self.shutdown_event.is_set():
                self.parser_pipeline.check_pool_health()

        logger_verbose.debug('Work done')
    except KeyboardInterrupt:
        logger.info('\nGot ^C signal in process %d. Stopping.'
                    % os.getpid())
        self.interrupted = True
        raise
    finally:
        # This code is executed when main cycles is breaked
        self.timer.stop('total')
        self.stat.print_progress_line()
        self.shutdown()

        # Stop HTTP API process
        if http_api_proc:
            http_api_proc.server.shutdown()
            http_api_proc.join()

        if self.task_queue:
            self.task_queue.clear()

        # Stop parser processes
        self.shutdown_event.set()
        self.parser_pipeline.shutdown()
        logger.debug('Main process [pid=%s]: work done' % os.getpid())
def run(self):
    """
    Main method. All work is done here.

    Runs the spider's main event loop: pulls tasks from the task
    queue, tries the response cache first (inline, not via a pipeline),
    submits cache misses to the multicurl transport, collects results,
    pushes valid results to the parser pipeline and processes parser
    results — until ``self.work_allowed`` is cleared or the spider is
    ready to shut down.
    """
    # Choose real processes or their thread-backed dummies depending on
    # multiprocessing mode.
    if self.mp_mode:
        from multiprocessing import Process, Event, Queue
    else:
        from multiprocessing.dummy import Process, Event, Queue

    self.timer.start('total')
    self.transport = MulticurlTransport(self.thread_number)

    # Optional HTTP API server running beside the main loop.
    if self.http_api_port:
        http_api_proc = self.start_api_thread()
    else:
        http_api_proc = None

    self.parser_pipeline = ParserPipeline(
        bot=self,
        mp_mode=self.mp_mode,
        pool_size=self.parser_pool_size,
        shutdown_event=self.shutdown_event,
        network_result_queue=self.network_result_queue,
        requests_per_process=self.parser_requests_per_process,
    )
    # Back-pressure threshold: stop feeding new tasks to the transport
    # while the network result queue holds this many unparsed results.
    network_result_queue_limit = max(10, self.thread_number * 2)

    try:
        # Run custom things defined by this specific spider
        # By defaut it does nothing
        self.prepare()

        # Setup task queue if it has not been configured yet
        if self.task_queue is None:
            self.setup_queue()

        # Initiate task generator. Only in main process!
        with self.timer.log_time('task_generator'):
            self.start_task_generator()

        while self.work_allowed:
            with self.timer.log_time('task_generator'):
                if self.task_generator_enabled:
                    self.process_task_generator()

            result_from_cache = None
            # NOTE(review): free_threads is assigned but never used —
            # the condition below calls get_free_threads_number() again.
            free_threads = self.transport.get_free_threads_number()
            # Load new task only if self.network_result_queue is not full
            if (self.transport.get_free_threads_number()
                    and (self.network_result_queue.qsize()
                         < network_result_queue_limit)):
                logger_verbose.debug(
                    'Transport and parser have free resources. '
                    'Trying to load new task from task queue.')

                task = self.get_task_from_queue()

                # If no task received from task queue
                # try to query task generator
                # and then check if spider could be shuted down
                if task is None:
                    if not self.transport.get_active_threads_number():
                        self.process_task_generator()

                # NOTE(review): process_task_generator() does not rebind
                # `task`, so this second check repeats the first — the
                # generator's output is only seen on the next iteration.
                if task is None:
                    # If no task received from task queue
                    # check if spider could be shut down
                    if self.is_ready_to_shutdown():
                        self.shutdown_event.set()
                        self.stop()
                        break  # Break `while self.work_allowed` cycle
                elif isinstance(task, bool) and (task is True):
                    # Take some sleep to not load CPU
                    # (True means: queue holds only delayed tasks.)
                    if not self.transport.get_active_threads_number():
                        time.sleep(0.1)
                else:
                    logger_verbose.debug(
                        'Got new task from task queue: %s' % task)
                    task.network_try_count += 1
                    is_valid, reason = self.check_task_limits(task)
                    if is_valid:
                        grab = self.setup_grab_for_task(task)
                        grab_config_backup = grab.dump_config()

                        # Try to serve the task from the cache first.
                        result_from_cache = None
                        if self.is_task_cacheable(task, grab):
                            result_from_cache = self.load_task_from_cache(
                                task, grab, grab_config_backup)

                        if result_from_cache:
                            logger_verbose.debug(
                                'Task data is loaded from the cache. ')
                        else:
                            if self.only_cache:
                                # Cache-only mode: never hit the network.
                                logger.debug('Skipping network request to '
                                             '%s' % grab.config['url'])
                            else:
                                self.process_grab_proxy(task, grab)
                                self.submit_task_to_transport(
                                    task, grab, grab_config_backup)
                    else:
                        # Task exceeded try limits: log and give the
                        # task's fallback handler a chance.
                        self.log_rejected_task(task, reason)
                        handler = task.get_fallback_handler(self)
                        if handler:
                            handler(task)

            with self.timer.log_time('network_transport'):
                logger_verbose.debug('Asking transport layer to do '
                                     'something')
                self.transport.process_handlers()

            logger_verbose.debug('Processing network results (if any).')

            # Collect completed network results
            # Each result could be valid or failed
            # Result is dict {ok, grab, grab_config_backup, task, emsg}
            results = [(x, False) for x in
                       self.transport.iterate_results()]
            if result_from_cache:
                results.append((result_from_cache, True))

            # Some sleep to avoid thousands of iterations per second.
            # If no results from network transport
            # NOTE(review): `task` may be unbound here on the very first
            # iteration if the load-new-task branch was skipped — confirm.
            if not results:
                # If task queue is empty (or if there are only
                # delayed tasks)
                if task is None or bool(task) == True:
                    # If no network activity
                    if not self.transport.get_active_threads_number():
                        # If parser result queue is empty
                        if not self.parser_pipeline.has_results():
                            # Just sleep some time, do not kill CPU
                            time.sleep(0.1)

            for result, from_cache in results:
                # Persist cacheable fresh (non-cache) responses.
                if not from_cache:
                    if self.is_valid_for_cache(result):
                        with self.timer.log_time('cache'):
                            with self.timer.log_time('cache.write'):
                                self.cache.save_response(
                                    result['task'].url, result['grab'])
                self.log_network_result_stats(result,
                                              from_cache=from_cache)
                if self.is_valid_network_result(result):
                    #handler = self.find_task_handler(result['task'])
                    #self.process_network_result_with_handler(
                    #    result, handler)
                    # MP:
                    # ***
                    self.network_result_queue.put(result)
                else:
                    self.log_failed_network_result(result)
                    # Try to do network request one more time
                    if self.network_try_limit > 0:
                        result['task'].refresh_cache = True
                        result['task'].setup_grab_config(
                            result['grab_config_backup'])
                        self.add_task(result['task'])
                if from_cache:
                    # NOTE(review): uses the loop-external `task`, not
                    # result['task'] — looks like it should be
                    # result['task'].name; confirm.
                    self.stat.inc('spider:task-%s-cache' % task.name)
                self.stat.inc('spider:request')

            # MP:
            # ***
            # Drain parser results produced so far (non-blocking).
            while True:
                try:
                    p_res, p_task = self.parser_pipeline.get_result()
                except queue.Empty:
                    break
                else:
                    self.stat.inc('spider:parser-result')
                    self.process_handler_result(p_res, p_task)

            # Restart dead parser workers unless we are shutting down.
            if not self.shutdown_event.is_set():
                self.parser_pipeline.check_pool_health()

        logger_verbose.debug('Work done')
    except KeyboardInterrupt:
        logger.info('\nGot ^C signal in process %d. Stopping.'
                    % os.getpid())
        self.interrupted = True
        raise
    finally:
        # This code is executed when main cycles is breaked
        self.timer.stop('total')
        self.stat.print_progress_line()
        self.shutdown()

        # Stop HTTP API process
        if http_api_proc:
            http_api_proc.server.shutdown()
            http_api_proc.join()

        # NOTE(review): unguarded — fails if task_queue is still None
        # when `prepare()` raised before setup_queue(); confirm.
        self.task_queue.clear()

        # Stop parser processes
        self.shutdown_event.set()
        self.parser_pipeline.shutdown()
        logger.debug('Main process [pid=%s]: work done' % os.getpid())
def run(self):
    """
    Main method. All work is done here.

    Runs the spider's main event loop: pulls tasks (up to five attempts
    per iteration, re-querying the task generator on misses), dispatches
    them by type (None / NullTask / True / real task), lets the
    multicurl transport process handlers, and consumes completed
    network results — until ``self.work_allowed`` is cleared.
    """
    self.start_timer('total')
    self.transport = MulticurlTransport(self.thread_number)

    try:
        self.setup_default_queue()
        self.prepare()

        # In slave mode the task generator belongs to another process.
        self.start_timer('task_generator')
        if not self.slave:
            self.init_task_generator()
        self.stop_timer('task_generator')

        while self.work_allowed:
            self.start_timer('task_generator')
            if self.task_generator_enabled:
                self.process_task_generator()
            self.stop_timer('task_generator')

            free_threads = self.transport.get_free_threads_number()
            if free_threads:
                logger_verbose.debug(
                    'Transport has free resources (%d). '
                    'Trying to add new task (if exists).' % free_threads)

                # Try five times to get new task and proces task generator
                # because slave parser could agressively consume
                # tasks from task queue
                for x in six.moves.range(5):
                    task = self.load_new_task()
                    if task is None:
                        if not self.transport.active_task_number():
                            self.process_task_generator()
                    elif task is True:
                        # If only delayed tasks in queue
                        break
                    else:
                        # If got some task
                        break

                if not task:
                    if not self.transport.active_task_number():
                        logger_verbose.debug('Network transport has no '
                                             'active tasks')
                        # Nothing queued, nothing in flight, nothing to
                        # generate -> stop the spider.
                        if not self.task_generator_enabled:
                            self.stop()
                    else:
                        logger_verbose.debug(
                            'Transport active tasks: %d' %
                            self.transport.active_task_number())
                elif isinstance(task, NullTask):
                    logger_verbose.debug('Got NullTask')
                    # Honour the NullTask's sleep instruction only when
                    # the transport is idle.
                    if not self.transport.active_task_number():
                        if task.sleep:
                            logger.debug('Got NullTask with sleep '
                                         'instruction. Sleeping for'
                                         ' %.2f seconds' % task.sleep)
                            time.sleep(task.sleep)
                elif isinstance(task, bool) and (task is True):
                    # Take some sleep to not load CPU
                    # (True means: queue holds only delayed tasks.)
                    if not self.transport.active_task_number():
                        time.sleep(0.1)
                else:
                    logger_verbose.debug(
                        'Got new task from task queue: %s' % task)
                    self.process_task_counters(task)

                    is_valid, reason = self.check_task_limits(task)
                    if not is_valid:
                        # Task exceeded one of its try limits: record
                        # the rejection and run the fallback handler.
                        logger_verbose.debug('Task %s is rejected due to '
                                             '%s limit'
                                             % (task.name, reason))
                        if reason == 'task-try-count':
                            self.add_item('task-count-rejected',
                                          task.url)
                        elif reason == 'network-try-count':
                            self.add_item('network-count-rejected',
                                          task.url)
                        else:
                            raise SpiderError('Unknown response from '
                                              'check_task_limits: %s'
                                              % reason)
                        handler = task.get_fallback_handler(self)
                        if handler:
                            handler(task)
                    else:
                        self.process_new_task(task)
                        # NOTE(review): nesting reconstructed — this
                        # extra process_handlers() call appears right
                        # after process_new_task(); confirm against the
                        # original indentation.
                        self.transport.process_handlers()

            with self.save_timer('network_transport'):
                logger_verbose.debug('Asking transport layer to do '
                                     'something')
                self.transport.process_handlers()

            logger_verbose.debug('Processing network results (if any).')

            # Iterate over network trasport ready results
            # Each result could be valid or failed
            # Result format: {ok, grab, grab_config_backup, task, emsg}

            # print '[transport iterate results - start]'
            for result in self.transport.iterate_results():
                # Persist cacheable responses before processing.
                if self.is_valid_for_cache(result):
                    with self.save_timer('cache'):
                        with self.save_timer('cache.write'):
                            self.cache.save_response(
                                result['task'].url, result['grab'])

                # print '[process network results]'
                self.process_network_result(result)
                # print '[done]'
                self.inc_count('request')
            # print '[transport iterate results - end]'

        logger_verbose.debug('Work done')
    except KeyboardInterrupt:
        print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
        self.interrupted = True
        raise
    finally:
        # This code is executed when main cycles is breaked
        self.stop_timer('total')
        self.shutdown()
def run(self):
    """
    Main method. All work is done here.

    Runs the spider's main event loop: periodically records throughput
    snapshots, processes controller commands, pulls tasks (up to five
    attempts per iteration), dispatches them by type (None / NullTask /
    True / real task) with extra "NG" shutdown-event coordination, lets
    the multicurl transport process handlers, and consumes completed
    network results — until ``self.work_allowed`` is cleared.
    """
    self.start_timer('total')
    self.transport = MulticurlTransport(self.thread_number)

    try:
        self.setup_default_queue()
        self.prepare()

        # In slave mode (or NG mode) the task generator is not started
        # by this process.
        self.start_timer('task_generator')
        if not self.slave:
            if not self.ng:
                self.init_task_generator()
        self.stop_timer('task_generator')

        while self.work_allowed:
            # Record a throughput snapshot (deltas of the traffic and
            # request counters) once per snapshot_interval seconds.
            now = int(time.time())
            if now - self.last_snapshot_values[
                    'timestamp'] > self.snapshot_interval:
                snapshot = {'timestamp': now}
                for key in ('download-size', 'upload-size',
                            'download-size-with-cache'):
                    snapshot[key] = self.counters[
                        key] - self.last_snapshot_values[key]
                    self.last_snapshot_values[key] = self.counters[key]

                snapshot['request-count'] = self.counters['request'] -\
                    self.last_snapshot_values['request-count']
                self.last_snapshot_values['request-count'] = self.counters[
                    'request']

                self.last_snapshot_values['timestamp'] = now

                self.snapshots[now] = snapshot
                self.snapshot_timestamps.append(now)

                # Optionally append the snapshot as a JSON line.
                if self.snapshot_file:
                    with open(self.snapshot_file, 'a') as out:
                        out.write(json.dumps(snapshot) + '\n')

            # FIXIT: REMOVE
            # Run update task handler which
            # updates database object which stores
            # info about current scraping process
            if self.dump_spider_stats:
                self.dump_spider_stats(self)

            if self.controller.enabled:
                self.controller.process_commands()

            if not self.ng:  # NG
                self.start_timer('task_generator')  # start generator timing
                if self.task_generator_enabled:
                    self.process_task_generator()
                self.stop_timer('task_generator')

            if self.transport.ready_for_task():
                logger_verbose.debug('Transport has free resources. '
                                     'Trying to add new task (if exists)')

                # Try five times to get new task and proces task generator
                # because slave parser could agressively consume
                # tasks from task queue
                for x in xrange(5):
                    task = self.load_new_task()
                    if task is None:
                        if not self.transport.active_task_number():
                            self.process_task_generator()
                    elif task is True:
                        # If only delayed tasks in queue
                        break
                    else:
                        # If got some task
                        break

                if not task:
                    if not self.transport.active_task_number():
                        logger_verbose.debug('Network transport has no '
                                             'active tasks')
                        # NG
                        if self.ng:
                            # Signal that this worker is idle and obey
                            # the shared shutdown event.
                            self.waiting_shutdown_event.set()
                            if self.shutdown_event.is_set():
                                logger_verbose.debug('Got shutdown signal')
                                self.stop()
                            else:
                                logger_verbose.debug('Shutdown event has'
                                                     ' not been set yet')
                        else:
                            if not self.task_generator_enabled:
                                self.stop()
                    else:
                        logger_verbose.debug(
                            'Transport active tasks: %d' %
                            self.transport.active_task_number())
                elif isinstance(task, NullTask):
                    logger_verbose.debug('Got NullTask')
                    # Honour the NullTask's sleep instruction only when
                    # the transport is idle.
                    if not self.transport.active_task_number():
                        if task.sleep:
                            logger.debug('Got NullTask with sleep '
                                         'instruction. Sleeping for'
                                         ' %.2f seconds' % task.sleep)
                            time.sleep(task.sleep)
                elif isinstance(task, bool) and (task is True):
                    # Only delayed tasks in queue — nothing to do now.
                    pass
                else:
                    # Got a real task: the worker is busy again.
                    if self.ng:
                        if self.waiting_shutdown_event.is_set():
                            self.waiting_shutdown_event.clear()

                    logger_verbose.debug(
                        'Got new task from task queue: %s' % task)
                    self.process_task_counters(task)

                    is_valid, reason = self.check_task_limits(task)
                    if not is_valid:
                        # Task exceeded one of its try limits: record
                        # the rejection and run the fallback handler.
                        logger_verbose.debug('Task %s is rejected due to '
                                             '%s limit'
                                             % (task.name, reason))
                        if reason == 'task-try-count':
                            self.add_item('task-count-rejected',
                                          task.url)
                        elif reason == 'network-try-count':
                            self.add_item('network-count-rejected',
                                          task.url)
                        else:
                            raise Exception('Unknown response from '
                                            'check_task_limits: %s'
                                            % reason)
                        handler = task.get_fallback_handler(self)
                        if handler:
                            handler(task)
                    # TODO: not do following line
                    # TODO: middleware: TaskFails
                    else:
                        self.process_new_task(task)
                        # NOTE(review): nesting reconstructed — this
                        # extra process_handlers() call appears right
                        # after process_new_task(); confirm against the
                        # original indentation.
                        self.transport.process_handlers()

            with self.save_timer('network_transport'):
                logger_verbose.debug('Asking transport layer to do '
                                     'something')
                self.transport.process_handlers()

            logger_verbose.debug('Processing network results (if any).')

            # Iterate over network trasport ready results
            # Each result could be valid or failed
            # Result format: {ok, grab, grab_config_backup, task, emsg}

            #print '[transport iterate results - start]'
            for result in self.transport.iterate_results():
                # Persist cacheable responses before processing.
                if self.is_valid_for_cache(result):
                    with self.save_timer('cache'):
                        with self.save_timer('cache.write'):
                            self.cache.save_response(
                                result['task'].url, result['grab'])

                #print '[process network results]'
                self.process_network_result(result)
                #print '[done]'
                self.inc_count('request')
            #print '[transport iterate results - end]'

        logger_verbose.debug('Work done')
    except KeyboardInterrupt:
        print('\nGot ^C signal. Stopping.')
        raise
    finally:
        # This code is executed when main cycles is breaked
        self.stop_timer('total')
        self.shutdown()