# NOTE(review): this def is byte-identical to RequestEngine.__init__ defined
# below — it looks like an orphaned duplicate left at module level (or the tail
# of a class whose header is outside this view).  Confirm and remove one copy.
def __init__(self, pool_size = 20, pop_interval = 1, request_interval = 0, max_empty_retry = 2, request_timeout = 10, each_size_from_queue = 10, max_failure_allowed = -1):
    # Monkey-patch blocking stdlib calls (sockets, sleep, ...) so they
    # cooperate with gevent greenlets instead of blocking the whole process.
    from gevent import monkey
    monkey.patch_all()
    self.pop_interval = pop_interval              # seconds to sleep between queue polls
    self.request_interval = request_interval      # delay after spawning a batch of workers
    self.pool = Pool(pool_size)                   # greenlet worker pool
    self.quit_event = Event()                     # set() => main loop drains and exits
    self.max_empty_retry = max_empty_retry        # -1 disables the empty-queue cap
    self.request_timeout = request_timeout        # default per-request timeout (seconds)
    self.each_size_from_queue = each_size_from_queue  # max requests popped per poll
    self.user_agent_provider = UserAgentProvider()
    self.max_failure_allowed = max_failure_allowed    # -1 disables the failure cap
    self._request_failure = 0                     # running count of failed requests
    self.proxy_provider = None                    # optional, see setup_proxy_provider
    self.processor_manager = RequestEngine.ProcessorManager()
    # NOTE(review): these list attributes shadow the before_each/after_each
    # *methods* on RequestEngine, making the methods uncallable — confirm intent.
    self.before_each = []
    self.after_each = []
    # Quit gracefully on common termination signals.
    gevent.signal(signal.SIGINT, self.quit)
    gevent.signal(signal.SIGQUIT, self.quit)
    gevent.signal(signal.SIGTERM, self.quit)
class RequestEngine:
    """Gevent-based request engine.

    Pops request descriptors from a queue, executes them concurrently in a
    greenlet pool, and routes each one through optional user-registered
    "before" and "after" processors.  Stops gracefully on SIGINT/SIGQUIT/
    SIGTERM or when failure / empty-queue limits are exceeded.
    """

    class ProcessorManager(object):
        """Maps processor names to callables and dispatches calls to them."""

        def __init__(self):
            # 'default' is pre-registered (as None) so route(None) never
            # raises KeyError even before any processor is set.
            self._processor_map = {'default': None}

        def set(self, processor_name, value):
            """Register (or replace) the processor stored under *processor_name*."""
            self._processor_map[processor_name] = value

        def route(self, processor_name, **kwargs):
            """Invoke the processor registered under *processor_name*.

            A name of None maps to 'default'.  Returns the processor's return
            value, or None when no callable is registered under that name.
            Raises KeyError for a name that was never set (matches the
            original lookup behavior).
            """
            name = 'default' if processor_name is None else processor_name
            processor = self._processor_map[name]
            if processor is not None and callable(processor):
                return processor(**kwargs)
            return None

    def __init__(self, pool_size=20, pop_interval=1, request_interval=0,
                 max_empty_retry=2, request_timeout=10,
                 each_size_from_queue=10, max_failure_allowed=-1):
        """Configure the engine.  -1 disables max_empty_retry / max_failure_allowed."""
        # Monkey-patch blocking stdlib calls so they cooperate with greenlets.
        from gevent import monkey
        monkey.patch_all()
        self.pop_interval = pop_interval              # sleep between queue polls (s)
        self.request_interval = request_interval      # delay after spawning a batch (s)
        self.pool = Pool(pool_size)                   # greenlet worker pool
        self.quit_event = Event()                     # set() => drain workers and exit
        self.max_empty_retry = max_empty_retry
        self.request_timeout = request_timeout        # default per-request timeout (s)
        self.each_size_from_queue = each_size_from_queue  # max pops per poll
        self.user_agent_provider = UserAgentProvider()
        self.max_failure_allowed = max_failure_allowed
        self._request_failure = 0                     # running failure count
        self.proxy_provider = None
        self.processor_manager = RequestEngine.ProcessorManager()
        # BUGFIX: these were named before_each/after_each, which shadowed the
        # methods of the same name and made engine.before_each(...) raise
        # TypeError ('list' object is not callable).
        self._before_each = []
        self._after_each = []
        # Quit gracefully on common termination signals.
        gevent.signal(signal.SIGINT, self.quit)
        gevent.signal(signal.SIGQUIT, self.quit)
        gevent.signal(signal.SIGTERM, self.quit)

    def setup_request_queue(self, request_queue_ins):
        """Attach the queue object the main loop pops requests from."""
        self.request_queue = request_queue_ins

    @property
    def active(self):
        """True while the main request loop is running (defaults to False)."""
        if not hasattr(self, '_active'):
            self._active = False
        return self._active

    @active.setter
    def active(self, value):
        self._active = value

    def before_each(self, *processors):
        """Register processor names/callables to run before every request."""
        self._before_each.extend(processors)

    def after_each(self, *processors):
        """Register processor names/callables to run after every request."""
        self._after_each.extend(processors)

    def worker_count(self):
        """Number of currently busy greenlets in the pool."""
        return self.pool.size - self.pool.free_count()

    def quit(self):
        """Signal the main loop to drain workers and exit gracefully."""
        self.quit_event.set()

    def request(self, override_req_args=None):
        """Main loop: pop batches of requests and execute them in the pool.

        override_req_args: dict merged into every request's kwargs.
        Blocks until quit() is called (or a failure/empty limit triggers it).
        """
        # BUGFIX: the default was a shared mutable dict ({}).
        if override_req_args is None:
            override_req_args = {}
        self.active = True
        empty_count = 0
        while True:
            if self.quit_event.is_set():
                logger.warning("Quiting Engine")
                # Wait until all in-flight workers finish before exiting.
                if self.pool.size != self.pool.free_count():
                    time.sleep(1)
                    continue
                self.active = False
                logger.warning("Engine Gracefully Quit")
                break
            if (self.max_failure_allowed != -1
                    and self._request_failure >= self.max_failure_allowed):
                logger.warning("Exceed Max Failures Count. Engine Stopping ...")
                self.quit()
                continue
            # Pop at most each_size_from_queue, bounded by free workers.
            this_time_size = min(self.pool.free_count(), self.each_size_from_queue)
            if this_time_size > 0:
                reqs = self.request_queue.pop(this_time_size)
                logger.info('Current free workers: ' + str(self.pool.free_count()))
                if reqs:
                    for req in reqs:
                        self.pool.spawn(self._make_requests, request=req,
                                        override=override_req_args)
                    time.sleep(self.request_interval)
                else:
                    empty_count += 1
                    if (self.max_empty_retry != -1
                            and empty_count >= self.max_empty_retry):
                        logger.warning("Exceed Max Empty. Engine Stopping ...")
                        self.quit()
                        continue
            time.sleep(self.pop_interval)

    def setup_user_agent_provider(self, provider):
        """Replace the User-Agent provider (must expose provide())."""
        self.user_agent_provider = provider

    def setup_proxy_provider(self, provider):
        """Attach a proxy provider (must expose provide() and callback())."""
        self.proxy_provider = provider

    def register_processor(self, processor, name='default'):
        """Register *processor* under *name* with the ProcessorManager."""
        self.processor_manager.set(name, processor)

    def _make_requests(self, request, override):
        """Worker body (runs in a pool greenlet): one request, with hooks.

        Flow: inject UA/proxy -> 'before' hook(s) -> HTTP request (unless the
        before hook returned False) -> 'after' hook(s) + proxy callback.
        Each failing stage increments the failure counter at most once.
        """
        data = {}  # scratch dict shared by all hooks of this request
        is_failure_set = False
        request.kwargs.update(override)
        # Inject a User-Agent header from the provider.
        if self.user_agent_provider:
            ua_header = {'User-Agent': self.user_agent_provider.provide()}
            if 'headers' in request.kwargs:
                request.kwargs['headers'].update(ua_header)
            else:
                request.kwargs['headers'] = ua_header
        # Inject proxies from the provider; None from provide() means no proxy.
        proxy = None
        if self.proxy_provider:
            proxy = self.proxy_provider.provide()
            if proxy is not None:
                _proxy = {'http': proxy.proxy, 'https': proxy.proxy}
                if 'proxies' in request.kwargs:
                    request.kwargs['proxies'].update(_proxy)
                else:
                    request.kwargs['proxies'] = _proxy
                logger.warning("Using Proxy: %s" % str(_proxy))
            else:
                logger.warning("No Using Proxy")
        ar = None
        result = False
        processors = {'before': None, 'after': None}
        if request.processors is not None:
            processors.update(request.processors)
        before_each_hook_result = None
        # --- before hooks ---
        try:
            logger.info("Executing before hook")
            before_each_hook_result = self.processor_manager.route(
                processor_name=processors['before'], request=request,
                extra=request.raw_info, data=data)
            for p in self._before_each:
                self.processor_manager.route(
                    processor_name=p, request=request,
                    extra=request.raw_info, data=data)
        except Exception:  # was a bare except; narrowed
            if not is_failure_set:
                self._request_failure += 1
                is_failure_set = True
            logger.error("Exception while before hook execution: "
                         + traceback.format_exc())
        # --- HTTP request (skipped when the before hook explicitly returns False) ---
        if before_each_hook_result is not False:
            try:
                logger.debug("Making request... (%s)" % str(request.kwargs))
                # Per-request timeout override via raw_info._timeout.
                _timeout = getattr(request.raw_info, '_timeout', self.request_timeout)
                logger.debug("Timeout setting: %s" % _timeout)
                with gevent.Timeout(_timeout):
                    ar = requests.request(**request.kwargs)
                    ar.raw_info = request.raw_info
                    result = True
            # gevent.Timeout subclasses BaseException, so it must be named
            # explicitly (the original bare except caught it implicitly).
            except (gevent.Timeout, Exception):
                if not is_failure_set:
                    self._request_failure += 1
                    is_failure_set = True
                logger.error("Exception while requests execution: "
                             + traceback.format_exc())
        # --- after hooks + proxy feedback ---
        try:
            logger.info("Executing after hook")
            self.processor_manager.route(
                processor_name=processors['after'], response=ar, request=request,
                extra=request.raw_info, result=result, data=data)
            for p in self._after_each:
                self.processor_manager.route(
                    processor_name=p, response=ar, request=request,
                    extra=request.raw_info, result=result, data=data)
            # Report the outcome back to the proxy provider.
            if proxy:
                self.proxy_provider.callback(proxy, result=result,
                                             response=ar, request=request)
        except Exception:  # was a bare except; narrowed
            if not is_failure_set:
                self._request_failure += 1
                is_failure_set = True
            logger.error("Exception while after hook execution", exc_info=True)
# ---- Engine wiring (runs at module import time) ----------------------------
# NOTE(review): relies on names defined earlier in this file / its imports
# (settings, request_engine, env, logger, CustomProxyProvider,
# UserAgentProvider, importlib, glob, os) — not visible in this chunk.

# Proxy provider: settings.PROXY_PROVIDER is a (module_path, class_name) pair,
# instantiated with settings.PROXY_PROVIDER_ARGUMENTS; falls back to the local
# CustomProxyProvider when the setting is absent.
if hasattr(settings, 'PROXY_PROVIDER'):
    proxy_provider = getattr(
        importlib.import_module(settings.PROXY_PROVIDER[0]),
        settings.PROXY_PROVIDER[1])
    request_engine.setup_proxy_provider(
        proxy_provider(**settings.PROXY_PROVIDER_ARGUMENTS))
    logger.info('Set up proxy provider')
else:
    request_engine.setup_proxy_provider(CustomProxyProvider())

# User-Agent provider: same (module_path, class_name) convention, but
# instantiated with no arguments; defaults to UserAgentProvider.
if hasattr(settings, 'UA_PROVIDER'):
    ua_provider = getattr(importlib.import_module(settings.UA_PROVIDER[0]),
                          settings.UA_PROVIDER[1])
    request_engine.setup_user_agent_provider(ua_provider())
    logger.info('Set up UA provider')
else:
    request_engine.setup_user_agent_provider(UserAgentProvider())

request_engine.setup_request_queue(env.request_queue)
env.downloader = request_engine  # expose the engine to the rest of the app

# Processors: import every module under app/processors so their module-level
# registration side effects run.  NOTE(review): the glob also picks up
# __init__.py, importing app.processors.__init__ redundantly — presumably
# harmless, but worth confirming.
processors = importlib.import_module('app.processors')
for f in glob.glob(os.path.dirname(processors.__file__) + "/*.py"):
    __import__('app.processors.' + os.path.basename(f)[:-3])


def start():
    # Blocks the caller: runs the engine's main loop until quit() fires.
    env.downloader.request()