def __init__(self, engine):
    settings = engine.settings
    # instantiate the cache policy and storage backend configured in settings
    self.policy = load_object(settings['HTTP_CACHE_POLICY'])(settings)
    self.storage = load_object(settings['HTTP_CACHE_STORAGE'])(engine)
    self.ignore_missing = settings.get_bool('HTTP_CACHE_IGNORE_MISSING')
    self.stats = engine.stats
    # hook into the engine's start/stop signals
    engine.signals.connect(self.engine_started, signal=signals.engine_started)
    engine.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
def _get_middlewares(self, mw_classes):
    if mw_classes is None:
        mwlist = []
        for clspath in self._get_mwlist():
            mwlist.append(load_object(clspath))
    else:
        mwlist = mw_classes
    self.mwlist = mwlist

    middlewares = []
    for mwcls in mwlist:
        try:
            # middlewares disabled through enabled_setting
            if hasattr(mwcls, 'enabled_setting'):
                enabled_setting = mwcls.enabled_setting
            else:
                enabled_setting = ('%s_ENABLED' %
                                   camelcase_to_capital(mwcls.__name__))
            if not self.settings.get_bool(enabled_setting, True):
                raise NotConfigured()
            mw = mwcls(self.engine)
            mw.enabled_setting = enabled_setting
            middlewares.append(mw)
        except NotConfigured as e:
            log.msg(format='Disabled %(clsname)s: %(error)s',
                    level=log.DEBUG, clsname=mwcls, error=e)

    enabled = [x.__class__.__name__ for x in middlewares]
    log.msg(format='Enabled %(componentname)ss: %(enabledlist)s',
            level=log.DEBUG, componentname=self.component_name,
            enabledlist=', '.join(enabled))
    return middlewares
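# A minimal sketch (an assumption, not the project's actual helper) of the
# behavior `camelcase_to_capital` is relied on for above: turning a middleware
# class name such as `HttpCacheMiddleware` into the capitalized settings
# prefix `HTTP_CACHE_MIDDLEWARE`, so the default enabled flag would be
# `HTTP_CACHE_MIDDLEWARE_ENABLED`. The function name is hypothetical.
import re

def camelcase_to_capital_sketch(name):
    # insert an underscore before each capital letter that follows a
    # lowercase letter or digit, then upper-case the whole string
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name).upper()

# camelcase_to_capital_sketch('HttpCacheMiddleware') -> 'HTTP_CACHE_MIDDLEWARE'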
def __init__(self, settings):
    self._handlers = {}
    self._not_configured = {}
    handlers = settings.get('DOWNLOAD_HANDLERS', {})
    for scheme, clspath in handlers.iteritems():
        cls = load_object(clspath)
        try:
            dh = cls(settings)
        except NotConfigured as e:
            # remember why the handler for this scheme is unavailable
            self._not_configured[scheme] = str(e)
        else:
            self._handlers[scheme] = dh
def setup(self):
    assert self.spider is not None, 'Spider is not set in Engine.'
    # IMPORTANT: order of the following initializations is very important
    # so please, think twice about any changes to it

    # initialize logging
    if self.settings.get_bool('LOG_ENABLED'):
        log.start(self.settings['LOG_FILE'],
                  self.settings['LOG_LEVEL'],
                  self.settings['LOG_STDOUT'],
                  self.settings['LOG_ENCODING'])
    # initialize signals
    self.signals = SignalManager(self)
    # initialize stats
    stats_cls = load_object(self.settings.get('STATS_CLASS'))
    self.stats = stats_cls(self)
    # initialize downloader
    self.request_queue = PriorityQueue(lambda _: MemoryQueue())
    self.response_queue = ResponseQueue(
        self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
    self.downloader = Downloader(self.settings, self.request_queue,
                                 self.response_queue, clock=self.clock)
    # initialize extensions
    self.extensions = ExtensionManager(self)
    # initialize downloader pipeline
    self.pipeline = PipelineManager(self)

    self.initialized = True
    # now that everything is ready, set the spider's engine
    self.spider.set_engine(self)
def test_load_object(self):
    obj = load_object('crawlmi.utils.misc.load_object')
    self.assertIs(obj, load_object)
    self.assertRaises(ImportError, load_object, 'nomodule999.mod.function')
    self.assertRaises(NameError, load_object,
                      'crawlmi.utils.misc.load_object999')
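# A minimal sketch (an assumption, not crawlmi's actual implementation) of a
# `load_object` that satisfies the contract exercised by the test above:
# a missing module raises ImportError, while a missing attribute on an
# existing module raises NameError. The function name is hypothetical.
from importlib import import_module

def load_object_sketch(path):
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)  # ImportError if the module is missing
    try:
        return getattr(module, name)
    except AttributeError:
        raise NameError("Module %r has no attribute %r" % (module_path, name))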