Exemplo n.º 1
0
    def __init__(self, engine):
        """Build the cache policy and storage and hook into engine signals.

        The policy and storage classes are looked up with ``load_object``
        from the ``HTTP_CACHE_POLICY`` and ``HTTP_CACHE_STORAGE`` settings.
        The policy is constructed from the settings object, the storage
        from the engine itself.
        """
        settings = engine.settings

        policy_cls = load_object(settings['HTTP_CACHE_POLICY'])
        self.policy = policy_cls(settings)

        storage_cls = load_object(settings['HTTP_CACHE_STORAGE'])
        self.storage = storage_cls(engine)

        self.ignore_missing = settings.get_bool('HTTP_CACHE_IGNORE_MISSING')
        self.stats = engine.stats

        # Register this object's handlers for the engine start/stop signals.
        engine.signals.connect(
            self.engine_started, signal=signals.engine_started)
        engine.signals.connect(
            self.engine_stopped, signal=signals.engine_stopped)
Exemplo n.º 2
0
    def _get_middlewares(self, mw_classes):
        """Instantiate the enabled middleware classes and return the instances.

        When ``mw_classes`` is None the classes are loaded from the paths
        returned by ``self._get_mwlist()``; otherwise ``mw_classes`` is used
        as given. The resulting class list is stored on ``self.mwlist``.

        A class is skipped (and a debug message logged) when its enabling
        setting is false or when anything in its setup raises
        ``NotConfigured``.
        """
        if mw_classes is not None:
            classes = mw_classes
        else:
            classes = [load_object(path) for path in self._get_mwlist()]
        self.mwlist = classes

        instances = []
        for cls in classes:
            try:
                # The class may name its own switch; otherwise the setting
                # name is derived from the class name.
                if hasattr(cls, 'enabled_setting'):
                    setting_name = cls.enabled_setting
                else:
                    setting_name = ('%s_ENABLED' %
                                    camelcase_to_capital(cls.__name__))

                is_enabled = self.settings.get_bool(setting_name, True)
                if not is_enabled:
                    raise NotConfigured()

                instance = cls(self.engine)
                # Record which setting controls this instance.
                instance.enabled_setting = setting_name
                instances.append(instance)
            except NotConfigured as exc:
                log.msg(format='Disabled %(clsname)s: %(error)s',
                        level=log.DEBUG, clsname=cls, error=exc)

        enabled_names = ', '.join(
            [instance.__class__.__name__ for instance in instances])
        log.msg(format='Enabled %(componentname)ss: %(enabledlist)s',
                level=log.DEBUG,
                componentname=self.component_name,
                enabledlist=enabled_names)
        return instances
Exemplo n.º 3
0
 def __init__(self, settings):
     """Instantiate a download handler for every configured scheme.

     Reads the ``DOWNLOAD_HANDLERS`` setting (a mapping of scheme to a
     class path; an empty mapping if unset), loads each class with
     ``load_object`` and constructs it with ``settings``.

     Handlers that construct successfully are stored in
     ``self._handlers`` keyed by scheme. When a constructor raises
     ``NotConfigured``, the error text is stored in
     ``self._not_configured`` under the same scheme instead.
     Errors raised by ``load_object`` itself are not caught here.
     """
     self._handlers = {}
     self._not_configured = {}
     handlers = settings.get('DOWNLOAD_HANDLERS', {})
     # ``items()`` rather than the Python 2-only ``iteritems()``, so the
     # loop works on both Python 2 and Python 3 mappings.
     for scheme, clspath in handlers.items():
         cls = load_object(clspath)
         try:
             dh = cls(settings)
         except NotConfigured as e:
             # Remember why the handler is unavailable for this scheme.
             self._not_configured[scheme] = str(e)
         else:
             self._handlers[scheme] = dh
Exemplo n.º 4
0
 def __init__(self, settings):
     """Create the download handlers listed in ``DOWNLOAD_HANDLERS``.

     The setting is read as a mapping of scheme to class path (empty
     when the setting is absent). Each class is resolved with
     ``load_object`` and instantiated with ``settings``.

     Successfully built handlers go into ``self._handlers``; for a
     handler whose constructor raises ``NotConfigured`` the message of
     that exception is kept in ``self._not_configured``. Both mappings
     are keyed by scheme. Failures inside ``load_object`` propagate.
     """
     self._handlers = {}
     self._not_configured = {}
     handlers = settings.get('DOWNLOAD_HANDLERS', {})
     # ``dict.iteritems()`` does not exist on Python 3; ``items()`` is
     # available on both major versions and iterates the same pairs.
     for scheme, clspath in handlers.items():
         cls = load_object(clspath)
         try:
             dh = cls(settings)
         except NotConfigured as e:
             # Keep the reason so it can be reported for this scheme.
             self._not_configured[scheme] = str(e)
         else:
             self._handlers[scheme] = dh
Exemplo n.º 5
0
    def setup(self):
        """Create the engine's components and attach the engine to the spider.

        In order, this starts logging (only when ``LOG_ENABLED`` is true),
        then creates the signal manager, the stats collector, the request
        and response queues, the downloader, the extension manager and the
        pipeline manager. Finally it marks the engine as initialized and
        hands itself to the spider via ``spider.set_engine``.

        Raises:
            AssertionError: if ``self.spider`` is None.
        """
        assert self.spider is not None, 'Spider is not set in Engine.'

        # IMPORTANT: order of the following initializations is very important
        # so please, think twice about any changes to it
        # NOTE(review): the managers below receive the engine itself, so they
        # presumably read attributes assigned earlier (signals, stats, ...).

        # initialize logging
        if self.settings.get_bool('LOG_ENABLED'):
            log.start(
                self.settings['LOG_FILE'],
                self.settings['LOG_LEVEL'],
                self.settings['LOG_STDOUT'],
                self.settings['LOG_ENCODING'])

        # initialize signals
        self.signals = SignalManager(self)

        # initialize stats (class is taken from the STATS_CLASS setting)
        stats_cls = load_object(self.settings.get('STATS_CLASS'))
        self.stats = stats_cls(self)

        # initialize downloader
        # the priority queue is given a factory that returns a new
        # MemoryQueue each time it is called
        self.request_queue = PriorityQueue(lambda _: MemoryQueue())
        self.response_queue = ResponseQueue(
            self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
        self.downloader = Downloader(self.settings, self.request_queue,
                                     self.response_queue, clock=self.clock)

        # initialize extensions
        self.extensions = ExtensionManager(self)
        # initialize downloader pipeline
        self.pipeline = PipelineManager(self)

        self.initialized = True

        # now that everything is ready, set the spider's engine
        self.spider.set_engine(self)
Exemplo n.º 6
0
    def setup(self):
        """Initialize all engine components, then bind the engine to the spider.

        Steps, in the order they run: start logging if ``LOG_ENABLED`` is
        set, create the signal manager, create the stats collector from the
        ``STATS_CLASS`` setting, create the request/response queues and the
        downloader, create the extension and pipeline managers, set
        ``self.initialized`` and call ``self.spider.set_engine(self)``.

        Raises:
            AssertionError: if ``self.spider`` is None.
        """
        assert self.spider is not None, 'Spider is not set in Engine.'

        # IMPORTANT: order of the following initializations is very important
        # so please, think twice about any changes to it
        # NOTE(review): later components are constructed with the engine as
        # their argument and presumably depend on what is already set on it.

        # initialize logging
        if self.settings.get_bool('LOG_ENABLED'):
            log.start(self.settings['LOG_FILE'], self.settings['LOG_LEVEL'],
                      self.settings['LOG_STDOUT'],
                      self.settings['LOG_ENCODING'])

        # initialize signals
        self.signals = SignalManager(self)

        # initialize stats (class is taken from the STATS_CLASS setting)
        stats_cls = load_object(self.settings.get('STATS_CLASS'))
        self.stats = stats_cls(self)

        # initialize downloader
        # the lambda is a queue factory: every call yields a new MemoryQueue
        self.request_queue = PriorityQueue(lambda _: MemoryQueue())
        self.response_queue = ResponseQueue(
            self.settings.get_int('RESPONSE_ACTIVE_SIZE_LIMIT'))
        self.downloader = Downloader(self.settings,
                                     self.request_queue,
                                     self.response_queue,
                                     clock=self.clock)

        # initialize extensions
        self.extensions = ExtensionManager(self)
        # initialize downloader pipeline
        self.pipeline = PipelineManager(self)

        self.initialized = True

        # now that everything is ready, set the spider's engine
        self.spider.set_engine(self)
Exemplo n.º 7
0
 def test_load_object(self):
     """load_object resolves a valid path and raises on invalid ones."""
     # A valid path returns the very object it names.
     resolved = load_object('crawlmi.utils.misc.load_object')
     self.assertIs(resolved, load_object)

     # A module that cannot be imported.
     with self.assertRaises(ImportError):
         load_object('nomodule999.mod.function')

     # An importable module that lacks the requested name.
     with self.assertRaises(NameError):
         load_object('crawlmi.utils.misc.load_object999')
Exemplo n.º 8
0
 def test_load_object(self):
     """Check load_object on one good path and two broken paths."""
     target_path = 'crawlmi.utils.misc.load_object'
     self.assertIs(load_object(target_path), load_object)

     # Unknown module -> ImportError.
     with self.assertRaises(ImportError):
         load_object('nomodule999.mod.function')

     # Known module, unknown attribute -> NameError.
     with self.assertRaises(NameError):
         load_object('crawlmi.utils.misc.load_object999')