def __init__(self, settings, stats):
    """Build the cache policy and storage objects named in ``settings``.

    :param settings: mapping-like object read via ``settings[key]``.
    :param stats: stats object, stored unchanged on ``self.stats``.
    :raises NotConfigured: when ``HTTPCACHE_ENABLED`` is falsy.
    """
    cache_enabled = settings['HTTPCACHE_ENABLED']
    if not cache_enabled:
        raise NotConfigured

    # Both settings hold object paths; the loaded objects are called with
    # the settings themselves to produce the instances.
    policy_factory = load_object(settings['HTTPCACHE_POLICY'])
    self.policy = policy_factory(settings)

    storage_factory = load_object(settings['HTTPCACHE_STORAGE'])
    self.storage = storage_factory(settings)

    self.ignore_missing = settings['HTTPCACHE_IGNORE_MISSING']
    self.stats = stats
def __init__(self):
    """Load the bundled MIME table and resolve the ``CLASSES`` registry.

    ``self.mimetypes`` is populated from the ``mime.types`` resource and
    ``self.classes`` maps each key of ``self.CLASSES`` to the object loaded
    from the corresponding path.
    """
    self.classes = {}
    self.mimetypes = MimeTypes()

    # The resource is fetched as bytes and decoded as UTF-8 before being
    # handed to the MimeTypes parser through an in-memory text buffer.
    raw_table = get_data('scrapy', 'mime.types')
    self.mimetypes.readfp(StringIO(raw_table.decode('utf8')))

    for mime_key, object_path in six.iteritems(self.CLASSES):
        self.classes[mime_key] = load_object(object_path)
def from_settings(cls, settings, starter=None):
    """Instantiate every configured middleware and build a manager from them.

    For each path returned by ``cls._get_middleware_list_from_settings`` the
    object is loaded and instantiated through, in order of preference,
    ``from_starter(starter)`` (only when ``starter`` is truthy),
    ``from_settings(settings)``, or a no-argument call. Components raising
    ``NotConfigured`` are skipped; a warning is logged only when the
    exception carries arguments.

    :returns: ``cls(*instances)`` with the successfully created components.
    """
    instances = []
    enabled_paths = []

    for path in cls._get_middleware_list_from_settings(settings):
        try:
            component_cls = load_object(path)
            if starter and hasattr(component_cls, 'from_starter'):
                instance = component_cls.from_starter(starter)
            elif hasattr(component_cls, 'from_settings'):
                instance = component_cls.from_settings(settings)
            else:
                instance = component_cls()
        except NotConfigured as exc:
            # A bare NotConfigured disables the component silently.
            if exc.args:
                short_name = path.rsplit('.', 1)[-1]
                logger.warning(
                    "Disabled %(clsname)s: %(eargs)s",
                    {'clsname': short_name, 'eargs': exc.args[0]},
                    extra={'starter': starter},
                )
            continue

        instances.append(instance)
        enabled_paths.append(path)

    logger.info(
        "Enabled %(componentname)ss:\n%(enabledlist)s",
        {
            'componentname': cls.component_name,
            'enabledlist': pprint.pformat(enabled_paths),
        },
        extra={'starter': starter},
    )
    return cls(*instances)
def __init__(self, starter):
    """Create the spider middleware manager and item processor for ``starter``.

    :param starter: object exposing a ``settings`` mapping; it is also passed
        to the ``from_starter`` factories of both components.
    """
    starter_settings = starter.settings

    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_starter(starter)

    # ITEM_PROCESSOR holds an object path; the loaded class is built through
    # its own ``from_starter`` factory.
    item_processor_cls = load_object(starter_settings['ITEM_PROCESSOR'])
    self.itemmw = item_processor_cls.from_starter(starter)

    self.concurrent_items = starter_settings['CONCURRENT_ITEMS']
    self.starter = starter
def __init__(self, starter):
    """Wire the engine to ``starter`` and create its scraper and downloader.

    :param starter: object exposing ``settings`` and ``signals``; it is also
        passed to ``Scraper`` and to the class named by the ``DOWNLOADER``
        setting.
    """
    self.starter = starter
    self.settings = starter.settings
    self.signals = starter.signals

    # Placeholders; nothing in this constructor fills them in.
    self.loop = None
    self.spider = None
    self.scheduler = None
    self.heart = None

    # Components are created in the same order as before: scraper first,
    # then the scheduler class lookup, then the downloader.
    self.scraper = Scraper(starter)
    self.scheduler_class = load_object(self.settings['SCHEDULER'])
    downloader_factory = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_factory(starter)

    # Initial run state.
    self.running = False
    self.paused = False
    self.crawling = []
    self.max = 5  # NOTE(review): meaning of this limit is not visible here
    self.start_time = 0
def _get_spider_loader(settings):
    """Get SpiderLoader instance from settings.

    Loads the class named by ``settings['SPIDER_LOADER_CLASS']`` and checks
    it against ``ISpiderLoader`` with ``verifyClass``. A class that does not
    implement the interface is still used, but a
    ``QuixoteDeprecationWarning`` is issued first.

    :param settings: mapping-like object holding ``SPIDER_LOADER_CLASS``.
    :returns: the result of ``loader_cls.from_settings(settings)``.
    """
    # Imported locally so this function does not depend on a module-level
    # ``warnings`` import being present.
    import warnings

    loader_cls = load_object(settings['SPIDER_LOADER_CLASS'])
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except DoesNotImplement:
        # ``category`` and ``stacklevel`` are ``warnings.warn`` arguments.
        # The previous ``logger.warn`` call would reject ``category`` with a
        # TypeError on a standard logging.Logger instead of warning.
        warnings.warn(
            'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does not fully implement '
            'quixote.spider.spiderloader.ISpiderLoader interface. Please add all missing methods to avoid '
            'unexpected runtime errors.',
            category=QuixoteDeprecationWarning,
            stacklevel=2,
        )
    return loader_cls.from_settings(settings)
def __init__(self, spider_name, project_settings=None, is_check_emmory=False):
    """Assemble settings, spider class, signals, stats and extensions.

    :param spider_name: name handed to the spider loader's
        ``get_spider_by_name``.
    :param project_settings: passed to
        ``Settings.get_dict_from_settings_file``; every key of the returned
        dict overrides the base settings.
    :param is_check_emmory: stored unchanged on ``self.is_check_emmory``.
    """
    from quixote.settings import settings

    # Base settings first, then project-level values layered on top.
    self.settings = Settings(settings).get_settings()
    overrides = Settings.get_dict_from_settings_file(project_settings)
    for key, value in overrides.items():
        self.settings[key] = value

    self.spider_loader = _get_spider_loader(self.settings)
    # NOTE(review): these two prints look like debugging leftovers; kept
    # because removing them would change what the program writes to stdout.
    print(self.spider_loader)
    print(self.settings)

    self.engine_class = load_object(self.settings['ENGINE'])
    self.spider_class = self.spider_loader.get_spider_by_name(spider_name)

    # Stats and extensions receive ``self``, so the attributes above are
    # assigned before they are constructed.
    self.signals = SignalManager()
    stats_cls = load_object(self.settings['STATS_CLASS'])
    self.stats = stats_cls(self)
    self.extensions = ExtensionManager.from_starter(self)

    self.engine = None
    self.spider = None
    self.crawling = False
    self.is_check_emmory = is_check_emmory
    self.start_time = None
def _get_handler(self, scheme):
    """Return the download handler for ``scheme``, creating it on first use.

    Created handlers are cached in ``self._handlers``. Schemes that have no
    entry in ``self._schemes`` or whose handler fails to load are recorded in
    ``self._not_configured`` with a reason, and yield ``None`` from then on.

    :returns: the handler instance, or ``None`` when none is usable.
    """
    if scheme in self._handlers:
        return self._handlers[scheme]

    if scheme in self._not_configured:
        return None

    if scheme not in self._schemes:
        self._not_configured[scheme] = 'no handler available for that scheme'
        return None

    handler_path = self._schemes[scheme]
    try:
        handler_cls = load_object(handler_path)
        handler = handler_cls(self._starter.settings)
    except NotConfigured as exc:
        # Deliberate opt-out by the handler: remember the reason, no log.
        self._not_configured[scheme] = str(exc)
        return None
    except Exception as exc:
        # Any other failure is logged with its traceback and remembered so
        # the load is not retried for this scheme.
        logger.error('Loading "%(class_path)s" for scheme "%(scheme)s"',
                     {"class_path": handler_path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._starter})
        self._not_configured[scheme] = str(exc)
        return None

    self._handlers[scheme] = handler
    return self._handlers[scheme]