def test_frontera_settings_have_precedence_over_crawler_settings():
    """Values from the FRONTERA_SETTINGS module must win over crawler settings.

    The crawler-level dict sets MAX_REQUESTS to 10, but the referenced
    frontera settings module defines it as 5 — the adapter must prefer 5.
    """
    adapter = ScrapySettingsAdapter(
        {
            'MAX_REQUESTS': 10,
            'FRONTERA_SETTINGS': 'tests.scrapy_spider.frontera.settings',
        }
    )
    assert adapter.get('MAX_REQUESTS') == 5
def __init__(self, crawler):
    """Wire up a stats producer on the message bus for this spider partition.

    Raises NotConfigured when the configured message bus exposes no
    stats log, which disables this component cleanly.
    """
    adapted = ScrapySettingsAdapter(crawler.settings)
    self.partition_id = adapted.get('SPIDER_PARTITION_ID')
    # XXX this can be improved later by reusing spider's producer
    # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer)
    # but the topic is hard-coded in the current scheme, so it requires some
    # preliminary changes in Frontera itself.
    bus = load_object(adapted.get('MESSAGE_BUS'))(adapted)
    log_channel = bus.stats_log()
    if not log_channel:
        raise NotConfigured
    self.stats_producer = log_channel.producer()
    self._stats_interval = adapted.get('STATS_LOG_INTERVAL', 60)
    encoder_path = adapted.get('MESSAGE_BUS_CODEC') + ".Encoder"
    # no need to encode requests, so the codec gets no request model
    self._stats_encoder = load_object(encoder_path)(request_model=None)
    self._export_stats_task = None
def __init__(self, crawler):
    """Build the scheduler state: stats, pending queue, and frontier manager."""
    self.crawler = crawler
    self.stats_manager = StatsManager(crawler.stats)
    # requests handed to Scrapy but not yet consumed by the engine
    self._pending_requests = deque()
    self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
    adapted = ScrapySettingsAdapter(crawler.settings)
    self.frontier = ScrapyFrontierManager(adapted)
    # polling backoff comes from the frontier's own merged settings
    frontier_settings = self.frontier.manager.settings
    self._delay_on_empty = frontier_settings.get('DELAY_ON_EMPTY')
    self._delay_next_call = 0.0
def __init__(self, crawler, manager=None):
    """Build the scheduler state, optionally with a pre-built frontier manager.

    :param crawler: the Scrapy crawler this scheduler serves.
    :param manager: optional frontier manager to reuse instead of creating one.
    """
    self.crawler = crawler
    self.stats_manager = StatsManager(crawler.stats)
    # requests handed to Scrapy but not yet consumed by the engine
    self._pending_requests = deque()
    self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
    adapted = ScrapySettingsAdapter(crawler.settings)
    self.frontier = ScrapyFrontierManager(adapted, manager)
    # polling backoff comes from the frontier's own merged settings
    frontier_settings = self.frontier.manager.settings
    self._delay_on_empty = frontier_settings.get('DELAY_ON_EMPTY')
    self._delay_next_call = 0.0
    self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')
def open(self, spider):
    """Open the scheduler for *spider*: merge settings and start the frontier.

    Settings precedence (lowest to highest): crawler settings, the spider's
    ``frontera_settings`` dict, then its ``frontera_settings_json`` string.
    """
    super(FronteraScheduler, self).open(spider)

    merged = ScrapySettingsAdapter(spider.crawler.settings)
    merged.set_from_dict(getattr(spider, 'frontera_settings', {}))
    json_overrides = json.loads(getattr(spider, 'frontera_settings_json', '{}'))
    merged.set_from_dict(json_overrides)
    merged.set('STATS_MANAGER', self.stats)

    self.frontier = ScrapyFrontierManager(merged)
    self.frontier.set_spider(spider)

    # Optionally feed the spider's own start requests into the frontier
    # instead of letting Scrapy schedule them directly.
    crawler_settings = self.crawler.settings
    if crawler_settings.getbool('FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER'):
        self.frontier.add_seeds(spider.start_requests())
    self.frontier_requests_callbacks = crawler_settings.getlist(
        'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER')

    LOG.info('Starting frontier')
    if not self.frontier.manager.auto_start:
        self.frontier.start()
def __init__(self, crawler):
    """Create a message-bus stats producer bound to this spider partition.

    Raises NotConfigured if the message bus does not provide a stats log,
    so the component is simply skipped.
    """
    adapted = ScrapySettingsAdapter(crawler.settings)
    self.partition_id = adapted.get('SPIDER_PARTITION_ID')
    # XXX this can be improved later by reusing spider's producer
    # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer)
    # but the topic is hard-coded in the current scheme, so it requires some
    # preliminary changes in Frontera itself.
    bus = load_object(adapted.get('MESSAGE_BUS'))(adapted)
    log_channel = bus.stats_log()
    if not log_channel:
        raise NotConfigured
    self.stats_producer = log_channel.producer()
    self._stats_interval = adapted.get('STATS_LOG_INTERVAL', 60)
    encoder_path = adapted.get('MESSAGE_BUS_CODEC') + ".Encoder"
    # no need to encode requests, so the codec gets no request model
    self._stats_encoder = load_object(encoder_path)(request_model=None)
    self._export_stats_task = None
def test_fallsback_to_crawler_settings():
    """A key absent from frontera settings falls back to the crawler value."""
    adapter = ScrapySettingsAdapter({'DELAY_ON_EMPTY': 10})
    assert adapter.get('DELAY_ON_EMPTY') == 10
def test_frontera_settings_have_precedence_over_crawler_settings():
    """The FRONTERA_SETTINGS module overrides same-named crawler settings.

    MAX_REQUESTS is 10 at the crawler level but 5 in the frontera settings
    module; the adapter must return the frontera value.
    """
    overrides = {
        'MAX_REQUESTS': 10,
        'FRONTERA_SETTINGS': 'tests.scrapy_spider.frontera.settings',
    }
    adapter = ScrapySettingsAdapter(overrides)
    assert adapter.get('MAX_REQUESTS') == 5