Example No. 1
def test_frontera_settings_have_precedence_over_crawler_settings():
    """Values from the FRONTERA_SETTINGS module must win over crawler settings."""
    # MAX_REQUESTS is 10 in the crawler config, but the referenced frontera
    # settings module defines it as 5 — the adapter must return 5.
    settings = ScrapySettingsAdapter({
        'MAX_REQUESTS': 10,
        'FRONTERA_SETTINGS': 'tests.scrapy_spider.frontera.settings',
    })
    assert settings.get('MAX_REQUESTS') == 5
Example No. 2
 def __init__(self, crawler):
     """Build a stats exporter wired to the crawler's configured message bus.

     Raises NotConfigured when the message bus provides no stats log.
     """
     settings = ScrapySettingsAdapter(crawler.settings)
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     # XXX this can be improved later by reusing spider's producer
     # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer)
     # but the topic is hard-coded in the current scheme, so it requires some
     # preliminary changes in Frontera itself.
     bus_cls = load_object(settings.get('MESSAGE_BUS'))
     stats_log = bus_cls(settings).stats_log()
     if not stats_log:
         raise NotConfigured
     self.stats_producer = stats_log.producer()
     # How often (seconds) stats are exported; defaults to one minute.
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     encoder_cls = load_object(settings.get('MESSAGE_BUS_CODEC') + ".Encoder")
     # Only stats are encoded here, so no request model is required.
     self._stats_encoder = encoder_cls(request_model=None)
     self._export_stats_task = None
Example No. 3
 def __init__(self, crawler):
     """Attach the scheduler to *crawler* and create its frontier manager."""
     self.crawler = crawler
     self.stats_manager = StatsManager(crawler.stats)
     self._pending_requests = deque()
     self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
     frontera_settings = ScrapySettingsAdapter(crawler.settings)
     self.frontier = ScrapyFrontierManager(frontera_settings)
     # Idle-poll delay comes from the frontier manager's own settings.
     self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
     self._delay_next_call = 0.0
Example No. 4
 def __init__(self, crawler, manager=None):
     """Attach the scheduler to *crawler*; *manager* optionally overrides the
     frontier manager used by ScrapyFrontierManager."""
     self.crawler = crawler
     self.stats_manager = StatsManager(crawler.stats)
     self._pending_requests = deque()
     self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
     frontera_settings = ScrapySettingsAdapter(crawler.settings)
     self.frontier = ScrapyFrontierManager(frontera_settings, manager)
     # Idle-poll delay comes from the frontier manager's own settings.
     self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
     self._delay_next_call = 0.0
     self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')
Example No. 5
    def open(self, spider):
        """Start the frontier for *spider*, layering per-spider settings on top.

        Settings precedence (lowest to highest): crawler settings, the
        spider's ``frontera_settings`` dict, then ``frontera_settings_json``.
        """
        super(FronteraScheduler, self).open(spider)

        settings = ScrapySettingsAdapter(spider.crawler.settings)
        settings.set_from_dict(getattr(spider, 'frontera_settings', {}))
        settings.set_from_dict(
            json.loads(getattr(spider, 'frontera_settings_json', '{}')))
        settings.set('STATS_MANAGER', self.stats)

        self.frontier = ScrapyFrontierManager(settings)
        self.frontier.set_spider(spider)

        # Optionally route the spider's start requests through the frontier
        # as seeds instead of scheduling them directly.
        if self.crawler.settings.getbool(
                'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER'):
            self.frontier.add_seeds(spider.start_requests())

        self.frontier_requests_callbacks = self.crawler.settings.getlist(
            'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER')

        LOG.info('Starting frontier')
        if not self.frontier.manager.auto_start:
            self.frontier.start()
Example No. 6
 def __init__(self, crawler):
     """Build a stats exporter bound to the crawler's configured message bus.

     Raises NotConfigured when the message bus exposes no stats log.
     """
     settings = ScrapySettingsAdapter(crawler.settings)
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     # XXX this can be improved later by reusing spider's producer
     # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer)
     # but the topic is hard-coded in the current scheme, so it requires some
     # preliminary changes in Frontera itself.
     message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
     stats_log = message_bus.stats_log()
     if not stats_log:
         raise NotConfigured
     self.stats_producer = stats_log.producer()
     # Export interval in seconds; defaults to one minute when unset.
     self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
     codec_path = settings.get('MESSAGE_BUS_CODEC')
     encoder_cls = load_object(codec_path + ".Encoder")
     self._stats_encoder = encoder_cls(
         request_model=None)  # no need to encode requests
     self._export_stats_task = None
Example No. 7
def test_fallsback_to_crawler_settings():
    """Keys with no frontera-level value are read from the crawler settings."""
    adapter = ScrapySettingsAdapter({'DELAY_ON_EMPTY': 10})
    assert adapter.get('DELAY_ON_EMPTY') == 10
def test_fallsback_to_crawler_settings():
    """A crawler-only setting is still visible through the adapter."""
    crawler_settings = {'DELAY_ON_EMPTY': 10}
    settings = ScrapySettingsAdapter(crawler_settings)
    assert settings.get('DELAY_ON_EMPTY') == 10
def test_frontera_settings_have_precedence_over_crawler_settings():
    """The FRONTERA_SETTINGS module overrides same-named crawler settings."""
    # Crawler says 10, the frontera settings module says 5 — 5 must win.
    crawler_settings = {
        'MAX_REQUESTS': 10,
        'FRONTERA_SETTINGS': 'tests.scrapy_spider.frontera.settings',
    }
    settings = ScrapySettingsAdapter(crawler_settings)
    assert settings.get('MAX_REQUESTS') == 5