def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    input_queue_key = settings.get('INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
    input_queue_cls = load_object(settings.get('INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
    input_queue_shard_dist = settings.get('INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
    output_queue_key = settings.get('OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
    output_queue_cls = load_object(settings.get('OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
    output_queue_shard_dist = settings.get('OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
    priority_queue_key = settings.get('PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
    priority_queue_cls = load_object(settings.get('PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
    priority_queue_shard_dist = settings.get('PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    servers = connection.from_settings(settings)
    dupefilter_ins = load_object(settings['DUPEFILTER_CLASS']).from_settings(settings)
    recrawl_key = settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
    return cls(servers, persist,
               input_queue_key, input_queue_cls, input_queue_shard_dist,
               output_queue_key, output_queue_cls, output_queue_shard_dist,
               priority_queue_key, priority_queue_cls, priority_queue_shard_dist,
               recrawl_key, dupefilter_key, dupefilter_ins, idle_before_close)
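A hedged sketch of the settings this factory reads. The key names are taken directly from the snippet above; every value (dotted paths, shard-distribution strings, timeouts) is an illustrative assumption, not something the source specifies.

# Hypothetical settings consumed by the sharded from_settings() above.
# Key names match the snippet; all values are assumptions for illustration.
SCHEDULER_PERSIST = True
INPUT_QUEUE_KEY = '%(spider)s:input'
INPUT_QUEUE_CLASS = 'myproject.queues.ShardedQueue'            # hypothetical class
INPUT_QUEUE_SHARD_DIST = 'uniform'                             # hypothetical policy
OUTPUT_QUEUE_KEY = '%(spider)s:output'
OUTPUT_QUEUE_CLASS = 'myproject.queues.ShardedQueue'
OUTPUT_QUEUE_SHARD_DIST = 'uniform'
PRIORITY_QUEUE_KEY = '%(spider)s:priority'
PRIORITY_QUEUE_CLASS = 'myproject.queues.ShardedPriorityQueue'
PRIORITY_QUEUE_SHARD_DIST = 'uniform'
DUPEFILTER_CLASS = 'myproject.dupefilters.ShardedDupeFilter'
RECRAWL_LIST_KEY = '%(spider)s:recrawl'
SCHEDULER_IDLE_BEFORE_CLOSE = 10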
def from_settings(cls, settings):
    server = connection.from_settings(settings)
    # Create a one-time key. Needed to support using this class as a
    # standalone dupefilter with scrapy's default scheduler: if scrapy
    # passed the spider on the open() method, this wouldn't be needed.
    key = "dupefilter:%s" % int(time.time())
    return cls(server, key)
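The one-time key exists so the class can be plugged straight into Scrapy's default scheduler. A minimal sketch of that wiring, where the dotted path myproject.dupefilters.RedisDupeFilter is a placeholder assumption and not a name from the snippet:

# settings.py -- hypothetical project settings enabling the dupefilter above.
DUPEFILTER_CLASS = 'myproject.dupefilters.RedisDupeFilter'  # assumed path
REDIS_URL = 'redis://localhost:6379/0'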
def from_crawler(cls, crawler):
    if not crawler.spider.islinkgenerator:
        settings = crawler.settings
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = "%s:requests" % crawler.spider.name
        queue_cls = queue.SpiderQueue
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        server = connection.from_settings(settings, crawler.spider.name)
        stats = crawler.stats
        return cls(server, persist, queue_key, queue_cls, idle_before_close, stats)
    else:
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        core_scheduler = load_object('scrapy.core.scheduler.Scheduler')
        return core_scheduler(dupefilter, jobdir=job_dir(settings),
                              logunser=logunser, stats=crawler.stats,
                              pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
def from_crawler(cls, crawler):
    servers = connection.from_settings(crawler.settings)
    key = crawler.settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
    shard_dist = crawler.settings.get('RECRAWL_SHARD_DIST', RECRAWL_SHARD_DIST)
    recrawl = cls(servers, key, shard_dist, crawler)
    crawler.signals.connect(recrawl.setup_recrawl,
                            signal=scrapy.signals.spider_opened)
    return recrawl
def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server = connection.from_settings(settings)
    return cls(server, persist, queue_key, queue_cls,
               dupefilter_key, idle_before_close)
def test_redis_host_port_fallback(self):
    settings = dict(REDIS_HOST='baz', REDIS_PORT=1337, REDIS_URL=None)
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'baz')
    self.assertEqual(connect_args['port'], 1337)
def test_redis_default(self):
    settings = dict()
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 6379)
def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server, redis_server = connection.from_settings(settings)
    return cls(server, persist, queue_key, queue_cls,
               dupefilter_key, idle_before_close, redis_server)
def test_redis_host_port(self):
    settings = dict(REDIS_HOST='localhost', REDIS_PORT=9001)
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
def test_redis_url(self):
    settings = dict(REDIS_URL='redis://foo:bar@localhost:9001/42')
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
    self.assertEqual(connect_args['password'], 'bar')
    self.assertEqual(connect_args['db'], 42)
def test_redis_host_port(self):
    settings = dict(
        REDIS_HOST='localhost',
        REDIS_PORT=9001,
    )
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
def test_redis_url(self):
    settings = dict(
        REDIS_URL='redis://foo:bar@localhost:9001/42',
    )
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'localhost')
    self.assertEqual(connect_args['port'], 9001)
    self.assertEqual(connect_args['password'], 'bar')
    self.assertEqual(connect_args['db'], 42)
def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
def test_redis_host_port_fallback(self):
    settings = dict(
        REDIS_HOST='baz',
        REDIS_PORT=1337,
        REDIS_URL=None,
    )
    server = connection.from_settings(settings)
    connect_args = server.connection_pool.connection_kwargs
    self.assertEqual(connect_args['host'], 'baz')
    self.assertEqual(connect_args['port'], 1337)
def setup_rabbitmq(self):
    """Setup RabbitMQ connection.

    Call this method after the spider has set its crawler object.

    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server = connection.from_settings(self.crawler.settings)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return

    if crawler is None:
        # We allow an optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s)", self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
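The usual way to invoke this variant is from the spider's from_crawler hook, after the base class has attached the crawler. A minimal sketch, assuming the method above lives on a mixin named RedisMixin (as it does in scrapy-redis) and that myproject.spiders is where it is exposed:

import scrapy

# Hypothetical import path; point it at wherever the mixin actually lives.
from myproject.spiders import RedisMixin


class MySpider(RedisMixin, scrapy.Spider):
    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Build the spider first so the crawler is attached, then wire up
        # the redis connection and the spider_idle signal.
        obj = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj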
def setup_redis(self, request_schedule_mode="default"):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    if request_schedule_mode == 'default':
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    elif request_schedule_mode == 'fast':
        self.crawler.signals.connect(self.request_scheduled, signal=signals.request_scheduled)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
def from_settings(cls, settings):
    server = connection.from_settings(settings)
    return cls(server)
def from_settings(cls, settings):
    servers = connection.from_settings(settings)
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    shard_dist = settings.get('DUPE_SHARD_DIST', DUPE_SHARD_DIST)
    return cls(servers, dupefilter_key, shard_dist)
def from_settings(cls, settings):
    server, redis_server = connection.from_settings(settings)
    exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
    return cls(server, exchange_name)
def from_settings(cls, settings):
    server = connection.from_settings(settings)
    key = "dupefilter:bloom:%s" % int(time.time())
    return cls(server, key)