Пример #1
0
 def from_settings(cls, settings):
   """Construct the scheduler from a Scrapy settings object.

   Reads the persistence flag, the three queue configurations
   (input / output / priority: key, class, shard distribution), the
   dupefilter setup, the recrawl list key and the idle timeout, then
   delegates everything to the class constructor.
   """
   get = settings.get
   persist = get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
   in_key = get('INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
   in_cls = load_object(get('INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
   in_shard = get('INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
   out_key = get('OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
   out_cls = load_object(get('OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
   out_shard = get('OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
   pri_key = get('PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
   pri_cls = load_object(get('PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
   pri_shard = get('PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
   dupe_key = get('DUPEFILTER_KEY', DUPEFILTER_KEY)
   idle = get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
   servers = connection.from_settings(settings)
   # DUPEFILTER_CLASS has no fallback: a missing setting raises KeyError.
   dupe_ins = load_object(settings['DUPEFILTER_CLASS']).from_settings(settings)
   recrawl_key = get('RECRAWL_LIST_KEY', RECRAWL_KEY)
   return cls(servers, persist, in_key, in_cls, in_shard,
              out_key, out_cls, out_shard,
              pri_key, pri_cls, pri_shard,
              recrawl_key, dupe_key, dupe_ins, idle)
Пример #2
0
 def from_settings(cls, settings):
     """Create the dupefilter with a timestamped one-time key.

     The unique key allows this class to serve as a standalone
     dupefilter under Scrapy's default scheduler, which does not pass
     the spider to ``open()``.
     """
     redis_server = connection.from_settings(settings)
     one_time_key = "dupefilter:%s" % int(time.time())
     return cls(redis_server, one_time_key)
Пример #3
0
 def from_settings(cls, settings):
     """Build a dupefilter instance keyed by the current timestamp.

     A fresh one-time key supports using this class standalone with
     Scrapy's default scheduler, since ``open()`` never receives the
     spider there.
     """
     backend = connection.from_settings(settings)
     stamped_key = "dupefilter:%s" % int(time.time())
     return cls(backend, stamped_key)
 def from_crawler(cls, crawler):
     """Choose a scheduler implementation based on the spider's role.

     Link-generator spiders fall back to Scrapy's stock core scheduler;
     every other spider gets this distributed scheduler backed by a
     per-spider redis request queue.
     """
     settings = crawler.settings
     if crawler.spider.islinkgenerator:
         # Link generators run on Scrapy's built-in scheduler.
         df = load_object(settings['DUPEFILTER_CLASS']).from_settings(settings)
         pq = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
         dq = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mq = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
         # SCHEDULER_DEBUG acts as the fallback for the legacy setting.
         log_unserializable = settings.getbool(
             'LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
         CoreScheduler = load_object('scrapy.core.scheduler.Scheduler')
         return CoreScheduler(df,
                              jobdir=job_dir(settings),
                              logunser=log_unserializable,
                              stats=crawler.stats,
                              pqclass=pq,
                              dqclass=dq,
                              mqclass=mq)
     persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
     request_key = "%s:requests" % crawler.spider.name
     idle = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
     server = connection.from_settings(settings, crawler.spider.name)
     return cls(server, persist, request_key, queue.SpiderQueue,
                idle, crawler.stats)
Пример #5
0
 def from_crawler(cls, crawler):
   """Build the recrawl helper and hook it to the spider_opened signal."""
   conf = crawler.settings
   redis_servers = connection.from_settings(conf)
   list_key = conf.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
   dist = conf.get('RECRAWL_SHARD_DIST', RECRAWL_SHARD_DIST)
   instance = cls(redis_servers, list_key, dist, crawler)
   # Defer actual setup until the spider is opened.
   crawler.signals.connect(instance.setup_recrawl,
                           signal=scrapy.signals.spider_opened)
   return instance
Пример #6
0
 def from_settings(cls, settings):
     """Instantiate the scheduler from a Scrapy settings object."""
     redis_server = connection.from_settings(settings)
     keep_state = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
     q_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
     q_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
     df_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
     idle = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
     return cls(redis_server, keep_state, q_key, q_cls, df_key, idle)
Пример #7
0
    def test_redis_host_port_fallback(self):
        """REDIS_URL=None must fall back to REDIS_HOST/REDIS_PORT."""
        cfg = {'REDIS_HOST': 'baz', 'REDIS_PORT': 1337, 'REDIS_URL': None}

        client = connection.from_settings(cfg)
        kwargs = client.connection_pool.connection_kwargs

        self.assertEqual(kwargs['host'], 'baz')
        self.assertEqual(kwargs['port'], 1337)
Пример #8
0
    def test_redis_default(self):
        """Empty settings connect to localhost on the default port."""
        empty_cfg = {}

        client = connection.from_settings(empty_cfg)
        kwargs = client.connection_pool.connection_kwargs

        self.assertEqual(kwargs['host'], 'localhost')
        self.assertEqual(kwargs['port'], 6379)
Пример #9
0
    def test_redis_default(self):
        """With no settings the connection defaults to localhost:6379."""
        no_settings = {}

        redis_client = connection.from_settings(no_settings)
        conn_kwargs = redis_client.connection_pool.connection_kwargs

        self.assertEqual(conn_kwargs['host'], 'localhost')
        self.assertEqual(conn_kwargs['port'], 6379)
Пример #10
0
 def from_settings(cls, settings):
     """Build the scheduler plus its auxiliary redis server handle."""
     keep_state = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
     q_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
     q_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
     df_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
     idle = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
     # This connection helper yields a (server, redis_server) pair.
     server, redis_server = connection.from_settings(settings)
     return cls(server, keep_state, q_key, q_cls, df_key, idle, redis_server)
Пример #11
0
    def test_redis_host_port(self):
        """Explicit REDIS_HOST/REDIS_PORT reach the connection pool."""
        cfg = {'REDIS_HOST': 'localhost', 'REDIS_PORT': 9001}

        client = connection.from_settings(cfg)
        kwargs = client.connection_pool.connection_kwargs

        self.assertEqual(kwargs['host'], 'localhost')
        self.assertEqual(kwargs['port'], 9001)
Пример #12
0
    def test_redis_url(self):
        """A full REDIS_URL supplies host, port, password and db."""
        cfg = {'REDIS_URL': 'redis://*****:*****@localhost:9001/42'}

        client = connection.from_settings(cfg)
        kwargs = client.connection_pool.connection_kwargs

        self.assertEqual(kwargs['host'], 'localhost')
        self.assertEqual(kwargs['port'], 9001)
        self.assertEqual(kwargs['password'], 'bar')
        self.assertEqual(kwargs['db'], 42)
Пример #13
0
    def test_redis_host_port(self):
        """Host/port settings must be forwarded to the connection pool."""
        cfg = dict(REDIS_HOST='localhost', REDIS_PORT=9001)

        redis_client = connection.from_settings(cfg)
        conn_kwargs = redis_client.connection_pool.connection_kwargs

        self.assertEqual(conn_kwargs['host'], 'localhost')
        self.assertEqual(conn_kwargs['port'], 9001)
Пример #14
0
    def test_redis_url(self):
        """REDIS_URL parsing yields host, port, password and db number."""
        cfg = dict(REDIS_URL='redis://*****:*****@localhost:9001/42')

        redis_client = connection.from_settings(cfg)
        conn_kwargs = redis_client.connection_pool.connection_kwargs

        self.assertEqual(conn_kwargs['host'], 'localhost')
        self.assertEqual(conn_kwargs['port'], 9001)
        self.assertEqual(conn_kwargs['password'], 'bar')
        self.assertEqual(conn_kwargs['db'], 42)
Пример #15
0
    def setup_redis(self):
        """Open the redis connection and wire the idle signal.

        Must run after the spider has been given its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # The idle signal fires when the spider has no requests left;
        # that is when new requests get scheduled from the redis queue.
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
Пример #16
0
    def test_redis_host_port_fallback(self):
        """Setting REDIS_URL to None falls back to host/port settings."""
        cfg = dict(REDIS_HOST='baz', REDIS_PORT=1337, REDIS_URL=None)

        redis_client = connection.from_settings(cfg)
        conn_kwargs = redis_client.connection_pool.connection_kwargs

        self.assertEqual(conn_kwargs['host'], 'baz')
        self.assertEqual(conn_kwargs['port'], 1337)
Пример #17
0
    def setup_rabbitmq(self):
        """Open the RabbitMQ connection and hook idle/item signals.

            Call this once the spider has set its crawler object.
        :return: None
        """
        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        connect = self.crawler.signals.connect
        connect(self.spider_idle, signal=signals.spider_idle)
        connect(self.item_scraped, signal=signals.item_scraped)
Пример #18
0
    def setup_redis(self):
        """Connect to redis and register the idle-signal handler.

        Must be called once the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # Refill from the redis queue whenever the spider runs dry:
        # the idle signal marks exactly that moment.
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.log("Reading URLs from redis list '%s'" % self.redis_key)
Пример #19
0
    def setup_redis(self, crawler=None):
        """Initialise the redis connection and idle-signal hook.

        Call after the spider has its crawler object.  Safe to call
        more than once: a repeat call is a no-op once ``self.server``
        is set.
        """
        if self.server is not None:
            return

        if crawler is None:
            # Optional crawler argument kept for backwards compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)
        if crawler is None:
            raise ValueError("crawler is required")

        conf = crawler.settings

        if self.redis_key is None:
            self.redis_key = conf.get(
                'REDIS_START_URLS_KEY',
                DEFAULT_START_URLS_KEY,
            )

        # Allow '%(name)s' templates in the configured key.
        self.redis_key = self.redis_key % {'name': self.name}
        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = conf.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                DEFAULT_START_URLS_BATCH_SIZE,
            )
        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info(
            "Reading start URLs from redis key '%(redis_key)s' "
            "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal fires when the spider has no requests left;
        # new requests are then scheduled from the redis queue.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
Пример #20
0
    def setup_rabbitmq(self):
        """Create the RabbitMQ connection and attach crawler signals.

            Call this method once the spider has its crawler object.
        :return: None
        """
        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        for handler, sig in ((self.spider_idle, signals.spider_idle),
                             (self.item_scraped, signals.item_scraped)):
            self.crawler.signals.connect(handler, signal=sig)
Пример #21
0
    def setup_redis(self, request_schedule_mode="default"):
        """Connect to redis and wire the scheduling signals.

        Call after the spider has its crawler object.  The mode selects
        which extra signal triggers scheduling of fresh redis requests.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # spider_idle fires when no requests remain; refill from redis then.
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)

        if request_schedule_mode == 'fast':
            # Re-schedule as soon as any request is queued.
            self.crawler.signals.connect(self.request_scheduled,
                                         signal=signals.request_scheduled)
        elif request_schedule_mode == 'default':
            self.crawler.signals.connect(self.item_scraped,
                                         signal=signals.item_scraped)
        self.log("Reading URLs from redis list '%s'" % self.redis_key)
Пример #22
0
 def from_settings(cls, settings):
     """Create an instance backed by a connection built from settings."""
     redis_server = connection.from_settings(settings)
     return cls(redis_server)
Пример #23
0
 def from_settings(cls, settings):
     """Create a sharded dupefilter from a settings object."""
     redis_servers = connection.from_settings(settings)
     filter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
     distribution = settings.get('DUPE_SHARD_DIST', DUPE_SHARD_DIST)
     return cls(redis_servers, filter_key, distribution)
Пример #24
0
 def from_settings(cls, settings):
     """Create an instance bound to the configured exchange name."""
     # The connection helper returns a pair; only the first handle is
     # passed to the constructor here.
     mq_server, redis_server = connection.from_settings(settings)
     exchange = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
     return cls(mq_server, exchange)
Пример #25
0
 def from_settings(cls, settings):
     """Create a bloom dupefilter with a timestamped one-time key."""
     redis_server = connection.from_settings(settings)
     bloom_key = "dupefilter:bloom:%s" % int(time.time())
     return cls(redis_server, bloom_key)
Пример #26
0
 def from_settings(cls, settings):
     """Build an instance whose backend connection comes from settings."""
     backend = connection.from_settings(settings)
     return cls(backend)
Пример #27
0
 def from_settings(cls, settings):
     """Instantiate a bloom dupefilter keyed by the current timestamp."""
     backend = connection.from_settings(settings)
     stamped_key = "dupefilter:bloom:%s" % int(time.time())
     return cls(backend, stamped_key)
Пример #28
0
 def from_settings(cls, settings):
     """Build an instance using the configured RabbitMQ exchange name."""
     # Unpack the (server, redis_server) pair; the redis handle is not
     # forwarded to the constructor in this variant.
     amqp_server, redis_server = connection.from_settings(settings)
     exchange_name = settings.get('RABBITMQ_EXCHANGE_NAME', EXCHANGE_NAME)
     return cls(amqp_server, exchange_name)