Example #1
 def __init__(self, settings, crawler):
     # Separate Redis handles for account data and stored cookies;
     # decode_responses=True makes the client return str instead of bytes.
     self.user_reds = get_redis(url=settings['REDIS_URL'],
                                db=settings['USER_DB'],
                                decode_responses=True)
     self.cookie_reds = get_redis(url=settings['REDIS_URL'],
                                  db=settings['COOKIE_DB'],
                                  decode_responses=True)
     # Initialise the stored cookies for the running spider.
     init_cookie(self.user_reds, self.cookie_reds, crawler.spider.name)
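This __init__ takes both the settings and the crawler, which is the signature a Scrapy component gets when it is built through the from_crawler hook. A minimal sketch of that wiring, assuming the snippet above belongs to such a component:

 @classmethod
 def from_crawler(cls, crawler):
     # Scrapy passes the running crawler; forward its settings and the
     # crawler itself to the __init__ shown above.
     return cls(crawler.settings, crawler)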
Example #2
    def run_crawler(self):
        spider_class = self.get_spider_class(self.spider_name)

        # In distributed mode, crawl rounds are coordinated through Redis.
        if os.getenv('APP_DISTRIBUTED'):
            redis = get_redis(url=self.crawler.settings.get('REDIS_URL'))

        # Start a new round only when no crawler from the previous round is
        # still running.
        if len(list(self.crawler.crawlers)) < 1:
            self.crawler.settings.set(
                'APP_TASK',
                ScheduleCrawlerRunner.interval_to_app_task(
                    self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL')))

            if os.getenv('APP_DISTRIBUTED'):
                # An empty shared request queue means a fresh round: seed the
                # start URLs; otherwise resume the task saved by the last round.
                if redis.zcount(spider_class.name + ':requests', 0, 100) < 1:
                    for start_url in spider_class.start_urls:
                        redis.sadd(spider_class.name + ':start_urls',
                                   start_url)
                else:
                    self.crawler.settings.set(
                        'APP_TASK',
                        redis.get(spider_class.name +
                                  ':app_task').decode('utf-8'))

            logger.info(
                '[SPIDER.%s.%s.DIS_%s.ROUND_%s] started, APP_CRAWL_INTERVAL: %s, APP_STORAGE_SHUFFLE_INTERVAL: %s',
                spider_class.name, self.crawler.settings.get('APP_TASK'),
                os.getenv('APP_DISTRIBUTED'), self.round,
                self.crawler.settings.get('APP_CRAWL_INTERVAL'),
                self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL'))
            self.crawler.crawl(spider_class)
            # Persist the chosen task so the next round (possibly on another
            # node) can resume it.
            if os.getenv('APP_DISTRIBUTED'):
                redis.set(spider_class.name + ':app_task',
                          self.crawler.settings.get('APP_TASK'))
            self.round += 1
        else:
            logger.info('NEW ROUND SKIPPED BY [SPIDER.%s.%s.DIS_%s.ROUND_%s]',
                        spider_class.name,
                        self.crawler.settings.get('APP_TASK'),
                        os.getenv('APP_DISTRIBUTED'), self.round)
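The distributed branch above keeps its round state in three Redis keys per spider: the <name>:requests sorted set (the shared request queue), the <name>:start_urls set used to seed an empty queue, and the <name>:app_task string that carries the chosen task over to the next round. A quick way to inspect that state with plain redis-py (the URL and spider name below are placeholders):

import redis

r = redis.from_url('redis://localhost:6379/0')
name = 'example_spider'

# Size of the shared request queue (a sorted set).
print(r.zcard(name + ':requests'))
# Seed URLs waiting to be consumed (a set).
print(r.smembers(name + ':start_urls'))
# Task persisted by the previous round (a plain string, or None).
print(r.get(name + ':app_task'))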
Example #3
    @classmethod
    def from_settings(cls, settings):
        kwargs = {
            'persist': settings.getbool('SCHEDULER_PERSIST'),
            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
            'idle_before_close':
            settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
            'GET_CLOSE_NUM': settings.getint('GET_CLOSE_NUM'),
        }

        # If these values are missing, it means we want to use the defaults.
        optional = {
            # TODO: Use custom prefixes for these settings to note that they
            # are specific to scrapy-redis.
            'queue_key': 'SCHEDULER_QUEUE_KEY',
            'queue_cls': 'SCHEDULER_QUEUE_CLASS',
            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
            # We use the default setting name to keep compatibility.
            'dupefilter_cls': 'DUPEFILTER_CLASS',
            'serializer': 'SCHEDULER_SERIALIZER',
        }
        for name, setting_name in optional.items():
            val = settings.get(setting_name)
            if val:
                kwargs[name] = val

        # Support serializer as a path to a module.
        if isinstance(kwargs.get('serializer'), six.string_types):
            kwargs['serializer'] = importlib.import_module(
                kwargs['serializer'])

        server = connection.from_settings(settings)
        # Ensure the connection is working.
        server.ping()

        # Build a second Redis connection from REDIS_DUP_URL and hand it to
        # the scheduler as 'dfserver', the dupe-filter's connection.
        param = DEFAULT_PARAMS.copy()
        param.update(settings.getdict('REDIS_PARAMS'))
        param['url'] = settings.get('REDIS_DUP_URL')
        kwargs['dfserver'] = connection.get_redis(**param)
        return cls(server=server, **kwargs)
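Everything here is pulled from the project settings, so the keys read above dictate the shape of a matching settings.py; the concrete values below are only illustrative:

# Scheduler behaviour (read via getbool/getint above).
SCHEDULER_PERSIST = True
SCHEDULER_FLUSH_ON_START = False
SCHEDULER_IDLE_BEFORE_CLOSE = 10
GET_CLOSE_NUM = 100

# Optional overrides; defaults apply when these are missing.
SCHEDULER_SERIALIZER = 'scrapy_redis.picklecompat'

# Main Redis connection used by connection.from_settings(), plus the
# separate connection built from REDIS_DUP_URL for 'dfserver'.
REDIS_URL = 'redis://localhost:6379/0'
REDIS_PARAMS = {'socket_timeout': 30}
REDIS_DUP_URL = 'redis://localhost:6379/1'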
Example #4
import sys
import threading

from scrapy.cmdline import execute

from novel_search.search import search_url
from novel_spider.settings import ROOT_PATH
from scrapy_redis.connection import get_redis
from scrapy_redis.picklecompat import loads

sys.path.extend(ROOT_PATH)
redis = get_redis()


def lpush():
    # Look up the novel's start URL and push it onto the spider's
    # start_urls list, then re-schedule this function to run again in 300 s.
    name, url = search_url('怪物乐园')
    redis.lpush('biquge_single:start_urls', url)
    global timer
    timer = threading.Timer(300, lpush)
    timer.start()


if __name__ == '__main__':
    execute(["scrapy", "crawl", "biquge:single"])
    # lpush()
    # for item in redis.lrange("biquge:items", 0, 1000):
    #     obj = loads(item)
    #     print(obj)
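The script only pushes the start URL; the spider that consumes it is not shown. With scrapy-redis it would typically be a RedisSpider whose redis_key matches the key written by lpush(). A minimal sketch (the class name and parse body are illustrative):

from scrapy_redis.spiders import RedisSpider


class BiqugeSingleSpider(RedisSpider):
    # The name passed to "scrapy crawl" in the script above.
    name = 'biquge:single'
    # Must match the key that lpush() writes to.
    redis_key = 'biquge_single:start_urls'

    def parse(self, response):
        # Placeholder parse; the real spider's logic is not part of this example.
        yield {'url': response.url}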

Example #5
 def test_from_url(self):
     client_cls = mock.Mock()
     url = 'redis://localhost'
     server = get_redis(redis_cls=client_cls, url=url, param='foo')
     assert server is client_cls.from_url.return_value
     client_cls.from_url.assert_called_with(url, param='foo')
Example #6
 def test_custom_class(self):
     client_cls = mock.Mock()
     server = get_redis(param='foo', redis_cls=client_cls)
     assert server is client_cls.return_value
     client_cls.assert_called_with(param='foo')
Example #7
 def test_default_instance(self):
     server = get_redis()
     assert isinstance(server, defaults.REDIS_CLS)
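Taken together, these tests pin down how get_redis dispatches: given a url it delegates to redis_cls.from_url, otherwise it calls redis_cls directly, and redis_cls defaults to defaults.REDIS_CLS. A rough equivalent that satisfies the three tests (a sketch, not the library's actual source):

from scrapy_redis import defaults


def get_redis_sketch(**kwargs):
    # Pick the client class, falling back to the package default.
    redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
    url = kwargs.pop('url', None)
    if url:
        # URL form: let the client class parse the connection URL.
        return redis_cls.from_url(url, **kwargs)
    # Keyword form: pass the remaining parameters straight through.
    return redis_cls(**kwargs)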
Example #8
 def test_default_instance(self):
     server = get_redis()
     assert isinstance(server, DEFAULT_REDIS_CLS)
Example #9
 def __init__(self):
     # With no arguments, get_redis falls back to the package defaults
     # (compare the default-instance test above).
     self.redis = get_redis()