def __init__(self, settings, crawler):
    """Open the user and cookie Redis connections and seed cookies.

    Both connections point at the same Redis URL but use separate
    databases (USER_DB vs COOKIE_DB); responses are decoded to str.
    Finally the cookie pool is initialised for the running spider.
    """
    redis_url = settings['REDIS_URL']
    self.user_reds = get_redis(
        url=redis_url, db=settings['USER_DB'], decode_responses=True)
    self.cookie_reds = get_redis(
        url=redis_url, db=settings['COOKIE_DB'], decode_responses=True)
    # Populate/refresh cookies for this spider before crawling starts.
    init_cookie(self.user_reds, self.cookie_reds, crawler.spider.name)
def run_crawler(self):
    """Start one crawl round for the configured spider.

    A new round begins only when no crawler is currently active.  In
    distributed mode (APP_DISTRIBUTED env var set) the nodes coordinate
    through Redis: start URLs are seeded when the shared request queue
    is empty, and the current APP_TASK value is persisted/restored via
    a per-spider Redis key.
    """
    spider_class = self.get_spider_class(self.spider_name)
    if os.getenv('APP_DISTRIBUTED'):
        # Shared Redis connection used to coordinate multiple nodes.
        redis = get_redis(url=self.crawler.settings.get('REDIS_URL'))
    if len(list(self.crawler.crawlers)) < 1:
        # Nothing running -> start a new round with a freshly derived task.
        self.crawler.settings.set(
            'APP_TASK',
            ScheduleCrawlerRunner.interval_to_app_task(
                self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL')))
        if os.getenv('APP_DISTRIBUTED'):
            if redis.zcount(spider_class.name + ':requests', 0, 100) < 1:
                # Shared request queue is empty: seed the start URLs so
                # some node picks them up.
                for start_url in spider_class.start_urls:
                    redis.sadd(spider_class.name + ':start_urls', start_url)
            else:
                # Requests still pending: resume the task recorded by the
                # node that started this round.
                # NOTE(review): the .decode() implies this connection is NOT
                # created with decode_responses=True -- confirm against the
                # get_redis defaults used here.
                self.crawler.settings.set(
                    'APP_TASK',
                    redis.get(spider_class.name + ':app_task').decode('utf-8'))
        logger.info(
            '[SPIDER.%s.%s.DIS_%s.ROUND_%s] started, APP_CRAWL_INTERVAL: %s, APP_STORAGE_SHUFFLE_INTERVAL: %s',
            spider_class.name, self.crawler.settings.get('APP_TASK'),
            os.getenv('APP_DISTRIBUTED'), self.round,
            self.crawler.settings.get('APP_CRAWL_INTERVAL'),
            self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL'))
        self.crawler.crawl(spider_class)
        if os.getenv('APP_DISTRIBUTED'):
            # Persist the task so other nodes (and the next round) see it.
            redis.set(spider_class.name + ':app_task',
                      self.crawler.settings.get('APP_TASK'))
        self.round += 1
    else:
        # A crawler is still active; skip this round and just log it.
        logger.info('NEW ROUND SKIPPED BY [SPIDER.%s.%s.DIS_%s.ROUND_%s]',
                    spider_class.name, self.crawler.settings.get('APP_TASK'),
                    os.getenv('APP_DISTRIBUTED'), self.round)
def from_settings(cls, settings):
    """Build a scheduler from a Scrapy ``settings`` object.

    Mandatory flags are read directly; the optional settings below are
    only forwarded when present so that class defaults apply otherwise.
    Two Redis connections are created: the main ``server`` (ping-checked)
    and a dedicated dupefilter connection built from REDIS_PARAMS with
    its URL overridden by REDIS_DUP_URL.
    """
    kwargs = {
        'persist': settings.getbool('SCHEDULER_PERSIST'),
        'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
        'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
        'GET_CLOSE_NUM': settings.getint('GET_CLOSE_NUM'),
    }

    # (constructor kwarg, setting name) pairs that are optional; a falsy
    # or missing value means "use the scheduler's default".
    optional_pairs = (
        ('queue_key', 'SCHEDULER_QUEUE_KEY'),
        ('queue_cls', 'SCHEDULER_QUEUE_CLASS'),
        ('dupefilter_key', 'SCHEDULER_DUPEFILTER_KEY'),
        # Default setting name kept for compatibility with plain Scrapy.
        ('dupefilter_cls', 'DUPEFILTER_CLASS'),
        ('serializer', 'SCHEDULER_SERIALIZER'),
    )
    for kwarg_name, setting_name in optional_pairs:
        value = settings.get(setting_name)
        if value:
            kwargs[kwarg_name] = value

    # A serializer given as a dotted path is resolved to the module itself.
    serializer = kwargs.get('serializer')
    if isinstance(serializer, six.string_types):
        kwargs['serializer'] = importlib.import_module(serializer)

    server = connection.from_settings(settings)
    server.ping()  # fail fast if Redis is unreachable

    # Dedicated dupefilter connection: defaults, then user params, then
    # the dupefilter-specific URL wins.
    dup_params = DEFAULT_PARAMS.copy()
    dup_params.update(settings.getdict('REDIS_PARAMS'))
    dup_params['url'] = settings.get('REDIS_DUP_URL')
    kwargs['dfserver'] = connection.get_redis(**dup_params)

    return cls(server=server, **kwargs)
import sys
import threading

from scrapy.cmdline import execute

from novel_search.search import search_url
from novel_spider.settings import ROOT_PATH
from scrapy_redis.connection import get_redis
from scrapy_redis.picklecompat import loads

# NOTE(review): sys.path.extend expects an iterable of paths; if ROOT_PATH
# is a single string this would append one entry per character -- confirm
# ROOT_PATH is a list/tuple in novel_spider.settings.
sys.path.extend(ROOT_PATH)

# Module-level connection shared by lpush() and the commented helpers below.
redis = get_redis()


def lpush():
    """Resolve the novel's start URL, queue it, and re-arm a 300 s timer.

    The search result's title is not needed here, only the URL, which is
    pushed onto the spider's start_urls list.  The function reschedules
    itself so the start URL is refreshed every five minutes.
    """
    _, url = search_url('怪物乐园')  # was `name, url`; the title was unused
    redis.lpush('biquge_single:start_urls', url)
    global timer
    timer = threading.Timer(300, lpush)
    timer.start()


if __name__ == '__main__':
    execute(["scrapy", "crawl", "biquge:single"])
    # Alternative entry points kept for manual use:
    # lpush()
    # for item in redis.lrange("biquge:items", 0, 1000):
    #     obj = loads(item)
    #     print(obj)
def test_from_url(self):
    """When a URL is supplied, get_redis must delegate to redis_cls.from_url."""
    fake_cls = mock.Mock()
    result = get_redis(redis_cls=fake_cls, url='redis://localhost', param='foo')
    assert result is fake_cls.from_url.return_value
    fake_cls.from_url.assert_called_with('redis://localhost', param='foo')
def test_custom_class(self):
    """A caller-supplied redis_cls is instantiated with the extra kwargs."""
    fake_cls = mock.Mock()
    assert get_redis(param='foo', redis_cls=fake_cls) is fake_cls.return_value
    fake_cls.assert_called_with(param='foo')
def test_default_instance(self):
    """Without arguments, get_redis returns the default client type."""
    assert isinstance(get_redis(), defaults.REDIS_CLS)
def test_default_instance(self):
    """Without arguments, get_redis returns the default client type."""
    assert isinstance(get_redis(), DEFAULT_REDIS_CLS)
def __init__(self):
    # Open a Redis connection with the library defaults and keep it on
    # the instance for later use.
    self.redis = get_redis()