def __init__(self, crawler):
    self.settings = crawler.settings
    self.logger = Logger.from_crawler(crawler)
    if self.settings.getbool("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.getint("REDIS_PORT"))
    self.queue_name = None
    self.queues = {}
def setup(self):
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    self.clean_previous_task(self.crawlid)
def __init__(self, redis_host, redis_port, redis_key, **kwargs):
    # Prefer the bundled custom_redis client; fall back to the standard
    # redis package, and abort with a warning if neither is available.
    try:
        from custom_redis.client import Redis
    except ImportError:
        try:
            from redis import Redis
        except ImportError:
            import warnings
            warnings.warn("RedisSource depends on redis, try: pip install redis. ")
            exit(1)
    self.redis_key = redis_key
    self.redis_conn = Redis(redis_host, redis_port)
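# A minimal usage sketch, assuming the __init__ above belongs to the RedisSource
# class named in the warning message; the host, port, and key values below are
# placeholders, not taken from the project.
source = RedisSource("127.0.0.1", 6379, redis_key="example:items:queue")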
def __init__(self, crawler):
    self.settings = crawler.settings
    self.logger = CustomLogger.from_crawler(crawler)
    if self.settings.getbool("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.getint("REDIS_PORT"))
    self.queue_name = None
    self.queues = {}
    self.request_interval = 60 / self.settings.getint("SPEED", 60)
    self.last_acs_time = time.time()
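# A minimal sketch, not part of the original class: request_interval above works
# out to 60 / SPEED seconds between requests (reading SPEED as requests per
# minute), and last_acs_time records the last access.  One way to enforce that
# pacing is to sleep off the unelapsed part of the interval before each access.
# The helper name wait_for_interval and its scheduler argument are assumptions
# for illustration only.
import time

def wait_for_interval(scheduler):
    elapsed = time.time() - scheduler.last_acs_time
    if elapsed < scheduler.request_interval:
        time.sleep(scheduler.request_interval - elapsed)
    scheduler.last_acs_time = time.time()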
def setup(self):
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    # Remove any state left over from a previous run with the same crawlid.
    self.redis_conn.delete("crawlid:%s" % self.crawlid)
    self.redis_conn.delete("failed_pages:%s" % self.crawlid)
    self.redis_conn.delete("crawlid:%s:model" % self.crawlid)
def __init__(self, settings):
    self.settings_file = settings
    Logger.__init__(self, settings)
    self.set_logger()
    MultiThreadClosing.__init__(self)
    self.de_queue = Queue()
    if self.settings.get("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.get("REDIS_PORT"))
    self.small = False
def __init__(self, crawler):
    self.settings = crawler.settings
    self.set_logger(crawler)
    if self.settings.get("CUSTOM_REDIS"):
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                            self.settings.get("REDIS_PORT"))
    # Key pattern for the request queues in Redis; the placeholder is filled in elsewhere.
    self.queue_name = "%s:*:queue"
    self.queues = {}
    self.extract = tldextract.extract
def start(crawlid, host, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    failed_pages = int(redis_conn.hget(key, "failed_download_pages") or 0)
    # `format` here is the project's own pretty-printing helper, not the builtin.
    format(redis_conn.hgetall(key))
    if failed_pages:
        print_if = raw_input("show the failed pages? y/n default n:")
        if print_if == "y":
            key_ = "failed_pages:%s" % crawlid
            p = redis_conn.hgetall(key_)
            format(p, True)
def start(crawlid, host, port, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host, port)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    failed_keys = [x for x in data.keys()
                   if fnmatch.fnmatch(x.decode() if isinstance(x, bytes) else x,
                                      "failed_download_*")]
    format(data)
    for fk in failed_keys:
        fk = fk.decode() if isinstance(fk, bytes) else fk
        print_if = input("show the %s? y/n default n:" % fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
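# A minimal sketch of how the start() function above might be wired up from the
# command line; the flag names and defaults are placeholders, not taken from the
# project.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Show crawl statistics stored in Redis.")
    parser.add_argument("-c", "--crawlid", required=True)
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("-p", "--port", type=int, default=6379)
    parser.add_argument("--custom", action="store_true",
                        help="use custom_redis.client.Redis instead of redis.Redis")
    args = parser.parse_args()
    start(args.crawlid, args.host, args.port, args.custom)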
def start(crawlid, host, custom):
    if custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    redis_conn = Redis(host)
    key = "crawlid:%s" % crawlid
    data = redis_conn.hgetall(key)
    failed_keys = filter(lambda x: fnmatch.fnmatch(x, "failed_download_*"), data.keys())
    format(data)
    for fk in failed_keys:
        print_if = raw_input("show the %s? y/n default n:" % fk.replace("_", " "))
        if print_if == "y":
            key_ = "%s:%s" % (fk, crawlid)
            p = redis_conn.hgetall(key_)
            format(p, True)
def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host, custom):
    self.crawlid = crawlid
    self.spiderid = spiderid
    self.url = url
    self.urls_file = urls_file
    self.priority = priority
    self.port = port
    self.host = host
    self.custom = custom
    self.inc = 0
    self.failed_count, self.failed_rate, self.sucess_rate = 0, 0, 0
    if self.custom:
        from custom_redis.client import Redis
    else:
        from redis import Redis
    self.redis_conn = Redis(host=self.host, port=self.port)
    self.clean_previous_task(self.crawlid)
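# A minimal usage sketch, assuming the constructor above belongs to the class
# that feeds crawl tasks into Redis; the class name TaskFeeder and every argument
# value below are placeholders for illustration only.
feeder = TaskFeeder(
    crawlid="test_crawl_001",
    spiderid="example_spider",
    url="http://www.example.com/",
    urls_file=None,
    priority=100,
    port=6379,
    host="127.0.0.1",
    custom=False,
)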