def run(self):
    """
    Fill the pool from the static proxy list and from every crawler.

    Returns immediately when ``self.is_full()`` is true. Otherwise every
    non-blank, non-comment line of ``staticproxy.txt`` (a relative path, so
    it is resolved against the current working directory) is searched for
    ``[username:password@]ip:port``; each match becomes a ``Proxy`` that is
    passed to ``self.redis.add``. After that, every proxy yielded by each
    crawler in ``self.crawlers`` is added the same way.

    :return: None
    :raises OSError: if ``staticproxy.txt`` cannot be opened for reading
    """
    if self.is_full():
        return

    proxyfile = "staticproxy.txt"
    # Compiled once up front: the pattern does not depend on the line.
    pattern = re.compile(
        r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
    )

    with open(proxyfile, 'r') as fh:
        proxylines = fh.readlines()
    logger.info(f'read {proxyfile}')

    for raw_line in proxylines:
        entry = raw_line.strip()
        # Test the stripped text so that indented "#" comments are skipped
        # as well, instead of being parsed as proxy definitions.
        if not entry or entry.startswith("#"):
            continue
        match = pattern.search(entry)
        if match is None:
            continue
        proxy = Proxy(
            host=match.group('ip'),
            port=match.group('port'),
            username=match.group('username'),
            password=match.group('password'),
        )
        logger.info("getproxy " + proxy.string())
        self.redis.add(proxy)

    for crawler in self.crawlers:
        logger.info(f'crawler {crawler} to get proxy')
        for proxy in crawler.crawl():
            # Report through the logger like the rest of this method
            # rather than writing to stdout.
            logger.info(proxy.string())
            self.redis.add(proxy)
def max(self, proxy: Proxy) -> int:
    """
    Promote a proxy to the maximum score.

    Logs the promotion, then stores ``PROXY_SCORE_MAX`` for the proxy's
    string form under ``REDIS_KEY`` using the ``zadd`` call form selected
    by ``IS_REDIS_VERSION_2``.

    :param proxy: proxy to promote
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    member = proxy.string()
    if not IS_REDIS_VERSION_2:
        # Mapping call form: {member: score}.
        return self.db.zadd(REDIS_KEY, {member: PROXY_SCORE_MAX})
    # Version-2 call form: score first, then member.
    return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, member)
def max(self, proxy: Proxy) -> int:
    """
    Set a proxy's score to ``PROXY_SCORE_MAX``.

    :param proxy: proxy whose score is raised
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    member = proxy.string()
    # Build the trailing zadd arguments for whichever call form the
    # IS_REDIS_VERSION_2 flag selects, then issue a single call.
    if IS_REDIS_VERSION_2:
        zadd_args = (PROXY_SCORE_MAX, member)
    else:
        zadd_args = ({member: PROXY_SCORE_MAX},)
    return self.db.zadd(REDIS_KEY, *zadd_args)
def max(self, proxy: Proxy) -> int:
    """
    Give a proxy the highest score.

    :param proxy: proxy to update
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    if IS_REDIS_VERSION_2:
        # Version-2 call form passes score and member positionally.
        outcome = self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
    else:
        # Otherwise the member/score pair is passed as a mapping.
        outcome = self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    return outcome
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy by one and delete the proxy once its
    score is smaller than ``PROXY_SCORE_MIN``.

    :param proxy: proxy whose score is lowered
    :return: the score read back after the decrement while the proxy is
        kept; otherwise the value returned by ``self.db.zrem``
    """
    member = proxy.string()
    # The two call forms order zincrby's amount and member differently.
    if IS_REDIS_VERSION_2:
        self.db.zincrby(REDIS_KEY, member, -1)
    else:
        self.db.zincrby(REDIS_KEY, -1, member)

    score = self.db.zscore(REDIS_KEY, member)
    # Only drop the proxy when it has fallen below the floor (or has no
    # score at all); a proxy still at or above the floor stays in the pool.
    if score is not None and score >= PROXY_SCORE_MIN:
        return score

    logger.info(f'{member} curent score {score}, remove')
    return self.db.zrem(REDIS_KEY, member)
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Store a new proxy with the given score.

    A proxy whose ``host:port`` fails ``is_valid_proxy`` is logged and
    dropped; a proxy that ``self.exists`` already reports is left alone.

    :param proxy: proxy to store
    :param score: score to store, ``PROXY_SCORE_INIT`` by default
    :return: the value returned by ``self.db.zadd``, or None when the proxy
        is invalid or already present
    """
    address = f'{proxy.host}:{proxy.port}'
    if not is_valid_proxy(address):
        logger.info(f'invalid proxy {proxy}, throw it')
        return None
    if self.exists(proxy):
        # Already stored: nothing to do.
        return None
    if not IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    return self.db.zadd(REDIS_KEY, score, proxy.string())
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT):
    """
    Add a proxy with the given score (``PROXY_SCORE_INIT`` by default).

    :param proxy: proxy to add
    :param score: score to store for the proxy
    :return: the value returned by ``self.db.zadd``, or None when the proxy
        fails ``is_valid_proxy`` or is already present
    """
    is_usable = is_valid_proxy(f'{proxy.host}:{proxy.port}')
    if not is_usable:
        logger.info(f'invalid proxy {proxy}, throw it')
        return None
    if not self.exists(proxy):
        if IS_REDIS_VERSION_2:
            # Version-2 call form: score first, then member.
            outcome = self.db.zadd(REDIS_KEY, score, proxy.string())
        else:
            outcome = self.db.zadd(REDIS_KEY, {proxy.string(): score})
        return outcome
    return None
def exists(self, proxy: Proxy) -> bool:
    """
    Tell whether a proxy is already stored.

    The proxy counts as present when ``zscore`` returns a score for its
    string form under ``REDIS_KEY``.

    :param proxy: proxy to look up
    :return: True if a score is stored for the proxy, False otherwise
    """
    return self.db.zscore(REDIS_KEY, proxy.string()) is not None
def exists(self, proxy: Proxy) -> bool:
    """
    Check whether the proxy exists.

    :param proxy: proxy to look up
    :return: True when ``zscore`` yields a score for the proxy's string
        form under ``REDIS_KEY``, False when it yields None
    """
    return self.db.zscore(REDIS_KEY, proxy.string()) is not None
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Add a proxy to redis with an initial score.

    :param proxy: proxy to add; its ``host:port`` must pass
        ``is_valid_proxy``
    :param score: score to store, ``PROXY_SCORE_INIT`` by default
    :return: the value returned by ``self.db.zadd``, or None when the proxy
        is invalid
    """
    address = f'{proxy.host}:{proxy.port}'
    if not is_valid_proxy(address):
        logger.info(f'invalid proxy {proxy}, throw it')
        return None
    # NOTE(review): there is no existence check before the write, so an
    # already-stored proxy presumably has its score overwritten with
    # `score` -- confirm this is intended.
    if not IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    return self.db.zadd(REDIS_KEY, score, proxy.string())
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy by one, or delete the proxy when its
    current score is not larger than ``PROXY_SCORE_MIN``.

    :param proxy: proxy whose score is lowered
    :return: the value returned by ``self.db.zincrby`` when the score is
        decreased, otherwise the value returned by ``self.db.zrem``
    """
    member = proxy.string()
    score = self.db.zscore(REDIS_KEY, member)
    # A missing (None) or zero score is falsy and falls through to removal.
    if score and score > PROXY_SCORE_MIN:
        logger.info(f'{member} current score {score}, decrease 1')
        if IS_REDIS_VERSION_2:
            return self.db.zincrby(REDIS_KEY, member, -1)
        return self.db.zincrby(REDIS_KEY, -1, member)
    logger.info(f'{member} current score {score}, remove')
    # Remove by the same string member that was scored above, not by the
    # Proxy object itself.
    return self.db.zrem(REDIS_KEY, member)
def decrease(self, proxy: Proxy) -> int:
    """
    Take one point off a proxy, or delete it when its current score is not
    larger than ``PROXY_SCORE_MIN``.

    :param proxy: proxy to penalise
    :return: the value returned by ``self.db.zincrby`` when the score is
        decreased, otherwise the value returned by ``self.db.zrem``
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    # Removal path first: no usable score, or already at/below the floor.
    if not (score and score > PROXY_SCORE_MIN):
        logger.info(f'{proxy.string()} current score {score}, remove')
        return self.db.zrem(REDIS_KEY, proxy.string())
    logger.info(f'{proxy.string()} current score {score}, decrease 1')
    if not IS_REDIS_VERSION_2:
        return self.db.zincrby(REDIS_KEY, -1, proxy.string())
    return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
def decrease(self, proxy: Proxy) -> int:
    """
    Lower a proxy's score by one; delete the proxy instead when its current
    score is not above ``PROXY_SCORE_MIN``.

    :param proxy: proxy to penalise
    :return: the value returned by ``self.db.zincrby`` or, on removal, by
        ``self.db.zrem``
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    above_floor = bool(score) and score > PROXY_SCORE_MIN
    if above_floor:
        # Current score is above the minimum: decrement it.
        logger.info(f'{proxy.string()} current score {score}, decrease 1')
        if IS_REDIS_VERSION_2:
            outcome = self.db.zincrby(REDIS_KEY, proxy.string(), -1)
        else:
            outcome = self.db.zincrby(REDIS_KEY, -1, proxy.string())
    else:
        # Current score is at or below the minimum (or missing): remove.
        logger.info(f'{proxy.string()} current score {score}, remove')
        outcome = self.db.zrem(REDIS_KEY, proxy.string())
    return outcome