class ProxyCheck(Utility):
    """Re-validate proxies in the validated pool and top it up when low."""

    def __init__(self):
        self.redis = RedisClient()
        self.valid = IpValidation()

    def check_num(self):
        """Trigger a new crawl cycle when the validated pool is too small."""
        valid_num = self.redis.count(VALIDATED_SCORE, VALIDATED_SCORE, name=PROXY_VALIDATED)
        if valid_num < VALIDATED_PROXY_NUM:
            run()

    def init_score(self):
        """Reset every proxy scoring at or above DISCARD_SCORE to INITIAL_SCORE."""
        start, end = DISCARD_SCORE, '+inf'
        length = self.redis.count(start, end, name=PROXY_VALIDATED)
        while length > 0:
            result = self.redis.get_proxy_by_score(start, end, 1000)
            for ip in result:
                self.redis.db.zadd(PROXY_VALIDATED, {ip: INITIAL_SCORE})
            start += 1000
            # BUG FIX: the recount previously omitted name=PROXY_VALIDATED and
            # therefore counted the client's default key, not the sorted set
            # this loop is rewriting; count the same key consistently.
            length = self.redis.count(start, end, name=PROXY_VALIDATED)
        logger.info('initiation finished')

    def check_valid(self):
        """Pause the spider, reset scores, re-validate, then refill if needed."""
        settings.SPIDER_RUNNING = False
        self.init_score()
        self.valid.run_validation(key=PROXY_VALIDATED)
        self.check_num()
def exists_proxy(self):
    """Report whether a proxy entry is currently stored.

    :return: bool
    """
    client = RedisClient()
    self.redis = client
    return client.exists(CLIENT_NAME)
class GetProxy:
    """Expose validated proxies through a dedicated consumption key."""

    def __init__(self):
        self.redis = RedisClient()

    def clear_old_key(self):
        """Drop every member of the consumption key, if any exist."""
        low, high = '-inf', '+inf'
        if self.redis.count(low, high, name=PROXY_FOR_USE) > 0:
            self.redis.db.zremrangebyrank(PROXY_FOR_USE, 0, -1)

    def init_redis_key(self):
        """Rebuild the consumption key as a copy of the validated set."""
        self.clear_old_key()
        low, high = '-inf', '+inf'
        if self.redis.count(low, high) > 0:
            self.redis.db.zunionstore(PROXY_FOR_USE, [PROXY_VALIDATED])

    def get_proxy(self):
        """Return one proxy address taken from the consumption key.

        :raises Exception: when the key holds no proxies.
        """
        low, high = '-inf', '+inf'
        if self.redis.count(low, high, name=PROXY_FOR_USE) > 0:
            picked = self.redis.get_proxy_by_score(low, high, 1, key=PROXY_FOR_USE)[0]
            return picked.split('-')[1]
        raise Exception('no proxy to use')
class Test_ip(object):
    """Validate stored proxies concurrently and adjust their scores."""

    def __init__(self):
        self.db = RedisClient()
        self.headers = headers
        self.url = test_url

    def get_url(self, proxy):
        """Fetch the test URL through *proxy*.

        :param proxy: requests-style ``proxies`` mapping
        :return: True when the request succeeds with HTTP 200
        """
        try:
            con = requests.get(self.url, headers=self.headers, proxies=proxy)
            return con.status_code == 200
        except requests.RequestException:
            # BUG FIX: previously a bare ``except:`` which also swallowed
            # KeyboardInterrupt/SystemExit; catch only request failures.
            return False

    def test(self, ip):
        """Probe one proxy and raise or lower its score accordingly."""
        ip = ip.decode('utf-8')
        proxy = {'http': 'http://' + ip}
        if self.get_url(proxy):
            self.db.max(ip)
        else:
            self.db.decrease(ip)

    def run(self):
        """Spawn one daemon thread per stored proxy, pausing every 100."""
        proxies = self.db.all()
        for i, ip in enumerate(proxies):
            t = threading.Thread(target=self.test, args=(ip,))
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
            random_time()
            if i % 100 == 0:
                time.sleep(5)
def __init__(self, site, accounts_pool_size, single_cycle_limit): self.site = site # 账号池数量上限 self.accounts_pool_size = accounts_pool_size # 单轮注册数量上限 self.single_cycle_limit = single_cycle_limit self.accounts_db = RedisClient('accounts', self.site)
class Getter():
    """Pull proxies from every registered crawl function into the pool."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool already holds POOL_UPPER_THRESHOLD proxies."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl each source and add its proxies unless the pool is full."""
        print("获取器开始执行")
        if self.is_over_threshold():
            return
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[callback_label]
            # Fetch proxies from this crawl function.
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            for proxy in proxies:
                self.redis.add(proxy)
class Getter():
    """Feed the proxy pool from the crawler's aggregate run.

    The superseded per-callback ``run`` implementation that was kept here as
    a large commented-out block has been removed (dead code).
    """

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the pool has reached POOL_UPPER_THRESHOLD."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl new proxies and add them unless the pool is full."""
        print('开始向代理池中添加代理')
        if not self.is_over_threshold():
            proxies = self.crawler.get_proxies()
            sys.stdout.flush()
            for proxy in proxies:
                self.redis.add(proxy)
class PoolGetter(object):
    """Crawl proxy sources and store only proxies that pass validation."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True once the pool holds POOL_UPPER_THRESHOLD proxies."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def test_proxy_add(self, proxy):
        """Store *proxy* in Redis when it passes the liveness check."""
        # print("proxy: ", test_proxy_vaild(proxy))
        if not test_proxy_vaild(proxy):
            return
        # print('[+]' + proxy + "可用")
        print(Fore.GREEN + '成功获取到代理', proxy)
        self.redis.add(proxy)

    def run(self):
        """Crawl every source and validate candidates on a thread pool."""
        print("[-] 代理池获取器开始执行......")
        if self.is_over_threshold():
            return
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[callback_label]
            proxies = self.crawler.get_proxies(callback)
            # print("proxies: ", proxies)
            sys.stdout.flush()
            with ThreadPoolExecutor(ThreadCount) as pool:
                pool.map(self.test_proxy_add, proxies)
def proxy():
    """Return a random proxy endpoint, or HTTP 400 when the pool is empty."""
    ip = RedisClient().random()
    if not ip:
        return Response(response='代理池为空', status=400)
    res = ip + ':' + str(config.PORT)
    return Response(response=res, status=200)
def __init__(self, website='default'):
    """Base-class initialiser; sets up the per-site Redis stores.

    :param website: site name, used as the Redis namespace
    """
    self.website = website
    # Cookie store for this site.
    self.cookies_db = RedisClient('cookies', self.website)
    # Account/credential store for this site.
    self.accounts_db = RedisClient('accounts', self.website)
def get_cookies_from_db(website):
    """Fetch random cookies for *website* straight from the database.

    :param website: site name, all lower case
    :return: random cookies for the site as a str; callers must convert it
        to a Dict or CookieJar object before use
    """
    acc = RedisClient('cookies', website)
    return acc.random()
def _redis_init(self):
    """(Re)create the Redis client, closing any previous connection first.

    A ConnectionError raised while closing or reconnecting is logged and a
    fresh client is created anyway.
    """
    try:
        # Close a stale connection before replacing it, if one exists.
        if hasattr(self, 'redis') and self.redis:
            self.redis.close()
        self.redis = RedisClient()
    except redis.ConnectionError:
        self.redis = RedisClient()
        logger.warning("redis ConnectionError")
def __init__(self):
    """Initialise the Nanjing land-detail spider: Redis sink, duplicate
    counter, paged target URL template and a randomised User-Agent."""
    super(nanjingLandDetailSpider, self).__init__()
    # Run CloseSpider when the spider_closed signal fires.
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    self.redisClient = RedisClient('nanjing', 'LandDetail')
    # Number of duplicate URLs encountered so far.
    self.duplicateUrl = 0
    self.targetUrl = 'https://jy.landnj.cn/default.aspx?page={}'
    self.header = {'User-Agent': random.choice(agent_list)}
    # Character whitelist/pattern fragment used when cleaning scraped text.
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
def __init__(self):
    """Set up the validator's Redis client, scoring step and check targets."""
    self.redis = RedisClient()
    self.real_ip = ''
    # Points subtracted on every failed validation, sized so a proxy falls
    # from INITIAL_SCORE to DISCARD_SCORE after VALIDATE_TIME failures.
    self.minus_every_time = (INITIAL_SCORE - DISCARD_SCORE) // VALIDATE_TIME
    self.key = PROXY_ORIGINAL
    # Endpoint that echoes the caller's IP, used for anonymity checks.
    self.anon_check_url = 'http://httpbin.org/ip'
def remove_proxy(self):
    """Delete the stored proxy entry.

    :return: None
    """
    client = RedisClient()
    self.redis = client
    client.remove(CLIENT_NAME)
    print('Successfully Removed Proxy')
def __init__(self):
    """Initialise the Shanxi land-transform spider: Redis sink, duplicate
    counter, paged target URL template and a randomised User-Agent."""
    super(shanxiTransformNoticeSpider, self).__init__()
    # Run CloseSpider when the spider_closed signal fires.
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    # TODO
    self.redisClient = RedisClient('shanxi', 'shanxiTransformResult')
    # Number of duplicate URLs encountered so far.
    self.duplicateUrl = 0
    self.targetUrl = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index_{}.shtml'
    self.header = {'User-Agent': random.choice(agent_list)}
    # Character whitelist/pattern fragment used when cleaning scraped text.
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
def __init__(self): self.server = RedisClient() # 区别对待使用密码和不使用密码的配置模板 if settings.USE_PASSWORD: self.peer_conf = "cache_peer %s parent %s 0 weighted-round-robin weight=2\n" # self.peer_conf = "cache_peer %s parent %s 0 no-query proxy-only login={}:{} never_direct allow all round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n".format( # settings.USERNAME, settings.PASSWORD) else: self.peer_conf = "cache_peer %s parent %s 0 weighted-round-robin weight=2\n"
def save_accounts():
    """Persist every configured username/password pair into Redis.

    :return: None
    """
    for website, accounts in ACCOUNTS.items():
        store = RedisClient('accounts', website)
        store.set_many(accounts)
        print('%s的所有账号已保存成功...' % website)
def set_proxy(self, proxy):
    """Persist *proxy* as the current proxy.

    :param proxy: proxy address to store
    :return: None
    """
    client = RedisClient()
    self.redis = client
    if client.set(CLIENT_NAME, proxy):
        print('Successfully Set Proxy', proxy)
def record(key):
    """Store the caller's IP address under *key* when the key is authorised."""
    if key not in config.KEYS:
        return 'Invalid Key'
    ip = request.remote_addr
    print(ip)
    RedisClient().put(key, ip)
    return 'Successfully saved: {}'.format(ip)
def set_proxy(self, proxy):
    """Persist *proxy* under this client's configured name.

    :param proxy: proxy address to store
    :return: None
    """
    client = RedisClient()
    self.redis = client
    if client.set(self.CLIENT_NAME, proxy):
        logger.info(f'Successfully set proxy {proxy}')
def __init__(self):
    """Initialise the Longyan land-transform spider: Redis sink, duplicate
    counter, paged target URL template and a randomised User-Agent."""
    super(longyanTransformNoticeSpider, self).__init__()
    # Run CloseSpider when the spider_closed signal fires.
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    # TODO
    self.redisClient = RedisClient('longyan', 'longyanTransformNotice')
    # Number of duplicate URLs encountered so far.
    self.duplicateUrl = 0
    # TODO
    self.targetUrl = 'https://www.lyggzy.com.cn/lyztb/tdky/084002/?pageing={}'
    self.header = {'User-Agent': random.choice(agent_list)}
    # Character whitelist/pattern fragment used when cleaning scraped text.
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
def main():
    """Report the Redis queue length and run the scheduler."""
    s = Scheduler()
    print('程序开始运行。。')
    redisClient = RedisClient()
    # flag = True
    # while flag:
    redis_len = redisClient.llen('employment')
    print('redis队列长度:' + str(redis_len))
    # NOTE(review): llen() never returns a negative value, so this condition
    # is always true — confirm whether ``> 0`` was intended.
    if redis_len >= 0:
        s.run()
def __init__(self, website='default'):
    """Base-class initialiser; sets up Redis stores and the browser.

    :param website: site name, used as the Redis namespace
    """
    self.website = website
    # Cookie store for this site.
    self.cookies_db = RedisClient('cookies', self.website)
    # Account/credential store for this site.
    self.accounts_db = RedisClient('accounts', self.website)
    # Start up the browser used for login automation.
    self.init_browser()
def __init__(self):
    """Initialise the Zhengzhou land-transform spider: Redis sink, duplicate
    counter, paged target URL template and a randomised User-Agent."""
    super(zhengzhouLandTransformNoticeSpider, self).__init__()
    # Run CloseSpider when the spider_closed signal fires.
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    # TODO
    self.redisClient = RedisClient('zhengzhou', 'zhengzhouLandTransformNotice')
    # Number of duplicate URLs encountered so far.
    self.duplicateUrl = 0
    self.targetUrl = 'http://zzland.zhengzhou.gov.cn/xycrgg/index_{}.jhtml'
    self.header = {'User-Agent': random.choice(agent_list)}
    # Character whitelist/pattern fragment used when cleaning scraped text.
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
def __init__(self):
    """Initialise the Hefei land-supply spider: Redis sink, duplicate
    counter, paged target URL template and a randomised User-Agent."""
    super(hefeiLandSupplySpider, self).__init__()
    # Run CloseSpider when the spider_closed signal fires.
    dispatcher.connect(self.CloseSpider, signals.spider_closed)
    # TODO
    self.redisClient = RedisClient('hefei', 'hefeiLandSupply')
    # Number of duplicate URLs encountered so far.
    self.duplicateUrl = 0
    # TODO
    self.targetUrl = 'http://ggzy.hefei.gov.cn/hftd/tdgy/?Paging={}'
    self.header = {'User-Agent': random.choice(agent_list)}
    # Character whitelist/pattern fragment used when cleaning scraped text.
    self.reStr = '()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
class Checker(object):
    """Periodically test stored proxies and evict repeat offenders."""

    def __init__(self):
        self.db = RedisClient()
        # Consecutive failure count per proxy.
        self.counts = defaultdict(int)

    def check(self, proxy):
        """Test one proxy against TEST_URL.

        :param proxy: proxy address
        :return: True on HTTP 200; falsy (None/False) otherwise
        """
        try:
            response = requests.get(settings.TEST_URL, proxies={
                'http': 'http://' + proxy,
                'https': 'https://' + proxy
            }, timeout=settings.TEST_TIMEOUT)
            logger.debug(f'Using {proxy} to test {settings.TEST_URL}...')
            if response.status_code == 200:
                return True
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        """Test every proxy once; remove any that has failed
        TEST_MAX_ERROR_COUNT times."""
        proxies = self.db.all()
        logger.info(f'Try to get all proxies {proxies}')
        for name, proxy in proxies.items():
            if not self.check(proxy):
                logger.info(f'Proxy {proxy} invalid')
                self.counts[proxy] += 1
                count = self.counts[proxy]
                logger.debug(
                    f'Count {count}, TEST_MAX_ERROR_COUNT {settings.TEST_MAX_ERROR_COUNT}'
                )
                # BUG FIX: the removal threshold was previously evaluated on
                # the *valid* branch, so persistently failing proxies were
                # never evicted; check it where failures are counted.
                if count >= settings.TEST_MAX_ERROR_COUNT:
                    self.db.remove(name)
            else:
                logger.info(f'Proxy {proxy} valid')
                # A successful check resets the failure streak.
                self.counts[proxy] = 0

    def loop(self):
        """Run checks forever, sleeping TEST_CYCLE seconds between rounds."""
        while True:
            logger.info('Check for infinite')
            self.run()
            logger.info(f'Tested, sleeping for {settings.TEST_CYCLE}s...')
            time.sleep(settings.TEST_CYCLE)
def delete_account(site, type, username):
    """Delete *username* from the given store, retrying up to 5 times.

    :param site: site namespace
    :param type: store type, e.g. 'accounts' or 'cookies'
    :param username: account name to delete
    """
    conn = RedisClient(type, site)
    num = 0
    while num < 5:
        result = conn.delete(username)
        if result:
            print('删除 {} 成功! '.format(username))
            # BUG FIX: previously ``break``, which fell through to the
            # failure message below even after a successful delete. ``return``
            # matches the sibling set_account() behaviour.
            return
        num += 1
        time.sleep(1)
    print('删除失败, 请手动删除! ')
class Getter(object):
    """Fetch crawled proxy IPs and push them into Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def run(self):
        """Run the crawler once and store every IP it returns."""
        print("开始抓取代理ip")
        for ip in self.crawler.run():
            self.redis.add(ip)
def set_account(site, type, account, sep=' '):
    """Parse ``username<sep>value`` and store it, retrying up to 5 times.

    :param site: site namespace
    :param type: store type, e.g. 'accounts'
    :param account: credential string in ``username<sep>value`` form
    :param sep: separator between username and value
    """
    conn = RedisClient(type, site)
    # Generalisation: split only on the first separator so values that
    # themselves contain the separator (e.g. passwords with spaces) work
    # instead of raising ValueError on unpacking.
    username, value = account.split(sep, 1)
    num = 0
    while num < 5:
        result = conn.set(username, value)
        if result:
            print('{}--{} 录入成功! '.format(username, value))
            return
        num += 1
        time.sleep(1)
    print('录入失败, 请检查 redis 内存是否已满, 尝试手动录入! ')
class Getter:
    """Collect proxies from each registered crawl function."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """True once the pool size reaches POOL_UPPER_THRESHOLD."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl every source and add its proxies unless the pool is full."""
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        for idx in range(self.crawler.__CrawlFuncCount__):
            func = self.crawler.__CrawlFunc__[idx]
            proxies = self.crawler.get_proxies(func)
            sys.stdout.flush()
            for proxy in proxies:
                self.redis.add(proxy)
def __init__(self, font_key, url=None, imgSize=(0, 0), imgMode='RGB',
             bg_color=(0, 0, 0), fg_color=(255, 255, 255), fontsize=30):
    """Configure glyph rendering options, fetch the font and set up clients.

    :param font_key: key identifying the obfuscated font file
    :param url: explicit font URL; defaults to one derived from font_key
    :param imgSize: canvas size; (0, 0) means size to fit the rendered text
    :param imgMode: PIL image mode
    :param bg_color: background colour
    :param fg_color: text colour
    :param fontsize: point size used for rendering
    """
    self.imgSize = imgSize
    self.imgMode = imgMode
    self.fontsize = fontsize
    self.bg_color = bg_color
    self.fg_color = fg_color
    self.font_key = font_key
    # ``make_url`` is presumably a property, so this falls back to the
    # URL derived from font_key when no explicit url is given.
    self.url = url or self.make_url
    # Download the font and build rendering objects from it.
    self.get_ttl()
    self.client = AipClient(APP_ID, API_KEY, SECRET_KEY, REDIS_URL)
    self.r = RedisClient(REDIS_URL)
def verify_cookie(cls):
    """Build an instance from a working cookie.

    Validates the cached cookie against weibo.cn; on HTTP failure the
    cookie's score is bumped and verification recurses until a usable
    cookie is found, logging in to create one when the cache is empty.
    """
    baseurl = 'https://weibo.cn/'
    conn = RedisClient()
    if conn.get():
        # print(conn.get())
        try:
            response = requests.get(baseurl, cookies=conn.get())
            # print(response.text)
            if response.status_code == 200:
                return cls(cookie=conn.get())
            else:
                # Penalise the bad cookie and retry with another one.
                conn.add_score(conn.get())
                return cls(cookie=Spider.verify_cookie())
        except Exception:
            # NOTE(review): a request failure only prints and implicitly
            # returns None — confirm callers tolerate a None result.
            print('verify error')
    else:
        # No cookie cached yet: log in, persist cookies, then retry.
        l = Login()
        l.save_cookies()
        return cls(cookie=Spider.verify_cookie())
def save_cookies(self):
    """Log in, then persist the resulting cookie to Redis."""
    self.login()
    RedisClient().add(self._cookie)
def __init__(self):
    """Create the Redis client and the proxy crawler."""
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self, appid, api_key, secrrt_key, redis_url):
    """Create the Baidu OCR client and its Redis cache.

    :param appid: Baidu AIP application id
    :param api_key: Baidu AIP API key
    :param secrrt_key: Baidu AIP secret key (name is a typo for
        "secret_key", kept to avoid breaking keyword callers)
    :param redis_url: connection URL for the Redis cache
    """
    self.appid = appid
    self.api_key = api_key
    self.secrrt_key = secrrt_key
    self.client = AipOcr(appid, api_key, secrrt_key)
    self.redis = RedisClient(redis_url)
class TycTTF():
    """Decode an obfuscated Tianyancha web font by rendering each glyph to an
    image and OCR-ing it, with Redis-backed caching of results."""

    # Instance cache keyed by the first positional constructor argument.
    _instance = {}

    def __init__(self, font_key, url=None, imgSize=(0, 0), imgMode='RGB',
                 bg_color=(0, 0, 0), fg_color=(255, 255, 255), fontsize=30):
        """Configure rendering options, fetch the font and set up clients.

        :param font_key: key identifying the obfuscated font file
        :param url: explicit font URL; defaults to one derived from font_key
        :param imgSize: canvas size; (0, 0) means size to fit the text
        :param imgMode: PIL image mode
        :param bg_color: background colour
        :param fg_color: text colour
        :param fontsize: point size used for rendering
        """
        self.imgSize = imgSize
        self.imgMode = imgMode
        self.fontsize = fontsize
        self.bg_color = bg_color
        self.fg_color = fg_color
        self.font_key = font_key
        self.url = url or self.make_url
        self.get_ttl()
        self.client = AipClient(APP_ID, API_KEY, SECRET_KEY, REDIS_URL)
        self.r = RedisClient(REDIS_URL)

    def __new__(cls, url, *args, **kw):
        """Pseudo-singleton: reuse one instance per first positional argument.

        NOTE(review): the parameter is named ``url`` but, given __init__'s
        signature, callers pass ``font_key`` first — the cache is keyed by
        that value; confirm the naming.
        """
        if url not in cls._instance:
            cls._instance[url] = super().__new__(cls)
        return cls._instance[url]

    @property
    def make_url(self):
        """Derive the font's download URL from its key."""
        return 'https://static.tianyancha.com/fonts-styles/fonts/%s/%s/tyc-num.woff' % (self.font_key[:2], self.font_key)

    def get_ttl(self):
        """Download the font and build PIL/fontTools objects plus the set of
        characters the font defines."""
        res = requests.get(self.url)
        # PIL font object, used to render glyphs to images.
        self.font = ImageFont.truetype(BytesIO(res.content), self.fontsize)
        # fontTools object, used to enumerate the glyph code points.
        self.ttf = TTFont(BytesIO(res.content))
        # Reverse-map every code point in the font to its character.
        self.strings = {hex(string).replace('0x', '\\u').encode('utf-8').decode('unicode-escape')
                        if string > 2**8
                        else hex(string).replace('0x', '\\x').encode('utf-8').decode('unicode-escape')
                        for string in self.ttf.getBestCmap().keys()}

    def GenLetterImage(self, letters: str):
        """Render *letters* centred on a fresh image (``self.img``)."""
        self.letters = letters
        (self.letterWidth, self.letterHeight) = self.font.getsize(letters)
        if self.imgSize == (0, 0):
            # Pad the measured text size by 10px in each dimension.
            self.imgSize = (self.letterWidth + 10, self.letterHeight + 10)
        self.imgWidth, self.imgHeight = self.imgSize
        self.img = Image.new(self.imgMode, self.imgSize, self.bg_color)
        self.drawBrush = ImageDraw.Draw(self.img)
        textY0 = (self.imgHeight - self.letterHeight + 1) / 2
        textY0 = int(textY0)
        textX0 = int((self.imgWidth - self.letterWidth + 1) / 2)
        self.drawBrush.text((textX0, textY0), self.letters, fill=self.fg_color, font=self.font)

    def _orc(self, word: str):
        """Render *word* as an image and send it to the OCR backend."""
        # image = pretreat_image(self.img)
        self.GenLetterImage(word)
        img = ImageBytes()
        self.img.save(img, 'JPEG')
        if word in {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'x'}:
            # Digits recognise better with the English-only model.
            kwarg = {'language_type': 'ENG'}
        else:
            # Everything else uses the mixed Chinese/English model.
            kwarg = {'language_type': 'CHN_ENG'}
        return self.client.run(img.img, self.font_key, word, **kwarg)

    def orc(self, word: str):
        """Return the OCR result for *word*, consulting the cache first."""
        # BUG FIX: the existence check previously used ``self.url`` as the
        # hash key while the read used ``self.font_key``; results are stored
        # under ``font_key`` (see AipClient.hadd calls), so both sides must
        # use the same key or cached entries are never found.
        if self.r.hexists(self.font_key, word):
            return self.r.hget(self.font_key, word).decode('utf-8')
        return self._orc(word)

    def run(self, word: str):
        """Translate every obfuscated glyph in *word*; pass other characters
        through unchanged."""
        string = ''
        for letter in word:
            if letter in self.strings:
                string += self.orc(letter)
            else:
                string += letter
        return string
class AipClient(object):
    '''
    Baidu OCR API client (singleton) with Redis caching of results.
    '''

    def __init__(self, appid, api_key, secrrt_key, redis_url):
        # "secrrt_key" is a typo for secret_key, kept to avoid breaking
        # keyword callers.
        self.appid = appid
        self.api_key = api_key
        self.secrrt_key = secrrt_key
        self.client = AipOcr(appid, api_key, secrrt_key)
        self.redis = RedisClient(redis_url)

    def __new__(cls, *args, **kw):
        '''
        Singleton: every construction returns the same instance.
        '''
        if not hasattr(cls, '_instance'):
            cls._instance = super().__new__(cls)
        return cls._instance

    @property
    def options(self):
        # Shared recognition options for both General and Accurate calls.
        return {"language_type": "CHN_ENG",
                "detect_direction": "false",
                "detect_language": "false",
                "probability": "false"}

    def General(self, image, **kwargs):
        """Call the basic (cheaper) OCR endpoint."""
        print('调取General_api 识别')
        return self.client.basicGeneral(image, self.options)

    def Accurate(self, image):
        """Call the high-accuracy OCR endpoint (used as a fallback)."""
        print('调取Accurate_api 识别')
        return self.client.basicAccurate(image, self.options)

    def orc(self, image, font_key, word, **kwargs):
        """Run OCR on *image*, caching by image hash and under the font hash.

        Tries General first, then Accurate; returns '*' when neither
        produces a usable result.
        """
        hash_value = MD5.md5(image)
        results = self.General(image, **kwargs)
        if results.get('words_result'):
            # NOTE(review): comparing the whole result list to '*' looks
            # always-true; confirm whether a per-word check was intended.
            if results.get('words_result') != '*':
                result = results['words_result'][0]['words']
                self.redis.add(hash_value, result)
                self.redis.hadd(font_key, word, result)
                return result
        results = self.Accurate(image)
        if results.get('words_result'):
            if results.get('words_result') != '*':
                result = results['words_result'][0]['words']
                self.redis.add(hash_value, result)
                self.redis.hadd(font_key, word, result)
                return result
        # Image.open(BytesIO(image)).show()
        # print(hash_value)
        return '*'

    def run(self, image, font_key, word, **kwargs):
        """Return a cached OCR result for *image*, or compute and cache one."""
        hash_value = MD5.md5(image)
        if self.redis.exists(hash_value):
            result = self.redis.get(hash_value)
            # Also record the result under the font hash for glyph lookups.
            self.redis.hadd(font_key, word, result)
            return result
        else:
            return self.orc(image, font_key, word, **kwargs)