import time


class Getter:
    def __init__(self, website='tianyancha'):
        """Initialize the database clients and the cookie crawler."""
        self.website = website
        self.redis = RedisClient('accounts', self.website)
        self.crawler = Crawler()
        self.accounts_db = RedisClient('accounts', self.website)

    def is_over_threshold(self):
        """Check whether the database is already full."""
        if self.redis.count() >= POOL_UPPER_THRESHLD:
            return True
        return False

    def run(self):
        """Crawl cookies and store them in the database."""
        accounts_usernames = self.accounts_db.usernames()
        keys = self.redis.get()
        for username in accounts_usernames[:]:
            if username not in keys:
                password = self.accounts_db.get_value(username)
                print('Generating cookies for account', username, 'password', password)
                if not self.is_over_threshold():
                    try:
                        time.sleep(5)
                        cookie = self.crawler.crawl_main(username, password)
                        if cookie:
                            self.redis.add(username, cookie)
                            print('Cookie is valid')
                        else:
                            print('Got an empty cookie')
                    except Exception:
                        pass
            else:
                print('Account', username, 'is already in the cookie pool')

import random
import time

from lxml import etree


class Crawl_ip(object):
    def __init__(self):
        self.db = RedisClient()

    def ip_xici(self):
        url = 'http://www.xicidaili.com/'
        con = get_page(url)
        html = etree.HTML(con)
        ip_list = html.xpath('//tr/td[2]/text()')
        ip_port = html.xpath('//tr/td[3]/text()')
        for i in range(100):
            ip = ip_list[i] + ':' + ip_port[i]
            self.db.add(ip)

    def ip_66(self):
        preurl = 'http://www.66ip.cn/'
        for page in range(100):
            url = preurl + str(page) + '.html'
            con = get_page(url)
            if con:
                html = etree.HTML(con)
                rows = html.xpath('//tr')
                # Skip the two header rows, then join the ip and port cells.
                for i in range(2, len(rows)):
                    ip = rows[i].xpath('td[1]/text()')[0] + ':' + rows[i].xpath('td[2]/text()')[0]
                    self.db.add(ip, 10)
            # Throttle requests: sleep 0.5-1.5 seconds between pages.
            intr = random.randint(5, 15)
            time.sleep(intr * 0.1)

    def run(self):
        self.ip_66()
        self.ip_xici()

class Getter(object): """ 根据阈值判断是否爬取 """ def __init__(self): self.crawler = Crawler() self.redis = RedisClient() def is_over_threshold(self): """ 判断是否达到了代理池限制 """ return self.redis.count() >= POOL_UPPER_THRESHOLD def run(self): if not self.is_over_threshold(): logging.info('Getter now running') old_count = self.redis.count() # 调用爬虫方法 for callback in self.crawler.__CrawlFunc__: # 爬取数据 proxies = self.crawler.get_proxies(callback) for proxy in proxies: self.redis.add(proxy) new_count = self.redis.count() # 未爬取到数据 if old_count == new_count and len(proxies) == 0: logging.error('%s function can\'t crawl new proxy' % callback) elif old_count != new_count: logging.info('%s crawl %d proxies' % (callback, (new_count - old_count))) old_count = new_count
class Getter(): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断是否达到了代理池限制 """ if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False def run(self): ''' 运行 :return: ''' print('获取器开始执行') if not self.is_over_threshold(): print(self.crawler.__CrawlFuncCount__) for callback_label in range(self.crawler.__CrawlFuncCount__): callback = self.crawler.__CrawlFunc__[callback_label] # 获取代理 proxies = self.crawler.get_proxies(callback) #强制刷新缓冲区 sys.stdout.flush() print('存储代理') for proxy in proxies: self.redis.add(proxy)
class Getter(): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断是否达到了代理池限制 """ if self.redis.count() >= POOL_UPPER_THRESHOLD: return True else: return False ## def run(self): ## print('获取器开始执行') ## if not self.is_over_threshold(): ## for callback_label in range(self.crawler.__CrawlFuncCount__): ## callback = self.crawler.__CrawlFunc__[callback_label] ## # 获取代理 ## proxies = self.crawler.get_proxies(callback) ## sys.stdout.flush() ## for proxy in proxies: ## self.redis.add(proxy) def run(self): print('开始向代理池中添加代理') if not self.is_over_threshold(): proxies = self.crawler.get_proxies() sys.stdout.flush() for proxy in proxies: self.redis.add(proxy)
import sys
from concurrent.futures import ThreadPoolExecutor

from colorama import Fore


class PoolGetter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its capacity limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def test_proxy_add(self, proxy):
        """Check that the proxy works; if so, add it to Redis."""
        if test_proxy_vaild(proxy):
            # print('[+]' + proxy + ' works')
            print(Fore.GREEN + 'Got a working proxy', proxy)
            self.redis.add(proxy)

    def run(self):
        print('[-] Proxy pool getter is running......')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Fetch proxies.
                proxies = self.crawler.get_proxies(callback)
                # Flush the output buffer.
                sys.stdout.flush()
                with ThreadPoolExecutor(ThreadCount) as pool:
                    pool.map(self.test_proxy_add, proxies)

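# test_proxy_vaild is defined elsewhere in this project. A minimal sketch of
# such a check, assuming a plain HTTP GET routed through the proxy; the test
# URL and timeout are placeholders, not values from the original code.
import requests

TEST_URL = 'http://httpbin.org/ip'  # placeholder test endpoint


def test_proxy_vaild(proxy):
    """Return True if a request routed through the proxy succeeds."""
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        resp = requests.get(TEST_URL, proxies=proxies, timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False
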
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its capacity limit.

        :return:
        """
        if self.redis.count() > POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """Start running.

        :return:
        """
        print('Getter is running')
        if not self.is_over_threshold():
            print('1')  # debug trace
            # __CrawlFuncCount__ is the number of crawl functions;
            # __CrawlFunc__ is a list holding their names.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                print('2')
                # callback is the name of one crawl method on the Crawler class.
                callback = self.crawler.__CrawlFunc__[callback_label]
                print('3')
                proxies = self.crawler.get_proxies(callback)
                print('4')
                for proxy in proxies:
                    print('5')
                    self.redis.add(proxy)

import sys
from concurrent.futures import ThreadPoolExecutor

from colorama import Fore


class PoolGetter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def test_proxy_add(self, proxy):
        if test_proxy_vaild(proxy):
            print(Fore.GREEN + 'Got a working proxy IP', proxy)
            self.redis.add(proxy)

    def run(self):
        print('[-] Proxy pool getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                with ThreadPoolExecutor(ThreadCount) as pool:
                    pool.map(self.test_proxy_add, proxies)

class Getter(): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """ 判断是否达到代理池数量限制 :return : 返回判断结果 """ if self.redis.count() >= PROXIES_THRESHOLD: return True else: return False def run(self): """ 从各大网站中获取代理 """ # print('获取器开始执行!') for callback_label in range(self.crawler.__CrawlFuncCount__): # 获取Crawler里面的以 crawl 开头的函数 callback = self.crawler.__CrawlFunc__[callback_label] # 判断是否达到代理池数量限制 if not self.is_over_threshold(): # 运行函数获取代理 proxies_list = self.crawler.get_proxies(callback) sys.stdout.flush() # 添加代理到数据库 for proxy in proxies_list: self.redis.add(proxy) else: print(callback, '代理池代理数量已满,不再爬取代理!') print('代理爬取完毕,当前代理池代理总数:', self.redis.count())
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # Check whether the proxy pool has reached its upper limit.
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            # At or above the limit.
            return True
        else:
            return False

    # Main entry point.
    def run(self):
        print('Starting to fetch proxies')
        if not self.is_over_threshold():
            # Using the function count collected by the Crawler class,
            # call each crawl function in turn to fetch proxies.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # callback is the name of one crawl method on the Crawler class.
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Crawler.get_proxies runs the method and returns the crawled
                # proxies as a list.
                proxies = self.crawler.get_proxies(callback)
                # Save each crawled proxy to Redis.
                for proxy in proxies:
                    # Adds the proxy to the 'proxies' sorted set with an initial score.
                    self.redis.add(proxy)

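# The RedisClient used throughout is defined elsewhere. A minimal sketch of
# the sorted-set version the comment above describes; the key name, initial
# score, and connection parameters are assumptions.
import redis

REDIS_KEY = 'proxies'  # assumed sorted-set key
INITIAL_SCORE = 10     # assumed starting score for a new proxy


class RedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self.db = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        """Add a proxy with an initial score if it is not already present."""
        if not self.db.zscore(REDIS_KEY, proxy):
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def count(self):
        """Return the number of proxies in the pool."""
        return self.db.zcard(REDIS_KEY)
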
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawl = Crawler()

    def is_over_proxy(self):
        """Check whether the proxy pool has reached its limit.

        :return:
        """
        if self.redis.count() >= max_proxy:
            return True
        else:
            return False

    def run(self):
        print('Getter is running >>>>>>>>>>')
        if not self.is_over_proxy():
            print(1)  # debug trace
            for callback_label in range(self.crawl.__CrawlFuncCount__):
                print(2)
                callback = self.crawl.__CrawlFunc__[callback_label]
                print(3)
                proxies = self.crawl.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter(object): def __init__(self): self.redis = RedisClient() self.crawler = Crawler() def run(self): print("开始抓取代理ip") ips_list = self.crawler.run() for ip in ips_list: self.redis.add(ip)
from aip import AipOcr


class AipClient(object):
    """Client for the Baidu OCR API."""

    def __init__(self, appid, api_key, secret_key, redis_url):
        self.appid = appid
        self.api_key = api_key
        self.secret_key = secret_key
        self.client = AipOcr(appid, api_key, secret_key)
        self.redis = RedisClient(redis_url)

    def __new__(cls, *args, **kw):
        """Make the API client a singleton."""
        if not hasattr(cls, '_instance'):
            cls._instance = super().__new__(cls)
        return cls._instance

    @property
    def options(self):
        return {
            'language_type': 'CHN_ENG',
            'detect_direction': 'false',
            'detect_language': 'true',
            'probability': 'true',
        }

    def General(self, image, **kwargs):
        print('Calling the basic General API')
        return self.client.basicGeneral(image, self.options)

    def Accurate(self, image):
        print('Calling the high-accuracy Accurate API')
        return self.client.basicAccurate(image, self.options)

    def ocr(self, image, **kwargs):
        hash_value = MD5.md5(image)
        # Try the basic endpoint first, then fall back to the accurate one.
        results = self.General(image, **kwargs)
        if results.get('words_result'):
            self.redis.add(hash_value, results['words_result'][0]['words'])
            return results['words_result'][0]['words']
        results = self.Accurate(image)
        if results.get('words_result'):
            self.redis.add(hash_value, results['words_result'][0]['words'])
            return results['words_result'][0]['words']
        return '*'

    def run(self, image, **kwargs):
        # Serve cached results from Redis, keyed by the image hash.
        hash_value = MD5.md5(image)
        if self.redis.exists(hash_value):
            return self.redis.get(hash_value)
        else:
            return self.ocr(image, **kwargs)

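# A usage sketch for the client above; the credentials, Redis URL, and image
# path are placeholders, not real values.
client = AipClient('your-appid', 'your-api-key', 'your-secret-key',
                   'redis://localhost:6379/0')

with open('captcha.png', 'rb') as f:
    image = f.read()

# The first call hits the OCR API; repeat calls for the same image are
# served from the Redis cache keyed by the image's MD5 hash.
print(client.run(image))
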
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running!')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # getattr is a safer equivalent of the original
                # eval('self.crawler.{}()'.format(callback)).
                for proxy in getattr(self.crawler, callback)():
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER:
            return True
        else:
            return False

    def run(self):
        print('Starting crawlers')
        if not self.is_over_threshold():
            for callback_func in self.crawler.__CrawlFunc__:
                proxies = self.crawler.get_proxies(callback_func)
                for proxy in proxies:
                    self.redis.add(proxy)

class Sync(object):
    """Synchronize proxies between MySQL and Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.pool = ProxyPool.objects.filter(is_exsist=True)
        # ProxyPool.objects.filter(proxy='').delete()

    def sync_start(self):
        # Push MySQL rows into Redis.
        for item in self.pool:
            proxy = item.proxy
            score = item.score
            self.redis.add(proxy, score, mysql_save=False)
        # Push Redis entries back into MySQL.
        for proxy in self.redis.all():
            self.redis.mysql_add(proxy)

class Getter(object):
    def __init__(self):
        self.db = RedisClient()
        self.crawl = Crawler()

    def is_over_threshold(self):
        if self.db.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if not self.is_over_threshold():
            func_names = self.crawl.__FUNC__
            for func_name in func_names:
                proxies = self.crawl.get_proxies(func_name)
                for proxy in proxies:
                    self.db.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawl = Crawler()

    def is_over_limit(self):
        if self.redis.count() > MAX_POOL:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_limit():
            for name in range(self.crawl.__CrawlFuncCount__):
                proxies = self.crawl.get_proxies(self.crawl.__CrawlFunc__[name])
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.got_proxie = Got_Proxie()

    # Check whether the number of proxies has reached the limit.
    def is_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Starting')
        # Only fetch if the pool has not reached the limit.
        if not self.is_threshold():
            proxies = self.got_proxie.got_proxie_ip66()
            for proxy in proxies:
                self.redis.add(proxy)

import sys


class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # Check whether the proxy pool has reached its capacity limit.
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

import sys


class Getter():
    def __init__(self):
        """Instantiate a Redis client and a Crawler."""
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its capacity limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        # Skip fetching when the pool already holds enough proxies.
        if not self.is_over_threshold():
            # crawler.__CrawlFuncCount__ is the number of crawl-prefixed methods.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # crawler.__CrawlFunc__ is the list of crawl method names.
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Call the method to fetch proxies; callback names one
                # concrete crawl function.
                proxies = self.crawler.get_proxies(callback)
                # In Python, output goes through sys.stdout, which is buffered:
                # content is not written on every call, but only when the
                # buffer is flushed. sys.stdout.flush() forces the buffered
                # content out explicitly.
                sys.stdout.flush()
                # Add each proxy to Redis.
                for proxy in proxies:
                    self.redis.add(proxy)

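# These Getter classes are usually driven on a timer so the pool keeps
# refilling. A minimal scheduler sketch; the cycle length is an assumption.
import time

GETTER_CYCLE = 20  # assumed interval in seconds between runs


def schedule_getter(cycle=GETTER_CYCLE):
    """Run the getter periodically."""
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle)
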
class Getter: def __init__(self): """初始化数据库类和代理爬虫类""" self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """判断数据库是否已经存满""" if self.redis.count() >= POOL_UPPER_THRESHLD: return True return False def run(self): """开始抓取各个代理网站的免费代理存入数据库""" if not self.is_over_threshold(): for i in range(self.crawler.CrawlFuncCount): crawl_func = self.crawler.CrawlFunc[i] proxies = self.crawler.get_proxies(crawl_func) for proxy in proxies: print('获取:', proxy) self.redis.add(proxy)
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            proxies = self.crawler.run()
            for ip in proxies:
                self.redis.add(ip)
                print('Fetched', ip)
        print('Done; fetched a total of', self.redis.count())
        for i in self.redis.all():
            print(i, 'current score', self.redis.score(i))

def set(proxy):
    """Add a proxy.

    :param proxy: the proxy to add
    """
    try:
        # Connect to Redis.
        db = RedisClient()
        # Add the proxy.
        result = db.add(proxy)
        print(proxy, 'added successfully!' if result else 'failed to add!')
    except Exception:
        pass

class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its capacity threshold.

        :return:
        """
        # Use RedisClient's count method to get the number of proxies.
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True  # The threshold has been reached.
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():  # Check whether the pool is at its threshold.
            # Iterate using the Crawler class's __CrawlFuncCount__ attribute.
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # __CrawlFunc__ holds the names of all crawl-prefixed methods.
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Call each one in turn through get_proxies.
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:  # For each proxy the method crawled...
                    self.redis.add(proxy)  # ...add it to the database via RedisClient.add.

class Saver:
    """Crawl proxies and store them in the Redis database."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the pool already holds enough proxies."""
        if self.redis.count() >= settings.proxy_enough_count:
            return True
        else:
            return False

    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for crawl_func in self.crawler.crawl_funcs:
                proxies = self.crawler.get_proxies(crawl_func)
                for proxy in proxies:
                    self.redis.add(proxy)

import sys


class Fetcher:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its size limit."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Fetcher is running')
        if not self.is_over_threshold():
            for func in self.crawler.get_funclist():
                # Fetch proxy addresses from each proxy-IP site.
                proxies = self.crawler.get_proxies(func)
                sys.stdout.flush()
                for proxy in proxies:
                    # Push the fetched proxy into the Redis queue.
                    self.redis.add(proxy)

class Getter: def __init__(self): """Initializing databases class and spider class""" self.redis = RedisClient() self.crawler = Crawler() def is_over_threshold(self): """Determine if the database if full""" if self.redis.count() >= POOL_UPPER_THRESHLD: return True return False async def run(self): print('开始获取代理...') if not self.is_over_threshold(): for i in range(self.crawler.CrawlFuncCount): crawl_func = self.crawler.CrawlFunc[i] proxies = await self.crawler.get_proxy(crawl_func) for proxy in proxies: print(proxy) self.redis.add(proxy)
import sys
import time


class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.db = RedisClient()

    def run(self):
        print('Getter is running')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[callback_label]
            # print(callback)
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            # print(len(proxies))
            for proxy in proxies:
                # print(proxy)
                if not self.db.add(proxy):
                    print('Failed to add', proxy,
                          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

class AipClient(object): ''' 百度识别api ''' def __init__(self, appid, api_key, secrrt_key, redis_url): self.appid = appid self.api_key = api_key self.secrrt_key = secrrt_key self.client = AipOcr(appid, api_key, secrrt_key) self.redis = RedisClient(redis_url) def __new__(cls, *args, **kw): ''' api 单例模式 ''' if not hasattr(cls, '_instance'): cls._instance = super().__new__(cls) return cls._instance @property def options(self): return {"language_type":"CHN_ENG", "detect_direction":"false", "detect_language":"false", "probability":"false"} def General(self, image,**kwargs): print('调取General_api 识别') return self.client.basicGeneral(image, self.options) def Accurate(self, image): print('调取Accurate_api 识别') return self.client.basicAccurate(image, self.options) def orc(self, image, font_key, word, **kwargs): hash_value = MD5.md5(image) results = self.General(image, **kwargs) if results.get('words_result'): if results.get('words_result') != '*': result = results['words_result'][0]['words'] self.redis.add(hash_value, result) self.redis.hadd(font_key, word, result) return result results = self.Accurate(image) if results.get('words_result'): if results.get('words_result') != '*': result = results['words_result'][0]['words'] self.redis.add(hash_value, result) self.redis.hadd(font_key, word, result) return result # Image.open(BytesIO(image)).show() # print(hash_value) return '*' def run(self, image, font_key,word, **kwargs): hash_value = MD5.md5(image) if self.redis.exists(hash_value): result = self.redis.get(hash_value) self.redis.hadd(font_key, word, result) return result else: return self.orc(image, font_key, word, **kwargs)
def save_cookies(self):
    self.login()
    conn = RedisClient()
    conn.add(self._cookie)