class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_arrive_threshold(self):
        '''
        Check whether the number of proxies stored in the database has reached the configured threshold.
        :return:
        '''
        return self.redis.count() >= THRESHOLD

    def run(self):
        print('Getter Run...')
        if not self.is_arrive_threshold():  # only fetch more proxies while below the threshold
            # call every crawl_* method registered on the Crawler class
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
import json
import time

import requests


def download_proxies():
    conn = RedisClient(name='enterprise_proxies')
    url = 'http://svip.kuaidaili.com/api/getproxy'
    params = {
        'orderid': '979397309945634',
        'num': 20,
        'quality': 2,
        'format': 'json'
    }
    content = requests.get(url, params=params).json()
    for proxy in content['data']['proxy_list']:
        proxies = {
            'http': 'http://%s' % proxy,
            'https': 'http://%s' % proxy,
        }
        ping_url = 'http://www.baidu.com'
        # route the ping through the candidate proxy so the proxy itself is actually verified
        try:
            status_code = requests.get(ping_url, proxies=proxies, timeout=5).status_code
        except requests.RequestException:
            continue
        if status_code == 200:
            p = json.dumps(proxies)
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            if not conn.exist(p):
                conn.set(p, 1)
                conn.lpush(p)
                print(now, ' New proxies: ', p)
            else:
                print(now, ' already exist proxies: ', p)
import asyncio
import time

import aiohttp


class Tester:
    def __init__(self):
        self.redis = RedisClient()

    async def test_proxy(self, proxy):
        '''
        Test a single proxy.
        :param proxy: the proxy to test
        :return:
        '''
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('UTF-8')
                test_proxy = 'http://' + proxy
                print('Testing:', proxy)
                async with session.get(TEST_URL, proxy=test_proxy, timeout=5) as resp:
                    if resp.status == 200:
                        self.redis.max(proxy)  # proxy works: set its score to MAX_SCORE
                    else:
                        self.redis.decrease(proxy)  # proxy failed: decrease its score by 1
            except (AttributeError, asyncio.TimeoutError, aiohttp.ClientError):
                self.redis.decrease(proxy)  # proxy failed: decrease its score by 1

    def run(self):
        '''
        Main test entry point.
        :return:
        '''
        print('Start testing...')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # test the proxies in batches of BATCH_SIZE
            for i in range(0, len(proxies), BATCH_SIZE):
                test_proxies = proxies[i:i + BATCH_SIZE]
                tasks = [self.test_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.gather(*tasks))
                time.sleep(5)
        except Exception as e:
            print('Tester error', e.args)
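# A minimal scheduler sketch showing how Getter and Tester above are typically
# driven together; the GETTER_CYCLE/TESTER_CYCLE intervals are assumptions, not
# values taken from the snippets in this section.
import time

GETTER_CYCLE = 300  # seconds between crawl rounds (hypothetical)
TESTER_CYCLE = 20   # seconds between test rounds (hypothetical)


def schedule_pool():
    getter, tester = Getter(), Tester()
    last_get = 0
    while True:
        if time.time() - last_get >= GETTER_CYCLE:
            getter.run()   # top up the pool only when it is below THRESHOLD
            last_get = time.time()
        tester.run()       # re-score every proxy currently stored in Redis
        time.sleep(TESTER_CYCLE)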
class RandomHttpsProxyMiddleware(object):

    def __init__(self):
        '''
        Connect to the proxy pool.
        '''
        self.db = RedisClient('useful_proxy', '111.231.255.225', 6379)

    def process_request(self, request, spider):
        '''
        Pick the matching proxy table depending on whether the request is HTTP or HTTPS.
        :param request:
        :param spider:
        :return:
        '''
        if request.url.startswith('https'):
            self.db.changeTable('useful_proxy_https')
            request.meta['proxy'] = self.db.get()
        else:
            self.db.changeTable('useful_proxy')
            request.meta['proxy'] = self.db.get()
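# A sketch of enabling the middleware in a Scrapy project's settings.py; the
# 'myproject.middlewares' module path is an assumption about where the class
# above lives, and 543 is just a conventional priority slot.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomHttpsProxyMiddleware': 543,
}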
def __init__(self):
    self.redis = RedisClient()
def get_conn():
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis
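# A minimal sketch of calling get_conn() from a Flask view; the '/random' route
# and the RedisClient.random() method are assumptions used only for illustration.
from flask import Flask, g

app = Flask(__name__)


@app.route('/random')
def random_proxy():
    conn = get_conn()    # one RedisClient per request context, cached on g
    return conn.random() # hypothetical: return a single usable proxy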
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self):
    self.r = RedisClient(DB_Config)
class ProxyService(object):

    def __init__(self):
        self.r = RedisClient(DB_Config)

    def get(self, ptype, n=1):
        n = 10 if n > 10 else n
        ptype = ptype.lower()
        if ptype in PROXY_TYPES:
            return self.r.get(PROXY + ptype, n)
        return []

    def delete(self, ptype, v):
        ptype = ptype.lower()
        if ptype in PROXY_TYPES:
            return self.r.delRecord(PROXY + ptype, v)

    def refresh(self):
        self.__fetch()
        self.__validate()
        self.__flush()
        return 'ok'

    def __fetch(self):
        """ feed proxy:uncheck set """
        for _cr in Crawlers:
            _crpath = '.'.join(['ProxyCrawler', _cr])
            logging.info("import crawler: %s" % _crpath)
            Crawler = getattr(__import__(_crpath, fromlist=[_cr]), _cr)
            crawler = Crawler()
            crawler.run()
            for item in crawler.items:
                self.r.add(UNCHECK, item)

    def __validate(self):
        """ proxy:uncheck set -> proxy:checked set """
        def __callback(item):
            self.r.add(CHECKED, item)

        total = self.r.len(UNCHECK)
        logging.info("need to validate %d proxies" % total)
        limit = 1000
        for offset in xrange(0, total, limit):
            events = []
            for _ in xrange(limit):
                item = self.r.pop(UNCHECK)
                if not item:
                    break
                events.append(gevent.spawn(validate, item, __callback))
            gevent.joinall(events)

    def __flush(self):
        """ proxy:checked set -> proxy:{http, https, socks5} set """
        # clean all
        for _type in PROXY_TYPES:
            self.r.delTable(PROXY + _type)
        item = self.r.pop(CHECKED)
        while item:
            prefix, proxy = item.split(PROXY_SEP, 1)
            self.r.add(PROXY + prefix, proxy)
            item = self.r.pop(CHECKED)
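# A minimal usage sketch, assuming 'http' is one of PROXY_TYPES and Redis is
# reachable via DB_Config: refresh() crawls, validates and buckets proxies,
# after which get() serves up to 10 proxies of the requested type.
service = ProxyService()
service.refresh()
print(service.get('http', n=5))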
def __init__(self):
    '''
    Connect to the proxy pool.
    '''
    self.db = RedisClient('useful_proxy', '111.231.255.225', 6379)
def __init__(self):
    '''
    Connect to the proxy pool.
    '''
    self.db = RedisClient('useful_proxy', '', 6379)
def __init__(self):
    self.logger = LogHandler('ProxyManger')
    self.dbClient = RedisClient()
    self.config = GetConfig()
    self.orignal_proxy_name, self.useful_proxy_name = self.config.dbName
class ProxyManager(object):

    def __init__(self):
        self.logger = LogHandler('ProxyManger')
        self.dbClient = RedisClient()
        self.config = GetConfig()
        self.orignal_proxy_name, self.useful_proxy_name = self.config.dbName

    def refresh(self):
        for proxyGetFunc in self.config.proxyGetter:
            # proxy_set = set()
            for proxy in getattr(ProxyGetter, proxyGetFunc.strip())():
                if proxy:
                    self.logger.info("{func} fetch proxy {proxy}".format(func=proxyGetFunc, proxy=proxy))
                    # proxy_set.add(proxy.strip())
                    self.dbClient.lput(self.orignal_proxy_name, proxy)

    def get(self):
        """
        Get one usable proxy from the useful-proxy pool.
        :return: one useful proxy
        """
        return self.dbClient.sgetOne(self.useful_proxy_name)[0]

    def getAll(self):
        return self.dbClient.sgetAll(self.useful_proxy_name)

    def spop(self):
        """
        Pop a random proxy from the useful-proxy pool and delete it.
        """
        return self.dbClient.spop(self.useful_proxy_name)

    def pop(self):
        """
        Take one proxy from the raw proxy pool and delete it.
        :return: one original proxy
        """
        return self.dbClient.rpop(self.orignal_proxy_name)

    def put(self, value):
        """
        Save a validated proxy into the useful-proxy pool.
        :param value:
        :return:
        """
        self.dbClient.sput(self.useful_proxy_name, value)

    def delete(self, value):
        """
        Delete a proxy from the useful-proxy pool.
        :return:
        """
        self.dbClient.sdeleteValue(self.useful_proxy_name, value)

    def getStatus(self):
        return self.dbClient.sgetStatues()
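# A minimal usage sketch, assuming the config file and Redis connection used by
# GetConfig/RedisClient are already in place.
manager = ProxyManager()
manager.refresh()            # pull proxies from every configured getter into the raw pool
print(manager.getStatus())   # pool sizes (exact format depends on RedisClient.sgetStatues)
print(manager.get())         # one proxy from the useful-proxy pool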