class ProxyRefresher:
    """Scheduled proxy refresher.

    Collects candidate proxies from the configured getter functions into the
    raw queue, then moves the candidates that pass validation into the
    useful queue.
    """

    def __init__(self):
        self._pm = ProxyManager()
        self.log = LogHandler('proxy_refresher')

    def fetch_all_proxy(self):
        """Run every configured getter and store its proxies in the raw queue.

        Getter names are read from ``config.proxy_getter_functions`` and
        resolved as attributes of ``GetFreeProxy``; each getter is called
        with ``None``. An exception raised while running one getter is
        logged and the loop moves on to the next getter.

        :return: None
        """
        for getter_name in config.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=getter_name))
                fetcher = getattr(GetFreeProxy, getter_name.strip())
                for candidate in fetcher(None):
                    candidate = candidate.strip()
                    if not (candidate and verify_proxy_format(candidate)):
                        # Empty or malformed entry: report it and keep going.
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=getter_name, proxy=candidate))
                        continue
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=candidate))
                    # Stored directly with no de-duplication in code; per the
                    # original author's note the backing hash structure is
                    # expected to de-duplicate entries by itself.
                    self._pm.db.change_table(self._pm.raw_proxy_queue)
                    self._pm.db.put(candidate)
            except Exception as e:
                self.log.error("{func}: fetch proxy fail, {e}".format(
                    func=getter_name, e=e))

    def _pop_raw_proxy(self):
        """Switch the db to the raw queue and pop the next candidate."""
        self._pm.db.change_table(self._pm.raw_proxy_queue)
        return self._pm.db.pop()

    def validate_raw_proxy(self):
        """Validate the proxies in the raw queue.

        Every candidate popped from the raw queue that is not already
        returned by ``self._pm.get_all()`` and that passes
        ``valid_useful_proxy`` is put into the useful queue. The loop ends
        when ``pop()`` returns a falsy value.

        :return: None
        """
        candidate = self._pop_raw_proxy()
        self.log.info('ProxyRefresher: %s start validProxy' % time.ctime())
        # Snapshot of the proxies already known, used to skip re-validation.
        known_proxies = self._pm.get_all()
        while candidate:
            if candidate in known_proxies or not valid_useful_proxy(candidate):
                self.log.info(
                    'ProxyRefresher: %s validation fail' % candidate)
            else:
                self._pm.db.change_table(self._pm.useful_proxy_queue)
                self._pm.db.put(candidate)
                self.log.info(
                    'ProxyRefresher: %s validation pass' % candidate)
            candidate = self._pop_raw_proxy()
            # Refresh the snapshot so it reflects anything just stored.
            known_proxies = self._pm.get_all()
        self.log.info('ProxyRefresher: %s validProxy complete' % time.ctime())
def testLogHandler():
    """Smoke-test LogHandler (Util/LogHandler).

    Emits one message at error level under the name ``test``, then renames
    the handler twice via ``resetName`` and emits one message after each
    rename (warning, then info).

    :return: None
    """
    logger = LogHandler('test')
    logger.error('this is a log from test')

    # (new logger name, level method to call, message to emit)
    steps = (
        ('test1', 'warning', 'this is a log from test1'),
        ('test2', 'info', 'this is a log from test2'),
    )
    for new_name, level, message in steps:
        logger.resetName(name=new_name)
        # Look the method up after the rename, as the original did.
        getattr(logger, level)(message)
class ProxyManager(object):
    """Facade over the proxy store.

    Fetches proxies through the ``GetFreeProxy`` getter functions and
    reads, counts and deletes stored proxies through ``self.client``.
    """

    def __init__(self):
        self.client = db.DBclient()
        self.log = LogHandler('proxy_manager')

    def fetch(self):
        """Run every getter function and store each new, well-formed proxy.

        Getter names come from ``GetFunctions().proxy_get_functions``.
        Entries that are empty, fail ``verifyProxyFormat`` or were already
        seen during this call are logged and skipped. An exception raised
        while running one getter is logged and the next getter is tried.
        """
        seen = set()
        self.log.info(u'代理抓取: start')
        getters = GetFunctions()
        for getter_name in getters.proxy_get_functions:
            self.log.info('Get Proxy - {}: start'.format(getter_name))
            try:
                for address in getattr(GetFreeProxy, getter_name.strip())():
                    address = address.strip()
                    if not (address and verifyProxyFormat(address)):
                        self.log.error('Get Proxy - {}: {} error'.format(
                            getter_name, address))
                        continue
                    if address in seen:
                        self.log.info('Get Proxy - {}: {} is exist'.format(
                            getter_name, address))
                        continue
                    self.log.info('Get Proxy - {}: {} success'.format(
                        getter_name, address))
                    self.client.put(Proxy(address, source=getter_name))
                    # Only remembered once the store accepted it.
                    seen.add(address)
            except Exception as e:
                self.log.error('Get Proxy - {}: error'.format(getter_name))
                self.log.error(str(e))

    def get(self):
        """Return one randomly chosen stored proxy, or None if none exist."""
        stored = self.client.getAll()
        if not stored:
            return None
        return Proxy.newProxyFromJson(random.choice(stored))

    def getAll(self):
        """Return every stored proxy as a list of ``Proxy`` objects."""
        return [
            Proxy.newProxyFromJson(item) for item in self.client.getAll()
        ]

    def getCount(self):
        """Return whatever ``self.client.getCount()`` reports."""
        return self.client.getCount()

    def delete(self, proxy_key):
        """Delete the stored proxy identified by ``proxy_key``."""
        self.client.delete(proxy_key)