def fetch(self):
    """
    Collect proxies from every configured getter into the raw proxy table.

    Each name in ``config.proxy_getter_functions`` is looked up on
    ``GetFreeProxy`` and called. Every value it yields is stripped, checked
    with ``verifyProxyFormat`` and compared against the proxies already
    accepted during this call; new ones are stored through ``self.db.put``
    wrapped in a ``Proxy`` tagged with the getter name.

    A getter that raises is logged (message, exception text and traceback)
    and the loop moves on to the next getter.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    seen = set()
    self.log.info("ProxyFetch : start")
    for getter_name in config.proxy_getter_functions:
        self.log.info("ProxyFetch - {func}: start".format(func=getter_name))
        try:
            getter = getattr(GetFreeProxy, getter_name.strip())
            for candidate in getter():
                candidate = candidate.strip()
                # Fixed-width form used only to keep the log columns aligned.
                padded = candidate.ljust(20)

                if not (candidate and verifyProxyFormat(candidate)):
                    self.log.error(
                        'ProxyFetch - {func}: {proxy} illegal'.format(
                            func=getter_name, proxy=padded))
                    continue

                if candidate in seen:
                    self.log.info(
                        'ProxyFetch - {func}: {proxy} exist'.format(
                            func=getter_name, proxy=padded))
                    continue

                self.log.info(
                    'ProxyFetch - {func}: {proxy} success'.format(
                        func=getter_name, proxy=padded))
                self.db.put(Proxy(candidate, source=getter_name))
                seen.add(candidate)
        except Exception as err:
            self.log.error(
                "ProxyFetch - {func}: error".format(func=getter_name))
            self.log.error(str(err))
            self.log.error(traceback.format_exc())
def refresh(self):
    """
    Fetch proxies from every configured getter and queue the unseen ones.

    For each name in ``self.config.proxy_getter_functions`` the matching
    ``GetFreeProxy`` callable is run to completion. Its results are stripped,
    validated with ``verifyProxyFormat`` and de-duplicated in a set. Every
    proxy for which ``self.db.exists`` is false on the useful-proxy table is
    then written to the raw-proxy table.

    A getter that raises is logged together with the exception text and
    skipped; the remaining getters still run.

    :return: None
    """
    for proxyGetter in self.config.proxy_getter_functions:
        # fetch
        try:
            self.log.info("{func}: fetch proxy start".format(func=proxyGetter))
            fetched = list(getattr(GetFreeProxy, proxyGetter.strip())())
        except Exception as e:
            self.log.error("{func}: fetch proxy fail".format(func=proxyGetter))
            # Keep the cause in the log; without it a failing getter cannot
            # be diagnosed.
            self.log.error(str(e))
            continue

        proxy_set = set()
        for proxy in fetched:
            proxy = proxy.strip()
            if proxy and verifyProxyFormat(proxy):
                self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
                proxy_set.add(proxy)
            else:
                self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy))

        # store
        for proxy in proxy_set:
            # The existence check is made against the useful table while the
            # write targets the raw table, so the active table is switched
            # around each operation.
            self.db.changeTable(self.useful_proxy_queue)
            if self.db.exists(proxy):
                continue
            self.db.changeTable(self.raw_proxy_queue)
            self.db.put(proxy)
def refresh(self):
    """
    Scrape proxies from the configured sites and store them in the raw table.

    Every name in ``config.proxy_getter_functions`` is resolved on
    ``GetFreeProxy`` (ProxyGetter/getFreeProxy.py) and called. Each yielded
    proxy is stripped and validated with ``verifyProxyFormat``; valid ones
    are split on ":" into host and port and stored via ``self.db.put`` with
    a JSON payload holding ``host``, ``port`` and ``max_conn``.

    A getter that raises is logged and skipped.

    :return: None
    """
    max_conn = 100
    self.db.changeTable(self.raw_proxy_queue)
    for getter_name in config.proxy_getter_functions:
        # fetch
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=getter_name))
            getter = getattr(GetFreeProxy, getter_name.strip())
            for address in getter():
                # Store the proxy directly: no de-duplication is done in
                # code because the hash structure de-duplicates by itself.
                address = address.strip()
                if not (address and verifyProxyFormat(address)):
                    self.log.error(
                        '{func}: fetch proxy {proxy} error'.format(
                            func=getter_name, proxy=address))
                    continue

                self.log.info('{func}: fetch proxy {proxy}'.format(
                    func=getter_name, proxy=address))
                host, port = address.split(":")
                payload = {"host": host, "port": port, "max_conn": max_conn}
                self.db.put(address, json.dumps(payload))
        except Exception as err:
            self.log.error(err)
            self.log.error(
                "{func}: fetch proxy fail".format(func=getter_name))
def checkAllGetProxyFunc():
    """
    Run every proxy getter of GetFreeProxy and report how many proxies each yields.

    The getters are the members of ``GetFreeProxy`` selected by
    ``inspect.isfunction``. Each one is called and the values accepted by
    ``verifyProxyFormat`` are counted; a getter that raises is logged and
    reported with a count of 0.

    Returns:
        None
    """
    import inspect

    getters = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
    counts = {}
    for getter_name, getter in getters:
        log.info(u"开始运行 {}".format(getter_name))
        try:
            # getter() produces "ip:port" strings; count only well-formed ones.
            counts[getter_name] = sum(
                1 for item in getter() if verifyProxyFormat(item))
        except Exception as err:
            log.info(u"代理获取函数 {} 运行出错!".format(getter_name))
            log.error(str(err))

    log.info(u"所有函数运行完毕 " + "***" * 5)
    for getter_name, _ in getters:
        log.info(u"函数 {n}, 获取到代理数: {c}".format(
            n=getter_name, c=counts.get(getter_name, 0)))
def refresh(self):
    """
    Fetch proxies from every configured getter into the raw proxy table.

    Each name in ``config.proxy_getter_functions`` is resolved on
    ``GetFreeProxy`` and called; every yielded proxy is stripped, validated
    with ``verifyProxyFormat`` and, when valid, stored with ``self.db.put``.

    A getter that raises is logged together with the exception text and
    skipped; the remaining getters still run.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    for proxyGetter in config.proxy_getter_functions:
        # fetch
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=proxyGetter))
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Store proxies one by one to speed up pushes to the raw
                # queue, which in turn speeds up proxy checking.
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=proxyGetter, proxy=proxy))
                    self.db.put(proxy)
                else:
                    self.log.error(
                        '{func}: fetch proxy {proxy} error'.format(
                            func=proxyGetter, proxy=proxy))
        except Exception as e:
            self.log.error(
                "{func}: fetch proxy fail".format(func=proxyGetter))
            # Keep the cause in the log; without it a failing getter cannot
            # be diagnosed.
            self.log.error(str(e))
def refresh(self):
    """
    Fetch proxies into the raw table using a user-defined getter class.

    The getter class is loaded with ``self.__dynamic_import__`` from
    ``config.proxy_getter_lib``. Each name in
    ``config.proxy_getter_functions`` is then resolved on that class and
    called; every yielded proxy is stripped, validated with
    ``verifyProxyFormat`` and, when valid, stored with ``self.db.put``.

    A getter that raises is logged together with the exception text and
    skipped; the remaining getters still run.

    :return: None
    :raises Exception: when the getter class cannot be imported
    """
    self.db.changeTable(self.raw_proxy_queue)
    try:
        proxy_getter_class = self.__dynamic_import__(
            config.proxy_getter_lib)
    except Exception as e:
        # Log the underlying import error before raising the generic one so
        # the real cause is not lost.
        self.log.error(str(e))
        raise Exception('%s not found in ProxyGetter' %
                        config.proxy_getter_lib)

    for proxyGetter in config.proxy_getter_functions:
        # fetch
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=proxyGetter))
            for proxy in getattr(proxy_getter_class, proxyGetter.strip())():
                # Store the proxy directly: no de-duplication is done in
                # code because the hash structure de-duplicates by itself.
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=proxyGetter, proxy=proxy))
                    self.db.put(proxy)
                else:
                    self.log.error(
                        '{func}: fetch proxy {proxy} error'.format(
                            func=proxyGetter, proxy=proxy))
        except Exception as e:
            self.log.error(
                "{func}: fetch proxy fail".format(func=proxyGetter))
            # Keep the cause in the log; without it a failing getter cannot
            # be diagnosed.
            self.log.error(str(e))
def refresh(self):
    """
    Fetch proxies from every configured getter into the raw proxy table.

    Each name in ``self.config.proxy_getter_functions`` is resolved on
    ``GetFreeProxy`` and called; every yielded proxy is stripped, validated
    with ``verifyProxyFormat`` and, when valid, stored with ``self.db.put``
    as soon as it is produced.

    A getter that raises is logged together with the exception text and
    skipped; the remaining getters still run.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    for proxyGetter in self.config.proxy_getter_functions:
        # fetch
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=proxyGetter))
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=proxyGetter, proxy=proxy))
                    self.db.put(proxy)
                else:
                    self.log.error(
                        '{func}: fetch proxy {proxy} error'.format(
                            func=proxyGetter, proxy=proxy))
        except Exception as e:
            self.log.error(
                "{func}: fetch proxy fail".format(func=proxyGetter))
            # Keep the cause in the log; without it a failing getter cannot
            # be diagnosed.
            self.log.error(str(e))
def refresh(self):
    """
    Fetch proxies from ProxyGetter/getFreeProxy.py into the raw proxy table.

    Each name in ``config.proxy_getter_functions`` is resolved on
    ``GetFreeProxy`` and called; every yielded proxy is stripped, validated
    with ``verifyProxyFormat`` and, when valid, stored with ``self.db.put``.

    A getter that raises is logged together with the exception text and
    skipped; the remaining getters still run.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    for proxyGetter in config.proxy_getter_functions:
        # fetch
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=proxyGetter))
            for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                # Store the proxy directly: no de-duplication is done in
                # code because the hash structure de-duplicates by itself.
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(
                        func=proxyGetter, proxy=proxy))
                    self.db.put(proxy)
                else:
                    self.log.error(
                        '{func}: fetch proxy {proxy} error'.format(
                            func=proxyGetter, proxy=proxy))
        except Exception as e:
            self.log.error(
                "{func}: fetch proxy fail".format(func=proxyGetter))
            # Keep the cause in the log; without it a failing getter cannot
            # be diagnosed.
            self.log.error(str(e))
async def fetch_spys_one(self):
    """
    Store the proxies returned by ``scrape_spys_one`` in the raw proxy table.

    The awaited result is skipped entirely when it is ``None``. Otherwise
    each entry is stripped, checked with ``verifyProxyFormat`` and compared
    against the entries already accepted in this call; new ones are stored
    through ``self.db.put`` as a ``Proxy`` tagged "fetch_spys_one".

    Any exception is logged (message, exception text and traceback) and not
    re-raised.

    :return: None
    """
    source = "fetch_spys_one"
    seen = set()
    self.db.changeTable(self.raw_proxy_queue)
    self.log.info("ProxyFetch : start")
    self.log.info("ProxyFetch - {func}: start".format(func=source))
    try:
        scraped = await scrape_spys_one()
        if scraped is not None:
            for candidate in scraped:
                candidate = candidate.strip()
                # Fixed-width form used only to keep the log columns aligned.
                padded = candidate.ljust(20)

                if not (candidate and verifyProxyFormat(candidate)):
                    self.log.error('ProxyFetch - {func}: {proxy} illegal'.format(func=source, proxy=padded))
                    continue

                if candidate in seen:
                    self.log.info('ProxyFetch - {func}: {proxy} exist'.format(func=source, proxy=padded))
                    continue

                self.log.info('ProxyFetch - {func}: {proxy} success'.format(func=source, proxy=padded))
                self.db.put(Proxy(candidate, source=source))
                seen.add(candidate)
    except Exception as err:
        self.log.error("ProxyFetch - {func}: error".format(func=source))
        self.log.error(str(err))
        self.log.error(traceback.format_exc())
def checkGetProxyFunc(func):
    """
    Run one getFreeProxy function, logging and saving the valid proxies.

    Every value produced by ``func()`` that passes ``verifyProxyFormat`` is
    logged and appended, one per line, to ``proxies.txt`` in the current
    working directory.

    Args:
        func: a callable from getFreeProxy returning an iterable of proxy
            strings; its ``__name__`` (or "None") labels the log lines.

    Returns:
        None
    """
    func_name = getattr(func, '__name__', "None")
    log.info("start running func: {}".format(func_name))
    count = 0
    # save
    path = "proxies.txt"
    # The context manager closes the file even when the getter or the
    # validator raises part-way through.
    with open(path, 'a') as f:
        for proxy in func():
            if verifyProxyFormat(proxy):
                log.info("{} fetch proxy: {}".format(func_name, proxy))
                f.write(proxy)
                f.write('\n')
                count += 1
    log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
def fetch(self):
    """
    Take one fetcher name from the queue, run it and record its statistics.

    The fetcher class is obtained from ``FetcherManager.get_class`` and its
    ``run()`` output is iterated. A proxy is saved with
    ``proxy_manager.saveRawProxy`` only when, after stripping, it is
    non-empty, passes ``verifyProxyFormat`` and is reported by neither
    ``checkRawProxyExists`` nor ``checkUsefulProxyExists``; everything else
    is counted as skipped. An exception while running the fetcher is logged
    and counted as one failure.

    Afterwards the queue item is marked done, the counters are passed to
    ``ConfigManager.fetcher_config.update_stat`` and a summary is logged.

    :return: None
    """
    started = time.time()
    total = succ = fail = skip = 0

    fetcher_name = self.queue.get()
    fetcher_class = FetcherManager.get_class(fetcher_name)
    log.debug("fetch [{fetcher_name}] proxy start".format(
        fetcher_name=fetcher_name))

    try:
        for proxy in fetcher_class().run():
            proxy = proxy.strip()
            is_new = (
                proxy
                and verifyProxyFormat(proxy)
                and not proxy_manager.checkRawProxyExists(proxy)
                and not proxy_manager.checkUsefulProxyExists(proxy)
            )
            if is_new:
                proxy_manager.saveRawProxy(proxy)
                succ += 1
                outcome = "fetch [{fetcher_name}] proxy {proxy} succ"
            else:
                skip += 1
                outcome = "fetch [{fetcher_name}] proxy {proxy} skip"
            log.debug(outcome.format(fetcher_name=fetcher_name, proxy=proxy))
            total += 1
    except Exception as err:
        log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
            fetcher_name=fetcher_name, error=err))
        fail += 1

    elapsed_time = int(time.time() - started)
    self.queue.task_done()

    stat = {"total": total, "succ": succ, "fail": fail, "skip": skip}
    ConfigManager.fetcher_config.update_stat(fetcher_name, stat)

    summary = (
        "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, "
        "fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
    )
    log.info(summary.format(fetcher_name=fetcher_name,
                            total=total,
                            succ=succ,
                            fail=fail,
                            skip=skip,
                            elapsed_time=elapsed_time))
def fetch(self):
    """
    Take one fetcher record from the queue, run it and update its statistics.

    The queued item is indexed by ``"name"`` and ``"interval"``. The fetcher
    class comes from ``FetcherManager.getFetcherClass`` and its ``run()``
    output is iterated. A proxy is saved with
    ``ProxyManager.proxy_manager.saveUsefulProxy`` when, after stripping, it
    is non-empty, passes ``verifyProxyFormat`` and is not reported by
    ``checkUsefulProxyExists``; everything else is counted as skipped. An
    exception while running the fetcher is logged and counted as one failure.

    Afterwards the queue item is marked done, the counters and the next
    fetch time are sent to ``updateFetcher`` and a summary is logged.

    :return: None
    """
    start_time = time.time()
    total = 0
    succ = 0
    fail = 0
    skip = 0

    fetcher = self.queue.get()
    name = fetcher["name"]

    fetcher_class = FetcherManager.getFetcherClass(name)
    log.debug("fetch [{name}] proxy start".format(name=name))
    try:
        f = fetcher_class()
        for proxy in f.run():
            proxy = proxy.strip()
            if proxy and verifyProxyFormat(proxy) and \
                    not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):
                ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                succ = succ + 1
                log.debug("fetch [{name}] proxy {proxy} succ".format(
                    name=name, proxy=proxy))
            else:
                skip = skip + 1
                log.debug("fetch [{name}] proxy {proxy} skip".format(
                    name=name, proxy=proxy))
            total = total + 1
    except Exception as e:
        log.error("fetch [{name}] proxy fail: {error}".format(name=name,
                                                              error=e))
        fail = fail + 1

    self.queue.task_done()

    now = int(time.time())
    elapsed_time = int(now - start_time)

    # NOTE(review): the next run is scheduled from self.start_time (set
    # outside this method), not from the local start_time -- confirm that
    # this is intended.
    next_fetch_time = self.start_time + (fetcher["interval"] * 60)

    data = {
        "$inc": {
            "succ": succ,
            "fail": fail,
            "skip": skip,
            "total": total,
        },
        "$set": {
            "next_fetch_time": next_fetch_time,
        }
    }
    ProxyManager.proxy_manager.updateFetcher(name, data)

    # Adjacent literals are joined at compile time, so the message no longer
    # picks up source whitespace from a backslash continuation inside the
    # string.
    log.info(
        "fetch [{name:^15}] proxy finish, "
        "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, "
        "elapsed_time:{elapsed_time}s".format(
            name=name, total=total, succ=succ, fail=fail, skip=skip,
            elapsed_time=elapsed_time))
def checkGetProxyFunc(func):
    """
    Run a single proxy getter and log every well-formed proxy it produces.

    Args:
        func: callable returning an iterable of proxy values; its
            ``__name__`` (or the string "None" when it has none) labels the
            log lines.

    Returns:
        None
    """
    label = getattr(func, '__name__', 'None')
    log.info("start running func: {}".format(label))

    accepted = 0
    # Lazy filter: each value is validated right before it is logged.
    well_formed = (item for item in func() if verifyProxyFormat(item))
    for accepted, item in enumerate(well_formed, 1):
        log.info("{} fetch proxy: {}".format(label, item))

    log.info("{n} completed, fetch proxy number: {c}".format(n=label, c=accepted))
def checkGetProxyFunc(func):
    """
    Check how one specific getFreeProxy function behaves when run.

    Every value produced by ``func()`` that passes ``verifyProxyFormat`` is
    logged, and the number of such values is logged at the end.

    Args:
        func: a callable from getFreeProxy

    Returns:
        None
    """
    name = getattr(func, '__name__', "None")
    log.info("start running func: {}".format(name))

    hits = 0
    for candidate in func():
        if not verifyProxyFormat(candidate):
            continue
        log.info("{} fetch proxy: {}".format(name, candidate))
        hits += 1

    log.info("{n} completed, fetch proxy number: {c}".format(n=name, c=hits))
def checkGetProxyFunc(func):
    """
    Check how one specific getFreeProxy function behaves when run.

    Every value produced by ``func()`` that passes ``verifyProxyFormat`` is
    printed, followed by a final line with the number of such values.

    :param func: callable returning an iterable of proxy values
    :return: None
    """
    label = getattr(func, '__name__', 'None')
    print(u'开始运行函数:{}'.format(label))

    found = 0
    # Lazy filter: each value is validated right before it is printed.
    valid_items = (item for item in func() if verifyProxyFormat(item))
    for found, item in enumerate(valid_items, 1):
        print(u'{} 获取到代理: {}'.format(label, item))

    print(u'函数 {n} 运行完毕,获取到 {c} 个代理。'.format(n=label, c=found))
def checkAllGetProxyFunc():
    """
    Run every method of GetFreeProxy and log how many valid proxies each yields.

    The members are selected with ``inspect.ismethod``. Each one is called
    and the values accepted by ``verifyProxyFormat`` are counted; a member
    that raises is logged and reported with a count of 0.

    :return: None
    """
    import inspect

    # NOTE(review): on Python 3, ismethod matches only bound methods (for a
    # class, that means classmethods); plain functions and staticmethods are
    # not selected -- confirm this predicate matches how the getters are
    # declared.
    members = inspect.getmembers(GetFreeProxy, predicate=inspect.ismethod)
    counts = {}
    for member_name, member in members:
        log.info(u"开始运行 {}".format(member_name))
        try:
            counts[member_name] = sum(
                1 for item in member() if verifyProxyFormat(item))
        except Exception as err:
            log.info(u"代理获取函数 {} 运行出错!".format(member_name))
            log.error(str(err))

    log.info(u"所有函数运行完毕 " + "***" * 50)
    for member_name, _ in members:
        log.info(u"函数 {n},获取到代理输: {c}".format(
            n=member_name, c=counts.get(member_name, 0)))
def refresh(self):
    """
    Fetch proxies from every configured getter into the raw proxy table.

    Each name in ``config.proxy_getter_functions`` is resolved on
    ``GetFreeProxy`` and called. A yielded ``False`` is ignored without
    logging; any other value is stored with ``self.db.put`` when it is
    truthy and passes ``verifyProxyFormat``, and logged as an error
    otherwise. Values are not stripped before validation.

    A getter that raises is logged with the exception text and skipped.

    :return: None
    """
    self.db.changeTable(self.raw_proxy_queue)
    for getter_name in config.proxy_getter_functions:
        try:
            self.log.info(
                "{func}: fetch proxy start".format(func=getter_name))
            getter = getattr(GetFreeProxy, getter_name.strip())
            for item in getter():
                if item is False:
                    # A literal False is skipped silently.
                    continue
                if item and verifyProxyFormat(item):
                    self.log.info("{func}: fetch proxy {proxy}".format(
                        func=getter_name, proxy=item))
                    self.db.put(item)
                else:
                    self.log.error(
                        "{func}: fetch proxy {proxy} error".format(
                            func=getter_name, proxy=item))
        except Exception as exc:
            self.log.error("refresh: {}".format(exc))
            self.log.error(
                "{func}: fetch proxy fail".format(func=getter_name))
def checkAllGetProxyFunc(self):
    """
    Run every proxy getter of GetFreeProxy and print how many proxies each yields.

    The getters are the members of ``GetFreeProxy`` selected by
    ``inspect.isfunction``. Each one is called and the values accepted by
    ``verifyProxyFormat`` are counted; a getter that raises has its error
    printed and is reported with a count of 0.

    :return: None
    """
    import inspect

    getters = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
    tally = {}
    for getter_name, getter in getters:
        print(u'开始运行{}'.format(getter_name))
        try:
            tally[getter_name] = sum(
                1 for item in getter() if verifyProxyFormat(item))
        except Exception as err:
            print(u'代理获取函数 {} 运行出错!'.format(getter_name))
            print(str(err))

    print(u'所有函数运行完毕' + ' ******' * 5)
    for getter_name, _ in getters:
        print(u'函数 {n}, 获取到代理 {c}'.format(
            n=getter_name, c=tally.get(getter_name, 0)))
def refresh(self):
    """
    Run every getter listed under "ProxyGetter" and save the new raw proxies.

    The getter names come from ``config.cf.options("ProxyGetter")``. Each is
    resolved on ``GetFreeProxy`` and called. A stripped proxy is saved with
    ``self.saveRawProxy`` when it is non-empty, passes ``verifyProxyFormat``
    and ``self.checkRawProxyExists`` returns false for it; every other proxy
    is counted as a failure and logged as an error. A per-getter summary is
    logged after its proxies are processed.

    A getter that raises is logged with the exception and skipped (no
    summary line is written for it).

    :return: None
    """
    for getter_name in config.cf.options("ProxyGetter"):
        try:
            log.info(
                "Fetch Proxy Start, func:{func}".format(func=getter_name))

            total = succ = fail = 0
            getter = getattr(GetFreeProxy, getter_name.strip())
            for proxy in getter():
                proxy = proxy.strip()
                rejected = (
                    not proxy
                    or not verifyProxyFormat(proxy)
                    or self.checkRawProxyExists(proxy)
                )
                if rejected:
                    fail += 1
                    log.error('{func}: fetch proxy {proxy} error'.format(
                        func=getter_name, proxy=proxy))
                else:
                    self.saveRawProxy(proxy)
                    succ += 1
                    log.debug('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=proxy))
                total += 1

            summary = "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
            log.info(summary.format(
                func=getter_name, total=total, succ=succ, fail=fail))
        except Exception as err:
            log.error(
                "func_name:{func_name} fetch proxy fail, error:{error}".format(
                    func_name=getter_name, error=err))
# test_batch(gg.freeProxySeventh())
# test_batch(gg.freeProxyEight())
# test_batch(gg.freeProxyNinth())
# test_batch(gg.freeProxyTen())
# test_batch(gg.freeProxyEleven())

# Manual check of a single getter: collect its distinct, well-formed proxies
# and print them.
proxy_iter = gg.freeProxyTwelve()
stripped = (raw.strip() for raw in proxy_iter)
# Entries that are empty or fail the format check are dropped without logging.
proxy_set = {item for item in stripped if item and verifyProxyFormat(item)}

# store
for proxy in proxy_set:
    print(proxy)

# test_batch(gg.freeProxyTwelve())
# test_batch(gg.freeProxyWallFirst())
# test_batch(gg.freeProxyWallSecond())