def validProxy(self):
    """Validate every proxy waiting in ``self.queue`` and promote the good ones.

    Each dequeued proxy is handled in one of three ways:

    * already present in ``self.remaining_proxies``: its raw record is
      deleted and the proxy is skipped;
    * ``validUsefulProxy`` reports a truthy HTTP result: the proxy is stored
      with ``saveUsefulProxy`` (together with the HTTPS result), its raw
      record is deleted and it is appended to ``self.remaining_proxies``;
    * otherwise ``tickRawProxyVaildFail`` is called for it.

    Every dequeued proxy is acknowledged with ``task_done`` and passed to
    ``tickRawProxyVaildTotal``.  Nothing is returned; the counters are only
    written to the log.
    """
    thread_id = threading.currentThread().ident
    log.info("thread_id:{thread_id}, Start ValidProxy `raw_proxy_queue`".format(thread_id=thread_id))

    total = 0
    succ = 0
    fail = 0

    # NOTE(review): qsize() followed by a blocking get() can stall if another
    # worker empties the queue in between -- confirm how workers share it.
    while self.queue.qsize():
        proxy = self.queue.get()

        if proxy not in self.remaining_proxies:
            (http_result, https_result) = validUsefulProxy(proxy)
            if http_result:
                self.saveUsefulProxy(proxy, https_result)
                self.deleteRawProxy(proxy)
                self.remaining_proxies.append(proxy)
                succ = succ + 1
                # The "pass" message belongs to the success path only; it was
                # previously emitted after the failure handling.
                # (tickRawProxyVaildSucc was already disabled in the original
                # code and stays disabled.)
                log.debug('ProxyRefreshSchedule: %s validation pass' % proxy)
            else:
                self.tickRawProxyVaildFail(proxy)
                fail = fail + 1
                log.debug('ProxyRefreshSchedule: %s validation fail' % proxy)
        else:
            # Already known as a remaining proxy: drop the duplicate raw record.
            self.deleteRawProxy(proxy)
            log.debug('ProxyRefreshSchedule: %s repetition, skip!' % proxy)

        self.queue.task_done()
        self.tickRawProxyVaildTotal(proxy)
        total = total + 1

    log.info('thread_id:{thread_id}, ValidProxy Complete `raw_proxy_queue`, total:{total}, succ:{succ}, fail:{fail}'.format(thread_id=thread_id, total=total, succ=succ, fail=fail))
def run(self):
    """Drain ``self.queue`` and re-check each proxy taken from it.

    Switches ``self.db`` to ``self.useful_proxy_queue`` first.  For every
    dequeued proxy the HTTP result of ``validUsefulProxy`` decides whether
    ``tickUsefulProxyVaildSucc`` or ``tickUsefulProxyVaildFail`` is called;
    afterwards the item is acknowledged with ``task_done`` and passed to
    ``tickUsefulProxyVaildTotal``.  Counters are logged, nothing is returned.
    """
    self.db.changeTable(self.useful_proxy_queue)

    worker_id = threading.currentThread().ident
    log.info("thread_id:{thread_id} useful_proxy proxy check start".format(
        thread_id=worker_id))

    checked = 0
    passed = 0
    failed = 0

    while True:
        if not self.queue.qsize():
            break

        candidate = self.queue.get()
        http_ok, _https_ok = validUsefulProxy(candidate)

        if not http_ok:
            self.tickUsefulProxyVaildFail(candidate)
            failed += 1
            log.debug(
                "ProxyCheck: {proxy} validation fail".format(proxy=candidate))
        else:
            self.tickUsefulProxyVaildSucc(candidate)
            passed += 1
            log.debug(
                "ProxyCheck: {proxy} validation pass".format(proxy=candidate))

        self.queue.task_done()
        checked += 1
        self.tickUsefulProxyVaildTotal(candidate)

    summary = 'thread_id:{thread_id} proxy check end, total:{total}, succ:{succ}, fail:{fail}'
    log.info(summary.format(thread_id=worker_id, total=checked, succ=passed, fail=failed))
def start(self):
    """Spawn ``self.run`` greenlets and wait until all of them finish.

    The number of greenlets is the configured
    ``verify_useful_proxy_concurrency``, capped at the current size of
    ``self.queue``.  ``self.stat`` is reset to zeroed ``total``/``succ``/
    ``fail`` counters before spawning, and its values plus the elapsed
    wall-clock seconds are logged once every greenlet has joined.
    """
    began_at = time.time()
    log.debug("useful_proxy proxy verify start")

    self.stat = {"total": 0, "succ": 0, "fail": 0}

    limit = ConfigManager.setting_config.setting.get(
        "verify_useful_proxy_concurrency")
    pending = self.queue.qsize()
    # Never spawn more workers than there are queued items.
    worker_count = pending if limit > pending else limit

    workers = [gevent.spawn(self.run) for _ in range(worker_count)]
    gevent.joinall(workers)

    finished_at = time.time()
    seconds = int(finished_at - began_at)
    report = 'useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
    log.info(report.format(
        total=self.stat["total"],
        succ=self.stat["succ"],
        fail=self.stat["fail"],
        elapsed_time=seconds,
    ))
def update_job_interval(self, **kwargs):
    """Re-schedule a job on an ``interval`` trigger using its configured value.

    The setting stored under ``kwargs["job_name"]`` is read from
    ``ConfigManager.setting_config.setting`` and handed to
    ``self._update_job`` as the trigger's ``minutes`` argument.

    :param kwargs: expects a ``job_name`` entry; a missing entry yields ``None``.
    :return: whatever ``self._update_job`` returns.
    """
    name = kwargs.get("job_name")
    minutes = ConfigManager.setting_config.setting.get(name)

    updated = self._update_job(name, 'interval', minutes=minutes)

    log.info("update_job_interval: {job_name}, {job}".format(job_name=name, job=updated))
    return updated
def fetch(self):
    """Run one queued fetcher and store the new proxies it yields.

    Takes one fetcher description (a mapping read through its ``"name"`` and
    ``"interval"`` keys) from ``self.queue``, instantiates the class returned
    by ``FetcherManager.getFetcherClass`` and iterates its ``run()``.  A
    stripped, non-empty proxy that passes ``verifyProxyFormat`` and is not
    yet known to ``ProxyManager.proxy_manager`` is saved as a useful proxy;
    anything else is counted as skipped.  Any exception raised while fetching
    is logged and counted as one failure.

    Afterwards the queue item is acknowledged, the counters and the next
    fetch time are written with ``updateFetcher`` and a summary is logged.
    Nothing is returned.
    """
    start_time = time.time()
    total = 0
    succ = 0
    fail = 0
    skip = 0

    fetcher = self.queue.get()
    name = fetcher["name"]
    fetcher_class = FetcherManager.getFetcherClass(name)
    log.debug("fetch [{name}] proxy start".format(name=name))

    try:
        f = fetcher_class()
        for proxy in f.run():
            proxy = proxy.strip()
            if proxy and verifyProxyFormat(proxy) and \
                    not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):
                ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                succ = succ + 1
                log.debug("fetch [{name}] proxy {proxy} succ".format(
                    name=name, proxy=proxy))
            else:
                skip = skip + 1
                log.debug("fetch [{name}] proxy {proxy} skip".format(
                    name=name, proxy=proxy))
            total = total + 1
    except Exception as e:
        # Best effort: a broken fetcher must not stop the remaining ones.
        log.error("fetch [{name}] proxy fail: {error}".format(name=name, error=e))
        fail = fail + 1

    self.queue.task_done()

    now = int(time.time())
    elapsed_time = int(now - start_time)
    # NOTE(review): this uses self.start_time (set outside this method), not
    # the local start_time; interval is presumably in minutes -- confirm.
    next_fetch_time = self.start_time + (fetcher["interval"] * 60)

    data = {
        "$inc": {
            "succ": succ,
            "fail": fail,
            "skip": skip,
            "total": total,
        },
        "$set": {
            "next_fetch_time": next_fetch_time,
        }
    }
    ProxyManager.proxy_manager.updateFetcher(name, data)

    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation leaked a stray backslash / source indentation into the
    # emitted log message.
    summary = (
        "fetch [{name:^15}] proxy finish, "
        "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
    )
    log.info(summary.format(name=name, total=total, succ=succ, fail=fail,
                            skip=skip, elapsed_time=elapsed_time))
def fetch(self):
    """Run the fetcher named by the next queue item and save its new proxies.

    The fetcher class is looked up with ``FetcherManager.get_class``.  Each
    yielded proxy is stripped; it is saved as a raw proxy when it is
    non-empty, passes ``verifyProxyFormat`` and is known neither as a raw nor
    as a useful proxy.  Everything else counts as skipped.  An exception
    during fetching is logged and counted as one failure.  Finally the queue
    item is acknowledged, the counters go to
    ``ConfigManager.fetcher_config.update_stat`` and a summary is logged.
    """
    began_at = time.time()
    seen = 0
    saved = 0
    errors = 0
    skipped = 0

    fetcher_name = self.queue.get()
    fetcher_cls = FetcherManager.get_class(fetcher_name)
    log.debug("fetch [{fetcher_name}] proxy start".format(
        fetcher_name=fetcher_name))

    def is_new(candidate):
        # Same short-circuit order as the inline condition: emptiness, format,
        # raw-store lookup, useful-store lookup.
        if not candidate:
            return False
        if not verifyProxyFormat(candidate):
            return False
        return not (proxy_manager.checkRawProxyExists(candidate)
                    or proxy_manager.checkUsefulProxyExists(candidate))

    try:
        source = fetcher_cls()
        for raw in source.run():
            candidate = raw.strip()
            if is_new(candidate):
                proxy_manager.saveRawProxy(candidate)
                saved += 1
                log.debug(
                    "fetch [{fetcher_name}] proxy {proxy} succ".format(
                        fetcher_name=fetcher_name, proxy=candidate))
            else:
                skipped += 1
                log.debug(
                    "fetch [{fetcher_name}] proxy {proxy} skip".format(
                        fetcher_name=fetcher_name, proxy=candidate))
            seen += 1
    except Exception as e:
        log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
            fetcher_name=fetcher_name, error=e))
        errors += 1

    finished_at = time.time()
    seconds = int(finished_at - began_at)

    self.queue.task_done()

    stat = {"total": seen, "succ": saved, "fail": errors, "skip": skipped}
    ConfigManager.fetcher_config.update_stat(fetcher_name, stat)

    report = "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
    log.info(report.format(fetcher_name=fetcher_name, total=seen, succ=saved,
                           fail=errors, skip=skipped, elapsed_time=seconds))
def main(self):
    """Loop forever: validate queued proxies, then sleep and refill the queue.

    The queue is filled once up front with ``putQueue``.  While it holds
    items, the private ``__validProxy`` method is run.  Once it is empty the
    loop sleeps for ``config.BASE.verify_useful_proxy_interval`` minutes and
    refills the queue.  This method never returns.
    """
    self.putQueue()
    while True:
        if not self.queue.empty():
            log.info("Start Valid useful_proxy proxy")
            # Double-underscore name: Python mangles it with the class name,
            # so this call only resolves while the method lives in its class.
            self.__validProxy()
            continue

        # Report the interval that is actually slept; the message used to
        # claim a fixed "5 Min" regardless of the configured value.
        interval_minutes = config.BASE.verify_useful_proxy_interval
        log.info('Valid Complete, Sleep {} Min!'.format(interval_minutes))
        time.sleep(60 * interval_minutes)
        self.putQueue()
def run(self):
    """Clean the raw proxy store and log how many entries were affected.

    Reads the raw proxy count from ``proxy_manager`` before calling
    ``cleanRawProxy``; the remaining number is derived by subtracting the
    value ``cleanRawProxy`` returns.  Nothing is returned.
    """
    before = proxy_manager.getRawProxyNumber()
    removed = proxy_manager.cleanRawProxy()
    left = before - removed

    report = "clean raw_proxy, total_number:{total_number}, clean_number:{clean_number}, remain_number:{remain_number}"
    log.info(report.format(total_number=before, clean_number=removed, remain_number=left))
def get(self):
    """Update the useful-proxy record for the requested proxy with quality ``-1``.

    The proxy comes from ``self.args['proxy']``.  The options and an empty
    info dict are passed to ``proxy_manager.updateUsefulProxy``.

    :return: ``{"data": <value returned by updateUsefulProxy>}``
    """
    params = {"proxy": self.args.get('proxy'), "quality": -1}
    log.info("receive params: {}".format(params))

    affected = proxy_manager.updateUsefulProxy(params, {})
    log.info("delete {}".format(affected))

    return {"data": affected}
def run(self):
    """Clean the useful proxy store and log the outcome.

    The configured ``hold_useful_proxy_number`` is passed to
    ``proxy_manager.cleanUsefulProxy`` as ``hold_number``.  The useful proxy
    count is read before cleaning.  Nothing is returned.
    """
    keep = ConfigManager.setting_config.setting.get(
        "hold_useful_proxy_number")
    before = proxy_manager.getUsefulProxyNumber()
    removed = proxy_manager.cleanUsefulProxy(hold_number=keep)

    report = "clean useful, total_number:{total_number}, clean_number:{clean_number}, hold_number:{hold_number}"
    log.info(report.format(total_number=before, clean_number=removed, hold_number=keep))
def start(self):
    """Spawn one ``self.fetch`` greenlet per queued item and wait for them all.

    Greenlets run in a pool whose size is the configured
    ``fetch_new_proxy_concurrency``.  When the queue is empty only a log
    message is written.  Nothing is returned.
    """
    limit = ConfigManager.setting_config.setting.get(
        "fetch_new_proxy_concurrency")
    workers = pool.Pool(limit)

    pending = self.queue.qsize()
    if pending <= 0:
        log.info("Not Have Fetcher Of Now, skip!")
        return

    gevent.joinall([workers.spawn(self.fetch) for _ in range(pending)])
def refresh(self):
    """Call every configured proxy getter and save the new raw proxies.

    Getter names come from the ``ProxyGetter`` section of ``config.cf`` and
    are resolved as attributes of ``GetFreeProxy``.  A stripped, non-empty
    proxy that passes ``verifyProxyFormat`` and is not already a raw proxy is
    saved; anything else is counted and logged as an error.  An exception
    raised by one getter is logged and the next getter is tried.
    """
    for getter_name in config.cf.options("ProxyGetter"):
        try:
            log.info(
                "Fetch Proxy Start, func:{func}".format(func=getter_name))

            seen = 0
            saved = 0
            rejected = 0

            getter = getattr(GetFreeProxy, getter_name.strip())
            for raw in getter():
                candidate = raw.strip()
                acceptable = (candidate
                              and verifyProxyFormat(candidate)
                              and not self.checkRawProxyExists(candidate))
                if acceptable:
                    self.saveRawProxy(candidate)
                    saved += 1
                    log.debug('{func}: fetch proxy {proxy}'.format(
                        func=getter_name, proxy=candidate))
                else:
                    rejected += 1
                    log.error('{func}: fetch proxy {proxy} error'.format(
                        func=getter_name, proxy=candidate))
                seen += 1

            report = "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
            log.info(report.format(func=getter_name, total=seen, succ=saved, fail=rejected))
        except Exception as e:
            # One failing getter must not abort the remaining ones.
            log.error(
                "func_name:{func_name} fetch proxy fail, error:{error}".
                format(func_name=getter_name, error=e))