class ProxyFactory(Service): name = "proxy_factory" current_dir = getcwd() default_settings = settings def __init__(self): """ 初始化logger, redis_conn """ super(ProxyFactory, self).__init__() sys.path.insert(0, self.current_dir) self.headers = self.settings.HEADERS self.proxies_check_in_channel = ThreadSafeSet() self.proxies_check_out_channel = TreadSafeDict() self.load_site(proxy_site_spider) self.load_site(self.args.spider_module) self.redis_conn = Redis(self.settings.get("REDIS_HOST"), self.settings.get_int("REDIS_PORT")) if self.args.check_method: self.check_method = partial(load_function(self.args.check_method), self) def load_site(self, module_str): if module_str: if isinstance(module_str, str): mod = load_module(module_str) else: mod = module_str for key, func in vars(mod).items(): if key.startswith("fetch"): self.__dict__[key] = partial(exception_wrapper(func), self) def check_method(self, proxy): resp = requests.get("http://www.whatismyip.com.tw/", headers=self.headers, timeout=10, proxies={"http": "http://%s" % proxy}) ip, real_ip = re.search(r'"ip": "(.*?)"[\s\S]+"ip-real": "(.*?)",', resp.text).groups() self.logger.debug("IP: %s. Real IP: %s. Proxy: %s" % (ip, real_ip, proxy)) return resp.status_code < 300 and not real_ip def check(self, proxy, good): """ 检查代理是否可用 """ with ExceptContext(errback=lambda *args: True): if self.check_method(proxy): good.add(proxy) def check_proxies(self): """ 对待检查队列中的代理进行检查 :return: """ self.logger.debug("Start check thread. ") threads = dict() good = set() while self.alive: if len(self.proxies_check_in_channel): proxy = self.proxies_check_in_channel.pop() else: proxy = None if isinstance(proxy, bytes): proxy = proxy.decode() if len(threads) < 150 and proxy: th = Thread(target=self.check, args=(proxy, good)) th.setDaemon(True) th.start() threads[time.time()] = (th, proxy) time.sleep(.001) else: time.sleep(1) for start_time, (th, proxy) in threads.copy().items(): if start_time + 60 < time.time() or not th.is_alive(): del threads[start_time] self.proxies_check_out_channel[proxy] = proxy in good good.discard(proxy) self.logger.debug("Stop check thread. ") def bad_source(self): """ 每隔指定时间间隔将无效代理放到待检查队列进行检查 :return: """ self.logger.debug("Start bad source thread. ") while self.alive: with Blocker( self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5), self, notify=lambda instance: not instance.alive) as blocker: if blocker.is_notified or len(self.proxies_check_in_channel): continue with ExceptContext(errback=self.log_err): proxies = self.redis_conn.hgetall("bad_proxies") if proxies: self.logger.debug( "Bad proxy count is : %s, ready to check. " % len(proxies)) while proxies: proxy, times = proxies.popitem() self.proxies_check_in_channel.add(proxy) self.logger.debug("Stop bad source thread. ") def good_source(self): """ 每隔指定时间间隔将有效代理放到待检查队列进行检查 :return: """ self.logger.debug("Start good source thread. ") while self.alive: with Blocker( self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5), self, notify=lambda instance: not instance.alive) as blocker: if blocker.is_notified or len(self.proxies_check_in_channel): continue with ExceptContext(errback=self.log_err): proxies = self.redis_conn.smembers("good_proxies") if proxies: self.logger.debug( "Good proxy count is : %s, ready to check. " % len(proxies)) self.proxies_check_in_channel.update(proxies) self.logger.debug("Stop good source thread. ") def reset_proxies(self): """ 分发有效代理和无效代理 :return: """ self.logger.debug("Start resets thread. ") while self.alive: with ExceptContext(errback=self.log_err): proxies = list(self.proxies_check_out_channel.pop_all()) if proxies: self.logger.debug("Got %s proxies to reset. " % len(proxies)) for proxy, good in proxies: if good: self.redis_conn.sadd("good_proxies", proxy) self.redis_conn.hdel("bad_proxies", proxy) else: count = self.redis_conn.hincrby( "bad_proxies", proxy) if count > self.settings.get_int( "FAILED_TIMES", 5): self.redis_conn.hdel("bad_proxies", proxy) self.logger.debug( "Abandon %s of failed for %s times. " % (proxy, count)) self.redis_conn.srem("good_proxies", proxy) else: time.sleep(1) time.sleep(1) self.logger.debug("Stop resets thread. ") def gen_thread(self, target, name=None, args=(), kwargs=None): thread = Thread(target=target, name=name, args=args, kwargs=kwargs) thread.setDaemon(True) thread.start() self.children.append(thread) def start(self): self.logger.debug("Start proxy factory. ") self.gen_thread(self.check_proxies) self.gen_thread(self.bad_source) self.gen_thread(self.good_source) self.gen_thread(self.reset_proxies) is_started = False while self.alive or [ thread for thread in self.children if thread.is_alive() ]: with Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60), self, notify=lambda instance: not instance.alive, immediately=not is_started) as blocker: if blocker.is_notified: continue with ExceptContext(errback=self.log_err): if self.alive: self.logger.debug("Start to fetch proxies. ") proxies = self.fetch_all() self.logger.debug("%s proxies found. " % len(proxies)) self.proxies_check_in_channel.update(proxies) is_started = True self.logger.debug("Stop proxy factory. ") def fetch_all(self): """ 获取全部网站代理,内部调用各网站代理获取函数 """ proxies = set() for key, value in self.__dict__.items(): if key.startswith("fetch"): proxies.update(value()) return proxies def enrich_parser_arguments(self): self.parser.add_argument( "-cm", "--check-method", help="proivde a check method to check proxies. eg:module.func") self.parser.add_argument( "-sm", "--spider-module", help= "proivde a module contains proxy site spider methods. eg:module1.module2" ) return super(ProxyFactory, self).enrich_parser_arguments()
class ProxyFactory(Service): name = "proxy_factory" current_dir = getcwd() default_settings = settings headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", } def __init__(self): """ 初始化logger, redis_conn """ super(ProxyFactory, self).__init__() sys.path.insert(0, self.current_dir) self.proxies_check_in_set = ThreadSafeSet() self.proxies_check_out_set = TreadSafeDict() self.load_site(proxy_site_spider) self.load_site(self.args.spider_module) self.redis_conn = Redis(self.settings.get("REDIS_HOST"), self.settings.get_int("REDIS_PORT")) if self.args.check_method: self.check_method = partial(load_function(self.args.check_method), self) def load_site(self, module_str): if module_str: if isinstance(module_str, str): mod = load_module(module_str) else: mod = module_str for key, func in vars(mod).items(): if key.startswith("fetch"): self.__dict__[key] = partial(exception_wrapper(func), self) def check_method(self, proxy): resp = requests.get( "http://www.whatismyip.com.tw/", headers=self.headers, timeout=10, proxies={"http": "http://%s" % proxy}) ip, real_ip = re.search(r'"ip": "(.*?)"[\s\S]+"ip-real": "(.*?)",', resp.text).groups() self.logger.debug("IP: %s. Real IP: %s. Proxy: %s" % (ip, real_ip, proxy)) return resp.status_code < 300 and not real_ip def check(self, proxy, good): """ 检查代理是否可用 """ with ExceptContext(errback=lambda *args: True): if self.check_method(proxy): good.append(proxy) def check_proxies(self): """ 对待检查队列中的代理进行检查 :return: """ self.logger.debug("Start check thread. ") while self.alive: with ExceptContext(errback=self.log_err): proxies = list(self.proxies_check_in_set.pop_all()) if proxies: self.logger.debug("Got %s proxies to check. " % len(proxies)) proxies = [proxy.decode() if isinstance(proxy, bytes) else proxy for proxy in proxies] good = list() for i in range(0, len(proxies), 150): # 分批检查 thread_list = [] for proxy in proxies[i: i+150]: th = Thread(target=self.check, args=(proxy, good)) th.setDaemon(True) th.start() thread_list.append(th) start_time = time.time() while [thread for thread in thread_list if thread.is_alive()] and start_time + 60 > time.time(): time.sleep(1) self.logger.debug("%s proxies is good. " % (len(good))) self.proxies_check_out_set.update(dict((proxy, proxy in good) for proxy in proxies)) else: time.sleep(1) time.sleep(1) self.logger.debug("Stop check thread. ") def bad_source(self): """ 每隔指定时间间隔将无效代理放到待检查队列进行检查 :return: """ self.logger.debug("Start bad source thread. ") while self.alive: with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5), self, notify=lambda instance: not instance.alive) as blocker: if blocker.is_notified: continue with ExceptContext(errback=self.log_err): proxies = self.redis_conn.hgetall("bad_proxies") if proxies: self.logger.debug("Bad proxy count is : %s, ready to check. " % len(proxies)) for proxy, times in proxies.items(): if int(times) > self.settings.get_int("FAILED_TIMES", 5): self.redis_conn.hdel("bad_proxies", proxy) self.logger.debug("Abandon %s of failed for %s times. " % (proxy, times)) self.proxies_check_in_set.update(proxies.keys()) self.logger.debug("Stop bad source thread. ") def good_source(self): """ 每隔指定时间间隔将有效代理放到待检查队列进行检查 :return: """ self.logger.debug("Start good source thread. ") while self.alive: with Blocker(self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5), self, notify=lambda instance: not instance.alive) as blocker: if blocker.is_notified: continue with ExceptContext(errback=self.log_err): proxies = self.redis_conn.smembers("good_proxies") if proxies: self.logger.debug("Good proxy count is : %s, ready to check. " % len(proxies)) self.proxies_check_in_set.update(proxies) self.logger.debug("Stop good source thread. ") def reset_proxies(self): """ 分发有效代理和无效代理 :return: """ self.logger.debug("Start resets thread. ") while self.alive: with ExceptContext(errback=self.log_err): proxies = list(self.proxies_check_out_set.pop_all()) if proxies: self.logger.debug("Got %s proxies to reset. " % len(proxies)) for proxy, good in proxies: if good: self.redis_conn.sadd("good_proxies", proxy) self.redis_conn.hdel("bad_proxies", proxy) else: self.redis_conn.hincrby("bad_proxies", proxy) self.redis_conn.srem("good_proxies", proxy) else: time.sleep(1) time.sleep(1) self.logger.debug("Stop resets thread. ") def gen_thread(self, target, name=None, args=(), kwargs=None): thread = Thread(target=target, name=name, args=args, kwargs=kwargs) thread.setDaemon(True) thread.start() self.children.append(thread) def start(self): self.logger.debug("Start proxy factory. ") self.gen_thread(self.check_proxies) self.gen_thread(self.bad_source) self.gen_thread(self.good_source) self.gen_thread(self.reset_proxies) is_started = False while self.alive or [thread for thread in self.children if thread.is_alive()]: with Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60), self, notify=lambda instance: not instance.alive, immediately=not is_started) as blocker: if blocker.is_notified: continue with ExceptContext(errback=self.log_err): if self.alive: self.logger.debug("Start to fetch proxies. ") proxies = self.fetch_all() self.logger.debug("%s proxies found. " % len(proxies)) self.proxies_check_in_set.update(proxies) is_started = True self.logger.debug("Stop proxy factory. ") def fetch_all(self): """ 获取全部网站代理,内部调用各网站代理获取函数 """ proxies = set() for key, value in self.__dict__.items(): if key.startswith("fetch"): proxies.update(value()) return proxies def enrich_parser_arguments(self): self.parser.add_argument("-cm", "--check-method", help="proivde a check method to check proxies. eg:module.func") self.parser.add_argument("-sm", "--spider-module", help="proivde a module contains proxy site spider methods. eg:module1.module2") return super(ProxyFactory, self).enrich_parser_arguments()