class Getter(object):
    # Crawl free proxy IPs from proxy websites and store them in Redis.
    def __init__(self):
        self.redis_client = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        # Check whether the proxy pool is full.
        return self.redis_client.get_proxy_count() >= FULL_COUNT

    def run(self):
        # Store the crawled proxies in Redis.
        if not self.is_full():
            proxies = self.crawler.get_crawler_proxy()
            for proxy in proxies:
                self.redis_client.add(proxy)
class Getter(object):
    def __init__(self):
        self.redis_client = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        """Check whether the proxy pool is full."""
        return self.redis_client.get_proxy_count() >= FULL_COUNT

    def run(self):
        """Store the crawled proxies in Redis."""
        if not self.is_full():
            proxy_list = self.crawler.get_crawler_proxy()
            for proxy in proxy_list:
                self.redis_client.add(proxy)
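Both Getter variants above (and the two that follow) lean on a RedisClient wrapper and a FULL_COUNT / POOL_UPPER_THRESHOLD setting that are defined elsewhere in the project. Purely as a point of reference, a minimal sketch of the interface they assume, backed by a Redis set via redis-py, could look like the following; the method bodies, the PROXY_KEY default, and the capacity value are assumptions for illustration, not the project's actual implementation.

    import redis

    PROXY_KEY = 'Proxy'   # assumed Redis key for the proxy set
    FULL_COUNT = 500      # assumed pool capacity

    class RedisClient(object):
        """Minimal sketch of the Redis wrapper the Getter classes assume."""

        def __init__(self, host='localhost', port=6379, password=None, key=PROXY_KEY):
            self._db = redis.StrictRedis(host=host, port=port,
                                         password=password, decode_responses=True)
            self._key = key

        def add(self, proxy):
            # Store each proxy ("ip:port") in a set so duplicates are ignored.
            return self._db.sadd(self._key, proxy)

        def get_proxy_count(self):
            # Number of proxies currently in the pool.
            return self._db.scard(self._key)

        # Alias used by the count()-based Getter variants below.
        count = get_proxy_count

        def random_max(self):
            # Hand back one random proxy; the Crawler below uses this while crawling.
            return self._db.srandmember(self._key)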
import sys


class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the pool has reached its crawl limit."""
        return self.redis.count() >= settings.POOL_UPPER_THRESHOLD

    def run(self):
        print('Start crawling')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Check whether the proxy pool has reached its limit."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print("Getter starts running")
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
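Whichever variant is used, the getter does not run itself; in a proxy pool it is normally driven by a scheduler that calls run() at a fixed interval so the pool keeps topping itself up. A minimal sketch of such a loop, assuming the Getter above and an arbitrary GETTER_CYCLE interval (both the function name and the interval are illustrative, not from the original project), might look like this:

    import time

    GETTER_CYCLE = 300  # assumed interval in seconds between crawl rounds

    def schedule_getter(cycle=GETTER_CYCLE):
        """Periodically top up the proxy pool until interrupted."""
        getter = Getter()
        while True:
            getter.run()      # crawls only if the pool is below its threshold
            time.sleep(cycle)

    if __name__ == '__main__':
        schedule_getter()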
import re
import threading
from random import shuffle

import requests
from bs4 import BeautifulSoup

# RedisClient and UserAgent are the project's own helpers, defined elsewhere.


class Crawler(object):
    """Crawl proxy IPs from free proxy websites."""

    def __init__(self, host="localhost", port='6379', key='Proxy'):
        self.__redis = RedisClient(host=host, port=port, password=None, key=key)

    def __select_crawl_func(self):
        """Select the methods whose names start with "crawl_"."""
        return filter(
            lambda x: x.startswith('crawl_') and callable(getattr(self, x)),
            dir(self))

    def get_proxies(self):
        """Run every "crawl_*" method and collect the generators they return."""
        proxies = []
        funcs = self.__select_crawl_func()
        for func in funcs:
            proxy = getattr(self, func)()
            if proxy:
                proxies.append(proxy)
        return proxies

    def run(self):
        # One worker thread per website, each draining that site's generator.
        proxies = self.get_proxies()
        thread_pool = []
        for proxy in proxies:
            th = threading.Thread(target=self.__single_run, args=(proxy, ))
            thread_pool.append(th)
            th.start()
        for th in thread_pool:
            th.join()

    def __single_run(self, proxy):
        """Drain the proxy generator of one website and store every IP."""
        for ip in proxy:
            # print(threading.current_thread().name, "\t", ip)
            self.__redis.add(ip)

    def __base_crawl_func(self, page_num, url_base, host, id_anonymous,
                          name_anonymous):
        """Generic crawler shared by all "crawl_*" methods."""
        # Build the list of page URLs to visit.
        urls = []
        if page_num > 1:
            for page in range(page_num):
                urls.append(url_base.format(page + 1))
            shuffle(urls)
        else:
            urls.append(url_base)
        headers = None
        proxy = None
        for page in range(page_num):
            # Rotate the User-Agent every 10 pages and the proxy every 5 pages.
            if page % 10 == 0:
                headers = UserAgent(host).headers()
            try:
                if page % 5 == 0:
                    proxy = self.__redis.random_max()
            except Exception:
                proxy = None
            url = urls[page]
            try:
                if proxy:
                    proxies = {"http": "http://" + proxy}
                    response = requests.get(url=url, headers=headers,
                                            proxies=proxies, timeout=15)
                else:
                    response = requests.get(url=url, headers=headers,
                                            timeout=15)
            except Exception:
                # print(threading.current_thread().name, "Request url error:", url)
                continue
            if response.status_code != 200:
                continue
            # The target sites use different encodings, so try them in turn.
            html = None
            for code in ['utf-8', 'gbk', 'gb2312']:
                try:
                    html = response.content.decode(code)
                    break
                except Exception:
                    # print('code error:{}'.format(code))
                    pass
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            tds = soup.find_all("td")
            for index, td in enumerate(tds):
                text = re.sub(r"[\s\n\t]+", "", td.text)
                rule = r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"
                if not re.match(rule, text):
                    continue
                # Keep only rows whose anonymity column matches the expected label.
                if name_anonymous not in re.sub(
                        r"[\s\n\t]+", "", tds[index + id_anonymous - 1].text):
                    continue
                ip = re.sub(r"[\s\n\t]+", "", tds[index].text)
                port = re.sub(r"[\s\n\t]+", "", tds[index + 1].text)
                yield "{}:{}".format(ip, port)

    def crawl_xici(self):
        """Crawl proxy IPs from the xici website."""
        page_num = 3336
        url_base = "http://www.xicidaili.com/nn/{}"
        host = "www.xicidaili.com"
        id_anonymous = 4
        name_anonymous = '高匿名'
        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_kuaidaili(self):
        """Crawl proxy IPs from the kuaidaili website."""
        page_num = 2367
        url_base = "https://www.kuaidaili.com/free/inha/{}"
        host = "www.kuaidaili.com"
        id_anonymous = 3
        name_anonymous = '高匿名'
        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_66(self):
        """Crawl proxy IPs from the 66ip website."""
        page_num = 1288
        url_base = "http://www.66ip.cn/{}.html"
        host = "www.66ip.cn"
        id_anonymous = 4
        name_anonymous = '高匿代理'
        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_yqie(self):
        """Crawl proxy IPs from the yqie website."""
        page_num = 1
        url_base = "http://ip.yqie.com/ipproxy.htm"
        host = "ip.yqie.com"
        id_anonymous = 4
        name_anonymous = '高匿'
        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_yundaili(self):
        """Crawl proxy IPs from the yundaili (ip3366) website."""
        page_num = 7
        url_base = "http://www.ip3366.net/?stype=1&page={}"
        host = "www.ip3366.net"
        id_anonymous = 3
        name_anonymous = '高匿代理IP'
        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)