class RandomUserAgentMiddleware(object):
    """Scrapy downloader middleware that attaches a random User-Agent and a
    rotating proxy (from the haipproxy pool) to every outgoing request."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # UA flavour comes from settings; 'random' picks any browser family.
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.args = dict(host='127.0.0.1', port=6379, password=REDIS_PASSWORD, db=0)
        self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
        self.fetch_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # Resolve the configured attribute on the UserAgent object.
            return getattr(self.ua, self.ua_type)

        # After 1000 uses, throw the fetcher away and build a fresh one so a
        # stale proxy pool snapshot is not reused indefinitely.
        self.fetch_count += 1
        if self.fetch_count > 1000:
            self.fetcher = ProxyFetcher('jd', strategy='greedy', redis_args=self.args)
            self.fetch_count = 0

        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = self.fetcher.get_proxy()
def get_proxy():
    """Fetch one usable proxy from the haipproxy pool.

    Reads the Redis connection parameters from ``settings`` and asks the
    ``zhihu`` validation queue for a proxy.

    Returns:
        A requests-style proxy dict ``{'http': proxy}``, or ``None`` when the
        pool is empty or any error occurs (best-effort: failures are logged,
        never raised to the caller).
    """
    try:
        args = settings['outgoing']['haipproxy_redis']
        fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
        proxy = fetcher.get_proxy()
        if proxy:
            return {'http': proxy}
        logger.warning('No available proxy fetched from the proxy pool.')
    except Exception:
        logger.warning('Exception in fetching proxy.')
        # BUG FIX: the original called traceback.print_exc(), which writes to
        # stderr and returns None — so the log line was literally "None".
        # format_exc() returns the traceback text so it reaches the logger.
        logger.warning(traceback.format_exc())
    return None
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/23 18:42
# @Author : youfeng
# Smoke-test script for the haipproxy client: fetch one proxy, time the
# call, then dump the whole usable-proxy list.
import time

from haipproxy.client.py_cli import ProxyFetcher

# Redis connection parameters for the proxy pool.
args = dict(host='192.168.1.90', port=6379, password='******', db=15)

# The first argument ('http' here, 'zhihu' elsewhere) selects which
# site-specific validation queue the proxies are drawn from, because the
# same proxy IP performs differently against different target sites.
fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)

# Fetch a single usable proxy and measure how long the call takes.
start_time = time.time()
print(fetcher.get_proxy())
print("获取代理耗费时间: {} s".format(time.time() - start_time))

# Fetch the full list of usable proxies.
proxies_list = fetcher.get_proxies()
print(len(proxies_list))
print(fetcher.get_proxies())
# or: print(fetcher.pool)
def proxy(self):
    """Return one usable proxy, fetched fresh from the 'http' pool.

    Builds a throwaway ProxyFetcher from ``self.args`` on every call.
    """
    pool_client = ProxyFetcher('http', strategy='greedy', redis_args=self.args)
    return pool_client.get_proxy()