Example #1
from fake_useragent import UserAgent
from haipproxy.client.py_cli import ProxyFetcher

# REDIS_PASSWORD is assumed to live in the project's own settings module
from .settings import REDIS_PASSWORD


class RandomUserAgentMiddleware(object):
    """
    Set a random User-Agent and attach a proxy from the haipproxy
    pool on every outgoing request.
    """
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.args = dict(host='127.0.0.1',
                         port=6379,
                         password=REDIS_PASSWORD,
                         db=0)
        self.fetcher = ProxyFetcher('jd',
                                    strategy='greedy',
                                    redis_args=self.args)
        self.fetch_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        # Re-create the fetcher after the proxy pool has been used 1000 times
        self.fetch_count += 1
        if self.fetch_count > 1000:
            self.fetcher = ProxyFetcher('jd',
                                        strategy='greedy',
                                        redis_args=self.args)
            self.fetch_count = 0
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = self.fetcher.get_proxy()
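
For context, this is the kind of Scrapy configuration that could activate such a downloader middleware; the module path myproject.middlewares and the settings values below are placeholders, not part of the original example.

# settings.py (hypothetical project layout)
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in User-Agent middleware so it does not
    # overwrite the header set by RandomUserAgentMiddleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}
RANDOM_UA_TYPE = 'random'
REDIS_PASSWORD = 'your_redis_password'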
Example #2
import logging
import traceback

from haipproxy.client.py_cli import ProxyFetcher

logger = logging.getLogger(__name__)


def get_proxy():
    try:
        # `settings` is assumed to be the application's configuration
        # mapping, loaded elsewhere in the project
        args = settings['outgoing']['haipproxy_redis']
        fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
        proxy = fetcher.get_proxy()
        if proxy:
            return {'http': proxy}
        else:
            logger.warning('No available proxy fetched from the proxy pool.')
    except Exception:
        logger.warning('Exception in fetching proxy.')
        # format_exc() returns the traceback as a string; print_exc() returns None
        logger.warning(traceback.format_exc())
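
A short usage sketch of the dict returned by get_proxy() with the requests library; the target URL is a placeholder, and the mapping only covers plain HTTP, matching the function above.

import requests

proxies = get_proxy()  # e.g. {'http': 'http://1.2.3.4:8080'}, or None on failure
if proxies:
    # requests accepts the scheme-to-proxy mapping directly; an HTTPS
    # target would additionally need an 'https' key
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    print(resp.text)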
Example #3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/23 18:42
# @Author  : youfeng
import time

from haipproxy.client.py_cli import ProxyFetcher

args = dict(host='192.168.1.90', port=6379, password='******', db=15)
# Passing `zhihu` here means fetching IPs from the validated-proxy queue
# associated with `zhihu`; the same proxy IP performs differently against
# different target sites, so queues are kept per site.
fetcher = ProxyFetcher('http', strategy='greedy', redis_args=args)
# Fetch a single usable proxy
start_time = time.time()
print(fetcher.get_proxy())
print("获取代理耗费时间: {} s".format(time.time() - start_time))
# Fetch the list of usable proxies
proxies_list = fetcher.get_proxies()
print(len(proxies_list))
print(fetcher.get_proxies())  # or print(fetcher.pool)
Example #4
def proxy(self):
    # ProxyFetcher is assumed to be imported from haipproxy.client.py_cli;
    # self.args holds the Redis connection settings configured elsewhere
    fetcher = ProxyFetcher('http', strategy='greedy', redis_args=self.args)
    return fetcher.get_proxy()
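
The method above is only a fragment; a minimal sketch of a containing class that supplies self.args (the class name and Redis values are placeholders, not from the original snippet):

from haipproxy.client.py_cli import ProxyFetcher


class ProxyClient(object):
    """Hypothetical wrapper showing where self.args could be defined."""

    def __init__(self):
        # Redis connection used by the haipproxy client; placeholder values
        self.args = dict(host='127.0.0.1', port=6379, password='', db=0)

    def proxy(self):
        fetcher = ProxyFetcher('http', strategy='greedy', redis_args=self.args)
        return fetcher.get_proxy()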