Пример #1
0
class Getter:
    """Fetches fresh proxies via the Crawler and stores them in Redis,
    but only while the pool size is below the configured THRESHOLD."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_arrive_threshold(self):
        '''
        Check whether the number of proxies currently stored in Redis
        has reached the configured THRESHOLD (i.e. the pool is full).
        :return: bool
        '''
        # Return the comparison directly instead of an if/else that
        # returns literal True/False.
        return self.redis.count() >= THRESHOLD

    def run(self):
        '''
        Run one acquisition pass: while the pool is below the threshold,
        invoke each registered crawl_* method of the Crawler and add every
        harvested proxy to Redis.
        :return:
        '''
        print('Getter Run...')
        if not self.is_arrive_threshold():
            # Only fetch more proxies while the pool is below the threshold.
            # __CrawlFunc__ / __CrawlFuncCount__ are populated elsewhere by
            # the Crawler — presumably one entry per crawl_* method
            # (TODO confirm against the Crawler definition).
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Пример #2
0
def download_proxies():
    """Download a batch of paid proxies from the kuaidaili API, verify each
    one actually works, and store the working, previously-unseen ones in
    the ``enterprise_proxies`` Redis store.

    Side effects: writes to Redis and prints one line per stored/skipped
    proxy.
    """
    conn = RedisClient(name='enterprise_proxies')
    url = 'http://svip.kuaidaili.com/api/getproxy'
    params = {
        'orderid': '979397309945634',
        'num': 20,
        'quality': 2,
        'format': 'json'
    }
    content = requests.get(url, params=params).json()
    for proxy in content['data']['proxy_list']:
        proxies = {
            'http': 'http://%s' % proxy,
            'https': 'http://%s' % proxy,
        }

        ping_url = 'http://www.baidu.com'
        try:
            # BUG FIX: the original request did not pass ``proxies=``, so it
            # tested the local network, not the candidate proxy — every
            # proxy "passed". Route the probe through the proxy under test
            # and bound the wait with a timeout.
            status_code = requests.get(ping_url, proxies=proxies,
                                       timeout=10).status_code
        except requests.RequestException:
            # Dead/unreachable proxy: skip it rather than aborting the
            # whole download loop with an unhandled exception.
            continue
        if status_code == 200:
            p = json.dumps(proxies)
            now = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
            if not conn.exist(p):
                conn.set(p, 1)
                conn.lpush(p)
                # ``now`` was already computed above; no need to rebuild it.
                print(now, ' New proxies: ', p)
            else:
                print(now, ' already exist proxies: ', p)
Пример #3
0
class Tester:
    """Validates proxies stored in Redis by issuing requests through them
    and adjusting each proxy's score accordingly."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_proxy(self, proxy):
        """Probe a single proxy against TEST_URL.

        :param proxy: proxy address, ``bytes`` or ``str``
        :return: None — the proxy's score in Redis is updated as a side
                 effect (``max`` on success, ``decrease`` on failure)
        """
        connector = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('UTF-8')
                real_proxy = 'http://' + proxy
                print('测试:', proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=5) as response:
                    if response.status == 200:
                        # Working proxy: promote its score via redis.max.
                        self.redis.max(proxy)
                    else:
                        # Reachable but bad status: decrement the score.
                        self.redis.decrease(proxy)
            except (AttributeError, TimeoutError, ClientError,
                    aiohttp.ClientConnectorError):
                # Connection-level failure: decrement the score.
                self.redis.decrease(proxy)

    def run(self):
        """Test every stored proxy, in batches of BATCH_SIZE, pausing
        5 seconds between batches."""
        print('开始测试...')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            start = 0
            while start < len(proxies):
                batch = proxies[start:start + BATCH_SIZE]
                tasks = [self.test_proxy(p) for p in batch]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
                start += BATCH_SIZE
        except Exception as e:
            print('测试错误', e.args)
Пример #4
0
class RandomHttpsProxyMiddleware(object):
    """Scrapy downloader middleware that assigns each outgoing request a
    proxy drawn from a Redis-backed pool, choosing the pool table by the
    request's URL scheme."""

    def __init__(self):
        # Redis-backed proxy pool (table name, host, port).
        self.db = RedisClient('useful_proxy', '111.231.255.225', 6379)

    def process_request(self, request, spider):
        """Attach a proxy to ``request.meta['proxy']`` — HTTPS requests
        draw from the HTTPS table, everything else from the plain table.

        :param request: the outgoing request (``.url`` is inspected)
        :param spider: the spider issuing the request (unused)
        :return: None
        """
        table = ('useful_proxy_https' if request.url.startswith('https')
                 else 'useful_proxy')
        self.db.changeTable(table)
        request.meta['proxy'] = self.db.get()
Пример #5
0
 def __init__(self):
     """Open the Redis-backed proxy store used by this component."""
     self.redis = RedisClient()
Пример #6
0
def get_conn():
    """Return the RedisClient cached on the application-context object
    ``g``, creating and caching it on first access (one client per
    context)."""
    try:
        return g.redis
    except AttributeError:
        # First access in this context: create and cache the client.
        g.redis = RedisClient()
        return g.redis
Пример #7
0
 def __init__(self):
     """Set up the Redis proxy store and the crawler that feeds it."""
     self.redis = RedisClient()
     self.crawler = Crawler()
Пример #8
0
 def __init__(self):
     """Open the Redis connection described by the DB_Config settings."""
     self.r = RedisClient(DB_Config)
Пример #9
0
class ProxyService(object):
    """Facade over the Redis proxy store: serves and deletes proxies by
    type, and refreshes the pool via a crawl -> validate -> publish
    pipeline (Python 2 code: uses ``xrange`` and gevent)."""

    def __init__(self):
        self.r = RedisClient(DB_Config)

    def get(self, ptype, n=1):
        """Return up to ``n`` proxies (capped at 10) of the given type;
        unknown types yield an empty list."""
        key = ptype.lower()
        if key not in PROXY_TYPES:
            return []
        return self.r.get(PROXY + key, min(n, 10))

    def delete(self, ptype, v):
        """Remove proxy ``v`` from the set for ``ptype`` (no-op for an
        unknown type)."""
        key = ptype.lower()
        if key in PROXY_TYPES:
            return self.r.delRecord(PROXY + key, v)

    def refresh(self):
        """Run the full pipeline: fetch -> validate -> flush."""
        self.__fetch()
        self.__validate()
        self.__flush()
        return 'ok'

    def __fetch(self):
        """ feed proxy:uncheck set """

        for _cr in Crawlers:
            _crpath = '.'.join(['ProxyCrawler', _cr])
            logging.info("import crawler: %s" % _crpath)
            # Import the crawler module dynamically and grab the class of
            # the same name.
            crawler_cls = getattr(__import__(_crpath, fromlist=[_cr]), _cr)
            instance = crawler_cls()
            instance.run()
            for item in instance.items:
                self.r.add(UNCHECK, item)

    def __validate(self):
        """ proxy:uncheck set -> proxy:checked set """
        def __callback(item):
            # Proxies that pass validation land in the checked set.
            self.r.add(CHECKED, item)

        total = self.r.len(UNCHECK)
        logging.info("need to validate %d proxies" % total)
        chunk = 1000
        # Drain the uncheck set in chunks, validating concurrently.
        for _offset in xrange(0, total, chunk):
            jobs = []
            for _ in xrange(chunk):
                item = self.r.pop(UNCHECK)
                if not item:
                    break
                jobs.append(gevent.spawn(validate, item, __callback))
            gevent.joinall(jobs)

    def __flush(self):
        """ proxy:checked set -> proxy:{http, https, socks5} set """

        # clean all
        for _type in PROXY_TYPES:
            self.r.delTable(PROXY + _type)

        # Move every checked item into its per-type set; items are stored
        # as "<type><PROXY_SEP><proxy>".
        while True:
            item = self.r.pop(CHECKED)
            if not item:
                break
            prefix, proxy = item.split(PROXY_SEP, 1)
            self.r.add(PROXY + prefix, proxy)
Пример #10
0
 def __init__(self):
     '''
     Set up the proxy IP pool: Redis table name, host and port.
     '''
     self.db = RedisClient('useful_proxy', '111.231.255.225', 6379)
Пример #11
0
 def __init__(self):
     '''
     Set up the proxy IP pool.  NOTE(review): the host argument is an
     empty string here — presumably RedisClient falls back to a default
     host; confirm against the RedisClient implementation.
     '''
     self.db = RedisClient('useful_proxy', '', 6379)
Пример #12
0
 def __init__(self):
     """Wire up logging, the Redis client, and the two pool names
     (original/raw and useful/validated) taken from configuration."""
     # NOTE(review): 'ProxyManger' looks like a typo for 'ProxyManager',
     # but it may be a fixed log-handler key — verify against LogHandler
     # usage before changing it.
     self.logger = LogHandler('ProxyManger')
     self.dbClient = RedisClient()
     self.config = GetConfig()
     # config.dbName yields the (original, useful) pool-name pair.
     self.orignal_proxy_name,self.useful_proxy_name = self.config.dbName
Пример #13
0
class ProxyManager(object):
    """Manages two Redis-backed proxy pools: a raw ("original") pool fed
    by getter functions and a validated ("useful") pool served to
    clients."""

    def __init__(self):
        self.logger = LogHandler('ProxyManger')
        self.dbClient = RedisClient()
        self.config = GetConfig()
        # config.dbName yields the (original, useful) pool-name pair.
        self.orignal_proxy_name,self.useful_proxy_name = self.config.dbName

    def refresh(self):
        """Invoke every configured getter function and push each proxy it
        yields into the raw (original) pool."""
        for proxyGetFunc in self.config.proxyGetter:
            fetcher = getattr(ProxyGetter, proxyGetFunc.strip())
            for proxy in fetcher():
                if not proxy:
                    continue
                self.logger.info("{func} fetch proxy {proxy}".format(func=proxyGetFunc,proxy=proxy))
                self.dbClient.lput(self.orignal_proxy_name,proxy)

    def get(self):
        """Fetch one proxy from the useful pool.

        :return: one useful proxy
        """
        return self.dbClient.sgetOne(self.useful_proxy_name)[0]

    def getAll(self):
        """Return every proxy currently in the useful pool."""
        return self.dbClient.sgetAll(self.useful_proxy_name)

    def spop(self):
        """Randomly pick a proxy from the useful pool and remove it."""
        return self.dbClient.spop(self.useful_proxy_name)

    def pop(self):
        """Take (and remove) one proxy from the raw pool.

        :return: one original proxy
        """
        return self.dbClient.rpop(self.orignal_proxy_name)

    def put(self,value):
        """Store a validated proxy into the useful pool.

        :param value: proxy address to save
        :return: None
        """
        self.dbClient.sput(self.useful_proxy_name,value)

    def delete(self, value):
        """Remove the given proxy from the useful pool.

        :return: None
        """
        self.dbClient.sdeleteValue(self.useful_proxy_name,value)

    def getStatus(self):
        """Return status/statistics reported by the Redis client."""
        return self.dbClient.sgetStatues()