Example #1
# NOTE: RedisClient and Crawler are project-local classes from the surrounding proxy-pool project
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def limit(self, limit_num=500):
        """
        Check whether the number of stored proxies has reached the pool's limit.
        :param limit_num: maximum number of proxies to keep in the pool
        :return: True if the pool is full, otherwise False
        """
        return self.redis.count() >= limit_num

    def run(self):
        print("Getter is running...")
        if not self.limit():
            # iterate over every crawl function registered on the Crawler
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.put(proxy)
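
A minimal driver sketch for the Getter above. The loop, the __main__ guard, and the 60-second interval are illustrative assumptions and not part of the original example:

import time

if __name__ == "__main__":
    getter = Getter()
    while True:
        # crawls new proxies only while the pool is below its limit
        getter.run()
        time.sleep(60)  # assumed refresh interval in seconds
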
Example #2
# NOTE: logger, config, RedisClient, HtmlRequest and HtmlParser are project-local modules
import time

import gevent
import requests
from lxml import etree


class ProxyPool:
    def __init__(self):
        self.logger = logger
        self.db = RedisClient(config.NAME, config.HOST, config.PORT,
                              config.PASSWORD)
        self.html_request = HtmlRequest()
        self.html_parser = HtmlParser()

    def update(self):
        """
        Update the proxy pool: crawl new proxies whenever the pool drops below the configured minimum.
        :return:
        """
        while True:
            if self.db.nums() < config.PROXY_MINNUM:
                self.logger.info(
                    "db has %d proxies, less than the minimum, start crawling..."
                    % self.db.nums())
                spawns = []
                # append the greenlet so that joinall actually waits for it
                spawns.append(gevent.spawn(self.crawl_gatherproxy))
                # for parser in config.parserList:
                #     spawns.append(gevent.spawn(self.crawl, parser))
                #     if len(spawns) >= config.MAX_DOWNLOAD_CONCURRENT:
                #         gevent.joinall(spawns)
                #         spawns = []
                gevent.joinall(spawns)
            else:
                self.logger.info(
                    "db has %d proxies, enough to use, waiting for next update..."
                    % self.db.nums())
            time.sleep(config.UPDATE_TIME)

    def crawl(self, parser):
        for url in parser['urls']:
            response = self.html_request.get(url)
            if response:
                proxy_list = self.html_parser.parse(response.text, parser)
                if proxy_list:
                    self.logger.info("got %d proxies from %s", len(proxy_list),
                                     url)
                    for proxy in proxy_list:
                        if self.valid(proxy):
                            # save the working proxy
                            self.logger.info("got a valid proxy: %s", proxy)
                            self.db.put(proxy)

    def crawl_gatherproxy(self):
        headers = {
            'Host': 'www.gatherproxy.com',
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://www.gatherproxy.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gatherproxy.com/proxylist/country/?c=China',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        url = 'http://www.gatherproxy.com/proxylist/country/?c=China'
        data = {"Country": "china", "PageIdx": 1, "Filter": '', "Uptime": 0}
        for page in range(1, 40):
            data['PageIdx'] = page
            response = self.html_request.post(url, data, headers)
            proxy_list = []
            root = etree.HTML(response.text)
            proxys = root.xpath(".//table[@id='tblproxy']/tr[position()>2]")
            for proxy in proxys:
                try:
                    # both the IP and the port are embedded in quoted strings
                    # inside <script> tags; the port is hex-encoded, e.g. '1F90' -> 8080
                    ip_text = proxy.xpath(".//td[2]/script")[0].text
                    ip = ip_text.split("'")[1]
                    port_text = proxy.xpath(".//td[3]/script")[0].text
                    port = str(int(port_text.split("'")[1], 16))
                except Exception as e:
                    self.logger.error("parse proxy error: %s", e)
                    continue
                proxy = ":".join([ip, port])
                proxy_list.append(proxy)
            if proxy_list:
                self.logger.info("got %d proxies from %s", len(proxy_list), url)
                for proxy in proxy_list:
                    if self.valid(proxy):
                        # save the working proxy
                        self.logger.info("got a valid proxy: %s", proxy)
                        self.db.changeTable("gatherproxy")
                        self.db.put(proxy)

    def valid(self, proxy):
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        try:
            # discard proxies that take too long to respond
            r = requests.get('http://httpbin.org/ip',
                             proxies=proxies,
                             timeout=10,
                             verify=False)
            if r.status_code == 200:
                # logger.info('%s is ok' % proxy)
                return True
            return False
        except Exception as e:
            # logger.error(str(e))
            return False
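
A possible entry point for ProxyPool, shown only as a sketch; the original snippet does not include one. Since update() itself loops forever, it can simply be called from __main__:

if __name__ == "__main__":
    pool = ProxyPool()
    # blocks forever: crawls when the pool is low, then sleeps config.UPDATE_TIME
    pool.update()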