Example #1
class Getter(object):
    # Crawl free proxy IPs from proxy websites and store them in Redis
    def __init__(self):
        self.redis_client = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        # Check whether the proxy pool is full
        return self.redis_client.get_proxy_count() >= FULL_COUNT

    def run(self):
        # Store the crawled proxies in Redis
        if not self.is_full():
            proxies = self.crawler.get_crawler_proxy()
            for proxy in proxies:
                self.redis_client.add(proxy)
Example #2
class Getter(object):
    def __init__(self):
        self.redis_client = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        """
            判断代理池是否满了
        """
        return self.redis_client.get_proxy_count() >= FULL_COUNT

    def run(self):
        """
            将爬取到的代理存入redis
        """
        if not self.is_full():
            proxy_list = self.crawler.get_crawler_proxy()
            for proxy in proxy_list:
                self.redis_client.add(proxy)
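Examples #1 and #2 only show the Getter side; RedisClient is defined elsewhere in the project. A minimal sketch of the interface those snippets rely on, using redis-py with a Redis set as the backing store (the set layout, key name, and defaults are assumptions, not the original implementation):

import redis


class RedisClient(object):
    # Hypothetical sketch of the RedisClient assumed by Examples #1 and #2.
    def __init__(self, host='localhost', port=6379, password=None, key='proxies'):
        self.key = key
        self.db = redis.StrictRedis(host=host, port=port,
                                    password=password, decode_responses=True)

    def add(self, proxy):
        # Store a "host:port" string; a set de-duplicates repeated proxies.
        return self.db.sadd(self.key, proxy)

    def get_proxy_count(self):
        # Current pool size, compared against FULL_COUNT by Getter.is_full().
        return self.db.scard(self.key)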
Example #3
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        '''
        Check whether the crawl limit has been exceeded.
        '''
        return self.redis.count() >= settings.POOL_UPPER_THRESHOLD

    def run(self):
        print('Start crawling')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
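Examples #3 and #4 read __CrawlFunc__ and __CrawlFuncCount__ from the Crawler, but neither snippet defines them. They are typically filled in by a metaclass that registers every method whose name starts with crawl_; a minimal sketch of that pattern (an assumption, since the original metaclass is not shown in these snippets):

class ProxyMetaclass(type):
    # Collect the names of all crawl_* methods at class-creation time.
    def __new__(mcs, name, bases, attrs):
        attrs['__CrawlFunc__'] = []
        for key, value in attrs.items():
            if key.startswith('crawl_') and callable(value):
                attrs['__CrawlFunc__'].append(key)
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(mcs, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # Materialize the generator returned by one crawl_* method.
        return list(getattr(self, callback)())

    def crawl_example_site(self):
        # Each crawl_* method yields "host:port" strings; this one is a stub.
        yield '127.0.0.1:8080'

With this in place, Getter.run() in Examples #3 and #4 simply walks the registered method names and feeds each one to get_proxies().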
Example #4
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        判断是否达到代理池限制
        :return:
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print("获取器开始执行")
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
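The Getter in Examples #1 through #4 is normally invoked periodically rather than once. A minimal sketch of a scheduling loop for Example #4 (the interval and the __main__ wiring are assumptions, not part of the snippet):

import time

GETTER_CYCLE = 20  # seconds between crawl rounds (assumed value)


def run_getter_forever():
    getter = Getter()
    while True:
        # run() skips crawling once the pool reaches POOL_UPPER_THRESHOLD.
        getter.run()
        time.sleep(GETTER_CYCLE)


if __name__ == '__main__':
    run_getter_forever()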
Example #5
# Imports used below (omitted in the original snippet); RedisClient and
# UserAgent are project-local helpers assumed to be importable.
import re
import threading
from random import shuffle

import requests
from bs4 import BeautifulSoup


class Crawler(object):
    '''Crawl proxy IPs from free-proxy websites and store them in Redis.'''
    def __init__(self, host="localhost", port='6379', key='Proxy'):
        self.__redis = RedisClient(host=host,
                                   port=port,
                                   password=None,
                                   key=key)

    def __select_crawl_func(self):
        '''Select method names that start with "crawl_".'''
        return filter(
            lambda x: x.startswith('crawl_') and callable(getattr(self, x)),
            dir(self))

    def get_proxies(self):
        '''Run every "crawl_*" method; each returns a generator of proxies for one site.'''
        proxies = []
        funcs = self.__select_crawl_func()
        for func in funcs:
            # getattr avoids building and eval()-ing a code string.
            proxy_gen = getattr(self, func)()
            if proxy_gen:
                proxies.append(proxy_gen)
        return proxies

    def run(self):
        proxies = self.get_proxies()
        thread_pool = []
        for proxy in proxies:
            th = threading.Thread(target=self.__single_run, args=(proxy, ))
            thread_pool.append(th)
            th.start()
        for th in thread_pool:
            th.join()

    def __single_run(self, proxy):
        '''Store every proxy yielded for one website into Redis.'''
        for ip in proxy:
            # print(threading.current_thread().name, "\t", ip)
            self.__redis.add(ip)

    def __base_crawl_func(self, page_num, url_base, host, id_anonymous,
                          name_anonymous):
        '''Shared crawl routine: fetch listing pages and yield "ip:port" proxies.'''

        urls = []
        if page_num > 1:
            for page in range(page_num):
                url = url_base.format(page + 1)
                urls.append(url)
            shuffle(urls)
        else:
            urls.append(url_base)
        for page in range(page_num):
            if page % 10 == 0:
                headers = UserAgent(host).headers()
            try:
                if page % 5 == 0:
                    proxy = self.__redis.random_max()
            except Exception:
                proxy = None
            url = urls[page]
            try:
                if proxy:
                    proxies = {"http": "http://" + proxy}
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=15)
                else:
                    response = requests.get(url=url,
                                            headers=headers,
                                            timeout=15)
            except Exception:
                # print(threading.current_thread().name, "Request url error:", url)
                continue
            if not response.status_code == 200:
                continue
            html = None  # reset so a failed decode cannot reuse the previous page's HTML
            for code in ['utf-8', 'gbk', 'gb2312']:
                try:
                    html = response.content.decode(code)
                    break
                except Exception:
                    # print('code error:{}'.format(code))
                    pass
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            tds = soup.find_all("td")
            for index, td in enumerate(tds):
                text = re.sub(r"[\s\n\t]+", "", td.text)
                rule = r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"
                if not re.match(rule, text):
                    continue
                if name_anonymous not in re.sub(
                        r"[\s\n\t]+", "", tds[index + id_anonymous - 1].text):
                    continue
                IP = re.sub(r"[\s\n\t]+", "", tds[index + 0].text)
                PORT = re.sub(r"[\s\n\t]+", "", tds[index + 1].text)
                proxy = "{}:{}".format(IP, PORT)
                yield proxy

    def crawl_xici(self):
        '''Crawl proxy IPs from the xici website.'''
        page_num = 3336
        url_base = "http://www.xicidaili.com/nn/{}"
        host = "www.xicidaili.com"
        id_anonymous = 4
        name_anonymous = '高匿名'  # "high anonymity" label; kept in Chinese to match the site's table text

        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_kuaidaili(self):
        '''Crawl proxy IPs from the kuaidaili website.'''
        page_num = 2367
        url_base = "https://www.kuaidaili.com/free/inha/{}"
        host = "www.kuaidaili.com"
        id_anonymous = 3
        name_anonymous = '高匿名'

        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_66(self):
        '''Crawl proxy IPs from the 66ip.cn website.'''
        page_num = 1288
        url_base = "http://www.66ip.cn/{}.html"
        host = "www.66ip.cn"
        id_anonymous = 4
        name_anonymous = '高匿代理'

        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_yqie(self):
        '''Crawl proxy IPs from the yqie website.'''
        page_num = 1
        url_base = "http://ip.yqie.com/ipproxy.htm"
        host = "ip.yqie.com"
        id_anonymous = 4
        name_anonymous = '高匿'

        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)

    def crawl_yundaili(self):
        '''Crawl proxy IPs from the yundaili website.'''
        page_num = 7
        url_base = "http://www.ip3366.net/?stype=1&page={}"
        host = "www.ip3366.net"
        id_anonymous = 3
        name_anonymous = '高匿代理IP'

        return self.__base_crawl_func(page_num, url_base, host, id_anonymous,
                                      name_anonymous)
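Unlike Examples #1 through #4, the Crawler in Example #5 talks to Redis directly, so one call drives the whole pipeline. A minimal usage sketch (host, port, and key values are illustrative):

if __name__ == '__main__':
    crawler = Crawler(host='localhost', port='6379', key='Proxy')
    # get_proxies() only builds lazy generators; pages are fetched inside the
    # worker threads started by run(), one thread per source site.
    crawler.run()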