Example #1
class Getter():
    def __init__(self):
        self.redis = FileClient()
        self.crawler = Crawler()

    def is_over_flow(self):
        '''Check whether the proxy pool limit has been reached.'''
        if self.redis.count() >= POOL_UPPER_FLOW:
            return True
        return False

    def run(self):
        print('Getter started')
        if not self.is_over_flow():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
        else:
            print('Proxy pool already has enough proxies:', str(self.redis.count()))


# if __name__ == '__main__':
#     getter = Getter()
#     getter.run()
Example #2
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """ Check if the mount of proxies is over threshold."""
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started.')
        if PRIVATE_PROXY_ENABLE:
            proxies = PrivateProxy().get_proxies()
            for proxy in proxies:
                print('Add private proxy {}'.format(proxy))
                self.redis.add(proxy)
        else:
            if not self.is_over_threshold():
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    for proxy in proxies:
                        self.redis.add(proxy)
Example #3
class Getter(object):
    def __init__(self):
        """
        Initialize the database client and create the crawler.
        """
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its upper limit.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started running...')
        # check whether the proxy pool has reached its limit
        if not self.is_over_threshold():
            # iterate over the parse functions generated for each proxy site
            for crawler_index in range(self.crawler.__CrawlerCount__):
                # fetch the callback at this index
                callback = self.crawler.__CrawlerFunc__[crawler_index]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    # print(proxy)
                    self.redis.add(proxy)
Example #4
class Getter(object):
    """
    Proxy IP getter.
    """
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_limit(self):
        """
        Check whether the maximum number of proxies has been exceeded.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """
        The metaclass defined on the Crawler class collects the crawl_-prefixed functions so they can be executed in order.
        :return:
        """
        print("获取器开始运行,爬取免费代理")
        if not self.is_over_limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # invoke the proxy-fetching function
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy=proxy)
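
Several of the examples on this page loop over __CrawlFunc__ and __CrawlFuncCount__ without showing where those attributes come from; the docstring above says a metaclass on Crawler collects the crawl_-prefixed functions. A minimal sketch of such a metaclass, assuming each crawl_-prefixed method yields proxy strings (crawl_example is a hypothetical placeholder):

class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        # collect the names of every crawl_-prefixed method on the class
        attrs['__CrawlFunc__'] = [key for key in attrs if key.startswith('crawl_')]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(mcs, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # callback is a method *name*; look the method up and drain its generator
        return [proxy for proxy in getattr(self, callback)()]

    def crawl_example(self):
        # hypothetical source; real crawl_* methods parse free-proxy sites
        yield '127.0.0.1:8080'

Under this sketch, Crawler.__CrawlFunc__ is ['crawl_example'] and Crawler.__CrawlFuncCount__ is 1, which is exactly the shape the run() loops above index into; it also explains why Example #25 below can compare each callback against a plain list of method names.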
Example #5
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # __CrawlFuncCount__ is the number of crawl_-prefixed functions the
            # metaclass collected (see the sketch after Example #4)
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #6
class GetterProxy(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("获取器开始执行")
        if not self.is_over_threshold():
            for callback_index in range(Crawler.__CrawlFuncCount__):
                # get the method
                callback = self.crawler.__CrawlFunc__[callback_index]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # add the proxies
                for proxy in proxies:
                    self.redis.add(proxy)
Example #7
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                # print output is normally buffered rather than written at once;
                # until the buffer fills, messages may not appear on screen.
                # sys.stdout.flush() pushes whatever is buffered out immediately.
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
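
Since Python 3.3 the same effect is available directly through print's flush parameter, which saves the separate call; a small illustration:

import sys

print('fetched a batch of proxies')              # may sit in stdout's buffer
sys.stdout.flush()                               # force it out now

print('fetched a batch of proxies', flush=True)  # equivalent, in one call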
Example #8
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        :return:
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies2 = self.crawler.get_proxies(callback)
                print(proxies2)
                for i in proxies2:
                    print('__________________________')
                    self.redis.add(i)
Example #9
File: getter.py Project: g2thend/proxypool
class Getter(object):
    def __init__(self):
        self.sqlite3 = sqlitedb()
        self.crawler = Crawler()

    def run(self):
        cprint('Getter started')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            callback = self.crawler.__CrawlFunc__[callback_label]
            # fetch proxies
            proxies = self.crawler.get_proxies(callback)
            sys.stdout.flush()
            cprint("插入数据到sqlite3 proxy 表")
            for proxy in proxies:
                self.sqlite3.add(list(proxy))
Example #10
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            for crawl_func in self.crawler.__CrawlFunc__:
                print(crawl_func, 'is crawling proxies')
                proxies = self.crawler.start_crawl_func(crawl_func)
                for proxy in proxies:
                    print(proxy)
                    self.redis.add(proxy)
        proxy_sum = self.redis.count()
        print('Current number of proxies:', proxy_sum)
Example #11
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def log(self):
        # build the logger once and cache it; creating a new FileHandler on
        # every call would leak file handles and write duplicate log lines
        if getattr(self, '_logger', None) is not None:
            return self._logger
        if not os.path.exists('log2'):
            os.mkdir('log2')
        log_file_name = 'log2/' + LOG_PATH
        log_file_1 = logging.FileHandler(log_file_name, 'a', encoding='utf-8')
        fmt = logging.Formatter(
            fmt="%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s")
        log_file_1.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.DEBUG)
        logger1.addHandler(log_file_1)
        self._logger = logger1

        return logger1

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """爬取到代理设置初始分数,直接存入redis"""
        print('获取器开始执行')
        if not self.is_over_threshold():
            try:
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # fetch proxies
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if not proxies:
                        self.log().error('Proxy crawl failed, crawl function: %s' % callback)
                        continue
                    for proxy in proxies:
                        self.redis.add(proxy)
            except Exception as e:
                self.log().exception(e)
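
Building a logger inside a method, as this example does, is workable; the standard library's more usual pattern configures a named logger a single time at module level and retrieves it with logging.getLogger wherever needed. A minimal sketch using only the standard library (LOG_PATH stands in for the project's setting):

import logging
import os

LOG_PATH = 'run.log'  # illustrative; the original reads this from its settings

os.makedirs('log2', exist_ok=True)
handler = logging.FileHandler('log2/' + LOG_PATH, 'a', encoding='utf-8')
handler.setFormatter(logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s"))

logger = logging.getLogger('run_log')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)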
Example #12
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # note: despite the name, this returns True while the pool is still
        # BELOW MAX_THRESHOLD, i.e. while there is room to add more proxies
        if self.redis.get_count() < MAX_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        if self.is_over_threshold():
            for i in range(self.crawler.__CrawlCount__):
                proxies = self.crawler.get_proxies(
                    self.crawler.__CrawlFunc__[i])
                for proxy in proxies:
                    self.redis.add(proxy)
Example #13
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("the getter programmer started!")
        if not self.is_over_thershold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #14
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    # check whether the proxy pool limit has been reached
    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for index in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[index]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #15
class Getter:
    def __init__(self):
        self._conn = RedisClient()
        self._crawler = Crawler()

    def is_over_threshold(self):
        if self._conn.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print("获取器开始运行")
        if not self.is_over_threshold():
            for callback_index in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_index]
                proxies = self._crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self._conn.add(proxy)
Example #16
class Getter(object):
    """docstring for Getter"""
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #17
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            # loop over the crawl-function counter
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                # pull the function out of the function list by index;
                # the crawler returns an iterator
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                try:
                    # get_proxies walks the crawl results, collects the values
                    # into a proxy list, and returns that list
                    proxies = self.crawler.get_proxies(callback)
                    # proxies: list
                except Exception:
                    print("\033[1;31;40mSomething went wrong here...\033[0m")
                    print(f'Crawler {callback} raised an error and needs debugging')
                    continue  # without this, proxies would be unbound below
                # flush the buffer so the output appears continuously
                sys.stdout.flush()
                # walk the proxy list and add each proxy to the database
                for proxy in proxies:
                    try:
                        self.redis.add(proxy)
                    except OSError as e:
                        print(f"\033[1;31;40mAn error occurred... {e}\033[0m")
Example #18
class Getter():
    def __init__(self):
        self.mongo = MonClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        data = list(self.mongo.db.aggregate([
            {"$match": {"pid": {"$eq": 0}}},
            {"$group": {"_id": None, "count": {"$sum": 1}}},
        ]))
        if len(data) == 1 and data[0]['count'] >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('\033[1;30;44m Getter started \033[0m')
        if not self.is_over_threshold():
            print("\033[1;30;44m Pool limit not reached! \033[0m")
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.mongo.add(proxy)
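
If self.mongo.db is a pymongo collection, the match-plus-group pipeline above collapses into a single call to count_documents; a sketch of an equivalent is_over_threshold under that assumption:

    def is_over_threshold(self):
        # assumes self.mongo.db is a pymongo Collection; count_documents
        # counts documents matching the filter without an aggregation pipeline
        return self.mongo.db.count_documents({"pid": 0}) >= POOL_UPPER_THRESHOLD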
Example #19
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    if self.first_test(proxy):  # if the screening test passed
                        print('Adding proxy:', proxy)
                        self.redis.add(proxy)  # add it to the pool

    def first_test(self, proxy):
        print('Screening test for proxy:', proxy)
        proxies = {
            "http": "http://{}".format(proxy),
        }
        try:
            r = requests.get(TEST_URL, proxies=proxies, timeout=4)
            if r.status_code == 200:
                return True
        except requests.RequestException:
            pass
        print('Test failed, dropping proxy', proxy)
        return False
Example #20
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its limit.
        """
        if self.redis.count() >= POOL_UPPER_LIMIT:
            return True
        else:
            return False

    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()  # flush stdout so the fetched-proxy output appears immediately
                for proxy in proxies:
                    self.redis.add(proxy)
Example #21
File: getter.py Project: LMFrank/ProxyPool
class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Judge whether the threshold has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        print('Start execution')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #22
class Getter(object):
    def __init__(self):
        """
        Initialize the database client and the crawler.
        """
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_full(self):
        """
        Check whether the number of proxies has reached the cap.
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        logger.info('Getter started......')
        if not self.is_full():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
Example #23
File: getter.py Project: enshui/ProxyPool
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    
    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False
    
    def run(self):
        print('Getter started')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #24
class GetProxy(object):
    def __init__(self):
        self.redis = SaveData()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Check whether the proxy pool limit has been reached.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        log.logger.info("获取器开始执行")
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # fetch proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
Example #25
class Getter():
    crawler_list = [
        "crawl_ip3366", "crawl_kuaidaili", "crawl_ip3366_new", "crawl_iphai",
        "crawl_data5u"
    ]

    def __init__(self):
        self.spider_log = logging.getLogger(GETTERLOGGER)
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self, mode=None):
        """
        Check whether the proxy pool limit has been reached.
        """
        if mode is None:
            rediskey = REDIS_KEY
        else:
            rediskey = mode

        if self.redis.count(rediskey) >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        self.spider_log.info('Scheduled getter run starting')
        httpflag = 0
        httpsflag = 0
        if not self.is_over_threshold(REDIS_HTTP):
            httpflag = 1
        if not self.is_over_threshold(REDIS_HTTPS):
            httpsflag = 1
        try:
            if httpflag == 1 or httpsflag == 1:
                self.spider_log.info("Getter starting, http:" +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     ";https:" +
                                     str(self.redis.count(REDIS_HTTPS)))
                for callback_label in range(self.crawler.__CrawlFuncCount__):
                    callback = self.crawler.__CrawlFunc__[callback_label]
                    # fetch proxies, but only from the whitelisted crawlers
                    if callback not in Getter.crawler_list:
                        continue
                    self.spider_log.info('Fetching from: ' + callback)
                    proxies = self.crawler.get_proxies(callback)
                    sys.stdout.flush()
                    if httpflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTP)
                    if httpsflag == 1:
                        for proxy in proxies:
                            self.redis.add(proxy, mode=REDIS_HTTPS)
            else:
                self.spider_log.info("获取器无需执行,http:" +
                                     str(self.redis.count(REDIS_HTTP)) +
                                     ";https:" +
                                     str(self.redis.count(REDIS_HTTPS)))

        except Exception as e:
            self.spider_log.error('Getter raised an error: ' + str(e.args))
            self.spider_log.error('traceback:' + traceback.format_exc())