示例#1
0
 def fetch(self):
     """
     Collect proxies from every configured getter into the raw proxy table.

     Each name in ``config.proxy_getter_functions`` is looked up on
     ``GetFreeProxy`` and called. Every proxy it produces is stripped,
     checked with ``verifyProxyFormat`` and compared against the proxies
     already stored during this call; only new, valid ones are wrapped in
     ``Proxy`` and put into the db. A getter that raises is logged (message,
     error text and traceback) and the next getter is tried.
     :return:
     """
     self.db.changeTable(self.raw_proxy_queue)
     seen = set()
     self.log.info("ProxyFetch : start")
     for getter_name in config.proxy_getter_functions:
         self.log.info(
             "ProxyFetch - {func}: start".format(func=getter_name))
         try:
             getter = getattr(GetFreeProxy, getter_name.strip())
             for candidate in getter():
                 proxy = candidate.strip()
                 # Guard 1: empty or malformed entries are rejected.
                 if not (proxy and verifyProxyFormat(proxy)):
                     self.log.error(
                         'ProxyFetch - {func}: {proxy} illegal'.format(
                             func=getter_name, proxy=proxy.ljust(20)))
                     continue
                 # Guard 2: this proxy was already stored during this call.
                 if proxy in seen:
                     self.log.info(
                         'ProxyFetch - {func}: {proxy} exist'.format(
                             func=getter_name, proxy=proxy.ljust(20)))
                     continue
                 self.log.info(
                     'ProxyFetch - {func}: {proxy} success'.format(
                         func=getter_name, proxy=proxy.ljust(20)))
                 self.db.put(Proxy(proxy, source=getter_name))
                 seen.add(proxy)
         except Exception as err:
             # One broken getter must not stop the remaining ones.
             self.log.error(
                 "ProxyFetch - {func}: error".format(func=getter_name))
             self.log.error(str(err))
             self.log.error(traceback.format_exc())
示例#2
0
    def refresh(self):
        """
        Fetch proxies from every configured getter and queue the new ones.

        For each name in ``self.config.proxy_getter_functions`` the matching
        ``GetFreeProxy`` attribute is called and its results are collected.
        Entries are stripped and checked with ``verifyProxyFormat``; valid
        ones are de-duplicated in a set. Each unique proxy is then put into
        the raw proxy table unless ``self.db.exists`` finds it in the useful
        proxy table. A getter that raises is logged and skipped.
        :return:
        """
        for proxyGetter in self.config.proxy_getter_functions:
            # fetch
            proxy_set = set()
            try:
                self.log.info("{func}: fetch proxy start".format(func=proxyGetter))
                # Materialise inside the try so that errors raised while the
                # getter's result is being iterated are caught here as well.
                proxy_iter = list(getattr(GetFreeProxy, proxyGetter.strip())())
            except Exception as e:
                self.log.error("{func}: fetch proxy fail".format(func=proxyGetter))
                # Log the reason too; without it the failure cannot be diagnosed.
                self.log.error(str(e))
                continue
            for proxy in proxy_iter:
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy):
                    self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
                    proxy_set.add(proxy)
                else:
                    self.log.error('{func}: fetch proxy {proxy} error'.format(func=proxyGetter, proxy=proxy))

            # store
            for proxy in proxy_set:
                # Look the proxy up in the useful table first ...
                self.db.changeTable(self.useful_proxy_queue)
                if self.db.exists(proxy):
                    continue
                # ... and only queue it as raw when it is not there yet.
                self.db.changeTable(self.raw_proxy_queue)
                self.db.put(proxy)
示例#3
0
 def refresh(self):
     """Scrape proxies from the known sites and store them in raw_proxy.

     Every getter named in ``config.proxy_getter_functions`` is called on
     ``GetFreeProxy``. Each valid proxy is stored under its own text with a
     JSON document holding its host, port and ``max_conn`` as the value.
     A getter that raises is logged and skipped.
     :return:
     """
     connection_limit = 100
     self.db.changeTable(self.raw_proxy_queue)
     for getter_name in config.proxy_getter_functions:
         # fetch
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=getter_name))
             getter = getattr(GetFreeProxy, getter_name.strip())
             for candidate in getter():
                 # Store the proxy directly; no de-duplication is done in
                 # code because the hash structure de-duplicates by itself.
                 proxy = candidate.strip()
                 if not (proxy and verifyProxyFormat(proxy)):
                     self.log.error(
                         '{func}: fetch proxy {proxy} error'.format(
                             func=getter_name, proxy=proxy))
                     continue
                 self.log.info('{func}: fetch proxy {proxy}'.format(
                     func=getter_name, proxy=proxy))
                 # Unpacking raises ValueError unless there is exactly one
                 # ":"; that is handled by the except below.
                 host, port = proxy.split(":")
                 payload = {
                     "host": host,
                     "port": port,
                     "max_conn": connection_limit,
                 }
                 self.db.put(proxy, json.dumps(payload))
         except Exception as err:
             self.log.error(err)
             self.log.error(
                 "{func}: fetch proxy fail".format(func=getter_name))
示例#4
0
    def checkAllGetProxyFunc():
        """
        Run every proxy-fetching function of GetFreeProxy and log how many
        correctly formatted proxies each one produced.

        Returns:
            None
        """
        import inspect
        fetchers = inspect.getmembers(GetFreeProxy,
                                      predicate=inspect.isfunction)

        valid_counts = {}
        for name, fetcher in fetchers:
            log.info(u"开始运行 {}".format(name))
            try:
                # Per the original note, fetcher() is a generator of
                # "ip:port" strings; count those that pass the format check.
                valid_counts[name] = sum(
                    1 for item in fetcher() if verifyProxyFormat(item))
            except Exception as err:
                log.info(u"代理获取函数 {} 运行出错!".format(name))
                log.error(str(err))
        log.info(u"所有函数运行完毕 " + "***" * 5)
        # Functions that failed have no entry and are reported with 0.
        for name, _ in fetchers:
            found = valid_counts.get(name, 0)
            log.info(u"函数 {n}, 获取到代理数: {c}".format(n=name, c=found))
示例#5
0
 def refresh(self):
     """
     Fetch proxies into the raw proxy table by calling each ProxyGetter.

     Every name in ``config.proxy_getter_functions`` is looked up on
     ``GetFreeProxy`` and called; each stripped proxy that passes
     ``verifyProxyFormat`` is put into the db straight away. A getter that
     raises is logged together with the error text and then skipped.
     :return:
     """
     self.db.changeTable(self.raw_proxy_queue)
     for proxyGetter in config.proxy_getter_functions:
         # fetch
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=proxyGetter))
             for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                 # Store proxies one at a time so they reach the raw queue
                 # sooner, which in turn speeds up proxy checking.
                 proxy = proxy.strip()
                 if proxy and verifyProxyFormat(proxy):
                     self.log.info('{func}: fetch proxy {proxy}'.format(
                         func=proxyGetter, proxy=proxy))
                     self.db.put(proxy)
                 else:
                     self.log.error(
                         '{func}: fetch proxy {proxy} error'.format(
                             func=proxyGetter, proxy=proxy))
         except Exception as e:
             self.log.error(
                 "{func}: fetch proxy fail".format(func=proxyGetter))
             # Log the reason too; without it the failure cannot be diagnosed.
             self.log.error(str(e))
示例#6
0
 def refresh(self):
     """
     Fetch proxies into the db using the user-configured proxy getter class.

     The class named by ``config.proxy_getter_lib`` is loaded through
     ``self.__dynamic_import__``. Every function listed in
     ``config.proxy_getter_functions`` is then called on it and each valid
     proxy is put into the raw proxy table. A getter that raises is logged
     together with the error text and then skipped.
     :return:
     :raises Exception: if the configured getter class cannot be loaded.
     """
     self.db.changeTable(self.raw_proxy_queue)
     try:
         proxy_getter_class = self.__dynamic_import__(
             config.proxy_getter_lib)
     except Exception as e:
         # Keep the underlying import error in the log; the exception raised
         # below only names the library.
         self.log.error(str(e))
         raise Exception('%s not found in ProxyGetter' %
                         config.proxy_getter_lib)
     for proxyGetter in config.proxy_getter_functions:
         # fetch
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=proxyGetter))
             for proxy in getattr(proxy_getter_class,
                                  proxyGetter.strip())():
                 # Store the proxy directly; no de-duplication is done in
                 # code because the hash structure de-duplicates by itself.
                 proxy = proxy.strip()
                 if proxy and verifyProxyFormat(proxy):
                     self.log.info('{func}: fetch proxy {proxy}'.format(
                         func=proxyGetter, proxy=proxy))
                     self.db.put(proxy)
                 else:
                     self.log.error(
                         '{func}: fetch proxy {proxy} error'.format(
                             func=proxyGetter, proxy=proxy))
         except Exception as e:
             self.log.error(
                 "{func}: fetch proxy fail".format(func=proxyGetter))
             # Log the reason too; without it the failure cannot be diagnosed.
             self.log.error(str(e))
示例#7
0
 def refresh(self):
     """
     Fetch proxies into the raw proxy table by calling each ProxyGetter.

     Every name in ``self.config.proxy_getter_functions`` is looked up on
     ``GetFreeProxy`` and called; each stripped proxy that passes
     ``verifyProxyFormat`` is put into the db as soon as it is produced.
     A getter that raises is logged together with the error text and then
     skipped.
     :return:
     """
     self.db.changeTable(self.raw_proxy_queue)
     for proxyGetter in self.config.proxy_getter_functions:
         # fetch
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=proxyGetter))
             for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                 proxy = proxy.strip()
                 if proxy and verifyProxyFormat(proxy):
                     self.log.info('{func}: fetch proxy {proxy}'.format(
                         func=proxyGetter, proxy=proxy))
                     self.db.put(proxy)
                 else:
                     self.log.error(
                         '{func}: fetch proxy {proxy} error'.format(
                             func=proxyGetter, proxy=proxy))
         except Exception as e:
             self.log.error(
                 "{func}: fetch proxy fail".format(func=proxyGetter))
             # Log the reason too; without it the failure cannot be diagnosed.
             self.log.error(str(e))
示例#8
0
 def refresh(self):
     """
     Fetch proxies into the db by ProxyGetter/getFreeProxy.py.

     Every name in ``config.proxy_getter_functions`` is looked up on
     ``GetFreeProxy`` and called; each stripped proxy that passes
     ``verifyProxyFormat`` is put into the raw proxy table. A getter that
     raises is logged together with the error text and then skipped.
     :return:
     """
     self.db.changeTable(self.raw_proxy_queue)
     for proxyGetter in config.proxy_getter_functions:
         # fetch
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=proxyGetter))
             for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                 # Store the proxy directly; no de-duplication is done in
                 # code because the hash structure de-duplicates by itself.
                 proxy = proxy.strip()
                 if proxy and verifyProxyFormat(proxy):
                     self.log.info('{func}: fetch proxy {proxy}'.format(
                         func=proxyGetter, proxy=proxy))
                     self.db.put(proxy)
                 else:
                     self.log.error(
                         '{func}: fetch proxy {proxy} error'.format(
                             func=proxyGetter, proxy=proxy))
         except Exception as e:
             self.log.error(
                 "{func}: fetch proxy fail".format(func=proxyGetter))
             # Log the reason too; without it the failure cannot be diagnosed.
             self.log.error(str(e))
 async def fetch_spys_one(self):
     """
     Fetch proxies from ``scrape_spys_one`` into the raw proxy table.

     The awaited result is iterated unless it is ``None``. Each entry is
     stripped, checked with ``verifyProxyFormat`` and compared against the
     proxies already stored during this call; new, valid ones are wrapped in
     ``Proxy`` and put into the db. Any exception is logged with its text
     and traceback instead of being propagated.
     :return:
     """
     source = "fetch_spys_one"
     seen = set()
     self.db.changeTable(self.raw_proxy_queue)
     self.log.info("ProxyFetch : start")
     self.log.info("ProxyFetch - {func}: start".format(func=source))
     try:
         scraped = await scrape_spys_one()
         # A None result is treated as "nothing to store".
         candidates = () if scraped is None else scraped
         for candidate in candidates:
             proxy = candidate.strip()
             if not (proxy and verifyProxyFormat(proxy)):
                 self.log.error('ProxyFetch - {func}: {proxy} illegal'.format(func=source, proxy=proxy.ljust(20)))
                 continue
             if proxy in seen:
                 self.log.info('ProxyFetch - {func}: {proxy} exist'.format(func=source, proxy=proxy.ljust(20)))
                 continue
             self.log.info('ProxyFetch - {func}: {proxy} success'.format(func=source, proxy=proxy.ljust(20)))
             self.db.put(Proxy(proxy, source=source))
             seen.add(proxy)
     except Exception as err:
         self.log.error("ProxyFetch - {func}: error".format(func=source))
         self.log.error(str(err))
         self.log.error(traceback.format_exc())
示例#10
0
    def checkGetProxyFunc(func):
        """
        Check how one given getFreeProxy function runs.

        Every proxy produced by ``func`` that passes ``verifyProxyFormat`` is
        logged and appended, one per line, to ``proxies.txt`` in the current
        working directory.

        Args:
            func: a callable from getFreeProxy; it is called without
                arguments and its result is iterated.

        Returns:
            None
        """
        func_name = getattr(func, '__name__', "None")
        log.info("start running func: {}".format(func_name))
        count = 0

        # save
        out_path = "proxies.txt"
        # The context manager closes the file even when func() or a write
        # raises part-way through, so the handle can no longer leak.
        with open(out_path, 'a') as f:
            for proxy in func():
                if verifyProxyFormat(proxy):
                    log.info("{} fetch proxy: {}".format(func_name, proxy))
                    f.write(proxy)
                    f.write('\n')
                    count += 1
        log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
示例#11
0
    def fetch(self):
        """
        Run one queued fetcher and record its statistics.

        A fetcher name is taken from ``self.queue`` and resolved to a class
        through ``FetcherManager.get_class``. Each proxy the fetcher yields
        is stripped; it is saved as a raw proxy only when it is well formed
        and known neither as raw nor as useful, otherwise it is counted as
        skipped. An exception while fetching is logged and counted as one
        failure. The counters are passed to
        ``ConfigManager.fetcher_config.update_stat`` and summarised in the log.
        """
        started_at = time.time()
        total = succ = fail = skip = 0

        fetcher_name = self.queue.get()
        fetcher_class = FetcherManager.get_class(fetcher_name)
        log.debug("fetch [{fetcher_name}] proxy start".format(
            fetcher_name=fetcher_name))
        try:
            fetcher = fetcher_class()
            for candidate in fetcher.run():
                proxy = candidate.strip()
                # Accept only well-formed proxies that are in neither table;
                # the short-circuit skips the lookups for malformed input.
                is_new = (
                    proxy
                    and verifyProxyFormat(proxy)
                    and not (proxy_manager.checkRawProxyExists(proxy)
                             or proxy_manager.checkUsefulProxyExists(proxy))
                )
                if not is_new:
                    skip += 1
                    log.debug(
                        "fetch [{fetcher_name}] proxy {proxy} skip".format(
                            fetcher_name=fetcher_name, proxy=proxy))
                else:
                    proxy_manager.saveRawProxy(proxy)
                    succ += 1
                    log.debug(
                        "fetch [{fetcher_name}] proxy {proxy} succ".format(
                            fetcher_name=fetcher_name, proxy=proxy))

                total += 1
        except Exception as err:
            log.error("fetch [{fetcher_name}] proxy fail: {error}".format(
                fetcher_name=fetcher_name, error=err))
            fail += 1

        elapsed_time = int(time.time() - started_at)

        self.queue.task_done()

        stat = {"total": total, "succ": succ, "fail": fail, "skip": skip}
        ConfigManager.fetcher_config.update_stat(fetcher_name, stat)

        summary = "fetch [{fetcher_name}] proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s"
        log.info(summary.format(fetcher_name=fetcher_name,
                                total=total,
                                succ=succ,
                                fail=fail,
                                skip=skip,
                                elapsed_time=elapsed_time))
示例#12
0
    def fetch(self):
        """
        Run one queued fetcher and record its statistics.

        A fetcher description (a mapping read for its ``name`` and
        ``interval`` keys) is taken from ``self.queue`` and resolved to a
        class through ``FetcherManager.getFetcherClass``. Each proxy the
        fetcher yields is stripped; it is saved as a useful proxy when it is
        well formed and not already known as useful, otherwise it is counted
        as skipped. An exception while fetching is logged and counted as one
        failure. The counters and the next fetch time are then sent to
        ``ProxyManager.proxy_manager.updateFetcher``.
        """
        start_time = time.time()
        total = 0
        succ = 0
        fail = 0
        skip = 0

        fetcher = self.queue.get()
        name = fetcher["name"]

        fetcher_class = FetcherManager.getFetcherClass(name)
        log.debug("fetch [{name}] proxy start".format(name=name))
        try:
            f = fetcher_class()
            for proxy in f.run():
                proxy = proxy.strip()
                if proxy and verifyProxyFormat(proxy) and \
                        not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):

                    ProxyManager.proxy_manager.saveUsefulProxy(proxy)
                    succ += 1
                    log.debug("fetch [{name}] proxy {proxy} succ".format(
                        name=name, proxy=proxy))
                else:
                    skip += 1
                    log.debug("fetch [{name}] proxy {proxy} skip".format(
                        name=name, proxy=proxy))

                total += 1
        except Exception as e:
            log.error("fetch [{name}] proxy fail: {error}".format(name=name,
                                                                  error=e))
            fail += 1

        self.queue.task_done()

        now = int(time.time())
        elapsed_time = int(now - start_time)

        # NOTE(review): this uses self.start_time, not the local start_time
        # or now -- confirm that is the intended base for the schedule.
        # presumably "interval" is in minutes (it is multiplied by 60) -- verify.
        next_fetch_time = self.start_time + (fetcher["interval"] * 60)

        data = {
            "$inc": {
                "succ": succ,
                "fail": fail,
                "skip": skip,
                "total": total,
            },
            "$set": {
                "next_fetch_time": next_fetch_time,
            }
        }

        ProxyManager.proxy_manager.updateFetcher(name, data)
        # Adjacent literals instead of a backslash continuation inside the
        # string: the continuation put the next line's indentation into the
        # logged message.
        log.info(
            "fetch [{name:^15}] proxy finish, "
            "total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, "
            "elapsed_time:{elapsed_time}s".format(
                name=name, total=total, succ=succ, fail=fail, skip=skip,
                elapsed_time=elapsed_time))
示例#13
0
 def checkGetProxyFunc(func):
     """
     Run one proxy getter and log every correctly formatted proxy it yields.

     :param func: callable invoked without arguments; its result is iterated
     :return:
     """
     getter_name = getattr(func, '__name__', 'None')
     log.info("start running func: {}".format(getter_name))
     valid = 0
     for candidate in func():
         # Malformed entries are neither logged nor counted.
         if not verifyProxyFormat(candidate):
             continue
         log.info("{} fetch proxy: {}".format(getter_name, candidate))
         valid += 1
     log.info("{n} completed, fetch proxy number: {c}".format(n=getter_name,
                                                              c=valid))
示例#14
0
    def checkGetProxyFunc(func):
        """
        Check how one given getFreeProxy function runs.

        Args:
            func: a callable from getFreeProxy; it is called without
                arguments and its result is iterated.

        Returns:
            None
        """
        func_name = getattr(func, '__name__', "None")
        log.info("start running func: {}".format(func_name))
        count = 0
        # Lazily keep only the well-formed proxies; enumerate from 1 so that
        # `count` ends up as the number of proxies logged (0 if there are none).
        well_formed = (item for item in func() if verifyProxyFormat(item))
        for count, proxy in enumerate(well_formed, 1):
            log.info("{} fetch proxy: {}".format(func_name, proxy))
        log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
示例#15
0
    def checkGetProxyFunc(func):
        """
        Check how the given getFreeProxy function runs.

        Prints a start line, one line per correctly formatted proxy and a
        final line with the number of such proxies.
        :param func: callable invoked without arguments; its result is iterated
        :return:
        """
        getter_name = getattr(func, '__name__', 'None')
        print(u'开始运行函数:{}'.format(getter_name))

        found = 0
        for candidate in func():
            # Skip anything that is not a well-formed proxy.
            if not verifyProxyFormat(candidate):
                continue
            print(u'{} 获取到代理: {}'.format(getter_name, candidate))
            found += 1

        print(u'函数 {n} 运行完毕,获取到 {c} 个代理。'.format(n=getter_name, c=found))
示例#16
0
 def checkAllGetProxyFunc():
     """
     Run every method of GetFreeProxy and log how many correctly formatted
     proxies each one produced.

     A method that raises is logged and reported with a count of 0.
     :return:
     """
     import inspect
     # NOTE(review): on Python 3 inspect.ismethod matches only bound methods,
     # so plain functions/staticmethods on the class would not be listed --
     # confirm this predicate matches how GetFreeProxy defines its getters.
     getters = inspect.getmembers(GetFreeProxy,
                                  predicate=inspect.ismethod)
     valid_counts = {}
     for getter_name, getter in getters:
         log.info(u"开始运行 {}".format(getter_name))
         try:
             valid = [item for item in getter() if verifyProxyFormat(item)]
         except Exception as err:
             log.info(u"代理获取函数 {} 运行出错!".format(getter_name))
             log.error(str(err))
         else:
             valid_counts[getter_name] = len(valid)
     log.info(u"所有函数运行完毕 " + "***" * 50)
     for getter_name, _ in getters:
         found = valid_counts.get(getter_name, 0)
         log.info(u"函数 {n},获取到代理输: {c}".format(n=getter_name, c=found))
示例#17
0
 def refresh(self):
     """
     Fetch proxies into the raw proxy table by calling each ProxyGetter.

     Every name in ``config.proxy_getter_functions`` is looked up on
     ``GetFreeProxy`` and called. Items that are exactly ``False`` are
     ignored silently; other items are put into the db when they pass
     ``verifyProxyFormat`` and logged as errors otherwise. A getter that
     raises is logged and skipped.
     :return:
     """
     self.db.changeTable(self.raw_proxy_queue)
     for getter_name in config.proxy_getter_functions:
         try:
             self.log.info(
                 "{func}: fetch proxy start".format(func=getter_name))
             getter = getattr(GetFreeProxy, getter_name.strip())
             for proxy in getter():
                 # A literal False is skipped without any log entry.
                 if proxy is False:
                     continue
                 if not (proxy and verifyProxyFormat(proxy)):
                     self.log.error(
                         "{func}: fetch proxy {proxy} error".format(
                             func=getter_name, proxy=proxy))
                     continue
                 self.log.info("{func}: fetch proxy {proxy}".format(
                     func=getter_name, proxy=proxy))
                 self.db.put(proxy)
         except Exception as err:
             self.log.error("refresh: {}".format(err))
             self.log.error(
                 "{func}: fetch proxy fail".format(func=getter_name))
示例#18
0
    def checkAllGetProxyFunc(self):
        """
        Check how every proxy-fetching function of getFreeProxy runs.

        Each function found on ``GetFreeProxy`` is called and the number of
        correctly formatted proxies it returned is printed; a function that
        raises is reported and shown with a count of 0.
        :return:
        """
        import inspect
        getters = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
        valid_counts = {}

        for getter_name, getter in getters:
            print(u'开始运行{}'.format(getter_name))

            try:
                valid = [item for item in getter() if verifyProxyFormat(item)]
            except Exception as err:
                print(u'代理获取函数 {} 运行出错!'.format(getter_name))
                print(str(err))
            else:
                valid_counts[getter_name] = len(valid)

        print(u'所有函数运行完毕' + ' ******' * 5)

        for getter_name, _ in getters:
            found = valid_counts.get(getter_name, 0)
            print(u'函数 {n}, 获取到代理 {c}'.format(n=getter_name, c=found))
示例#19
0
    def refresh(self):
        """
        Fetch proxies with every getter listed in the "ProxyGetter" options.

        Each option name is looked up on ``GetFreeProxy`` and called. A
        stripped proxy is saved as a raw proxy when it passes
        ``verifyProxyFormat`` and ``self.checkRawProxyExists`` does not
        report it; everything else is counted as a failure. Per-getter
        totals are logged, and a getter that raises is logged and skipped.
        """
        for getter_name in config.cf.options("ProxyGetter"):
            try:
                log.info(
                    "Fetch Proxy Start, func:{func}".format(func=getter_name))

                total = succ = fail = 0
                getter = getattr(GetFreeProxy, getter_name.strip())
                for candidate in getter():
                    proxy = candidate.strip()
                    accepted = (proxy
                                and verifyProxyFormat(proxy)
                                and not self.checkRawProxyExists(proxy))
                    if not accepted:
                        fail += 1
                        log.error('{func}: fetch proxy {proxy} error'.format(
                            func=getter_name, proxy=proxy))
                    else:
                        self.saveRawProxy(proxy)
                        succ += 1
                        log.debug('{func}: fetch proxy {proxy}'.format(
                            func=getter_name, proxy=proxy))

                    total += 1

                summary = "fetch proxy end, func:{func}, total:{total}, succ:{succ} fail:{fail}"
                log.info(summary.format(func=getter_name,
                                        total=total,
                                        succ=succ,
                                        fail=fail))

            except Exception as err:
                log.error(
                    "func_name:{func_name} fetch proxy fail, error:{error}".
                    format(func_name=getter_name, error=err))
示例#20
0
    # test_batch(gg.freeProxySeventh())

    # test_batch(gg.freeProxyEight())

    # test_batch(gg.freeProxyNinth())

    # test_batch(gg.freeProxyTen())

    # test_batch(gg.freeProxyEleven())

    # Collect the unique, well-formed proxies produced by gg.freeProxyTwelve().
    proxy_iter = gg.freeProxyTwelve()
    proxy_set = set()
    for proxy in proxy_iter:
        proxy = proxy.strip()
        if proxy and verifyProxyFormat(proxy):
            # Accepted proxies are only collected here, not logged.
            proxy_set.add(proxy)
        # Entries that are empty or fail verifyProxyFormat are dropped
        # without any message.

    # store: in this fragment the de-duplicated proxies are just printed.
    for proxy in proxy_set:
        print(proxy)

    # test_batch(gg.freeProxyTwelve())

    # test_batch(gg.freeProxyWallFirst())

    # test_batch(gg.freeProxyWallSecond())