示例#1
0
class Getter:
    """Top up the local proxy pool by scheduling every registered crawl function.

    The collaborators (``LocalDict``, ``Crawler``, ``Factory``) are created in
    ``__init__``; their behaviour is defined outside this block.
    """

    def __init__(self):
        # self.redis = RedisClient()
        self.local = LocalDict()
        self.crawler = Crawler()
        self.factory = Factory()

    def is_over_threshold(self):
        """Return True when the pool size has reached POOL_UPPER_THRESHOLD."""
        return self.local.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Schedule one crawl task per registered crawl function and wait for them.

        When the pool is already at its upper threshold this only logs and
        returns. An ordinary error raised while scheduling one crawl function
        is printed and does not prevent the remaining ones from being
        scheduled.
        """
        count = self.local.count()
        if self.is_over_threshold():
            Log.info("Getter:此时容量已达上限,不获取ip")
            return
        Log.info(f'Getter:开始执行, 当前容量:{count}')

        t = set()
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            try:
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Hand the crawl to the factory and remember what it returned
                # so the whole batch can be waited on below.
                t.add(self.factory.add(self.crawler.get_proxies, callback))
                sys.stdout.flush()
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate so the process can be stopped.
                traceback.print_exc()

        self.factory.wait(t)
        Log.info(f'Getter:执行结束, 获取前容量:{count}, 当前:{self.local.count()}')
示例#2
0
def get_counts():
    """
    Report how many proxies the pool currently holds.
    :return: the pool size as reported by ``LocalDict.count()``, as a string
    """
    return str(LocalDict().count())
示例#3
0
class Tester(object):
    """Probe every pooled proxy against TEST_URLS and penalise the failing ones."""

    def __init__(self):
        # Failure counter for the current run; guarded by ``mutex`` because
        # ``_minus`` takes the lock before touching it.
        self._minus_count = 0
        self.mutex = threading.Lock()
        self.local = LocalDict()
        self.factory = Factory()

    def _minus(self):
        """Add one to the failure counter while holding ``self.mutex``."""
        with self.mutex:
            self._minus_count = self._minus_count + 1

    def test_single_proxy(self, url, proxy):
        """
        Send one HEAD request to ``url`` through ``proxy`` and score the proxy.
        :param url: address to request
        :param proxy: proxy address; it is appended to "http://" to form the proxy URL
        :return: None
        """
        proxies = dict(http="http://" + proxy)
        try:
            status_code = requests.head(
                url,
                headers=base_headers,
                proxies=proxies,
                timeout=15,
                allow_redirects=False,
                verify=False,
            ).status_code

            # Accepted status: nothing to penalise.
            if status_code in VALID_STATUS_CODES:
                Log.debug(f'Tester:代理可用 {proxy}')
                return

            if status_code in FORBIDEN_STATUS_CODES:
                # Forbidden status: counted as a failure and decreased by -MAX_SCORE.
                self._minus()
                self.local.decrease(proxy, -MAX_SCORE)
            else:
                # Any other status: default decrease only.
                self.local.decrease(proxy)
            Log.error(
                f'Tester:请求响应码不合法 {status_code} ,IP {proxy}, URL: {url}')
        except (ReadTimeout, HTTPError, ProxyError, ConnectionError):
            # Transport-level failure: same treatment as a forbidden status.
            self._minus()
            self.local.decrease(proxy, -MAX_SCORE)
            Log.warning(f'Tester:无用ip,直接删掉, ip: {proxy}')
        except (TypeError, AttributeError) as e:
            self.local.decrease(proxy)
            Log.error(f'Tester:代理请求失败 {proxy} ERROR: {e}')

    def run(self):
        """
        Schedule a check of every pooled proxy against every test URL, then wait.
        :return: None
        """
        pending = set()
        self._minus_count = 0
        count = self.local.count()
        if count == 0:
            Log.info("Tester:无代理")
            return
        Log.info(f'Tester:开始运行, 当前容量:{count}')
        try:
            # One task per (proxy, url) pair.
            for proxy in self.local.batch(0, max(0, count)):
                for url in TEST_URLS:
                    task = self.factory.add(self.test_single_proxy, url, proxy)
                    pending.add(task)

            # NOTE(review): clear() runs before the scheduled tasks are waited
            # on; its effect on the pool is not visible here - confirm intent.
            self.local.clear()

        except Exception as e:
            Log.error(f'Tester:发生错误 {e.args}')

        self.factory.wait(pending)
        # The "remaining" figure is the starting count minus the failure
        # counter; it is not re-read from the pool.
        Log.info(f'Tester:执行结束, 测试前容量:{count}, 剩余:{count-self._minus_count}')