示例#1
0
class Getter:
    """Fill the local proxy pool by running every registered crawl function.

    ``__init__`` creates three collaborators: ``local`` (a ``LocalDict``),
    ``crawler`` (a ``Crawler``) and ``factory`` (a ``Factory`` that is used
    to schedule crawl tasks and wait for them).
    """

    def __init__(self):
        self.local = LocalDict()
        self.crawler = Crawler()
        self.factory = Factory()

    def is_over_threshold(self):
        """Return True when the pool size has reached POOL_UPPER_THRESHOLD."""
        return self.local.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Schedule every crawl function through the factory and wait for them.

        When the pool is already at its upper threshold nothing is scheduled;
        the method only logs and returns.
        """
        tasks = set()
        count = self.local.count()
        if self.is_over_threshold():
            Log.info("Getter:此时容量已达上限,不获取ip")
            return
        Log.info(f'Getter:开始执行, 当前容量:{count}')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            try:
                callback = self.crawler.__CrawlFunc__[callback_label]
                # Hand the crawl function to the factory; the returned handle
                # is collected so that all tasks can be awaited below.
                tasks.add(self.factory.add(self.crawler.get_proxies, callback))
                sys.stdout.flush()
            except Exception:
                # A failure while scheduling one crawl function must not stop
                # the remaining ones.  ``Exception`` (rather than a bare
                # ``except``) lets KeyboardInterrupt/SystemExit propagate.
                traceback.print_exc()

        self.factory.wait(tasks)
        Log.info(f'Getter:执行结束, 获取前容量:{count}, 当前:{self.local.count()}')
示例#2
0
def get_counts():
    """Report the size of the proxy pool.

    :return: ``str()`` of the value returned by ``LocalDict().count()``
    """
    pool_size = LocalDict().count()
    return str(pool_size)
示例#3
0
def random_proxy():
    """Fetch a proxy through ``LocalDict.random()``, log it and return it.

    :return: whatever ``LocalDict().random()`` produced
    """
    ip = LocalDict().random()
    Log.info(f"ip: {ip}")
    return ip
示例#4
0
def get_proxy():
    """Fetch a proxy through ``LocalDict.max()``, log it and return it.

    NOTE(review): presumably ``max()`` picks the best-scored entry rather
    than a random one -- confirm against ``LocalDict``.

    :return: whatever ``LocalDict().max()`` produced
    """
    ip = LocalDict().max()
    Log.info(f"ip: {ip}")
    return ip
示例#5
0
def decrease_proxy():
    """Report a proxy as useless.

    Reads the ``proxy`` query-string parameter from the current request and
    passes it, together with ``MAX_SCORE``, to ``LocalDict.decrease``.

    :return: the literal string ``"ok"``
    """
    proxy = request.args.get("proxy")
    # NOTE(review): the log line below speaks of deletion, so decreasing by
    # MAX_SCORE presumably evicts the entry -- confirm in LocalDict.decrease.
    LocalDict().decrease(proxy, MAX_SCORE)
    Log.info(f"删除的ip为{proxy}")
    return "ok"
示例#6
0
    def crawl_zdaye(self):
        """Yield ``host:port`` strings scraped from zdaye.com daily proxy posts.

        For list pages 1 to 4, every anchor matched by the post-list selector
        is followed, and an ``ip:port`` pair is extracted from the text that
        trails each ``<br>`` inside the post's ``.cont`` element.  Each list
        page (and its detail pages) is requested through the proxy returned
        by ``LocalDict().max()``.

        A list page for which ``get_page`` returns ``None`` is skipped; if a
        detail page comes back as ``None`` the rest of that list page is
        abandoned.

        :return: generator of ``"host:port"`` strings
        """
        base_url = 'https://www.zdaye.com/dayProxy/{page}.html'
        head = {
            "Cookie":
            "_qddac=3-3-1.1.u7q9fs.khfmp63u; acw_tc=76b20f7016052337002376740e34e53c53751ff3dd45dc920c0e102e8298be; __51cke__=; Hm_lvt_80f407a85cf0bc32ab5f9cc91c15f88b=1605233700; __root_domain_v=.zdaye.com; _qddaz=QD.amc9vg.uv9mtn.khfmp5xx; _qdda=3-1.1; _qddab=3-u7q9fs.khfmp63u; _qddamta_2355087264=3-0; acw_sc__v2=5fadec2632d8f78e61f23f09232ed1c5c171b200; ASPSESSIONIDSGQQDQDB=HPEIIOOCNIFNIJAPKMGKBPIO; __tins__16949115=%7B%22sid%22%3A%201605233699770%2C%20%22vd%22%3A%204%2C%20%22expires%22%3A%201605236758401%7D; __51laig__=4; Hm_lpvt_80f407a85cf0bc32ab5f9cc91c15f88b=1605234959"
        }
        # Compiled once instead of on every <br> that is inspected.
        proxy_pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')

        urls = [base_url.format(page=page) for page in range(1, 5)]

        for url in urls:
            proxy = {
                "http": "http://" + LocalDict().max(),
            }
            html = get_page(url, head, proxy)
            if html is None:
                continue
            doc = pq(html)
            for item in doc('#J_posts_list .thread_item div div p a').items():
                href = item.attr('href')
                if not href:
                    # An anchor without href cannot be followed; concatenating
                    # None would raise TypeError and kill the generator.
                    continue
                url_detail = 'https://www.zdaye.com' + href
                html_detail = get_page(url_detail, head, proxy)
                if html_detail is None:
                    break
                doc_detail = pq(html_detail)
                for tr in doc_detail('.cont br').items():
                    line = tr[0].tail
                    if not line:
                        # The element's tail is None when no text follows the
                        # <br>; re.search would raise TypeError on it.
                        continue
                    match = proxy_pattern.search(line)
                    if match:
                        host = match.group(1)
                        port = match.group(2)
                        yield ':'.join([host, port])
示例#7
0
 def __init__(self):
     """Create the collaborators held by this object: a LocalDict, a Crawler and a Factory."""
     # A RedisClient attribute was wired here at some point and is disabled:
     # self.redis = RedisClient()
     self.local = LocalDict()
     self.crawler = Crawler()
     self.factory = Factory()
示例#8
0
 def __init__(self):
     """Create the LocalDict and Factory collaborators plus a lock-guarded counter."""
     self.local = LocalDict()
     self.factory = Factory()
     # Lock and integer counter start out unlocked / at zero.
     self.mutex = threading.Lock()
     self._minus_count = 0
示例#9
0
class Tester(object):
    """Probe pooled proxies against TEST_URLS and penalise the ones that fail."""

    def __init__(self):
        self.local = LocalDict()
        self.factory = Factory()
        self.mutex = threading.Lock()
        self._minus_count = 0

    def _minus(self):
        """Increment the discarded-proxy counter while holding the mutex."""
        with self.mutex:
            self._minus_count += 1

    def _drop(self, proxy):
        """Count *proxy* as discarded, then decrease it by ``-MAX_SCORE``."""
        self._minus()
        self.local.decrease(proxy, -MAX_SCORE)

    def test_single_proxy(self, url, proxy):
        """Send one HEAD request to *url* through *proxy* and record the outcome.

        A status in VALID_STATUS_CODES is only logged.  A status in
        FORBIDEN_STATUS_CODES, or one of the listed network errors, counts
        the proxy as discarded and decreases it by ``-MAX_SCORE``.  Any other
        status, and TypeError/AttributeError, trigger a plain
        ``decrease(proxy)``.

        :param url: address to request
        :param proxy: proxy address without scheme; ``http://`` is prepended
        :return: None
        """
        proxies = {
            "http": "http://" + proxy,
        }
        try:
            response = requests.head(
                url,
                headers=base_headers,
                proxies=proxies,
                timeout=15,
                allow_redirects=False,
                verify=False,
            )
            status_code = response.status_code
            if status_code in VALID_STATUS_CODES:
                Log.debug(f'Tester:代理可用 {proxy}')
                return
            if status_code in FORBIDEN_STATUS_CODES:
                self._drop(proxy)
            else:
                self.local.decrease(proxy)
            Log.error(
                f'Tester:请求响应码不合法 {status_code} ,IP {proxy}, URL: {url}')
        except (ReadTimeout, HTTPError, ProxyError, ConnectionError):
            self._drop(proxy)
            Log.warning(f'Tester:无用ip,直接删掉, ip: {proxy}')
        except (TypeError, AttributeError) as e:
            self.local.decrease(proxy)
            Log.error(f'Tester:代理请求失败 {proxy} ERROR: {e}')

    def run(self):
        """Schedule a test for every (proxy, URL) pair and wait for completion.

        Returns early when the pool is empty.  Otherwise the pool is read
        with ``batch(0, count)``, one task per proxy and TEST_URLS entry is
        queued on the factory, ``self.local.clear()`` is called, and the
        method blocks on ``factory.wait`` before logging a summary.

        NOTE(review): the counter is bumped once per failing task, so with
        more than one entry in TEST_URLS the logged remainder may undercount
        -- confirm.

        :return: None
        """
        pending = set()
        self._minus_count = 0
        count = self.local.count()
        if count == 0:
            Log.info("Tester:无代理")
            return
        Log.info(f'Tester:开始运行, 当前容量:{count}')
        try:
            for proxy in self.local.batch(0, max(0, count)):
                for url in TEST_URLS:
                    task = self.factory.add(self.test_single_proxy, url, proxy)
                    pending.add(task)

            self.local.clear()

        except Exception as e:
            Log.error(f'Tester:发生错误 {e.args}')

        self.factory.wait(pending)
        Log.info(f'Tester:执行结束, 测试前容量:{count}, 剩余:{count-self._minus_count}')
示例#10
0
 def save_proxies(self, proxies):
     """Add each item of *proxies* to a freshly created ``LocalDict``.

     :param proxies: iterable of items passed one by one to ``LocalDict.add``
     """
     store = LocalDict()
     for entry in proxies:
         store.add(entry)
示例#11
0
@app.route('/sleep_count')
def get_counts():
    """Handle ``/sleep_count``: report the size of the proxy pool.

    :return: ``str()`` of the value returned by ``LocalDict().count()``
    """
    pool_size = LocalDict().count()
    return str(pool_size)


@app.route('/useless')
def decrease_proxy():
    """Handle ``/useless``: report a proxy as useless.

    Reads the ``proxy`` query-string parameter from the current request and
    passes it, together with ``MAX_SCORE``, to ``LocalDict.decrease``.

    :return: the literal string ``"ok"``
    """
    proxy = request.args.get("proxy")
    # NOTE(review): the log line below speaks of deletion, so decreasing by
    # MAX_SCORE presumably evicts the entry -- confirm in LocalDict.decrease.
    LocalDict().decrease(proxy, MAX_SCORE)
    Log.info(f"删除的ip为{proxy}")
    return "ok"


if __name__ == '__main__':
    # Seed a LocalDict with a few placeholder entries before serving
    # (presumably key -> score pairs; confirm against LocalDict.add).
    seed_store = LocalDict()
    seed_entries = {"aaa": 12, "bbb": 5, "ccc": 16, "ddd": 20}
    for key, value in seed_entries.items():
        seed_store.add(key, value)
    # NOTE(review): assuming ``app`` is a Flask application, debug=True is a
    # development-only setting.
    app.run(debug=True)