Пример #1
0
 def __get_page_from_url(self, company):
     """POST a company-name search and return the result page text.

     On HTTP 200 returns ``response.text``; on any failure the company
     is put back on ``self.company_queue``, the proxy is rotated, and
     None is returned.
     """
     data = {"complexname": company}
     if self.proxies:
         try:
             if IS_PROXY:
                 response = requests.post(self.url,
                                          headers=get_request_headers(),
                                          data=data,
                                          proxies=self.proxies,
                                          timeout=TIMEOUT)
             else:
                 response = requests.post(self.url,
                                          headers=get_request_headers(),
                                          data=data,
                                          timeout=TIMEOUT)
             if response.status_code == 200:
                 return response.text
             # Non-200: raise so the handler below re-queues and rotates.
             raise requests.HTTPError(response.status_code)
         except Exception:
             # FIX: narrowed from a bare ``except`` so KeyboardInterrupt /
             # SystemExit can still stop the worker.
             # Put the company back on the queue; switch to the next proxy.
             self.company_queue.put(company)
             if self.proxy_queue.qsize() > 0:
                 self.proxies = {'http': self.proxy_queue.get()}
                 print(self.proxies)
             else:
                 self.proxy_queue.join()
             return None
Пример #2
0
def get_html(url, proxy=None):
    """Fetch *url* and return the body text on HTTP 200 (else None).

    :param url: target URL.
    :param proxy: optional requests-style proxies dict; falsy means a
        direct (non-proxied) request.
    """
    # FIX: default was a mutable dict ({}); replaced with None to avoid
    # the shared-mutable-default pitfall. Both are falsy, so the branch
    # taken for an omitted argument is unchanged.
    time.sleep(0.5)  # crude politeness throttle between requests
    if not proxy:
        res = requests.get(url, headers=get_request_headers())
    else:
        res = requests.get(url, headers=get_request_headers(), proxies=proxy)
    if res.status_code == 200:
        return res.text
Пример #3
0
    def get_page_from_url(self, url):
        """Send a GET request to *url* and return the raw response body."""
        headers = get_request_headers()
        response = requests.get(url, headers=headers)
        # Debug trace for URL testing; remove before production use.
        print(url, response.status_code)

        return response.content
Пример #4
0
def __check_http_proxies(proxies, is_http=True):
    """Probe *proxies* against httpbin.org and grade the proxy.

    Returns ``(usable, nick_type, speed)`` where nick_type is
    2 = transparent, 1 = anonymous, 0 = elite (high anonymity),
    -1 = unknown, and speed is the round-trip time in seconds
    (-1 on failure).
    """
    nick_type, speed = -1, -1
    scheme = 'http' if is_http else 'https'
    test_url = '{}://httpbin.org/get'.format(scheme)

    try:
        started = time.time()
        response = requests.get(test_url,
                                headers=get_request_headers(),
                                proxies=proxies,
                                timeout=5)
        if not response.ok:
            return False, nick_type, speed

        speed = round(time.time() - started, 2)
        payload = json.loads(response.text)
        if ',' in payload['origin']:
            nick_type = 2  # transparent: both IPs visible in origin
        elif payload['headers'].get('Proxy-Connection', None):
            nick_type = 1  # anonymous: proxy header leaks through
        else:
            nick_type = 0  # elite / high anonymity
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
Пример #5
0
    def __get_page_from_html(self, project):
        """Fetch the detail page for *project*.

        Returns the page bytes on HTTP 200; on any failure the project is
        re-queued, the proxy rotated, and None is returned implicitly.
        """
        url = self.url + project.url

        def _rotate_proxy():
            # Refill the proxy queue from the config file when exhausted,
            # then switch to the next proxy.
            if self.proxy_queue.qsize() <= 0:
                print("The proxy_queue is empty!\n")
                with open('./config/proxy', 'r', encoding='utf-8') as f1:
                    for line in f1.readlines():
                        self.proxy_queue.put(line.strip())
            self.proxies = {'http': self.proxy_queue.get()}

        try:
            response = requests.get(url,
                                    headers=get_request_headers(),
                                    proxies=self.proxies,
                                    timeout=TIMEOUT)
            if response.status_code == 200:
                return response.content
            self.project_queue.put(project)
            print("detail_spider错误url:", response.status_code)
            # BUG FIX: the non-200 branch previously called
            # proxy_queue.get() twice (once in the qsize>0 branch, once
            # unconditionally), silently consuming an extra proxy and
            # risking a block on an emptied queue. Rotate exactly once.
            _rotate_proxy()
        except Exception as e:
            self.project_queue.put(project)
            print("detail_spider错误url:", e)
            _rotate_proxy()
Пример #6
0
 def get_page_from_url(self, url):
     """Fetch *url*; on an HTTP 521 anti-bot challenge, execute the
     page's obfuscated JavaScript to derive the ``_ydclearance`` cookie
     and retry with it. Returns the GBK-decoded page text."""
     headers = get_request_headers()
     response = requests.get(url, headers=headers)
     if response.status_code == 521:
         # The `_ydclearance` cookie is not set via a response header;
         # the 521 page carries an inline, obfuscated script that
         # generates it (the real JS is built into the variable `po`).
         # Extract the bootstrap call (e.g. `jp(107)`) and the function
         # source from the page.
         result = re.findall('window.onload=setTimeout\("(.+?)", 200\);\s*(.+?)\s*</script> ', response.content.decode('GBK'))
         # result[0] == (bootstrap call, obfuscated function source).
         # Patch the function so it returns the decoded JS instead of
         # eval'ing it: replace `eval("qo=eval;qo(po);")` with `return po`.
         func_str = result[0][1]
         func_str = func_str.replace('eval("qo=eval;qo(po);")', 'return po')
         # Fresh JavaScript execution environment.
         context = js2py.EvalJs()
         # Load (define) the patched function.
         context.execute(func_str)
         # Invoke it to produce the cookie-setting JS, e.g. `code = gv(50)`.
         context.execute('code = {};'.format(result[0][0]))
         # Pull the cookie value out of the generated `document.cookie=` code.
         cookie_str = re.findall("document.cookie='(.+?); ", context.code)[0]
         # Retry the original URL carrying the clearance cookie.
         headers['Cookie'] = cookie_str
         response = requests.get(url, headers=headers)
         return response.content.decode('GBK')
     else:
         return response.content.decode('GBK')
Пример #7
0
 def get_page_from_url(self, url):
     """Politely fetch *url* (random 1-3 s delay) and return the raw bytes."""
     delay = random.uniform(1, 3)
     time.sleep(delay)
     request_headers = get_request_headers()
     return requests.get(url, headers=request_headers).content
Пример #8
0
def _check_http_proxies(proxies, is_http=True):
    """Validate a proxy against httpbin.org.

    :return: ``(ok, nick_type, speed)`` — nick_type: 0 elite,
             1 anonymous, 2 transparent, -1 unknown; speed in seconds
             (-1 on failure).
    """
    nick_type = -1
    speed = -1
    test_url = ('http' if is_http else 'https') + '://httpbin.org/get'
    try:
        begin = time.time()

        resp = requests.get(test_url, headers=get_request_headers(), proxies=proxies, timeout=TEST_TIMEOUT)

        if not resp.ok:
            return False, nick_type, speed
        speed = round(time.time() - begin, 2)
        body = json.loads(resp.text)
        if ',' in body['origin']:
            nick_type = 2  # transparent: origin lists two IPs
        elif body['headers'].get('Proxy-Connection', None):
            nick_type = 1  # anonymous
        else:
            nick_type = 0  # elite
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
Пример #9
0
 def get_page_from_url(self, url):
     """Fetch *url* after a short random delay; return the body bytes.

     NOTE: a js2py-based workaround for HTTP 521 anti-bot pages used to
     live here; it is currently disabled.
     """
     pause = random.uniform(1, 3)
     time.sleep(pause)
     request_headers = get_request_headers()
     resp = requests.get(url, headers=request_headers)
     return resp.content
Пример #10
0
    def __check_one_proxy(self):
        """Worker loop: pull proxies off ``self.queue`` forever and keep
        those that can fetch the target site's expected page in under
        4 seconds (pushed onto ``self.content_queue``)."""
        while True:
            proxy = self.queue.get()
            proxies = {"http": "http://" + proxy}
            start_time = time.time()
            try:
                headers = get_request_headers()
                data = {"complexname": "中国建筑第五工程局有限公司"}
                res = requests.post(self.url,
                                    headers=headers,
                                    data=data,
                                    proxies=proxies,
                                    timeout=10)
                # Some bad proxies answer 200 with an error page whose
                # body starts with 'HTT' — filter those out too.
                if (res.status_code == 200) and res.text[:3] != 'HTT':
                    ele = etree.HTML(res.text)
                    result = ele.xpath('//font/b/text()')
                    if result:
                        if result[0] == "全国建筑市场监管公共服务平台":
                            t = round(time.time() - start_time, 2)
                            print(t, proxies)
                            if t < 4:
                                self.content_queue.put(proxies)
                    else:
                        print("SB::", proxies)

                else:
                    print("不行的代理:", proxies, res.status_code)
            except Exception:
                # FIX: narrowed from a bare ``except`` so Ctrl-C /
                # SystemExit can still stop the worker thread.
                print("其他错误:", proxies)
            self.queue.task_done()
Пример #11
0
def __check_http_proxies(proxies, is_http=True):
    """Check a proxy's usability and anonymity against httpbin.org.

    Returns ``(ok, nick_type, speed)``: nick_type is 0 for elite
    (high anonymity), 1 for anonymous, 2 for transparent, -1 when
    unknown; speed is the round trip in seconds (-1 on failure).
    """
    nick_type = -1
    speed = -1
    if is_http:
        probe_url = 'http://httpbin.org/get'
    else:
        probe_url = 'https://httpbin.org/get'
    try:
        t0 = time.time()
        reply = requests.get(probe_url, headers=get_request_headers(), proxies=proxies, timeout=VALIDATE_TIMEOUT)
        if not reply.ok:
            return False, nick_type, speed

        speed = round(time.time() - t0, 2)
        content = json.loads(reply.text)
        source_ip = content['origin']
        has_proxy_header = content['headers'].get('Proxy-Connection', None)

        if ',' in source_ip:
            nick_type = 2  # transparent proxy (both IPs visible)
        elif has_proxy_header:
            nick_type = 1  # anonymous proxy
        else:
            nick_type = 0  # elite / high-anonymity proxy

        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
Пример #12
0
def __check_http_proxies(proxies, is_http=True):
    """Probe httpbin.org through *proxies*.

    Returns ``(ok, nick_type, speed)``; nick_type: 2 transparent,
    1 anonymous, 0 elite, -1 unknown; speed in seconds or -1.
    """
    nick_type, speed = -1, -1
    scheme = 'http' if is_http else 'https'
    test_url = scheme + '://httpbin.org/get'
    try:
        began = time.time()
        # Send the probe request through the proxy.
        response = requests.get(test_url,
                                headers=get_request_headers(),
                                proxies=proxies,
                                timeout=TEST_TIMEOUT)
        # Debug trace of the probe's HTTP status.
        print(response.status_code)

        if not response.ok:
            return False, nick_type, speed

        # Measured round-trip time, two decimals.
        speed = round(time.time() - began, 2)
        # Judge anonymity from the echoed request.
        payload = json.loads(response.text)
        if ',' in payload['origin']:
            nick_type = 2  # transparent
        elif payload['headers'].get('Proxy-Connection', None):
            nick_type = 1  # anonymous
        else:
            nick_type = 0  # elite

        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
Пример #13
0
 def get_page_from_url(self, url):
     """Issue a GET request for *url* and return the response body bytes.

     :param url: page to fetch.
     :return: raw response content (bytes).
     """
     resp = requests.get(url, headers=get_request_headers())
     return resp.content
Пример #14
0
def _check_http_proxy(proxies, is_http=True):
    """Return True when *proxies* can successfully fetch httpbin.org.

    :param proxies: requests-style proxies dict.
    :param is_http: probe over HTTP when True, HTTPS otherwise.
    :return: bool — whether the probe request succeeded (``res.ok``).
    """
    test_url = ("http" if is_http else "https") + "://httpbin.org/get"

    try:
        res = requests.get(url=test_url,
                           headers=http.get_request_headers(),
                           timeout=settings.TEST_TIMEOUT,
                           proxies=proxies)
    except Exception:
        if is_http:
            print("******* HTTP请求出现错误!")
        else:
            print("******* HTTPS请求出现错误!")
        return False

    if res.ok:
        return True
    return False
Пример #15
0
 def __get_page_from_html(self):
     """GET ``self.url`` through the current proxy.

     Returns the body text on HTTP 200; otherwise logs the failure and
     returns None.
     """
     try:
         resp = requests.get(self.url, headers=get_request_headers(), proxies=self.proxies, timeout=10)
     except Exception as e:
         print("detail_spider错误url:",self.url, self.project, self.proxies)
         print(e)
         return None
     if resp.status_code == 200:
         return resp.text
     print("detail_spider错误url:", resp.status_code, self.url, self.project, self.proxies)
Пример #16
0
 def __get_page_from_html(self, company_code):
     """Fetch the company detail page for ``company_code``.

     Returns decoded HTML on HTTP 200; on any failure the code is put
     back on the queue, the proxy rotated, and None returned implicitly.
     """
     url = self.url + company_code[1]
     try:
         response = requests.get(url,
                                 headers=get_request_headers(),
                                 proxies=self.proxies,
                                 timeout=TIMEOUT)
         if response.status_code == 200:
             return response.content.decode()
         # Non-200: retry later with a different proxy.
         self.company_code_queue.put(company_code)
         self.proxies = {'http': self.proxy_queue.get()}
     except Exception:
         # FIX: narrowed from a bare ``except`` (it was also swallowing
         # KeyboardInterrupt / SystemExit).
         self.company_code_queue.put(company_code)
         self.proxies = {'http': self.proxy_queue.get()}
Пример #17
0
def http_check_proxies(proxies, isHttp=True):
    '''
    Validate a proxy IP by requesting httpbin.org through it.

    Returns (ok, nick_type, speed): nick_type 2 = transparent,
    1 = anonymous, 0 = elite (high anonymity), -1 = unknown;
    speed = round-trip seconds (-1 on failure).
    '''
    nick_type = -1  # anonymity grade
    speed = -1  # response time
    if isHttp:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start_time = time.time()
        res = requests.get(test_url,
                           headers=get_request_headers(),
                           proxies=proxies,
                           timeout=CHECK_TIMEOUT)
        end_time = time.time()
        cost_time = end_time - start_time

        if res.status_code == 200:
            speed = round(cost_time, 2)
            res_dict = json.loads(res.text)
            # Origin IP(s) as seen by httpbin.
            origin_ip = res_dict['origin']
            # BUG FIX: the header key was misspelled 'Proxy-Conntion',
            # so the anonymous-proxy branch could never trigger.
            proxy_connection = res_dict['headers'].get('Proxy-Connection', None)

            if "," in origin_ip:
                # Two comma-separated IPs in origin -> transparent proxy.
                nick_type = 2
            elif proxy_connection:
                # 'Proxy-Connection' present -> anonymous proxy.
                nick_type = 1
            else:
                nick_type = 0  # elite
            return True, nick_type, speed
        else:
            return False, nick_type, speed
    except Exception as e:
        # logger.exception(e)
        return False, nick_type, speed
Пример #18
0
def _check_http_proxy(proxies, isHttp=True):
    """Probe *proxies* against httpbin.org and grade its anonymity.

    :return: ``(ok, nick_type, speed)`` — nick_type: 0 high-anonymity,
             1 anonymous, 2 transparent, -1 unknown; speed in seconds
             (-1 on failure).
    """
    nick_type = -1
    speed = -1

    test_url = 'http://httpbin.org/get' if isHttp else 'https://httpbin.org/get'
    try:
        begin = time.time()
        response = requests.get(url=test_url,
                                headers=http.get_request_headers(),
                                timeout=settings.TEST_TIMEOUT,
                                proxies=proxies)
        if not response.ok:
            return False, nick_type, speed

        speed = round(time.time() - begin, 2)
        content = json.loads(response.text)
        reply_headers = content['headers']
        source_ip = content['origin']

        if ',' in source_ip:
            # Two comma-separated IPs in 'origin' -> transparent proxy.
            nick_type = 2
        elif reply_headers.get('Proxy-Connection', None):
            # 'Proxy-Connection' header present -> anonymous proxy.
            nick_type = 1
        else:
            # Otherwise the proxy is elite (high anonymity).
            nick_type = 0
        return True, nick_type, speed
    except Exception:
        # logger.exception(...) could go here for debugging.
        return False, nick_type, speed
Пример #19
0
def __check_http_proxies(proxies, is_http=True):
    """Test a proxy via httpbin.org.

    Returns ``(usable, nick_type, speed)``: nick_type is 0 (elite),
    1 (anonymous), 2 (transparent) or -1 (unknown); speed is the
    measured round trip in seconds, -1 when the check failed.
    """
    nick_type = -1
    speed = -1

    if is_http:
        probe = 'http://httpbin.org/get'
    else:
        probe = 'https://httpbin.org/get'

    try:
        t0 = time.time()
        reply = requests.get(probe,
                             headers=get_request_headers(),
                             proxies=proxies,
                             timeout=TEST_TIMEOUT)

        if reply.ok:
            speed = round(time.time() - t0, 2)
            payload = json.loads(reply.text)
            has_proxy_header = payload['headers'].get('Proxy-Connection', None)
            if ',' in payload['origin']:
                # Two comma-separated IPs in origin -> transparent proxy.
                nick_type = 2
            elif has_proxy_header:
                # Proxy-Connection header echoed back -> anonymous proxy.
                nick_type = 1
            else:
                # Neither marker present -> elite / high anonymity.
                nick_type = 0

            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
Пример #20
0
    def get_page_from_url(self, url):
        """Fetch a page; work around the site's HTTP 521 JS-cookie
        challenge with a hard-coded clearance cookie.

        NOTE(review): on a 521 response this discards the *url* argument
        and always refetches http://www.66ip.cn/1.html with a captured
        ``__jsl_clearance`` cookie — that cookie value is time-stamped
        and has presumably expired; verify before relying on this path.
        Returns the GBK-decoded page text.
        """
        headers = get_request_headers()
        response = requests.get(url, headers=headers)
        if response.status_code == 521:
            # Retry carrying a pre-captured clearance cookie.
            url = 'http://www.66ip.cn/1.html'
            # url='https://m.guazi.com/wh/dazhong/'
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                'Cookie':
                '__jsl_clearance=1565788094.68|0|EmOnBjtTOFID8%2BQNO3SBgFgs66A%3D',
                # 'Host':'www.66ip.cn'
            }
            response = requests.get(url, headers=headers)

            return response.content.decode('GBK')
        else:
            return response.content.decode('GBK')
Пример #21
0
 def __get_page_from_html(self, project):
     """Fetch the detail page for *project* through the current proxy.

     Returns page bytes on HTTP 200; on any failure the project is
     re-queued, the proxy rotated once, and None returned implicitly.
     """
     url = self.url + project.url
     try:
         response = requests.get(url,
                                 headers=get_request_headers(),
                                 proxies=self.proxies,
                                 timeout=TIMEOUT)
         if response.status_code == 200:
             return response.content
         self.project_queue.put(project)
         # BUG FIX: the original called proxy_queue.get() twice here,
         # discarding one proxy per failure (and briefly storing a bare
         # string in self.proxies). Rotate exactly once.
         self.proxies = {'http': self.proxy_queue.get()}
         print("detail_spider错误url:", response.status_code)
     except Exception as e:
         self.project_queue.put(project)
         self.proxies = {'http': self.proxy_queue.get()}
         print("detail_spider错误url:", e)
Пример #22
0
def __check_http_proxies(proxies, is_http=True):
    """Check a proxy's usability and anonymity via httpbin.org.

    Returns ``(ok, nice_type, speed)``: nice_type 0 = elite (high
    anonymity), 1 = anonymous, 2 = transparent, -1 = unknown;
    speed = round-trip seconds (-1 on failure).
    """
    # BUG FIX: was ``nice_type = -1,`` — the trailing comma made this a
    # one-element tuple, so an early failure returned (False, (-1,), -1)
    # instead of (False, -1, -1).
    nice_type = -1
    # Response time, in seconds.
    speed = -1
    if is_http:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start = time.time()
        # Probe httpbin.org (http or https) through the proxy.
        response = requests.get(test_url,
                                headers=get_request_headers(),
                                proxies=proxies,
                                timeout=TEST_TIMEOUT)
        print(response.text)
        if response.ok:
            # Response time, kept to two decimals.
            speed = round(time.time() - start, 2)
            # Parse the echoed request to judge anonymity.
            dic = json.loads(response.text)
            origin = dic['origin']
            proxy_connection = dic['headers'].get('Proxy-Connection', None)
            # Two comma-separated IPs in origin -> transparent proxy.
            if ',' in origin:
                nice_type = 2
            # 'Proxy-Connection' header present -> anonymous proxy.
            elif proxy_connection:
                nice_type = 1
            # Otherwise -> elite / high-anonymity proxy.
            else:
                nice_type = 0
            return True, nice_type, speed
        return False, nice_type, speed
    except Exception as es:
        # logger.exception(es)
        return False, nice_type, speed
Пример #23
0
def __check_http_proxies(proxies, is_http=True):
    """Check whether a proxy works and how anonymous it is.

    Returns ``(ok, nick_type, speed)``; nick_type: 2 transparent,
    1 anonymous, 0 elite, -1 unknown; speed in seconds or -1.
    """
    nick_type, speed = -1, -1
    scheme = 'http' if is_http else 'https'
    test_url = scheme + '://httpbin.org/get'
    try:
        t_start = time.time()
        resp = requests.get(test_url,
                            headers=get_request_headers(),
                            proxies=proxies,
                            timeout=TEST_TIMEOUT)
        if resp.ok:
            speed = round(time.time() - t_start, 2)
            body = resp.json()
            if ',' in body["origin"]:
                # Two comma-separated IPs in origin -> transparent proxy.
                nick_type = 2
            elif body["headers"].get("Proxy-Connection"):
                # Proxy-Connection header echoed back -> anonymous proxy.
                nick_type = 1
            else:
                # Neither marker -> elite / high anonymity.
                nick_type = 0
            return True, nick_type, speed
        return False, nick_type, speed

    except Exception as e:
        print(f'请求失败: {e}')
        # logger.exception(e)
        return False, nick_type, speed
Пример #24
0
 def __get_post_page_from_html(self, data, company_code):
     """POST *data* to the company detail URL through a proxy taken from
     the POST-proxy queue.

     Returns decoded HTML on HTTP 200 (and puts the proxy back on the
     queue); on failure the (company_code, data) pair is re-queued and
     the proxy is NOT returned (effectively discarded).
     """
     url = self.url + company_code[1]
     proxy = self.post_proxy_queue.get()
     proxies = {'http': proxy}
     try:
         response = requests.post(url,
                                  headers=get_request_headers(),
                                  data=data,
                                  proxies=proxies,
                                  timeout=TIMEOUT)
     except Exception as e:
         self.data_queue.put((company_code, data))
         print("post请求错误page", data, url, proxies)
         print(e)
         return None
     if response.status_code == 200:
         self.post_proxy_queue.put(proxy)
         return response.content.decode()
     self.data_queue.put((company_code, data))
     print("post请求错误page", data, url, proxies)
Пример #25
0
def _check_http_proxies(proxies, is_http=True):
    """Probe *proxies* via httpbin.org and grade its anonymity.

    Returns ``(ok, nick_type, speed)``: nick_type 2 = transparent,
    1 = anonymous, 0 = elite, -1 = unknown; speed = seconds (-1 on
    failure).
    """
    nick_type = -1
    speed = -1

    test_url = ('http' if is_http else 'https') + '://httpbin.org/get'

    # Timer starts before the request is issued.
    started = time.time()
    try:
        reply = requests.get(test_url, headers=http.get_request_headers(), proxies=proxies, timeout=TEST_TIMEOUT)

        if not reply.ok:
            return False, nick_type, speed

        speed = round(time.time() - started, 2)
        # Judge anonymity from the JSON echo of our request.
        payload = json.loads(reply.text)
        source_ip = payload['origin']
        has_proxy_header = payload['headers'].get('Proxy-Connection', None)
        if ',' in source_ip:
            # Origin carries two comma-separated IPs -> transparent.
            nick_type = 2
        elif has_proxy_header:
            # Proxy-Connection header leaked -> anonymous.
            nick_type = 1
        else:
            # Otherwise elite / high anonymity.
            nick_type = 0

        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
def __check_http_proxies(proxies, is_http=True):
    """Validate a proxy against httpbin.org.

    Returns ``(ok, nick_type, speed)`` — nick_type: 0 elite,
    1 anonymous, 2 transparent, -1 unknown; speed is the round trip
    in seconds (two decimals), -1 on failure.
    """
    nick_type = -1
    speed = -1
    if is_http:
        target = 'http://httpbin.org/get'
    else:
        target = 'https://httpbin.org/get'
    # Timer starts before the request goes out.
    begin = time.time()
    try:
        resp = requests.get(target, headers=get_request_headers(), proxies=proxies, timeout=TEST_TIMEOUT)
        if not resp.ok:
            return False, nick_type, speed

        speed = round(time.time() - begin, 2)
        echoed = json.loads(resp.text)
        if ',' in echoed['origin']:
            # Origin with two comma-separated IPs -> transparent proxy.
            nick_type = 2
        elif echoed['headers'].get('Proxy-Connection', None):
            # Proxy-Connection echoed back -> anonymous proxy.
            nick_type = 1
        else:
            # Neither marker present -> elite proxy.
            nick_type = 0
        return True, nick_type, speed
    except Exception:
        # logger.exception(...) could be enabled for debugging.
        return False, nick_type, speed
Пример #27
0
def _check_http_proxies(proxies, is_http=True):
    """Validate a proxy via httpbin.org and grade its anonymity.

    Returns ``(ok, nick_type, speed)``: nick_type 2 = transparent,
    1 = anonymous, 0 = elite (high anonymity), -1 = unknown;
    speed in seconds (-1 on failure).
    """
    nick_type = -1
    # Response time, in seconds.
    speed = -1
    if is_http:
        test_url = "http://httpbin.org/get"
    else:
        test_url = "https://httpbin.org/get"
    try:
        start = time.time()
        response = requests.get(test_url, headers=get_request_headers(), proxies=proxies, timeout=TEST_TIMEOUT)

        if response.ok:
            speed = round(time.time() - start, 2)
            dic = json.loads(response.text)
            origin = dic['origin']
            # BUG FIX: the header key was misspelled 'Proxy-Connetion'
            # (and the variable 'proxy_connettin'), so anonymous proxies
            # were always mis-graded as elite.
            proxy_connection = dic['headers'].get('Proxy-Connection', None)
            # Two comma-separated IPs in origin -> transparent proxy.
            if ',' in origin:
                nick_type = 2
            # Proxy-Connection header present -> anonymous proxy.
            elif proxy_connection:
                nick_type = 1
            # Otherwise -> elite / high-anonymity proxy.
            else:
                nick_type = 0
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception as ex:
        # logger.exception(ex)
        return False, nick_type, speed
Пример #28
0
 def get_page_from_url(self, url):
     """Send a GET request to *url* and return the page body bytes."""
     # FIX: the docstring was originally placed AFTER the first statement,
     # making it a no-op string expression instead of the function's
     # __doc__; moved to the top (and translated).
     headers = get_request_headers()
     response = requests.get(url, headers=headers)
     return response.content
Пример #29
0
 def get_page_from_url(self, url):
     """GET *url* with randomized request headers; return raw body bytes."""
     request_headers = get_request_headers()
     return requests.get(url, headers=request_headers).content
Пример #30
0
 def get_page_from_url(self, url, charset='gb2312'):
     """Fetch *url* and return the body decoded with *charset*.

     :param charset: text encoding of the page (defaults to gb2312).
     """
     raw = requests.get(url, headers=get_request_headers()).content
     return raw.decode(charset)