def __get_page_from_url(self, company):
    """POST a company-name query and return the response text on HTTP 200.

    On any request failure the company is put back on the work queue and the
    proxy is rotated to the next one in the proxy queue. Returns None when no
    proxy is configured or the request failed.
    """
    data = {"complexname": company}
    if self.proxies:
        try:
            if IS_PROXY:
                response = requests.post(self.url, headers=get_request_headers(), data=data,
                                         proxies=self.proxies, timeout=TIMEOUT)
            else:
                response = requests.post(self.url, headers=get_request_headers(), data=data,
                                         timeout=TIMEOUT)
            if response.status_code == 200:
                return response.text
            # Non-200: signal failure so the handler below requeues + rotates.
            raise requests.HTTPError(response.status_code)
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit;
        # RequestException covers connection errors, timeouts and the HTTPError above.
        except requests.RequestException:
            # Put the company back on the queue and switch to another proxy IP.
            self.company_queue.put(company)
            if self.proxy_queue.qsize() > 0:
                self.proxies = {'http': self.proxy_queue.get()}
                print(self.proxies)
            else:
                self.proxy_queue.join()
    return None
def get_html(url, proxy=None):
    """GET *url* (optionally through *proxy*) and return its text on HTTP 200.

    Sleeps 0.5 s before each request as a crude rate limit. Returns None on
    any non-200 status.

    Fix: the default was the mutable `proxy={}` — a classic shared-default
    pitfall; `None` is backward-compatible (both are falsy).
    """
    time.sleep(0.5)
    if proxy:
        res = requests.get(url, headers=get_request_headers(), proxies=proxy)
    else:
        res = requests.get(url, headers=get_request_headers())
    if res.status_code == 200:
        return res.text
    return None
def get_page_from_url(self, url):
    """Fetch the page at *url* and return the raw response body."""
    resp = requests.get(url, headers=get_request_headers())
    # Debug output used while testing URLs; remove for production.
    print(url, resp.status_code)
    return resp.content
def __check_http_proxies(proxies, is_http=True):
    """Probe httpbin through *proxies*; return (usable, nick_type, speed).

    nick_type: 2 = transparent, 1 = anonymous, 0 = elite; -1 when unknown.
    speed: response time in seconds; -1 when unknown.
    """
    nick_type, speed = -1, -1
    test_url = 'http://httpbin.org/get' if is_http else 'https://httpbin.org/get'
    try:
        started = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=5)
        if not response.ok:
            return False, nick_type, speed
        speed = round(time.time() - started, 2)
        payload = json.loads(response.text)
        if ',' in payload['origin']:
            nick_type = 2  # origin shows two IPs -> transparent proxy
        elif payload['headers'].get('Proxy-Connection', None):
            nick_type = 1  # proxy header leaked -> anonymous proxy
        else:
            nick_type = 0  # elite (high anonymity)
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
def __get_page_from_html(self, project):
    """Download a project detail page and return its raw content.

    On failure (non-200 or a raised exception) the project is put back on the
    work queue and the proxy is rotated; when the proxy queue runs dry it is
    reloaded from ./config/proxy.

    Fix: the rotate-and-reload logic was duplicated verbatim in the else and
    except branches; it is now a single inner helper.
    """
    url = self.url + project.url

    def _rotate_proxy():
        # Take the next proxy; when the queue is empty, refill it from disk.
        if self.proxy_queue.qsize() > 0:
            self.proxies = {'http': self.proxy_queue.get()}
        else:
            print("The proxy_queue is empty!\n")
            with open('./config/proxy', 'r', encoding='utf-8') as f1:
                for line in f1.readlines():
                    self.proxy_queue.put(line.strip())
            self.proxies = {'http': self.proxy_queue.get()}

    try:
        response = requests.get(url, headers=get_request_headers(),
                                proxies=self.proxies, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.content
        self.project_queue.put(project)
        print("detail_spider错误url:", response.status_code)
        _rotate_proxy()
    except Exception as e:
        self.project_queue.put(project)
        print("detail_spider错误url:", e)
        _rotate_proxy()
def get_page_from_url(self, url):
    """Fetch a page guarded by a JS cookie challenge (HTTP 521) and return it decoded as GBK."""
    headers = get_request_headers()
    response = requests.get(url, headers=headers)
    if response.status_code == 521:
        # 521 means the server replied with an anti-scraping challenge: the
        # `_ydclearance` cookie is NOT set via response headers — it is
        # produced by obfuscated JS embedded in this first page. The real JS
        # payload lives in the variable `po`; we must execute the wrapper to
        # obtain it, then run it to mint the cookie.
        # Extract both the bootstrap call (e.g. `jp(107)`) and the obfuscated
        # function body from the page.
        result = re.findall('window.onload=setTimeout\("(.+?)", 200\);\s*(.+?)\s*</script> ', response.content.decode('GBK'))
        # Patch the wrapper so that instead of eval'ing the payload it
        # returns it: replace `eval("qo=eval;qo(po);")` with `return po`.
        func_str = result[0][1]
        func_str = func_str.replace('eval("qo=eval;qo(po);")', 'return po')
        # Create a JS execution environment.
        context = js2py.EvalJs()
        # Load (execute) the patched function definition.
        context.execute(func_str)
        # Invoke the bootstrap call to generate the cookie-setting JS,
        # e.g. `code = gv(50)`.
        context.execute('code = {};'.format(result[0][0]))
        # Pull the cookie assignment out of the generated JS code.
        cookie_str = re.findall("document.cookie='(.+?); ", context.code)[0]
        # Retry the original URL carrying the freshly minted cookie.
        headers['Cookie'] = cookie_str
        response = requests.get(url, headers=headers)
        return response.content.decode('GBK')
    else:
        return response.content.decode('GBK')
def get_page_from_url(self, url):
    """Politely fetch *url*: wait a random 1-3 s, then GET with rotating headers."""
    time.sleep(random.uniform(1, 3))
    resp = requests.get(url, headers=get_request_headers())
    return resp.content
def _check_http_proxies(proxies, is_http=True):
    """Classify a proxy via httpbin; returns (works, nick_type, speed).

    nick_type: 2 transparent / 1 anonymous / 0 elite; -1 when unreachable.
    """
    nick_type = -1
    speed = -1
    test_url = 'http://httpbin.org/get' if is_http else 'https://httpbin.org/get'
    try:
        stamp = time.time()
        resp = requests.get(test_url, headers=get_request_headers(),
                            proxies=proxies, timeout=TEST_TIMEOUT)
        if resp.ok:
            speed = round(time.time() - stamp, 2)
            doc = json.loads(resp.text)
            leaked_header = doc['headers'].get('Proxy-Connection', None)
            if ',' in doc['origin']:
                nick_type = 2  # both IPs visible -> transparent
            elif leaked_header:
                nick_type = 1  # proxy header present -> anonymous
            else:
                nick_type = 0  # elite
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def get_page_from_url(self, url):
    """Fetch *url* after a random 1-3 s delay and return the raw body.

    (A disabled 521/js2py cookie-challenge handler previously lived here as
    commented-out code; it was dead and has been dropped.)
    """
    time.sleep(random.uniform(1, 3))
    response = requests.get(url, headers=get_request_headers())
    return response.content
def __check_one_proxy(self):
    """Worker loop: validate proxies from self.queue forever.

    Each proxy is tested with a real POST against self.url; proxies that
    answer the expected page within 4 s are pushed to self.content_queue.
    Marks every pulled item done so queue.join() can complete.

    Fix: the bare `except:` also swallowed KeyboardInterrupt/SystemExit,
    making the worker unkillable; narrowed to `except Exception`.
    """
    while True:
        proxy = self.queue.get()
        proxies = {"http": "http://" + proxy}
        start_time = time.time()
        try:
            headers = get_request_headers()
            data = {"complexname": "中国建筑第五工程局有限公司"}
            res = requests.post(self.url, headers=headers, data=data,
                                proxies=proxies, timeout=10)
            # Some broken proxies echo raw 'HTTP/...' text back; skip those.
            if (res.status_code == 200) and res.text[:3] != 'HTT':
                ele = etree.HTML(res.text)
                result = ele.xpath('//font/b/text()')
                if result:
                    if result[0] == "全国建筑市场监管公共服务平台":
                        t = round(time.time() - start_time, 2)
                        print(t, proxies)
                        if t < 4:
                            self.content_queue.put(proxies)
                        else:
                            print("SB::", proxies)
                else:
                    print("不行的代理:", proxies, res.status_code)
        except Exception:
            print("其他错误:", proxies)
        self.queue.task_done()
def __check_http_proxies(proxies, is_http=True):
    """Check one proxy against httpbin; return (usable, nick_type, speed).

    nick_type: 0 = elite (high anonymity), 1 = anonymous, 2 = transparent.
    """
    nick_type = -1
    speed = -1
    test_url = 'http://httpbin.org/get' if is_http else 'https://httpbin.org/get'
    try:
        begin = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=VALIDATE_TIMEOUT)
        if not response.ok:
            return False, nick_type, speed
        speed = round(time.time() - begin, 2)
        content = json.loads(response.text)
        if ',' in content['origin']:
            nick_type = 2  # transparent: origin lists both client and proxy IPs
        elif content['headers'].get('Proxy-Connection', None):
            nick_type = 1  # anonymous
        else:
            nick_type = 0  # elite
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
def __check_http_proxies(proxies, is_http=True):
    """Probe httpbin via *proxies*; return (usable, nick_type, speed).

    nick_type: 2 transparent, 1 anonymous, 0 elite; -1 when unknown.
    Fix: removed a stray debug `print(response.status_code)` that spammed
    stdout on every check.
    """
    nick_type = -1
    speed = -1
    if is_http:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=TEST_TIMEOUT)
        if response.ok:
            # Response time, rounded to two decimals.
            speed = round(time.time() - start, 2)
            dic = json.loads(response.text)
            origin = dic['origin']
            proxy_connection = dic['headers'].get('Proxy-Connection', None)
            if ',' in origin:
                nick_type = 2  # transparent proxy
            elif proxy_connection:
                nick_type = 1  # anonymous proxy
            else:
                nick_type = 0  # elite proxy
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def get_page_from_url(self, url):
    """Send a GET request to *url* and return the raw page body."""
    return requests.get(url, headers=get_request_headers()).content
def _check_http_proxy(proxies, is_http=True):
    """Return True when *proxies* can successfully fetch httpbin, else False.

    Fix: removed the large block of commented-out dead code (speed and
    anonymity classification) that obscured the actual logic — this function
    only ever returned a bare bool.
    """
    if is_http:
        test_url = "http://httpbin.org/get"
    else:
        test_url = "https://httpbin.org/get"
    try:
        res = requests.get(url=test_url, headers=http.get_request_headers(),
                           timeout=settings.TEST_TIMEOUT, proxies=proxies)
        # `ok` is True for any status < 400.
        return res.ok
    except Exception:
        if is_http:
            print("******* HTTP请求出现错误!")
        else:
            print("******* HTTPS请求出现错误!")
        return False
def __get_page_from_html(self):
    """GET self.url through the current proxy; return the page text on 200, else None."""
    try:
        resp = requests.get(self.url, headers=get_request_headers(),
                            proxies=self.proxies, timeout=10)
    except Exception as e:
        # Request failed outright — log context and give up on this attempt.
        print("detail_spider错误url:", self.url, self.project, self.proxies)
        print(e)
        return None
    if resp.status_code == 200:
        return resp.text
    print("detail_spider错误url:", resp.status_code, self.url, self.project, self.proxies)
    return None
def __get_page_from_html(self, company_code):
    """Fetch a company detail page; return it decoded, or None on failure.

    On any failure (non-200 status, request error, or decode error) the work
    item is put back on the queue and the proxy is rotated.

    Fixes: replaced the bare `except:` (which also caught KeyboardInterrupt)
    with `except Exception`, and deduplicated the requeue/rotate code that
    appeared verbatim in both the else and except branches.
    """
    url = self.url + company_code[1]
    try:
        response = requests.get(url, headers=get_request_headers(),
                                proxies=self.proxies, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.content.decode()
    except Exception:
        pass
    # Non-200 or error: put the item back and switch to the next proxy.
    self.company_code_queue.put(company_code)
    self.proxies = {'http': self.proxy_queue.get()}
def http_check_proxies(proxies, isHttp=True):
    """Validate a proxy IP; return (ok, nick_type, speed).

    nick_type: 2 = transparent, 1 = anonymous, 0 = elite (high anonymity);
    speed: response time in seconds. Both are -1 when the check fails.
    """
    nick_type = -1
    speed = -1
    if isHttp:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start_time = time.time()
        res = requests.get(test_url, headers=get_request_headers(),
                           proxies=proxies, timeout=CHECK_TIMEOUT)
        cost_time = time.time() - start_time
        if res.status_code == 200:
            speed = round(cost_time, 2)
            res_dict = json.loads(res.text)
            # Origin IP the target server saw.
            origin_ip = res_dict['origin']
            # Bug fix: the header key was misspelled 'Proxy-Conntion', so the
            # anonymous-proxy branch could never trigger.
            proxy_connection = res_dict['headers'].get('Proxy-Connection', None)
            if "," in origin_ip:
                nick_type = 2  # two comma-separated IPs -> transparent
            elif proxy_connection:
                nick_type = 1  # proxy header present -> anonymous
            else:
                nick_type = 0  # elite
            return True, nick_type, speed
        else:
            return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def _check_http_proxy(proxies, isHttp=True):
    """Validate a proxy against httpbin; return (ok, nick_type, speed).

    nick_type: 2 transparent / 1 anonymous / 0 elite; -1 when unknown.
    """
    nick_type, speed = -1, -1
    target = 'http://httpbin.org/get' if isHttp else 'https://httpbin.org/get'
    try:
        t0 = time.time()
        resp = requests.get(url=target, headers=http.get_request_headers(),
                            timeout=settings.TEST_TIMEOUT, proxies=proxies)
        if not resp.ok:
            return False, nick_type, speed
        speed = round(time.time() - t0, 2)
        body = json.loads(resp.text)
        seen_headers = body['headers']
        if ',' in body['origin']:
            nick_type = 2  # origin shows both client and proxy IPs
        elif seen_headers.get('Proxy-Connection', None):
            nick_type = 1  # proxy header leaked
        else:
            nick_type = 0  # elite
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
def __check_http_proxies(proxies, is_http=True):
    """Time a request to httpbin through *proxies* and classify its anonymity.

    Returns (usable, nick_type, speed) where nick_type is
    0 elite / 1 anonymous / 2 transparent (-1 unknown) and speed is seconds.
    """
    nick_type = -1
    speed = -1
    if is_http:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=TEST_TIMEOUT)
        if response.ok:
            speed = round(time.time() - start, 2)
            payload = json.loads(response.text)
            # Two comma-separated IPs in origin -> the proxy is transparent;
            # a leaked Proxy-Connection header -> merely anonymous.
            transparent = ',' in payload['origin']
            leaked = payload['headers'].get('Proxy-Connection', None)
            nick_type = 2 if transparent else (1 if leaked else 0)
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def get_page_from_url(self, url):
    """GET *url*; on a 521 anti-bot response retry a fixed page with a baked-in clearance cookie."""
    headers = get_request_headers()
    response = requests.get(url, headers=headers)
    if response.status_code != 521:
        return response.content.decode('GBK')
    # 521 challenge: retry carrying a previously captured cookie.
    # NOTE(review): the cookie and URL are hard-coded snapshots and will
    # eventually expire — confirm they are still valid.
    url = 'http://www.66ip.cn/1.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': '__jsl_clearance=1565788094.68|0|EmOnBjtTOFID8%2BQNO3SBgFgs66A%3D',
    }
    response = requests.get(url, headers=headers)
    return response.content.decode('GBK')
def __get_page_from_html(self, project):
    """Fetch a project detail page; on failure re-queue the project and switch proxy.

    Bug fix: on a non-200 response the original assigned `self.proxies` twice
    in a row (`self.proxies = self.proxy_queue.get()` immediately overwritten
    by the dict form), silently consuming and discarding one extra proxy from
    the queue per failure. Exactly one proxy is taken now.
    """
    url = self.url + project.url
    try:
        response = requests.get(url, headers=get_request_headers(),
                                proxies=self.proxies, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.content
        self.project_queue.put(project)
        self.proxies = {'http': self.proxy_queue.get()}
        print("detail_spider错误url:", response.status_code)
    except Exception as e:
        self.project_queue.put(project)
        self.proxies = {'http': self.proxy_queue.get()}
        print("detail_spider错误url:", e)
def __check_http_proxies(proxies, is_http=True):
    """Check one proxy IP via httpbin; return (usable, nice_type, speed).

    nice_type: 0 elite / 1 anonymous / 2 transparent; -1 when unknown.
    speed: response time in seconds, 2 decimals; -1 when unknown.

    Bug fix: the original wrote `nice_type = -1,` — the trailing comma made
    it a one-element TUPLE, so a failed request returned (False, (-1,), -1).
    Also removed the stray debug `print(response.text)`.
    """
    nice_type = -1
    speed = -1
    if is_http:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    try:
        start = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=TEST_TIMEOUT)
        if response.ok:
            speed = round(time.time() - start, 2)
            dic = json.loads(response.text)
            origin = dic['origin']
            proxy_connection = dic['headers'].get('Proxy-Connection', None)
            if ',' in origin:
                nice_type = 2  # two IPs in origin -> transparent
            elif proxy_connection:
                nice_type = 1  # Proxy-Connection present -> anonymous
            else:
                nice_type = 0  # elite
            return True, nice_type, speed
        return False, nice_type, speed
    except Exception:
        return False, nice_type, speed
def __check_http_proxies(proxies, is_http=True):
    """Return (usable, nick_type, speed) for one proxy checked via httpbin."""
    # nick_type: 2 transparent / 1 anonymous / 0 elite; -1 unknown.
    nick_type = -1
    speed = -1
    test_url = 'http://httpbin.org/get' if is_http else 'https://httpbin.org/get'
    try:
        begin = time.time()
        resp = requests.get(test_url, headers=get_request_headers(),
                            proxies=proxies, timeout=TEST_TIMEOUT)
        if resp.ok:
            speed = round(time.time() - begin, 2)
            body = resp.json()
            if ',' in body["origin"]:
                nick_type = 2  # origin carries two comma-separated IPs
            elif body["headers"].get("Proxy-Connection"):
                nick_type = 1  # proxy header visible to the server
            else:
                nick_type = 0  # elite
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception as e:
        print(f'请求失败: {e}')
        return False, nick_type, speed
def __get_post_page_from_html(self, data, company_code):
    """POST *data* to the company detail page; return decoded HTML, or re-queue on failure.

    A proxy that works is returned to the proxy queue for reuse; a failing
    request leaves it out of rotation and puts (company_code, data) back on
    the data queue.
    """
    url = self.url + company_code[1]
    proxy = self.post_proxy_queue.get()
    proxies = {'http': proxy}
    try:
        response = requests.post(url, headers=get_request_headers(), data=data,
                                 proxies=proxies, timeout=TIMEOUT)
        if response.status_code == 200:
            self.post_proxy_queue.put(proxy)
            return response.content.decode()
        self.data_queue.put((company_code, data))
        print("post请求错误page", data, url, proxies)
    except Exception as e:
        self.data_queue.put((company_code, data))
        print("post请求错误page", data, url, proxies)
        print(e)
def _check_http_proxies(proxies, is_http=True):
    """Check a proxy's usability, anonymity level and speed via httpbin.

    Returns (usable, nick_type, speed); nick_type is 2 transparent /
    1 anonymous / 0 elite, -1 unknown.
    """
    nick_type = -1
    speed = -1
    test_url = 'http://httpbin.org/get' if is_http else 'https://httpbin.org/get'
    # Timer starts before the try block, matching the original behavior.
    start = time.time()
    try:
        resp = requests.get(test_url, headers=http.get_request_headers(),
                            proxies=proxies, timeout=TEST_TIMEOUT)
        if not resp.ok:
            return False, nick_type, speed
        speed = round(time.time() - start, 2)
        info = json.loads(resp.text)
        if ',' in info['origin']:
            nick_type = 2  # transparent: both IPs in origin
        elif info['headers'].get('Proxy-Connection', None):
            nick_type = 1  # anonymous
        else:
            nick_type = 0  # elite
        return True, nick_type, speed
    except Exception:
        return False, nick_type, speed
def __check_http_proxies(proxies, is_http=True):
    """Measure a proxy's speed and anonymity against httpbin.

    Returns (usable, nick_type, speed); nick_type: 2 transparent,
    1 anonymous, 0 elite, -1 unknown.
    """
    nick_type = -1
    speed = -1
    if is_http:
        test_url = 'http://httpbin.org/get'
    else:
        test_url = 'https://httpbin.org/get'
    # Start timing before the try block, as the original did.
    start = time.time()
    try:
        resp = requests.get(test_url, headers=get_request_headers(),
                            proxies=proxies, timeout=TEST_TIMEOUT)
        if resp.ok:
            speed = round(time.time() - start, 2)  # two decimal places
            data = json.loads(resp.text)
            origin = data['origin']
            if ',' in origin:
                nick_type = 2  # comma in origin -> transparent proxy
            elif data['headers'].get('Proxy-Connection', None):
                nick_type = 1  # anonymous proxy
            else:
                nick_type = 0  # elite proxy
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def _check_http_proxies(proxies, is_http=True):
    """Probe httpbin through *proxies*; return (usable, nick_type, speed).

    nick_type: 2 transparent, 1 anonymous, 0 elite; -1 unknown.
    speed: response time in seconds; -1 unknown.

    Bug fix: the header key was misspelled 'Proxy-Connetion' (and the variable
    'proxy_connettin'), so anonymous proxies were always classified as elite.
    """
    nick_type = -1
    speed = -1
    if is_http:
        test_url = "http://httpbin.org/get"
    else:
        test_url = "https://httpbin.org/get"
    try:
        start = time.time()
        response = requests.get(test_url, headers=get_request_headers(),
                                proxies=proxies, timeout=TEST_TIMEOUT)
        if response.ok:
            speed = round(time.time() - start, 2)
            dic = json.loads(response.text)
            origin = dic['origin']
            proxy_connection = dic['headers'].get('Proxy-Connection', None)
            if ',' in origin:
                nick_type = 2  # origin lists two IPs -> transparent
            elif proxy_connection:
                nick_type = 1  # anonymous
            else:
                nick_type = 0  # elite
            return True, nick_type, speed
        return False, nick_type, speed
    except Exception:
        return False, nick_type, speed
def get_page_from_url(self, url):
    """Request the page at *url* and return its raw bytes."""
    hdrs = get_request_headers()
    resp = requests.get(url, headers=hdrs)
    return resp.content
def get_page_from_url(self, url):
    """GET *url* with rotating request headers; return the raw response body."""
    return requests.get(url, headers=get_request_headers()).content
def get_page_from_url(self, url, charset='gb2312'):
    """GET *url* and return its body decoded with *charset* (default gb2312)."""
    raw = requests.get(url, headers=get_request_headers()).content
    return raw.decode(charset)