import csv

import requests
from lxml import etree


def req(url, titleList):
    # Relies on module-level objects prepared elsewhere in the script:
    # proxy, proxies, headers, logger and fifo_queue.
    html = None
    successed = False
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
        html = etree.HTML(response.content.decode('utf-8'))
        if response.status_code == 200:
            successed = True
            # logger.info("Using proxy < " + proxy._get_url() + " > to request < " + url + " >, result: success")
    except Exception:
        logger.info("Using proxy < " + proxy._get_url() + " > to request < " + url + " >, result: failure")
        # Record the titles that could not be crawled
        csv_filess = open('E:\\python\\环境1\\walmart\\Failure title.csv', 'a', newline='', encoding='utf-8')
        writerss = csv.writer(csv_filess)
        for er in range(0, len(titleList)):
            writerss.writerow([titleList[er]])
        csv_filess.close()
    # Update the proxy according to the result of the request
    proxy._update(successed)
    # Return the proxy to the queue; skip the availability check when returning it
    fifo_queue.push(proxy, need_check=False)
    return html
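`req()` reads `proxy`, `proxies`, `headers`, and `fifo_queue` from the enclosing scope rather than taking them as parameters, so those names must be prepared before the first call. Below is a minimal setup sketch, assuming the same `fifo_queue`, `base_headers`, and `USER_AGENT_LIST` objects used in the test script at the end of this section; the example URL and title list are hypothetical:

```python
import random

from proxy_util import base_headers  # project module shown in the test script below

# Borrow a proxy and build the module-level dicts that req() expects to find.
proxy = fifo_queue.pop(schema='http')  # fifo_queue is the project's proxy queue (import path not shown in the source)
proxies = {proxy.schema: proxy._get_url()}
headers = dict(base_headers)
if 'User-Agent' not in headers:
    headers['User-Agent'] = random.choice(USER_AGENT_LIST)

# Hypothetical call: titles that fail are appended to Failure title.csv by req() itself.
titleList = ['example title 1', 'example title 2']
html = req('https://www.walmart.com/search?q=example', titleList)
if html is not None:
    pass  # parse the returned lxml tree here
```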
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            if 'page' in url_dict.keys() and str.find(url_dict['url'], '{}') != -1:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            ul_list = html.xpath("//div[@class='wlist'][2]//ul[@class='l2']")
            for ul in ul_list:
                ip = ul.xpath("./span[1]/li/text()")[0] if len(ul.xpath("./span[1]/li/text()")) else None
                port = ul.xpath("./span[2]/li/text()")[0] if len(ul.xpath("./span[2]/li/text()")) else None
                schema = ul.xpath("./span[4]/li/text()")[0] if len(ul.xpath("./span[4]/li/text()")) else None
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns an empty list rather than None, so test for emptiness to stop paging
            if not ul_list:
                has_more = False
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            if 'page' in url_dict.keys() and str.find(url_dict['url'], '{}') != -1:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath("//div[@id='main-content']//table/tr[position()>1]")
            for tr in tr_list:
                ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns an empty list rather than None, so test for emptiness to stop paging
            if not tr_list:
                has_more = False
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            if 'page' in url_dict.keys() and str.find(url_dict['url'], '{}') != -1:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath("//table[@id='ip_list']//tr[@class!='subtitle']")
            for tr in tr_list:
                ip = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                port = tr.xpath("./td[3]/text()")[0] if len(tr.xpath("./td[3]/text()")) else None
                schema = tr.xpath("./td[6]/text()")[0] if len(tr.xpath("./td[6]/text()")) else None
                # Guard against a missing schema cell before lowercasing
                if schema is not None and schema.lower() in ("http", "https"):
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
            # xpath() returns an empty list rather than None, so test for emptiness to stop paging
            if not tr_list:
                has_more = False
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            if 'page' in url_dict.keys() and str.find(url_dict['url'], '{}') != -1:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")
            for tr in tr_list:
                ip = tr.xpath("./td[@data-title='IP']/text()")[0] if len(tr.xpath("./td[@data-title='IP']/text()")) else None
                port = tr.xpath("./td[@data-title='PORT']/text()")[0] if len(tr.xpath("./td[@data-title='PORT']/text()")) else None
                schema = tr.xpath("./td[@data-title='类型']/text()")[0] if len(tr.xpath("./td[@data-title='类型']/text()")) else None
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns an empty list rather than None, so test for emptiness to stop paging
            if not tr_list:
                has_more = False
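The four `_start_crawl()` implementations above differ only in the XPath expressions used to pull the IP, port, and schema out of each site's markup; the pagination loop is otherwise identical. A minimal sketch of how that shared loop could be hoisted into a base class, assuming the same `request_page`, `IPProxy`, `strip`, and `logger` helpers used above (`_parse_rows` is a hypothetical hook added for illustration, not part of the original code):

```python
from lxml import etree


class BaseCrawler(object):
    """Shared pagination loop; site-specific subclasses only supply the row-parsing XPath."""

    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] += 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                rows = self._parse_rows(html)  # hypothetical hook, overridden per site
                for schema, ip, port in rows:
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not rows:
                    has_more = False

    def _parse_rows(self, html):
        """Return a list of (schema, ip, port) tuples extracted from one result page."""
        raise NotImplementedError
```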
import random

import requests

from proxy_util import base_headers

# Test URL
url = 'http://blog.csdn.net/pengjunlee/article/details/90174453'

# Get a proxy from the queue (fifo_queue, USER_AGENT_LIST and logger come from the proxy-pool project)
proxy = fifo_queue.pop(schema='http')
proxies = {proxy.schema: proxy._get_url()}

# Build the request headers
headers = dict(base_headers)
if 'User-Agent' not in headers.keys():
    headers['User-Agent'] = random.choice(USER_AGENT_LIST)

response = None
successed = False
try:
    response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
    print(response.content.decode())
    if response.status_code == 200:
        successed = True
        logger.info("Using proxy < " + proxy._get_url() + " > to request < " + url + " >, result: success")
except Exception:
    logger.info("Using proxy < " + proxy._get_url() + " > to request < " + url + " >, result: failure")

# Update the proxy according to the result of the request
proxy._update(successed)
# Return the proxy to the queue; skip the availability check when returning it
fifo_queue.push(proxy, need_check=False)
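The test script follows the same lifecycle as the crawling code above: pop a proxy, attempt the request, record the outcome with `_update()`, then push the proxy back with `need_check=False`. A small sketch of wrapping that lifecycle in a context manager so the update-and-return step cannot be forgotten (`borrowed_proxy` is a hypothetical helper, not part of the project; `fifo_queue` is the project's queue object):

```python
import requests
from contextlib import contextmanager


@contextmanager
def borrowed_proxy(schema='http'):
    # Borrow a proxy from the queue, report whether it worked, and always return it.
    proxy = fifo_queue.pop(schema=schema)
    state = {'successed': False}
    try:
        yield proxy, state
    finally:
        proxy._update(state['successed'])
        fifo_queue.push(proxy, need_check=False)


# Usage sketch
with borrowed_proxy() as (proxy, state):
    try:
        response = requests.get(url, headers=headers,
                                proxies={proxy.schema: proxy._get_url()}, timeout=5)
        state['successed'] = (response.status_code == 200)
    except requests.RequestException:
        pass  # leave state['successed'] as False
```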