Example #1
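Fetches a page through a proxy taken from the pool: on success it returns the parsed lxml tree, on failure it appends the affected titles to a CSV file for a later retry, and in both cases it updates the proxy's statistics and returns it to the FIFO queue.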
def req(url, titleList):
    # Assumes module-level requests, etree, csv, headers, proxies, proxy,
    # fifo_queue and logger are already set up (see Example #6).
    html = None
    successed = False
    try:
        response = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=5)
        html = etree.HTML(response.content.decode('utf-8'))
        if response.status_code == 200:
            successed = True
            # logger.info("Using proxy < " + proxy._get_url() + " > request < " + url + " > result: success ")
    except Exception:
        logger.info("Using proxy < " + proxy._get_url() + " > request < " +
                    url + " > result: failure ")
        # Record the titles that failed so they can be retried later
        with open('E:\\python\\环境1\\walmart\\Failure title.csv',
                  'a',
                  newline='',
                  encoding='utf-8') as csv_filess:
            writerss = csv.writer(csv_filess)
            for title in titleList:
                writerss.writerow([title])

    # Update the proxy's statistics based on the response result
    proxy._update(successed)
    # Return the proxy to the queue without re-checking its availability
    fifo_queue.push(proxy, need_check=False)
    return html
Example #2
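A crawler method that walks the configured URL list (with optional pagination), parses each page with lxml, extracts IP, port and schema from the list items, and pushes well-formed IPProxy objects onto the queue.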
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " +
                    url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            # Paginated source: substitute the page number into the URL template
            if 'page' in url_dict and '{}' in url_dict['url']:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            ul_list = html.xpath(
                "//div[@class='wlist'][2]//ul[@class='l2']")
            for ul in ul_list:
                ip = ul.xpath("./span[1]/li/text()")[0] if len(
                    ul.xpath("./span[1]/li/text()")) else None
                port = ul.xpath("./span[2]/li/text()")[0] if len(
                    ul.xpath("./span[2]/li/text()")) else None
                schema = ul.xpath("./span[4]/li/text()")[0] if len(
                    ul.xpath("./span[4]/li/text()")) else None
                proxy = IPProxy(schema=strip(schema),
                                ip=strip(ip),
                                port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns a list, so check for an empty result rather than None
            if not ul_list:
                has_more = False
Example #3
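The same crawl loop as Example #2, adapted to a source that lists proxies in table rows under the #main-content container (skipping the header row).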
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " +
                    url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            # Paginated source: substitute the page number into the URL template
            if 'page' in url_dict and '{}' in url_dict['url']:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath(
                "//div[@id='main-content']//table/tr[position()>1]")
            for tr in tr_list:
                ip = tr.xpath("./td[1]/text()")[0] if len(
                    tr.xpath("./td[1]/text()")) else None
                port = tr.xpath("./td[2]/text()")[0] if len(
                    tr.xpath("./td[2]/text()")) else None
                schema = tr.xpath("./td[4]/text()")[0] if len(
                    tr.xpath("./td[4]/text()")) else None
                proxy = IPProxy(schema=strip(schema),
                                ip=strip(ip),
                                port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns a list, so check for an empty result rather than None
            if not tr_list:
                has_more = False
Example #4
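Again the shared crawl loop, this time for a table with id ip_list; only proxies whose schema is HTTP or HTTPS are kept.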
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " +
                    url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            # Paginated source: substitute the page number into the URL template
            if 'page' in url_dict and '{}' in url_dict['url']:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath(
                "//table[@id='ip_list']//tr[@class!='subtitle']")
            for tr in tr_list:
                ip = tr.xpath("./td[2]/text()")[0] if len(
                    tr.xpath("./td[2]/text()")) else None
                port = tr.xpath("./td[3]/text()")[0] if len(
                    tr.xpath("./td[3]/text()")) else None
                schema = tr.xpath("./td[6]/text()")[0] if len(
                    tr.xpath("./td[6]/text()")) else None
                # Guard against a missing schema cell before calling lower()
                if schema and schema.lower() in ("http", "https"):
                    proxy = IPProxy(schema=strip(schema),
                                    ip=strip(ip),
                                    port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
            # xpath() returns a list, so check for an empty result rather than None
            if not tr_list:
                has_more = False
Example #5
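The same pattern for a Bootstrap-style table, selecting cells by their data-title attribute instead of by position.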
def _start_crawl(self):
    for url_dict in self.urls:
        logger.info("Start crawling [ " + self.website + " ] :::> [ " +
                    url_dict['type'] + " ]")
        has_more = True
        url = None
        while has_more:
            # Paginated source: substitute the page number into the URL template
            if 'page' in url_dict and '{}' in url_dict['url']:
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath(
                "//table[@class='table table-bordered table-striped']/tbody/tr"
            )
            for tr in tr_list:
                ip = tr.xpath("./td[@data-title='IP']/text()")[0] if len(
                    tr.xpath("./td[@data-title='IP']/text()")) else None
                port = tr.xpath(
                    "./td[@data-title='PORT']/text()")[0] if len(
                        tr.xpath(
                            "./td[@data-title='PORT']/text()")) else None
                schema = tr.xpath(
                    "./td[@data-title='类型']/text()")[0] if len(
                        tr.xpath(
                            "./td[@data-title='类型']/text()")) else None
                proxy = IPProxy(schema=strip(schema),
                                ip=strip(ip),
                                port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # xpath() returns a list, so check for an empty result rather than None
            if not tr_list:
                has_more = False
Example #6
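A standalone script that checks a single proxy from the FIFO queue against a test URL, then updates its statistics and returns it to the queue.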
import random

import requests

from proxy_util import base_headers

# fifo_queue, logger and USER_AGENT_LIST are assumed to be provided by the
# surrounding project, as in the examples above.

# Test URL
url = 'http://blog.csdn.net/pengjunlee/article/details/90174453'

# Take a proxy from the queue
proxy = fifo_queue.pop(schema='http')
proxies = {proxy.schema: proxy._get_url()}

# Build the request headers
headers = dict(base_headers)
if 'User-Agent' not in headers:
    headers['User-Agent'] = random.choice(USER_AGENT_LIST)

response = None
successed = False
try:
    response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
    print(response.content.decode())
    if response.status_code == 200:
        successed = True
        logger.info("Using proxy < " + proxy._get_url() + " > request < " +
                    url + " > result: success ")
except Exception:
    logger.info("Using proxy < " + proxy._get_url() + " > request < " + url +
                " > result: failure ")

# Update the proxy's statistics based on the response result
proxy._update(successed)
# Return the proxy to the queue without re-checking its availability
fifo_queue.push(proxy, need_check=False)
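
The per-request bookkeeping in Examples #1 and #6 (build headers, request through the proxy, update its statistics, return it to the queue) is the same each time. Below is a minimal sketch of factoring it into a helper; the function name check_proxy is an assumption for illustration, and base_headers, USER_AGENT_LIST and fifo_queue are expected to come from the project exactly as in Example #6.

import random

import requests

from proxy_util import base_headers


def check_proxy(proxy, url, timeout=5):
    # Hypothetical helper (not part of the project): send one GET request
    # through `proxy` and report whether it succeeded.
    # USER_AGENT_LIST and fifo_queue are assumed to be available, as above.
    headers = dict(base_headers)
    if 'User-Agent' not in headers:
        headers['User-Agent'] = random.choice(USER_AGENT_LIST)
    proxies = {proxy.schema: proxy._get_url()}

    successed = False
    try:
        response = requests.get(url, headers=headers,
                                proxies=proxies, timeout=timeout)
        successed = response.status_code == 200
    except Exception:
        successed = False

    # Same bookkeeping as in Example #6: update the proxy's statistics and
    # return it to the queue without re-checking its availability.
    proxy._update(successed)
    fifo_queue.push(proxy, need_check=False)
    return successed


# Usage: pop a proxy, check it against the test URL; it is pushed back automatically.
# ok = check_proxy(fifo_queue.pop(schema='http'),
#                  'http://blog.csdn.net/pengjunlee/article/details/90174453')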