示例#1
0
 def iphai_spider(self):
     """
     Scrape the fourth proxy provider: http://www.iphai.com/free/ng

     Fetches the listing page, reads the IP, port and protocol cells of
     every table row after the first one, and passes each candidate to
     ``check_agent`` for testing. Nothing is done when ``get_response``
     returns a falsy value.

     Rows without an IP or a port are skipped. A missing or blank protocol
     cell falls back to "http".

     :return: None
     """
     url = "http://www.iphai.com/free/ng"
     response_info = get_response(url)
     if not response_info:
         return

     selector_response = Selector(text=response_info)
     # Skip the first row (presumably the table header).
     for each_info in selector_response.xpath("//tr")[1:]:
         each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
         each_port = each_info.xpath("td[2]/text()").extract_first()  # port
         # Proxy protocol: http or https.
         each_type = each_info.xpath("td[4]/text()").extract_first()

         # extract_first() yields None for a missing cell, so coerce to ""
         # before removing the newlines and spaces around the cell text.
         each_ip = (each_ip or "").replace("\n", "").replace(" ", "")
         each_port = (each_port or "").replace("\n", "").replace(" ", "")
         each_type = ((each_type or "").replace("\n", "").replace(" ", "")
                      or "http")

         if not each_ip or not each_port:
             # Without an address there is no proxy to test.
             continue

         scheme = each_type.lower()
         full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#2
0
 def parser_66ip(self, response):
     """
     Parse a fetched page and test every proxy listed in it.

     The protocol is not read from the page; every entry is tested as
     "http". Rows that lack an IP or a port cell are skipped instead of
     producing a bogus "http://None:None" proxy.

     :param response: the fetched content; must support ``.xpath()``
     :return: None
     """
     # Skip the first two rows (presumably header rows; confirm on the page).
     for each_info in response.xpath("//tr")[2:]:
         each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
         each_port = each_info.xpath("td[2]/text()").extract_first()  # port

         # extract_first() yields None when the cell has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         if not each_ip or not each_port:
             continue

         each_type = "http"
         full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#3
0
 def parser_xici_daili(self, response):
     """
     Extract the proxy details from one page and test each proxy.

     Reads the IP, port and protocol from every ``<tr class="odd">`` row
     and passes the result to ``check_agent``. Rows without an IP or a
     port are skipped. A missing or blank protocol cell falls back to
     "http" rather than raising AttributeError on ``None.lower()``.

     :param response: the fetched page; must support ``.xpath()``
     :return: None
     """
     for each_agent in response.xpath('//tr[@class="odd"]'):
         each_ip = each_agent.xpath("td[2]/text()").extract_first()
         each_port = each_agent.xpath("td[3]/text()").extract_first()
         each_type = each_agent.xpath("td[6]/text()").extract_first()

         # extract_first() yields None when the cell has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         each_type = (each_type or "").strip() or "http"
         if not each_ip or not each_port:
             continue

         scheme = each_type.lower()
         full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#4
0
 def parser_89ip(self, response):
     """
     Parse the fetched HTML to obtain each proxy's IP and port, then test it.

     The protocol is not read from the page; every entry is tested as
     "http". Rows that lack an IP or a port cell are skipped, so a missing
     port no longer raises AttributeError on ``None.strip()``.

     :param response: the content of one response; must support ``.xpath()``
     :return: None
     """
     # Skip the first row (presumably the table header).
     for each_info in response.xpath("//tr")[1:]:
         each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
         each_port = each_info.xpath("td[2]/text()").extract_first()  # port

         # extract_first() yields None when the cell has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         if not each_ip or not each_port:
             continue

         each_type = "http"
         full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#5
0
 def parser_kuai_dai_li(self, response):
     """
     Parse one fetched page and test every proxy listed in it.

     Reads the IP, port and protocol cells of every table row after the
     first one and passes the result to ``check_agent``. Rows without an
     IP or a port are skipped. A missing or blank protocol cell falls back
     to "http" rather than raising AttributeError on ``None.lower()``.

     :param response: the content of one page; must support ``.xpath()``
     :return: None
     """
     # Skip the first row (presumably the table header).
     for each_info in response.xpath("//tr")[1:]:
         each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
         each_port = each_info.xpath("td[2]/text()").extract_first()  # port
         # Proxy protocol: http or https.
         each_type = each_info.xpath("td[4]/text()").extract_first()

         # extract_first() yields None when the cell has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         each_type = (each_type or "").strip() or "http"
         if not each_ip or not each_port:
             continue

         scheme = each_type.lower()
         full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#6
0
 def fei_yi_spider(self):
     """
     Scrape the Feiyi proxy site: http://www.feiyiproxy.com/?page_id=1457

     Fetches the listing page, reads the IP, port and protocol cells of
     table rows 1 through 25, and passes each candidate to ``check_agent``.
     Nothing is done when ``get_response`` returns a falsy value.

     Rows without an IP or a port are skipped. A missing or blank protocol
     cell falls back to "http" rather than raising AttributeError on
     ``None.lower()``.

     :return: None
     """
     url = "http://www.feiyiproxy.com/?page_id=1457"
     response_info = get_response(url)
     if not response_info:
         return

     selector_response = Selector(text=response_info)
     # Only rows 1..25 are read; row 0 is presumably the table header.
     for each_info in selector_response.xpath("//tr")[1:26]:
         each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
         each_port = each_info.xpath("td[2]/text()").extract_first()  # port
         # Proxy protocol: http or https.
         each_type = each_info.xpath("td[4]/text()").extract_first()

         # extract_first() yields None when the cell has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         each_type = (each_type or "").strip() or "http"
         if not each_ip or not each_port:
             continue

         scheme = each_type.lower()
         full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it
示例#7
0
 def data5u_spider(self):
     """
     Scrape the fifth proxy provider: http://www.data5u.com/

     Fetches the page, reads the IP, port and protocol from every
     ``<ul class="l2">`` element, and passes each candidate to
     ``check_agent``. Nothing is done when ``get_response`` returns a
     falsy value.

     Entries without an IP or a port are skipped. A missing or blank
     protocol cell falls back to "http" rather than raising AttributeError
     on ``None.lower()``.

     :return: None
     """
     url = "http://www.data5u.com/"
     response_info = get_response(url)
     if not response_info:
         return

     selector_response = Selector(text=response_info)
     for each_info in selector_response.xpath('//ul[@class="l2"]'):
         each_ip = each_info.xpath("span[1]/li/text()").extract_first()  # IP
         each_port = each_info.xpath(
             "span[2]/li/text()").extract_first()  # port
         # Proxy protocol: http or https.
         each_type = each_info.xpath("span[4]/li/text()").extract_first()

         # extract_first() yields None when the element has no text node.
         each_ip = each_ip.strip() if each_ip else ""
         each_port = each_port.strip() if each_port else ""
         each_type = (each_type or "").strip() or "http"
         if not each_ip or not each_port:
             continue

         scheme = each_type.lower()
         full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
         check_agent(full_agent, each_ip, each_port, each_type)  # test it