def iphai_spider(self):
    """
    Scrape the free proxy list at http://www.iphai.com/free/ng and test it.

    Every table row after the first is read for IP (column 1), port
    (column 2) and protocol (column 4). Newlines and spaces are removed from
    each cell. A missing or blank protocol falls back to "http"; rows with
    no IP or no port are skipped. Each remaining proxy is passed to
    ``check_agent``.

    Nothing is done when ``get_response`` returns a falsy value.

    :return: None
    """
    url = "http://www.iphai.com/free/ng"
    response_info = get_response(url)
    if not response_info:
        return

    selector_response = Selector(text=response_info)
    all_info = selector_response.xpath("//tr")
    # The first row is skipped (presumably the table header).
    for each_info in all_info[1:]:
        each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
        each_ip = each_ip.replace("\n", "").replace(" ", "") if each_ip else ""

        each_port = each_info.xpath("td[2]/text()").extract_first()  # port
        each_port = (each_port.replace("\n", "").replace(" ", "")
                     if each_port else "")

        # A proxy without host or port cannot be tested; do not build "http://:".
        if not each_ip or not each_port:
            continue

        # Protocol of the proxy: http or https.
        each_type = each_info.xpath("td[4]/text()").extract_first()
        # extract_first() may return None, so guard before calling str methods.
        each_type = (each_type.replace("\n", "").replace(" ", "")
                     if each_type else "")
        each_type = each_type or "http"

        scheme = each_type.lower()
        full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def parser_66ip(self, response):
    """
    Parse a fetched page and test every proxy listed in its table rows.

    Rows from the third one onward are read for IP (column 1) and port
    (column 2). The protocol is always "http". Rows with no IP or no port
    are skipped so that no "http://None:None" proxy is handed to
    ``check_agent``.

    :param response: fetched page; must expose ``xpath()``
    :return: None
    """
    all_info = response.xpath("//tr")
    # The first two rows are skipped (presumably header rows).
    for each_info in all_info[2:]:
        each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
        each_port = each_info.xpath("td[2]/text()").extract_first()  # port
        # extract_first() yields None when the cell has no text node.
        if not each_ip or not each_port:
            continue
        each_type = "http"
        full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def parser_xici_daili(self, response):
    """
    Parse one fetched page and test every proxy found in it.

    Rows are selected with ``//tr[@class="odd"]``. IP, port and protocol are
    read from columns 2, 3 and 6. Rows with no IP or no port are skipped, and
    a missing protocol falls back to "http" instead of raising
    ``AttributeError`` on ``None.lower()``.

    :param response: fetched page; must expose ``xpath()``
    :return: None
    """
    all_agent = response.xpath('//tr[@class="odd"]')
    for each_agent in all_agent:
        each_ip = each_agent.xpath("td[2]/text()").extract_first()
        each_port = each_agent.xpath("td[3]/text()").extract_first()
        # extract_first() yields None when the cell has no text node.
        if not each_ip or not each_port:
            continue
        each_type = each_agent.xpath("td[6]/text()").extract_first() or "http"
        scheme = each_type.lower()
        full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
        # each_type is forwarded in its original letter case.
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def parser_89ip(self, response):
    """
    Parse the fetched HTML to obtain IP and port, then test each proxy.

    Every table row after the first is read for IP (column 1) and port
    (column 2); surrounding whitespace is stripped from both. The protocol
    is always "http". Rows with no IP or no port are skipped; previously a
    missing port cell raised ``AttributeError`` on ``None.strip()``.

    :param response: content of one response; must expose ``xpath()``
    :return: None
    """
    all_info = response.xpath("//tr")
    # The first row is skipped (presumably the table header).
    for each_info in all_info[1:]:
        each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
        each_ip = each_ip.strip() if each_ip else ""
        each_port = each_info.xpath("td[2]/text()").extract_first()  # port
        # Guard the port the same way as the IP.
        each_port = each_port.strip() if each_port else ""
        if not each_ip or not each_port:
            continue
        each_type = "http"
        full_agent = {"http": "http://%s:%s" % (each_ip, each_port)}
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def parser_kuai_dai_li(self, response):
    """
    Parse one fetched page and test every proxy listed in its table rows.

    Every table row after the first is read for IP (column 1), port
    (column 2) and protocol (column 4). Rows with no IP or no port are
    skipped, and a missing protocol falls back to "http" instead of raising
    ``AttributeError`` on ``None.lower()``.

    :param response: content of one page; must expose ``xpath()``
    :return: None
    """
    all_info = response.xpath("//tr")
    # The first row is skipped (presumably the table header).
    for each_info in all_info[1:]:
        each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
        each_port = each_info.xpath("td[2]/text()").extract_first()  # port
        # extract_first() yields None when the cell has no text node.
        if not each_ip or not each_port:
            continue
        # Protocol of the proxy: http or https.
        each_type = each_info.xpath("td[4]/text()").extract_first() or "http"
        scheme = each_type.lower()
        full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
        # each_type is forwarded in its original letter case.
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def fei_yi_spider(self):
    """
    Scrape the proxy list at http://www.feiyiproxy.com/?page_id=1457.

    Table rows 1 through 25 (the first row is skipped) are read for IP
    (column 1), port (column 2) and protocol (column 4). Rows with no IP or
    no port are skipped, and a missing protocol falls back to "http" instead
    of raising ``AttributeError`` on ``None.lower()``. Each remaining proxy
    is passed to ``check_agent``.

    Nothing is done when ``get_response`` returns a falsy value.

    :return: None
    """
    url = "http://www.feiyiproxy.com/?page_id=1457"
    response_info = get_response(url)
    if not response_info:
        return

    selector_response = Selector(text=response_info)
    all_info = selector_response.xpath("//tr")
    # Only rows 1..25 are used; NOTE(review): presumably later rows are not
    # proxy entries - confirm against the page layout.
    for each_info in all_info[1:26]:
        each_ip = each_info.xpath("td[1]/text()").extract_first()  # IP
        each_port = each_info.xpath("td[2]/text()").extract_first()  # port
        # extract_first() yields None when the cell has no text node.
        if not each_ip or not each_port:
            continue
        # Protocol of the proxy: http or https.
        each_type = each_info.xpath("td[4]/text()").extract_first() or "http"
        scheme = each_type.lower()
        full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
        # each_type is forwarded in its original letter case.
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy
def data5u_spider(self):
    """
    Scrape the proxy list on http://www.data5u.com/ and test each entry.

    Entries are selected with ``//ul[@class="l2"]``. IP, port and protocol
    are read from the ``li`` text inside the 1st, 2nd and 4th ``span``.
    Entries with no IP or no port are skipped, and a missing protocol falls
    back to "http" instead of raising ``AttributeError`` on ``None.lower()``.
    Each remaining proxy is passed to ``check_agent``.

    Nothing is done when ``get_response`` returns a falsy value.

    :return: None
    """
    url = "http://www.data5u.com/"
    response_info = get_response(url)
    if not response_info:
        return

    selector_response = Selector(text=response_info)
    all_info = selector_response.xpath('//ul[@class="l2"]')
    for each_info in all_info:
        each_ip = each_info.xpath("span[1]/li/text()").extract_first()  # IP
        each_port = each_info.xpath("span[2]/li/text()").extract_first()  # port
        # extract_first() yields None when the element has no text node.
        if not each_ip or not each_port:
            continue
        # Protocol of the proxy: http or https.
        each_type = (each_info.xpath("span[4]/li/text()").extract_first()
                     or "http")
        scheme = each_type.lower()
        full_agent = {scheme: "%s://%s:%s" % (scheme, each_ip, each_port)}
        # each_type is forwarded in its original letter case.
        check_agent(full_agent, each_ip, each_port, each_type)  # test the proxy