def parse(self, response):
    """Scrape the kuaidaili free-proxy table and paginate through the list."""
    table = response.xpath('//table/tbody')
    last_page = response.xpath(
        '//div[@id="listnav"]/ul/li[last()-1]/a/text()').extract()[0]

    data = ProxyItem()
    data['name'] = '快代理'
    # Each field holds a parallel list: one entry per table row.
    for field, title in (('ip', 'IP'), ('port', 'PORT'), ('protocol', '类型'),
                         ('anonymity', '匿名度'), ('area', '位置')):
        data[field] = table.xpath(
            './tr/td[@data-title="%s"]/text()' % title).extract()
    yield data

    # Queue every remaining listing page back through this same callback.
    for page in range(2, int(last_page) + 1):
        yield Request('https://www.kuaidaili.com/free/inha/{}'.format(page),
                      callback=self.parse)
def parse(self, response):
    """Extract ip:port pairs embedded in the 66ip.cn page body.

    Fixes vs. the previous version:
    * ``pattern.findall(text, re.S)`` passed ``re.S`` (== 16) as the *pos*
      argument of a compiled pattern, silently skipping the first 16
      characters; the flag is unnecessary because the pattern already uses
      ``[\\s\\S]``.
    * A fresh ProxyItem is created per proxy — the old code yielded one
      shared mutable item repeatedly.
    * Malformed fragments without a ':' are skipped instead of raising
      IndexError.
    """
    print(response.url)
    regex = re.compile(r'(?<=<script src=")[\s\S]*?(?=<script type=")')
    list_text = regex.findall(response.body)
    if not list_text:
        return
    cleaned = (list_text[0].replace("\r", "")
               .replace("\n", "").replace("\t", ""))
    for ip_port in cleaned.split("</script>")[1].split("<br />"):
        if not ip_port:
            continue
        parts = ip_port.split(":")
        if len(parts) < 2:  # skip fragments that are not ip:port
            continue
        i = ProxyItem()
        i['source'] = 'www.66ip.cn'
        i['ip'] = parts[0].strip(" ")
        i['port'] = parts[1].strip(" ")
        i['type'] = D_TYPE['NA']  # anonymity unknown -> default
        i['support'] = 0          # protocol support unknown -> default
        yield i
def parse(self, response):
    """Yield every ip:port found in the plain-text response as an HTTP proxy."""
    pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}'
    for match in re.findall(pattern, response.text):
        host, port_num = match.split(':')
        yield ProxyItem(ip=host, port=port_num, scheme='http', anno='高匿')
def parse(self, response):
    """Decode the obfuscated proxy table: ports are letter-encoded inside
    a JavaScript document.write(":"...) call, one letter per digit.
    """
    hxs = HtmlXPathSelector(response)
    addresses = hxs.select('//tr[position()>1]/td[position()=1]').re(
        '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    protocols = hxs.select('//tr[position()>1]/td[position()=2]').re(
        '<td>(.*)<\/td>')
    locations = hxs.select('//tr[position()>1]/td[position()=4]').re(
        '<td>(.*)<\/td>')

    # Letter -> digit substitution table; '+' is simply dropped.
    port_map = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5',
                'i': '7', 'w': '6', 'r': '8', 'c': '1', '+': ''}
    raw_ports = re.compile('write\(":"(.*)\)').findall(response.body)
    ports = []
    for encoded in raw_ports:
        decoded = encoded
        for letter in port_map:
            decoded = decoded.replace(letter, port_map[letter])
        ports.append(decoded)

    items = []
    for idx in range(len(addresses)):
        try:
            item = ProxyItem()
            item['address'] = addresses[idx]
            item['protocol'] = protocols[idx]
            item['location'] = locations[idx]
            item['port'] = ports[idx]
            items.append(item)
        except IndexError:
            # Ragged columns: skip rows once any list runs short.
            continue
    return items
def parse(self, response):
    """Parse www.ip181.com rows: ip, port, anonymity type, protocol support.

    Bug fixes: the header-row ``continue`` previously ran before
    ``index += 1`` and looped forever; and one shared ProxyItem was
    mutated after being yielded — now each row gets its own item.
    """
    index = 1
    while True:
        row = response.xpath("//tr[%d]/td/text()" % (index)).extract()
        if not row:
            return
        index += 1  # advance BEFORE any continue, else the loop never ends
        if row[0].find('ip') != -1:
            continue  # header row
        i = ProxyItem()  # fresh item per row
        i['source'] = 'www.ip181.com'
        i['ip'] = row[0]
        i['port'] = row[1]
        # anonymity type
        if row[2] == u'透明':
            i['type'] = D_TYPE['TP']
        elif row[2] == u'高匿':
            i['type'] = D_TYPE['HA']
        else:
            i['type'] = D_TYPE['NA']
        # protocol support bitmask
        support = row[3]
        if support and support.find(',') == -1:
            # duplicate a lone value so reduce() still maps it via D_SUPPORT
            support += ',' + support
        support = reduce(
            lambda x, y: D_SUPPORT.get(x, 0) | D_SUPPORT.get(y, 0),
            [s.strip() for s in support.split(',')]) if support else 0
        i['support'] = support
        yield i
def parse(self, response):
    """Pull ip/port pairs out of the highlighted table rows, then queue
    every listing page of each start URL back through this callback.

    Bug fix: the old code printed ``item['ip']`` while handling the port
    cell and raised KeyError whenever the port cell came before the ip
    cell; the ip check now runs first and the print is guarded.
    """
    for tr in response.xpath('//tr[@class="odd" or @class=""]'):
        item = ProxyItem()
        for td in tr.xpath('./td/text()').extract():
            content = td.strip()
            if not content:
                continue
            if content.find('.') != -1:
                item['ip'] = content
            elif content.isdigit():
                item['port'] = content
                print('ip:', item.get('ip'))
                print('port:', item['port'])
                break  # nothing useful after the port cell
        yield item

    for base in self.start_urls:
        for page in range(self.Page_End):
            yield Request(base + str(page + 1), headers=self.headers,
                          callback=self.parse)
def parse(self, response):
    """Parse www.xicidaili.com rows into ProxyItems.

    Bug fix: the old code created ONE ProxyItem and kept mutating it
    after each yield, so consumers saw repeated/overwritten data; each
    row now gets its own item.
    """
    index = 1
    while True:
        row = response.xpath(
            "//table[@id]/tr[@class][%d]/td/text()" % (index)).extract()
        if not row:
            return
        index += 1
        i = ProxyItem()  # fresh item per row
        i['source'] = 'www.xicidaili.com'
        i['ip'] = row[0]
        i['port'] = row[1]
        # NOTE(review): 透明 is read from column 4 but 高匿 from column 2 —
        # this looks inconsistent (the ip181 spider uses one column for
        # both); confirm against the live page before "fixing".
        if row[4] == u'透明':
            i['type'] = D_TYPE['TP']
        elif row[2] == u'高匿':
            i['type'] = D_TYPE['HA']
        else:
            i['type'] = D_TYPE['NA']
        # protocol support bitmask
        support = row[5]
        if support and support.find(',') == -1:
            # duplicate a lone value so reduce() still maps it via D_SUPPORT
            support += ',' + support
        support = reduce(
            lambda x, y: D_SUPPORT.get(x, 0) | D_SUPPORT.get(y, 0),
            [s.strip() for s in support.split(',')]) if support else 0
        i['support'] = support
        yield i
def parse_item(self, response):
    """Flatten all tag text into one string and harvest IPs via REG_IP."""
    soup = BeautifulSoup(response.body)
    body_str = ' '.join(tag.string or '' for tag in soup.find_all(True))
    # group[0] is the address, group[7] the port (per REG_IP's groups).
    return [ProxyItem(ip=group[0], port=group[7], protocol='HTTP')
            for group in re.findall(REG_IP, body_str)]
def parse(self, response):
    """Yield one addr ("ip:port") item per highlighted table row.

    Bug fix: a single ProxyItem was created once and mutated after each
    yield; every row now gets its own item.
    """
    for row in response.xpath('//tr[@class="odd"] | //tr[@class=""]'):
        ip, port = row.xpath('.//td/text()').extract()[:2]
        item = ProxyItem()
        item['addr'] = ip + ':' + port
        yield item
def parse(self, response):
    """Yield ip:port items from the bootstrap-styled proxy table.

    Bug fix: per-row ProxyItem instead of one shared mutated item; the
    row's cells are also extracted once instead of twice.
    """
    rows = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        cells = row.xpath('td/text()').extract()
        item = ProxyItem()
        item['addr'] = cells[0] + ":" + cells[1]
        yield item
def parse(self, response):
    """Collect ip/port/area/type columns from every classed table row.

    Bug fix: a new ProxyItem per row — the old single shared item was
    still being mutated after it had been yielded.
    """
    for row in response.xpath('//tr[@class]'):
        item = ProxyItem()
        item['ip'] = row.xpath('./td[2]/text()').extract()
        item['port'] = row.xpath('./td[3]/text()').extract()
        item['area'] = row.xpath('./td[4]/a/text()').extract()
        item['ip_type'] = row.xpath('./td[6]/text()').extract()
        yield item
def parse(self, response):
    """Yield ip:port address items from the #list table.

    Bug fix: per-row ProxyItem (the old shared item was mutated after
    every yield); cells extracted once per row instead of twice.
    """
    for row in response.xpath('//*[@id="list"]/table/tbody/tr'):
        cells = row.xpath('./td//text()').extract()
        item = ProxyItem()
        item['address'] = cells[0] + ':' + cells[1]
        yield item
def parse(self, response):
    """Return addr items for all plain and 'odd'-classed table rows."""
    results = []
    for row in response.xpath('//tr[@class=""]|//tr[@class="odd"]'):
        host = row.xpath('.//td[2]/text()').extract()[0]
        port = row.xpath('.//td[3]/text()').extract()[0]
        entry = ProxyItem()
        entry['addr'] = host + ':' + port
        results.append(entry)
    return results
def parse(self, response):
    """Return one addr item per <ul class="l2"> entry.

    Bug fix: the previous version appended the SAME ProxyItem instance on
    every iteration, so the returned list held N references all carrying
    the last row's data; each entry now gets its own item.
    """
    items = []
    for li in response.xpath('//ul[@class="l2"]'):
        ip, port = li.xpath('./span/li/text()').extract()[:2]
        item = ProxyItem()
        item['addr'] = ip + ':' + port
        items.append(item)
    return items
def parse(self, response):
    """Build addr items from the JSON response's 'proxies' array."""
    results = []
    for record in json.loads(response.text)['proxies']:
        proxy = ProxyItem()
        proxy['addr'] = record['http']
        results.append(proxy)
    return results
def parse(self, response):
    """Record DOCUMENT_ROOT and REMOTE_ADDR from a JSON echo response."""
    print(response.body)
    payload = json.loads(response.body)
    item = ProxyItem()
    item["name"] = payload["DOCUMENT_ROOT"]
    item["ip"] = payload["REMOTE_ADDR"]
    yield item
def parse(self, response):
    """Parse a plain-text proxy list, skipping 5 header and 2 trailer lines."""
    collected = []
    for line in response.text.split('\n')[5:-2]:
        entry = ProxyItem()
        entry['addr'] = line.split()[0]  # first whitespace-separated token
        collected.append(entry)
    return collected
def parse(self, response):
    """Yield ip:port items from the bootstrap-styled table.

    Bug fix: per-row ProxyItem — the old shared item kept being mutated
    after it was yielded; cells are extracted once per row.
    """
    rows = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        cells = row.xpath('td/text()').extract()
        item = ProxyItem()
        item['addr'] = cells[0] + ':' + cells[1]
        yield item
def parse(self, response):
    """Yield ip:port items, logging each pair as it is found.

    Bug fix: a fresh ProxyItem per row instead of one shared item that
    was mutated after every yield.
    """
    for row in response.xpath('//tr[@class="odd" or @class=""]'):
        cells = row.xpath('td/text()').extract()
        ip = cells[0]
        port = cells[1]
        print('ip address is belowing')
        print(ip)
        print('port number is belowing')
        print(port)
        item = ProxyItem()
        item['addr'] = ip + ':' + port
        yield item
def parse(self, response):
    """Yield ip:port items from the bootstrap proxy table.

    Bug fix: one ProxyItem per row; the old shared item was mutated
    after being yielded, corrupting downstream data.
    """
    rows = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        cells = row.xpath('td/text()').extract()
        item = ProxyItem()
        # join ip and port into a single "ip:port" address
        item['addr'] = cells[0] + ':' + cells[1]
        yield item
def parse_item(self, response):
    """Yield ip/port items from the #index_free_list table.

    Bug fixes: iterate over however many rows were actually scraped
    instead of a hard-coded ``range(10)`` (which raised IndexError on
    short pages and dropped rows on long ones), and create a fresh
    ProxyItem per row instead of mutating one shared item after yield.
    """
    sel = Selector(response)
    ips = sel.xpath(
        '//*[@id="index_free_list"]/table/tbody/tr/td[1]/text()').extract()
    ports = sel.xpath(
        '//*[@id="index_free_list"]/table/tbody/tr/td[2]/text()').extract()
    for ip, port in zip(ips, ports):
        item = ProxyItem()
        item['ip'] = ip
        item['port'] = port
        yield item
def parse(self, response):
    """Yield ip:port items from the odd rows of the #ip_list table.

    Bug fix: per-row ProxyItem — never reuse an item that has already
    been yielded.
    """
    rows = response.xpath('//table[@id="ip_list"]/tbody/tr[@class="odd"]')
    for row in rows:
        cells = row.xpath('td/text()').extract()
        item = ProxyItem()
        item['addr'] = cells[0] + ':' + cells[1]
        yield item
def parse(self, response):
    """Scrape the xici #ip_list table into one item of parallel column lists."""
    table = response.xpath('//table[@id="ip_list"]')
    data = ProxyItem()
    data['name'] = '西刺代理'
    # column index -> item field; each value is the whole column as a list
    for field, col in (('ip', 2), ('port', 3), ('protocol', 6),
                       ('anonymity', 5), ('area', 4)):
        data[field] = table.xpath('./tr/td[%d]/text()' % col).extract()
    yield data
def parse(self, response):
    """Yield anonymous http/https proxies from the .wlist listing."""
    for idx, row in enumerate(response.css('.wlist .l2')):
        if idx == 0:
            continue  # first row is the header
        cells = row.css('li')
        record = {
            'ip': cells[0].css('::text').extract_first().strip(),
            'port': cells[1].css('::text').extract_first().strip(),
            'scheme': cells[3].css('a::text').extract_first(),
            'anno': cells[2].css('a::text').extract_first(),
        }
        is_anonymous = '匿' in record['anno']
        if is_anonymous and record['scheme'] in ('http', 'https'):
            yield ProxyItem(**record)
def parse(self, response):
    """Return one addr item per row of the bootstrap proxy table."""
    collected = []
    rows = response.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        cells = row.xpath('td/text()').extract()
        proxy = ProxyItem()
        proxy['addr'] = cells[0] + ':' + cells[1]
        collected.append(proxy)
    return collected
def proxy_parse(self, response):
    """Yield http proxies from the #ip_list table, skipping the header.

    Bug fix: a fresh ProxyItem per row; the old single item was mutated
    after each yield, so every consumer saw the same object.
    """
    sel = Selector(response=response)
    rows = sel.xpath('//table[@id="ip_list"]/tr')
    if len(rows) > 0:
        rows.pop(0)  # first tr is the column-header row
    for row in rows:
        item = ProxyItem()
        item["ip"] = row.xpath("td[2]/text()").extract()[0].strip()
        item["port"] = row.xpath("td[3]/text()").extract()[0].strip()
        item["p_type"] = row.xpath("td[6]/text()").extract()[0].strip()
        if "http" == item["p_type"].lower():
            yield item
def parse(self, response):
    """Yield non-transparent http/https proxies from the generic table."""
    for idx, row in enumerate(response.css('table tbody tr')):
        if idx == 0:
            continue  # skip the header row
        cells = row.css('td')
        record = {
            'ip': cells[0].css('::text').extract_first().strip(),
            'port': cells[1].css('::text').extract_first().strip(),
            'scheme': cells[3].css('::text').extract_first().strip().lower(),
            'anno': cells[2].css('::text').extract_first().strip(),
        }
        if record['scheme'] in ('http', 'https') and record['anno'] != '透明':
            yield ProxyItem(**record)
def parse(self, response):
    """Pair up the <td> cells that follow each .country cell.

    Cells alternate: even index = address (whose second sibling holds the
    port), odd index = anonymity/type of the preceding address.

    Improvements: the CSS selects are hoisted out of the loop (the old
    code re-ran each query on every iteration), and the ``[i + 1]``
    type lookahead is guarded so an odd-length list no longer raises
    IndexError on the last pair.
    """
    soup = BeautifulSoup(response.body)
    addr_cells = soup.select('.country + td')        # address / type cells
    port_cells = soup.select('.country + td + td')   # matching port cells
    items = []
    for i in range(len(addr_cells)):
        if i % 2 == 0:
            item = ProxyItem()
            item['adrs'] = addr_cells[i].get_text()
            item['port'] = port_cells[i].get_text()
            if i + 1 < len(addr_cells):  # guard the type-cell lookahead
                item['_type'] = addr_cells[i + 1].get_text()
            items.append(item)
    return items
def proxy_parse(self, response):
    """Yield http proxies from the bootstrap table, logging each field.

    Bug fix: per-row ProxyItem — the old shared item was mutated after
    every yield. The Python 2 print statements are also modernized.
    """
    sel = Selector(response=response)
    rows = sel.xpath(
        '//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        item = ProxyItem()
        item["ip"] = row.xpath("td[1]/text()").extract()[0].strip()
        print(item["ip"])
        item["port"] = row.xpath("td[2]/text()").extract()[0].strip()
        print(item["port"])
        item["p_type"] = row.xpath("td[4]/text()").extract()[0].strip()
        print(item["p_type"])
        if "http" == item["p_type"].lower():
            yield item
def parse(self, response):
    """Serialize each proxy-table row to JSON in ProxyItem['proxy_info'].

    Bug fix: the protocol cell was lowercased and THEN compared against
    'HTTP', which could never match, so every proxy was labelled
    'https'; the comparison now uses the lowercased value.
    """
    rows = response.xpath(
        '/html/body/div[1]/div[4]/div[2]/div/div[2]/table/tbody//tr')
    for tr in rows:
        p_type = tr.xpath('td[4]/text()').extract_first().lower()
        p_type = 'http' if p_type == 'http' else 'https'
        data = {
            'ip': tr.xpath('td[1]/text()').extract_first(),
            'port': tr.xpath('td[2]/text()').extract_first(),
            'anonymity': tr.xpath('td[3]/text()').extract_first(),
            'p_type': p_type,
            'p_address': tr.xpath('td[5]/text()').extract_first(),
        }
        item = ProxyItem()
        item['proxy_info'] = json.dumps(data)
        yield item