Example #1
File: kuaidaili.py  Project: majiajue/proxy
    def parse(self, response):
        # pass
        res_main = response.xpath('//table/tbody')
        res_page = response.xpath(
            '//div[@id="listnav"]/ul/li[last()-1]/a/text()').extract()[0]
        # print(res_page)
        data = ProxyItem()
        data['name'] = '快代理'
        data['ip'] = res_main.xpath(
            './tr/td[@data-title="IP"]/text()').extract()
        data['port'] = res_main.xpath(
            './tr/td[@data-title="PORT"]/text()').extract()
        data['protocol'] = res_main.xpath(
            './tr/td[@data-title="类型"]/text()').extract()
        data['anonymity'] = res_main.xpath(
            './tr/td[@data-title="匿名度"]/text()').extract()
        data['area'] = res_main.xpath(
            './tr/td[@data-title="位置"]/text()').extract()
        yield data
        # print(data['name'],data['ip'],data['port'],data['protocol'],data['anonymity'],data['area'],res_page)

        for i in range(2, int(res_page) + 1):
            url = 'https://www.kuaidaili.com/free/inha/{}'.format(i)
            # print(url)
            yield Request(url, callback=self.parse)
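
The snippets on this page all assume a Scrapy ProxyItem defined elsewhere in each project. A minimal sketch, assuming the field names used in Example #1 (the actual definitions differ from project to project):

# Hypothetical ProxyItem sketch -- field names taken from Example #1,
# not the real majiajue/proxy item definition.
import scrapy

class ProxyItem(scrapy.Item):
    name = scrapy.Field()
    ip = scrapy.Field()
    port = scrapy.Field()
    protocol = scrapy.Field()
    anonymity = scrapy.Field()
    area = scrapy.Field()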
Example #2
    def parse(self, response):
        print(response.url)
        i = ProxyItem()
        # Grab the markup between the first <script src="..."> tag and the next
        # <script type="..."> tag; the proxy list is embedded in that block.
        regex = re.compile(r'(?<=<script src=")[\s\S]*?(?=<script type=")')
        list_text = regex.findall(response.text)
        if not list_text:
            return
        list_ip_port = (list_text[0].replace("\r", "").replace("\n", "")
                        .replace("\t", "").split("</script>")[1].split("<br />"))

        for ip_port in list_ip_port:
            if ip_port:
                i['source'] = 'www.66ip.cn'
                ip_port = ip_port.split(":")
                i['ip'] = ip_port[0].strip(" ")
                i['port'] = ip_port[1].strip(" ")

                ## type ####################
                i['type'] = D_TYPE['NA']  # default: anonymity unknown
                ###############################

                ## support ####################
                i['support'] = 0  # default: supported methods unknown
                ###############################

                yield i
            else:
                return
Example #3
File: proxy.py  Project: wingyiu/papa
 def parse(self, response):
     p = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}'
     proxies = re.findall(p, response.text)
     for proxy in proxies:
         h, p = proxy.split(':')
         r = {'ip': h, 'port': p, 'scheme': 'http', 'anno': '高匿'}
         yield ProxyItem(**r)
Example #4
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     addresses = hxs.select('//tr[position()>1]/td[position()=1]').re(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
     protocols = hxs.select('//tr[position()>1]/td[position()=2]').re(r'<td>(.*)</td>')
     locations = hxs.select('//tr[position()>1]/td[position()=4]').re(r'<td>(.*)</td>')
     # Ports are hidden inside an inline document.write(":" ...) call; the
     # letters are decoded into digits via port_map below.
     ports_re = re.compile(r'write\(":"(.*)\)')
     raw_ports = ports_re.findall(response.body)
     port_map = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5',
                 'i': '7', 'w': '6', 'r': '8', 'c': '1', '+': ''}
     ports = []
     for port in raw_ports:
         tmp = port
         for key in port_map:
             tmp = tmp.replace(key, port_map[key])
         ports.append(tmp)
     items = []
     for i in range(len(addresses)):
         try:
             item = ProxyItem()
             item['address']  = addresses[i]
             item['protocol'] = protocols[i]
             item['location'] = locations[i]
             item['port']     = ports[i]
             items.append(item)
         except IndexError:
             continue
     return items
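
The ports in Example #4 are obfuscated as letters inside an inline document.write(":" ...) call. A small illustration of how port_map reverses that mapping; the input string "+rq" here is invented for the example:

port_map = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5',
            'i': '7', 'w': '6', 'r': '8', 'c': '1', '+': ''}
# '+' is dropped, 'r' -> '8', 'q' -> '0', so "+rq" decodes to port 80
print(''.join(port_map.get(ch, ch) for ch in '+rq'))  # 80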
Example #5
    def parse(self, response):
        i = ProxyItem()
        index = 1
        while True:
            item = response.xpath("//tr[%d]/td/text()" % (index)).extract()
            if item:
                if item[0].find('ip') != -1:
                    # header row: advance the index before skipping, otherwise
                    # the while loop would spin forever on the same row
                    index += 1
                    continue
                i['source'] = 'www.ip181.com'
                i['ip'] = item[0]
                i['port'] = item[1]

                ## type ####################
                if item[2] == u'透明':
                    i['type'] = D_TYPE['TP']
                elif item[2] == u'高匿':
                    i['type'] = D_TYPE['HA']
                else:
                    i['type'] = D_TYPE['NA']
                ###############################

                ## support ####################
                support = item[3]
                if support and support.find(',') == -1:
                    support += ',' + support
                support = reduce(
                    lambda x, y: D_SUPPORT.get(x, 0) | D_SUPPORT.get(y, 0),
                    [s.strip() for s in support.split(',')]) if support else 0
                i['support'] = support
                ###############################

                index += 1
                yield i
            else:
                return
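
D_TYPE and D_SUPPORT in Examples #2, #5 and #7 are constants from the surrounding projects and are not shown on this page. A hypothetical sketch of how they could be laid out so that the reduce above reads as a bitwise OR over supported methods:

# Hypothetical constants, not the originals: anonymity codes plus a bitmask
# of supported request methods.
from functools import reduce  # needed on Python 3

D_TYPE = {'NA': 0, 'TP': 1, 'HA': 2}   # unknown / transparent / high anonymity
D_SUPPORT = {'GET': 1, 'POST': 2, 'HTTPS': 4}

# "GET, POST" -> 1 | 2 == 3; a lone value such as "GET" is first doubled to
# "GET,GET" by the spider so the two-argument lambda still receives a pair.
support = reduce(lambda x, y: D_SUPPORT.get(x, 0) | D_SUPPORT.get(y, 0),
                 [s.strip() for s in 'GET, POST'.split(',')])
print(support)  # 3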
Example #6
    def parse(self, response):
        '''
        Parse the proxy IPs and ports out of the response.
        :param response:
        :return:
        '''

        trs = response.xpath('//tr[@class="odd" or @class=""]')
        for tr in trs:
            item = ProxyItem()
            tds = tr.xpath('./td/text()').extract()
            for td in tds:
                content = td.strip()
                if len(content) > 0:
                    if content.isdigit():
                        item['port'] = content
                        print('ip:', item['ip'])
                        print('port:', item['port'])
                        break
                    if content.find('.') != -1:
                        item['ip'] = content
            yield item
        for i in self.start_urls:
            for j in range(self.Page_End):
                new_url = i + str(j + 1)
                yield Request(new_url,
                              headers=self.headers,
                              callback=self.parse)
Example #7
    def parse(self, response):
        i = ProxyItem()
        index = 1
        while True:
            #item = response.xpath("//div[@id='list']/table/tbody/tr[%d]/td/text()" %(index)).extract()
            item = response.xpath("//table[@id]/tr[@class][%d]/td/text()" %(index)).extract()
            if item:
                i['source'] = 'www.xicidaili.com'
                i['ip'] = item[0]
                i['port'] = item[1]

                ## type ####################
                if item[4] == u'透明':
                    i['type'] = D_TYPE['TP']
                elif item[2] == u'高匿':
                    i['type'] = D_TYPE['HA']
                else:
                    i['type'] = D_TYPE['NA']
                ###############################

                ## support ####################
                support = item[5]
                if support and support.find(',') == -1:
                    support += ',' + support
                support = reduce(lambda x, y: D_SUPPORT.get(x,0) | D_SUPPORT.get(y,0), [s.strip() for s in support.split(',')]) if support else 0
                i['support'] = support
                ###############################

                index += 1
                yield i
            else:
               return 
Example #8
 def parse_item(self, response):
     soup = BeautifulSoup(response.body)
     str_list = [tag.string or '' for tag in soup.find_all(True)]
     body_str = ' '.join(str_list)
     items = [
         ProxyItem(ip=group[0], port=group[7], protocol='HTTP')
         for group in re.findall(REG_IP, body_str)
     ]
     return items
Example #9
    def parse(self, response):
        item = ProxyItem()

        main = response.xpath('//tr[@class="odd"] | //tr[@class=""]')

        for li in main:
            ip, port = li.xpath('.//td/text()').extract()[:2]
            item['addr'] = ip + ':' + port
            yield item
Example #10
	def parse(self, response):
		item = ProxyItem()
		match_list=response.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
		for match in match_list:
			ipaddr = match.xpath('td/text()').extract()[0]
			port = match.xpath('td/text()').extract()[1]
			item['addr'] = ipaddr + ":" + port

			yield item
Example #11
 def parse(self, response):
     crawl_items = response.xpath('//tr[@class]')
     # print(items)
     item = ProxyItem()
     for crawl_item in crawl_items:
         item['ip'] = crawl_item.xpath('./td[2]/text()').extract()
         item['port'] = crawl_item.xpath('./td[3]/text()').extract()
         item['area'] = crawl_item.xpath('./td[4]/a/text()').extract()
         item['ip_type'] = crawl_item.xpath('./td[6]/text()').extract()
         yield item
Example #12
    def parse(self, response):
        item = ProxyItem()
        mian = response.xpath('//*[@id="list"]/table/tbody/tr')

        for li in mian:
            ip = li.xpath('./td//text()').extract()[0]
            port = li.xpath('./td//text()').extract()[1]
            item['address'] = ip + ':' + port

            yield item
Example #13
 def parse(self, response):
     subSelector = response.xpath('//tr[@class=""]|//tr[@class="odd"]')
     items = []
     for sub in subSelector:
         item = ProxyItem()
         ip = sub.xpath('.//td[2]/text()').extract()[0]
         port = sub.xpath('.//td[3]/text()').extract()[0]
         item['addr'] = ip + ':' + port
         items.append(item)
     return items
Example #14
    def parse(self, response):
        items = []

        main = response.xpath('//ul[@class="l2"]')
        for li in main:
            # create a fresh item per row; reusing a single instance would
            # leave every entry in items pointing at the last proxy parsed
            item = ProxyItem()
            ip, port = li.xpath('./span/li/text()').extract()[:2]
            item['addr'] = ip + ':' + port
            items.append(item)
        return items
Example #15
    def parse(self, response):
        jsonresponse = json.loads(response.text)['proxies']
        items = []

        for i in jsonresponse:
            item = ProxyItem()
            item['addr'] = i['http']
            items.append(item)

        return items
Example #16
    def parse(self, response):
        print(response.body)

        data = json.loads(response.body)

        item = ProxyItem()

        item["name"] = data["DOCUMENT_ROOT"]
        item["ip"] = data["REMOTE_ADDR"]
        yield item
Example #17
    def parse(self, response):
        text = response.text.split('\n')[5:-2]

        items = []

        for i in text:
            item = ProxyItem()
            item['addr'] = i.split()[0]
            items.append(item)

        return items
Example #18
    def parse(self, response):
        item = ProxyItem()

        main = response.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')

        for li in main:
            ip = li.xpath('td/text()').extract()[0]
            port = li.xpath('td/text()').extract()[1]
            item['addr'] = ip + ':' + port
            yield item
Example #19
 def parse(self, response):
     item = ProxyItem()
     lists = response.xpath('//tr[@class="odd" or @class=""]')
     for l in lists:
         ip = l.xpath('td/text()').extract()[0]
         port = l.xpath('td/text()').extract()[1]
         print('ip address is belowing')
         print(ip)
         print('port number is belowing')
         print(port)
         item['addr'] = ip + ':' + port
         yield item
Example #20
 def parse(self, response):
     item = ProxyItem()
     rows = response.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')

     for li in rows:
         # extract the IP address
         ip = li.xpath('td/text()').extract()[0]
         # extract the port
         port = li.xpath('td/text()').extract()[1]
         # join the two and hand the item off to the pipeline
         item['addr'] = ip + ':' + port
         yield item
Example #21
 def parse_item(self, response):
     sel = Selector(response)
     item = ProxyItem()
     ip = sel.xpath('//*[@id="index_free_list"]/table/tbody/tr/td[1]/text()'
                    ).extract()
     port = sel.xpath(
         '//*[@id="index_free_list"]/table/tbody/tr/td[2]/text()').extract()
     # walk the rows that were actually found instead of a hard-coded 10
     for i in range(min(len(ip), len(port))):
         item['ip'] = ip[i]
         item['port'] = port[i]
         yield item
Example #22
 def parse(self, response):
     item = ProxyItem()
     rows = response.xpath('//table[@id="ip_list"]/tbody/tr[@class="odd"]')
     #rows = response.xpath('//tr[@class="odd"]')
     for li in rows:
         # extract the IP address
         ip = li.xpath('td/text()').extract()[0]
         # extract the port
         port = li.xpath('td/text()').extract()[1]
         # join the two and hand the item off to the pipeline
         item['addr'] = ip + ':' + port
         yield item
Example #23
File: xicidaili.py  Project: majiajue/proxy
 def parse(self, response):
     # pass
     res_main = response.xpath('//table[@id="ip_list"]')
     # res_page = response.xpath('//div[@id="listnav"]/ul/li[last()-1]/a/text()').extract()[0]
     # print(res_page)
     data = ProxyItem()
     data['name'] = '西刺代理'
     data['ip'] = res_main.xpath('./tr/td[2]/text()').extract()
     data['port'] = res_main.xpath('./tr/td[3]/text()').extract()
     data['protocol'] = res_main.xpath('./tr/td[6]/text()').extract()
     data['anonymity'] = res_main.xpath('./tr/td[5]/text()').extract()
     data['area'] = res_main.xpath('./tr/td[4]/text()').extract()
     yield data
Example #24
File: proxy.py  Project: wingyiu/papa
 def parse(self, response):
     rows = response.css('.wlist .l2')
     for i, row in enumerate(rows):
         if i == 0:
             continue
         cols = row.css('li')
         r = {
             'ip': cols[0].css('::text').extract_first().strip(),
             'port': cols[1].css('::text').extract_first().strip(),
             'scheme': cols[3].css('a::text').extract_first(),
             'anno': cols[2].css('a::text').extract_first(),
         }
         if '匿' in r['anno'] and r['scheme'] in ['http', 'https']:
             yield ProxyItem(**r)
Example #25
    def parse(self, response):
        #先实例化一个item
        items = []

        main = response.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')

        for li in main:
            item = ProxyItem()
            ip = li.xpath('td/text()').extract()[0]
            port = li.xpath('td/text()').extract()[1]
            item['addr'] = ip + ':' + port
            items.append(item)
        return items
Example #26
File: xici.py  Project: su-duan/proxy
    def proxy_parse(self, response):
        sel = Selector(response=response)
        item = ProxyItem()

        ip_list = sel.xpath('//table[@id="ip_list"]/tr')

        if len(ip_list) > 0:
            ip_list.pop(0)
        for ip in ip_list:
            item["ip"] = ip.xpath("td[2]/text()").extract()[0].strip()
            item["port"] = ip.xpath("td[3]/text()").extract()[0].strip()
            item["p_type"] = ip.xpath("td[6]/text()").extract()[0].strip()
            if "http" == item["p_type"].lower():
                yield item
Example #27
File: proxy.py  Project: wingyiu/papa
 def parse(self, response):
     rows = response.css('table tbody tr')
     for i, row in enumerate(rows):
         if i == 0:
             continue
         cols = row.css('td')
         r = {
             'ip': cols[0].css('::text').extract_first().strip(),
             'port': cols[1].css('::text').extract_first().strip(),
             'scheme':
             cols[3].css('::text').extract_first().strip().lower(),
             'anno': cols[2].css('::text').extract_first().strip(),
         }
         if r['anno'] != '透明' and r['scheme'] in ['http', 'https']:
             yield ProxyItem(**r)
Example #28
    def parse(self, response):
        soup = BeautifulSoup(response.body)
        items = []

        for i in range(len(soup.select('.country + td'))):
            # the <td> that is the next sibling of the class="country" cell
            item = ProxyItem()

            if i % 2 == 0:
                item['adrs'] = soup.select('.country + td')[i].get_text()
                item['port'] = soup.select('.country + td + td')[i].get_text()
                item['_type'] = soup.select('.country + td')[i + 1].get_text()
                items.append(item)

        return items
Example #29
File: kuai.py  Project: su-duan/proxy
    def proxy_parse(self, response):
        sel = Selector(response=response)
        item = ProxyItem()

        ip_list = sel.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')

        for ip in ip_list:
            item["ip"] = ip.xpath("td[1]/text()").extract()[0].strip()
            print(item["ip"])
            item["port"] = ip.xpath("td[2]/text()").extract()[0].strip()
            print(item["port"])
            item["p_type"] = ip.xpath("td[4]/text()").extract()[0].strip()
            print(item["p_type"])
            # only keep plain-HTTP proxies
            if "http" == item["p_type"].lower():
                yield item
Example #30
 def parse(self, response):
     tbody = response.xpath(
         '/html/body/div[1]/div[4]/div[2]/div/div[2]/table/tbody//tr')
     for tr in tbody:
         p_type = tr.xpath('td[4]/text()').extract_first().lower()
         # compare against the already-lowercased value; the original 'HTTP'
         # comparison could never match after .lower()
         p_type = 'http' if p_type == 'http' else 'https'
         data = {
             'ip': tr.xpath('td[1]/text()').extract_first(),
             'port': tr.xpath('td[2]/text()').extract_first(),
             'anonymity': tr.xpath('td[3]/text()').extract_first(),
             'p_type': p_type,
             'p_address': tr.xpath('td[5]/text()').extract_first()
         }
         item = ProxyItem()
         item['proxy_info'] = json.dumps(data)
         yield item