示例#1
0
 def parse(self, response):
     """Parse a xroxy.com proxy-list page.

     Queues a Request for the next result page (10 rows per page) and
     yields one ProxyIPItem per table row on the current page.
     """
     # Total result count sits in a bold <small> header; 10 results/page.
     total = int(
         response.xpath(
             '//*[@id="content"]/table[2]/tr/td[1]/table/tr[2]/td/small/b/text()'
         ).extract()[0]) / 10
     # The first page has no 'pnum' query parameter.
     if response.url.find('pnum=') == -1:
         cur_page = 0
     else:
         # raw string: '\d' is an invalid escape sequence in a plain str
         cur_page = int(search(r'pnum=(\d+)', response.url).group(1))
     if total - cur_page > 1:
         yield Request(url="http://www.xroxy.com/proxylist.php?pnum=%d" %
                       (cur_page + 1),
                       headers={'Referer': response.url})
     # Data rows alternate between the 'row0' and 'row1' CSS classes.
     ip_list = response.xpath(
         '//*[@id="content"]/table[1]/tr[@class="row0"] | //*[@id="content"]/table[1]/tr[@class="row1"]'
     )
     for ip in ip_list:
         item = ProxyIPItem()
         item['ip'] = ip.xpath('td[2]/a/text()').extract()[0].strip()
         item['port'] = ip.xpath('td[3]/a/text()').extract()[0].strip()
         # 'proxy_type' avoids shadowing the builtin 'type'; anonymity
         # levels all collapse to plain 'http', anything else is verbatim.
         proxy_type = ip.xpath('td[4]/a/text()').extract()[0].strip().lower()
         item['type'] = 'http' if proxy_type in [
             'anonymous', 'transparent', 'high anonymity', 'distorting'
         ] else proxy_type
         yield item
示例#2
0
 def parse(self, response):
     """Yield one ProxyIPItem per row of the #proxylisttable body."""
     for row in response.xpath('//*[@id="proxylisttable"]/tbody/tr'):
         item = ProxyIPItem()
         item['ip'] = row.xpath('td[1]/text()').extract()[0]
         item['port'] = row.xpath('td[2]/text()').extract()[0]
         item['type'] = row.xpath('td[5]/text()').extract()[0].lower()
         yield item
 def parse(self, response):
     """Extract ip:port pairs from the raw response body with a regex.

     Every match is yielded as an http ProxyIPItem.
     """
     # raw string: '\d' in a plain literal is an invalid escape sequence.
     # NOTE(review): response.body is bytes under Scrapy on Python 3, where
     # a str pattern would raise TypeError — confirm the target runtime.
     ip_list = re.findall(r"\d+\.\d+\.\d+\.\d+:\d+", response.body)
     for ip in ip_list:
         item = ProxyIPItem()
         sep = ip.index(':')  # locate the separator once, reuse for both slices
         item['ip'] = ip[:sep]
         item['port'] = ip[sep + 1:]
         item['type'] = 'http'
         yield item
示例#4
0
 def parse(self, response):
     """Yield ip/port from the first two <td> text cells of every row.

     Rows without at least two text cells (headers, spacers) are skipped.
     """
     for tr in response.xpath('//tr'):
         cells = tr.xpath('td/text()')
         # Only IndexError (too few cells) is expected here; a bare except
         # would also swallow KeyboardInterrupt/SystemExit and real bugs.
         try:
             ip = cells[0].extract()
             port = cells[1].extract()
         except IndexError:
             continue
         item = ProxyIPItem()
         item['ip'] = ip
         item['port'] = port
         item['type'] = 'http'
         yield item
示例#5
0
 def parse(self, response):
     """Yield http proxies from the #ip_list table, skipping the header row."""
     rows = response.xpath('//table[@id="ip_list"]/tr')
     # Slice off the header row; an empty selection stays empty.
     for row in rows[1:]:
         item = ProxyIPItem()
         item['ip'] = row.xpath('td[2]/text()').extract()[0]
         item['port'] = row.xpath('td[3]/text()').extract()[0]
         item['type'] = 'http'
         yield item
示例#6
0
 def parse(self, response):
     """Parse a JSON array of {'IP': ..., 'PORT': ...} records.

     Malformed JSON or records missing the expected keys are skipped
     silently (deliberate best-effort), but only the relevant exception
     types are caught instead of a bare ``except`` that would hide bugs.
     """
     try:
         proxies = json.loads(response.text)
         for proxy in proxies:
             item = ProxyIPItem()
             item['ip'] = proxy['IP']
             item['port'] = proxy['PORT']
             item['type'] = 'http'
             yield item
     except (ValueError, KeyError, TypeError):
         # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
         # unexpected record shapes (missing keys, non-mapping entries).
         pass
示例#7
0
 def parseDetail(self, response):
     """Yield an http ProxyIPItem for each row whose first link is "ip:port"."""
     for row in response.xpath('//tr'):
         try:
             endpoint = row.xpath('td/a/text()')[0].extract()
             parts = endpoint.split(':')
             host = parts[0]
             port = parts[1]
         except Exception as e:
             # Rows without a link or without a ':' separator are reported
             # and skipped.
             print(e)
             continue
         item = ProxyIPItem()
         item['ip'] = host
         item['port'] = port
         item['type'] = 'http'
         yield item
示例#8
0
 def parse(self, response):
     """Yield proxies from the sidebar list; protocol comes from the URL."""
     entries = response.xpath('//div[@id="boxright"]/div/ul/li')
     if len(entries) > 0:
         entries.pop(0)
     # The protocol is encoded in the page URL, not in the rows, so it is
     # resolved once before the loop.
     if 'socks4' in response.url:
         proto = 'socks4'
     elif 'socks5' in response.url:
         proto = 'socks5'
     else:
         proto = 'http'
     for entry in entries:
         item = ProxyIPItem()
         item['ip'] = entry.xpath('div[@class="ip"]/text()').extract()[0]
         item['port'] = entry.xpath('div[@class="port"]/text()').extract()[0]
         item['type'] = proto
         yield item
示例#9
0
 def parse(self, response):
     """Yield proxies from the free list, then queue the pagination links."""
     for row in response.xpath('//div[@id="index_free_list"]/table/tbody/tr'):
         item = ProxyIPItem(type="http")
         item["ip"] = row.xpath('td[1]/text()').extract()[0].strip()
         item["port"] = row.xpath('td[2]/text()').extract()[0].strip()
         yield item
     # Pagination links are only followed when the URL does not already
     # contain 'proxylist'.
     if response.request.url.find('proxylist') < 0:
         pages = response.xpath('//div[@id="listnav"]/ul/li/a')
         pages.pop(0)  # drop the first pager link
         for page in pages:
             path = page.xpath('@href').extract()[0]
             headers = {
                 'Referer': response.request.url,
                 'User-Agent': response.request.headers.get('User-Agent'),
             }
             yield Request(url=self.start_urls[0] + path, headers=headers)
    def parse(self, response):
        """Parse qiaodm.com free-proxy pages.

        On the index page, fan out Requests to the highlighted "free" links
        and the per-country links; on every page, decode the obfuscated
        ip/port table rows into ProxyIPItems.
        """
        # pages = response.xpath('//*[@id="flip"]/div/span | //*[@id="flip"]/div/a')
        # if len(pages) > 4:
        #     next_page = pages[-2].xpath('@href').extract()
        #     if len(next_page) == 1:
        #         yield Request(url='%s/%s' % (self.referer, next_page[0]), headers={'Referer': response.url})
        # Only the index page fans out to the category/country list pages.
        if response.request.url == 'http://ip.qiaodm.com/free/index.html':
            hot_urls = response.xpath(
                '//div[@class="freeb"]/a[contains(@href,"free")]/@href'
            ).extract()
            for url in hot_urls:
                yield Request(url=url, headers={'Referer': self.referer})
            country_urls = response.xpath('//a[@class="item"]/@href').extract()
            for url in country_urls:
                yield Request(url=url, headers={'Referer': self.referer})

        ip_list = response.xpath(
            '//*[@id="main_container"]/div[1]/table/tbody/tr')
        # Skip the first two rows when the table has real data rows
        # (presumably header rows — confirm against the live page).
        if len(ip_list) > 2:
            ip_list.pop(1)
            ip_list.pop(0)
        for line in ip_list:
            item = ProxyIPItem()
            columns = line.xpath('td')
            # The IP is split across child nodes: visible (non display:none)
            # text nodes plus document.write('...') script payloads, which
            # are concatenated after stripping the JS wrapper.
            ip_spans = columns[0].xpath(
                'node()/script/text() | node()[not(contains(@style, "none"))]/text()'
            ).extract()
            item['ip'] = ''.join([
                a.replace('document.write(\'', '').replace('\');', '')
                for a in ip_spans
            ])
            # port = columns[1].xpath('text()').extract()[0]
            # The port is encoded in the cell's second CSS class: each letter
            # maps to its index in "ABCDEFGHIZ" (a decimal digit) and the
            # resulting number is divided by 8.
            # NOTE(review): '/' yields a float on Python 3 here — confirm
            # whether integer division ('//') was intended.
            port = columns[1].xpath('@class').extract()[0].split(' ')[1]
            port = int(''.join([str("ABCDEFGHIZ".index(c)) for c in port])) / 8
            item['port'] = port
            # port = columns[1].xpath('script/text()').extract()[0]
            # port = port[port.index('=') + 1:port.index(';')]
            # item['port'] = ''.join([str(eval(a)) for a in port.split('+')])
            item['type'] = 'http'
            yield item
 def parse(self, response):
     """Yield http proxies from the table; stop paginating at '超时' rows."""
     rows = response.xpath(
         '/html/body/center/table[2]/tr/td[1]/table/tr')
     if len(rows) > 1:
         rows.pop(0)
     follow_next = True
     for row in rows:
         cells = row.xpath('td/text()').extract()
         item = ProxyIPItem()
         item['ip'] = cells[0].strip()
         item['port'] = cells[1].strip()
         item['type'] = 'http'
         # A last column reading '超时' ("timed out") marks stale entries,
         # so pagination stops after this page.
         if cells[-1].strip() == u'超时':
             follow_next = False
         yield item
     if follow_next:
         next_href = response.xpath(
             '/html/body/center/table[2]/tr/td[1]/p/a[last()]/@href'
         ).extract()[0]
         yield Request(url="%s%s" % (self.referer, next_href),
                       headers={'Referer': response.url})
 def parse(self, response):
     """Parse proxylists.net pages: queue the next page and decode rows.

     Each row hides its IP in a URL-encoded string inside a <script> tag;
     it is pulled out with a regex and unquoted.
     """
     ip_list = response.xpath('body/font/b/table/tr[1]/td[2]/table/tr')
     if len(ip_list) > 3:
         # Drop the two leading non-data rows; the last row holds the
         # page-number links.
         ip_list.pop(1)
         ip_list.pop(0)
         pager = ip_list.pop()
         # raw string: '\d' is an invalid escape sequence in a plain str
         cur_page = int(search(r'cn_(\d+)_ext', response.url).group(1))
         total = len(pager.xpath('td/b/a'))
         if total - cur_page > 1:
             yield Request(url="http://www.proxylists.net/cn_%d_ext.html" %
                           (cur_page + 1),
                           headers={'Referer': response.url})
     for ip in ip_list:
         item = ProxyIPItem()
         # The script payload contains the IP wrapped in %22...%22
         # (URL-encoded double quotes).
         item['ip'] = unquote(
             search(r'%22(.*)%22',
                    ip.xpath('td/script/text()').extract()[0]).group(1))
         item['port'] = ip.xpath('td[2]/text()').extract()[0]
         # 'proxy_type' avoids shadowing the builtin 'type'; anonymity
         # levels all collapse to plain 'http', anything else is verbatim.
         proxy_type = ip.xpath('td[3]/text()').extract()[0].lower()
         item['type'] = 'http' if proxy_type in [
             'anonymous', 'transparent', 'high anonymity', 'distorting'
         ] else proxy_type
         yield item