示例#1
0
 def sec_parse(self, response):
     """Yield one ``proxyItem`` per proxy row found in the response's tables.

     The first cell of each row may wrap the address in an
     ``IPDecode("...")`` call whose argument is percent-escaped
     (``%XX``); the wrapper is removed, the escapes are decoded and any
     markup left in the decoded text is stripped.  The second and third
     cells supply the port and the protocol.

     :param response: object exposing ``xpath()``.
         NOTE(review): the ``''.join(...)`` calls assume the ``text()``
         queries return plain strings (lxml-style results) -- confirm
         against the caller.
     :returns: generator of ``proxyItem`` with ``address``, ``port`` and
         ``protocol`` filled in.
     """
     # The predicate only keeps rows that carry a class attribute other than
     # 'Caption'; [1:] additionally drops the first remaining row
     # (presumably a header row -- verify against the scraped page).
     for row in response.xpath("//table//tr[@class!='Caption']")[1:]:
         column = row.xpath("td")
         # Three cells are indexed below; skip short rows instead of letting
         # one malformed row abort the whole generator with IndexError.
         if len(column) < 3:
             continue

         raw = ''.join(column[0].xpath(".//text()"))
         # Unwrap IPDecode("...") so only its argument remains.
         de_str = re.sub(r'IPDecode[(]"(.*?)"[)]', lambda m: m.group(1), raw)
         # A percent escape is exactly two hex digits.  A greedy match would
         # swallow literal hex characters following the escape and decode the
         # wrong code point.  (unichr: this file targets Python 2.)
         de_str = re.sub(r'%([0-9a-fA-F]{2})',
                         lambda m: unichr(int(m.group(1), 16)), de_str)
         # The decoded text can itself contain tags; drop them.
         de_str = re.sub(r'<.*?>', '', de_str).strip()

         item = proxyItem()
         item['address'] = de_str
         item['port'] = ''.join(column[1].xpath('text()')).strip()
         item['protocol'] = ''.join(column[2].xpath('text()')).strip()
         yield item
示例#2
0
 def parse(self, response):
     """Yield one ``proxyItem`` per node matched by ``self.xpath``.

     The response body is parsed with ``etree.HTML``.  For each of the
     fields ``address``, ``port`` and ``protocol`` the expression stored
     in ``self.item_xpath`` is evaluated relative to the matched node and
     its first result is stored; a falsy expression yields an empty
     string for that field.
     """
     logger.info("parsing %s", response.url)
     matched_nodes = etree.HTML(response.body).xpath(self.xpath)
     for node in matched_nodes:
         record = proxyItem()
         for field in ('address', 'port', 'protocol'):
             expression = self.item_xpath[field]
             if expression:
                 # Only the first hit of the per-field query is kept.
                 record[field] = node.xpath(expression)[0]
             else:
                 record[field] = ''
         yield record
示例#3
0
    def parse(self, response):
        """Yield one ``proxyItem`` per table row holding an ``ip:port`` pair.

        The first cell of a row is split over several child elements.  The
        text of every child that has no ``style`` attribute, or whose style
        is exactly ``display: inline-block;``, is concatenated and the
        address and port are extracted from the result.  The third cell
        supplies the protocol.

        Rows with fewer than three ``td`` cells, and rows whose first cell
        does not contain an ``address:port`` pair, are skipped.

        :param response: object whose ``body`` is the HTML document.
        :returns: generator of ``proxyItem`` with ``address``, ``port`` and
            ``protocol`` filled in.
        """
        for row in etree.HTML(response.body).xpath('//table//tr'):
            column = row.xpath('td')
            # column[2] is read below, so rows without at least three cells
            # (including rows with no td at all) cannot be used.
            if len(column) < 3:
                continue

            pieces = []
            for part in column[0]:
                # xpath() returns a list of attribute values, never a string.
                ipport_style = part.xpath("@style")
                if not ipport_style or ipport_style[0] == "display: inline-block;":
                    pieces.extend(part.xpath("text()"))
            ipport_str = ''.join(pieces)

            # The quantifier sits inside the group so group(1) is the whole
            # address rather than just its last character.
            parts = re.search(r'([.\d]+):(\d+)', ipport_str)
            if parts is None:
                continue

            item = proxyItem()
            item['address'] = parts.group(1)
            item['port'] = parts.group(2)
            item['protocol'] = ''.join(column[2].xpath('text()')).strip()
            yield item