def sec_parse(self, response):
    """Yield one ``proxyItem`` per data row of the proxy table.

    Rows are the ``<tr>`` elements inside a ``<table>`` that carry a
    ``class`` attribute other than ``Caption``; the first such row is
    skipped.  For every remaining row:

    * ``address`` is built from all text under the first ``<td>``: an
      ``IPDecode("...")`` wrapper is replaced by its argument, ``%XX``
      percent-escapes are decoded, anything that looks like an HTML tag is
      removed and surrounding whitespace is stripped.
    * ``port`` and ``protocol`` are the stripped direct text of the second
      and third ``<td>``.

    Raises:
        IndexError: if a selected row has fewer than three ``<td>`` cells.
    """
    try:
        to_char = unichr  # Python 2 spelling; keeps the result unicode
    except NameError:
        to_char = chr  # Python 3 has no unichr(); chr() covers it

    def decode_escape(match):
        """Turn one ``%XX`` escape into the character it encodes."""
        return to_char(int(match.group(1), 16))

    for row in response.xpath("//table//tr[@class!='Caption']")[1:]:
        column = row.xpath("td")
        # NOTE(review): ''.join() needs xpath() to return plain strings.  If
        # ``response`` is a Scrapy response these are Selector objects and
        # an ``.extract()`` call is probably required here -- confirm.
        de_str = ''.join(column[0].xpath(".//text()"))
        # Unwrap IPDecode("...") and keep only its quoted argument.
        de_str = re.sub(r'IPDecode[(]"(.*?)"[)]', r'\1', de_str)
        # A percent-escape is exactly two hex digits.  A greedy ``+`` would
        # swallow a literal hex digit that follows it (``%3E1`` must decode
        # to ``>1``, not to the single code point 0x3E1).
        de_str = re.sub(r'%([0-9a-fA-F]{2})', decode_escape, de_str)
        # The decoded payload may contain markup; drop the tags.
        de_str = re.sub(r'<.*?>', '', de_str).strip()
        item = proxyItem()
        item['address'] = de_str
        item['port'] = ''.join(column[1].xpath('text()')).strip()
        item['protocol'] = ''.join(column[2].xpath('text()')).strip()
        yield item
def parse(self, response):
    """Yield a ``proxyItem`` for every node matched by ``self.xpath``.

    The response body is parsed with ``etree.HTML`` and ``self.xpath``
    selects the nodes to turn into items.  For each of the ``address``,
    ``port`` and ``protocol`` fields, ``self.item_xpath`` supplies an XPath
    expression that is evaluated relative to the node:

    * a falsy expression stores ``''`` in the field;
    * otherwise the first result of the expression is stored.

    Raises:
        IndexError: if a non-empty expression matches nothing for a node.
    """
    logger.info("parsing %s", response.url)
    document = etree.HTML(response.body)
    for node in document.xpath(self.xpath):
        item = proxyItem()
        for field in ('address', 'port', 'protocol'):
            expression = self.item_xpath[field]
            item[field] = node.xpath(expression)[0] if expression else ''
        yield item
def parse(self, response):
    """Yield a ``proxyItem`` for each table row holding an ``ip:port`` cell.

    The response body is parsed with ``etree.HTML`` and every ``<tr>``
    inside a ``<table>`` is examined.  The first ``<td>`` is expected to
    contain child elements whose text, concatenated, reads ``ip:port``.
    Only children with no ``style`` attribute, or whose ``style`` is
    ``display: inline-block;``, contribute text; other children are
    ignored.  The third ``<td>`` supplies the protocol.

    Rows are skipped (no item is yielded) when they have fewer than three
    ``<td>`` cells or when no ``ip:port`` pattern is found in the
    collected text.
    """
    for row in etree.HTML(response.body).xpath('//table//tr'):
        column = row.xpath('td')
        # Rows without <td> cells, or with too few cells to carry a
        # protocol column, cannot produce an item.
        if len(column) < 3:
            continue

        visible_text = []
        for part in column[0]:
            # xpath() returns a list, so inspect its content rather than
            # comparing the list itself against a string.
            style = part.xpath("@style")
            if not style or style[0].strip() == "display: inline-block;":
                visible_text.append(''.join(part.xpath("text()")))
        ipport_str = ''.join(visible_text)

        # The quantifier sits inside the group so the whole address is
        # captured, not just its last character.  search() tolerates
        # whitespace around the value.
        parts = re.search(r'([.\d]+):(\d+)', ipport_str)
        if parts is None:
            continue

        item = proxyItem()
        item['address'] = parts.group(1)
        item['port'] = parts.group(2)
        item['protocol'] = ''.join(column[2].xpath('text()')).strip()
        yield item