def parse_item(self, response):
    """Build a webgraph item describing one crawled page.

    The item's ``node`` is the URL of the response. Its ``edge`` is the list
    of links taken from every ``<a href=...>`` element whose raw ``href``
    starts with the site prefix defined below; each kept link is passed
    through ``urljoin_rfc`` with the page URL as base. Links are kept in
    document order and duplicates are NOT removed.

    Args:
        response: the downloaded page. Its ``url`` attribute is read and the
            response itself is handed to ``HtmlXPathSelector``.

    Returns:
        A ``webgraphItem`` with ``node`` and ``edge`` populated.
    """
    site_prefix = "http://perso.ens-lyon.fr/baptiste.roziere/"

    hxs = HtmlXPathSelector(response)
    i = webgraphItem()
    i['node'] = response.url

    # Debug trace of the page being processed. The parenthesised
    # single-argument form prints the same text under Python 2's print
    # statement and Python 3's print function.
    print("#######################")
    print(response.url)
    print("#######################")

    # NOTE: recording response.status under i['http_status'] is disabled.

    # The XPath only matches anchors that carry an href attribute, so
    # extract() always yields at least one value here.
    hrefs = (
        anchor.select('@href').extract()[0]
        for anchor in hxs.select('//a[@href]')
    )

    # An href that starts with the http:// site prefix can never start with
    # "javascript", so the prefix test alone is sufficient.
    i['edge'] = [
        urljoin_rfc(response.url, href)
        for href in hrefs
        if href.startswith(site_prefix)
    ]
    return i
def parse_item(self, response):
    """Build a webgraph item describing one crawled page.

    The item's ``node`` is the URL of the response. Its ``edge`` is the list
    of distinct links taken from ``<a href=...>`` elements whose raw ``href``
    starts with the site prefix defined below; each kept link is passed
    through ``urljoin_rfc`` with the page URL as base. De-duplication is done
    on the raw ``href`` text (before joining), and the first occurrence of
    each link determines its position in the list.

    Args:
        response: the downloaded page. Its ``url`` attribute is read and the
            response itself is handed to ``HtmlXPathSelector``.

    Returns:
        A ``webgraphItem`` with ``node`` and ``edge`` populated.
    """
    site_prefix = "http://www.cdiscount.com"

    hxs = HtmlXPathSelector(response)
    i = webgraphItem()
    i['node'] = response.url

    # Debug trace of the page being processed. The parenthesised
    # single-argument form prints the same text under Python 2's print
    # statement and Python 3's print function.
    print("#######################")
    print(response.url)
    print("#######################")

    # NOTE: recording response.status under i['http_status'] is disabled.

    llinks = []
    seen = set()  # raw hrefs already emitted for this page
    for anchor in hxs.select('//a[@href]'):
        # The XPath only matches anchors that carry an href attribute, so
        # extract() always yields at least one value here.
        href = anchor.select('@href').extract()[0]
        if not href.startswith(site_prefix) or href in seen:
            continue
        seen.add(href)
        llinks.append(urljoin_rfc(response.url, href))

    i['edge'] = llinks
    return i