from bs4 import BeautifulSoup
from scrapy import Request


def parse(self, response):
    # Strip <script> and <style> tags so only the visible text is scored.
    soup = BeautifulSoup(response.body, 'lxml')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    response.meta.update(score=KeywordScorer.score(text))
    # Score each outgoing link by its anchor text and carry the score in
    # the request meta so the crawl can be prioritised.
    for link in self.link_extractor.extract_links(response):
        request = Request(url=link.url)
        request.meta.update(link_text=link.text)
        link_score = KeywordScorer.score(link.text)
        request.meta.update(score=link_score)
        yield request
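The score comes from a KeywordScorer helper that is referenced but not defined in this listing. Below is a minimal sketch of what such a scorer could look like, assuming a fixed keyword set and a simple frequency-based score; the keyword list and scoring rule are illustrative assumptions, not the spider's actual implementation.

class KeywordScorer:
    # Hypothetical keyword set, for illustration only.
    keywords = {'python', 'scrapy', 'crawler'}

    @classmethod
    def score(cls, text):
        # Fraction of words in the text that match the keyword set.
        words = text.lower().split()
        if not words:
            return 0.0
        hits = sum(1 for word in words if word in cls.keywords)
        # Normalise by length so long pages are not favoured unfairly.
        return hits / len(words)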
import xxhash


def parse(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    response.meta.update(score=KeywordScorer.score(text))
    # Hash the visible text so pages with identical content can be
    # recognised later, even when they live at different URLs.
    response.meta.update(
        content_hash=xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest())
    for link in self.link_extractor.extract_links(response):
        request = Request(url=link.url)
        request.meta.update(link_text=link.text)
        link_score = KeywordScorer.score(link.text)
        request.meta.update(score=link_score)
        yield request
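The content_hash stored in meta is a cheap fingerprint of the page's visible text. A minimal sketch of one way to use it follows, assuming the check is done in the spider itself; seen_content_hashes and is_new_content are hypothetical names introduced here for illustration, not part of the original code.

def is_new_content(self, content_hash):
    # Sketch only: remember hashes of text already processed and skip
    # repeats, so duplicate pages under different URLs are dropped.
    if not hasattr(self, 'seen_content_hashes'):
        self.seen_content_hashes = set()
    if content_hash in self.seen_content_hashes:
        return False
    self.seen_content_hashes.add(content_hash)
    return True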