def handle_ParseHTMLMessage(self, page_info):
    """Parse a fetched page, persist its title/text and index its n-grams.

    The page is skipped when ``page_info`` is ``None`` or when its status
    already carries the ``PageInfo.PARSED`` flag.

    On a page that is processed, this method:

    * parses ``page_info.raw_contents`` (after ``remove_xml_declaration``)
      with ``lxml.html`` and makes all links absolute against the page URL;
    * sets ``page_info.title`` to the stripped text of the first ``<title>``
      element, or ``None`` when there is no such element or it has no text;
    * sets ``page_info.text`` to the normalized text collected from the tree;
    * sets the ``PARSED`` flag and stores the page via
      ``self.page_storage_client``;
    * builds ``(ngram_string, (page_id, position))`` entries and hands them
      to ``self.index_storage_client``.

    Returns:
        ``None`` when the page was skipped, ``True`` when it was processed.
    """
    if (page_info is None) or (page_info.status & PageInfo.PARSED):
        return None

    contents = self.remove_xml_declaration(page_info.raw_contents)
    # The original passed an unbound name (`target_url`) as base_url; the
    # page's own URL is what relative links must be resolved against.
    document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
    document_tree.make_links_absolute()

    title_elements = document_tree.xpath("//title")
    # An empty <title/> element has `.text is None`; treat it like a missing
    # title instead of failing on `None.strip()`.
    title_text = title_elements[0].text if title_elements else None
    page_info.title = title_text.strip() if title_text is not None else None

    page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
    page_info.status |= PageInfo.PARSED
    self.page_storage_client.set_page(page_info)

    # NOTE(review): the text stored above is `page_info.text`, but the index
    # is built from `page_info.text_contents` -- confirm both attributes
    # exist on PageInfo and that indexing the latter is intended.
    bigram_index = [
        ("".join(ngram), (page_info.id, pos))
        for pos, ngram in enumerate(
            TextUtils.make_ngram(page_info.text_contents))
    ]
    self.index_storage_client.set_index(bigram_index)
    return True
def hancle_ParseHTMLMessage(self, page_info):
    """Parse a fetched page, store/index it if new, and queue its links.

    The raw contents are always parsed with ``lxml.html`` and links are made
    absolute against ``page_info.url``. If the page's status does not yet
    carry the ``PageInfo.PARSED`` flag, its title and normalized text are
    filled in, the page is stored via ``self.page_storage_client`` and passed
    to ``self.indexer``. Finally, when ``self.link_queue`` is truthy, every
    URL yielded by ``self.extract_link_urls`` is passed to it.

    NOTE(review): the method name is spelled ``hancle_`` (not ``handle_``);
    it is left unchanged because callers or a name-based dispatcher may
    depend on it -- confirm whether this is a typo or a deliberately
    disabled handler.

    NOTE(review): the source of this method had lost its indentation; the
    nesting below (store/index only for not-yet-parsed pages, link extraction
    for every page) is the reconstruction that matches the statement order --
    confirm against the original file.
    """
    self.show_timestamped_message("Parsing ... %s" % page_info.url)

    contents = self.remove_xml_declaration(page_info.raw_contents)
    document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
    document_tree.make_links_absolute()

    if not (page_info.status & PageInfo.PARSED):
        title_elements = document_tree.xpath("//title")
        # An empty <title/> element has `.text is None`; treat it like a
        # missing title instead of failing on `None.strip()`.
        title_text = title_elements[0].text if title_elements else None
        page_info.title = (
            title_text.strip() if title_text is not None else None)

        page_info.text = TextUtils.normalize_text(
            self.collect_text(document_tree))
        self.page_storage_client.set_page(page_info)
        self.indexer(page_info)

    if self.link_queue:
        # link_queue is invoked as a callable, once per extracted URL.
        for link_url in self.extract_link_urls(document_tree):
            self.link_queue(link_url)