Exemplo n.º 1
0
    def handle_ParseHTMLMessage(self, page_info):
        """Parse a fetched page, persist its title/text and index its n-grams.

        Does nothing when ``page_info`` is None or already carries the
        ``PageInfo.PARSED`` status flag.

        Args:
            page_info: Page record providing ``raw_contents``, ``url``,
                ``status`` and ``id``; ``title``, ``text`` and ``status``
                are updated in place.

        Returns:
            None when the page is skipped, True once it has been parsed,
            stored and indexed.
        """
        if (page_info is None) or (page_info.status & PageInfo.PARSED):
            return

        contents = self.remove_xml_declaration(page_info.raw_contents)
        # Relative links are resolved against the page's own URL.
        document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
        document_tree.make_links_absolute()

        title_elements = document_tree.xpath("//title")
        # lxml reports the text of an empty <title> as None, so guard before
        # stripping instead of raising AttributeError.
        raw_title = title_elements[0].text if title_elements else None
        page_info.title = raw_title.strip() if raw_title is not None else None

        page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
        page_info.status |= PageInfo.PARSED
        self.page_storage_client.set_page(page_info)

        # NOTE(review): the index is built from page_info.text_contents while
        # this method only assigns page_info.text -- confirm that PageInfo
        # exposes both and that this is the intended source.
        ngram_source = TextUtils.make_ngram(page_info.text_contents)
        bigram_index = [
            ("".join(ngram), (page_info.id, pos))
            for (pos, ngram) in enumerate(ngram_source)
        ]

        self.index_storage_client.set_index(bigram_index)

        return True
Exemplo n.º 2
0
    def hancle_ParseHTMLMessage(self, page_info):
        """Parse a fetched page, store it if new, index it and queue its links.

        The document is always parsed, because link extraction needs the
        tree. The title/text are only recomputed and stored when the page
        does not already carry the ``PageInfo.PARSED`` status flag.

        Args:
            page_info: Page record providing ``url``, ``raw_contents`` and
                ``status``; ``title`` and ``text`` may be updated in place.
        """
        # NOTE(review): the method name is spelled "hancle" -- possibly a typo
        # for "handle". Left unchanged because callers or message dispatch may
        # depend on the current spelling.
        self.show_timestamped_message("Parsing ... %s" % page_info.url)

        contents = self.remove_xml_declaration(page_info.raw_contents)
        document_tree = lxml.html.fromstring(contents, base_url=page_info.url)
        document_tree.make_links_absolute()

        if not (page_info.status & PageInfo.PARSED):
            title_elements = document_tree.xpath("//title")
            # lxml reports the text of an empty <title> as None, so guard
            # before stripping instead of raising AttributeError.
            raw_title = title_elements[0].text if title_elements else None
            page_info.title = raw_title.strip() if raw_title is not None else None

            page_info.text = TextUtils.normalize_text(self.collect_text(document_tree))
            self.page_storage_client.set_page(page_info)

        self.indexer(page_info)

        # Link queuing is optional: skipped when no link_queue is configured.
        if self.link_queue:
            for link_url in self.extract_link_urls(document_tree):
                self.link_queue(link_url)