Exemplo n.º 1
0
    def _key_element_to_cell(self, key: str, key_element: HtmlElement) -> bool:
        """Turn |key_element| into a key cell and pair it with a value cell.

        Several markup layouts are tried in order; the first one that applies
        is rewritten into adjacent "td" elements and ends the search.

        Args:
            key: (string) the key that |key_element| represents
            key_element: (HtmlElement) the element to be modified
        Returns:
            True if the tree was modified, False if no layout matched.
        """

        # Layout: <foo><bar>key</bar>value</foo>
        # The value is the first text node following the key element; wrap it
        # in a new td placed right after the key cell.
        sibling_texts = key_element.xpath("following-sibling::text()")
        trailing_text = sibling_texts[0].strip() if sibling_texts else ""
        if trailing_text:
            key_element.tag = "td"
            trailing_cell = HtmlElement(trailing_text)
            trailing_cell.tag = "td"
            key_element.addnext(trailing_cell)
            return True

        # Layout: <foo>key</foo><bar>value</bar>
        # Key and value are already neighbours; retag both as cells.
        neighbour = key_element.getnext()
        if neighbour is not None:
            key_element.tag = "td"
            neighbour.tag = "td"
            return True

        # Layout: <foo><bar/><baz></baz>key: value</foo>
        # The "key: value" text lives in the tail of one of the children.
        for child in key_element:
            tail = child.tail
            if not tail or not tail.startswith(key):
                continue
            if self._insert_cells_from_text(key, tail, key_element):
                return True

        # Layout: <foo>key<bar>value</bar></foo>
        # The only child holds the value; add a new key td in front of it.
        if len(key_element) == 1:
            new_key_cell = HtmlElement(key)
            new_key_cell.tag = "td"
            only_child = key_element[0]
            only_child.tag = "td"
            only_child.addprevious(new_key_cell)
            return True

        # Layout: <foo>key : value</foo>
        # Split the element's own text into key and value cells.
        own_text = self._get_text_from_element(key_element)
        if own_text and own_text.startswith(key):
            if self._insert_cells_from_text(key, own_text, key_element):
                return True
        return False
Exemplo n.º 2
0
 def _insert_cells_from_text(self, key: str, text: str, container) -> bool:
     """Given a |text| string in the format '<key>: <value>', inserts
     corresponding key/value td cells into |container|. Returns True if
     insertion is performed, False otherwise."""
     remaining = text[len(key):].strip().strip(":").strip()
     if remaining:
         key_cell = HtmlElement(key)
         key_cell.tag = "td"
         value_cell = HtmlElement(remaining)
         value_cell.tag = "td"
         container.insert(0, key_cell)
         container.insert(1, value_cell)
         return True
     return False
Exemplo n.º 3
0
    def calculate_best_node(self, doc):
        """Score the text-bearing candidate nodes of |doc| and collect them.

        Candidates come from self.nodes_to_check(doc). A candidate is kept
        when its text has more than two stopwords and it is not flagged by
        self.is_highlink_density. Each kept node is scored with its stopword
        count plus a boost:

        * boostable nodes receive 50 / n, where n counts the boostable nodes
          seen so far (1 for the first, 2 for the second, ...);
        * when there are more than 15 kept nodes, nodes in the last quarter
          of the list receive minus the square of their depth into that
          quarter instead, replaced by +5 once the magnitude exceeds 40.

        Args:
            doc: the parsed document handed to self.nodes_to_check.
        Returns:
            A new HtmlElement to which every kept node with not-yet-seen text
            has been appended, in document order of the candidates.
        """
        # First pass: keep only candidates with enough stopwords and a low
        # link density.
        nodes_with_text = []
        for node in self.nodes_to_check(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.get_language()).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25
        starting_boost = float(1.0)

        parent_nodes = []
        seen_texts = set()
        # enumerate() supplies the node position. The previous hand-rolled
        # counter was never incremented, which made the bottom-quarter
        # penalty below unreachable.
        for i, node in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                boost_score = float((1.0 / starting_boost) * 50)
                starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(bottom_negativescore_nodes -
                                    (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.get_language()).get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            self.update_score(node, upscore)
            self.update_node_count(node, 1)

            # Keep only the first node for any given text.
            if text_node not in seen_texts:
                parent_nodes.append(node)
                seen_texts.add(text_node)

        top_node = HtmlElement()
        for itm in parent_nodes:
            score = self.get_score(itm)
            # NOTE(review): this writes every node's score and text to
            # stdout; looks like debugging output - confirm it is wanted.
            print("{}\n--------------------\n{}\n".format(
                score, itm.text_content()))
            top_node.append(itm)

        return top_node
Exemplo n.º 4
0
    def calculate_best_node(self, html):
        """Build a container element from the chapter texts embedded in |html|.

        The element with id 'shoebox-content-items' is expected to hold a JSON
        object whose values are themselves JSON strings, each with a
        'chapters' list. Every chapter that has a 'text' entry is parsed,
        given the class 'le-droit-content' and inserted into the result at
        the chapter's index. After each group of chapters an empty
        HtmlElement is inserted at index -1.

        Args:
            html: the page markup to parse.
        Returns:
            A new HtmlElement holding the parsed chapter elements.
        """
        document = Parser().fromstring(html)
        raw_items = document.get_element_by_id('shoebox-content-items').text
        top_node = HtmlElement()

        for payload in json.loads(raw_items).values():
            chapter_list = json.loads(payload)['chapters']
            for position, chapter in enumerate(chapter_list):
                if 'text' not in chapter:
                    continue
                fragment = Parser().fromstring(chapter['text'])
                fragment.attrib['class'] = 'le-droit-content'
                top_node.insert(position, fragment)
            top_node.insert(-1, HtmlElement())

        return top_node
Exemplo n.º 5
0
    def _css_key_to_cell(self, content: HtmlElement, css_key: str) -> None:
        """Turn every element matching |css_key| into a labelled value cell.

        Each element selected from |content| by the CSS selector |css_key| is
        retagged as "td", and a new "td" whose text is the selector itself is
        inserted immediately before it to act as the key cell.

        Args:
            content: the element searched with cssselect.
            css_key: the CSS selector, also used as the key cell's text.
        """
        for matched in content.cssselect(css_key):
            logging.debug("Adding cell [%s] which matches css key [%s]",
                          tostring(matched), css_key)
            label_cell = HtmlElement(css_key)
            label_cell.tag = "td"
            matched.tag = "td"
            matched.addprevious(label_cell)
Exemplo n.º 6
0
def reduce_tree(node):
    """Return a pruned copy of |node| that keeps only the important tags.

    The copy has the tag, attributes and text of |node|. Its children are
    processed recursively: a child tagged 'a', 'dl', 'h1' or 'h3' (compared
    case-insensitively) is kept as a reduced copy, while any other child is
    dropped and its own reduced children are attached in its place. Text of
    dropped children and tail text are not carried over. |node| itself is
    not modified.

    Children whose tag is not a string (comments, processing instructions)
    are skipped, since they can be neither copied as an element nor matched
    against the list of important tags.
    """
    newnode = HtmlElement(attrib=node.attrib)
    newnode.tag = node.tag
    newnode.text = node.text
    # Iterate the element directly; getchildren() is deprecated.
    for child in node:
        if not isinstance(child.tag, str):
            continue
        newchild = reduce_tree(child)
        if child.tag.lower() in ('a', 'dl', 'h1', 'h3'):
            newnode.append(newchild)
        else:
            # we don't want this node, so get its children instead.
            # Snapshot with list(): append() moves each grandchild out of
            # newchild, which would disturb a live iteration.
            for grandchild in list(newchild):
                newnode.append(grandchild)
    return newnode