示例#1
0
    def calculate_best_node(self, doc):
        """Score the text-bearing candidate nodes of ``doc`` and bundle them.

        Candidates come from ``self.nodes_to_check(doc)``.  A candidate is
        kept when its text has more than two stopwords and
        ``self.is_highlink_density`` does not flag it.  Each kept node then
        receives a score of ``stopword count + boost`` (truncated to int) via
        ``self.update_score`` and has its node count bumped by one.

        Boost rules:

        * every boostable node gets ``50 / k`` where ``k`` is 1 for the first
          boostable node, 2 for the second, and so on;
        * when more than 15 nodes were kept, nodes in the last quarter of the
          list get a negative boost that grows quadratically towards the end
          (replaced by a flat ``+5`` once the penalty would exceed 40).

        Returns a fresh ``HtmlElement`` containing the kept nodes, one per
        distinct text, in their original order.
        """
        # First pass: keep only candidates with enough stopwords that are not
        # flagged as link-heavy.
        nodes_with_text = []
        for node in self.nodes_to_check(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.get_language()).get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        starting_boost = 1.0
        parent_nodes = []
        # Assumes getText() returns a hashable str, so a set can replace the
        # linear list scan used for duplicate detection.
        seen_texts = set()

        # The position ``i`` must advance with the loop; otherwise
        # ``nodes_number - i`` never shrinks and the bottom-of-page penalty
        # below is unreachable.
        for i, node in enumerate(nodes_with_text):
            boost_score = 0.0

            # Earlier boostable nodes receive the larger share of the boost.
            if self.is_boostable(node):
                boost_score = (1.0 / starting_boost) * 50
                starting_boost += 1

            # Penalise nodes in the last quarter of a long candidate list.
            if nodes_number > 15:
                remaining = nodes_number - i
                if remaining <= bottom_negativescore_nodes:
                    booster = float(bottom_negativescore_nodes - remaining)
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(
                language=self.get_language()).get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            self.update_score(node, upscore)
            self.update_node_count(node, 1)

            # Only the first node carrying a given text is returned.
            if text_node not in seen_texts:
                seen_texts.add(text_node)
                parent_nodes.append(node)

        top_node = HtmlElement()
        for itm in parent_nodes:
            score = self.get_score(itm)
            # Diagnostic dump of each selected node and its score.
            print("{}\n--------------------\n{}\n".format(
                score, itm.text_content()))
            # NOTE(review): with lxml, append() re-parents ``itm`` out of
            # ``doc`` -- confirm this is intended.
            top_node.append(itm)

        return top_node
示例#2
0
def set_inner_html(elem: HtmlElement, html: str):
    """Replace innerHTML of a lxml element.

    All existing content of ``elem`` (leading text and child elements) is
    discarded and replaced by the parsed ``html`` fragment.  The element's
    own tag, attributes and tail are left untouched.

    ``html`` may contain leading text, several sibling elements, or be
    empty; an empty string simply leaves ``elem`` without content.
    """
    # Clear the element contents.  Children are snapshotted first because
    # the element is mutated while we walk it.
    for child in list(elem):
        elem.remove(child)
    # Child removal does not touch the text that precedes the first child,
    # so it has to be dropped explicitly.
    elem.text = None

    # Parse into a throw-away wrapper so fragments with leading text or
    # more than one top-level element are accepted.
    wrapper = fragment_fromstring(html, create_parent=True)
    elem.text = wrapper.text
    # Appending re-parents each node, which shrinks ``wrapper`` as we go;
    # iterate over a snapshot.
    for child in list(wrapper):
        elem.append(child)
示例#3
0
def xhtml(div: HtmlElement, data: PostData) -> str:
    title = escape(data.title)
    date = data.date.strftime("%Y-%m-%d")
    url = ArchiveURL + data.href
    div.tag = "body"
    div.append(
        fromstring(
            f'<p class="postwebpage"><a href="{url}">[{date}]</a></p>\n\n'))
    body = html_to_string(div)
    if data.kind == "miscellaneous":
        body = pants.process(body)
    return f"""<?xml version="1.0" encoding="utf-8" standalone="no"?>
示例#4
0
def reduce_tree(node, keep_tags=('a', 'dl', 'h1', 'h3')):
    """Removes all but the important tags from a node and its children.

    Builds and returns a new ``HtmlElement`` with the tag, attributes and
    leading text of ``node``.  Descendants whose lower-cased tag is listed in
    ``keep_tags`` are copied into the result; any other descendant is dropped
    and its own (already reduced) children are hoisted into its place.
    ``node`` itself is not modified.

    Comments and processing instructions are skipped.  Tail text is not
    carried over to the copy.

    ``node`` is assumed to be a regular element.  ``keep_tags`` is a
    collection of lower-case tag names; it defaults to the original fixed
    set.
    """
    newnode = HtmlElement(attrib=node.attrib)
    newnode.tag = node.tag
    newnode.text = node.text
    for child in node:
        # Comments / processing instructions expose a factory function as
        # their ``tag`` rather than a string; they can neither be copied as
        # elements nor lower-cased, so leave them out.
        if not isinstance(child.tag, str):
            continue
        newchild = reduce_tree(child, keep_tags)
        if child.tag.lower() in keep_tags:
            newnode.append(newchild)
        else:
            # we don't want this node, so get its children instead
            # (snapshot: appending re-parents them out of ``newchild``)
            for grandchild in list(newchild):
                newnode.append(grandchild)
    return newnode
示例#5
0
    def process(self, default_idname=None, extra_filters=None, reqinfo=None):
        '''
        Wrap the extracted elements in a container and apply filters

        Works on self.elems (which must be a list) and stores the resulting
        container in self.elem as a side effect.

        The container uses self.tag as its tag.  When self.innerhtml is
        truthy and exactly one element was extracted, the container is a
        deep copy of that element; otherwise it is a new element to which
        every extracted element is appended.

        Filters are taken from self.filters followed by extra_filters.  A
        filter that has a "relevant" attribute is only applied when
        filt.relevant(reqinfo) is truthy.  Depending on self.filtermode the
        filters run on each extracted element before wrapping
        (FILT_EACHELEM) or once on the container (FILT_COLLAPSED).

        The container's class attribute is set to self.classvalue and its
        id to self.idname, falling back to default_idname when self.idname
        is None.  An assertion fails if neither is set.  A style attribute
        is added when self.style is truthy.

        @param default_idname : Optional fallback ID attribute for the container
        @type  default_idname : str

        @param extra_filters  : Additional filters applied after self.filters
        @type  extra_filters  : list of callable; or None for no extra filters

        @param reqinfo        : Value handed to each filter's "relevant" check
        @type  reqinfo        : passed through unchanged

        @return               : The new container element
        @rtype                : lxml.html.HtmlElement

        '''
        from itertools import chain
        from lxml.html import HtmlElement

        extras = [] if extra_filters is None else extra_filters

        def wanted(filt):
            # Filters without a "relevant" hook always apply.
            if not hasattr(filt, 'relevant'):
                return True
            assert callable(filt.relevant), filt.relevant
            return filt.relevant(reqinfo)

        def run_filters(target):
            for filt in chain(self.filters, extras):
                if wanted(filt):
                    filt(target)

        assert type(self.elems) is list, self.elems

        idname = self.idname
        if idname is None:
            assert default_idname is not None, 'cannot determine an idname!'
            idname = default_idname

        # Per-element mode: filter every extracted element before wrapping.
        if self.filtermode == FILT_EACHELEM:
            for extracted in self.elems:
                run_filters(extracted)

        # Build the container that carries the class and id.
        if self.innerhtml and len(self.elems) == 1:
            container = copy.deepcopy(self.elems[0])
            container.tag = self.tag
        else:
            container = HtmlElement()
            container.tag = self.tag
            for extracted in self.elems:
                container.append(extracted)

        # Collapsed mode: filter the container once, after wrapping.
        if self.filtermode == FILT_COLLAPSED:
            run_filters(container)

        attrs = container.attrib
        attrs['class'] = self.classvalue
        attrs['id'] = idname
        if self.style:
            attrs['style'] = self.style

        self.elem = container
        return container