def calculate_best_node(self, doc):
    """Score the text-bearing nodes of ``doc`` and collect them in one element.

    Candidate nodes come from ``self.nodes_to_check(doc)``. A candidate is
    kept when its text has more than two stopwords and
    ``self.is_highlink_density(node)`` is false. Each kept node then gets a
    score of its stopword count plus a positional boost, recorded through
    ``self.update_score`` and ``self.update_node_count``.

    Boost rules, applied in order (a later rule overrides an earlier one):

    * a node for which ``self.is_boostable(node)`` is true receives
      ``50 / k`` where ``k`` counts the boostable nodes seen so far;
    * when more than 15 nodes were kept, nodes in the last quarter of the
      list receive a negative boost that grows quadratically towards the
      end of the list; once that penalty exceeds 40 it is replaced by a
      flat ``+5``.

    @param doc : parsed document handed to ``self.nodes_to_check``
    @return    : a new ``HtmlElement`` holding every kept node whose text
                 was not already seen on an earlier kept node
    """

    def stopword_count(text):
        # Same helper chain the original ran inline for every node.
        stats = self.stopwords_class(
            language=self.get_language()).get_stopword_count(text)
        return stats.get_stopword_count()

    nodes_with_text = []
    for node in self.nodes_to_check(doc):
        text_node = self.parser.getText(node)
        stopwords_found = stopword_count(text_node)
        high_link_density = self.is_highlink_density(node)
        if stopwords_found > 2 and not high_link_density:
            nodes_with_text.append(node)

    nodes_number = len(nodes_with_text)
    negative_scoring = 0
    bottom_negativescore_nodes = float(nodes_number) * 0.25

    starting_boost = 1.0
    parent_nodes = []
    dupe_track = []

    # The position must advance with the loop: previously the index stayed
    # at 0 forever, so the bottom-quarter penalty below could never trigger.
    for i, node in enumerate(nodes_with_text):
        boost_score = 0.0

        # boost
        if self.is_boostable(node):
            boost_score = (1.0 / starting_boost) * 50
            starting_boost += 1

        # nodes_number
        if nodes_number > 15:
            remaining = nodes_number - i
            if remaining <= bottom_negativescore_nodes:
                booster = float(bottom_negativescore_nodes - remaining)
                boost_score = float(-pow(booster, 2.0))
                negscore = abs(boost_score) + negative_scoring
                if negscore > 40:
                    boost_score = 5.0

        text_node = self.parser.getText(node)
        upscore = int(stopword_count(text_node) + boost_score)
        self.update_score(node, upscore)
        self.update_node_count(node, 1)

        # Keep only the first node for any given text.
        if text_node not in dupe_track:
            parent_nodes.append(node)
            dupe_track.append(text_node)

    top_node = HtmlElement()
    for itm in parent_nodes:
        score = self.get_score(itm)
        print("{}\n--------------------\n{}\n".format(
            score, itm.text_content()))
        # NOTE(review): lxml's append() re-parents the node, so it is moved
        # out of the source document -- confirm callers expect that.
        top_node.append(itm)
    return top_node
def set_inner_html(elem: HtmlElement, html: str):
    """Replace innerHTML of a lxml element.

    Everything inside ``elem`` -- its leading text and all child elements --
    is discarded and replaced by the element parsed from ``html``.

    The markup is parsed *before* ``elem`` is touched, so a parse failure
    leaves ``elem`` unchanged.

    :param elem: element whose contents are replaced in place.
    :param html: markup for the new contents. It is parsed with
        ``fragment_fromstring`` using its defaults, which expects a single
        root element; lxml raises on markup it cannot parse that way.
    """
    # Create the new contents first so an invalid fragment cannot leave the
    # element half-cleared.
    content = fragment_fromstring(html)

    # Clear the element contents. The leading text lives on the element
    # itself (not on a child), so removing the children alone would leave it
    # behind.
    elem.text = None
    for child in list(elem):
        elem.remove(child)

    # Add new contents
    elem.append(content)
def xhtml(div: HtmlElement, data: PostData) -> str: title = escape(data.title) date = data.date.strftime("%Y-%m-%d") url = ArchiveURL + data.href div.tag = "body" div.append( fromstring( f'<p class="postwebpage"><a href="{url}">[{date}]</a></p>\n\n')) body = html_to_string(div) if data.kind == "miscellaneous": body = pants.process(body) return f"""<?xml version="1.0" encoding="utf-8" standalone="no"?>
def reduce_tree(node):
    """Removes all but the important tags from a node and its children.

    Builds and returns a new ``HtmlElement`` carrying ``node``'s tag,
    attributes and leading text. Descendants are processed recursively:
    children tagged ``a``, ``dl``, ``h1`` or ``h3`` (case-insensitive) are
    kept as reduced copies, while any other child is dropped and its
    already-reduced children are hoisted into its place. Tail text is not
    copied. ``node`` itself is not modified.

    Comments and processing instructions are skipped; their ``tag`` is not
    a string, so they previously made the function raise.
    """
    newnode = HtmlElement(attrib=node.attrib)
    newnode.tag = node.tag
    newnode.text = node.text
    for child in list(node):
        if not isinstance(child.tag, str):
            # Comment / processing instruction: nothing worth keeping and
            # neither ``.lower()`` nor tag assignment works on it.
            continue
        newchild = reduce_tree(child)
        if child.tag.lower() in ('a', 'dl', 'h1', 'h3'):
            newnode.append(newchild)
        else:
            # we don't want this node, so get its children instead.
            # Iterate over a snapshot: append() moves each grandchild out
            # of ``newchild`` while we are walking it.
            for grandchild in list(newchild):
                newnode.append(grandchild)
    return newnode
def process(self, default_idname=None, extra_filters=None, reqinfo=None):
    '''
    Turn the extracted elements into the single wrapper element to render

    Works on self.elems (which must be a list) and stores the result in
    self.elem as a side effect.

    The wrapper is built in one of two ways:
      - if self.innerhtml is set and exactly one element was extracted, a
        deep copy of that element is retagged as self.tag;
      - otherwise a fresh element tagged self.tag is created and every
        extracted element is appended to it.

    Filters are taken from self.filters followed by extra_filters.  A filter
    exposing a callable "relevant" attribute is only applied when
    relevant(reqinfo) is true; filters without that attribute always apply.
    With filtermode FILT_EACHELEM the filters run on each extracted element
    before wrapping; with FILT_COLLAPSED they run once on the wrapper.

    The wrapper finally receives self.classvalue as its class, the chosen
    idname as its id and, when self.style is truthy, a style attribute.

    @param default_idname : Fallback ID attribute, used only when self.idname
                            is None.  An assertion fails if both are None.
    @type  default_idname : str

    @param extra_filters  : Additional filters applied after self.filters
    @type  extra_filters  : list of callable; or None for no extra filters

    @param reqinfo        : Value passed to each filter's "relevant" callable

    @return               : The wrapper element (also stored in self.elem)
    @rtype                : lxml.html.HtmlElement

    '''
    from lxml.html import HtmlElement
    from itertools import chain

    if extra_filters is None:
        extra_filters = []

    def run_filters(target):
        for candidate in chain(self.filters, extra_filters):
            if hasattr(candidate, 'relevant'):
                assert callable(candidate.relevant), candidate.relevant
                if not candidate.relevant(reqinfo):
                    # filter opted out for this request
                    continue
            candidate(target)

    assert type(self.elems) is list, self.elems

    idname = self.idname
    if idname is None:
        assert default_idname is not None, 'cannot determine an idname!'
        idname = default_idname

    # per-element filtering happens before the elements are wrapped
    if self.filtermode == FILT_EACHELEM:
        for extracted in self.elems:
            run_filters(extracted)

    # build the enclosing element that carries the mobilize class and id
    if self.innerhtml and len(self.elems) == 1:
        wrapper = copy.deepcopy(self.elems[0])
        wrapper.tag = self.tag
    else:
        wrapper = HtmlElement()
        wrapper.tag = self.tag
        for extracted in self.elems:
            wrapper.append(extracted)

    # collapsed filtering sees the finished wrapper instead
    if self.filtermode == FILT_COLLAPSED:
        run_filters(wrapper)

    wrapper.attrib['class'] = self.classvalue
    wrapper.attrib['id'] = idname
    if self.style:
        wrapper.attrib['style'] = self.style

    self.elem = wrapper
    return wrapper