def _key_element_to_cell(self, key: str, key_element: HtmlElement) -> bool:
    """Turn |key_element| into a td cell and pair it with a value td cell.

    Several markup layouts are tried in a fixed order; the first one that
    applies is used and the remaining ones are not attempted.

    Args:
        key: the key text that |key_element| represents.
        key_element: the element to be modified in place.

    Returns:
        True if the tree was modified, False if no layout applied.
    """
    # Layout: <foo><bar>key</bar>value</foo>
    # The value is loose text after the key element; wrap it in a new td
    # placed right after the key cell.
    trailing_text_nodes = key_element.xpath("following-sibling::text()")
    if trailing_text_nodes:
        value_text = trailing_text_nodes[0].strip()
        if value_text:
            key_element.tag = "td"
            value_cell = HtmlElement(value_text)
            value_cell.tag = "td"
            key_element.addnext(value_cell)
            return True

    # Layout: <foo>key</foo><bar>value</bar>
    # Key and value are already neighbours, so retag both as cells.
    neighbour = key_element.getnext()
    if neighbour is not None:
        key_element.tag = "td"
        neighbour.tag = "td"
        return True

    # Layout: <foo><bar/><baz></baz>key: value</foo>
    # "key: value" is the tail text of one of the children; build the two
    # cells from that text.
    for child in key_element:
        tail = child.tail
        if (
            tail
            and tail.startswith(key)
            and self._insert_cells_from_text(key, tail, key_element)
        ):
            return True

    # Layout: <foo>key<bar>value</bar></foo>
    # The only child holds the value; retag it and put a new key cell in
    # front of it.
    if len(key_element) == 1:
        new_key_cell = HtmlElement(key)
        new_key_cell.tag = "td"
        only_child = key_element[0]
        only_child.tag = "td"
        only_child.addprevious(new_key_cell)
        return True

    # Layout: <foo>key : value</foo>
    # Key and value share the element's own text; build the two cells from it.
    element_text = self._get_text_from_element(key_element)
    if not element_text or not element_text.startswith(key):
        return False
    return bool(self._insert_cells_from_text(key, element_text, key_element))
def _insert_cells_from_text(self, key: str, text: str, container) -> bool: """Given a |text| string in the format '<key>: <value>', inserts corresponding key/value td cells into |container|. Returns True if insertion is performed, False otherwise.""" remaining = text[len(key):].strip().strip(":").strip() if remaining: key_cell = HtmlElement(key) key_cell.tag = "td" value_cell = HtmlElement(remaining) value_cell.tag = "td" container.insert(0, key_cell) container.insert(1, value_cell) return True return False
def calculate_best_node(self, doc):
    """Score the text-bearing nodes of |doc| and collect them in a new element.

    Steps:
      1. Keep the nodes from self.nodes_to_check(doc) whose text has more
         than two stopwords and which are not flagged by
         self.is_highlink_density().
      2. Score each kept node as its stopword count plus a boost:
         - boostable nodes get (1 / starting_boost) * 50, where
           starting_boost grows by one per boosted node;
         - when more than 15 nodes were kept, nodes in the last quarter of
           the list instead get a negative boost equal to minus the square
           of how far they are into that quarter; if its magnitude exceeds
           40 the boost becomes +5.
         The score is recorded through self.update_score() and
         self.update_node_count().
      3. Append every node whose text has not been seen before to a fresh
         HtmlElement, printing its score and text content.

    Args:
        doc: the parsed document, passed to self.nodes_to_check().

    Returns:
        A new HtmlElement whose children are the de-duplicated scored nodes.
    """
    nodes_with_text = []
    for node in self.nodes_to_check(doc):
        text_node = self.parser.getText(node)
        word_stats = self.stopwords_class(
            language=self.get_language()).get_stopword_count(text_node)
        high_link_density = self.is_highlink_density(node)
        if word_stats.get_stopword_count() > 2 and not high_link_density:
            nodes_with_text.append(node)

    nodes_number = len(nodes_with_text)
    bottom_negativescore_nodes = float(nodes_number) * 0.25
    starting_boost = 1.0
    parent_nodes = []
    seen_texts = set()

    # enumerate() supplies the node's position. The previous hand-maintained
    # counter was never incremented, which made the tail penalty below
    # unreachable.
    for i, node in enumerate(nodes_with_text):
        boost_score = 0.0

        if self.is_boostable(node):
            boost_score = (1.0 / starting_boost) * 50
            starting_boost += 1

        # Penalise nodes in the last quarter of a long candidate list.
        if nodes_number > 15:
            nodes_left = nodes_number - i
            if nodes_left <= bottom_negativescore_nodes:
                booster = float(bottom_negativescore_nodes - nodes_left)
                boost_score = float(-pow(booster, 2.0))
                if abs(boost_score) > 40:
                    boost_score = 5.0

        text_node = self.parser.getText(node)
        word_stats = self.stopwords_class(
            language=self.get_language()).get_stopword_count(text_node)
        upscore = int(word_stats.get_stopword_count() + boost_score)

        self.update_score(node, upscore)
        self.update_node_count(node, 1)

        # Keep only the first node for any given text.
        if text_node not in seen_texts:
            parent_nodes.append(node)
            seen_texts.add(text_node)

    top_node = HtmlElement()
    for itm in parent_nodes:
        score = self.get_score(itm)
        print("{}\n--------------------\n{}\n".format(
            score, itm.text_content()))
        top_node.append(itm)
    return top_node
def calculate_best_node(self, html):
    """Build a container element from the chapter texts embedded in |html|.

    The element with id 'shoebox-content-items' is looked up in the parsed
    page and its text is decoded as a JSON object. Every value of that
    object is itself a JSON-encoded string holding a 'chapters' list; each
    chapter that has a 'text' entry is parsed as HTML, tagged with the
    'le-droit-content' class and inserted into the result.

    Args:
        html: the page markup, passed to Parser().fromstring().

    Returns:
        A new HtmlElement containing the parsed chapter elements.
    """
    html_element = Parser().fromstring(html)
    # presumably a <script> element carrying serialized page data -- confirm
    script_contents = html_element.get_element_by_id(
        'shoebox-content-items').text
    top_node = HtmlElement()
    for k, v in json.loads(script_contents).items():
        # Each value is a JSON document stored as a string, so decode again.
        data = json.loads(v)
        chapters = data['chapters']
        for i, chapter in enumerate(chapters):
            if 'text' in chapter:
                text_element = Parser().fromstring(chapter['text'])
                text_element.attrib['class'] = 'le-droit-content'
                # Inserted at the chapter's index within its own list.
                top_node.insert(i, text_element)
    # NOTE(review): index -1 places the empty element before the current last
    # child rather than at the end, and the intended nesting level of this
    # statement could not be determined from the collapsed source -- confirm.
    top_node.insert(-1, HtmlElement())
    return top_node
def _css_key_to_cell(self, content: HtmlElement, css_key: str) -> None:
    """Turn every element matching |css_key| into a key/value cell pair.

    Each match is retagged as a td and a new td built from |css_key| is
    placed immediately before it.

    Args:
        content: the element searched with the CSS selector.
        css_key: the CSS selector; also used as the content of the key cell.
    """
    for value_cell in content.cssselect(css_key):
        logging.debug(
            "Adding cell [%s] which matches css key [%s]",
            tostring(value_cell),
            css_key,
        )
        label_cell = HtmlElement(css_key)
        label_cell.tag = "td"
        value_cell.tag = "td"
        value_cell.addprevious(label_cell)
def reduce_tree(node):
    """Return a pruned copy of |node| keeping only the important tags.

    The copy has the tag, attributes and text of |node|. Each child is
    reduced recursively; a child whose tag is one of 'a', 'dl', 'h1' or 'h3'
    (case-insensitive) is kept, and any other child is replaced by its own
    reduced children. Children that are not real elements (comments,
    processing instructions, entities) are dropped.

    Args:
        node: the element to reduce; it is not modified.

    Returns:
        A new HtmlElement holding the reduced tree.
    """
    kept_tags = ('a', 'dl', 'h1', 'h3')

    newnode = HtmlElement(attrib=node.attrib)
    newnode.tag = node.tag
    newnode.text = node.text
    for child in node:
        # Comments, processing instructions and entities expose a factory
        # function as ``tag`` instead of a string; they have no tag name to
        # test and cannot be rebuilt as elements, so skip them.
        if not isinstance(child.tag, str):
            continue
        newchild = reduce_tree(child)
        if child.tag.lower() in kept_tags:
            newnode.append(newchild)
        else:
            # we don't want this node, so get its children instead
            # (snapshot first: appending moves each grandchild out of
            # newchild, which would disturb a live iteration).
            for grandchild in list(newchild):
                newnode.append(grandchild)
    return newnode