def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
    """\
    adds any siblings that may have a decent score to this node

    :param currentSibling: lxml element that is a sibling of the top node
    :param baselineScoreForSiblingParagraphs: stop-word score baseline used
        to decide whether a sibling paragraph carries enough content
    :param good_path: element path that is accepted unconditionally
        (as long as the paragraph contains no <a> tags)
    :return: list of <p> elements worth keeping, [currentSibling] when the
        sibling itself is a non-empty <p>, or None when it has no <p> children
    """
    if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
        e0 = currentSibling
        if e0.tail:
            # copy before clearing the tail so the original tree is untouched
            e0 = deepcopy(e0)
            e0.tail = ''
        return [e0]

    potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if potentialParagraphs is None:
        return None

    ps = []
    # Hoisted out of the loop: the stop-word counter and the score threshold
    # are loop-invariant, so build/compute them once instead of per paragraph.
    wordCounter = self.stopwordsCls(language=self.language)
    siblingBaseLineScore = 0.30
    score = float(baselineScoreForSiblingParagraphs * siblingBaseLineScore)
    for firstParagraph in potentialParagraphs:
        path = Parser.getPath(firstParagraph)
        text = Parser.getText(firstParagraph)
        # paragraphs on the known-good path are kept outright,
        # provided they contain no links
        if path == good_path and len(Parser.getElementsByTag(firstParagraph, tag='a')) == 0:
            p = Parser.createElement(tag='p', text=text, tail=None)
            ps.append(p)
            continue
        if len(text) > 0:
            wordStats = wordCounter.getStopWordCount(text)
            paragraphScore = wordStats.getStopWordCount()
            highLinkDensity = self.isHighLinkDensity(firstParagraph)
            # keep paragraphs that beat the baseline and are not link farms
            if score < paragraphScore and not highLinkDensity:
                p = Parser.createElement(tag='p', text=text, tail=None)
                ps.append(p)
    return ps
def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
    """\
    adds any siblings that may have a decent score to this node
    """
    # a non-empty <p> sibling is returned directly (tail stripped on a copy)
    if current_sibling.tag == 'p' and len(Parser.getText(current_sibling)) > 0:
        element = current_sibling
        if element.tail:
            element = deepcopy(element)
            element.tail = ''
        return [element]

    candidates = Parser.getElementsByTag(current_sibling, tag='p')
    if candidates is None:
        return None

    kept = []
    for candidate in candidates:
        text = Parser.getText(candidate)
        if len(text) == 0:
            continue
        # score the paragraph by its stop-word count against the baseline
        word_stats = self.stopwords_class(language=self.language).get_stopword_count(text)
        paragraph_score = word_stats.get_stopword_count()
        threshold = float(baselinescore_siblings_para * float(.30))
        high_link_density = self.is_highlink_density(candidate)
        if paragraph_score > threshold and not high_link_density:
            kept.append(Parser.createElement(tag='p', text=text, tail=None))
    return kept
def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
    """\
    adds any siblings that may have a decent score to this node
    """
    sibling_is_paragraph = (current_sibling.tag == 'p'
                            and len(Parser.getText(current_sibling)) > 0)
    if sibling_is_paragraph:
        node = current_sibling
        if node.tail:
            # detach the tail on a copy so the source tree stays intact
            node = deepcopy(node)
            node.tail = ''
        return [node]

    paragraphs = Parser.getElementsByTag(current_sibling, tag='p')
    if paragraphs is None:
        return None

    def worth_keeping(paragraph, text):
        # a paragraph qualifies when its stop-word count beats 30% of the
        # baseline and it is not dominated by links
        stats = self.stopwords_class(language=self.language).get_stopword_count(text)
        count = stats.get_stopword_count()
        cutoff = float(baselinescore_siblings_para * float(.30))
        dense_links = self.is_highlink_density(paragraph)
        return cutoff < count and not dense_links

    results = []
    for paragraph in paragraphs:
        text = Parser.getText(paragraph)
        if len(text) > 0 and worth_keeping(paragraph, text):
            results.append(Parser.createElement(tag='p', text=text, tail=None))
    return results
def postExtractionCleanup(self, targetNode):
    """\
    remove any divs that looks like non-content,
    clusters of links, or paras with no gusto

    :param targetNode: lxml element holding the extracted article content
    :return: the node (with siblings attached) after pruning
    """
    # wrap any leading bare text in its own element so it survives pruning
    if targetNode.text is not None:
        e = Parser.createElement(text=targetNode.text)
        targetNode.text = None
        targetNode.insert(0, e)
    node = self.addSiblings(targetNode)
    # Iterate over a snapshot: removing a child while iterating the live
    # element skips the sibling that follows each removed node.
    for e in list(node):
        if e.tag in ('h2', 'h3', 'h4'):
            # headings are kept unconditionally at this stage
            continue
        if e.tag not in ('p', 'pre', 'font'):
            textLen, stopCount, isHighLink = self.getTextStats(e)
            if isHighLink \
                    or self.isTableTagAndNoParagraphsExist(e) \
                    or not self.isNodeScoreThreshholdMet(node, e):
                Parser.remove(e)
    # strip headings left dangling at the end of the content
    for e in reversed(list(node)):
        if e.tag not in ('h2', 'h3', 'h4'):
            break
        Parser.remove(e)
    return node
def postExtractionCleanup(self, targetNode):
    """\
    remove any divs that looks like non-content,
    clusters of links, or paras with no gusto

    :param targetNode: lxml element holding the extracted article content
    :return: the node (with siblings attached) after pruning
    """
    # wrap any leading bare text in its own element so it survives pruning
    if targetNode.text is not None:
        e = Parser.createElement(text=targetNode.text)
        targetNode.text = None
        targetNode.insert(0, e)
    node = self.addSiblings(targetNode)
    # Iterate over a snapshot: removing a child while iterating the live
    # element skips the sibling that follows each removed node.
    for e in list(node):
        if e.tag in ['h2', 'h3', 'h4']:
            # headings are kept unconditionally at this stage
            continue
        if e.tag not in ['p', 'pre', 'font']:
            if self.isHighLinkDensity(e) \
                    or self.isTableTagAndNoParagraphsExist(e) \
                    or not self.isNodeScoreThreshholdMet(node, e):
                Parser.remove(e)
    # walk backwards dropping empty <p> tags and trailing headings
    for e in reversed(list(node)):
        if e.tag == 'p' and list(e) == []:
            # childless paragraph: drop it when it holds only whitespace
            if e.text is None or re.search('[^ \t\r\n]', e.text) == None:
                Parser.remove(e)
                continue
        if e.tag not in ['h2', 'h3', 'h4']:
            break
        Parser.remove(e)
    return node
def getReplacementNodes(self, div):
    # Flattens `div` into a list of replacement nodes: runs of text and
    # inline tags are gathered into fresh <p> elements, while non-inline
    # children are passed through as-is. Text/tail bookkeeping is
    # order-sensitive; do not reorder statements.
    replacementText = []          # pending text fragments for the next <p>
    nodesToReturn = []            # flat result list
    p = Parser.createElement(tag='p', text='', tail=None)  # <p> being built
    last_inline_node = None       # last inline child appended into p
    if div.text is not None:
        div.text = self.parser.unescape(div.text).strip('\t\r\n')
        if len(div.text):
            replacementText.append(div.text)
    for kid in list(div):
        if kid.tail is not None:
            kid.tail = self.parser.unescape(kid.tail).strip('\t\r\n')
        # flush any pending text into the current <p> before handling kid;
        # when p already has children the text belongs after the last
        # inline node (as its tail), otherwise it is p's own text
        if replacementText:
            text = ''.join(replacementText)
            replacementText = []
            if len(p):
                last_inline_node.tail = text
            else:
                p.text = text
        if kid.tag in self.goodInlineTags:
            # inline element: absorb it into the <p> under construction
            p.append(kid)
            last_inline_node = kid
        else:
            # block-level element: close the current <p> if it has content,
            # then emit kid itself; kid's tail becomes pending text for
            # the next <p>
            if len(p) or len(p.text):
                nodesToReturn.append(p)
                p = Parser.createElement(tag='p', text='', tail=None)
            if kid.tail is not None and len(kid.tail):
                replacementText.append(kid.tail)
                kid.tail = None
            nodesToReturn.append(kid)
    # flush out anything still remaining
    if replacementText:
        text = ''.join(replacementText)
        if len(p):
            last_inline_node.tail = text
        else:
            p.text = text
    if len(p) or len(p.text):
        nodesToReturn.append(p)
    return nodesToReturn
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    adds any siblings that may have a decent score to this node

    :param currentSibling: lxml element that is a sibling of the top node
    :param baselineScoreForSiblingParagraphs: stop-word score baseline used
        to decide whether a sibling paragraph carries enough content
    :return: list of <p> elements worth keeping, [currentSibling] when the
        sibling itself is a non-empty <p>, or None when it has no <p> children
    """
    if currentSibling.tag == 'p' and len(
            Parser.getText(currentSibling)) > 0:
        e0 = currentSibling
        if e0.tail:
            # copy before clearing the tail so the original tree is untouched
            e0 = deepcopy(e0)
            e0.tail = ''
        return [e0]

    potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if potentialParagraphs is None:
        return None

    ps = []
    # Hoisted out of the loop: the StopWords instance and the score
    # threshold are loop-invariant, so build/compute them once instead
    # of once per paragraph.
    stopWords = StopWords(language=self.language)
    siblingBaseLineScore = 0.30
    score = float(baselineScoreForSiblingParagraphs * siblingBaseLineScore)
    for firstParagraph in potentialParagraphs:
        text = Parser.getText(firstParagraph)
        if len(text) > 0:
            wordStats = stopWords.getStopWordCount(text)
            paragraphScore = wordStats.getStopWordCount()
            highLinkDensity = self.isHighLinkDensity(firstParagraph)
            # keep paragraphs that beat the baseline and are not link farms
            if score < paragraphScore and not highLinkDensity:
                p = Parser.createElement(tag='p', text=text, tail=None)
                ps.append(p)
    return ps