def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
    """Collect the paragraphs of a sibling node that are worth keeping.

    If ``currentSibling`` is itself a ``<p>`` with non-empty text it is
    returned as the only item; when it carries tail text, a deep copy with
    the tail cleared is returned instead of the original element.

    Otherwise every ``<p>`` found under ``currentSibling`` is examined:

    * a paragraph whose path equals ``good_path`` and that contains no
      ``<a>`` element is always kept;
    * any other paragraph with text is kept when its stop-word count is
      greater than 30% of ``baselineScoreForSiblingParagraphs`` and
      ``self.isHighLinkDensity`` reports it as not link-heavy.

    Kept paragraphs from this second branch are freshly created ``<p>``
    elements holding only the paragraph text.

    Args:
        currentSibling: The sibling element to inspect.
        baselineScoreForSiblingParagraphs: Baseline score the paragraphs are
            compared against (numeric).
        good_path: Path (as produced by ``Parser.getPath``) of a known-good
            paragraph; paragraphs sharing it bypass the score check.

    Returns:
        A list of elements, in document order of the examined paragraphs.
        The list is empty when nothing qualifies; it is never ``None``, so
        callers can iterate the result directly.
    """
    if currentSibling.tag == 'p' and Parser.getText(currentSibling):
        paragraph = currentSibling
        if paragraph.tail:
            # Work on a copy so clearing the tail does not modify the
            # element that was passed in.
            paragraph = deepcopy(paragraph)
            paragraph.tail = ''
        return [paragraph]

    potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if potentialParagraphs is None:
        # Previously returned None here, which broke callers that loop
        # over the result without checking it first.
        return []

    siblingBaseLineScore = 0.30
    ps = []
    for candidate in potentialParagraphs:
        path = Parser.getPath(candidate)
        text = Parser.getText(candidate)

        # Same path as the known-good paragraph and no links: keep it
        # without scoring.
        if path == good_path and not Parser.getElementsByTag(candidate, tag='a'):
            ps.append(Parser.createElement(tag='p', text=text, tail=None))
            continue

        if not text:
            continue

        wordStats = self.stopwordsCls(language=self.language).getStopWordCount(text)
        paragraphScore = wordStats.getStopWordCount()
        highLinkDensity = self.isHighLinkDensity(candidate)
        score = float(baselineScoreForSiblingParagraphs * siblingBaseLineScore)
        if score < paragraphScore and not highLinkDensity:
            ps.append(Parser.createElement(tag='p', text=text, tail=None))
    return ps
def addSiblings(self, topNode):
    """Insert qualifying content from neighbouring nodes into ``topNode``.

    The nodes returned by ``self.walkSiblings(topNode)`` are each passed to
    ``self.getSiblingContent`` together with a baseline score and the path
    of the first ``<p>`` found under ``topNode`` (an empty list when there
    is none). Every element that comes back is inserted at index 0 of
    ``topNode``.

    Args:
        topNode: The element that receives the sibling content. It is
            modified in place.

    Returns:
        The same ``topNode`` object that was passed in.
    """
    baselineScoreForSiblingParagraphs = self.getBaselineScoreForSiblings(topNode)
    results = self.walkSiblings(topNode)

    good_ps = Parser.getElementsByTag(topNode, tag='p')
    good_path = []
    # Truthiness covers both an empty result and a None result.
    if good_ps:
        good_path = Parser.getPath(good_ps[0])

    for currentNode in results:
        ps = self.getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, good_path)
        # getSiblingContent may return None; treat that as "nothing to add"
        # instead of failing on iteration.
        for p in ps or ():
            topNode.insert(0, p)
    return topNode
def addSiblings(self, topNode):
    """Insert qualifying content from neighbouring nodes into ``topNode``.

    Siblings are normally walked starting from ``topNode``. When
    ``topNode`` is the only child of its parent and has no tail text, the
    walk starts from the parent instead.

    Each walked node is passed to ``self.getSiblingContent`` together with a
    baseline score and the path of the first ``<p>`` descendant of
    ``topNode`` (an empty list when there is none). Every element that
    comes back is inserted at index 0 of ``topNode``.

    Args:
        topNode: The element that receives the sibling content. It is
            modified in place.

    Returns:
        The same ``topNode`` object that was passed in.
    """
    baselineScoreForSiblingParagraphs = self.getBaselineScoreForSiblings(topNode)

    parent = topNode.getparent()
    # A root element has no parent; guard against calling len() on None.
    isOnlyChild = parent is not None and len(parent) == 1
    if isOnlyChild and topNode.tail is None:
        results = self.walkSiblings(parent)
    else:
        results = self.walkSiblings(topNode)

    good_ps = topNode.find('.//p')
    good_path = []
    # Compare against None explicitly: the result is an element, not a list.
    if good_ps is not None:
        good_path = Parser.getPath(good_ps)

    for currentNode in results:
        ps = self.getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, good_path)
        # getSiblingContent may return None; treat that as "nothing to add"
        # instead of failing on iteration.
        for p in ps or ():
            topNode.insert(0, p)
    return topNode
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs, good_path):
    """Collect the elements of a sibling node that are worth keeping.

    If ``currentSibling`` is itself a ``p``, ``h2``, ``h3`` or ``h4``
    element with non-empty text, it is returned as the only item.

    Otherwise every ``<p>`` found under ``currentSibling`` is examined using
    the ``(textLen, stopCount, isHighLink)`` triple from
    ``self.getTextStats``:

    * a paragraph whose path equals ``good_path`` and that has no ``a``
      child tag (per ``Parser.hasChildTag``) is always kept;
    * any other paragraph with ``textLen > 0`` is kept when ``stopCount``
      is greater than 30% of ``baselineScoreForSiblingParagraphs`` and
      ``isHighLink`` is false.

    Args:
        currentSibling: The sibling element to inspect.
        baselineScoreForSiblingParagraphs: Baseline score the paragraphs are
            compared against (numeric).
        good_path: Path (as produced by ``Parser.getPath``) of a known-good
            paragraph; paragraphs sharing it bypass the score check.

    Returns:
        A list of the kept elements themselves (not copies), in the reverse
        of the order in which the paragraphs were examined. The list is
        empty when nothing qualifies; it is never ``None``, so callers can
        iterate the result directly.
    """
    if currentSibling.tag in ('p', 'h2', 'h3', 'h4') and len(Parser.getText(currentSibling)) > 0:
        return [currentSibling]

    potentialParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if potentialParagraphs is None:
        # Previously returned None here, which broke callers that loop
        # over the result without checking it first.
        return []

    kept = []
    for candidate in potentialParagraphs:
        path = Parser.getPath(candidate)
        textLen, stopCount, isHighLink = self.getTextStats(candidate)

        # Same path as the known-good paragraph and no links: keep it
        # without scoring.
        if path == good_path and not Parser.hasChildTag(candidate, 'a'):
            kept.append(candidate)
            continue

        if textLen > 0:
            score = float(baselineScoreForSiblingParagraphs * 0.30)
            if score < stopCount and not isHighLink:
                kept.append(candidate)

    # Appending and reversing once yields the same order as inserting each
    # item at the front, without shifting the list on every insertion.
    kept.reverse()
    return kept