def getBaselineScoreForSiblings(self, topNode):
    """\
    we could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against the total
    text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    # Sentinel returned when no qualifying paragraph is found.
    base = 100000
    numberOfParagraphs = 0
    scoreOfParagraphs = 0
    nodesToCheck = Parser.getElementsByTag(topNode, tag='p')
    # StopWords construction is loop-invariant: build it once instead of
    # once per paragraph (assumes getStopWordCount is stateless per call —
    # TODO confirm against the StopWords implementation).
    stopWords = StopWords(language=self.language)
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = stopWords.getStopWordCount(nodeText)
        # Cache the count: the original called the getter twice.
        stopWordCount = wordStats.getStopWordCount()
        highLinkDensity = self.isHighLinkDensity(node)
        # Only paragraphs with real prose (more than two stop words)
        # and a low link density contribute to the baseline.
        if stopWordCount > 2 and not highLinkDensity:
            numberOfParagraphs += 1
            scoreOfParagraphs += stopWordCount
    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs
    return base
def getBaselineScoreForSiblings(self, topNode):
    """\
    Normalize the sibling base score to the average paragraph score
    within the top node, so long articles with many paragraphs are
    not treated unfairly: e.g. ten paragraphs totalling 1000 yield
    a base of 100. Falls back to 100000 when nothing qualifies.
    """
    paragraphCount = 0
    runningTotal = 0
    for paragraph in Parser.getElementsByTag(topNode, tag='p'):
        paragraphText = Parser.getText(paragraph)
        stats = StopWords().getStopWordCount(paragraphText)
        tooLinky = self.isHighLinkDensity(paragraph)
        count = stats.getStopWordCount()
        # a paragraph counts only if it has real prose and few links
        if count > 2 and not tooLinky:
            paragraphCount += 1
            runningTotal += count
    if paragraphCount > 0:
        return runningTotal / paragraphCount
    return 100000
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    Collect usable content from a sibling node: a non-empty <p>
    sibling is returned directly (its tail text stripped on a copy),
    otherwise any nested <p> whose stop-word score beats 30% of the
    baseline is rebuilt as a fresh <p> element.
    """
    if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
        element = currentSibling
        if element.tail:
            # copy first so we never mutate the original tree
            element = deepcopy(element)
            element.tail = ''
        return [element]

    candidates = Parser.getElementsByTag(currentSibling, tag='p')
    if candidates is None:
        return None

    kept = []
    # the threshold is invariant across candidates
    threshold = float(baselineScoreForSiblingParagraphs * float(.30))
    for candidate in candidates:
        text = Parser.getText(candidate)
        if len(text) > 0:
            stats = StopWords().getStopWordCount(text)
            tooLinky = self.isHighLinkDensity(candidate)
            if threshold < stats.getStopWordCount() and not tooLinky:
                kept.append(Parser.createElement(tag='p', text=text, tail=None))
    return kept
def removeParagraphsWithFewWords(self):
    """\
    remove paragraphs that have less than x number of words,
    would indicate that it's some sort of link
    """
    allNodes = Parser.getElementsByTags(self.getTopNode(), ['*'])  # .cssselect('*')
    # walk leaves first so removing a child never invalidates a
    # parent we still have to visit
    allNodes.reverse()
    for el in allNodes:
        text = Parser.getText(el)
        stopWords = StopWords().getStopWordCount(text)
        if stopWords.getStopWordCount() < 3 \
                and len(Parser.getElementsByTag(el, tag='object')) == 0 \
                and len(Parser.getElementsByTag(el, tag='embed')) == 0:
            Parser.remove(el)
            # TODO
            # check if it is in the right place
        else:
            # drop nodes whose entire text is parenthesized, e.g. "(Reuters)".
            # Reuse the text fetched above — the element has not changed,
            # so the original's second Parser.getText(el) call was redundant.
            if text.startswith("(") and text.endswith(")"):
                Parser.remove(el)
def removeParagraphsWithFewWords(self):
    """\
    Strip elements that read like link clutter: fewer than three
    stop words with no embedded <object>/<embed> content, plus any
    element whose trimmed text is fully parenthesized.
    """
    nodes = Parser.getElementsByTags(self.getTopNode(), ['*'])  # .cssselect('*')
    # iterate bottom-up so removals do not disturb unvisited parents
    nodes.reverse()
    for node in nodes:
        stats = StopWords().getStopWordCount(Parser.getText(node))
        # keep the object/embed lookups after the count test so they
        # only run when the count is low (same short-circuit as before)
        if stats.getStopWordCount() < 3 \
                and not len(Parser.getElementsByTag(node, tag='object')) \
                and not len(Parser.getElementsByTag(node, tag='embed')):
            Parser.remove(node)
            # TODO
            # check if it is in the right place
        else:
            content = Parser.getText(node)
            if content.startswith("(") and content.endswith(")"):
                Parser.remove(node)
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    adds any siblings that may have a decent score to this node:
    a non-empty <p> sibling is taken as-is (tail stripped on a copy);
    otherwise nested <p> elements scoring above 30% of the baseline
    are rebuilt as new <p> elements.
    """
    isParagraph = currentSibling.tag == 'p'
    if isParagraph and len(Parser.getText(currentSibling)) > 0:
        keeper = currentSibling
        if keeper.tail:
            # deep-copy before clearing the tail so the source tree
            # is left untouched
            keeper = deepcopy(keeper)
            keeper.tail = ''
        return [keeper]

    innerParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if innerParagraphs is None:
        return None

    accepted = []
    for inner in innerParagraphs:
        innerText = Parser.getText(inner)
        if len(innerText) > 0:
            stats = StopWords(language=self.language).getStopWordCount(innerText)
            score = stats.getStopWordCount()
            tooLinky = self.isHighLinkDensity(inner)
            # a sibling paragraph must beat 30% of the baseline
            cutoff = float(baselineScoreForSiblingParagraphs * float(.30))
            if cutoff < score and not tooLinky:
                fresh = Parser.createElement(tag='p', text=innerText, tail=None)
                accepted.append(fresh)
    return accepted
def calculateBestNodeBasedOnClustering(self, article):
    """Pick the most likely content node of *article* by scoring the
    parents (and grandparents) of text-heavy, low-link-density nodes,
    then returning the parent with the highest accumulated score."""
    doc = article.doc
    topNode = None
    nodesToCheck = self.getNodesToCheck(doc)
    startingBoost = float(1.0)
    cnt = 0
    i = 0
    parentNodes = set()
    nodesWithText = []
    # Pass 1: keep only nodes with real prose (>2 stop words) and a
    # low link density.
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            nodesWithText.append(node)
    numberOfNodes = len(nodesWithText)
    # NOTE(review): negativeScoring is never updated below, so the
    # `negscore > 40` branch can never fire — dead code as written.
    negativeScoring = 0
    # last 25% of the nodes are candidates for a negative boost
    bottomNodesForNegativeScore = float(numberOfNodes) * 0.25
    # Pass 2: score each node's parent and grandparent.
    for node in nodesWithText:
        boostScore = float(0)
        # boost: earlier boostable nodes get a larger bonus (50, 25, ...)
        if (self.isOkToBoost(node)):
            if cnt >= 0:
                boostScore = float((1.0 / startingBoost) * 50)
                startingBoost += 1
        # numberOfNodes: in long articles, penalize the trailing nodes
        # quadratically (likely comments/boilerplate at the bottom).
        if numberOfNodes > 15:
            if (numberOfNodes - i) <= bottomNodesForNegativeScore:
                booster = float(bottomNodesForNegativeScore -
                                (numberOfNodes - i))
                boostScore = float(-pow(booster, float(2)))
                negscore = -abs(boostScore) + negativeScoring
                # negscore is always <= 0 here (see NOTE above), so this
                # cap is unreachable.
                if negscore > 40:
                    boostScore = float(5)
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        upscore = int(wordStats.getStopWordCount() + boostScore)
        # parent node: full score
        parentNode = Parser.getParent(node)
        self.updateScore(parentNode, upscore)
        self.updateNodeCount(node.getparent(), 1)
        if node.getparent() not in parentNodes:
            parentNodes.add(node.getparent())
        # parentparent node: half score
        parentParentNode = Parser.getParent(parentNode)
        if parentParentNode is not None:
            self.updateNodeCount(parentParentNode, 1)
            self.updateScore(parentParentNode, upscore / 2)
            if parentParentNode not in parentNodes:
                parentNodes.add(parentParentNode)
        cnt += 1
        i += 1
    # Pick the highest-scoring candidate; the `topNode is None` check
    # guarantees some node is returned even if every score is <= 0.
    topNodeScore = 0
    for e in parentNodes:
        score = self.getScore(e)
        if score > topNodeScore:
            topNode = e
            topNodeScore = score
        if topNode is None:
            topNode = e
    return topNode
def calculateBestNodeBasedOnClustering(self, article):
    """Pick the most likely content node of *article* by scoring the
    parents (and grandparents) of text-heavy, low-link-density nodes,
    then returning the parent with the highest accumulated score."""
    doc = article.doc
    topNode = None
    nodesToCheck = self.getNodesToCheck(doc)
    startingBoost = float(1.0)
    cnt = 0
    i = 0
    parentNodes = set()
    nodesWithText = []
    # Pass 1: keep only nodes with real prose (>2 stop words) and a
    # low link density.
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords().getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            nodesWithText.append(node)
    numberOfNodes = len(nodesWithText)
    # NOTE(review): negativeScoring is never updated below, so the
    # `negscore > 40` branch can never fire — dead code as written.
    negativeScoring = 0
    # last 25% of the nodes are candidates for a negative boost
    bottomNodesForNegativeScore = float(numberOfNodes) * 0.25
    # Pass 2: score each node's parent and grandparent.
    for node in nodesWithText:
        boostScore = float(0)
        # boost: earlier boostable nodes get a larger bonus (50, 25, ...)
        if(self.isOkToBoost(node)):
            if cnt >= 0:
                boostScore = float((1.0 / startingBoost) * 50)
                startingBoost += 1
        # numberOfNodes: in long articles, penalize the trailing nodes
        # quadratically (likely comments/boilerplate at the bottom).
        if numberOfNodes > 15:
            if (numberOfNodes - i) <= bottomNodesForNegativeScore:
                booster = float(bottomNodesForNegativeScore -
                                (numberOfNodes - i))
                boostScore = float(-pow(booster, float(2)))
                negscore = -abs(boostScore) + negativeScoring
                # negscore is always <= 0 here (see NOTE above), so this
                # cap is unreachable.
                if negscore > 40:
                    boostScore = float(5)
        nodeText = Parser.getText(node)
        wordStats = StopWords().getStopWordCount(nodeText)
        upscore = int(wordStats.getStopWordCount() + boostScore)
        # parent node: full score
        parentNode = Parser.getParent(node)
        self.updateScore(parentNode, upscore)
        self.updateNodeCount(node.getparent(), 1)
        if node.getparent() not in parentNodes:
            parentNodes.add(node.getparent())
        # parentparent node: half score
        parentParentNode = Parser.getParent(parentNode)
        if parentParentNode is not None:
            self.updateNodeCount(parentParentNode, 1)
            self.updateScore(parentParentNode, upscore / 2)
            if parentParentNode not in parentNodes:
                parentNodes.add(parentParentNode)
        cnt += 1
        i += 1
    # Pick the highest-scoring candidate; the `topNode is None` check
    # guarantees some node is returned even if every score is <= 0.
    topNodeScore = 0
    for e in parentNodes:
        score = self.getScore(e)
        if score > topNodeScore:
            topNode = e
            topNodeScore = score
        if topNode is None:
            topNode = e
    return topNode