def getBaselineScoreForSiblings(self, topNode):
    """\
    we could have long articles that have tons of paragraphs
    so if we tried to calculate the base score against the total
    text score of those paragraphs it would be unfair.
    So we need to normalize the score based on the average scoring
    of the paragraphs within the top node.
    For example if our total score of 10 paragraphs was 1000
    but each had an average value of 100 then 100 should be our base.
    """
    # Sentinel returned when no qualifying paragraph is found.
    base = 100000
    numberOfParagraphs = 0
    scoreOfParagraphs = 0
    nodesToCheck = Parser.getElementsByTag(topNode, tag='p')
    # StopWords construction is loop-invariant: build it once instead of
    # once per paragraph (assumes getStopWordCount is stateless per call —
    # TODO confirm against the StopWords implementation).
    stopWords = StopWords(language=self.language)
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = stopWords.getStopWordCount(nodeText)
        # Cache the count: the original called the getter twice.
        stopWordCount = wordStats.getStopWordCount()
        highLinkDensity = self.isHighLinkDensity(node)
        # Only paragraphs with real prose (more than two stop words)
        # and a low link density contribute to the baseline.
        if stopWordCount > 2 and not highLinkDensity:
            numberOfParagraphs += 1
            scoreOfParagraphs += stopWordCount
    if numberOfParagraphs > 0:
        base = scoreOfParagraphs / numberOfParagraphs
    return base
def getBaselineScoreForSiblings(self, topNode):
    """\
    Normalize the sibling base score to the average paragraph score
    within the top node, so long articles with many paragraphs are
    not treated unfairly: e.g. ten paragraphs totalling 1000 yield
    a base of 100. Falls back to 100000 when nothing qualifies.
    """
    paragraphCount = 0
    runningTotal = 0
    for paragraph in Parser.getElementsByTag(topNode, tag='p'):
        paragraphText = Parser.getText(paragraph)
        stats = StopWords().getStopWordCount(paragraphText)
        tooLinky = self.isHighLinkDensity(paragraph)
        count = stats.getStopWordCount()
        # a paragraph counts only if it has real prose and few links
        if count > 2 and not tooLinky:
            paragraphCount += 1
            runningTotal += count
    if paragraphCount > 0:
        return runningTotal / paragraphCount
    return 100000
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    Collect usable content from a sibling node: a non-empty <p>
    sibling is returned directly (its tail text stripped on a copy),
    otherwise any nested <p> whose stop-word score beats 30% of the
    baseline is rebuilt as a fresh <p> element.
    """
    if currentSibling.tag == 'p' and len(Parser.getText(currentSibling)) > 0:
        element = currentSibling
        if element.tail:
            # copy first so we never mutate the original tree
            element = deepcopy(element)
            element.tail = ''
        return [element]

    candidates = Parser.getElementsByTag(currentSibling, tag='p')
    if candidates is None:
        return None

    kept = []
    # the threshold is invariant across candidates
    threshold = float(baselineScoreForSiblingParagraphs * float(.30))
    for candidate in candidates:
        text = Parser.getText(candidate)
        if len(text) > 0:
            stats = StopWords().getStopWordCount(text)
            tooLinky = self.isHighLinkDensity(candidate)
            if threshold < stats.getStopWordCount() and not tooLinky:
                kept.append(Parser.createElement(tag='p', text=text, tail=None))
    return kept
def removeParagraphsWithFewWords(self):
    """\
    remove paragraphs that have less than x number of words,
    would indicate that it's some sort of link
    """
    allNodes = Parser.getElementsByTags(self.getTopNode(), ['*'])  # .cssselect('*')
    # walk leaves first so removing a child never invalidates a
    # parent we still have to visit
    allNodes.reverse()
    for el in allNodes:
        text = Parser.getText(el)
        stopWords = StopWords().getStopWordCount(text)
        if stopWords.getStopWordCount() < 3 \
                and len(Parser.getElementsByTag(el, tag='object')) == 0 \
                and len(Parser.getElementsByTag(el, tag='embed')) == 0:
            Parser.remove(el)
            # TODO
            # check if it is in the right place
        else:
            # drop nodes whose entire text is parenthesized, e.g. "(Reuters)".
            # Reuse the text fetched above — the element has not changed,
            # so the original's second Parser.getText(el) call was redundant.
            if text.startswith("(") and text.endswith(")"):
                Parser.remove(el)
def removeParagraphsWithFewWords(self):
    """\
    Strip elements that read like link clutter: fewer than three
    stop words with no embedded <object>/<embed> content, plus any
    element whose trimmed text is fully parenthesized.
    """
    nodes = Parser.getElementsByTags(self.getTopNode(), ['*'])  # .cssselect('*')
    # iterate bottom-up so removals do not disturb unvisited parents
    nodes.reverse()
    for node in nodes:
        stats = StopWords().getStopWordCount(Parser.getText(node))
        # keep the object/embed lookups after the count test so they
        # only run when the count is low (same short-circuit as before)
        if stats.getStopWordCount() < 3 \
                and not len(Parser.getElementsByTag(node, tag='object')) \
                and not len(Parser.getElementsByTag(node, tag='embed')):
            Parser.remove(node)
            # TODO
            # check if it is in the right place
        else:
            content = Parser.getText(node)
            if content.startswith("(") and content.endswith(")"):
                Parser.remove(node)
def getSiblingContent(self, currentSibling, baselineScoreForSiblingParagraphs):
    """\
    adds any siblings that may have a decent score to this node:
    a non-empty <p> sibling is taken as-is (tail stripped on a copy);
    otherwise nested <p> elements scoring above 30% of the baseline
    are rebuilt as new <p> elements.
    """
    isParagraph = currentSibling.tag == 'p'
    if isParagraph and len(Parser.getText(currentSibling)) > 0:
        keeper = currentSibling
        if keeper.tail:
            # deep-copy before clearing the tail so the source tree
            # is left untouched
            keeper = deepcopy(keeper)
            keeper.tail = ''
        return [keeper]

    innerParagraphs = Parser.getElementsByTag(currentSibling, tag='p')
    if innerParagraphs is None:
        return None

    accepted = []
    for inner in innerParagraphs:
        innerText = Parser.getText(inner)
        if len(innerText) > 0:
            stats = StopWords(language=self.language).getStopWordCount(innerText)
            score = stats.getStopWordCount()
            tooLinky = self.isHighLinkDensity(inner)
            # a sibling paragraph must beat 30% of the baseline
            cutoff = float(baselineScoreForSiblingParagraphs * float(.30))
            if cutoff < score and not tooLinky:
                fresh = Parser.createElement(tag='p', text=innerText, tail=None)
                accepted.append(fresh)
    return accepted
def calculateBestNodeBasedOnClustering(self, article):
    """Pick the most likely content node of *article* by scoring the
    parents (and grandparents) of text-heavy, low-link-density nodes,
    then returning the parent with the highest accumulated score."""
    doc = article.doc
    topNode = None
    nodesToCheck = self.getNodesToCheck(doc)
    startingBoost = float(1.0)
    cnt = 0
    i = 0
    parentNodes = set()
    nodesWithText = []
    # Pass 1: keep only nodes with real prose (>2 stop words) and a
    # low link density.
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            nodesWithText.append(node)
    numberOfNodes = len(nodesWithText)
    # NOTE(review): negativeScoring is never updated below, so the
    # `negscore > 40` branch can never fire — dead code as written.
    negativeScoring = 0
    # last 25% of the nodes are candidates for a negative boost
    bottomNodesForNegativeScore = float(numberOfNodes) * 0.25
    # Pass 2: score each node's parent and grandparent.
    for node in nodesWithText:
        boostScore = float(0)
        # boost: earlier boostable nodes get a larger bonus (50, 25, ...)
        if (self.isOkToBoost(node)):
            if cnt >= 0:
                boostScore = float((1.0 / startingBoost) * 50)
                startingBoost += 1
        # numberOfNodes: in long articles, penalize the trailing nodes
        # quadratically (likely comments/boilerplate at the bottom).
        if numberOfNodes > 15:
            if (numberOfNodes - i) <= bottomNodesForNegativeScore:
                booster = float(bottomNodesForNegativeScore -
                                (numberOfNodes - i))
                boostScore = float(-pow(booster, float(2)))
                negscore = -abs(boostScore) + negativeScoring
                # negscore is always <= 0 here (see NOTE above), so this
                # cap is unreachable.
                if negscore > 40:
                    boostScore = float(5)
        nodeText = Parser.getText(node)
        wordStats = StopWords(
            language=self.language).getStopWordCount(nodeText)
        upscore = int(wordStats.getStopWordCount() + boostScore)
        # parent node: full score
        parentNode = Parser.getParent(node)
        self.updateScore(parentNode, upscore)
        self.updateNodeCount(node.getparent(), 1)
        if node.getparent() not in parentNodes:
            parentNodes.add(node.getparent())
        # parentparent node: half score
        parentParentNode = Parser.getParent(parentNode)
        if parentParentNode is not None:
            self.updateNodeCount(parentParentNode, 1)
            self.updateScore(parentParentNode, upscore / 2)
            if parentParentNode not in parentNodes:
                parentNodes.add(parentParentNode)
        cnt += 1
        i += 1
    # Pick the highest-scoring candidate; the `topNode is None` check
    # guarantees some node is returned even if every score is <= 0.
    topNodeScore = 0
    for e in parentNodes:
        score = self.getScore(e)
        if score > topNodeScore:
            topNode = e
            topNodeScore = score
        if topNode is None:
            topNode = e
    return topNode
def calculateBestNodeBasedOnClustering(self, article):
    """Pick the most likely content node of *article* by scoring the
    parents (and grandparents) of text-heavy, low-link-density nodes,
    then returning the parent with the highest accumulated score."""
    doc = article.doc
    topNode = None
    nodesToCheck = self.getNodesToCheck(doc)
    startingBoost = float(1.0)
    cnt = 0
    i = 0
    parentNodes = set()
    nodesWithText = []
    # Pass 1: keep only nodes with real prose (>2 stop words) and a
    # low link density.
    for node in nodesToCheck:
        nodeText = Parser.getText(node)
        wordStats = StopWords().getStopWordCount(nodeText)
        highLinkDensity = self.isHighLinkDensity(node)
        if wordStats.getStopWordCount() > 2 and not highLinkDensity:
            nodesWithText.append(node)
    numberOfNodes = len(nodesWithText)
    # NOTE(review): negativeScoring is never updated below, so the
    # `negscore > 40` branch can never fire — dead code as written.
    negativeScoring = 0
    # last 25% of the nodes are candidates for a negative boost
    bottomNodesForNegativeScore = float(numberOfNodes) * 0.25
    # Pass 2: score each node's parent and grandparent.
    for node in nodesWithText:
        boostScore = float(0)
        # boost: earlier boostable nodes get a larger bonus (50, 25, ...)
        if(self.isOkToBoost(node)):
            if cnt >= 0:
                boostScore = float((1.0 / startingBoost) * 50)
                startingBoost += 1
        # numberOfNodes: in long articles, penalize the trailing nodes
        # quadratically (likely comments/boilerplate at the bottom).
        if numberOfNodes > 15:
            if (numberOfNodes - i) <= bottomNodesForNegativeScore:
                booster = float(bottomNodesForNegativeScore -
                                (numberOfNodes - i))
                boostScore = float(-pow(booster, float(2)))
                negscore = -abs(boostScore) + negativeScoring
                # negscore is always <= 0 here (see NOTE above), so this
                # cap is unreachable.
                if negscore > 40:
                    boostScore = float(5)
        nodeText = Parser.getText(node)
        wordStats = StopWords().getStopWordCount(nodeText)
        upscore = int(wordStats.getStopWordCount() + boostScore)
        # parent node: full score
        parentNode = Parser.getParent(node)
        self.updateScore(parentNode, upscore)
        self.updateNodeCount(node.getparent(), 1)
        if node.getparent() not in parentNodes:
            parentNodes.add(node.getparent())
        # parentparent node: half score
        parentParentNode = Parser.getParent(parentNode)
        if parentParentNode is not None:
            self.updateNodeCount(parentParentNode, 1)
            self.updateScore(parentParentNode, upscore / 2)
            if parentParentNode not in parentNodes:
                parentNodes.add(parentParentNode)
        cnt += 1
        i += 1
    # Pick the highest-scoring candidate; the `topNode is None` check
    # guarantees some node is returned even if every score is <= 0.
    topNodeScore = 0
    for e in parentNodes:
        score = self.getScore(e)
        if score > topNodeScore:
            topNode = e
            topNodeScore = score
        if topNode is None:
            topNode = e
    return topNode