예제 #1
0
    def relevancy(self, inputtier):
        """
        Add very frequent tokens in a copy of the stopwords list.
        Return a WordsList instance

        The labels of the speech annotations of the input tier are counted
        in a Unigram. A token is then added to the copy of the stopwords
        list when its relative frequency is strictly greater than a
        threshold:

            u.get_value(token) / u.get_sum()  >  1 / (u.get_size() * alpha)

        where alpha is self._alpha. Each added token is reported to
        self.logfile when one is set, otherwise printed if DEBUG is True.

        @param inputtier (Tier) Iterable of annotations; only those whose
        label answers True to IsSpeech() are counted.
        @return The object returned by self.stopwords.copy(), extended with
        the frequent tokens.

        """
        l = self.stopwords.copy()

        # Create the Unigram and put data
        u = Unigram()
        for a in inputtier:
            label = a.GetLabel()
            if label.IsSpeech() is True:
                u.add( label.GetValue() )

        # Without any token there is nothing to compare, and the values
        # below must not be evaluated (the divisions are only reached when
        # there is at least one token, as before).
        tokens = list(u.get_tokens())
        if not tokens:
            return l

        # Both values are the same for every token: compute them once
        # instead of once per token.
        total = float(u.get_sum())
        threshold = 1.0 / (float(u.get_size())*float(self._alpha))

        # Estimate if a token is relevant, put in the stoplist
        for token in tokens:
            proba = float(u.get_value(token)) / total
            if proba > threshold:
                l.add( token )
                if self.logfile is not None:
                    self.logfile.print_message('Add in the stoplist: '+token, indent=3)
                elif DEBUG is True:
                    # Encode only what is not already a native str, so that
                    # the concatenation never mixes text and bytes.
                    shown = token if isinstance(token, str) else token.encode('utf8')
                    print(' ... ... ... Add in the stoplist: '+shown)

        return l
예제 #2
0
 def test_unigram(self):
     """
     Check that adding the same token several times accumulates its count
     while the size of the Unigram stays at 1.

     """
     gram = Unigram()

     # Each step: (arguments given to add, count expected for 'a' afterwards)
     steps = (
         (('a',), 1),
         (('a',), 2),
         (('a', 3), 5),
     )
     for add_args, expected_count in steps:
         gram.add(*add_args)
         self.assertEqual(gram.get_size(), 1)
         self.assertEqual(gram.get_count('a'), expected_count)