def relevancy(self, inputtier):
    """
    Add very frequent tokens to a copy of the stopwords list.

    Every speech label of the input tier is counted in a Unigram. A token
    is then added to the copy of the stoplist when its relative frequency
    (occurrences / total occurrences) is strictly greater than the
    threshold 1 / (number_of_distinct_tokens * alpha).

    The stopwords list stored in this instance is not modified: only the
    copy is extended and returned.

    @param inputtier (Tier) Iterable of annotations; each one must provide
        GetLabel(), whose result provides IsSpeech() and GetValue().
    @return The extended copy of self.stopwords (documented upstream as a
        WordsList instance).
    @raise ZeroDivisionError if self._alpha is 0, or if the total count is
        0, while at least one token was collected.
    """
    stoplist = self.stopwords.copy()

    # Create the Unigram and put data: only speech labels are counted.
    unigram = Unigram()
    for annotation in inputtier:
        label = annotation.GetLabel()
        if label.IsSpeech() is True:
            unigram.add(label.GetValue())

    # Nothing was counted: there is nothing to compare, and returning here
    # keeps the threshold division below from running on an empty unigram.
    nb_tokens = unigram.get_size()
    if nb_tokens == 0:
        return stoplist

    # Both values are the same for every token, so they are computed once
    # instead of once per iteration.
    total = float(unigram.get_sum())
    threshold = 1.0 / (float(nb_tokens) * float(self._alpha))

    # Estimate if a token is relevant, put in the stoplist
    for token in unigram.get_tokens():
        proba = float(unigram.get_value(token)) / total
        if proba > threshold:
            stoplist.add(token)
            if self.logfile is not None:
                self.logfile.print_message('Add in the stoplist: '+token, indent=3)
            elif DEBUG is True:
                # NOTE(review): on Python 3, str + bytes raises TypeError;
                # this line only works as-is on Python 2 - confirm the
                # target interpreter before changing it.
                print(' ... ... ... Add in the stoplist: '+token.encode('utf8'))

    return stoplist
def test_unigram(self):
    """
    Check that repeated additions of one token accumulate its count.

    The token 'a' is added once, once more, then with an explicit count of
    3. After each step the unigram must still hold a single distinct token
    and report the running total for 'a'.
    """
    unigram = Unigram()

    # (arguments given to add(), expected count of 'a' afterwards)
    steps = (
        (('a',), 1),
        (('a',), 2),
        (('a', 3), 5),
    )
    for add_args, expected_count in steps:
        unigram.add(*add_args)
        # Only one distinct token exists, whatever its number of occurrences.
        self.assertEqual(unigram.get_size(), 1)
        self.assertEqual(unigram.get_count('a'), expected_count)