示例#1
0
 def _calculate_word_frequencies(self, text):
     """Return a mapping from each token in *text* to its occurrence count.

     The text is normalized with ``common_utils.prepare_text`` and split
     into words with ``common_utils.tokenize`` before counting.

     Returns:
         collections.Counter: word -> count.  Counter is a dict subclass
         and, like the defaultdict(int) it replaces, yields 0 for keys
         that were never seen.
     """
     text = common_utils.prepare_text(text)
     words = common_utils.tokenize(text)
     # Counter is the stdlib idiom for frequency counting (one C-level
     # pass instead of a Python-level increment loop).
     return collections.Counter(words)
示例#2
0
 def _calculate_word_frequencies(self, text):
     """Return a mapping from each token in *text* to its occurrence count.

     The text is normalized with ``common_utils.prepare_text`` and split
     into words with ``common_utils.tokenize`` before counting.

     Returns:
         collections.Counter: word -> count.  Counter is a dict subclass
         and, like the defaultdict(int) it replaces, yields 0 for keys
         that were never seen.
     """
     text = common_utils.prepare_text(text)
     words = common_utils.tokenize(text)
     # Counter is the stdlib idiom for frequency counting (one C-level
     # pass instead of a Python-level increment loop).
     return collections.Counter(words)
示例#3
0
 def score(self, query, normalized=True, synonimizer=None):
     """Score *query* against the model via ``self._score``.

     Args:
         query: Query string; internal spaces are stripped before scoring.
         normalized: Forwarded to ``self._score``.
         synonimizer: Optional object whose ``get_synonyms()`` returns a
             mapping word -> list of synonyms.  When given, every
             combination of each query word with its synonyms is scored
             and the best score is returned.

     Returns:
         Whatever ``self._score`` returns (presumably a numeric score;
         ``max`` over the synonym expansions requires it be orderable).
     """
     if not synonimizer:
         return self._score(query.replace(" ", ""), normalized)

     synonyms = synonimizer.get_synonyms()
     query_words = common_utils.tokenize(query)
     # Each position becomes "the word itself plus all of its synonyms".
     # .get() keeps words with no synonym entry from raising KeyError;
     # enumerate/range replaces the Python-2-only xrange used before
     # (the sibling score() overload already uses range).
     for i, word in enumerate(query_words):
         query_words[i] = synonyms.get(word, []) + [word]
     # Cartesian product of the per-word alternatives, joined without
     # separators to match the space-stripped form scored above.
     possible_queries = map(lambda words: "".join(words),
                            itertools.product(*query_words))
     # Forward `normalized` here too — the original dropped it in this
     # branch, so the flag was silently ignored for synonym queries.
     return max(self._score(q, normalized) for q in possible_queries)
示例#4
0
 def score(self,
           query,
           normalized=True,
           synonimizer=None,
           return_suffix_scores=False):
     """Score *query* against the model via ``self._score``.

     Args:
         query: Query string; internal spaces are stripped before scoring.
         normalized: Forwarded to ``self._score``.
         synonimizer: Optional object whose ``get_synonyms()`` returns a
             mapping word -> list of synonyms.  When given, every
             combination of each query word with its synonyms is scored
             and the best score is returned.
         return_suffix_scores: Forwarded to ``self._score`` in the plain
             branch only.  NOTE(review): it is not forwarded in the
             synonymizer branch because it may change ``_score``'s return
             shape, which would break the ``max`` reduction — confirm
             against ``_score``'s contract.

     Returns:
         Whatever ``self._score`` returns.
     """
     if not synonimizer:
         return self._score(query.replace(" ", ""), normalized,
                            return_suffix_scores)

     synonyms = synonimizer.get_synonyms()
     query_words = common_utils.tokenize(query)
     # Each position becomes "the word itself plus all of its synonyms".
     # .get() keeps words with no synonym entry from raising KeyError.
     for i, word in enumerate(query_words):
         query_words[i] = synonyms.get(word, []) + [word]
     # Cartesian product of the per-word alternatives, joined without
     # separators to match the space-stripped form scored above.
     possible_queries = map(lambda words: "".join(words),
                            itertools.product(*query_words))
     # Forward `normalized` here too — the original dropped it in this
     # branch, so the flag was silently ignored for synonym queries.
     return max(self._score(q, normalized) for q in possible_queries)
 def test_tokenize(self):
     """tokenize() splits on whitespace and strips punctuation."""
     sample = "Well, what a sunny day!"
     expected = ["Well", "what", "a", "sunny", "day"]
     self.assertEqual(utils.tokenize(sample), expected)
示例#6
0
 def test_tokenize(self):
     """tokenize() drops punctuation and returns the bare words in order."""
     result = utils.tokenize("Well, what a sunny day!")
     self.assertEqual(result, ["Well", "what", "a", "sunny", "day"])