def _calculate_word_frequencies(self, text):
    """Count how often each token occurs in ``text``.

    The text is first run through ``common_utils.prepare_text`` and the
    result is split into tokens with ``common_utils.tokenize``.

    Returns:
        A ``collections.defaultdict(int)`` mapping each token to the number
        of times it was produced by the tokenizer.
    """
    frequencies = collections.defaultdict(int)
    prepared = common_utils.prepare_text(text)
    for token in common_utils.tokenize(prepared):
        frequencies[token] += 1
    return frequencies
def score(self, query, normalized=True, synonimizer=None):
    """Score ``query``, optionally expanding every word with its synonyms.

    Args:
        query: Query string.
        normalized: Passed through to ``self._score`` in both branches.
        synonimizer: Optional object exposing ``get_synonyms()``. The
            returned mapping is indexed with each tokenized query word
            (``synonyms[word]``) and the value is concatenated with
            ``[word]``, so it has to be a list.

    Returns:
        Without a synonimizer: ``self._score`` of the query with all spaces
        removed. With a synonimizer: the maximum ``self._score`` over every
        concatenation obtained by picking, for each query word, either the
        word itself or one of its synonyms.
    """
    if not synonimizer:
        return self._score(query.replace(" ", ""), normalized)

    synonyms = synonimizer.get_synonyms()
    # One list of alternatives per query word: its synonyms plus the word.
    alternatives = [
        synonyms[word] + [word] for word in common_utils.tokenize(query)
    ]
    possible_queries = (
        "".join(combo) for combo in itertools.product(*alternatives)
    )
    # Forward ``normalized`` so the caller's choice is honoured here as well
    # as in the plain branch above.
    return max(self._score(q, normalized) for q in possible_queries)
def score(self, query, normalized=True, synonimizer=None,
          return_suffix_scores=False):
    """Score ``query``, optionally expanding every word with its synonyms.

    Args:
        query: Query string.
        normalized: Passed through to ``self._score`` in both branches.
        synonimizer: Optional object exposing ``get_synonyms()``. The
            returned mapping is indexed with each tokenized query word
            (``synonyms[word]``) and the value is concatenated with
            ``[word]``, so it has to be a list.
        return_suffix_scores: Passed through to ``self._score`` only when
            no synonimizer is given; the synonym branch does not forward it.

    Returns:
        Without a synonimizer: whatever ``self._score`` returns for the
        query with all spaces removed. With a synonimizer: the maximum
        ``self._score`` over every concatenation obtained by picking, for
        each query word, either the word itself or one of its synonyms.
    """
    if not synonimizer:
        return self._score(query.replace(" ", ""), normalized,
                           return_suffix_scores)

    synonyms = synonimizer.get_synonyms()
    # One list of alternatives per query word: its synonyms plus the word.
    alternatives = [
        synonyms[word] + [word] for word in common_utils.tokenize(query)
    ]
    possible_queries = (
        "".join(combo) for combo in itertools.product(*alternatives)
    )
    # Forward ``normalized`` so the caller's choice is honoured here as well
    # as in the plain branch above.
    # NOTE(review): ``return_suffix_scores`` is intentionally still not
    # forwarded, because ``max()`` needs directly comparable results and the
    # shape ``_score`` returns with that flag is not visible here -- confirm.
    return max(self._score(q, normalized) for q in possible_queries)
def test_tokenize(self):
    """``utils.tokenize`` should return the words of a sentence in order,
    with the comma and exclamation mark removed."""
    sentence = "Well, what a sunny day!"
    expected = ["Well", "what", "a", "sunny", "day"]
    actual = utils.tokenize(sentence)
    self.assertEqual(actual, expected)