def get_min_count(value):
    """Return the smallest ``value2count`` entry over the n-grams of *value*.

    Returns 0 when *value* is falsy, and 100000 when
    ``xngrams(value, 2, False)`` produces nothing.  ``value2count`` is taken
    from the enclosing scope; an n-gram missing from it raises ``KeyError``.
    """
    if not value:
        return 0
    counts = [value2count[gram] for gram in xngrams(value, 2, False)]
    if counts:
        return min(counts)
    # No n-grams at all: fall back to a large sentinel count.
    return 100000
def get_min_count(value):
    """Return the smallest ``self.bigram_dict`` count over the n-grams of *value*.

    Returns 0 when *value* is falsy, and 100000 when
    ``xngrams(value, 2, False)`` produces nothing.  N-grams missing from
    ``self.bigram_dict`` count as 0.  ``self`` comes from the enclosing scope.
    """
    if not value:
        return 0
    counts = [self.bigram_dict.get(gram, 0) for gram in xngrams(value, 2, False)]
    # No n-grams at all: fall back to a large sentinel count.
    return min(counts) if counts else 100000
def get_exceptions(self):
    """Return the distinct 1-grams and 2-grams of every rule's ``after_str``.

    The ``after_str`` of each key in ``self.rule2prob`` is split into its
    individual elements (1-grams) and into joined 2-grams obtained from
    ``xngrams(val, 2, add_regex=False)``.

    Returns:
        A list of unique strings in arbitrary order.
    """
    rule_values = [rule.after_str for rule in self.rule2prob.keys()]

    one_grams = set(itertools.chain.from_iterable(rule_values))
    # Collect the joined 2-grams directly.  Passing them through
    # chain.from_iterable would split each joined string back into single
    # characters, so no 2-gram would ever reach the result.
    two_grams = {
        "".join(gram)
        for val in rule_values
        for gram in xngrams(val, 2, add_regex=False)
    }
    return list(one_grams | two_grams)
def min_ngram_counts(es_query, values):
    """Return, for each value, the minimum count among its 3-grams.

    The distinct joined 3-grams of all values are sent to
    ``es_query.get_char_ngram_counts`` in a single call, and the returned
    counts are paired with them positionally.

    Args:
        es_query: object exposing ``get_char_ngram_counts(ngrams)``.
        values: iterable of values; each is converted with ``list(value)``
            before being handed to ``xngrams``.

    Returns:
        A list with one entry per value: 0 for a falsy value, 100000 for a
        value that yields no 3-grams, otherwise the minimum count.  The two
        fallbacks follow the convention of ``min_char_ngram_counts``.
    """
    # Compute each value's 3-grams once; they are needed both for the query
    # and for the per-value lookup below.
    grams_per_value = [
        ["".join(gram) for gram in xngrams(list(value), 3, False)]
        for value in values
    ]

    trigrams = set(itertools.chain.from_iterable(grams_per_value))
    counts = es_query.get_char_ngram_counts(trigrams)
    # The set is not modified between the query and the zip, so its
    # iteration order matches the order in which the counts were requested.
    value2count = dict(zip(trigrams, counts))

    def get_min_count(value, grams):
        if not value:
            return 0
        if not grams:
            # min() of an empty sequence raises ValueError; use a large
            # sentinel for values with no 3-grams instead.
            return 100000
        return min(value2count[gram] for gram in grams)

    return [
        get_min_count(value, grams)
        for value, grams in zip(values, grams_per_value)
    ]
def fit(self, values):
    """Build the frequency counters for *values* and store them on ``self``.

    Four counters are populated: joined 3-grams from ``xngrams(val, 3)``,
    the same 3-grams passed through ``str2regex(x, False)``, the raw values,
    and the values passed through ``str2regex(x, False)``.
    ``self.func2counter`` maps each scoring function to its counter.
    """
    all_trigrams = [
        "".join(gram) for val in values for gram in xngrams(val, 3)
    ]
    self.trigram_counter = Counter(all_trigrams)
    self.sym_trigram_counter = Counter(
        str2regex(gram, False) for gram in all_trigrams
    )

    self.val_counter = Counter(values)
    self.sym_val_counter = Counter(str2regex(val, False) for val in values)

    self.func2counter = {
        val_trigrams: self.trigram_counter,
        sym_trigrams: self.sym_trigram_counter,
        value_freq: self.val_counter,
        sym_value_freq: self.sym_val_counter,
    }
def min_char_ngram_counts(es_query, values):
    """Return, for each value, the minimum count among its n-grams.

    Every item from ``xngrams(value, 2, False)`` across all values is sent
    to ``es_query.get_char_ngram_counts`` in one call; the returned counts
    are paired with the n-grams positionally and converted with ``int``.

    Returns:
        A list with one entry per value: 0 for a falsy value, 100000 for a
        value that yields no n-grams, otherwise the minimum count.
    """
    all_grams = [
        gram for value in values for gram in xngrams(value, 2, False)
    ]
    raw_counts = es_query.get_char_ngram_counts(all_grams)
    value2count = {}
    for gram, count in zip(all_grams, raw_counts):
        value2count[gram] = int(count)

    def get_min_count(value):
        if not value:
            return 0
        grams = list(xngrams(value, 2, False))
        # No n-grams at all: fall back to a large sentinel count.
        return min(value2count[gram] for gram in grams) if grams else 100000

    return [get_min_count(value) for value in values]