示例#1
0
 def get_min_count(value):
     """Return the smallest known count among the n-grams of ``value``.

     The n-grams come from ``xngrams(value, 2, False)`` and their counts are
     looked up in the enclosing ``value2count`` mapping.

     Returns:
         0 for a falsy ``value``, 100000 when ``value`` yields no n-grams,
         otherwise the minimum count over its n-grams.

     Raises:
         KeyError: If an n-gram is missing from ``value2count``.
     """
     if not value:
         return 0

     grams = list(xngrams(value, 2, False))
     if not grams:
         # Sentinel for "nothing to score": large enough never to be the
         # smallest count.
         return 100000
     return min(value2count[gram] for gram in grams)
示例#2
0
 def get_min_count(value):
     """Return the smallest bigram-dictionary count among the n-grams of ``value``.

     The n-grams come from ``xngrams(value, 2, False)``. Each is looked up in
     ``self.bigram_dict`` (``self`` is taken from the enclosing scope), and an
     n-gram that is absent counts as 0.

     Returns:
         0 for a falsy ``value``, 100000 when ``value`` yields no n-grams,
         otherwise the minimum count over its n-grams.
     """
     if not value:
         return 0

     grams = list(xngrams(value, 2, False))
     if not grams:
         # Sentinel for "nothing to score": large enough never to be the
         # smallest count.
         return 100000
     return min(self.bigram_dict.get(gram, 0) for gram in grams)
示例#3
0
 def get_exceptions(self):
     """Return the distinct 1-grams and 2-grams of the rules' ``after_str``.

     Reads ``after_str`` from every key of ``self.rule2prob``. The 1-grams are
     the individual elements of each ``after_str``; the 2-grams are the
     joined results of ``xngrams(after_str, 2, add_regex=False)``.

     Returns:
         A list, in arbitrary order because it is built from a set, holding
         every distinct 1-gram and 2-gram.
     """
     rule_values = [rule.after_str for rule in self.rule2prob.keys()]
     one_grams = set(itertools.chain.from_iterable(rule_values))
     # Collect the joined 2-grams directly. Passing the joined strings through
     # chain.from_iterable would split each one back into single characters,
     # so no 2-gram would ever reach the result.
     two_grams = {
         "".join(gram)
         for val in rule_values
         for gram in xngrams(val, 2, add_regex=False)
     }
     return list(one_grams | two_grams)
示例#4
0
文件: metal.py 项目: minhptx/spade
def min_ngram_counts(es_query, values):
    """Return, for each value, the smallest count among its character trigrams.

    The distinct trigrams of all values are collected and their counts are
    fetched with one ``es_query.get_char_ngram_counts`` call. Each value is
    then scored by the minimum count over its own trigrams.

    Args:
        es_query: Object exposing ``get_char_ngram_counts(ngrams)``. It is
            given a set of trigram strings and is expected to return one
            count per trigram, in the set's iteration order.
        values: Iterable of values. Each is converted with ``list(value)``
            before being passed to ``xngrams``.

    Returns:
        A list with one entry per value. A value that yields no trigrams
        scores 0 when it is empty and 100000 otherwise, instead of raising
        ``ValueError`` from ``min()`` on an empty sequence.
    """
    # Split every value once and reuse the result for both the lookup set and
    # the per-value minimum. This also makes one-shot iterables work.
    per_value = []
    for value in values:
        chars = list(value)
        grams = ["".join(gram) for gram in xngrams(chars, 3, False)]
        per_value.append((chars, grams))

    trigrams = {gram for _, grams in per_value for gram in grams}
    counts = es_query.get_char_ngram_counts(trigrams)
    # The set is not modified between the query and this zip, so its
    # iteration order is the same in both places.
    value2count = dict(zip(trigrams, counts))

    def _min_count(chars, grams):
        if grams:
            return min(value2count[gram] for gram in grams)
        # No trigrams to score: an empty value gets 0, a value that is merely
        # too short gets a sentinel too large to look rare.
        return 100000 if chars else 0

    return [_min_count(chars, grams) for chars, grams in per_value]
示例#5
0
文件: holo.py 项目: minhptx/spade
    def fit(self, values):
        """Build the frequency counters used to score values.

        Populates four counters on the instance:

        * ``trigram_counter``: joined results of ``xngrams(val, 3)`` over all
          values.
        * ``sym_trigram_counter``: ``str2regex(x, False)`` of each of those
          trigrams.
        * ``val_counter``: the raw values.
        * ``sym_val_counter``: ``str2regex(x, False)`` of each value.

        ``func2counter`` then maps ``val_trigrams``, ``sym_trigrams``,
        ``value_freq`` and ``sym_value_freq`` (defined outside this method)
        to the matching counter.

        Args:
            values: Values to fit on. They are iterated more than once, so a
                one-shot iterator is not suitable.
        """
        joined_trigrams = [
            "".join(gram) for val in values for gram in xngrams(val, 3)
        ]
        self.trigram_counter = Counter(joined_trigrams)
        self.sym_trigram_counter = Counter(
            str2regex(gram, False) for gram in joined_trigrams
        )

        self.val_counter = Counter(values)
        self.sym_val_counter = Counter(str2regex(val, False) for val in values)

        # Lookup table from feature function to the counter it reads.
        self.func2counter = {
            val_trigrams: self.trigram_counter,
            sym_trigrams: self.sym_trigram_counter,
            value_freq: self.val_counter,
            sym_value_freq: self.sym_val_counter,
        }
示例#6
0
def min_char_ngram_counts(es_query, values):
    """Return, for each value, the smallest count among its n-grams.

    The n-grams come from ``xngrams(value, 2, False)``. Those of all values
    are gathered into one list (duplicates kept) and sent to
    ``es_query.get_char_ngram_counts``. The returned counts are paired with
    the n-grams by position and converted with ``int``.

    Args:
        es_query: Object exposing ``get_char_ngram_counts(ngrams)``.
        values: Values to score. They are iterated twice.

    Returns:
        A list with one entry per value: 0 for a falsy value, 100000 for a
        value that yields no n-grams, otherwise the minimum count.
    """
    all_bigrams = []
    for value in values:
        all_bigrams.extend(xngrams(value, 2, False))

    raw_counts = es_query.get_char_ngram_counts(all_bigrams)

    count_by_bigram = {}
    for bigram, raw in zip(all_bigrams, raw_counts):
        count_by_bigram[bigram] = int(raw)

    minima = []
    for value in values:
        if not value:
            minima.append(0)
            continue
        bigrams = list(xngrams(value, 2, False))
        if bigrams:
            minima.append(min(count_by_bigram[bigram] for bigram in bigrams))
        else:
            # Sentinel for "nothing to score": large enough never to be the
            # smallest count.
            minima.append(100000)
    return minima