示例#1
0
def calculate_score(phrases: Phrases, worda: str, wordb: str) -> str:
    """
    Score the candidate bigram ``worda_wordb`` against a Phrases model and report it.

    Only for Phrases, not applicable for Phraser.

    :param phrases: model whose ``vocab``, ``min_count`` and ``corpus_word_count`` are read
    :param worda: first word of the pair
    :param wordb: second word of the pair
    :return: the report line that is also printed, formatted as
        ``"[worda, wordb]: score"``; the score shown is -1 when scoring raised
    """
    # Sentinel reported when the score cannot be computed.
    score = -1
    try:
        vocab = phrases.vocab
        # The vocabulary is indexed with UTF-8 encoded bytes; the bigram key is
        # the two words joined by an underscore.
        count_a = vocab[worda.encode('utf-8')]
        count_b = vocab[wordb.encode('utf-8')]
        count_ab = vocab[(worda + '_' + wordb).encode('utf-8')]
        score = original_scorer(count_a, count_b, count_ab, len(vocab),
                                phrases.min_count, phrases.corpus_word_count)
    except Exception as e:
        # Deliberate best-effort: report the failure and fall through with the
        # sentinel score instead of propagating the error.
        print(str(e))
    report = "[%s, %s]: %f" % (worda, wordb, score)
    print(report)
    # The signature promises a str; previously the function implicitly
    # returned None.
    return report
示例#2
0
# Scoring inputs taken from the bigram model, converted to float once up front.
len_vocab = float(len(vocab2))
min_count = float(bigram.min_count)
corpus_word_count = float(bigram.corpus_word_count)
print('-----> Table of bigrams')
# Visit the keys of bi_vocab from largest to smallest; each key is passed to
# the scorer as the bigram count for every entry stored under it.
for c in sorted(bi_vocab, reverse=True):
    for val in bi_vocab[c]:
        decoded = val.decode(decode_format)
        # Skip entries that contain no alphabetic character at all.
        if not any(ch.isalpha() for ch in decoded):
            continue

        # Split on the first underscore only, yielding the two component keys.
        worda, wordb = val.split(b'_', 1)
        s1 = vocab2[worda]
        s2 = vocab2[wordb]
        # Both components need a positive count to be scored.
        if not (s1 > 0 and s2 > 0):
            continue

        score = original_scorer(
            worda_count=float(s1),
            wordb_count=float(s2),
            bigram_count=float(c),
            len_vocab=len_vocab,
            min_count=min_count,
            corpus_word_count=corpus_word_count,
        )
        # Only entries with a count above 100 are echoed; all scored entries
        # are recorded in grams.
        if c > 100:
            print(f"{decoded:10} {c:5} \t{score:.4}")
        grams[decoded] = (c, score)

# Bucket the trigram model's vocabulary keys by their stored value, keeping
# only keys that contain exactly two underscores.
vocab3 = trigram.vocab
tri_vocab = defaultdict(list)
for p, c in vocab3.items():
    if p.count(b'_') != 2:
        continue
    tri_vocab[c].append(p)

# Rebind the scoring inputs to the trigram model's values.
len_vocab = float(len(vocab3))
min_count = float(trigram.min_count)
corpus_word_count = float(trigram.corpus_word_count)
示例#3
0
def default_scorer(worda_count, wordb_count, bigram_count, min_count):
    """Score a bigram by delegating to ``original_scorer`` with fixed trailing arguments.

    The three counts are forwarded unchanged; the remaining three positional
    arguments of ``original_scorer`` are always the constants below.

    NOTE(review): ``min_count`` is accepted but never forwarded -- a fixed 15
    is passed instead. Confirm this is intentional.

    :param worda_count: forwarded as the first argument
    :param wordb_count: forwarded as the second argument
    :param bigram_count: forwarded as the third argument
    :param min_count: unused by this function
    :return: whatever ``original_scorer`` returns
    """
    # Presumably vocabulary size, minimum count and corpus word count, in that
    # order -- verify against original_scorer's signature.
    fixed_fourth_arg = 59543584
    fixed_fifth_arg = 15
    fixed_sixth_arg = 0
    return original_scorer(
        worda_count,
        wordb_count,
        bigram_count,
        fixed_fourth_arg,
        fixed_fifth_arg,
        fixed_sixth_arg,
    )