Example #1
def score_by_embedding_model(lex,
                             embedding_model,
                             candidates,
                             lex_sim_weight=0.8,
                             n_cands=-1):
    '''Described in "Exploring Word Embeddings for Unsupervised Textual User-Generated
    Content Normalization", Bertaglia and Nunes (2016).

    Args:
        lex (dict): The lexicon dictionary.
        embedding_model (obj): The embedding model in word2vec format. Must be readable by gensim.
        candidates (dict(str: list(str))): Candidate corrections for each noisy word.
        lex_sim_weight (float): Weight given to the lexical similarity.
        n_cands (int): Number of candidates to be returned per word (-1 returns all).

    Returns:
        dict(str: list(str)): Top ``n_cands`` scored corrections for each word.
    '''
    scored_candidates = {}

    for word in candidates:
        cands_list = []
        for cand in candidates[word]:
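            # Interpolated score: lexical similarity weighted against embedding cosine similarity.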
            similarity = (lex_sim_weight * metrics.hassan_similarity(word, cand)) + \
                ((1 - lex_sim_weight) * embedding_model.similarity(word, cand))
            cands_list.append((cand, similarity))
        scored_candidates[word] = sorted(cands_list,
                                         key=lambda x: x[1],
                                         reverse=True)
    if n_cands == -1:
        return scored_candidates
    # Keep only the top ``n_cands`` corrections for each word.
    return {w: cands[:n_cands] for w, cands in scored_candidates.items()}
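A minimal usage sketch (illustrative only, not from the original source): the embedding path, the toy lexicon, and the candidate lists are assumptions, the model can be any gensim ``KeyedVectors`` whose vocabulary covers the words involved, and the function above also relies on a module-level ``metrics`` import not shown on this page.

from gensim.models import KeyedVectors

# Hypothetical inputs: the file name, lexicon, and candidates are illustrative only.
embeddings = KeyedVectors.load_word2vec_format("embeddings.txt", binary=False)
lexicon = {"voce": 1, "nao": 1}                  # known-correct words
candidates = {"vc": ["voce"], "naum": ["nao"]}   # noisy word -> candidate corrections

scored = score_by_embedding_model(lexicon, embeddings, candidates, n_cands=1)
# scored maps each noisy word to its top (correction, score) pair, where
# score = 0.8 * lexical similarity + 0.2 * embedding cosine similarity by default.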
Example #2
def generate_and_score(lex,
                       embedding_model,
                       k=25,
                       lex_sim_weight=0.8,
                       dump_pickle=True):
    """Described in "Exploring Word Embeddings for Unsupervised Textual User-Generated
    Content Normalization, Bertaglia and Nunes(2016)"

    Args:
        lex (dict): The lexicon dictionary.
        embedding_model (obj): The embedding model in word2vec format. Must be readable by gensim.
        k (int): Number of nearest neighbours to evaluate (all experiments ran with k=25).
        lex_sim_weight (float): Weight given to the lexical similarity.
        dump_pickle (boolean): Whether to dump the learnt normalization lexicon to a pickle file.

    Returns:
        dict(str: list(str)): A list of scored possible corrections for each word.
    """
    corrs = {}

    # For each in-lexicon word, keep its k nearest neighbours that fall outside the lexicon.
    cands = {
        word: [
            sims[0] for sims in embedding_model.most_similar(word, topn=k)
            if sims[0] not in lex
        ]
        for word in lex if word in embedding_model
    }
    for word in cands:
        cands_list = []
        for c in cands[word]:
            if c not in corrs:
                corrs[c] = []
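            # Same interpolation of lexical and embedding similarity as in score_by_embedding_model.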
            similarity = (
                lex_sim_weight * metrics.hassan_similarity(word, c)) + (
                    (1 - lex_sim_weight) * embedding_model.similarity(word, c))
            corrs[c].append((word, similarity))
    # Expansion step (read the paper for more details):
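    # Collect vocabulary words that are neither in the lexicon nor already covered by corrs.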
    v = {w: 0 for w in list(embedding_model.vocab.keys())}
    noisy_words = {w: 0 for w in v if w not in corrs and w not in lex}
    for w in noisy_words:
        ed_cands = baselines.generate_by_similarity_metric(lex=lex, word=w)
        scored_cands = candidate_scoring.baselines.score_by_similarity_metrics(
            lex=lex,
            candidates=ed_cands,
            metrics=[metrics.hassan_similarity],
            n_cands=1,
            reverse=True,
        )
        if scored_cands[1]:
            corrs[w] = [scored_cands[1][0]]
    # Sorting the list by score
    for c in corrs:
        corrs[c] = sorted(corrs[c], key=lambda x: x[1], reverse=True)
    if dump_pickle:
        with open("norm_lexicon.pickle", "wb") as pickle_file:
            pickle.dump(corrs, pickle_file)
    return corrs
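A minimal usage sketch (illustrative; the embedding path and the toy lexicon are assumptions, and ``metrics``, ``baselines``, ``candidate_scoring`` and ``pickle`` are module-level imports the snippet above relies on). Note that the snippet's use of ``embedding_model.vocab`` assumes a gensim release older than 4.0.

from gensim.models import KeyedVectors

# Hypothetical inputs: the file name and the lexicon are illustrative only.
embeddings = KeyedVectors.load_word2vec_format("tweets_embeddings.txt", binary=False)
lexicon = {w: 1 for w in ["que", "isso", "casa", "nao", "voce"]}

# Learns a normalization lexicon mapping each out-of-lexicon (noisy) word to a
# score-sorted list of (correction, score) pairs; also dumped to norm_lexicon.pickle.
norm_lexicon = generate_and_score(lexicon, embeddings, k=25, lex_sim_weight=0.8)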
Example #3
def test_hassan_similarity():
    assert metrics.hassan_similarity("casa", "caza") == 0.75
    assert metrics.hassan_similarity("qq eh isso",
                                     "o que é isso") == (7 / 12) / 5
    assert metrics.hassan_similarity("abc def zzz", "abc def zzz") == 1.0