Пример #1
0
def get_word_translations(emb1, emb2, knn, softmax_temp=30.):
    """
    Given source and target word embeddings, produce the k-best
    translation candidates for every source word.

    Both embedding matrices are L2-normalised row-wise. Every source row is
    then scored against every target row as
    ``2 * dot(src, tgt) - average_dist1[src] - average_dist2[tgt]``, where the
    two average vectors come from `get_nn_avg_dist` (defined outside this
    block).

    Args:
        emb1: 2-D tensor of source embeddings, one row per word.
        emb2: 2-D tensor of target embeddings, one row per word.
        knn: neighbourhood size passed to `get_nn_avg_dist`; also the number
            of candidates kept per source word.
        softmax_temp: factor the top-k scores are multiplied by before the
            softmax that turns them into weights.

    Returns:
        A list with one ``(ids, weights)`` tuple per source word, in source
        row order. ``ids`` holds the row indices into `emb2` of the `knn`
        best-scoring target words and ``weights`` the softmax over their
        scores.
    """
    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # neighbourhood averages used by the CSLS criterion (csls_knn_<knn>)
    average_dist1 = torch.from_numpy(
        get_nn_avg_dist(emb2, emb1, knn)).type_as(emb1)
    average_dist2 = torch.from_numpy(
        get_nn_avg_dist(emb1, emb2, knn)).type_as(emb2)

    top_k_match_ids = []
    step_size = 1000
    n_src = emb1.shape[0]

    # Process the source vocabulary in batches so that the full
    # (n_src x n_tgt) score matrix never has to be materialised.
    for start in range(0, n_src, step_size):
        # Clamp the batch end: the last batch is shorter whenever n_src is
        # not a multiple of step_size, and indexing past the end would fail.
        stop = min(start + step_size, n_src)
        print('Processing word ids %d-%d...' % (start, stop))

        # embeddings of the current batch of source words
        query = emb1[start:stop]

        # CSLS scores of the batch against every target word
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[start:stop][:, None] +
                    average_dist2[None, :])

        # indices and scores of the knn highest-scoring target words
        top_sim_scores, top_match_ids = scores.topk(knn, 1, True)
        top_sim_scores = F.softmax(softmax_temp * top_sim_scores, 1)

        # one (ids, weights) pair per source word of the batch
        top_k_match_ids.extend(zip(top_match_ids, top_sim_scores))

    return top_k_match_ids
Пример #2
0
def _top_matches_chunked(queries,
                         keys,
                         n_best,
                         chunk_size=1000,
                         query_penalty=None,
                         key_penalty=None):
    """
    Return the indices of the `n_best` highest-scoring keys for every query.

    Queries are processed `chunk_size` rows at a time on the device `keys`
    lives on, so the full (n_queries x n_keys) score matrix is never held in
    memory. When both penalty vectors are given, the CSLS score
    ``2 * dot - query_penalty[q] - key_penalty[k]`` is used instead of the
    plain dot product. The result is a CPU tensor with one row per query.
    """
    use_csls = query_penalty is not None and key_penalty is not None
    best = []
    for start in range(0, queries.size(0), chunk_size):
        stop = start + chunk_size
        chunk = queries[start:stop].to(keys.device)
        scores = chunk.mm(keys.transpose(0, 1))
        if use_csls:
            scores.mul_(2)
            scores.sub_(query_penalty[start:stop][:, None].float() +
                        key_penalty[None, :].float())
        best.append(scores.topk(n_best, 1, True)[1].cpu())
    return torch.cat(best, 0)


def get_sent_translation_accuracy(data,
                                  labels,
                                  lg1,
                                  word2id1,
                                  emb1,
                                  lg2,
                                  word2id2,
                                  emb2,
                                  method,
                                  idf,
                                  test,
                                  device=2):
    """
    Given parallel sentences from Europarl, evaluate the
    sentence translation accuracy using the precision@k.

    Sentences of `lg1` are the queries and sentences of `lg2` the keys; the
    query at position i is counted as correct when key i is among its k best
    matches.

    Args:
        data: mapping from language code to its sentences.
        labels: not used by this function.
        lg1, lg2: query / key language codes (keys into `data` and `idf`).
        word2id1, word2id2: word -> embedding row index for each language.
        emb1, emb2: embedding tensors for `lg1` / `lg2`.
        method: ``'nn'`` or ``'csls_knn_<k>'``.
        idf: mapping from language code to the idf dictionary handed to
            `bow_idf`.
        test: when true, precision is not computed and `results` is empty.
        device: device the key vectors are moved to for scoring.

    Returns:
        ``(predictions, results)``: the index of the best-matching key for
        every query, and a list of ``('sent-precision_at_<k>', value)`` pairs
        for k in 1, 5, 10.

    Raises:
        Exception: if `method` is not one of the supported methods.

    Side effects:
        Pickles the output of `top2_scores` / `top2_scores_csls` to the file
        ``fr-en.sample.scores`` in the working directory.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = {w: emb1[word2id1[w]] for w in word2id1}
    word_vec2 = {w: emb2[word2id2[w]] for w in word2id2}
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # every target sentence is a key; key i is the gold match of query i
    tgt_keys = torch.arange(len(data[lg2]))
    _, keys = bow_idf(data[lg_keys],
                      word_vect[lg_keys],
                      idf_dict=idf[lg_keys])

    # every source sentence is a query
    queries = [data[lg_query][i] for i in range(len(data[lg1]))]
    _, queries = bow_idf(queries,
                         word_vect[lg_query],
                         idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)
    keys = keys.to(device)

    # nearest neighbors
    if method == 'nn':
        top2 = top2_scores(queries, keys, 1500, device=device)
        average_dist_queries = average_dist_keys = None

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        # average distances to k nearest neighbors
        average_dist_keys = torch.from_numpy(
            get_nn_avg_dist(queries, keys, knn)).to(device)
        average_dist_queries = torch.from_numpy(
            get_nn_avg_dist(keys, queries, knn)).to(device)
        top2 = top2_scores_csls(queries,
                                keys,
                                1000,
                                average_dist_keys,
                                average_dist_queries,
                                device=device)

    else:
        raise Exception('Unknown method: "%s"' % method)

    # close the dump file deterministically instead of leaking the handle
    with open('fr-en.sample.scores', 'wb') as dump_file:
        pickle.dump(top2, dump_file)

    # The full score matrix used to be built here and is no longer
    # available, so the 10 best keys per query are computed chunk by chunk.
    top_matches = _top_matches_chunked(queries,
                                       keys,
                                       10,
                                       query_penalty=average_dist_queries,
                                       key_penalty=average_dist_keys)
    predictions = top_matches[:, 0]

    results = []
    if not test:
        for k in [1, 5, 10]:
            top_k_matches = (top_matches[:, :k] == tgt_keys[:, None]).sum(1)
            precision_at_k = 100 * top_k_matches.float().numpy().mean()
            logger.info("%i queries (%s) - %s - Precision at k = %i: %f" %
                        (len(top_k_matches), lg_query.upper(), method, k,
                         precision_at_k))
            results.append(('sent-precision_at_%i' % k, precision_at_k))

    return predictions, results
Пример #3
0
def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2, emb2,
                                  n_keys, n_queries, method, idf):

    """
    Given parallel sentences from Europarl, evaluate the
    sentence translation accuracy using the precision@k.

    The first `n_keys` sentences of `lg2` are the retrieval candidates
    (keys). `n_queries` of the corresponding `lg1` sentences are drawn
    without replacement with a fixed seed and used as queries; a query is
    counted as correct when the key with the same index is among its k best
    matches.

    Args:
        data: mapping from language code to its sentences; must support
            slicing and indexing with an integer array.
        lg1, lg2: query / key language codes (keys into `data` and `idf`).
        word2id1, word2id2: word -> embedding row index for each language.
        emb1, emb2: embedding tensors for `lg1` / `lg2`.
        n_keys: number of candidate sentences.
        n_queries: number of query sentences (at most `n_keys`).
        method: ``'nn'``, ``'invsm_beta_<beta>'`` or ``'csls_knn_<k>'``.
        idf: mapping from language code to the idf dictionary handed to
            `bow_idf`.

    Returns:
        A list of ``('sent-precision_at_<k>', value)`` pairs for k in 1, 5, 10.

    Raises:
        Exception: if `method` is not one of the supported methods.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = {w: emb1[word2id1[w]] for w in word2id1}
    word_vec2 = {w: emb2[word2id2[w]] for w in word2id2}
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # get n_keys pairs of sentences
    keys = data[lg_keys][:n_keys]
    keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    # get n_queries query pairs from these n_keys pairs
    # (fixed seed so that the evaluated subset is reproducible)
    rng = np.random.RandomState(1234)
    idx_query = rng.choice(range(n_keys), size=n_queries, replace=False)
    queries = data[lg_query][idx_query]
    queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)

    # nearest neighbors
    if method == 'nn':
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(beta).exp_()
        # normalise over the queries (dimension 0) for every key
        scores.div_(scores.sum(0, keepdim=True).expand_as(scores))

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        # average distances to k nearest neighbors
        average_dist_keys = torch.from_numpy(get_nn_avg_dist(queries, keys, knn))
        average_dist_queries = torch.from_numpy(get_nn_avg_dist(keys, queries, knn))
        # scores
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() + average_dist_keys[None, :].float())

    else:
        # fail with a clear message instead of a NameError on `scores` below
        raise Exception('Unknown method: "%s"' % method)

    scores = scores.cpu()

    results = []
    # never ask topk for more candidates than there are keys
    n_best = min(10, scores.size(1))
    top_matches = scores.topk(n_best, 1, True)[1]
    gold = torch.from_numpy(idx_query)[:, None]
    for k in [1, 5, 10]:
        top_k_matches = (top_matches[:, :k] == gold).sum(1)
        # convert to numpy before averaging; np.mean on a torch tensor is
        # not reliable across versions
        precision_at_k = 100 * top_k_matches.float().numpy().mean()
        logger.info("%i queries (%s) - %s - Precision at k = %i: %f" %
                    (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
        results.append(('sent-precision_at_%i' % k, precision_at_k))

    return results
Пример #4
0
def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2, emb2,
                                  n_keys, n_queries, method, idf):

    """
    Given parallel sentences from Europarl, evaluate the
    sentence translation accuracy using the precision@k.

    The first `n_keys` sentences of `lg2` are the retrieval candidates
    (keys). `n_queries` of the corresponding `lg1` sentences are drawn
    without replacement with a fixed seed and used as queries; a query is
    counted as correct when the key with the same index is among its k best
    matches.

    Args:
        data: mapping from language code to its sentences; must support
            slicing and indexing with an integer array.
        lg1, lg2: query / key language codes (keys into `data` and `idf`).
        word2id1, word2id2: word -> embedding row index for each language.
        emb1, emb2: embedding tensors for `lg1` / `lg2`.
        n_keys: number of candidate sentences.
        n_queries: number of query sentences (at most `n_keys`).
        method: ``'nn'``, ``'invsm_beta_<beta>'`` or ``'csls_knn_<k>'``.
        idf: mapping from language code to the idf dictionary handed to
            `bow_idf`.

    Returns:
        A list of ``('sent-precision_at_<k>', value)`` pairs for k in 1, 5, 10.

    Raises:
        Exception: if `method` is not one of the supported methods.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = {w: emb1[word2id1[w]] for w in word2id1}
    word_vec2 = {w: emb2[word2id2[w]] for w in word2id2}
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # get n_keys pairs of sentences
    keys = data[lg_keys][:n_keys]
    keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    # get n_queries query pairs from these n_keys pairs
    # (fixed seed so that the evaluated subset is reproducible)
    rng = np.random.RandomState(1234)
    idx_query = rng.choice(range(n_keys), size=n_queries, replace=False)
    queries = data[lg_query][idx_query]
    queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)

    # nearest neighbors
    if method == 'nn':
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(beta).exp_()
        # normalise over the queries (dimension 0) for every key
        scores.div_(scores.sum(0, keepdim=True).expand_as(scores))

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        # average distances to k nearest neighbors
        average_dist_keys = torch.from_numpy(get_nn_avg_dist(queries, keys, knn))
        average_dist_queries = torch.from_numpy(get_nn_avg_dist(keys, queries, knn))
        # scores
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() + average_dist_keys[None, :].float())

    else:
        # fail with a clear message instead of a NameError on `scores` below
        raise Exception('Unknown method: "%s"' % method)

    scores = scores.cpu()

    results = []
    # never ask topk for more candidates than there are keys
    n_best = min(10, scores.size(1))
    top_matches = scores.topk(n_best, 1, True)[1]
    gold = torch.from_numpy(idx_query)[:, None]
    for k in [1, 5, 10]:
        top_k_matches = (top_matches[:, :k] == gold).sum(1)
        # convert to numpy before averaging; np.mean on a torch tensor is
        # not reliable across versions
        precision_at_k = 100 * top_k_matches.float().numpy().mean()
        logger.info("%i queries (%s) - %s - Precision at k = %i: %f" %
                    (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
        results.append(('sent-precision_at_%i' % k, precision_at_k))

    return results
Пример #5
0
def get_word_translation_accuracy(dico, word2id1, emb1, word2id2, emb2,
                                  method):
    """
    Given source and target word embeddings, and a dictionary,
    evaluate the translation accuracy using the precision@k.

    A source word counts as correctly translated at k when at least one of
    its dictionary translations is among its k best-scoring target words, so
    precision is averaged over distinct source words, not dictionary pairs.

    Args:
        dico: 2-D integer tensor of (source id, target id) pairs; a source
            id may appear on several rows.
        word2id1, word2id2: not used by this function.
        emb1, emb2: 2-D source / target embedding tensors.
        method: ``'nn'``, ``'invsm_beta_<beta>'`` or ``'csls_knn_<k>'``.

    Returns:
        A list of ``('precision_at_<k>', value)`` pairs for k in 1, 5, 10.

    Raises:
        Exception: if `method` is not one of the supported methods.
    """
    dico = dico.cuda() if emb1.is_cuda else dico

    assert dico[:, 0].max() < emb1.size(0)
    assert dico[:, 1].max() < emb2.size(0)

    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # nearest neighbors
    if method == 'nn':
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        bs = 128
        word_scores = []
        # score the target vocabulary in slices of `bs` words
        for i in range(0, emb2.size(0), bs):
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            # normalise over all source words for every target word
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
            word_scores.append(scores.index_select(0, dico[:, 0]))
        scores = torch.cat(word_scores, 1)

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist1 = get_nn_avg_dist(emb2, emb1, knn)
        average_dist2 = get_nn_avg_dist(emb1, emb2, knn)
        average_dist1 = torch.from_numpy(average_dist1).type_as(emb1)
        average_dist2 = torch.from_numpy(average_dist2).type_as(emb2)
        # queries / scores
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[dico[:, 0]][:, None] +
                    average_dist2[None, :])

    else:
        raise Exception('Unknown method: "%s"' % method)

    results = []
    # never ask topk for more candidates than there are target words
    n_best = min(100, scores.size(1))
    top_matches = scores.topk(n_best, 1, True)[1]

    # Plain Python ints: tensor elements hash by object identity, so using
    # them as dict keys would never merge repeated source words.
    src_ids = dico[:, 0].cpu().numpy().tolist()
    gold = dico[:, 1][:, None]

    for k in [1, 5, 10]:
        top_k_matches = top_matches[:, :k]
        hits = (top_k_matches == gold.expand_as(top_k_matches)).sum(1)
        hits = hits.cpu().numpy().tolist()
        # allow for multiple possible translations
        matching = {}
        for src_id, hit in zip(src_ids, hits):
            matching[src_id] = min(matching.get(src_id, 0) + hit, 1)
        # evaluate precision@k
        precision_at_k = 100 * np.mean(list(matching.values()))
        print("%i source words - %s - Precision at k = %i: %f" %
              (len(matching), method, k, precision_at_k))
        results.append(('precision_at_%i' % k, precision_at_k))

    return results
Пример #6
0
    default="_",
    help=
    "Replace phrase word delimiter by empty space (empty string to disable)")
# Parse the command-line options.
params = parser.parse_args()

# Load the source and target embedding tables and report their sizes.
print("Loading embeddings ...")
src_dico, src_emb = load_embeddings(params, source=True)
tgt_dico, tgt_emb = load_embeddings(params, source=False)
n_src, n_tgt = src_emb.size(0), tgt_emb.size(0)
print("Loaded %i / %i source / target embeddings." % (n_src, n_tgt))

# The neighbourhood averages are only needed when CSLS is requested;
# otherwise both stay None.
print("Computing average distance ...")
if params.csls:
    src_avg_dist = get_nn_avg_dist(emb=tgt_emb, query=src_emb, knn=10)
    tgt_avg_dist = get_nn_avg_dist(emb=src_emb, query=tgt_emb, knn=10)
else:
    src_avg_dist = None
    tgt_avg_dist = None

# Source-to-target candidates are always generated; the reverse direction
# only when inverse scores were asked for.
print("Generating translations ...")
s2t_translations = get_translations(
    src_emb, tgt_emb, src_avg_dist, tgt_avg_dist, N_TRANSLATE)
if params.inverse_score:
    t2s_translations = get_translations(
        tgt_emb, src_emb, tgt_avg_dist, src_avg_dist, N_TRANSLATE)

# Score the source-to-target candidates.
print("Generating scores ...")
s2t_scores = get_s2t_scores(
    src_emb, tgt_emb, s2t_translations, params.temperature)