Example #1
import utils


def load_data(src_lang, tgt_lang, n_eval_ex=20000):
    src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings('data/wiki.%s.vec' % src_lang,
                                                                         n_eval_ex, False)
    tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings('data/wiki.%s.vec' % tgt_lang,
                                                                         n_eval_ex, False)

    s2t_dict = utils.load_dictionary('data/%s-%s.5000-6500.txt' % (src_lang, tgt_lang), src_word2id,
                                     tgt_word2id)
    t2s_dict = utils.load_dictionary('data/%s-%s.5000-6500.txt' % (tgt_lang, src_lang), tgt_word2id,
                                     src_word2id)
    return src_embeddings, tgt_embeddings, s2t_dict, t2s_dict
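
A minimal usage sketch (assumptions: the wiki .vec files and the MUSE-style 5000-6500 dictionary files exist under data/; 'en' and 'es' are placeholder language codes):

src_emb, tgt_emb, s2t_dict, t2s_dict = load_data('en', 'es', n_eval_ex=20000)
print(src_emb.shape, tgt_emb.shape)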
Example #2
import numpy as np
import utils
import params


src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings('data/wiki.%s.vec' % params.src_lang, params.n_eval_ex, False)
tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings('data/wiki.%s.vec' % params.tgt_lang, params.n_eval_ex, False)

print("%s_%s" % (params.src_lang, params.tgt_lang))

TranslatedX = np.load("output/TranslatedX-relu.npy")  # pre-computed source embeddings mapped into the target space

cross_dict = utils.load_dictionary('../data-test/medical/test-dict-only-words.txt', src_word2id, tgt_word2id)
utils.get_word_translation_accuracy(params.src_lang, src_word2id, TranslatedX,
                                    params.tgt_lang, tgt_word2id, tgt_embeddings,
                                    params.method, cross_dict, src_id2word)
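
For context, a minimal sketch of what nearest-neighbour word-translation accuracy amounts to. This is an illustration, not the actual utils.get_word_translation_accuracy (which, in MUSE-style code, also supports CSLS retrieval and multiple reference translations per source word); dico is assumed to be an (n, 2) integer array of (src_id, tgt_id) pairs:

def nn_translation_accuracy(mapped_src, tgt_emb, dico):
    # Cosine similarity via L2-normalized dot products.
    s = mapped_src / np.linalg.norm(mapped_src, axis=1, keepdims=True)
    t = tgt_emb / np.linalg.norm(tgt_emb, axis=1, keepdims=True)
    # For each source query word, retrieve its nearest target word.
    preds = s[dico[:, 0]].dot(t.T).argmax(axis=1)
    return float((preds == dico[:, 1]).mean())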

Example #3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

import numpy as np
import utils
import params

src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.src_lang, params.n_init_ex,
    False)  # n_init_ex = 5000: read only the first 5000 en words
np.save('data/%s_%d' % (params.src_lang, params.n_init_ex), src_embeddings)
tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.tgt_lang, params.n_init_ex,
    False)  # n_init_ex = 5000: read only the first 5000 es words
np.save('data/%s_%d' % (params.tgt_lang, params.n_init_ex), tgt_embeddings)

src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.src_lang, params.n_ft_ex,
    False)  # n_ft_ex = 7500: read only the first 7500 en words
np.save('data/%s_%d' % (params.src_lang, params.n_ft_ex), src_embeddings)
tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.tgt_lang, params.n_ft_ex,
    False)  # n_ft_ex = 7500: read only the first 7500 es words
np.save('data/%s_%d' % (params.tgt_lang, params.n_ft_ex), tgt_embeddings)
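
# Note: the read/save passes above (and the 10000-word pass below) could be
# written as one loop; equivalent sketch, not part of the original:
#
#   for n in (params.n_init_ex, params.n_ft_ex, 10000):
#       for lang in (params.src_lang, params.tgt_lang):
#           _, _, emb = utils.read_txt_embeddings('data/wiki.%s.vec' % lang, n, False)
#           np.save('data/%s_%d' % (lang, n), emb)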

src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.src_lang, 10000,
    False)  # read only the first 10000 en words (truncated call completed following the pattern above)
np.save('data/%s_%d' % (params.src_lang, 10000), src_embeddings)
Example #4
import numpy as np
import utils
import params

src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    'data/full-new-test-dict-only-words-lower-%s-emb.vec' % params.src_lang,
    params.n_eval_ex,
    full_vocab=False)
tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    'data/full-new-test-dict-only-words-lower-%s-emb.vec' % params.tgt_lang,
    params.n_eval_ex,
    full_vocab=False)

to_translate_src_id2word, to_translate_src_word2id, to_translate_src_embeddings = utils.read_txt_embeddings(
    'data/intersection-1-parsed-%s-emb.vec' % params.src_lang,
    params.n_eval_ex, False)
translated_tgt_id2word, translated_tgt_word2id, translated_tgt_embeddings = utils.read_txt_embeddings(
    'data/intersection-1-parsed-%s-emb.vec' % params.tgt_lang,
    params.n_eval_ex, False)

print("%s_%s" % (params.src_lang, params.tgt_lang))

TranslatedX = translated_tgt_embeddings

# Uncomment to compare with the global transformation instead:

# T = np.load("../Non-adversarialTranslation/%s/%s_%s_T.npy" % (params.cp_dir, params.src_lang, params.tgt_lang))
# TranslatedX = to_translate_src_embeddings.dot(np.transpose(T))
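#
# For reference, a sketch of how such a global map T is commonly obtained:
# the orthogonal Procrustes solution over a seed dictionary. X_seed and
# Y_seed are hypothetical row-aligned (n, 300) embedding matrices, not
# variables from this script:
#
#   U, _, Vt = np.linalg.svd(Y_seed.T.dot(X_seed))
#   T = U.dot(Vt)  # orthogonal T minimizing ||X_seed.dot(T.T) - Y_seed||_F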

cross_dict, existing_emb_tgt_translated, existing_emb_tgt_real, existing_src_id2word = utils.load_dictionary(
    'data/new-test-dict-only-words-no-parsed.txt',
    src_word2id, tgt_word2id)  # trailing arguments assumed, following the other load_dictionary calls
Example #5
import argparse

import numpy as np
import sklearn.cluster
from sklearn.preprocessing import normalize  # assumed source of normalize()

import utils
import params

# find_centers, find_clts, separate_eva and utils.multi_ICP are assumed to be
# defined elsewhere in this project.


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster_num',
                        type=int,
                        default=2,
                        help='Number of clusters to divide the embeddings into')
    args = parser.parse_args()
    n_clusters = args.cluster_num

    src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
        'data/wiki.%s.vec' % params.src_lang, params.n_eval_ex,
        False)  # n_eval_ex = 200000
    tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
        'data/wiki.%s.vec' % params.tgt_lang, params.n_eval_ex, False)
    src_normed = normalize(src_embeddings)
    tgt_normed = normalize(tgt_embeddings)
    cross_dict_src2tgt = utils.load_dictionary(
        'data/%s-%s.5000-6500.txt' % (params.src_lang, params.tgt_lang),
        src_word2id, tgt_word2id)
    cross_dict_tgt2src = utils.load_dictionary(
        'data/%s-%s.5000-6500.txt' % (params.tgt_lang, params.src_lang),
        tgt_word2id, src_word2id)

    T = np.load("%s/%s_%s_T.npy" %
                (params.cp_dir, params.src_lang, params.tgt_lang))
    T2 = np.load("%s/%s_%s_T.npy" %
                 (params.cp_dir, params.tgt_lang, params.src_lang))
    TranslatedX = src_embeddings.dot(np.transpose(T))

    src_full = np.load(
        "data/%s_%d.npy" %
        (params.src_lang, params.n_init_ex))  # first n_init_ex en vectors, 300-d
    src_trans = src_full.dot(np.transpose(T))
    tgt_full = np.load("data/%s_%d.npy" %
                       (params.tgt_lang, params.n_init_ex))  # first n_init_ex es vectors, 300-d

    src_trans_normed = normalize(src_trans)
    tgt_full_normed = normalize(tgt_full)

    src_clt = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                     n_init=40,
                                     random_state=200)
    src_y = src_clt.fit_predict(src_trans_normed)
    tgt_y = src_clt.predict(tgt_full_normed)
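    # KMeans is fit on the translated source vectors and then only predicts on
    # the target vectors, so both languages share one set of centroids and the
    # cluster ids are directly comparable across the two spaces.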

    src_centers, tgt_centers, src_dic, tgt_dic = find_centers(
        src_full, tgt_full, src_y, tgt_y, n_clts=n_clusters)
    Translated_centers = src_centers.dot(np.transpose(T))

    trans_c = normalize(Translated_centers)
    tgt_c = normalize(tgt_centers)

    src_classes = find_clts(data=src_embeddings,
                            centers=src_centers,
                            dico=cross_dict_src2tgt)
    tgt_classes = find_clts(data=tgt_embeddings,
                            centers=tgt_centers,
                            dico=cross_dict_tgt2src)

    src_classes_trans = find_clts(data=src_embeddings,
                                  centers=src_centers,
                                  dico=cross_dict_src2tgt,
                                  trans=True)
    src_correct = np.where(src_classes == src_classes_trans)
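    # 1500 presumably corresponds to the number of entries in the 5000-6500
    # test dictionary split.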
    src_acc = src_correct[0].shape[0] / 1500
    tgt_classes_trans = find_clts(data=tgt_embeddings,
                                  centers=tgt_centers,
                                  dico=cross_dict_tgt2src,
                                  trans=True)
    tgt_correct = np.where(tgt_classes == tgt_classes_trans)
    tgt_acc = tgt_correct[0].shape[0] / 1500
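    # The printed accuracies are the fraction of dictionary entries whose
    # cluster assignment is unchanged when recomputed through the translation
    # (trans=True), i.e. how stable the clusters are under the mapping.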
    print(src_acc, tgt_acc)

    TX, TY = utils.multi_ICP(src_full,
                             tgt_full,
                             src_y,
                             tgt_y,
                             src_dic,
                             tgt_dic,
                             n_clusters,
                             time_run_icp=100)
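    # TX and TY presumably hold one transformation per cluster (hence
    # "multi"-ICP), refined via iterative closest point on the matched
    # cluster pairs.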

    result_src = separate_eva(src_embeddings,
                              tgt_embeddings,
                              T,
                              TX,
                              src_classes,
                              dico=cross_dict_src2tgt)
    result_tgt = separate_eva(tgt_embeddings,
                              src_embeddings,
                              T2,
                              TY,
                              tgt_classes,
                              dico=cross_dict_tgt2src)


if __name__ == '__main__':
    main()
Example #6
import numpy as np
import utils
import params
import json


def write_to_json_file(data, file_path):
    """Dump a vocab mapping (id2word / word2id) to a JSON file."""
    with open(file_path, 'w') as outfile:
        json.dump(data, outfile)


src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    params.init_emb_path_src, params.n_init_ex, False)
np.save('data/%s_init' % (params.src_lang), src_embeddings)
write_to_json_file(src_id2word, params.init_src_id2word_path)
write_to_json_file(src_word2id, params.init_src_word2id_path)

tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    params.init_emb_path_tgt, params.n_init_ex, False)
np.save('data/%s_init' % (params.tgt_lang), tgt_embeddings)
write_to_json_file(tgt_id2word, params.init_tgt_id2word_path)
write_to_json_file(tgt_word2id, params.init_tgt_word2id_path)

src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    params.training_emb_path_src, params.n_ft_ex, False)
np.save('data/%s_training' % (params.src_lang), src_embeddings)
write_to_json_file(src_id2word, params.training_src_id2word_path)
write_to_json_file(src_word2id, params.training_src_word2id_path)

tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    params.training_emb_path_tgt, params.n_ft_ex, False)