Example #1
    def main(self):
        self.model = {}
        # Load the fastText vectors for each language
        self.model['eng'] = FastVector(
            vector_file='/Users/arai9814/model/wiki.en.vec')
        self.model['jpn'] = FastVector(
            vector_file='/Users/arai9814/model/wiki.ja.vec')
        self.model['fra'] = FastVector(
            vector_file='/Users/arai9814/model/wiki.fr.vec')
        # Transform the multilingual vectors into the same vector space
        self.model['eng'].apply_transform('alignment_matrices/en.txt')
        self.model['jpn'].apply_transform('alignment_matrices/ja.txt')
        self.model['fra'].apply_transform('alignment_matrices/fr.txt')

        ver = wn.get_version()
        print("RESOURCE: WN " + str(ver) + "\n")
        print("LANGUAGE: " + str(self.langs) + "\n")
        print("VECTORS: " + self.folder + "\n")
        print("TARGET: " + self.folder + "\n")

        self.extractWordsAndSynsets(self.folder + "words.txt",
                                    self.folder + "synsets.txt",
                                    self.folder + "lemmas.txt")
        self.extractSynsetRelations(self.folder + "hypernym.txt", '@')
        self.extractSynsetRelations(self.folder + "similar.txt", '&')
        self.extractSynsetRelations(self.folder + "verbGroup.txt", '$')
        self.extractSynsetRelations(self.folder + "antonym.txt", '!')

        print("DONE")
Example #2
def main():
    ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
    en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
    print("loaded the dictionaries")

    ja_dic.apply_transform('alignment_matrices/ja.txt')
    en_dic.apply_transform('alignment_matrices/en.txt')
    print("transformed the dictionaries")

    idx = 0
    result = {}
    result_f = open("en_ja_multifast.txt", "w")
    en_word_list = list(en_dic.word2id.keys())
    print("The total length of English pretrained vector : " +
          str(len(en_word_list)))

    for en_word in tqdm(en_word_list):
        ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=15)
        result[en_word] = ja_words
        idx += 1
        result_str = ",".join(result[en_word])
        result_f.write(str(idx) + "," + en_word + "," + result_str + "\n")
        if idx > 5000:
            break  # stop after 5000 entries; break (not exit) so the file is closed below

    result_f.close()
Example #3
def ground_truth(en_sent, fr_sent):
    """
    Function that extracts the ground truth for a pair of sentences in english and french
    :param en_sent: The the sentence in english
    :param fr_sent: The sentence in french
    :return:
    """
    # keys = set(fr_sent)

    # score matrix
    score = np.empty([len(en_sent), len(fr_sent)], dtype=np.float32)

    # label
    truth = np.zeros([len(en_sent), len(fr_sent)], dtype=np.float32)

    # fill the score matrix with the cosine similarity of every (English, French) word pair
    for j in range(len(en_sent)):
        for k in range(len(fr_sent)):
            score[j, k] = FastVector.cosine_similarity(en_dict[en_sent[j]],
                                                       fr_dict[fr_sent[k]])

    # we find the ground truth. We randomize access to break ties randomly
    for j in range(len(en_sent)):
        argmax = int(score[j].argmax())
        truth[j, argmax] = 1.

    return truth.reshape(-1)
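
A hypothetical usage sketch (the token lists below are invented; en_dict and fr_dict are assumed to be the aligned FastVector objects this function reads from module scope):

# Hypothetical usage of ground_truth (toy tokenized sentences, not from the source).
en_sent = ["the", "cat", "sleeps"]
fr_sent = ["le", "chat", "dort"]
labels = ground_truth(en_sent, fr_sent)
# labels is a flat vector of length len(en_sent) * len(fr_sent) (here 9),
# with one 1.0 per English token marking its most similar French token.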
Example #4
def load_data():
    global skt_dictionary, wdict_skt, skt_words, skt_stop
    print("Loading data")
    skt_dictionary = FastVector(vector_file='../data/skt_vectors.vec')
    wdict_skt = read_weight_dictionary("../data/word_count_skt.txt")
    skt_words = set(skt_dictionary.word2id.keys())
    skt_stop = read_stopwords(
        "/home/basti/deeplearning/bilingual/skt2tib/data/skt_stop.txt")
Example #5
def loadfasttextmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    #['en','es','zh','hr','de','fa','ar','fr']['es','en','de']
    for lng in ['en', 'es', 'de', 'fa', 'ar', 'fr']:
        w2v[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
        w2v[lng].apply_transform(filename + 'alignment_matrices/' + lng + '.txt')

    return w2v
Example #6
File: align.py  Project: leondz/bornholmsk
def cached_load_vecs(filename):
    if os.path.isfile(filename + '.pickle'):
        return pickle.load(open(filename + '.pickle', 'rb'))
    else:
        print(' slow read for', filename)
        vecs = FastVector(vector_file=filename)
        print(' caching pickle for', filename)
        try:
            pickle.dump(vecs, open(filename + '.pickle', 'wb'))
        except:
            print(' ..failed')
        return vecs
Example #7
def loadmultilingualw2vmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    w2v['fr'] = FastVector(vector_file=filename + 'wiki.fr.vec')
    w2v['fr'].apply_transform(filename + 'alignment_matrices/fr.txt')

    w2v['en'] = FastVector(vector_file=filename + 'wiki.en.vec')
    w2v['en'].apply_transform(filename + 'alignment_matrices/en.txt')

    w2v['es'] = FastVector(vector_file=filename + 'wiki.es.vec')
    w2v['es'].apply_transform(filename + 'alignment_matrices/es.txt')

    w2v['zh'] = FastVector(vector_file=filename + 'wiki.zh.vec')
    w2v['zh'].apply_transform(filename + 'alignment_matrices/zh.txt')

    w2v['hr'] = FastVector(vector_file=filename + 'wiki.hr.vec')
    w2v['hr'].apply_transform(filename + 'alignment_matrices/hr.txt')

    w2v['de'] = FastVector(vector_file=filename + 'wiki.de.vec')
    w2v['de'].apply_transform(filename + 'alignment_matrices/de.txt')

    #en_vector = w2v['en']["cat"]
    #es_vector = w2v['es']["gato"]
    #print(FastVector.cosine_similarity(es_vector, en_vector))

    return w2v
Example #8
def loadfasttextmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    #['en','es','zh','hr','de','fa','ar','fr']
    for lng in ['es', 'en', 'de']:
        w2v[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
        w2v[lng].apply_transform(filename + 'alignment_matrices/' + lng +
                                 '.txt')

    #en_vector = w2v['en']["cat"]
    #es_vector = w2v['es']["gato"]
    #print(FastVector.cosine_similarity(es_vector, en_vector))
    return w2v
Example #9
def IRI_encoder(IRIs):
    # loading word vectors
    vec = FastVector(vector_file='thin_2018-11-23_d100_e5.bin.vec')
    wordvector = []
    for IRI in IRIs:
        if IRI in vec.word2id.keys():
            idx = vec.word2id[IRI]
            wordvector.append(vec.embed[idx])
        else:
            print(IRI)
            wordvector.append([0] * 100)
    wordvector = np.array(wordvector)
    return wordvector
Example #10
def main2():
    for zzz in LANGUAGE_LIST:
        lang = zzz[0]
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
Example #11
def main():
    # first get the English one
    lang = "en"
    system(
        "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
        % (OUT_DIR, lang, lang),
        pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            fout = zopen(out_fname, "w")
            for fname in fnames:
                last_name = fname.split("-")[-1].lower()
                path_name = "%s/%s/%s_%s-ud-%s.conllu" % (UD2_DIR, fname, lang,
                                                          last_name, curf)
                if os.path.exists(path_name):
                    with zopen(path_name) as fin:
                        deal_conll_file(fin, fout)
            fout.close()
            # stat
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system(
                "cat %s | grep -Ev '^$' | cut -f 5 -d $'\t'| grep -Ev 'PUNCT|SYM' | wc"
                % out_fname,
                pp=True)
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
Example #12
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


# copy embedding files from https://fasttext.cc/docs/en/crawl-vectors.html#models
en_dictionary = FastVector(vector_file='cc.en.300.vec')
zh_dictionary = FastVector(vector_file='cc.zh.300.vec')

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# going to print 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary,
Example #13
def prepare_data(data_raw, labels_raw, params, data_path):
    # get embeddings, prepare data
    print("building dictionary")
    data_dict = Dictionary(data_raw, labels_raw, params.vocab_drop)
    save_data(data_dict.sentences,
              "./trained_embeddings_" + params.name + "/sentences_mod.pickle",
              os.path.join(data_path, 'data_mod.txt'))
    save_data(data_dict.labels,
              "./trained_embeddings_" + params.name + "/labels_mod.pickle",
              os.path.join(data_path, 'labels_mod.txt'))

    sizes = data_dict.sizes
    b1 = sizes[0]
    b2 = sizes[0] + sizes[1]
    b3 = sizes[0] + sizes[1] + sizes[2]

    model_path = "./trained_embeddings_" + params.name
    filename = os.path.join(model_path, "embedding_file.pkl")

    if os.path.exists(filename):
        with open(filename, 'rb') as rf:
            embed_arr = pickle.load(rf)

    else:
        hi_align_dictionary = FastVector(
            vector_file='/home/bidisha/sharmila/wiki.hi.align.vec')
        en_align_dictionary = FastVector(
            vector_file='/home/bidisha/sharmila/wiki.en.align.vec')
        print("loaded the files..")

        embed_arr = np.zeros([data_dict.vocab_size, params.embed_size])
        for i in range(embed_arr.shape[0]):
            print(i)
            if i == 0:
                continue
            elif (i > 0 and i < b1):
                try:
                    embed_arr[i] = en_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "english")
                except:
                    pass
                try:
                    embed_arr[i] = hi_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "hindi")
                except:
                    embed_arr[i] = hi_align_dictionary["unk"]
                    print(str(i), "unk")

            elif (i >= b1 and i < b2):
                try:
                    embed_arr[i] = en_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "english")
                except:
                    embed_arr[i] = hi_align_dictionary["unk"]
                    print(str(i), "unk")

            elif (i >= b2 and i < b3):
                try:
                    embed_arr[i] = hi_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "hindi")
                except:
                    embed_arr[i] = hi_align_dictionary["unk"]
                    print(str(i), "unk")

        print("Embedding created")
        if not os.path.exists(model_path):
            os.makedirs(model_path)

        with open(filename, 'wb') as wf:
            pickle.dump(embed_arr, wf)

    # if params.pre_trained_embed:
    #     w2_vec = train_w2vec(params.input, params.embed_size,
    #                         w2vec_it=5,
    #                         sentences=data_dict.sentences,
    #                         model_path="./trained_embeddings_"+params.name)
    #     embed_arr = np.zeros([data_dict.vocab_size, params.embed_size])
    #     for i in range(embed_arr.shape[0]):
    #         if i == 0:
    #             continue
    #         try:
    #             embed_arr[i] = w2_vec.word_vec(unicode(data_dict.idx2word[i], "utf-8"))
    #             # print(data_dict.idx2word[i])

    #         except:
    #             ax=2
    #             # embed_arr[i] = w2_vec.word_vec('<unk>')
    data = [[data_dict.word2idx[word] \
             for word in sent[:-1]] for sent in data_dict.sentences \
            if len(sent) < params.sent_max_size - 2]

    encoder_data = [[data_dict.word2idx[word] \
                   for word in sent[1:]] for sent in data_dict.sentences \
                  if len(sent) < params.sent_max_size - 2]

    decoder_labels = []
    for sent in data_dict.sentences:
        a = []
        for word in sent[1:]:
            index = data_dict.word2idx[word]
            if (index >= b1 and index < b2):
                a.append(index - b1)
            elif (index >= b2):
                a.append(index - b2)
            else:
                a.append(index)

        decoder_labels.append(a)

    # for i in range(5):
    #     print(encoder_data[i])
    #     print(decoder_labels[i])
    #     print("------------------")

    # exit()
    filename = os.path.join(model_path, "data_dict.pkl")
    with open(filename, 'wb') as wf:
        pickle.dump(data_dict, wf)

    print("----Corpus_Information--- \n "
          "Raw data size: {} sentences \n Vocabulary size {}"
          "\n Limited data size {} sentences \n".format(
              len(data_raw), data_dict.vocab_size, len(data)))
    return data, encoder_data, decoder_labels, embed_arr, data_dict
Example #14
embeddingsmodel0 = loadtransfasttextmodel('Path To Vectors')
vecten = []
lng = 'en'
for word in embeddingsmodel0[lng].id2word:
    vecten.append(embeddingsmodel0[lng][word])

#.reshape(-1,300)[0]
vectes = []
lng = 'es'
for word in embeddingsmodel0[lng].id2word:
    vectes.append(embeddingsmodel0[lng][word])

lng = 'ar'
vectar = []
embeddingsmodel0[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
for word in embeddingsmodel0[lng].id2word:
    vectar.append(embeddingsmodel0[lng][word])

vectar = np.asarray(vectar)
vecten = np.asarray(vecten)
vectes = np.asarray(vectes)
    
    #stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
    #os.environ['CLASSPATH'] = stanford_ner_path
    #stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
    #stes = StanfordNERTagger(stanford_classifier)
    #stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
    #sten = StanfordNERTagger(stanford_classifier)
    #stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
    #stde = StanfordNERTagger(stanford_classifier)
Example #15
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


# Now we load the French and Russian word vectors, and evaluate the similarity of "chat" and "кот":

# In[2]:

fr_dictionary = FastVector(vector_file='zh_vec.txt')
ru_dictionary = FastVector(vector_file='en_vec.txt')

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))

# "chat" and "кот" both mean "cat", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, we need a bilingual dictionary of French and Russian translation pairs. As it happens, this is a great opportunity to show you something truly amazing...
#
# Many words appear in the vocabularies of more than one language; words like "alberto", "london" and "presse". These words usually mean similar things in each language. Therefore we can form a bilingual dictionary, by simply extracting every word that appears in both the French and Russian vocabularies.

# In[3]:

ru_words = set(ru_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(ru_words & fr_words)
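
The snippet ends after computing the vocabulary overlap. A minimal sketch of the remaining steps, assuming the make_training_matrices and learn_transformation helpers shown in Examples #12, #20 and #23 are in scope (variable names follow this snippet; this is not a verified continuation of the original file):

# Sketch: turn the identical-string overlap into a pseudo bilingual dictionary,
# learn the orthogonal transform, and re-check the similarity of the word pair.
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices from the identical-string pairs
source_matrix, target_matrix = make_training_matrices(
    fr_dictionary, ru_dictionary, bilingual_dictionary)

# learn the transform and apply it to the source-language vectors
transform = learn_transformation(source_matrix, target_matrix)
fr_dictionary.apply_transform(transform)

# after alignment, the translation pair should score much higher
print(FastVector.cosine_similarity(fr_dictionary["chat"], ru_dictionary["кот"]))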
Example #16
from fasttext import FastVector
import json

ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
print("loaded the dictionaries")

ja_dic.apply_transform('alignment_matrices/ja.txt')
en_dic.apply_transform('alignment_matrices/en.txt')
print("transformed the dictionaries")

en_word_list = [
    "cat", "dog", "apple", "car", "train", "school", "student", "teacher"
]
ja_word_list = ["猫", "犬", "りんご", "車", "電車", "学校", "生徒", "先生"]

result_f = open("multi_fast.txt", "w")
result = {}
# 20 nearest English neighbours for each word in ja_word_list
for ja_word in ja_word_list:
    en_words = en_dic.translate_k_nearest_neighbour(ja_dic[ja_word], k=20)
    result[ja_word] = en_words
    result_str = ",".join(result[ja_word])
    result_f.write(ja_word + "," + result_str + "\n")

# 20 nearest Japanese neighbours for each word in en_word_list
for en_word in en_word_list:
    ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=20)
    result[en_word] = ja_words
    result_str = ",".join(result[en_word])
    result_f.write(en_word + "," + result_str + "\n")
Example #17
    else:
        doc_emb = np.zeros([len(df), 300])
        doc = df['data'].apply(lambda x: x.lower().split())

        for idx, sent in enumerate(doc):
            for word in sent:
                if word in lang_vec.word2id.keys():
                    doc_emb[idx] += lang_vec[word]
    return doc_emb, tfidfvec


if __name__ == '__main__':
    args = parse_args()

    print('loading vectors')
    en_dictionary = FastVector(vector_file=args.en_embedding)
    fr_dictionary = FastVector(vector_file=args.fr_embedding)

    #print('transforming vectors')
    #fr_dictionary.apply_transform('alignment_matrices/fr.txt')

    #print('CCA...')
    #en_fr = read_dictionary(args.embedding_path+'en_fr.txt')
    #en_dictionary.embed, fr_dictionary.embed = cca(en_dictionary, fr_dictionary, en_fr, dim=250)

    print(
        "Hello score:",
        FastVector.cosine_similarity(en_dictionary["hello"],
                                     fr_dictionary["bonjour"]))

    print('processing data')
Example #18
synonyms_file = args.synonyms
lang = args.lang
skip_lang = args.skip_lang
out_folder = args.output
pairs_file = args.test

### Create synonyms dictionary
synonyms_dict = dict()
syn_file = open(synonyms_file, "r")
lines = syn_file.readlines()
for line in lines:
    line = re.sub(r'\n', '', line)
    w1, w2 = line.split('\t')
    synonyms_dict[w1] = w2.split(',')

fr_dictionary = FastVector(vector_file=args.lang_vec)
it_dictionary = FastVector(vector_file=args.lang_p_vec)

# Start

out_file_name = out_folder + "/skip" + skip_lang + "_" + lang + ".txt"

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

out_file = codecs.open(out_file_name, 'w', "utf-8")

vec_file_name = re.sub(r'\.txt', '', out_file_name)
vec_file_name = vec_file_name + "_vec.txt"
vec_file = codecs.open(vec_file_name, 'w', "utf-8")
Example #19
def test_word(en_dictionary, other_dictionary, SRC_WORD, TGT_WORD):
    print "Testing WORD[%s->%s]" % (SRC_WORD, TGT_WORD)
    en_vector = en_dictionary[SRC_WORD]
    other_vector = other_dictionary[TGT_WORD]
    print(FastVector.cosine_similarity(en_vector, other_vector))
Example #20
    f = codecs.open("Qe", "w")
    for word in en_dict:
        pro = " ".join(en_dict[word])
        f.write(word + " " + pro + '\n')


print "Readling Dictionary"
BI_DICT = codecs.open("o.s2t_f", "r").readlines()
BI_DICT = parse_BI(BI_DICT)
print "Readling Dictionary (END)"

# SRC_WORD = "昨天"
# TGT_WORD = "yesterday"
SRC_WORD = "钥匙"
TGT_WORD = "keys"
en_dictionary = FastVector(vector_file='en.emb.orig.vec')
other_dictionary = FastVector(vector_file='tizh.emb.orig.vec')

test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# form the training matrices
print "Learning SVD"
source_matrix, target_matrix = make_training_matrices(other_dictionary,
                                                      en_dictionary, BI_DICT)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
other_dictionary.apply_transform(transform)
# zh
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)
# ti
Example #21
#coding=utf-8
from fasttext import FastVector
fr_dictionary = FastVector(vector_file='wiki.en.vec')
fr_dictionary.export('fr.vec.txt')
Example #22
        losses.append(gap)

    return w, l, losses


if __name__ == "__main__":
    # load the datasets and perform split into training and test set
    dir = os.path.join(os.getcwd(), "expcode", "numerical_code")
    en_corpus = pickle.load(open(os.path.join(dir, 'english_vocab.pkl'),
                                 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB
    fr_corpus = pickle.load(open(os.path.join(dir, 'french_vocab.pkl'),
                                 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB

    # load the counts and co-occurences
    en_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.en.300.vec')
    fr_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.fr.300.vec')

    en_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/en.txt'
    )
    fr_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/fr.txt'
    )

    # number of items in dataset
    n = len(en_corpus)

    start = time.time()
    w, l, losses = train(en_corpus,
Example #23
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


lang1_dictionary = FastVector(vector_file=args.lang1)
lang2_dictionary = FastVector(vector_file=args.lang2)

bilingual_dictionary = []
file_object = open(args.dict, "r")
lines = file_object.readlines()
for line in lines:
    line = re.sub(r'\n', '', line)
    w_lang2, w_lang1 = line.split('\t')
    if w_lang1 in lang1_dictionary.word2id.keys(
    ) and w_lang2 in lang2_dictionary.word2id.keys():
        bilingual_dictionary.append(tuple((w_lang2, w_lang1)))

print("Dic Size: " + str(len(bilingual_dictionary)))

# form the training matrices
Example #24
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tar, source='tgt')
    train_data_src_mono = read_corpus(args.train_src_mono, source='src')
    train_data_tgt_mono = read_corpus(args.train_tar_mono, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tar, source='tgt')

    train_data = list(
        zip(train_data_src, train_data_tgt, train_data_src_mono,
            train_data_tgt_mono))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args.batch_size)
    valid_niter = int(args.valid_iter)
    log_every = int(args.log_every)
    model_save_path = args.save_path

    "Vocab dict"
    vocab_src = pickle.load(open(args.vocab_src, 'rb'))
    vocab_tar = pickle.load(open(args.vocab_tar, 'rb'))

    "Optimizer params"
    s2s_param = []
    t2t_param = []
    s2t_param = []
    t2s_param = []

    "Embed"  # pretrained (and fixed), cross-lingual embeddings
    args.embed_size = 300
    from fasttext import FastVector
    src_embed_path = 'embed/' + args.embed_src
    tar_embed_path = 'embed/' + args.embed_tar
    try:
        vectors_src = pickle.load(open(src_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_src = FastVector(vector_file=args.embed_src)
        vectors_src.apply_transform(args.embed_alignment)
        pickle.dump(vectors_src, open(src_embed_path, 'wb+'))
    try:
        vectors_tar = pickle.load(open(tar_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_tar = FastVector(
            vector_file=args.embed_tar)  # tar is en, no alignment required
        pickle.dump(vectors_tar, open(tar_embed_path, 'wb+'))

    src2embed = lambda word: torch.FloatTensor(vectors_src[
        word]) if word in vectors_src else torch.zeros(300)
    tar2embed = lambda word: torch.FloatTensor(vectors_tar[
        word]) if word in vectors_tar else torch.zeros(300)
    embedder_src = Embedder(
        vocab_src.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(torch.stack([
            src2embed(word.lower() if word is not None else word)
            for word in vocab_src.id2word
        ],
                                                 dim=0),
                                     freeze=True))
    embedder_tar = Embedder(
        vocab_tar.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(torch.stack([
            tar2embed(word.lower() if word is not None else word)
            for word in vocab_tar.id2word
        ],
                                                 dim=0),
                                     freeze=True))

    "Generator"
    gen_src = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_src_wrapper = WrapperEmbeddingGenerator(gen_src, embedder_src).cuda()
    gen_tar = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_tar_wrapper = WrapperEmbeddingGenerator(gen_tar, embedder_tar).cuda()

    if args.gen_src != "":
        gen_src_wrapper.load_weight(args.gen_src)
    else:
        [s2s_param, s2t_param,
         t2s_param] = add_to_optimizer(gen_src_wrapper,
                                       [s2s_param, s2t_param, t2s_param])
    if args.gen_tar != "":
        gen_tar_wrapper.load_weight(args.gen_tar)
    else:
        [s2t_param, t2s_param,
         t2t_param] = add_to_optimizer(gen_tar_wrapper,
                                       [s2t_param, t2s_param, t2t_param])

    if args.multi_gpu:
        gen_src_wrapper = nn.DataParallel(gen_src_wrapper, device_ids=[0, 1])
        gen_tar_wrapper = nn.DataParallel(gen_tar_wrapper, device_ids=[0, 1])

    "encoder"  # shared encoder
    encoder = GRUEncoder(args.embed_size,
                         args.hidden_size,
                         bidirectional=args.encoder_bidir,
                         layers=args.encoder_layer,
                         dropout=args.dropout).cuda()
    if args.multi_gpu:
        encoder = nn.DataParallel(encoder, device_ids=[0, 1])

    [s2s_param, s2t_param, t2s_param, t2t_param
     ] = add_to_optimizer(encoder,
                          [s2s_param, s2t_param, t2s_param, t2t_param])

    "Decoder"
    decoder_src = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    decoder_tar = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    if args.multi_gpu:
        decoder_src = nn.DataParallel(decoder_src, device_ids=[0, 1])
        decoder_tar = nn.DataParallel(decoder_tar, device_ids=[0, 1])

    [s2s_param, s2t_param,
     t2s_param] = add_to_optimizer(decoder_src,
                                   [s2s_param, s2t_param, t2s_param])
    [s2t_param, t2s_param,
     t2t_param] = add_to_optimizer(decoder_tar,
                                   [s2t_param, t2s_param, t2t_param])

    "Translators"
    s2s_model = MT(vocab_src,
                   vocab_src,
                   embedder_src,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    t2t_model = MT(vocab_tar,
                   vocab_tar,
                   embedder_tar,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    s2t_model = MT(vocab_src,
                   vocab_tar,
                   embedder_src,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=False,
                   multi_gpu=args.multi_gpu)
    t2s_model = MT(vocab_tar,
                   vocab_src,
                   embedder_tar,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=False,
                   multi_gpu=args.multi_gpu)

    "optimizers"
    s2s_optimizer = torch.optim.Adam(s2s_param, lr=args.lr)
    t2t_optimizer = torch.optim.Adam(t2t_param, lr=args.lr)
    s2t_optimizer = torch.optim.Adam(s2t_param, lr=args.lr)
    t2s_optimizer = torch.optim.Adam(t2s_param, lr=args.lr)

    def save_model():
        # save embedder
        if args.embed_src == "":
            embedder_src.save_weight(args.save_path + "/embed_src.bin")
        if args.embed_tar == "":
            embedder_tar.save_weight(args.save_path + "/embed_tar.bin")

        # save generator
        if args.gen_src == "":
            gen_src_wrapper.save_weight(args.save_path + "/gen_src.bin")
        if args.gen_tar == "":
            gen_tar_wrapper.save_weight(args.save_path + "/gen_tar.bin")

        # save encoder
        encoder.save_weight(args.save_path + "/encoder.bin")

        # save decoder
        decoder_src.save_weight(args.save_path + "/decoder_src.bin")
        decoder_tar.save_weight(args.save_path + "/decoder_tar.bin")

        # save optimizer

        print("all models saved")

    def train_step(mt, optimizer, src_sents, tar_sents):
        optimizer.zero_grad()
        loss = mt.get_loss(src_sents, tar_sents, train=True)
        loss += loss.data[0]
        res = loss.cpu().detach().item()
        loss.div(args.batch_size).backward()
        optimizer.step()
        return res

    def train_step_backtranslate(mt, optimizer, src_sents, max_ratio):
        tar_sents = mt.greedy(src_sents, max_ratio, mode=False)
        res = train_step(mt, optimizer, src_sents, tar_sents)
        return res

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0
    cumulative_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    while True:
        epoch += 1

        for src_sents, tgt_sents, src_mono_sents, tgt_mono_sents in batch_iter(
                train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            batch_size = len(src_sents)

            srclen = max(map(len, src_sents))
            tar_len = max(map(len, tgt_sents))
            print("SRCLEN {} TARLEN {}".format(srclen, tar_len))

            model = s2t_model
            # (batch_size)
            train_step(s2s_model, s2s_optimizer, src_sents, src_sents)
            print("finish s2s")
            train_step(t2t_model, t2t_optimizer, tgt_sents, tgt_sents)
            print("finish t2t")
            train_step(s2s_model, s2s_optimizer, src_mono_sents, src_sents)
            print("finish s2s mono")
            train_step(t2t_model, t2t_optimizer, tgt_mono_sents, tgt_sents)
            print("finish t2t mono")

            train_step(t2s_model, t2s_optimizer, tgt_sents, src_sents)
            print("finish t2s")
            loss = train_step(model, s2t_optimizer, src_sents, tgt_sents)
            print("finish s2t")

            train_step_backtranslate(s2t_model, s2t_optimizer, src_sents,
                                     (tar_len / srclen))
            print("finish s2t back")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_sents,
                                     (srclen / tar_len))
            print("finish t2s back")
            train_step_backtranslate(s2t_model, s2t_optimizer, src_mono_sents,
                                     (tar_len / srclen))
            print("finish s2t back mono")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_mono_sents,
                                     (srclen / tar_len))
            print("finish t2s back mono")
            os.system("nvidia-smi")
            report_loss += loss
            cum_loss += loss

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cumulative_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cumulative_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cumulative_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # the following code performs validation on dev set, and controls the learning schedule
            # if the dev score is better than the last check point, then the current model is saved.
            # otherwise, we allow for that performance degeneration for up to `--patience` times;
            # if the dev score does not increase after `--patience` iterations, we reload the previously
            # saved best model (and the state of the optimizer), halve the learning rate and continue
            # training. This repeats for up to `--max-num-trial` times.
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cumulative_examples,
                       np.exp(cum_loss / cumulative_tgt_words),
                       cumulative_examples),
                    file=sys.stderr)

                cum_loss = cumulative_examples = cumulative_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = model.evaluate_ppl(
                    dev_data, batch_size=args.batch_size
                )  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                save_model()

                # if is_better:
                #     patience = 0
                #     print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                #     model.save(model_save_path)

                #     # You may also save the optimizer's state
                # elif patience < int(args.patience):
                #     patience += 1
                #     print('hit patience %d' % patience, file=sys.stderr)

                #     if patience == int(args.patience):
                #         num_trial += 1
                #         print('hit #%d trial' % num_trial, file=sys.stderr)
                #         if num_trial == int(args.max_num_trail):
                #             print('early stop!', file=sys.stderr)
                #             exit(0)

                #         # decay learning rate, and restore from previously best checkpoint
                #         lr = lr * float(args.lr_decay)
                #         print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                #         # load model
                #         model_save_path

                #         print('restore parameters of the optimizers', file=sys.stderr)
                #         # You may also need to load the state of the optimizer saved before

                #         # reset patience
                #         patience = 0

                if epoch == int(args.max_epoch):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
Example #25
def main():
    vec = FastVector('thin_2018-11-23_d100_e5.bin.vec')
    iris = vec.word2id.keys()
    cond2vec = MedicalCond2Vector(iris)
    cond2vec.save_model('thin_to_word2vec.bin.vec')
    print(cond2vec.vector)
Example #26
    def load_embeddings_dict(self, language):
        vector_file = path.join(self.embs_dir + language, language + '.vec')
        dictionary = FastVector(vector_file=vector_file)
        return dictionary