def ground_truth(en_sent, fr_sent):
    """
    Extract the word-alignment ground truth for an English/French sentence pair.

    Every English token is aligned to the French token whose embedding has the
    highest cosine similarity; ties are broken deterministically by taking the
    first maximum (numpy ``argmax`` semantics).

    :param en_sent: sequence of English tokens (keys into the global ``en_dict``)
    :param fr_sent: sequence of French tokens (keys into the global ``fr_dict``)
    :return: float32 numpy array of shape ``(len(en_sent) * len(fr_sent),)`` —
             the one-hot alignment matrix flattened row-major
    """
    # pairwise cosine-similarity matrix between the two sentences
    score = np.empty([len(en_sent), len(fr_sent)], dtype=np.float32)
    # one-hot alignment labels, filled in below
    truth = np.zeros([len(en_sent), len(fr_sent)], dtype=np.float32)
    for j, en_word in enumerate(en_sent):
        for k, fr_word in enumerate(fr_sent):
            score[j, k] = FastVector.cosine_similarity(en_dict[en_word],
                                                       fr_dict[fr_word])
    # mark the best-scoring French token for every English token
    # (the original comment claimed ties were broken randomly; argmax in fact
    # always returns the first maximum, so the labelling is deterministic)
    for j in range(len(en_sent)):
        argmax = int(score[j].argmax())
        truth[j, argmax] = 1.
    return truth.reshape(-1)
U, s, V = np.linalg.svd(product) # return orthogonal transformation which aligns source language to the target return np.matmul(U, V) # Now we load the French and Russian word vectors, and evaluate the similarity of "chat" and "кот": # In[2]: fr_dictionary = FastVector(vector_file='zh_vec.txt') ru_dictionary = FastVector(vector_file='en_vec.txt') fr_vector = fr_dictionary["chat"] ru_vector = ru_dictionary["кот"] print(FastVector.cosine_similarity(fr_vector, ru_vector)) # "chat" and "кот" both mean "cat", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, we need a bilingual dictionary of French and Russian translation pairs. As it happens, this is a great opportunity to show you something truly amazing... # # Many words appear in the vocabularies of more than one language; words like "alberto", "london" and "presse". These words usually mean similar things in each language. Therefore we can form a bilingual dictionary, by simply extracting every word that appears in both the French and Russian vocabularies. # In[3]: ru_words = set(ru_dictionary.word2id.keys()) fr_words = set(fr_dictionary.word2id.keys()) overlap = list(ru_words & fr_words) bilingual_dictionary = [(entry, entry) for entry in overlap] # Let's align the French vectors to the Russian vectors, using only this "free" dictionary that we acquired without any bilingual expert knowledge. # In[ ]:
# NOTE(review): the next four statements are the tail of a function whose
# `def` line is not visible in this chunk — they compute the orthogonal
# Procrustes solution aligning source embeddings to target embeddings.
product = np.matmul(source_matrix.transpose(), target_matrix)
U, s, V = np.linalg.svd(product)
# return orthogonal transformation which aligns source language to the target
return np.matmul(U, V)

# copy embedding files from https://fasttext.cc/docs/en/crawl-vectors.html#models
en_dictionary = FastVector(vector_file='cc.en.300.vec')
zh_dictionary = FastVector(vector_file='cc.zh.300.vec')

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]
# going to print 0.0004326613965749648
# (near-zero similarity: the two embedding spaces are not yet aligned)
print(FastVector.cosine_similarity(en_vector, zh_vector))

# Build a bilingual seed dictionary from the vocabulary intersection:
# each shared surface form is paired with itself.
zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary,
                                                      zh_dictionary,
                                                      bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)
# NOTE(review): this chunk is an interior fragment of a loop over
# (it_w, fr_w, lang) word pairs — the enclosing `for`/`try` and the tail of
# the final loop are not visible here, and the nesting of the `skip_lang`
# checks below is a best-effort reconstruction of the collapsed source.
except ValueError:
    continue

# Report pairs whose words are missing from the embedding dictionaries;
# only skip the pair when the language matches skip_lang.
if it_w not in it_dictionary:
    print(it_w + " - " + fr_w + " - it word not found")
    if (lang == skip_lang):
        continue
if fr_w not in fr_dictionary:
    print(it_w + " - " + fr_w + " - " + lang + " word not found")
    if (lang == skip_lang):
        continue

# Cosine between words
# NOTE(review): a missing key falls back to cosine = 1 (maximum similarity),
# not 0 — presumably deliberate, but worth confirming with the caller.
try:
    cosine = FastVector.cosine_similarity(it_dictionary[it_w],
                                          fr_dictionary[fr_w])
except KeyError:
    cosine = 1

# Synonyms list: fall back to the word itself when no synonyms are known,
# so the loop below always has at least one candidate.
synonyms_list = []
if it_w in synonyms_dict:
    synonyms_list = synonyms_dict[it_w]
if len(synonyms_list) < 1:
    synonyms_list.append(it_w)

# Array containing all cosines from synonyms
synonyms_cosine_list = []
for s in synonyms_list:
    # (fragment ends mid-loop — the body of this check is outside this chunk)
    if s not in it_dictionary:
def test_word(en_dictionary, other_dictionary, SRC_WORD, TGT_WORD):
    """
    Print the cosine similarity between a source-language word and its
    expected translation in the target-language embedding space.

    :param en_dictionary: source-language word -> vector mapping (FastVector-like)
    :param other_dictionary: target-language word -> vector mapping
    :param SRC_WORD: word looked up in en_dictionary
    :param TGT_WORD: word looked up in other_dictionary
    :return: None (output is printed)
    """
    # Fix: the original used a Python 2 print *statement* here, which is a
    # SyntaxError under Python 3 and inconsistent with the print() call below.
    print("Testing WORD[%s->%s]" % (SRC_WORD, TGT_WORD))
    en_vector = en_dictionary[SRC_WORD]
    other_vector = other_dictionary[TGT_WORD]
    print(FastVector.cosine_similarity(en_vector, other_vector))
# Top-level driver: load the two embedding spaces, sanity-check their
# alignment, then read and encode the English training set.
args = parse_args()

print('loading vectors')
en_dictionary = FastVector(vector_file=args.en_embedding)
fr_dictionary = FastVector(vector_file=args.fr_embedding)

# Disabled alternative preprocessing (pre-computed alignment matrix / CCA),
# kept for reference:
#print('transforming vectors')
#fr_dictionary.apply_transform('alignment_matrices/fr.txt')
#print('CCA...')
#en_fr = read_dictionary(args.embedding_path+'en_fr.txt')
#en_dictionary.embed, fr_dictionary.embed = cca(en_dictionary, fr_dictionary, en_fr, dim=250)

# Sanity check: "hello"/"bonjour" should score high if the spaces are aligned.
print(
    "Hello score:",
    FastVector.cosine_similarity(en_dictionary["hello"],
                                 fr_dictionary["bonjour"]))

print('processing data')
# Dataset paths are built from the source directory given on the command line.
en_train_file = args.source_path + 'en_train.tsv'
en_test_file = args.source_path + 'en_test.tsv'
fr_train_file = args.source_path + 'fr_train.tsv'
fr_test_file = args.source_path + 'fr_test.tsv'

print('english train')
en_train_df = read_dataset(en_train_file)
en_train_y, en_train_x, en_vectorizor = process_dataset(
    en_train_df, en_dictionary, None)

# Map each distinct label to an integer id.
# NOTE(review): the ids come from iterating a set, so they are not stable
# across interpreter runs (string hash randomization) — confirm nothing
# downstream persists or compares these ids between runs.
n_classes = len(set(en_train_y))
label_encoder = dict(zip(list(set(en_train_y)), np.arange(n_classes)))
en_train_y = np.array([label_encoder[i] for i in en_train_y])