def testPersistence(self):
    """Test storing/loading the entire model.

    Trains a translation matrix on ``self.word_pairs``, saves it, loads it
    back, and checks that the restored ``translation_matrix`` is numerically
    close to the trained one.
    """
    # Resolve the path once so save() and load() are guaranteed to hit the
    # same file, regardless of how the helper builds its return value.
    save_path = temp_save_file()

    model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
    model.train(self.word_pairs)
    model.save(save_path)

    loaded_model = translation_matrix.TranslationMatrix.load(save_path)
    self.assertTrue(np.allclose(model.translation_matrix, loaded_model.translation_matrix))
def testPersistence(self):
    """Check that a trained model survives a save/load round trip.

    The model is trained on ``self.word_pairs``, written to a temporary
    file, read back, and the two ``translation_matrix`` attributes are
    compared with ``np.allclose``.
    """
    pickle_path = get_tmpfile('transmat-en-it.pkl')

    trained = translation_matrix.TranslationMatrix(
        self.source_word_vec,
        self.target_word_vec,
        self.word_pairs,
    )
    trained.train(self.word_pairs)
    trained.save(pickle_path)

    restored = translation_matrix.TranslationMatrix.load(pickle_path)
    matrices_match = np.allclose(trained.translation_matrix, restored.translation_matrix)
    self.assertTrue(matrices_match)
def test_translate_gc(self):
    """Test the globally corrected neighbour retrieval method.

    Trains on ``self.word_pairs``, translates the source side of
    ``self.test_word_pairs`` with ``gc=1`` and ``sample_num=3``, and checks
    that each expected target word is among the ``topn=5`` candidates
    returned for its source word.
    """
    model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
    model.train(self.word_pairs)

    # Only the source side is fed to translate(); the target side is checked
    # pair by pair below.
    test_source_word, _ = zip(*self.test_word_pairs)
    translated_words = model.translate(
        test_source_word,
        topn=5,
        gc=1,
        sample_num=3,
        source_lang_vec=self.source_word_vec,
        target_lang_vec=self.target_word_vec,
    )

    for source_word, target_word in self.test_word_pairs:
        # assertIn reports the missing word and the candidate list on failure.
        self.assertIn(target_word, translated_words[source_word])
def test_translate_nn(self):
    """Test the nearest neighbor retrieval method.

    Trains on ``self.word_pairs``, translates the source side of
    ``self.test_word_pairs`` with the default retrieval settings, and checks
    that each expected target word is among the ``topn=5`` candidates
    returned for its source word.
    """
    model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
    model.train(self.word_pairs)

    # Only the source side is fed to translate(); the target side is checked
    # pair by pair below.
    test_source_word, _ = zip(*self.test_word_pairs)
    translated_words = model.translate(
        test_source_word,
        topn=5,
        source_lang_vec=self.source_word_vec,
        target_lang_vec=self.target_word_vec,
    )

    for source_word, target_word in self.test_word_pairs:
        # assertIn reports the missing word and the candidate list on failure.
        self.assertIn(target_word, translated_words[source_word])
def test_translation_matrix(self):
    """Check that training yields a 300 x 300 translation matrix."""
    expected_shape = (300, 300)

    trained = translation_matrix.TranslationMatrix(
        self.source_word_vec,
        self.target_word_vec,
        self.word_pairs,
    )
    trained.train(self.word_pairs)

    self.assertEqual(trained.translation_matrix.shape, expected_shape)
# Script: learn a translation matrix between two word2vec spaces using the
# tokens they share as identity word pairs, then print the 25 nearest
# translations of 'covid-19'.
from pprint import pprint

from gensim.models import (
    KeyedVectors,
    Word2Vec,
    translation_matrix,
    BackMappingTranslationMatrix,
)
from scipy import spatial

# Locations of the two embedding files (machine-specific absolute paths).
w2v_bin_path_old = '/home/dpappas/COVID/COVID/pubmed2018_w2v_30D.bin'
w2v_bin_path_new = '/home/dpappas/COVID/covid_19_w2v_embeds_30.model'

# The old vectors are in binary word2vec format; the new ones are a saved
# gensim Word2Vec model, so its vectors are reached through `.wv`.
wv_old = KeyedVectors.load_word2vec_format(w2v_bin_path_old, binary=True)
wv_new = Word2Vec.load(w2v_bin_path_new)

# Every token present in both vocabularies is paired with itself.
common_tokens = [
    (tok, tok)
    for tok in set(wv_old.vocab.keys()).intersection(set(wv_new.wv.vocab.keys()))
]

# Source space is the new model, target space is the old one.
transmat = translation_matrix.TranslationMatrix(wv_new.wv, wv_old, common_tokens)
transmat.train(common_tokens)
# transmat.apply_transmat(transmat.source_space)

pprint(transmat.translate('covid-19', topn=25))

# Cosine similarity (1 - cosine distance) between two words of the old space.
result = 1 - spatial.distance.cosine(wv_old['fredriksberg'], wv_old['non-neurologist'])

# def project_words_vectors(transmat, source_words):
#     source_space = translation_matrix.Space.build(transmat.source_lang_vec, source_words)
#     source_space.normalize()
#     mapped_source_space = transmat.apply_transmat(source_space)
#     return mapped_source_space
#