# Imports assumed from the danlp package layout and gensim; adjust if the
# module structure differs.
from gensim.models.keyedvectors import FastTextKeyedVectors

from danlp.download import MODELS, download_model, _unzip_process_func
from danlp.models.embeddings import (AVAILABLE_EMBEDDINGS,
                                     AVAILABLE_SUBWORD_EMBEDDINGS,
                                     load_wv_with_gensim)


def test_fasttext_embeddings(self):
    # First we will add smaller test embeddings to the model list
    MODELS['ddt.swv'] = {
        'url': 'https://danlp.alexandra.dk/304bd159d5de/tests/ddt.swv.zip',
        'vocab_size': 5000,
        'dimensions': 100,
        'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
        'size': 741125088,
        'file_extension': '.bin'
    }
    AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

    download_model('ddt.swv', process_func=_unzip_process_func)
    fasttext_embeddings = load_wv_with_gensim('ddt.swv')

    self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

    # The word is not in the vocab
    self.assertNotIn('institutmedarbejdskontrakt', fasttext_embeddings.vocab)

    # However we can still get an embedding because of the subword units
    self.assertEqual(
        fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)
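# A minimal illustrative sketch (not part of the test suite above): gensim's
# FastTextKeyedVectors composes vectors for out-of-vocabulary words from
# character n-grams, so similarity queries also work for unseen words. The
# comparison word 'medarbejder' is our own choice, and the resulting value
# depends on the trained vectors.
def _demo_subword_similarity(fasttext_embeddings):
    oov_word = 'institutmedarbejdskontrakt'
    # 'medarbejder' shares several character n-grams with the OOV word,
    # so the composed vectors can still be compared.
    similarity = fasttext_embeddings.similarity(oov_word, 'medarbejder')
    print('similarity({}, medarbejder) = {:.3f}'.format(oov_word, similarity))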
def test_embeddings_most_similar(self):
    embeddings = load_wv_with_gensim('wiki.da.small.wv')

    most_similar = embeddings.most_similar(
        positive=['københavn', 'england'], negative=['danmark'], topn=1)

    # Check the word exactly, but the cosine similarity only approximately:
    # exact float equality is fragile across platforms and gensim versions.
    word, similarity = most_similar[0]
    self.assertEqual(word, 'london')
    self.assertAlmostEqual(similarity, 0.5180857181549072, places=5)
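# A rough sketch (our own illustration, not danlp code) of the vector
# arithmetic behind the analogy query above: most_similar with positive and
# negative words combines the unit-normalised vectors and returns the
# nearest neighbours of the result; gensim additionally excludes the query
# words themselves.
import numpy as np

def _demo_analogy_arithmetic(embeddings):
    vec = (embeddings.word_vec('københavn', use_norm=True)
           + embeddings.word_vec('england', use_norm=True)
           - embeddings.word_vec('danmark', use_norm=True))
    vec /= np.linalg.norm(vec)
    # Unlike most_similar, similar_by_vector does not filter out the query
    # words, so 'københavn' and 'england' may appear among the neighbours.
    return embeddings.similar_by_vector(vec, topn=5)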
def test_embeddings_with_gensim(self):
    for emb in self.embeddings_for_testing:
        embeddings = load_wv_with_gensim(emb)
        self.assertEqual(MODELS[emb]['vocab_size'], len(embeddings.vocab))
def load_wv_models():
    # Yield (model name, loaded KeyedVectors) pairs for every available
    # Danish word-vector model.
    for da_wv_model in AVAILABLE_EMBEDDINGS:
        yield da_wv_model, load_wv_with_gensim(da_wv_model)
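# A hypothetical usage sketch (assumes pytest is available; the test name is
# our own): the generator above can parametrise a check over every available
# embedding. Note that the models are downloaded and loaded at collection
# time, which can be slow.
import pytest

@pytest.mark.parametrize('name,embeddings', load_wv_models())
def test_all_vocab_sizes(name, embeddings):
    assert len(embeddings.vocab) == MODELS[name]['vocab_size']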