Example #1
    def test_fasttext_embeddings(self):
        # First we register a smaller test embedding in the MODELS dictionary
        MODELS['ddt.swv'] = {
            'url': 'https://danlp.alexandra.dk/304bd159d5de/tests/ddt.swv.zip',
            'vocab_size': 5000,
            'dimensions': 100,
            'md5_checksum': 'c50c61e1b434908e2732c80660abf8bf',
            'size': 741125088,
            'file_extension': '.bin'
        }

        AVAILABLE_SUBWORD_EMBEDDINGS.append('ddt.swv')

        download_model('ddt.swv', process_func=_unzip_process_func)

        fasttext_embeddings = load_wv_with_gensim('ddt.swv')

        self.assertEqual(type(fasttext_embeddings), FastTextKeyedVectors)

        # The word is not in the vocab
        self.assertNotIn('institutmedarbejdskontrakt',
                         fasttext_embeddings.vocab)

        # However we can get an embedding because of subword units
        self.assertEqual(
            fasttext_embeddings['institutmedarbejdskontrakt'].size, 100)
Example #2
    def test_embeddings_with_gensim(self):
        embeddings = load_wv_with_gensim('wiki.da.small.wv')

        most_similar = embeddings.most_similar(
            positive=['københavn', 'england'], negative=['danmark'], topn=1)

        self.assertEqual(most_similar[0], ('london', 0.5180857181549072))
Example #3
    def test_embeddings_with_gensim(self):
        for emb in self.embeddings_for_testing:
            embeddings = load_wv_with_gensim(emb)
            self.assertEqual(MODELS[emb]['vocab_size'], len(embeddings.vocab))
def load_wv_models():
    for da_wv_model in AVAILABLE_EMBEDDINGS:
        yield da_wv_model, load_wv_with_gensim(da_wv_model)
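
A minimal sketch, assuming the same unittest class, MODELS registry, and load_wv_with_gensim import as the examples above, of how the load_wv_models generator could drive a single test over every available embedding (the test name is hypothetical, not from the original source):

    def test_all_available_wv_models(self):
        # Each yielded pair is a model name and its gensim KeyedVectors;
        # verify the loaded vocabulary matches the registered size.
        for name, wv in load_wv_models():
            self.assertEqual(MODELS[name]['vocab_size'], len(wv.vocab))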