def init_word_embs_model(model_path, model_type, force_reload=False, top_k=None):
    """Return the word-embedding model registered for ``model_type``.

    Models are kept in the module-level ``WORD_EMBS_MODELS`` mapping, keyed by
    ``model_type``. A cached entry is reused unless ``force_reload`` is true;
    on reuse its ``top_k`` attribute is overwritten with the given ``top_k``.

    Args:
        model_path: Path handed to the model's ``read`` method when loading.
        model_type: One of ``'word2vec'``, ``'glove'`` or ``'fasttext'``.
        force_reload: When true, ignore any cached entry and load again.
        top_k: Value passed to the model constructor, or assigned to the
            cached model's ``top_k`` attribute on a cache hit.

    Returns:
        The cached or newly loaded model object.

    Raises:
        ValueError: If a load is required and ``model_type`` is not one of
            the supported values.
    """
    # WORD_EMBS_MODELS is only mutated in place (never rebound), so no
    # ``global`` declaration is needed to reach the module-level mapping.
    must_load = model_type not in WORD_EMBS_MODELS or force_reload
    if not must_load:
        WORD_EMBS_MODELS[model_type].top_k = top_k
        return WORD_EMBS_MODELS[model_type]

    # Pick the wrapper class first; construction and reading are identical
    # for every supported type.
    if model_type == 'word2vec':
        model_cls = nmw.Word2vec
    elif model_type == 'glove':
        model_cls = nmw.GloVe
    elif model_type == 'fasttext':
        model_cls = nmw.Fasttext
    else:
        raise ValueError(
            'Model type value is unexpected. Expected values include {}'.format(
                model_types))

    model = model_cls(top_k=top_k)
    model.read(model_path)

    WORD_EMBS_MODELS[model_type] = model
    return model
def init_fasttext_model(model_path, force_reload=False):
    """Return the shared fastText model, reading it from disk when required.

    The model lives in the module-level ``FASTTEXT_MODEL`` variable so that it
    is loaded once per process. It is read again only when nothing truthy is
    cached yet or when ``force_reload`` is true.

    Args:
        model_path: Path handed to ``nmw.Fasttext.read`` when loading.
        force_reload: When true, discard the cached model and load again.

    Returns:
        The cached or newly loaded model object.
    """
    global FASTTEXT_MODEL

    must_load = not FASTTEXT_MODEL or force_reload
    if must_load:
        loaded = nmw.Fasttext()
        loaded.read(model_path)
        # Publish only after a successful read, so a failed load leaves the
        # previous cache value untouched.
        FASTTEXT_MODEL = loaded

    return FASTTEXT_MODEL
def test_bogus_fasttext_loading(self):
    """Reading the malformed fastText fixture must raise a size-mismatch error.

    The fixture is located under the directory named by the ``TEST_DIR``
    environment variable, at ``res/text/bogus_fasttext.vec``.
    """
    fixture_dir = os.environ.get("TEST_DIR")
    bogus_path = os.path.join(fixture_dir, 'res', 'text', 'bogus_fasttext.vec')
    expected_message = (
        'cannot copy sequence with size 11 to array axis with dimension 10')

    # Files in an incorrect format are no longer supported since the switch
    # to the gensim package, so loading is expected to fail.
    with self.assertRaises(Exception) as error:
        model = nmw.Fasttext()
        model.read(bogus_path)

    # The captured exception is only available once the context has exited.
    self.assertIn(expected_message, str(error.exception))
def init_fasttext_model(model_path, force_reload=False, top_k=None):
    """Return the fastText model for ``model_path``, loading it at most once.

    Loaded models are kept in the module-level ``FASTTEXT_MODEL`` mapping,
    keyed by ``model_path``. A cached entry is reused unless ``force_reload``
    is true; on reuse its ``top_k`` attribute is overwritten with ``top_k``.

    Args:
        model_path: Cache key, and the path handed to ``nmw.Fasttext.read``.
        force_reload: When true, ignore any cached entry and load again.
        top_k: Value passed to the model constructor, or assigned to the
            cached model's ``top_k`` attribute on a cache hit.

    Returns:
        The cached or newly loaded model object.
    """
    # FASTTEXT_MODEL is only mutated in place (never rebound), so no
    # ``global`` declaration is needed to reach the module-level mapping.
    must_load = model_path not in FASTTEXT_MODEL or force_reload
    if must_load:
        loaded = nmw.Fasttext(top_k=top_k)
        loaded.read(model_path)
        FASTTEXT_MODEL[model_path] = loaded
    else:
        FASTTEXT_MODEL[model_path].top_k = top_k

    return FASTTEXT_MODEL[model_path]
def test_bogus_fasttext_loading(self):
    """Reading the bogus fastText fixture yields the expected words and vectors.

    The fixture is located under the directory named by the ``TEST_DIR``
    environment variable, at ``res/text/bogus_fasttext.vec``.
    """
    fixture_dir = os.environ.get("TEST_DIR")
    bogus_path = os.path.join(fixture_dir, 'res', 'text', 'bogus_fasttext.vec')

    expected_vector = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    expected_vocab = ["test1", "test2", "test_3", "test 4", "test -> 5"]

    model = nmw.Fasttext()
    model.read(bogus_path)

    # Every entry in the fixture is expected to map to the same vector.
    for token in model.w2v:
        token_vector = list(model.w2v[token])
        self.assertSequenceEqual(token_vector, expected_vector)

    self.assertSequenceEqual(expected_vocab, model.get_vocab())

    vector_count = len(model.normalized_vectors)
    self.assertEqual(vector_count, 5)