def test_pretrained(self):
    """TfIdf embeddings should round-trip through a pickled vectorizer, via both a file handle and a path."""
    featurizer = TextWiser(Embedding.TfIdf(pretrained=None, min_df=2), dtype=torch.float32)
    expected = torch.tensor([[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
                             [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
                             [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
                            dtype=torch.float32)
    self._test_fit_transform(featurizer, expected)
    # Round-trip the fitted vectorizer through an open temporary file handle (loading from bytes)
    with NamedTemporaryFile() as handle:
        pickle.dump(featurizer._imp[0].vectorizer, handle)
        handle.seek(0)  # rewind so the loader reads from the start
        featurizer = TextWiser(Embedding.TfIdf(pretrained=handle), dtype=torch.float32)
        predicted = featurizer.fit_transform(docs)
        self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Round-trip the vectorizer through a pickle file on disk (loading from a path)
    model_path = self._get_test_path('data', 'tfidf.pkl')
    with open(model_path, 'wb') as out:
        pickle.dump(featurizer._imp[0].vectorizer, out)
    featurizer = TextWiser(Embedding.TfIdf(pretrained=model_path), dtype=torch.float32)
    predicted = featurizer.fit_transform(docs)
    self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    os.remove(model_path)
def test_dtype(self):
    """Output dtype should follow the requested torch or numpy dtype, pooled or not."""
    # Pooled output with a torch dtype
    featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                           Transformation.Pool(pool_option=PoolOptions.max),
                           dtype=torch.float32)
    self.assertEqual(featurizer.fit_transform(docs).dtype, torch.float32)
    # Pooled output with a numpy dtype
    featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                           Transformation.Pool(pool_option=PoolOptions.max),
                           dtype=np.float32)
    self.assertEqual(featurizer.fit_transform(docs).dtype, np.float32)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Unpooled word embeddings with a torch dtype; check the first document's tensor
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               dtype=torch.float32)
        self.assertEqual(featurizer.fit_transform(docs)[0].dtype, torch.float32)
        # Unpooled word embeddings with a numpy dtype
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               dtype=np.float32)
        self.assertEqual(featurizer.fit_transform(docs)[0].dtype, np.float32)
def _test_index(self, pool_option):
    """Pooling with first/last should equal directly indexing the raw word vectors."""
    # PoolOptions.first maps to index 0; anything else (last) maps to -1
    index = 0 if pool_option == PoolOptions.first else -1
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Reference: raw per-word embeddings of the first document, indexed manually
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               dtype=torch.float32)
        expected = featurizer.fit_transform(docs[0])[0][index].view(1, -1)
        # Same embedding with the pooling transformation applied
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               Transformation.Pool(pool_option=pool_option),
                               dtype=torch.float32)
        pooled = featurizer.fit_transform(docs[0])
        self.assertTrue(torch.allclose(expected.to(device), pooled.to(device)))
def test_num_components(self):
    """SVD should honor n_components both below and above the natural count (3 here)."""
    # Restrict below the natural number of components
    n_components = 2
    featurizer = TextWiser(Embedding.TfIdf(min_df=2),
                           Transformation.SVD(n_components=n_components),
                           dtype=torch.float32)
    self.assertEqual(featurizer.fit_transform(docs).shape[1], n_components)
    self._reset_seed()
    # Expand above the natural number of components
    n_components = 200
    featurizer = TextWiser(Embedding.TfIdf(min_df=2),
                           Transformation.SVD(n_components=n_components),
                           dtype=torch.float32)
    self.assertEqual(featurizer.fit_transform(docs).shape[1], n_components)
def test_list_handling(self):
    """Without pooling, fit_transform should return one per-word embedding tensor per document."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               Transformation.SVD(n_components=2),
                               dtype=torch.float32)
        predicted = featurizer.fit_transform(docs)
        expected = [
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.6929296255, 0.1793890595],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.7170552015, 0.0105144158],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.8687936068, -0.9333068132],
                          [-0.6859120131, 0.0732812732],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
        ]
        # Compare each document's embedding against its expected tensor
        for got, want in zip(predicted, expected):
            self.assertTrue(torch.allclose(got, want.to(device), atol=1e-6))
def test_lazy_load(self):
    """With lazy_load=True, the underlying model (_imp) must stay None until fit/fit_transform."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Lazy model should be materialized by fit()
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               lazy_load=True)
        self.assertIsNone(featurizer._imp)
        featurizer.fit(docs)
        self.assertIsNotNone(featurizer._imp)
        # Lazy model should also be materialized by fit_transform(), including the fine-tunable path
        featurizer = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                               lazy_load=True, dtype=torch.float32, is_finetuneable=True)
        self.assertIsNone(featurizer._imp)
        featurizer.fit_transform(docs)
        self.assertIsNotNone(featurizer._imp)
def test_pretrained(self):
    """Doc2Vec embeddings should round-trip through a pickled model, via both a file handle and a path."""
    featurizer = TextWiser(Embedding.Doc2Vec(deterministic=True, seed=1234, vector_size=2,
                                             min_count=1, workers=1, sample=0, negative=0,
                                             hashfxn=det_hash),
                           dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]],
                            dtype=torch.float32)
    self._test_fit_before_transform(featurizer, expected)
    # Round-trip the trained model through an open temporary file handle (loading from bytes)
    with NamedTemporaryFile() as handle:
        pickle.dump(featurizer._imp[0].model, handle)
        handle.seek(0)  # rewind so the loader reads from the start
        featurizer = TextWiser(Embedding.Doc2Vec(pretrained=handle, deterministic=True, seed=1234),
                               dtype=torch.float32)
        predicted = featurizer.fit_transform(docs)
        self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Round-trip the model through a pickle file on disk (loading from a path)
    model_path = self._get_test_path('data', 'doc2vec.pkl')
    with open(model_path, 'wb') as out:
        pickle.dump(featurizer._imp[0].model, out)
    featurizer = TextWiser(Embedding.Doc2Vec(pretrained=model_path, deterministic=True, seed=1234),
                           dtype=torch.float32)
    predicted = featurizer.fit_transform(docs)
    self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    os.remove(model_path)