def test_save_load(self):
    # Round-trip check: a TextWiser (word2vec -> SVD -> mean pool) feeding a
    # linear layer is fitted on ``docs``, its state dict is written to a
    # temporary file, an identically configured model is rebuilt and loaded
    # from that file, and both models' outputs on ``docs`` are compared.

    def build_textwiser():
        # The saved and the restored model must share the exact same layout.
        return TextWiser(
            Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
            [
                Transformation.SVD(n_components=2),
                Transformation.Pool(pool_option=PoolOptions.mean),
            ],
            dtype=torch.float32,
        )

    # Reference model and its output
    tw = build_textwiser()
    tw.fit(docs)
    model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
    expected = model(docs)

    with NamedTemporaryFile() as file:
        # The open file object itself is handed to torch.save
        torch.save(model.state_dict(), file)

        # Drop the original objects so nothing can leak into the reload
        del tw
        del model

        # Rebuild from scratch; note fit() is called without documents here
        tw = build_textwiser()
        tw.fit()
        model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)

        # Rewind before reading back what was just written
        file.seek(0)
        restored_state = torch.load(file, map_location=device)
        model.load_state_dict(restored_state)

        # The restored model must reproduce the reference output
        predicted = model(docs)
        self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
def test_dtype(self):
    # The dtype requested at construction must be the dtype of what
    # fit_transform returns, for torch and numpy alike, both with a max-pool
    # transformation and with no transformation at all.
    word_kwargs = dict(word_option=WordOptions.word2vec, pretrained='en-turian')
    requested_dtypes = (torch.float32, np.float32)

    # With pooling: dtype is read directly off the returned object
    for requested in requested_dtypes:
        tw = TextWiser(Embedding.Word(**word_kwargs),
                       Transformation.Pool(pool_option=PoolOptions.max),
                       dtype=requested)
        predicted = tw.fit_transform(docs)
        self.assertEqual(predicted.dtype, requested)

    # Without any transformation: dtype is checked on the first item of the
    # result; warnings raised along the way are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for requested in requested_dtypes:
            tw = TextWiser(Embedding.Word(**word_kwargs), dtype=requested)
            predicted = tw.fit_transform(docs)
            self.assertEqual(predicted[0].dtype, requested)
def _test_index(self, pool_option):
    # Shared helper: pooling docs[0] with ``pool_option`` must give the same
    # values as taking one row of the un-pooled output -- row 0 for
    # PoolOptions.first, row -1 for any other option.
    if pool_option == PoolOptions.first:
        index = 0
    else:
        index = -1

    # Un-pooled reference row, reshaped to a single-row 2-D tensor
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(
            Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
            dtype=torch.float32,
        )
        unpooled = tw.fit_transform(docs[0])
        expected = unpooled[0][index].view(1, -1)

    # Same embedding, now with the pooling transformation under test
    tw = TextWiser(
        Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
        Transformation.Pool(pool_option=pool_option),
        dtype=torch.float32,
    )
    pooled = tw.fit_transform(docs[0])

    same_values = torch.allclose(expected.to(device), pooled.to(device))
    self.assertTrue(same_values)
def test_list_handling(self):
    # Word embeddings followed by SVD, with no pooling step: the result of
    # fit_transform is iterated item by item and each item is compared with a
    # hard-coded reference tensor.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       Transformation.SVD(n_components=2), dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        expected = [
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.6929296255, 0.1793890595],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.7170552015, 0.0105144158],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.8687936068, -0.9333068132],
                          [-0.6859120131, 0.0732812732],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
        ]
        # zip() stops at the shorter input, so without this check a result
        # with missing items (even an empty one) would pass the loop below.
        self.assertEqual(len(predicted), len(expected))
        for p, e in zip(predicted, expected):
            self.assertTrue(torch.allclose(p, e.to(device), atol=1e-6))
def test_fit_transform(self):
    # Max-pooled word2vec output is checked against the reference values
    # stored in data/pooled_embeddings.csv, through the two shared helpers
    # (_test_fit_transform and _test_fit_before_transform).
    pooling = Transformation.Pool(pool_option=PoolOptions.max)
    word_embedding = Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian')
    tw = TextWiser(word_embedding, pooling, dtype=torch.float32)

    # Load the reference matrix as float32 and wrap it as a torch tensor
    reference_path = self._get_test_path('data', 'pooled_embeddings.csv')
    reference = np.genfromtxt(reference_path, dtype=np.float32)
    expected = torch.from_numpy(reference)

    self._test_fit_transform(tw, expected)
    self._test_fit_before_transform(tw, expected)
def test_lazy_load(self):
    # With lazy_load=True the ``_imp`` attribute must be None right after
    # construction and non-None once the fitting method has been called.
    # Each case: (extra constructor kwargs, name of the method that fits).
    cases = (
        ({}, 'fit'),
        ({'dtype': torch.float32, 'is_finetuneable': True}, 'fit_transform'),
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for extra_kwargs, fit_method in cases:
            tw = TextWiser(
                Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                lazy_load=True,
                **extra_kwargs,
            )
            self.assertIsNone(tw._imp)
            getattr(tw, fit_method)(docs)
            self.assertIsNotNone(tw._imp)
def test_finetune_validation(self):
    # Checks which embedding / transformation / dtype combinations are
    # accepted when is_finetuneable=True. Rejections are expected to raise
    # (TypeError for a numpy dtype, ValueError otherwise); accepted
    # combinations must construct without raising ValueError.

    # Nothing is fine-tuneable if dtype is numpy
    with self.assertRaises(TypeError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Hyphenated 'en-turian', same as the other cases in this test, so
            # the only thing wrong with this call is the numpy dtype.
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                      dtype=np.float32, is_finetuneable=True)

    # Word2Vec is fine-tuneable
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("Word2vec is fine tuneable")

    # ELMo is not fine-tuneable, and should raise an error
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    # TfIdf is not fine-tuneable, and should raise an error
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(), dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    # TfIdf is not fine-tuneable, but SVD is
    try:
        TextWiser(Embedding.TfIdf(), Transformation.SVD(),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")

    # LDA cannot propagate gradients, so the whole thing is not fine-tuneable
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en'),
                      Transformation.LDA(),
                      dtype=torch.float32, is_finetuneable=True, lazy_load=True)

    # Compound schema: a word2vec + max-pool branch concatenated with a
    # tfidf + NMF branch
    schema = {
        'concat': [{
            'transform': [('word2vec', {
                'pretrained': 'en-turian'
            }), ('pool', {
                'pool_option': 'max'
            })]
        }, {
            'transform': ['tfidf', ('nmf', {
                'n_components': 30
            })]
        }]
    }

    # Word2Vec is fine-tuneable, therefore the whole schema is fine-tuneable
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail(
            "Any fine-tuneable weights is enough for the model to be fine-tuneable"
        )

    # TfIdf is not fine-tuneable, but SVD is
    schema = {'transform': ['tfidf', 'svd']}
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32, is_finetuneable=True, lazy_load=True)
    except ValueError:
        self.fail("SVD is fine tuneable")