def test_save_load(self):
    # Create a model with a downstream task
    tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                   [Transformation.SVD(n_components=2),
                    Transformation.Pool(pool_option=PoolOptions.mean)],
                   dtype=torch.float32)
    tw.fit(docs)
    model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
    # Get the results of the model
    expected = model(docs)
    # Save the model to a temporary file
    with NamedTemporaryFile() as file:
        torch.save(model.state_dict(), file)  # could equivalently use the string name of the file
        # Get rid of the original model
        del tw
        del model
        # Create the same model
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       [Transformation.SVD(n_components=2),
                        Transformation.Pool(pool_option=PoolOptions.mean)],
                       dtype=torch.float32)
        tw.fit()  # no docs needed; the embedding is pretrained
        model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
        # Load the model from the file
        file.seek(0)
        model.load_state_dict(torch.load(file, map_location=device))
        # Do predictions with the loaded model
        predicted = model(docs)
        self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))

def test_save_load(self):
    try:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow during testing
        # Create a model with a downstream task
        tw = TextWiser(Embedding.USE(), dtype=torch.float32).fit(docs)
        model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
        # Get the results of the model
        expected = model(docs)
        # Save the model to a temporary file
        with NamedTemporaryFile() as file:
            state_dict = model.state_dict()
            # The USE model itself should not be serialized into the state dict
            self.assertNotIn('0._imp.0.use', state_dict)
            torch.save(state_dict, file)  # could equivalently use the string name of the file
            # Get rid of the original model
            del tw
            del model
            # Create the same model
            tw = TextWiser(Embedding.USE(), dtype=torch.float32)
            tw.fit()  # no docs needed; the embedding is pretrained
            model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
            # Load the model from the file
            file.seek(0)
            model.load_state_dict(torch.load(file, map_location=device))
            # Do predictions with the loaded model
            predicted = model(docs)
            self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
    except ModuleNotFoundError:
        print('No Tensorflow found. Skipping the test. ...', end=" ", flush=True)

def test_deterministic_transform(self):
    """Specifying the `deterministic` option should make the Doc2Vec transformation deterministic.

    By default, running inference with doc2vec in gensim is not deterministic.
    This test makes sure we can get a deterministic result when necessary.
    """
    tw = TextWiser(Embedding.Doc2Vec(deterministic=True, seed=1234, vector_size=2, min_count=1,
                                     workers=1, sample=0, negative=0, hashfxn=det_hash),
                   dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]], dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)
    tw = TextWiser(Embedding.Doc2Vec(pretrained=None, deterministic=True, seed=1234, vector_size=2,
                                     min_count=1, workers=1, sample=0, negative=0, hashfxn=det_hash),
                   dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)

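# `det_hash` is defined elsewhere in the test suite. A minimal sketch of what a
# deterministic hash function passed to gensim's `hashfxn` parameter might look
# like (an assumption for illustration, not the suite's actual implementation):
# Python's builtin `hash` is salted per process, so a stable digest is needed
# for word hashing, and hence vector initialization, to be reproducible.
def _det_hash_sketch(astring):
    import zlib
    # adler32 is stable across runs and processes, unlike the builtin hash
    return zlib.adler32(astring.encode('utf-8'))
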
def test_set_params(self):
    # Set the arguments in container classes
    tw = TextWiser(Embedding.TfIdf(min_df=5), Transformation.NMF(n_components=30), lazy_load=True)
    tw.set_params(embedding__min_df=10, transformations__0__n_components=10)
    self.assertEqual(tw.embedding.min_df, 10)
    self.assertEqual(tw.transformations[0].n_components, 10)
    # Set the arguments in the implementation
    tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1))
    tw.fit(docs)
    tw.set_params(_imp__0__seed=10)
    self.assertEqual(tw._imp[0].seed, 10)
    # Set the arguments in a schema
    schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
    tw = TextWiser(Embedding.Compound(schema=schema))
    tw.set_params(embedding__schema__transform__0__min_df=10,
                  embedding__schema__transform__1__n_components=10)
    self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
    self.assertEqual(tw.embedding.schema['transform'][1][1]['n_components'], 10)
    # Replace a part of the schema in a list
    tw.set_params(embedding__schema__transform__0='bow')
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
    # Replace a part of the schema
    tw.set_params(embedding__schema__transform=['bow'])
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')

def test_pretrained(self):
    tw = TextWiser(Embedding.TfIdf(pretrained=None, min_df=2), dtype=torch.float32)
    expected = torch.tensor([[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
                             [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
                             [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
                            dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    # Test loading from bytes
    with NamedTemporaryFile() as file:
        pickle.dump(tw._imp[0].vectorizer, file)
        file.seek(0)
        tw = TextWiser(Embedding.TfIdf(pretrained=file), dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Test loading from file
    file_path = self._get_test_path('data', 'tfidf.pkl')
    with open(file_path, 'wb') as fp:
        pickle.dump(tw._imp[0].vectorizer, fp)
    tw = TextWiser(Embedding.TfIdf(pretrained=file_path), dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    os.remove(file_path)

def test_dtype(self):
    tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                   Transformation.Pool(pool_option=PoolOptions.max), dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.dtype, torch.float32)
    tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                   Transformation.Pool(pool_option=PoolOptions.max), dtype=np.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.dtype, np.float32)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertEqual(predicted[0].dtype, torch.float32)
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       dtype=np.float32)
        predicted = tw.fit_transform(docs)
        self.assertEqual(predicted[0].dtype, np.float32)

def _test_index(self, pool_option):
    index = 0 if pool_option == PoolOptions.first else -1
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       dtype=torch.float32)
        expected = tw.fit_transform(docs[0])[0][index].view(1, -1)
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       Transformation.Pool(pool_option=pool_option), dtype=torch.float32)
        pooled = tw.fit_transform(docs[0])
        self.assertTrue(torch.allclose(expected.to(device), pooled.to(device)))

def test_pretrained_error(self):
    # Not a string
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(pretrained=3), dtype=torch.float32)
    # Not a path
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(pretrained='|||||||'), dtype=torch.float32)
    # Not a path on the embedding object
    with self.assertRaises(ValueError):
        _TfIdfEmbeddings(pretrained='|||||||')._init_vectorizer()

def test_pretrained_error(self):
    # Not a string
    with self.assertRaises(ValueError):
        TextWiser(Embedding.Doc2Vec(pretrained=3), dtype=torch.float32)
    # Not a path
    with self.assertRaises(ValueError):
        TextWiser(Embedding.Doc2Vec(pretrained='|||||||'), dtype=torch.float32)
    # Not a path on the embedding object
    with self.assertRaises(ValueError):
        _Doc2VecEmbeddings(pretrained='|||||||').fit([])

def test_tokenizer_validation(self):
    # Shouldn't raise an error
    try:
        TextWiser(Embedding.Doc2Vec(tokenizer=lambda doc: doc.lower().split()))
    except TypeError:
        self.fail("This tokenizer should pass the validation.")
    # Should raise the first error: the tokenizer doesn't return a list
    with self.assertRaises(TypeError):
        TextWiser(Embedding.Doc2Vec(tokenizer=lambda doc: doc.lower()))
    # Should raise the second error: the returned list doesn't contain strings
    with self.assertRaises(TypeError):
        TextWiser(Embedding.Doc2Vec(tokenizer=lambda doc: [1]))

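# The two failure modes above suggest the validation runs a sample document
# through the tokenizer and type-checks the result. A minimal sketch of such a
# check (an assumption for illustration; the real validator lives in TextWiser):
def _validate_tokenizer_sketch(tokenizer):
    tokens = tokenizer("Sample document")
    if not isinstance(tokens, list):
        raise TypeError("The tokenizer must return a list of tokens.")
    if not all(isinstance(token, str) for token in tokens):
        raise TypeError("Every token must be a string.")
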
def test_num_components(self):
    # The natural # of components is 3
    n_components = 2  # Restrict the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.shape[1], n_components)
    self._reset_seed()
    n_components = 200  # Expand the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.shape[1], n_components)

def test_fit_transform(self):
    tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                   Transformation.Pool(pool_option=PoolOptions.max), dtype=torch.float32)
    expected = torch.from_numpy(np.genfromtxt(self._get_test_path('data', 'pooled_embeddings.csv'),
                                              dtype=np.float32))
    self._test_fit_transform(tw, expected)
    self._test_fit_before_transform(tw, expected)

def test_fine_tuneable(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=2),
                   dtype=torch.float32, is_finetuneable=True)
    tw.fit(docs)
    embeddings1 = tw._imp[1].V.data.clone().detach()
    # Give a fake task to train the embeddings on:
    # a linear layer with a single output after pooling
    linear = nn.Linear(2, 1, bias=False)
    model = nn.Sequential(tw, linear).to(device).train()
    y_pred = model(docs)
    # Use ones as the target
    y_act = torch.ones_like(y_pred)
    # Optimize MSE using SGD
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    # Calculate the loss & gradients
    optimizer.zero_grad()
    loss = criterion(y_pred, y_act)
    loss.backward()
    # The embedding layer should have gradients now
    self.assertIsNotNone([p for p in tw._imp[1].named_parameters()][0][1].grad)
    # Update the weights
    optimizer.step()
    # The weights should be updated if fine-tuning is enabled, otherwise they should stay the same
    self.assertFalse(torch.allclose(embeddings1, tw._imp[1].V.data))

def test_v_in_parameters(self):
    n_components = 2  # Restrict the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    tw.fit(docs)
    self.assertIn('_imp.1.V', [p[0] for p in tw.named_parameters()])

def test_list_handling(self):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       Transformation.SVD(n_components=2), dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        expected = [
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.6929296255, 0.1793890595],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.9719871283, 0.0947150663],
                          [-0.3805825114, -1.0427029133],
                          [-0.7170552015, 0.0105144158],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
            torch.tensor([[-0.8687936068, -0.9333068132],
                          [-0.6859120131, 0.0732812732],
                          [-0.9385635853, 0.6596723199],
                          [0.0000000000, 0.0000000000]], dtype=torch.float32),
        ]
        for p, e in zip(predicted, expected):
            self.assertTrue(torch.allclose(p, e.to(device), atol=1e-6))

def _test_schema(self, schema):
    tw = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
    expected = torch.tensor([[-1.5983865261, 1.8820908070, 0.1802073568],
                             [-1.8616025448, -0.4420224428, -0.9159017205],
                             [-2.0401582718, -1.0712100267, 0.6945561171]], dtype=torch.float32)
    self._test_fit_transform(tw, expected, atol=1e-4)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected, atol=1e-4)

def test_lazy_load(self):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       lazy_load=True)
        self.assertIsNone(tw._imp)
        tw.fit(docs)
        self.assertIsNotNone(tw._imp)
        tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec, pretrained='en-turian'),
                       lazy_load=True, dtype=torch.float32, is_finetuneable=True)
        self.assertIsNone(tw._imp)
        tw.fit_transform(docs)
        self.assertIsNotNone(tw._imp)

def test_min_components(self):
    try:
        with self.assertRaises(ValueError):
            TextWiser(Embedding.TfIdf(min_df=2), Transformation.UMAP(n_components=1),
                      dtype=torch.float32)
    except ModuleNotFoundError:
        print('No UMAP found. Skipping the test. ...', end=" ", flush=True)

def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), dtype=torch.float32)
    expected = torch.tensor([[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
                             [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
                             [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
                            dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._test_fit_before_transform(tw, expected)

def _test_schema(self, schema):
    tw = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
    expected = torch.tensor([[-1.5983779430, 1.8820992708, 0.1802130789],
                             [-1.8616007566, -0.4420076311, -0.9159148335],
                             [-2.0401744843, -1.0712141991, 0.6945576668]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)

def test_fit_transform(self):
    self._reset_seed(seed=12345)
    tw = TextWiser(Embedding.Random(), dtype=torch.float32)
    expected = torch.tensor([[1., 0., 1., 0., 0., 1.],
                             [1., 0., 0., 1., 0., 1.],
                             [0., 1., 0., 1., 1., 0.]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._test_fit_before_transform(tw, expected)

def test_immutable_schema(self):
    schema = {
        "transform": [
            ["word", {"word_option": "word2vec", "pretrained": "en-turian"}],
            ["pool", {"pool_option": "max"}]
        ]
    }
    emb = Embedding.Compound(schema=schema)
    # Mutating the original schema should not affect the embedding's copy
    schema['transform'][1][1]['pool_option'] = 'min'
    self.assertEqual(emb.schema['transform'][1][1]['pool_option'], 'max')

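# The immutability verified above implies Embedding.Compound defensively copies
# the schema it receives. A minimal sketch of that pattern (an assumption for
# illustration, not TextWiser's actual implementation):
def _copy_schema_sketch(schema):
    from copy import deepcopy
    # A deep copy ensures later mutations of the caller's nested lists and
    # dicts cannot leak into the stored schema
    return deepcopy(schema)
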
def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.SVD(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor([[-0.8526761532, 0.5070778131],
                             [-0.9837458134, 0.0636523664],
                             [-0.7350711226, -0.6733918786]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)

def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.LDA(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor([[0.7724367976, 0.2275632024],
                             [0.5895692706, 0.4104307294],
                             [0.2381444573, 0.7618555427]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)

def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), Transformation.NMF(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor([[0.8865839243, 0.0000000000],
                             [0.6736079454, 0.5221673250],
                             [0.0203559380, 1.1122620106]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected, atol=1e-5)

def test_pretrained_error(self):
    # Not a pretrained model
    try:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow during testing
        with self.assertRaises(ValueError):
            TextWiser(Embedding.USE(pretrained=None), dtype=torch.float32)
    except ModuleNotFoundError:
        print('No Tensorflow found. Skipping the test. ...', end=" ", flush=True)

def test_use_versions(self):
    """Tests that the previous versions of USE are usable."""
    try:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow during testing
        TextWiser(Embedding.USE(pretrained='https://tfhub.dev/google/universal-sentence-encoder-large/5'),
                  dtype=torch.float32).fit_transform(docs)
        TextWiser(Embedding.USE(pretrained='https://tfhub.dev/google/universal-sentence-encoder-large/4'),
                  dtype=torch.float32).fit_transform(docs)
        TextWiser(Embedding.USE(pretrained='https://tfhub.dev/google/universal-sentence-encoder-large/3'),
                  dtype=torch.float32).fit_transform(docs)
    except ModuleNotFoundError:
        print('No Tensorflow found. Skipping the test. ...', end=" ", flush=True)

def test_pretrained(self):
    tw = TextWiser(Embedding.Doc2Vec(deterministic=True, seed=1234, vector_size=2, min_count=1,
                                     workers=1, sample=0, negative=0, hashfxn=det_hash),
                   dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]], dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)
    # Test loading from bytes
    with NamedTemporaryFile() as file:
        pickle.dump(tw._imp[0].model, file)
        file.seek(0)
        tw = TextWiser(Embedding.Doc2Vec(pretrained=file, deterministic=True, seed=1234),
                       dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Test loading from file
    file_path = self._get_test_path('data', 'doc2vec.pkl')
    with open(file_path, 'wb') as fp:
        pickle.dump(tw._imp[0].model, fp)
    tw = TextWiser(Embedding.Doc2Vec(pretrained=file_path, deterministic=True, seed=1234),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertTrue(torch.allclose(predicted, expected.to(device), atol=1e-6))
    os.remove(file_path)

def test_fit_transform(self):
    tw = TextWiser(Embedding.Doc2Vec(seed=1234, vector_size=2, min_count=1, workers=1,
                                     sample=0, negative=0, hashfxn=det_hash),
                   dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]], dtype=torch.float32)
    self._test_fit_transform(tw, expected)

def test_fit_transform(self):
    try:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow during testing
        tw = TextWiser(Embedding.USE(), dtype=torch.float32)
        expected = torch.from_numpy(np.genfromtxt(self._get_test_path('data', 'use_embeddings.csv'),
                                                  dtype=np.float32))
        self._test_fit_transform(tw, expected)
        self._test_fit_before_transform(tw, expected)
    except ModuleNotFoundError:
        print('No Tensorflow found. Skipping the test. ...', end=" ", flush=True)