예제 #1
0
 def test_fine_tuneable(self):
     """Verify that one SGD step changes the weights of a fine-tuneable model.

     Fits a TfIdf + SVD ``TextWiser`` built with ``is_finetuneable=True``,
     runs a single optimization step against a dummy regression target, and
     asserts that the first named parameter of ``tw._imp[1]`` received a
     gradient and that ``tw._imp[1].V`` no longer matches the snapshot taken
     right after fitting.
     """
     tw = TextWiser(Embedding.TfIdf(min_df=2),
                    Transformation.SVD(n_components=2),
                    dtype=torch.float32,
                    is_finetuneable=True)
     tw.fit(docs)
     # Snapshot of V taken before any training happens
     v_before = tw._imp[1].V.data.clone().detach()

     # Dummy downstream task: a bias-free linear head with a single output
     head = nn.Linear(2, 1, bias=False)
     model = nn.Sequential(tw, head).to(device).train()
     y_pred = model(docs)
     # Regress every prediction towards one
     y_act = torch.ones_like(y_pred)

     # MSE objective optimized with plain SGD
     criterion = nn.MSELoss()
     optimizer = optim.SGD(model.parameters(), lr=1e-3)

     # Single backward pass to populate the gradients
     optimizer.zero_grad()
     criterion(y_pred, y_act).backward()

     # The first named parameter of the second stage must now hold a gradient
     _, first_param = list(tw._imp[1].named_parameters())[0]
     self.assertIsNotNone(first_param.grad)

     # Apply the update; V must have moved away from the snapshot
     optimizer.step()
     v_unchanged = torch.allclose(v_before, tw._imp[1].V.data)
     self.assertFalse(v_unchanged)
예제 #2
0
 def test_v_in_parameters(self):
     """Check that a fitted TfIdf + SVD model exposes ``_imp.1.V`` as a named parameter."""
     n_dims = 2  # Keep the number of SVD components small
     embedding = Embedding.TfIdf(min_df=2)
     svd = Transformation.SVD(n_components=n_dims)
     tw = TextWiser(embedding, svd, dtype=torch.float32)
     tw.fit(docs)
     # Only the parameter names matter for this check
     param_names = [name for name, _ in tw.named_parameters()]
     self.assertIn('_imp.1.V', param_names)
예제 #3
0
 def test_save_load(self):
     """Round-trip a USE-based model through ``state_dict`` save and load.

     Builds ``TextWiser(Embedding.USE())`` followed by a ``Linear(512, 1)``
     head, records its output on ``docs``, writes the state dict to a
     temporary file, rebuilds the same architecture, loads the saved state
     and asserts that the reloaded model reproduces the recorded output
     within an absolute tolerance of ``1e-6``. The state dict is also
     required not to contain the key ``'0._imp.0.use'``.

     If a ``ModuleNotFoundError`` is raised anywhere in the body (presumably
     because TensorFlow is not installed), the test is reported as skipped
     so the runner does not count an unexecuted test as a pass.
     """
     try:
         os.environ[
             'TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut tensorflow up during testing
         # Create a model with a downstream task
         tw = TextWiser(Embedding.USE(), dtype=torch.float32).fit(docs)
         model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
         # Get results of the model
         expected = model(docs)
         # Save the model to a temporary file
         with NamedTemporaryFile() as file:
             state_dict = model.state_dict()
             self.assertNotIn('0._imp.0.use', state_dict)
             # Serialize into the open temporary file object
             torch.save(state_dict, file)
             # Get rid of the original model
             del tw
             del model
             # Create the same model; fit is called without documents here
             tw = TextWiser(Embedding.USE(), dtype=torch.float32)
             tw.fit()
             model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
             # Rewind so torch.load reads from the start of the file
             file.seek(0)
             model.load_state_dict(torch.load(file, map_location=device))
             # Do predictions with the loaded model
             predicted = model(docs)
             self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
     except ModuleNotFoundError as err:
         # Report a real skip instead of printing and passing silently
         self.skipTest(
             'No Tensorflow found. Skipping the test. ({})'.format(err))
예제 #4
0
 def test_save_load(self):
     """Round-trip a word2vec + SVD + mean-pool model through ``state_dict``.

     The model output on ``docs`` is recorded, the state dict is written to
     a temporary file, an identically configured model is rebuilt and loaded
     from that file, and its output must match the recorded one within an
     absolute tolerance of ``1e-6``.
     """
     def build_textwiser():
         # The original and the restored model share this exact configuration
         return TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian'),
                          [
                              Transformation.SVD(n_components=2),
                              Transformation.Pool(pool_option=PoolOptions.mean)
                          ],
                          dtype=torch.float32)

     # Model with a downstream linear head
     tw = build_textwiser()
     tw.fit(docs)
     model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
     # Reference output to compare against after reloading
     expected = model(docs)
     with NamedTemporaryFile() as tmp:
         # Serialize the weights into the open temporary file
         torch.save(model.state_dict(), tmp)
         # Drop the original objects before rebuilding
         del tw
         del model
         # Rebuild the same architecture; fit is called without documents here
         tw = build_textwiser()
         tw.fit()
         model = nn.Sequential(tw, nn.Linear(2, 1)).to(device)
         # Rewind and restore the saved weights
         tmp.seek(0)
         restored_state = torch.load(tmp, map_location=device)
         model.load_state_dict(restored_state)
         # The restored model must reproduce the reference output
         predicted = model(docs)
         outputs_match = torch.allclose(predicted, expected, atol=1e-6)
         self.assertTrue(outputs_match)
예제 #5
0
 def test_set_params(self):
     """Exercise ``set_params`` on containers, implementations and schemas.

     Covers nested parameters of the embedding/transformation containers,
     a parameter of the fitted implementation, parameters nested inside a
     compound schema, and replacing either one entry or the whole list of a
     schema.
     """
     # Nested parameters on the embedding / transformation containers
     wiser = TextWiser(Embedding.TfIdf(min_df=5),
                       Transformation.NMF(n_components=30),
                       lazy_load=True)
     container_updates = {
         'embedding__min_df': 10,
         'transformations__0__n_components': 10,
     }
     wiser.set_params(**container_updates)
     self.assertEqual(wiser.embedding.min_df, 10)
     self.assertEqual(wiser.transformations[0].n_components, 10)

     # Parameter on the fitted implementation
     doc2vec = Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1)
     wiser = TextWiser(doc2vec)
     wiser.fit(docs)
     wiser.set_params(_imp__0__seed=10)
     self.assertEqual(wiser._imp[0].seed, 10)

     # Parameters nested inside a compound schema
     schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
     wiser = TextWiser(Embedding.Compound(schema=schema))
     schema_updates = {
         'embedding__schema__transform__0__min_df': 10,
         'embedding__schema__transform__1__n_components': 10,
     }
     wiser.set_params(**schema_updates)
     steps = wiser.embedding.schema['transform']
     self.assertEqual(steps[0][1]['min_df'], 10)
     self.assertEqual(steps[1][1]['n_components'], 10)

     # Swap out a single entry of the schema list
     wiser.set_params(embedding__schema__transform__0='bow')
     self.assertEqual(wiser.embedding.schema['transform'][0], 'bow')

     # Swap out the whole schema list
     wiser.set_params(embedding__schema__transform=['bow'])
     self.assertEqual(wiser.embedding.schema['transform'][0], 'bow')
예제 #6
0
 def test_lazy_load(self):
     """Check that ``lazy_load=True`` leaves ``_imp`` unset until fitting.

     Runs the check twice with warnings suppressed: once with default
     options via ``fit``, and once with ``dtype=torch.float32`` and
     ``is_finetuneable=True`` via ``fit_transform``.
     """
     def word_embedding():
         # Both scenarios use the same pretrained word2vec embedding
         return Embedding.Word(word_option=WordOptions.word2vec,
                               pretrained='en-turian')

     with warnings.catch_warnings():
         warnings.simplefilter("ignore")

         # Default options, populated through fit
         wiser = TextWiser(word_embedding(), lazy_load=True)
         self.assertIsNone(wiser._imp)
         wiser.fit(docs)
         self.assertIsNotNone(wiser._imp)

         # Fine-tuneable float32 variant, populated through fit_transform
         wiser = TextWiser(word_embedding(),
                           lazy_load=True,
                           dtype=torch.float32,
                           is_finetuneable=True)
         self.assertIsNone(wiser._imp)
         wiser.fit_transform(docs)
         self.assertIsNotNone(wiser._imp)