Example #1
def test_pretrained(self):
    tw = TextWiser(Embedding.TfIdf(pretrained=None, min_df=2),
                   dtype=torch.float32)
    expected = torch.tensor(
        [[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
         [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
         [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    # Test loading from bytes
    with NamedTemporaryFile() as file:
        pickle.dump(tw._imp[0].vectorizer, file)
        file.seek(0)
        tw = TextWiser(Embedding.TfIdf(pretrained=file),
                       dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertTrue(
            torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Test loading from file
    file_path = self._get_test_path('data', 'tfidf.pkl')
    with open(file_path, 'wb') as fp:
        pickle.dump(tw._imp[0].vectorizer, fp)
    tw = TextWiser(Embedding.TfIdf(pretrained=file_path),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertTrue(
        torch.allclose(predicted, expected.to(device), atol=1e-6))
    os.remove(file_path)
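The persistence pattern above also works outside a test harness. A minimal sketch, assuming a placeholder corpus and an arbitrary tfidf.pkl output path; the fitted scikit-learn vectorizer is reached through the same internal _imp[0].vectorizer handle the test uses:

import pickle

import torch
from textwiser import TextWiser, Embedding

docs = ['first document', 'second document', 'third document']  # placeholder corpus

# Fit once, then persist the underlying fitted vectorizer
tw = TextWiser(Embedding.TfIdf(min_df=1), dtype=torch.float32)
tw.fit(docs)
with open('tfidf.pkl', 'wb') as fp:
    pickle.dump(tw._imp[0].vectorizer, fp)

# Later: rebuild the embedding from the pickled vectorizer on disk
tw_loaded = TextWiser(Embedding.TfIdf(pretrained='tfidf.pkl'),
                      dtype=torch.float32)
vecs = tw_loaded.fit_transform(docs)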
Example #2
def test_pretrained_error(self):
    # Not a string
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(pretrained=3), dtype=torch.float32)

    # Not a path
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(pretrained='|||||||'),
                  dtype=torch.float32)

    # Not a path on the embedding object
    with self.assertRaises(ValueError):
        _TfIdfEmbeddings(pretrained='|||||||')._init_vectorizer()
Example #3
def test_num_components(self):
    # The natural # of components is 3.
    n_components = 2  # Restrict the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.shape[1], n_components)
    self._reset_seed()
    n_components = 200  # Expand the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    predicted = tw.fit_transform(docs)
    self.assertEqual(predicted.shape[1], n_components)
Example #4
def test_fine_tuneable(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=2),
                   dtype=torch.float32,
                   is_finetuneable=True)
    tw.fit(docs)
    embeddings1 = tw._imp[1].V.data.clone().detach()
    # Give a fake task to train the embeddings on:
    # a linear layer with a single output after pooling
    linear = nn.Linear(2, 1, bias=False)
    model = nn.Sequential(tw, linear).to(device).train()
    y_pred = model(docs)
    # Use ones as the target
    y_act = torch.ones_like(y_pred)
    # Optimize MSE using SGD
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    # Calculate the loss & gradients
    optimizer.zero_grad()
    loss = criterion(y_pred, y_act)
    loss.backward()
    # The embedding layer should have gradients now
    self.assertIsNotNone(
        list(tw._imp[1].named_parameters())[0][1].grad)
    # Update weights
    optimizer.step()
    # The weights should be updated since fine-tuning is enabled;
    # otherwise they would have stayed the same
    self.assertFalse(torch.allclose(embeddings1, tw._imp[1].V.data))
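The single gradient step above generalizes to a small training loop. A minimal sketch, assuming a placeholder corpus, a dummy all-ones target, and an arbitrary epoch count; it reuses only the pieces exercised by the test (a fine-tuneable TextWiser wrapped in nn.Sequential with a linear head). TfIdf itself is not fine-tuneable, so the trainable weights live in the SVD layer:

import torch
from torch import nn, optim
from textwiser import TextWiser, Embedding, Transformation

docs = ['first document', 'second document', 'third document']  # placeholder corpus

tw = TextWiser(Embedding.TfIdf(min_df=1),
               Transformation.SVD(n_components=2),
               dtype=torch.float32,
               is_finetuneable=True)
tw.fit(docs)
model = nn.Sequential(tw, nn.Linear(2, 1, bias=False)).train()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

for _ in range(10):  # arbitrary number of epochs
    optimizer.zero_grad()
    y_pred = model(docs)  # TextWiser accepts raw documents in forward
    loss = criterion(y_pred, torch.ones_like(y_pred))  # dummy target
    loss.backward()
    optimizer.step()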
Example #5
def test_v_in_parameters(self):
    n_components = 2  # Restrict the # of components
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=n_components),
                   dtype=torch.float32)
    tw.fit(docs)
    self.assertIn('_imp.1.V', [p[0] for p in tw.named_parameters()])
Example #6
def test_set_params(self):
    # Set the arguments in the container classes
    tw = TextWiser(Embedding.TfIdf(min_df=5),
                   Transformation.NMF(n_components=30),
                   lazy_load=True)
    tw.set_params(embedding__min_df=10,
                  transformations__0__n_components=10)
    self.assertEqual(tw.embedding.min_df, 10)
    self.assertEqual(tw.transformations[0].n_components, 10)
    # Set the arguments in the implementation
    tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1,
                                     workers=1))
    tw.fit(docs)
    tw.set_params(_imp__0__seed=10)
    self.assertEqual(tw._imp[0].seed, 10)
    # Set the arguments in a schema
    schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
    tw = TextWiser(Embedding.Compound(schema=schema))
    tw.set_params(embedding__schema__transform__0__min_df=10,
                  embedding__schema__transform__1__n_components=10)
    self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
    self.assertEqual(
        tw.embedding.schema['transform'][1][1]['n_components'], 10)
    # Replace a part of the schema in a list
    tw.set_params(embedding__schema__transform__0='bow')
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
    # Replace a part of the schema
    tw.set_params(embedding__schema__transform=['bow'])
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
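Since set_params follows the scikit-learn double-underscore naming convention shown above, one pipeline object can be reconfigured in place between fits. A minimal sketch, assuming a placeholder corpus, arbitrarily chosen candidate values, and that parameter changes take effect on the next fit:

import torch
from textwiser import TextWiser, Embedding, Transformation

docs = ['first document', 'second document', 'third document']  # placeholder corpus

tw = TextWiser(Embedding.TfIdf(min_df=1),
               Transformation.SVD(n_components=2),
               dtype=torch.float32)

for k in (2, 3):  # arbitrary candidate dimensionalities
    tw.set_params(transformations__0__n_components=k)
    vecs = tw.fit_transform(docs)
    print(k, vecs.shape)  # second dimension should equal k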
Example #7
def test_min_components(self):
    try:
        with self.assertRaises(ValueError):
            TextWiser(Embedding.TfIdf(min_df=2),
                      Transformation.UMAP(n_components=1),
                      dtype=torch.float32)
    except ModuleNotFoundError:
        print('No UMAP found. Skipping the test. ...', end=" ", flush=True)
Example #8
def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2), dtype=torch.float32)
    expected = torch.tensor(
        [[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
         [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
         [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._test_fit_before_transform(tw, expected)
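The docs fixture referenced throughout these examples is defined in the surrounding test code and is not shown here, so the expected values above are specific to that corpus. A standalone equivalent with a hypothetical three-document corpus looks roughly like this (the resulting numbers will differ):

import torch
from textwiser import TextWiser, Embedding

docs = ['some document', 'another document', 'a third document']  # hypothetical corpus

tw = TextWiser(Embedding.TfIdf(min_df=1), dtype=torch.float32)
vecs = tw.fit_transform(docs)
print(vecs.shape)  # (number of documents, vocabulary size)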
Example #9
def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.SVD(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor(
        [[-0.8526761532, 0.5070778131],
         [-0.9837458134, 0.0636523664],
         [-0.7350711226, -0.6733918786]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)
Example #10
def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.NMF(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor(
        [[0.8865839243, 0.0000000000],
         [0.6736079454, 0.5221673250],
         [0.0203559380, 1.1122620106]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected, atol=1e-5)
Example #11
def test_fit_transform(self):
    tw = TextWiser(Embedding.TfIdf(min_df=2),
                   Transformation.LDA(n_components=2),
                   dtype=torch.float32)
    expected = torch.tensor(
        [[0.7724367976, 0.2275632024],
         [0.5895692706, 0.4104307294],
         [0.2381444573, 0.7618555427]],
        dtype=torch.float32)
    self._test_fit_transform(tw, expected)
    self._reset_seed()
    self._test_fit_before_transform(tw, expected)
Example #12
def test_fit_transform(self):
    try:
        tw = TextWiser(Embedding.TfIdf(min_df=1),
                       Transformation.UMAP(init='random',
                                           n_neighbors=2,
                                           n_components=2),
                       dtype=torch.float32)
        expected = torch.tensor(
            [[-12.1613626480, 22.0555286407],
             [-11.3154125214, 22.4605998993],
             [-10.7626724243, 21.6793708801]],
            dtype=torch.float32)
        self._test_fit_transform(tw, expected)
        self._reset_seed()
        self._test_fit_before_transform(tw, expected)
    except ModuleNotFoundError:
        print('No UMAP found. Skipping the test. ...', end=" ", flush=True)
Example #13
def test_min_components(self):
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(min_df=2),
                  Transformation.SVD(n_components=1),
                  dtype=torch.float32)
Example #14
def test_finetune_validation(self):
    # Nothing is fine-tuneable if dtype is numpy
    with self.assertRaises(TypeError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en-turian'),
                      dtype=np.float32,
                      is_finetuneable=True)

    # Word2Vec is fine-tuneable
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en-turian'),
                      dtype=torch.float32,
                      is_finetuneable=True,
                      lazy_load=True)
    except ValueError:
        self.fail("Word2Vec is fine-tuneable")

    # ELMo is not fine-tuneable, and should raise an error
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      dtype=torch.float32,
                      is_finetuneable=True,
                      lazy_load=True)

    # TfIdf is not fine-tuneable, and should raise an error
    with self.assertRaises(ValueError):
        TextWiser(Embedding.TfIdf(),
                  dtype=torch.float32,
                  is_finetuneable=True,
                  lazy_load=True)

    # TfIdf is not fine-tuneable, but SVD is
    try:
        TextWiser(Embedding.TfIdf(),
                  Transformation.SVD(),
                  dtype=torch.float32,
                  is_finetuneable=True,
                  lazy_load=True)
    except ValueError:
        self.fail("SVD is fine-tuneable")

    # LDA cannot propagate gradients, so the whole model is not fine-tuneable
    with self.assertRaises(ValueError):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en'),
                      Transformation.LDA(),
                      dtype=torch.float32,
                      is_finetuneable=True,
                      lazy_load=True)

    schema = {
        'concat': [
            {'transform': [('word2vec', {'pretrained': 'en-turian'}),
                           ('pool', {'pool_option': 'max'})]},
            {'transform': ['tfidf',
                           ('nmf', {'n_components': 30})]}
        ]
    }

    # Word2Vec is fine-tuneable, therefore the whole schema is fine-tuneable
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32,
                  is_finetuneable=True,
                  lazy_load=True)
    except ValueError:
        self.fail("Any fine-tuneable weights are enough "
                  "for the model to be fine-tuneable")

    # TfIdf is not fine-tuneable, but SVD is
    schema = {'transform': ['tfidf', 'svd']}
    try:
        TextWiser(Embedding.Compound(schema=schema),
                  dtype=torch.float32,
                  is_finetuneable=True,
                  lazy_load=True)
    except ValueError:
        self.fail("SVD is fine-tuneable")
Example #15
def test_forward_before_fit(self):
    """Calling `forward` before `fit` should fail"""
    with self.assertRaises(NotImplementedError):
        TextWiser(Embedding.TfIdf()).transform('document')
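In other words, fit must be called before transform. A minimal sketch of the working order, with a placeholder corpus; it assumes a fitted model accepts a single string in transform the same way the test passes one:

import torch
from textwiser import TextWiser, Embedding

docs = ['first document', 'second document']  # placeholder corpus

tw = TextWiser(Embedding.TfIdf(), dtype=torch.float32)
tw.fit(docs)                    # fit first...
vec = tw.transform('document')  # ...then transform succeeds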