示例#1
0
 def test_save_load(self):
     """Check that a saved ``state_dict`` restores identical predictions.

     A word2vec -> SVD -> mean-pool TextWiser is stacked with a linear layer,
     its state is written to a temporary file, and a freshly built copy of
     the model must reproduce the original outputs after loading that state.
     """
     def build_model(*fit_args):
         # Same architecture on every call; only the arguments to fit differ
         word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian')
         steps = [
             Transformation.SVD(n_components=2),
             Transformation.Pool(pool_option=PoolOptions.mean),
         ]
         featurizer = TextWiser(word_embedding, steps, dtype=torch.float32)
         featurizer.fit(*fit_args)
         return nn.Sequential(featurizer, nn.Linear(2, 1)).to(device)

     # Reference outputs come from a model fitted on the documents
     original = build_model(docs)
     expected = original(docs)
     with NamedTemporaryFile() as file:
         # The open file object itself is handed to torch.save
         torch.save(original.state_dict(), file)
         # Drop the original model so only the saved state remains
         del original
         # Rebuild the same architecture; fit is called without data here
         restored = build_model()
         # Rewind before reading the state back
         file.seek(0)
         restored.load_state_dict(torch.load(file, map_location=device))
         # The restored model has to agree with the original one
         predicted = restored(docs)
         self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
示例#2
0
 def test_save_load(self):
     """Check that a USE-based model survives a ``state_dict`` round trip.

     The saved state must not contain the ``'0._imp.0.use'`` key, and a
     freshly built model must reproduce the original predictions after
     loading that state. A ``ModuleNotFoundError`` raised anywhere in the
     body is treated as Tensorflow being unavailable and the test is
     reported as skipped instead of silently passing.
     """
     try:
         os.environ[
             'TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut tensorflow up during testing
         # Create a model with a downstream task
         tw = TextWiser(Embedding.USE(), dtype=torch.float32).fit(docs)
         model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
         # Get results of the model
         expected = model(docs)
         # Save the model to a temporary file
         with NamedTemporaryFile() as file:
             state_dict = model.state_dict()
             self.assertNotIn('0._imp.0.use', state_dict)
             # The open file object itself is handed to torch.save
             torch.save(state_dict, file)
             # Get rid of the original model
             del tw
             del model
             # Create the same model; fit is called without data here
             tw = TextWiser(Embedding.USE(), dtype=torch.float32)
             tw.fit()
             model = nn.Sequential(tw, nn.Linear(512, 1)).to(device)
             # Load the model from file
             file.seek(0)
             model.load_state_dict(torch.load(file, map_location=device))
             # Do predictions with the loaded model
             predicted = model(docs)
             self.assertTrue(torch.allclose(predicted, expected, atol=1e-6))
     except ModuleNotFoundError:
         # Report a real skip so a missing dependency is visible in results
         self.skipTest('No Tensorflow found. Skipping the test.')
示例#3
0
    def test_deterministic_transform(self):
        """Specifying the `deterministic` option should make Doc2Vec transformation deterministic.

        By default, running inference with doc2vec is not deterministic in gensim.
        This test makes sure we can get a deterministic result when necessary,
        once with the `pretrained` argument left out and once with an explicit
        `pretrained=None`.
        """
        doc2vec_kwargs = dict(deterministic=True,
                              seed=1234,
                              vector_size=2,
                              min_count=1,
                              workers=1,
                              sample=0,
                              negative=0,
                              hashfxn=det_hash)
        reference_rows = [
            [0.0471987687, 0.0309393797],
            [-0.0278387405, -0.2347375602],
            [0.1042766869, -0.0033877781],
        ]
        expected = torch.tensor(reference_rows, dtype=torch.float32)

        # `pretrained` not passed at all
        implicit = TextWiser(Embedding.Doc2Vec(**doc2vec_kwargs),
                             dtype=torch.float32)
        self._test_fit_before_transform(implicit, expected)

        # `pretrained` passed explicitly as None
        explicit = TextWiser(Embedding.Doc2Vec(pretrained=None,
                                               **doc2vec_kwargs),
                             dtype=torch.float32)
        self._test_fit_before_transform(explicit, expected)
示例#4
0
 def test_set_params(self):
     """Exercise ``set_params`` on containers, implementations and schemas."""
     # Arguments of the container classes, on a model built with lazy_load=True
     lazy_tw = TextWiser(Embedding.TfIdf(min_df=5),
                         Transformation.NMF(n_components=30),
                         lazy_load=True)
     lazy_tw.set_params(**{
         'embedding__min_df': 10,
         'transformations__0__n_components': 10,
     })
     self.assertEqual(lazy_tw.embedding.min_df, 10)
     self.assertEqual(lazy_tw.transformations[0].n_components, 10)

     # Arguments of the implementation, on a fitted model
     fitted_tw = TextWiser(
         Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1))
     fitted_tw.fit(docs)
     fitted_tw.set_params(_imp__0__seed=10)
     self.assertEqual(fitted_tw._imp[0].seed, 10)

     # Arguments nested inside a compound schema
     compound_tw = TextWiser(Embedding.Compound(
         schema={'transform': ['tfidf', ['nmf', {'n_components': 30}]]}))

     def transform_steps():
         # Always read the schema afresh; set_params may have replaced it
         return compound_tw.embedding.schema['transform']

     compound_tw.set_params(**{
         'embedding__schema__transform__0__min_df': 10,
         'embedding__schema__transform__1__n_components': 10,
     })
     self.assertEqual(transform_steps()[0][1]['min_df'], 10)
     self.assertEqual(transform_steps()[1][1]['n_components'], 10)

     # Swap out a single entry of the schema list
     compound_tw.set_params(embedding__schema__transform__0='bow')
     self.assertEqual(transform_steps()[0], 'bow')

     # Swap out the whole transform list
     compound_tw.set_params(embedding__schema__transform=['bow'])
     self.assertEqual(transform_steps()[0], 'bow')
示例#5
0
 def test_pretrained(self):
     """Check that a fitted TfIdf vectorizer can be reloaded via ``pretrained``.

     The vectorizer is pickled and handed back both as an open file object
     and as a path on disk; each reloaded model must reproduce the reference
     embeddings. The on-disk pickle is removed even when an assertion fails,
     so a failing run does not leave a stray file in the test data folder.
     """
     tw = TextWiser(Embedding.TfIdf(pretrained=None, min_df=2),
                    dtype=torch.float32)
     expected = torch.tensor(
         [[0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
          [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
          [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000]],
         dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     # Test loading from an open file object
     with NamedTemporaryFile() as file:
         pickle.dump(tw._imp[0].vectorizer, file)
         file.seek(0)
         tw = TextWiser(Embedding.TfIdf(pretrained=file),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     # Test loading from a file path
     file_path = self._get_test_path('data', 'tfidf.pkl')
     try:
         with open(file_path, 'wb') as fp:
             pickle.dump(tw._imp[0].vectorizer, fp)
         tw = TextWiser(Embedding.TfIdf(pretrained=file_path),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     finally:
         # Guard the removal so a failed open() is not masked by a second error
         if os.path.exists(file_path):
             os.remove(file_path)
示例#6
0
 def test_dtype(self):
     """Check that outputs carry the requested torch or numpy dtype.

     Both dtypes are tried with max pooling, where the dtype is read from
     the result itself, and without pooling, where it is read from the
     first element of the result.
     """
     def word_embedding():
         return Embedding.Word(word_option=WordOptions.word2vec,
                               pretrained='en-turian')

     requested_dtypes = (torch.float32, np.float32)

     # Pooled output
     for wanted in requested_dtypes:
         pooling_tw = TextWiser(word_embedding(),
                                Transformation.Pool(pool_option=PoolOptions.max),
                                dtype=wanted)
         pooled = pooling_tw.fit_transform(docs)
         self.assertEqual(pooled.dtype, wanted)

     # Unpooled output; warnings are silenced for these models
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         for wanted in requested_dtypes:
             unpooled_tw = TextWiser(word_embedding(), dtype=wanted)
             per_document = unpooled_tw.fit_transform(docs)
             self.assertEqual(per_document[0].dtype, wanted)
示例#7
0
 def _test_index(self, pool_option):
     """Check that index-based pooling picks the matching unpooled row.

     For ``PoolOptions.first`` the pooled output is compared with row 0 of
     the first element of the unpooled result for ``docs[0]``; for any other
     option it is compared with row -1.
     """
     def word_embedding():
         return Embedding.Word(word_option=WordOptions.word2vec,
                               pretrained='en-turian')

     if pool_option == PoolOptions.first:
         position = 0
     else:
         position = -1

     # Reference row taken from the unpooled embeddings, warnings silenced
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         unpooled_tw = TextWiser(word_embedding(), dtype=torch.float32)
         rows = unpooled_tw.fit_transform(docs[0])[0]
         expected = rows[position].view(1, -1)

     pooling_tw = TextWiser(word_embedding(),
                            Transformation.Pool(pool_option=pool_option),
                            dtype=torch.float32)
     pooled = pooling_tw.fit_transform(docs[0])
     self.assertTrue(torch.allclose(expected.to(device), pooled.to(device)))
示例#8
0
    def test_pretrained_error(self):
        """Invalid ``pretrained`` values for TfIdf must raise ``ValueError``."""
        # First a non-string value, then a string that is not a path
        for bad_value in (3, '|||||||'):
            with self.assertRaises(ValueError):
                TextWiser(Embedding.TfIdf(pretrained=bad_value),
                          dtype=torch.float32)

        # Same non-path string, given directly to the embedding object
        raw_embedding = _TfIdfEmbeddings(pretrained='|||||||')
        with self.assertRaises(ValueError):
            raw_embedding._init_vectorizer()
示例#9
0
    def test_pretrained_error(self):
        """Invalid ``pretrained`` values for Doc2Vec must raise ``ValueError``."""
        # First a non-string value, then a string that is not a path
        for bad_value in (3, '|||||||'):
            with self.assertRaises(ValueError):
                TextWiser(Embedding.Doc2Vec(pretrained=bad_value),
                          dtype=torch.float32)

        # Same non-path string, given directly to the embedding object
        raw_embedding = _Doc2VecEmbeddings(pretrained='|||||||')
        with self.assertRaises(ValueError):
            raw_embedding.fit([])
示例#10
0
    def test_tokenizer_validation(self):
        """Check which custom Doc2Vec tokenizers pass validation.

        A tokenizer that lowercases and splits must be accepted; one that
        returns the lowercased document unsplit and one that returns ``[1]``
        must each raise ``TypeError``.
        """
        def lowercase_and_split(doc):
            return doc.lower().split()

        def lowercase_only(doc):
            return doc.lower()

        def constant_int_list(doc):
            return [1]

        # shouldn't raise an error
        try:
            TextWiser(Embedding.Doc2Vec(tokenizer=lowercase_and_split))
        except TypeError:
            self.fail("This tokenizer should pass the validation.")

        # the first one should raise the first error, the second the second
        for rejected in (lowercase_only, constant_int_list):
            with self.assertRaises(TypeError):
                TextWiser(Embedding.Doc2Vec(tokenizer=rejected))
示例#11
0
 def test_num_components(self):
     """Check that SVD output width follows ``n_components``.

     The natural # of components is 3; the width is checked both when it is
     restricted to 2 and when it is expanded to 200.
     """
     def output_width(n_components):
         tw = TextWiser(Embedding.TfIdf(min_df=2),
                        Transformation.SVD(n_components=n_components),
                        dtype=torch.float32)
         return tw.fit_transform(docs).shape[1]

     # Restrict the # of components
     self.assertEqual(output_width(2), 2)
     self._reset_seed()
     # Expand the # of components
     self.assertEqual(output_width(200), 200)
示例#12
0
 def test_fit_transform(self):
     """Compare max-pooled word2vec embeddings with the stored reference CSV."""
     reference_path = self._get_test_path('data', 'pooled_embeddings.csv')
     reference = np.genfromtxt(reference_path, dtype=np.float32)
     expected = torch.from_numpy(reference)
     word_embedding = Embedding.Word(word_option=WordOptions.word2vec,
                                     pretrained='en-turian')
     max_pooling = Transformation.Pool(pool_option=PoolOptions.max)
     tw = TextWiser(word_embedding, max_pooling, dtype=torch.float32)
     # Run both shared checks against the same reference values
     for check in (self._test_fit_transform, self._test_fit_before_transform):
         check(tw, expected)
示例#13
0
 def test_fine_tuneable(self):
     """Check that one SGD step changes the SVD weights when fine-tuning is on.

     A TfIdf -> SVD model built with ``is_finetuneable=True`` is trained for
     a single step on a dummy regression target; the SVD parameter must
     receive a gradient and its ``V`` matrix must differ afterwards.
     """
     tw = TextWiser(Embedding.TfIdf(min_df=2),
                    Transformation.SVD(n_components=2),
                    dtype=torch.float32,
                    is_finetuneable=True)
     tw.fit(docs)
     svd_layer = tw._imp[1]
     weights_before = svd_layer.V.data.clone().detach()

     # Fake downstream task: one bias-free linear unit on top of the embeddings
     head = nn.Linear(2, 1, bias=False)
     model = nn.Sequential(tw, head).to(device)
     model.train()
     y_pred = model(docs)
     # The target is a tensor of ones
     y_act = torch.ones_like(y_pred)

     # One SGD step on the MSE loss
     optimizer = optim.SGD(model.parameters(), lr=1e-3)
     optimizer.zero_grad()
     loss = nn.MSELoss()(y_pred, y_act)
     loss.backward()

     # The first parameter of the SVD layer should have a gradient now
     _, first_param = next(iter(svd_layer.named_parameters()))
     self.assertIsNotNone(first_param.grad)

     optimizer.step()
     # With fine-tuning enabled the step must have moved the weights
     self.assertFalse(torch.allclose(weights_before, tw._imp[1].V.data))
示例#14
0
 def test_v_in_parameters(self):
     """The fitted model must list ``'_imp.1.V'`` among its named parameters."""
     # Restrict the # of components
     svd = Transformation.SVD(n_components=2)
     tw = TextWiser(Embedding.TfIdf(min_df=2), svd, dtype=torch.float32)
     tw.fit(docs)
     parameter_names = [name for name, _ in tw.named_parameters()]
     self.assertIn('_imp.1.V', parameter_names)
示例#15
0
 def test_list_handling(self):
     """Check SVD applied to unpooled word embeddings, document by document.

     The result is compared element-wise against one reference tensor per
     document. The number of returned elements is asserted first because
     ``zip`` stops at the shorter input, so a result with missing documents
     would otherwise pass unnoticed.
     """
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         tw = TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                       pretrained='en-turian'),
                        Transformation.SVD(n_components=2),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         expected = [
             torch.tensor([[-0.9719871283, 0.0947150663],
                           [-0.3805825114, -1.0427029133],
                           [-0.6929296255, 0.1793890595],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32),
             torch.tensor([[-0.9719871283, 0.0947150663],
                           [-0.3805825114, -1.0427029133],
                           [-0.7170552015, 0.0105144158],
                           [-0.9385635853, 0.6596723199],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32),
             torch.tensor([[-0.8687936068, -0.9333068132],
                           [-0.6859120131, 0.0732812732],
                           [-0.9385635853, 0.6596723199],
                           [0.0000000000, 0.0000000000]],
                          dtype=torch.float32)
         ]
         # Guard against zip() silently dropping unmatched elements
         self.assertEqual(len(predicted), len(expected))
         for p, e in zip(predicted, expected):
             self.assertTrue(torch.allclose(p, e.to(device), atol=1e-6))
示例#16
0
 def _test_schema(self, schema):
     """Fit a compound embedding built from ``schema`` and check its output.

     Both shared checks run with an absolute tolerance of 1e-4, and the seed
     is reset between them.
     """
     reference_rows = [
         [-1.5983865261, 1.8820908070, 0.1802073568],
         [-1.8616025448, -0.4420224428, -0.9159017205],
         [-2.0401582718, -1.0712100267, 0.6945561171],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     compound = Embedding.Compound(schema=schema)
     tw = TextWiser(compound, dtype=torch.float32)
     tolerance = {'atol': 1e-4}
     self._test_fit_transform(tw, expected, **tolerance)
     self._reset_seed()
     self._test_fit_before_transform(tw, expected, **tolerance)
示例#17
0
 def test_lazy_load(self):
     """With ``lazy_load=True`` the ``_imp`` attribute is only set by fitting.

     Checked once through ``fit`` and once through ``fit_transform``, with
     warnings silenced throughout.
     """
     def lazy_word_model(**extra):
         return TextWiser(Embedding.Word(word_option=WordOptions.word2vec,
                                         pretrained='en-turian'),
                          lazy_load=True,
                          **extra)

     with warnings.catch_warnings():
         warnings.simplefilter("ignore")

         # Plain model, loaded through fit
         via_fit = lazy_word_model()
         self.assertIsNone(via_fit._imp)
         via_fit.fit(docs)
         self.assertIsNotNone(via_fit._imp)

         # Fine-tuneable float32 model, loaded through fit_transform
         via_fit_transform = lazy_word_model(dtype=torch.float32,
                                             is_finetuneable=True)
         self.assertIsNone(via_fit_transform._imp)
         via_fit_transform.fit_transform(docs)
         self.assertIsNotNone(via_fit_transform._imp)
示例#18
0
 def test_min_components(self):
     """UMAP with a single component must be rejected with ``ValueError``.

     A ``ModuleNotFoundError`` raised while building the model is treated as
     UMAP being unavailable and the test is reported as skipped instead of
     silently passing.
     """
     try:
         with self.assertRaises(ValueError):
             TextWiser(Embedding.TfIdf(min_df=2),
                       Transformation.UMAP(n_components=1),
                       dtype=torch.float32)
     except ModuleNotFoundError:
         # Report a real skip so a missing dependency is visible in results
         self.skipTest('No UMAP found. Skipping the test.')
示例#19
0
 def test_fit_transform(self):
     """Compare TfIdf embeddings (``min_df=2``) with fixed reference values."""
     reference_rows = [
         [0.4813341796, 0.6198053956, 0.0000000000, 0.6198053956],
         [0.4091228545, 0.5268201828, 0.5268201828, 0.5268201828],
         [0.6133555174, 0.0000000000, 0.7898069024, 0.0000000000],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     tw = TextWiser(Embedding.TfIdf(min_df=2), dtype=torch.float32)
     # Run both shared checks against the same reference values
     for check in (self._test_fit_transform, self._test_fit_before_transform):
         check(tw, expected)
示例#20
0
 def _test_schema(self, schema):
     """Fit a compound embedding built from ``schema`` and check its output.

     Both shared checks run with their default tolerance, and the seed is
     reset between them.
     """
     reference_rows = [
         [-1.5983779430, 1.8820992708, 0.1802130789],
         [-1.8616007566, -0.4420076311, -0.9159148335],
         [-2.0401744843, -1.0712141991, 0.6945576668],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     compound = Embedding.Compound(schema=schema)
     tw = TextWiser(compound, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     self._reset_seed()
     self._test_fit_before_transform(tw, expected)
示例#21
0
 def test_fit_transform(self):
     """Compare Random embeddings under seed 12345 with fixed reference values."""
     # The seed is fixed before the model is built
     self._reset_seed(seed=12345)
     tw = TextWiser(Embedding.Random(), dtype=torch.float32)
     reference_rows = [
         [1., 0., 1., 0., 0., 1.],
         [1., 0., 0., 1., 0., 1.],
         [0., 1., 0., 1., 1., 0.],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     self._test_fit_before_transform(tw, expected)
示例#22
0
 def test_immutable_schema(self):
     """Mutating the caller's schema must not change the embedding's schema."""
     word_params = {"word_option": "word2vec", "pretrained": "en-turian"}
     pool_params = {"pool_option": "max"}
     schema = {"transform": [["word", word_params], ["pool", pool_params]]}
     emb = Embedding.Compound(schema=schema)
     # pool_params is the very dict nested in `schema`, so this edits the
     # caller's schema in place
     pool_params['pool_option'] = 'min'
     stored_pool_params = emb.schema['transform'][1][1]
     self.assertEqual(stored_pool_params['pool_option'], 'max')
示例#23
0
 def test_fit_transform(self):
     """Compare TfIdf -> SVD (2 components) output with fixed reference values.

     The seed is reset between the two shared checks.
     """
     reference_rows = [
         [-0.8526761532, 0.5070778131],
         [-0.9837458134, 0.0636523664],
         [-0.7350711226, -0.6733918786],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     svd = Transformation.SVD(n_components=2)
     tw = TextWiser(Embedding.TfIdf(min_df=2), svd, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     self._reset_seed()
     self._test_fit_before_transform(tw, expected)
示例#24
0
 def test_fit_transform(self):
     """Compare TfIdf -> LDA (2 components) output with fixed reference values.

     The seed is reset between the two shared checks.
     """
     reference_rows = [
         [0.7724367976, 0.2275632024],
         [0.5895692706, 0.4104307294],
         [0.2381444573, 0.7618555427],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     lda = Transformation.LDA(n_components=2)
     tw = TextWiser(Embedding.TfIdf(min_df=2), lda, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     self._reset_seed()
     self._test_fit_before_transform(tw, expected)
示例#25
0
 def test_fit_transform(self):
     """Compare TfIdf -> NMF (2 components) output with fixed reference values.

     The seed is reset between the two shared checks; the second check runs
     with an absolute tolerance of 1e-5.
     """
     reference_rows = [
         [0.8865839243, 0.0000000000],
         [0.6736079454, 0.5221673250],
         [0.0203559380, 1.1122620106],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     nmf = Transformation.NMF(n_components=2)
     tw = TextWiser(Embedding.TfIdf(min_df=2), nmf, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
     self._reset_seed()
     self._test_fit_before_transform(tw, expected, atol=1e-5)
示例#26
0
 def test_pretrained_error(self):
     """USE built with ``pretrained=None`` must raise ``ValueError``.

     A ``ModuleNotFoundError`` raised while building the model is treated as
     Tensorflow being unavailable and the test is reported as skipped instead
     of silently passing.
     """
     try:
         os.environ[
             'TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut tensorflow up during testing
         # Not a pretrained model
         with self.assertRaises(ValueError):
             TextWiser(Embedding.USE(pretrained=None), dtype=torch.float32)
     except ModuleNotFoundError:
         # Report a real skip so a missing dependency is visible in results
         self.skipTest('No Tensorflow found. Skipping the test.')
示例#27
0
 def test_use_versions(self):
     """Tests if the previous versions of USE are useable.

     Versions 5, 4 and 3 of the large encoder are each run through
     ``fit_transform`` in that order; the test only requires that no error is
     raised. A ``ModuleNotFoundError`` is treated as Tensorflow being
     unavailable and the test is reported as skipped instead of silently
     passing.
     """
     model_urls = (
         'https://tfhub.dev/google/universal-sentence-encoder-large/5',
         'https://tfhub.dev/google/universal-sentence-encoder-large/4',
         'https://tfhub.dev/google/universal-sentence-encoder-large/3',
     )
     try:
         os.environ[
             'TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut tensorflow up during testing
         for url in model_urls:
             TextWiser(Embedding.USE(pretrained=url),
                       dtype=torch.float32).fit_transform(docs)
     except ModuleNotFoundError:
         # Report a real skip so a missing dependency is visible in results
         self.skipTest('No Tensorflow found. Skipping the test.')
示例#28
0
 def test_pretrained(self):
     """Check that a fitted Doc2Vec model can be reloaded via ``pretrained``.

     The model is pickled and handed back both as an open file object and as
     a path on disk; each reloaded model must reproduce the reference
     embeddings. The on-disk pickle is removed even when an assertion fails,
     so a failing run does not leave a stray file in the test data folder.
     """
     tw = TextWiser(Embedding.Doc2Vec(deterministic=True,
                                      seed=1234,
                                      vector_size=2,
                                      min_count=1,
                                      workers=1,
                                      sample=0,
                                      negative=0,
                                      hashfxn=det_hash),
                    dtype=torch.float32)
     expected = torch.tensor(
         [[0.0471987687, 0.0309393797], [-0.0278387405, -0.2347375602],
          [0.1042766869, -0.0033877781]],
         dtype=torch.float32)
     self._test_fit_before_transform(tw, expected)
     # Test loading from an open file object
     with NamedTemporaryFile() as file:
         pickle.dump(tw._imp[0].model, file)
         file.seek(0)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     # Test loading from a file path
     file_path = self._get_test_path('data', 'doc2vec.pkl')
     try:
         with open(file_path, 'wb') as fp:
             pickle.dump(tw._imp[0].model, fp)
         tw = TextWiser(Embedding.Doc2Vec(pretrained=file_path,
                                          deterministic=True,
                                          seed=1234),
                        dtype=torch.float32)
         predicted = tw.fit_transform(docs)
         self.assertTrue(
             torch.allclose(predicted, expected.to(device), atol=1e-6))
     finally:
         # Guard the removal so a failed open() is not masked by a second error
         if os.path.exists(file_path):
             os.remove(file_path)
示例#29
0
 def test_fit_transform(self):
     """Compare seeded Doc2Vec ``fit_transform`` output with reference values."""
     doc2vec_kwargs = dict(seed=1234,
                           vector_size=2,
                           min_count=1,
                           workers=1,
                           sample=0,
                           negative=0,
                           hashfxn=det_hash)
     tw = TextWiser(Embedding.Doc2Vec(**doc2vec_kwargs), dtype=torch.float32)
     reference_rows = [
         [0.0471987687, 0.0309393797],
         [-0.0278387405, -0.2347375602],
         [0.1042766869, -0.0033877781],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     self._test_fit_transform(tw, expected)
示例#30
0
 def test_fit_transform(self):
     """Compare USE embeddings with the stored reference CSV.

     A ``ModuleNotFoundError`` raised anywhere in the body is treated as
     Tensorflow being unavailable and the test is reported as skipped instead
     of silently passing.
     """
     try:
         os.environ[
             'TF_CPP_MIN_LOG_LEVEL'] = '3'  # shut tensorflow up during testing
         tw = TextWiser(Embedding.USE(), dtype=torch.float32)
         expected = torch.from_numpy(
             np.genfromtxt(self._get_test_path('data',
                                               'use_embeddings.csv'),
                           dtype=np.float32))
         self._test_fit_transform(tw, expected)
         self._test_fit_before_transform(tw, expected)
     except ModuleNotFoundError:
         # Report a real skip so a missing dependency is visible in results
         self.skipTest('No Tensorflow found. Skipping the test.')