Exemplo n.º 1
0
 def test_set_params(self):
     """Exercise ``TextWiser.set_params`` with double-underscore paths.

     Covers four targets in turn: attributes of the embedding and
     transformation objects, an attribute reached through ``_imp`` after
     fitting, entries nested inside a ``Compound`` schema, and wholesale
     replacement of schema parts.
     """
     # Set the arguments in container classes
     tw = TextWiser(Embedding.TfIdf(min_df=5),
                    Transformation.NMF(n_components=30),
                    lazy_load=True)
     tw.set_params(embedding__min_df=10,
                   transformations__0__n_components=10)
     self.assertEqual(tw.embedding.min_df, 10)
     self.assertEqual(tw.transformations[0].n_components, 10)

     # Set the arguments in implementation
     tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1,
                                      workers=1))
     tw.fit(docs)
     tw.set_params(_imp__0__seed=10)
     self.assertEqual(tw._imp[0].seed, 10)

     # Set the arguments in a schema
     schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
     tw = TextWiser(Embedding.Compound(schema=schema))
     tw.set_params(embedding__schema__transform__0__min_df=10,
                   embedding__schema__transform__1__n_components=10)
     self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
     self.assertEqual(
         tw.embedding.schema['transform'][1][1]['n_components'], 10)

     # Replace a part of the schema in a list
     tw.set_params(embedding__schema__transform__0='bow')
     self.assertEqual(tw.embedding.schema['transform'][0], 'bow')

     # Replace a part of the schema
     tw.set_params(embedding__schema__transform=['bow'])
     # Element 0 was already 'bow' after the previous step, so checking it
     # alone cannot detect a no-op. The two-entry list must now hold
     # exactly one entry.
     self.assertEqual(len(tw.embedding.schema['transform']), 1)
     self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
Exemplo n.º 2
0
 def _test_schema(self, schema):
     """Run the shared fit/transform checks on a ``Compound`` embedding.

     Builds a ``TextWiser`` from ``schema`` with a float32 torch dtype and
     passes it, with a fixed 3x3 reference tensor and ``atol=1e-4``, to
     ``_test_fit_transform`` and then ``_test_fit_before_transform``.
     """
     model = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
     reference_rows = [
         [-1.5983865261, 1.8820908070, 0.1802073568],
         [-1.8616025448, -0.4420224428, -0.9159017205],
         [-2.0401582718, -1.0712100267, 0.6945561171],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)
     tolerance = 1e-4

     self._test_fit_transform(model, expected, atol=tolerance)
     # Presumably restores the RNG state so the second check starts from
     # the same seed -- the helper is defined outside this view.
     self._reset_seed()
     self._test_fit_before_transform(model, expected, atol=tolerance)
Exemplo n.º 3
0
 def _test_schema(self, schema):
     """Run the shared fit/transform checks on a ``Compound`` embedding.

     Builds a ``TextWiser`` from ``schema`` with a float32 torch dtype and
     passes it, with a fixed 3x3 reference tensor, to
     ``_test_fit_transform`` and then ``_test_fit_before_transform``
     using those helpers' default tolerance.
     """
     model = TextWiser(Embedding.Compound(schema=schema), dtype=torch.float32)
     reference_rows = [
         [-1.5983779430, 1.8820992708, 0.1802130789],
         [-1.8616007566, -0.4420076311, -0.9159148335],
         [-2.0401744843, -1.0712141991, 0.6945576668],
     ]
     expected = torch.tensor(reference_rows, dtype=torch.float32)

     self._test_fit_transform(model, expected)
     # Presumably restores the RNG state so the second check starts from
     # the same seed -- the helper is defined outside this view.
     self._reset_seed()
     self._test_fit_before_transform(model, expected)
Exemplo n.º 4
0
 def test_immutable_schema(self):
     """Mutating the caller's schema must not change the embedding's schema.

     After ``Embedding.Compound`` is built, the original nested options
     dict is modified in place; the schema held by the embedding is
     expected to still carry the value it was created with.
     """
     pool_options = {"pool_option": "max"}
     word_options = {"word_option": "word2vec", "pretrained": "en-turian"}
     schema = {"transform": [["word", word_options], ["pool", pool_options]]}

     emb = Embedding.Compound(schema=schema)

     # ``pool_options`` is the very object stored at
     # schema['transform'][1][1], so this mutates the caller-side schema.
     pool_options['pool_option'] = 'min'

     stored_pool_option = emb.schema['transform'][1][1]['pool_option']
     self.assertEqual(stored_pool_option, 'max')
Exemplo n.º 5
0
    def test_finetune_validation(self):
        """Check which configurations accept ``is_finetuneable=True``.

        Each case constructs a ``TextWiser``. Cases that should be rejected
        are wrapped in ``assertRaises``; cases that should be accepted fail
        the test with an explanatory message if a ``ValueError`` is raised.
        """
        # Keyword arguments shared by every torch-based case below.
        torch_finetune = {
            'dtype': torch.float32,
            'is_finetuneable': True,
            'lazy_load': True,
        }

        # Nothing is fine-tuneable if dtype is numpy.
        # NOTE(review): 'en_turian' (underscore) differs from the
        # 'en-turian' used in the other cases -- confirm it is intentional.
        with self.assertRaises(TypeError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(
                Embedding.Word(word_option=WordOptions.word2vec,
                               pretrained='en_turian'),
                dtype=np.float32,
                is_finetuneable=True
            )

        # Word2Vec is fine-tuneable
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                TextWiser(
                    Embedding.Word(word_option=WordOptions.word2vec,
                                   pretrained='en-turian'),
                    **torch_finetune
                )
        except ValueError:
            self.fail("Word2vec is fine tuneable")

        # ELMo is not fine-tuneable, and should raise an error
        with self.assertRaises(ValueError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(Embedding.Word(word_option=WordOptions.elmo),
                      **torch_finetune)

        # TfIdf is not fine-tuneable, and should raise an error
        with self.assertRaises(ValueError):
            TextWiser(Embedding.TfIdf(), **torch_finetune)

        # TfIdf is not fine-tuneable, but SVD is
        try:
            TextWiser(Embedding.TfIdf(), Transformation.SVD(),
                      **torch_finetune)
        except ValueError:
            self.fail("SVD is fine tuneable")

        # LDA cannot propagate gradients, so the whole thing is not
        # fine-tuneable
        with self.assertRaises(ValueError), warnings.catch_warnings():
            warnings.simplefilter("ignore")
            TextWiser(
                Embedding.Word(word_option=WordOptions.word2vec,
                               pretrained='en'),
                Transformation.LDA(),
                **torch_finetune
            )

        # Two concatenated branches: pooled word2vec, and tfidf + nmf.
        word_branch = {
            'transform': [
                ('word2vec', {'pretrained': 'en-turian'}),
                ('pool', {'pool_option': 'max'}),
            ]
        }
        tfidf_branch = {
            'transform': ['tfidf', ('nmf', {'n_components': 30})]
        }
        concat_schema = {'concat': [word_branch, tfidf_branch]}

        # Word2Vec is fine-tuneable, therefore the whole schema is
        # fine-tuneable
        try:
            TextWiser(Embedding.Compound(schema=concat_schema),
                      **torch_finetune)
        except ValueError:
            self.fail(
                "Any fine-tuneable weights is enough for the model to be fine-tuneable"
            )

        # TfIdf is not fine-tuneable, but SVD is
        svd_schema = {'transform': ['tfidf', 'svd']}
        try:
            TextWiser(Embedding.Compound(schema=svd_schema),
                      **torch_finetune)
        except ValueError:
            self.fail("SVD is fine tuneable")
Exemplo n.º 6
0
 def test_no_pretrained(self):
     """A ``Compound`` embedding given ``pretrained`` must raise ValueError.

     Both the embedding construction and the ``TextWiser`` construction
     sit inside the ``assertRaises`` block, so the error may come from
     either call.
     """
     with self.assertRaises(ValueError):
         compound = Embedding.Compound(schema='tfidf', pretrained='path')
         TextWiser(compound, dtype=torch.float32)