def test_deterministic_transform(self):
    """Specifying the `deterministic` option should make Doc2Vec transformation deterministic.

    By default, running inference with doc2vec is not deterministic in gensim.
    This test makes sure we can get a deterministic result when necessary.
    """
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]],
                            dtype=torch.float32)
    # Shared keyword arguments that pin down every source of training randomness.
    d2v_kwargs = dict(deterministic=True, seed=1234, vector_size=2, min_count=1,
                      workers=1, sample=0, negative=0, hashfxn=det_hash)
    # Without an explicit `pretrained` argument.
    tw = TextWiser(Embedding.Doc2Vec(**d2v_kwargs), dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)
    # With `pretrained=None`, which should behave identically.
    tw = TextWiser(Embedding.Doc2Vec(pretrained=None, **d2v_kwargs), dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)
def test_pretrained_error(self):
    """Invalid `pretrained` values should be rejected with a ValueError."""
    invalid_values = (
        3,           # not a string
        '|||||||',   # not a path
    )
    for bad_value in invalid_values:
        with self.assertRaises(ValueError):
            TextWiser(Embedding.Doc2Vec(pretrained=bad_value), dtype=torch.float32)
    # The path validation also applies on the embedding object itself.
    with self.assertRaises(ValueError):
        _Doc2VecEmbeddings(pretrained='|||||||').fit([])
def test_tokenizer_validation(self):
    """Custom tokenizers must map a document to a list of string tokens."""
    # A tokenizer that returns a list of strings should be accepted.
    try:
        TextWiser(
            Embedding.Doc2Vec(tokenizer=lambda doc: doc.lower().split()))
    except TypeError:
        self.fail("This tokenizer should pass the validation.")
    # A tokenizer that returns a plain string should be rejected (first error).
    with self.assertRaises(TypeError):
        TextWiser(Embedding.Doc2Vec(tokenizer=lambda doc: doc.lower()))
    # A tokenizer that returns non-string elements should be rejected (second error).
    with self.assertRaises(TypeError):
        TextWiser(Embedding.Doc2Vec(tokenizer=lambda doc: [1]))
def test_set_params(self):
    """`set_params` should reach arguments at every nesting level.

    Covers the scikit-learn style double-underscore addressing for:
    container classes, fitted implementation objects, schema dictionaries,
    single schema list elements, and whole schema sub-trees.
    """
    # Set the arguments in container classes
    tw = TextWiser(Embedding.TfIdf(min_df=5), Transformation.NMF(n_components=30), lazy_load=True)
    tw.set_params(embedding__min_df=10, transformations__0__n_components=10)
    self.assertEqual(tw.embedding.min_df, 10)
    self.assertEqual(tw.transformations[0].n_components, 10)
    # Set the arguments in implementation
    # (requires a fit first so that the `_imp` objects exist)
    tw = TextWiser(Embedding.Doc2Vec(vector_size=2, min_count=1, workers=1))
    tw.fit(docs)
    tw.set_params(_imp__0__seed=10)
    self.assertEqual(tw._imp[0].seed, 10)
    # Set the arguments in a schema
    schema = {'transform': ['tfidf', ['nmf', {'n_components': 30}]]}
    tw = TextWiser(Embedding.Compound(schema=schema))
    tw.set_params(embedding__schema__transform__0__min_df=10,
                  embedding__schema__transform__1__n_components=10)
    self.assertEqual(tw.embedding.schema['transform'][0][1]['min_df'], 10)
    self.assertEqual(
        tw.embedding.schema['transform'][1][1]['n_components'], 10)
    # Replace a part of the schema in a list
    tw.set_params(embedding__schema__transform__0='bow')
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
    # Replace a part of the schema
    tw.set_params(embedding__schema__transform=['bow'])
    self.assertEqual(tw.embedding.schema['transform'][0], 'bow')
def test_pretrained(self):
    """A trained Doc2Vec model should be loadable from a file object or a file path.

    First fits a deterministic model, then round-trips it through pickle
    twice: once via an in-memory temporary file and once via an on-disk
    path, asserting the loaded model reproduces the same embeddings.
    """
    tw = TextWiser(Embedding.Doc2Vec(deterministic=True, seed=1234, vector_size=2,
                                     min_count=1, workers=1, sample=0, negative=0,
                                     hashfxn=det_hash),
                   dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]],
                            dtype=torch.float32)
    self._test_fit_before_transform(tw, expected)
    # Test loading from bytes
    with NamedTemporaryFile() as file:
        pickle.dump(tw._imp[0].model, file)
        file.seek(0)
        tw = TextWiser(Embedding.Doc2Vec(pretrained=file, deterministic=True, seed=1234),
                       dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertTrue(
            torch.allclose(predicted, expected.to(device), atol=1e-6))
    # Test loading from file
    file_path = self._get_test_path('data', 'doc2vec.pkl')
    with open(file_path, 'wb') as fp:
        pickle.dump(tw._imp[0].model, fp)
    try:
        tw = TextWiser(Embedding.Doc2Vec(pretrained=file_path, deterministic=True, seed=1234),
                       dtype=torch.float32)
        predicted = tw.fit_transform(docs)
        self.assertTrue(
            torch.allclose(predicted, expected.to(device), atol=1e-6))
    finally:
        # Always remove the on-disk fixture, even when an assertion fails,
        # so a failed run does not leave an artifact that pollutes later runs.
        os.remove(file_path)
def test_fit_transform(self):
    """fit_transform with a fixed seed should reproduce the known embedding."""
    embedding = Embedding.Doc2Vec(seed=1234, vector_size=2, min_count=1,
                                  workers=1, sample=0, negative=0,
                                  hashfxn=det_hash)
    model = TextWiser(embedding, dtype=torch.float32)
    expected = torch.tensor([[0.0471987687, 0.0309393797],
                             [-0.0278387405, -0.2347375602],
                             [0.1042766869, -0.0033877781]],
                            dtype=torch.float32)
    self._test_fit_transform(model, expected)
def test_options_immutable(self):
    """The Embedding and Transformation options should be immutable"""
    options = Embedding.Doc2Vec(deterministic=False)
    # Mutating an option after construction must raise...
    with self.assertRaises(ValueError):
        options.deterministic = True
    # ...and must leave the original value untouched.
    self.assertFalse(options.deterministic)