Пример #1
0
 def test_3d_unique_samples(self):
     """Check the X and y shapes when the input repeats a line.

     Three lines are fed in, two of them identical; the test expects
     two rows in both ``X`` and ``y``.
     """
     stream = io.StringIO("abb\ta\nabb\ta\nbcd\tb")
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(stream)
     self.assertEqual(featurizer.dataset.X.shape, (2, 3, 4))
     self.assertEqual(featurizer.dataset.y.shape, (2, 2))
Пример #2
0
 def test_different_alphabet(self):
     """Check that two words match when only their off-alphabet chars differ.

     With ``alphabet='abcd'`` and ``replace_rare=True`` the inputs "axz"
     and "akl" are expected to yield identical features.
     """
     featurizer = featurize.CharacterSequenceFeaturizer(
         3, 10, alphabet='abcd', replace_rare=True)
     featurizer.featurize_stream(io.StringIO("axz\ta\nakl\ta"))
     first = featurizer.dataset.samples[0]
     second = featurizer.dataset.samples[1]
     self.assertEqual(first.features, second.features)
Пример #3
0
 def test_3d_simple(self):
     """Check the X and y shapes for a single input line ("abb", label "a")."""
     stream = io.StringIO("abb\ta")
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(stream)
     self.assertEqual(featurizer.dataset.X.shape, (1, 3, 2))
     self.assertEqual(featurizer.dataset.y.shape, (1, 1))
Пример #4
0
 def test_replace_rare_char(self):
     """Check that a word with a rare char matches one spelled with ``rare_char``.

     With ``rare_char='x'`` the inputs "aデ" and "ax" are expected to
     yield identical features (presumably 'デ' is replaced by 'x').
     """
     stream = io.StringIO("aデ\ta\nax\ta")
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10, rare_char='x')
     featurizer.featurize_stream(stream)
     with_rare = featurizer.dataset.samples[0]
     with_placeholder = featurizer.dataset.samples[1]
     self.assertEqual(with_rare.features, with_placeholder.features)
Пример #5
0
 def test_replace_punct(self):
     """Check that words differing only in punctuation get equal features.

     The inputs "a!?" and "a#'" are expected to produce the same
     features (presumably punctuation maps to one shared symbol).
     """
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(io.StringIO("a!?\ta\na#'\ta"))
     first = featurizer.dataset.samples[0]
     second = featurizer.dataset.samples[1]
     self.assertEqual(first.features, second.features)
Пример #6
0
 def test_lower(self):
     """Check that words differing only in letter case get equal features.

     "AbCd" and "abCD" are expected to produce the same features.
     """
     stream = io.StringIO("AbCd\ta\nabCD\ta")
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(stream)
     first = featurizer.dataset.samples[0]
     second = featurizer.dataset.samples[1]
     self.assertEqual(first.features, second.features)
Пример #7
0
 def test_skip_rare(self):
     """Check the features of "aßbc" and "abc" with ``replace_rare=False``.

     Both samples are expected to carry the same features (presumably
     the rare 'ß' is dropped rather than replaced).
     """
     featurizer = featurize.CharacterSequenceFeaturizer(
         3, 10, replace_rare=False)
     featurizer.featurize_stream(io.StringIO("aßbc\ta\nabc\ta"))
     # Two pops in a row: take the last sample, then the one before it.
     last = featurizer.dataset.samples.pop()
     previous = featurizer.dataset.samples.pop()
     self.assertEqual(last.features, previous.features)
Пример #8
0
 def test_feature_extraction_several_lines(self):
     """Check that the dataset length equals the line count of ``input_simple``."""
     # Surrounding whitespace is stripped before the lines are counted.
     expected_count = len(input_simple.strip().split('\n'))
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(io.StringIO(input_simple))
     self.assertEqual(len(featurizer.dataset), expected_count)
Пример #9
0
 def test_feature_extraction_short_word(self):
     """Check the features produced for the two-character word "ab".

     The featurizer is built with a first argument of 3; the test
     expects three per-character dicts, the first one holding a space.
     """
     expected_features = [{'ch': ' '}, {'ch': 'a'}, {'ch': 'b'}]
     featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
     featurizer.featurize_stream(io.StringIO("ab\tdef"))
     sample = featurizer.dataset.samples.pop()
     self.assertEqual(sample.features, expected_features)
Пример #10
0
 def test_init(self):
     """Check the type of a new featurizer and its dataset's per-class limit."""
     max_per_class = 10
     featurizer = featurize.CharacterSequenceFeaturizer(5, max_per_class)
     self.assertIsInstance(featurizer, featurize.Featurizer)
     self.assertEqual(featurizer.dataset.max_sample_per_class, max_per_class)