예제 #1
0
 def test_no_padding_bagof(self):
     """Bag-of mode without padding maps each distinct bigram to True.

     The input word "abcab" contains the bigram "ab" twice; the expected
     key set lists it once, together with "bc" and "ca".
     """
     featurizer = featurize.NGramFeaturizer(
         2, 5, max_sample_per_class=2, use_padding=False, bagof=True)
     stream = io.StringIO("abcab\tdef")
     featurizer.featurize_stream(stream)
     first_sample = featurizer.dataset.samples[0]
     feats = first_sample.features
     expected_keys = {'ab', 'bc', 'ca'}
     self.assertEqual(set(feats.keys()), expected_keys)
     # Every feature value must be the boolean flag True.
     self.assertEqual(set(feats.values()), {True})
예제 #2
0
 def test_last_char_with_padding(self):
     """Positional features with padding cover only the end of the word.

     For the input "abcdef" the expected values are the bigrams of "def"
     plus one space-padded bigram on each side, keyed '2.0' .. '2.3'.
     """
     # NOTE(review): the second positional argument (3) presumably limits
     # featurization to the last three characters -- confirm against
     # NGramFeaturizer's signature.
     featurizer = featurize.NGramFeaturizer(
         2, 3, max_sample_per_class=2, use_padding=True, bagof=False)
     stream = io.StringIO("abcdef\tdef")
     featurizer.featurize_stream(stream)
     first_sample = featurizer.dataset.samples[0]
     feats = first_sample.features
     expected_keys = {'2.0', '2.1', '2.2', '2.3'}
     expected_values = {'de', 'ef', ' d', 'f '}
     self.assertEqual(set(feats.keys()), expected_keys)
     self.assertEqual(set(feats.values()), expected_values)
예제 #3
0
 def test_no_padding_positional(self):
     """Without padding, "abc" yields the bigrams 'ab' and 'bc'.

     The features are expected under the keys '2.0' and '2.1'; the
     ``bagof`` argument is left at its default here.
     """
     featurizer = featurize.NGramFeaturizer(
         2, 3, max_sample_per_class=2, use_padding=False)
     stream = io.StringIO("abc\tdef")
     featurizer.featurize_stream(stream)
     first_sample = featurizer.dataset.samples[0]
     feats = first_sample.features
     expected_values = {'ab', 'bc'}
     expected_keys = {'2.0', '2.1'}
     # Values are checked before keys, as in the original test order.
     self.assertEqual(set(feats.values()), expected_values)
     self.assertEqual(set(feats.keys()), expected_keys)
예제 #4
0
 def test_2d_nonunique_samples(self):
     """With skip_duplicates=False a repeated input line is kept.

     Three lines are fed in (the first and third are identical); both
     ``X`` and ``y`` are expected to have three rows.
     """
     featurizer = featurize.NGramFeaturizer(
         1, 3, max_sample_per_class=2,
         skip_duplicates=False, use_padding=False)
     stream = io.StringIO("abc\tdef\nabd\t12\nabc\tdef")
     featurizer.featurize_stream(stream)
     feature_matrix = featurizer.dataset.X
     self.assertEqual(feature_matrix.shape, (3, 4))
     label_matrix = featurizer.dataset.y
     self.assertEqual(label_matrix.shape, (3, 2))
예제 #5
0
 def test_2d_unique_samples2(self):
     """Two distinct input lines give two-row ``X`` and ``y`` matrices.

     ``X`` is expected to have shape (2, 4) and ``y`` shape (2, 2).
     """
     featurizer = featurize.NGramFeaturizer(
         1, 3, max_sample_per_class=2, use_padding=False)
     stream = io.StringIO("abc\tdef\nabd\t12")
     featurizer.featurize_stream(stream)
     feature_matrix = featurizer.dataset.X
     self.assertEqual(feature_matrix.shape, (2, 4))
     label_matrix = featurizer.dataset.y
     self.assertEqual(label_matrix.shape, (2, 2))