def test_no_padding_bagof(self):
    """Check the first sample's features with ``bagof=True`` and no padding.

    For the input line ``abcab<TAB>def`` the feature keys are expected to
    be the distinct two-character substrings ``ab``, ``bc`` and ``ca``,
    and every feature value is expected to be ``True``.
    """
    # NOTE(review): the positional arguments (2, 5) are presumably the
    # n-gram length and the number of trailing characters considered --
    # confirm against NGramFeaturizer's signature.
    featurizer = featurize.NGramFeaturizer(
        2,
        5,
        max_sample_per_class=2,
        use_padding=False,
        bagof=True,
    )
    stream = io.StringIO("abcab\tdef")
    featurizer.featurize_stream(stream)

    first_sample = featurizer.dataset.samples[0]
    feats = first_sample.features

    expected_keys = {'ab', 'bc', 'ca'}
    self.assertEqual(set(feats.keys()), expected_keys)

    expected_values = {True}
    self.assertEqual(set(feats.values()), expected_values)
def test_last_char_with_padding(self):
    """Check positional features of the first sample with padding enabled.

    For the input line ``abcdef<TAB>def`` the feature keys are expected to
    be ``2.0`` .. ``2.3`` and the values the bigrams of ``def`` plus one
    space-padded bigram on each side.
    """
    # NOTE(review): the positional arguments (2, 3) are presumably the
    # n-gram length and the number of trailing characters considered --
    # confirm against NGramFeaturizer's signature.
    featurizer = featurize.NGramFeaturizer(
        2,
        3,
        max_sample_per_class=2,
        use_padding=True,
        bagof=False,
    )
    stream = io.StringIO("abcdef\tdef")
    featurizer.featurize_stream(stream)

    first_sample = featurizer.dataset.samples[0]
    feats = first_sample.features

    expected_keys = {'2.0', '2.1', '2.2', '2.3'}
    self.assertEqual(set(feats.keys()), expected_keys)

    expected_values = {'de', 'ef', ' d', 'f '}
    self.assertEqual(set(feats.values()), expected_values)
def test_no_padding_positional(self):
    """Check positional features of the first sample without padding.

    For the input line ``abc<TAB>def`` the feature values are expected to
    be the bigrams ``ab`` and ``bc``, stored under the keys ``2.0`` and
    ``2.1``.
    """
    # NOTE(review): ``bagof`` is left at its default here; the expected
    # keys suggest that default yields positional features -- confirm.
    featurizer = featurize.NGramFeaturizer(
        2,
        3,
        max_sample_per_class=2,
        use_padding=False,
    )
    stream = io.StringIO("abc\tdef")
    featurizer.featurize_stream(stream)

    first_sample = featurizer.dataset.samples[0]
    feats = first_sample.features

    # Values are checked before keys, as in the original test.
    expected_values = {'ab', 'bc'}
    self.assertEqual(set(feats.values()), expected_values)

    expected_keys = {'2.0', '2.1'}
    self.assertEqual(set(feats.keys()), expected_keys)
def test_2d_nonunique_samples(self):
    """Check matrix shapes when a repeated input line is not skipped.

    Three lines are fed in, the third repeating the first. With
    ``skip_duplicates=False`` the dataset's ``X`` is expected to have
    shape ``(3, 4)`` and its ``y`` shape ``(3, 2)``.
    """
    featurizer = featurize.NGramFeaturizer(
        1,
        3,
        max_sample_per_class=2,
        skip_duplicates=False,
        use_padding=False,
    )
    stream = io.StringIO("abc\tdef\nabd\t12\nabc\tdef")
    featurizer.featurize_stream(stream)

    dataset = featurizer.dataset

    feature_matrix = dataset.X
    self.assertEqual(feature_matrix.shape, (3, 4))

    label_matrix = dataset.y
    self.assertEqual(label_matrix.shape, (3, 2))
def test_2d_unique_samples2(self):
    """Check matrix shapes for two distinct input lines.

    After featurizing ``abc<TAB>def`` and ``abd<TAB>12`` the dataset's
    ``X`` is expected to have shape ``(2, 4)`` and its ``y`` shape
    ``(2, 2)``.
    """
    featurizer = featurize.NGramFeaturizer(
        1,
        3,
        max_sample_per_class=2,
        use_padding=False,
    )
    stream = io.StringIO("abc\tdef\nabd\t12")
    featurizer.featurize_stream(stream)

    dataset = featurizer.dataset

    feature_matrix = dataset.X
    self.assertEqual(feature_matrix.shape, (2, 4))

    label_matrix = dataset.y
    self.assertEqual(label_matrix.shape, (2, 2))