def test_3d_unique_samples(self):
    """Check the X and y shapes when the stream repeats a line.

    The stream holds three lines, two of them identical. The dataset is
    expected to expose X with shape (2, 3, 4) and y with shape (2, 2).
    """
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO("abb\ta\nabb\ta\nbcd\tb")
    featurizer.featurize_stream(stream)

    self.assertEqual(featurizer.dataset.X.shape, (2, 3, 4))
    self.assertEqual(featurizer.dataset.y.shape, (2, 2))
def test_different_alphabet(self):
    """Check that a custom alphabet makes "axz" and "akl" featurize alike.

    The featurizer is built with alphabet='abcd' and replace_rare=True;
    both input words only share the in-alphabet character "a".
    """
    featurizer = featurize.CharacterSequenceFeaturizer(
        3, 10, alphabet='abcd', replace_rare=True
    )
    stream = io.StringIO("axz\ta\nakl\ta")
    featurizer.featurize_stream(stream)

    first = featurizer.dataset.samples[0]
    second = featurizer.dataset.samples[1]
    self.assertEqual(first.features, second.features)
def test_3d_simple(self):
    """Check the X and y shapes for a single-line stream.

    One input line ("abb" labelled "a") is expected to give X the shape
    (1, 3, 2) and y the shape (1, 1).
    """
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO("abb\ta")
    featurizer.featurize_stream(stream)

    self.assertEqual(featurizer.dataset.X.shape, (1, 3, 2))
    self.assertEqual(featurizer.dataset.y.shape, (1, 1))
def test_replace_rare_char(self):
    """Check that "aデ" and "ax" featurize alike when rare_char='x'."""
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10, rare_char='x')
    stream = io.StringIO("aデ\ta\nax\ta")
    featurizer.featurize_stream(stream)

    with_rare = featurizer.dataset.samples[0]
    with_placeholder = featurizer.dataset.samples[1]
    self.assertEqual(with_rare.features, with_placeholder.features)
def test_replace_punct(self):
    """Check that words differing only in punctuation featurize alike.

    The two inputs are "a!?" and "a#'"; their features must be equal.
    """
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO("a!?\ta\na#'\ta")
    featurizer.featurize_stream(stream)

    first = featurizer.dataset.samples[0]
    second = featurizer.dataset.samples[1]
    self.assertEqual(first.features, second.features)
def test_lower(self):
    """Check that "AbCd" and "abCD" produce equal features."""
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO("AbCd\ta\nabCD\ta")
    featurizer.featurize_stream(stream)

    first = featurizer.dataset.samples[0]
    second = featurizer.dataset.samples[1]
    self.assertEqual(first.features, second.features)
def test_skip_rare(self):
    """Check that "aßbc" and "abc" featurize alike with replace_rare=False."""
    featurizer = featurize.CharacterSequenceFeaturizer(
        3, 10, replace_rare=False
    )
    stream = io.StringIO("aßbc\ta\nabc\ta")
    featurizer.featurize_stream(stream)

    # Each pop() removes one sample from the dataset; two pops take both.
    popped_first = featurizer.dataset.samples.pop()
    popped_second = featurizer.dataset.samples.pop()
    self.assertEqual(popped_first.features, popped_second.features)
def test_feature_extraction_several_lines(self):
    """Check that the dataset length matches the input's line count.

    The expected count is the number of newline-separated lines in
    ``input_simple`` after stripping surrounding whitespace.
    """
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO(input_simple)
    featurizer.featurize_stream(stream)

    expected_count = len(input_simple.strip().split('\n'))
    self.assertEqual(len(featurizer.dataset), expected_count)
def test_feature_extraction_short_word(self):
    """Check the features produced for the two-character word "ab".

    The expected feature list has three entries, the first being a space;
    presumably the word is left-padded to the featurizer's first
    constructor argument (3) -- confirm against the implementation.
    """
    featurizer = featurize.CharacterSequenceFeaturizer(3, 10)
    stream = io.StringIO("ab\tdef")
    featurizer.featurize_stream(stream)

    sample = featurizer.dataset.samples.pop()
    expected_features = [{'ch': ' '}, {'ch': 'a'}, {'ch': 'b'}]
    self.assertEqual(sample.features, expected_features)
def test_init(self):
    """Check the type and dataset limit of a freshly built featurizer.

    Built with arguments (5, 10), the object must be a
    ``featurize.Featurizer`` and its dataset must report
    ``max_sample_per_class == 10``.
    """
    featurizer = featurize.CharacterSequenceFeaturizer(5, 10)

    self.assertIsInstance(featurizer, featurize.Featurizer)
    per_class_limit = featurizer.dataset.max_sample_per_class
    self.assertEqual(per_class_limit, 10)