def test_word_tokenizer(self):
    """Tokenizing raw text splits it into words, lower-casing when requested.

    NOTE(review): an identical ``test_word_tokenizer`` is defined again later
    in this file; Python keeps only the later definition, so this copy is
    silently dead code (flake8 F811). Recommend deleting one of the two.
    """
    raw = "\nCall me Ishmael. Some years ago--never mind how long precisely "
    lowered = WordTokenizer(True)(raw)
    self.assertEqual(
        ["call", "me", "ishmael", "some", "years", "ago",
         "never", "mind", "how", "long", "precisely"],
        lowered)
    cased_tokenizer = WordTokenizer(False)
    cased = cased_tokenizer(raw)
    self.assertEqual(
        ["Call", "me", "Ishmael", "Some", "years", "ago",
         "never", "mind", "how", "long", "precisely"],
        cased)
    self.assertEqual("The man ran",
                     cased_tokenizer.join_tokens(["The", "man", "ran"]))
def test_invalid_partition(self):
    """Requesting a partition name that is not a corpus key raises ValueError."""
    corpus = {
        "train": ["red red red", "blue blue green"],
        "validate": ["red blue blue orange"],
        "test": ["green green red black"]
    }
    self.assertRaises(
        ValueError,
        PartitionedData.from_text,
        corpus,
        ["bogus"],
        Vocabulary.factory(WordTokenizer(True)))
def create_partitioned_data():
    """Build a small train/test/validate PartitionedData fixture.

    The vocabulary is derived from the "train" partition only.
    """
    corpus = {
        "train": ["blue blue green", "red red red"],
        "test": ["green green red black"],
        "validate": ["red blue blue orange"]
    }
    vocabulary_factory = Vocabulary.factory(WordTokenizer(True))
    return PartitionedData.from_text(corpus, ["train"], vocabulary_factory)
def test_limited_vocabulary(self):
    """max_vocabulary and min_frequency each restrict which types are indexed."""
    capped = Vocabulary(["to be or not to be"], WordTokenizer(True),
                        max_vocabulary=2)
    self.assertEqual({"to", "be"}, set(capped.type_to_index.keys()))
    self.assertEqual(2, len(capped))
    frequent = Vocabulary(["hamlet hamlet hamlet to be or not to be"],
                          WordTokenizer(True), min_frequency=2)
    self.assertEqual({"to", "be", "hamlet"},
                     set(frequent.type_to_index.keys()))
    self.assertEqual(3, len(frequent))
    # Both limits applied together: the frequency cut runs and the size cap
    # then keeps only two of the surviving types.
    both = Vocabulary(["hamlet hamlet hamlet to be or not to be"],
                      WordTokenizer(True), max_vocabulary=2, min_frequency=2)
    self.assertEqual({"be", "hamlet"}, set(both.type_to_index.keys()))
    self.assertEqual(2, len(both))
def test_full_vocabulary(self):
    """An unrestricted vocabulary indexes every distinct token of the corpus."""
    v = Vocabulary(["the quick brown fox jumped over the lazy dog"],
                   WordTokenizer(True))
    self.assertEqual(
        "Vocabulary, size 8: None:1 the:2 brown:3 dog:4 fox:5 ...", str(v))
    expected_types = {"the", "quick", "brown", "fox", "jumped", "over",
                      "lazy", "dog"}
    self.assertEqual(expected_types, set(v.type_to_index.keys()))
    self.assertEqual(8, len(v))
def test_tokenizer_serialization(self):
    """A round-tripped tokenizer compares equal and tokenizes identically."""
    path = os.path.join(self.directory, "tokenizer.pkl")
    original = WordTokenizer(True)
    self._serialize(path, original)
    self.assertTrue(os.path.isfile(path))
    restored = self._deserialize(path)
    self.assertEqual(original, restored)
    sample = "The quick brown fox"
    self.assertEqual(original(sample), restored(sample))
def test_out_of_vocabulary(self):
    """Unknown types map to the designated out-of-vocabulary index."""
    vocabulary = Vocabulary(["a a a b b OOV c"], WordTokenizer(True),
                            out_of_vocabulary="OOV")
    self.assertEqual(1, vocabulary.index("OOV"))
    # "z" never appeared in the corpus, so it collapses to the OOV index.
    self.assertEqual(1, vocabulary.index("z"))
    self.assertEqual(2, vocabulary.index("a"))
    self.assertEqual(3, vocabulary.index("b"))
    self.assertEqual(4, vocabulary.index("c"))
def test_word_tokenizer(self):
    """WordTokenizer splits raw text on punctuation/whitespace, optionally lower-casing."""
    melville = "\nCall me Ishmael. Some years ago--never mind how long precisely "
    self.assertEqual(
        ["call", "me", "ishmael", "some", "years", "ago",
         "never", "mind", "how", "long", "precisely"],
        WordTokenizer(True)(melville))
    preserving = WordTokenizer(False)
    self.assertEqual(
        ["Call", "me", "Ishmael", "Some", "years", "ago",
         "never", "mind", "how", "long", "precisely"],
        preserving(melville))
    self.assertEqual("The man ran",
                     preserving.join_tokens(["The", "man", "ran"]))
def test_vocabulary_serialization(self):
    """A round-tripped vocabulary compares equal and indexes text identically."""
    path = os.path.join(self.directory, "vocabulary.pkl")
    original = Vocabulary(["the quick brown fox jumped over the lazy dog"],
                          WordTokenizer(True))
    self._serialize(path, original)
    self.assertTrue(os.path.isfile(path))
    restored = self._deserialize(path)
    self.assertEqual(original, restored)
    sample = "The quick black fox"
    np.testing.assert_equal(original.index_string(sample),
                            restored.index_string(sample))
def test_equality(self):
    """Tokenizer equality depends on both the class and the case-handling flag."""
    self.assertEqual(WordTokenizer(True), WordTokenizer(True))
    self.assertNotEqual(WordTokenizer(True), WordTokenizer(False))
    # Different tokenizer classes are never equal, even with the same flag.
    self.assertNotEqual(WordTokenizer(True), CharacterTokenizer(True))
def test_invalid_index(self):
    """Vocabulary.type rejects index 0 and negative indexes with ValueError."""
    vocabulary = Vocabulary(["the quick brown fox jumped over the lazy dog"],
                            WordTokenizer(True))
    for bad_index in (0, -1):
        self.assertRaises(ValueError, vocabulary.type, bad_index)
def test_index_tokens(self):
    """index_string maps each token of a document to its vocabulary index."""
    document = "the quick brown fox jumped over the lazy dog"
    vocabulary = Vocabulary([document], WordTokenizer(True))
    expected = np.array([2, 9, 3, 5, 6, 8, 2, 7, 4])
    np.testing.assert_equal(expected, vocabulary.index_string(document))
def test_vocabulary_factory(self):
    """Applying a factory to documents matches direct Vocabulary construction."""
    documents = ["to be or not to be"]
    factory = Vocabulary.factory(WordTokenizer(True), max_vocabulary=2)
    direct = Vocabulary(documents, WordTokenizer(True), max_vocabulary=2)
    self.assertEqual(direct, factory(documents))