def test_limit_vocabulary(self):
    """Check that ``limit_word_pairs`` keeps only the requested pairs.

    A vectorizer is fitted on four two-word utterances and then restricted
    to three of the four fitted pairs. The output of ``transform`` after the
    restriction must equal the matching columns of the output computed
    before it.
    """
    # Given
    vectorizer = CooccurrenceVectorizer(
        config=CooccurrenceVectorizerConfig(filter_stop_words=False))
    fit_utterances = [
        text_to_utterance(text) for text in ("a b", "a c", "a d", "a e")
    ]
    utterances = [text_to_utterance(text) for text in ("a c e", "a d e")]
    vectorizer.fit(fit_utterances, get_empty_dataset("en"))
    features_before = vectorizer.transform(utterances)

    all_pairs = {
        ("a", "b"): 0,
        ("a", "c"): 1,
        ("a", "d"): 2,
        ("a", "e"): 3,
    }
    pairs_to_keep = [("a", "b"), ("a", "c"), ("a", "d")]
    self.assertDictEqual(all_pairs, vectorizer.word_pairs)

    # When
    # Column positions are read from the pre-limit mapping so they can be
    # used to slice the features computed before the restriction.
    kept_columns = [all_pairs[pair] for pair in pairs_to_keep]
    vectorizer.limit_word_pairs(pairs_to_keep)

    # Then
    remaining_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
    self.assertDictEqual(remaining_pairs, vectorizer.word_pairs)

    features_after = vectorizer.transform(utterances)
    expected_rows = features_before[:, kept_columns].todense().tolist()
    actual_rows = features_after.todense().tolist()
    self.assertListEqual(expected_rows, actual_rows)
def test_transform(self):
    """Check the co-occurrence features produced by ``transform``.

    The vectorizer is configured by hand (language and word pairs are set
    directly instead of being fitted) with mocked builtin and custom entity
    parsers and a stop-word resource. Two utterances are transformed and
    the dense result is compared with the expected indicator rows.
    """

    def make_entity(value, start, end, kind):
        # Builds a parser result in the same dict layout for both parsers.
        return {
            "value": value,
            "resolved_value": value,
            "range": {
                "start": start,
                "end": end
            },
            "entity_kind": kind
        }

    # Given
    config = CooccurrenceVectorizerConfig(
        filter_stop_words=True,
        window_size=3,
        unknown_words_replacement_string="d")

    texts = ("yo a b c d e f yo", "yo a b c d e")
    utterances = [text_to_utterance(text) for text in texts]

    resources = {STOP_WORDS: {"b"}}

    # "e" sits at characters 11-12 and "c" at 7-8 in both texts, so the
    # same entity lists are served for either input.
    builtin_entities = [make_entity("e", 11, 12, "the_snips_e_entity")]
    custom_entities = [make_entity("c", 7, 8, "the_c_entity")]

    builtin_parser = EntityParserMock(
        {text: builtin_entities for text in texts})
    custom_parser = EntityParserMock(
        {text: custom_entities for text in texts})

    vectorizer = CooccurrenceVectorizer(
        config,
        builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser,
        resources=resources)

    # Bypass fitting: set the state that transform relies on directly.
    vectorizer._language = "en"
    vectorizer._word_pairs = {
        ("THE_SNIPS_E_ENTITY", "f"): 0,
        ("a", "THE_C_ENTITY"): 1,
        ("a", "THE_SNIPS_E_ENTITY"): 2,
        ("b", "THE_SNIPS_E_ENTITY"): 3,
        ("yo", "yo"): 4,
        ("d", "THE_SNIPS_E_ENTITY"): 5
    }

    # When
    features = vectorizer.transform(utterances)

    # Then
    # One row per utterance, one column per word-pair index above.
    expected_rows = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]]
    self.assertEqual(expected_rows, features.todense().tolist())