def test_cooccurrence_vectorizer_should_load(self):
    """Deserializing a vectorizer directory should restore the language,
    word pairs, builtin entity scope and config."""
    # Given
    config = CooccurrenceVectorizerConfig()
    expected_pairs = {("a", "b"): 0, ("a", "c"): 12}
    serialized_unit = {
        "unit_name": "cooccurrence_vectorizer",
        "language_code": "en",
        # pairs are persisted as index -> [first_word, second_word]
        "word_pairs": {0: ["a", "b"], 12: ["a", "c"]},
        "builtin_entity_scope": ["snips/datetime"],
        "config": config.to_dict(),
    }
    self.tmp_file_path.mkdir()
    self.writeJsonContent(
        self.tmp_file_path / "vectorizer.json", serialized_unit)

    # When
    vectorizer = CooccurrenceVectorizer.from_path(self.tmp_file_path)

    # Then
    self.assertDictEqual(config.to_dict(), vectorizer.config.to_dict())
    self.assertEqual("en", vectorizer.language)
    self.assertDictEqual(vectorizer.word_pairs, expected_pairs)
    self.assertEqual({"snips/datetime"}, vectorizer.builtin_entity_scope)
def test_limit_vocabulary(self):
    """limit_word_pairs should re-index the kept pairs and produce the
    same feature columns as before, restricted to those pairs."""
    # Given
    config = CooccurrenceVectorizerConfig(filter_stop_words=False)
    vectorizer = CooccurrenceVectorizer(config=config)
    fit_utterances = [
        text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
    ]
    test_utterances = [text_to_utterance(t) for t in ("a c e", "a d e")]
    vectorizer.fit(fit_utterances, get_empty_dataset("en"))
    features_before = vectorizer.transform(test_utterances)

    pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
    self.assertDictEqual(pairs, vectorizer.word_pairs)

    kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
    kept_indexes = [pairs[pair] for pair in kept_pairs]

    # When
    vectorizer.limit_word_pairs(kept_pairs)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
    features_after = vectorizer.transform(test_utterances)
    # Columns of the kept pairs must be unchanged by the vocabulary cut
    self.assertListEqual(
        features_before[:, kept_indexes].todense().tolist(),
        features_after.todense().tolist())
def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
    """Feature selection should keep only the word pairs whose mocked
    chi2 p-value is below the threshold, re-indexed from 0."""
    # Given
    vectorizer_config = CooccurrenceVectorizerConfig(
        filter_stop_words=False)
    featurizer_config = FeaturizerConfig(
        added_cooccurrence_feature_ratio=0.3,
        cooccurrence_vectorizer_config=vectorizer_config)
    featurizer = Featurizer(featurizer_config)

    mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
    utterances = [
        text_to_utterance("a b c d e"),
        text_to_utterance("f g h i j"),
        text_to_utterance("none"),
    ]
    classes = [0, 0, 1]

    # The tfidf vectorizer only needs to expose 10 idf weights here,
    # which drives how many cooccurrence features get added
    mocked_vectorizer = MagicMock()
    mocked_vectorizer.idf_diag = range(10)
    featurizer.tfidf_vectorizer = mocked_vectorizer

    # Low p-values (< threshold) mark the pairs to keep
    pvalues = [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] + [1.0] * 100
    mocked_chi2.return_value = (None, pvalues)

    # When
    featurizer._fit_cooccurrence_vectorizer(
        utterances, classes, 1, mocked_dataset)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
    self.assertDictEqual(
        expected_pairs, featurizer.cooccurrence_vectorizer.word_pairs)
def test_featurizer_config(self):
    """FeaturizerConfig should round-trip through from_dict/to_dict."""
    # Given
    tfidf_vectorizer_config = TfidfVectorizerConfig()
    cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
    config_dict = {
        "unit_name": "featurizer",
        "pvalue_threshold": 0.2,
        "added_cooccurrence_feature_ratio": 0.2,
        "tfidf_vectorizer_config": tfidf_vectorizer_config.to_dict(),
        "cooccurrence_vectorizer_config":
            cooccurrence_vectorizer_config.to_dict(),
    }

    # When
    round_tripped = FeaturizerConfig.from_dict(config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_fit_transform(self, mocked_preprocess):
    """fit(...).transform(...) and fit_transform(...) must yield
    identical feature matrices on the same data."""
    # Given
    text = "a b c d e f"
    utterance = text_to_utterance(text)
    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {"start": 8, "end": 9},
        "entity_kind": "the_snips_e_entity",
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {"start": 4, "end": 5},
        "entity_kind": "the_c_entity",
    }]
    mocked_preprocess.return_value = (
        [utterance], [builtin_ents], [custom_ents])

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False,
    )
    dataset = get_empty_dataset("en")
    builtin_parser = EntityParserMock({text: builtin_ents})
    custom_parser = EntityParserMock({text: custom_ents})
    resources = {STOP_WORDS: set()}

    def make_vectorizer():
        # Two independent instances sharing the same parsers/resources
        return CooccurrenceVectorizer(
            config,
            builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser,
            resources=resources)

    vectorizer1 = make_vectorizer()
    vectorizer2 = make_vectorizer()
    data = [utterance]

    # When
    x_0 = vectorizer1.fit(data, dataset).transform(data).todense().tolist()
    x_1 = vectorizer2.fit_transform(data, dataset).todense().tolist()

    # Then
    self.assertListEqual(x_0, x_1)
def test_fit_unordered(self, mocked_preprocess):
    """With keep_order=False, fitted word pairs should be stored in a
    canonical (sorted) orientation regardless of word order."""
    # Given
    text = "a b c d e f"
    utterance = text_to_utterance(text)
    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {"start": 8, "end": 9},
        "entity_kind": "the_snips_e_entity",
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {"start": 4, "end": 5},
        "entity_kind": "the_c_entity",
    }]
    mocked_preprocess.return_value = (
        [utterance], [builtin_ents], [custom_ents])

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False,
        keep_order=False,
    )
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)

    expected_pairs = {
        ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
        ("THE_C_ENTITY", "a"): 1,
        ("THE_C_ENTITY", "d"): 2,
        ("THE_C_ENTITY", "f"): 3,
        ("THE_SNIPS_E_ENTITY", "a"): 4,
        ("THE_SNIPS_E_ENTITY", "d"): 5,
        ("THE_SNIPS_E_ENTITY", "f"): 6,
        ("a", "d"): 7,
        ("d", "f"): 8,
    }

    # When
    vectorizer = CooccurrenceVectorizer(config, **shared)
    vectorizer = vectorizer.fit([utterance], dataset)

    # Then
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
def test_cooccurrence_vectorizer_config(self):
    """CooccurrenceVectorizerConfig should round-trip through
    from_dict/to_dict without losing any field."""
    # Given
    config_dict = {
        "unit_name": "cooccurrence_vectorizer",
        "unknown_words_replacement_string": None,
        "window_size": 5,
        "filter_stop_words": True,
        "keep_order": True,
    }

    # When
    round_tripped = CooccurrenceVectorizerConfig.from_dict(
        config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_transform(self):
    """transform should count only pairs present in word_pairs, after
    entity substitution, stop-word filtering and OOV replacement."""
    # Given
    config = CooccurrenceVectorizerConfig(
        filter_stop_words=True,
        window_size=3,
        unknown_words_replacement_string="d")
    t_0 = "yo a b c d e f yo"
    t_1 = "yo a b c d e"
    utterances = [text_to_utterance(t_0), text_to_utterance(t_1)]
    # "b" is a stop word and must be ignored when pairing
    resources = {STOP_WORDS: {"b"}}

    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {"start": 11, "end": 12},
        "entity_kind": "the_snips_e_entity",
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {"start": 7, "end": 8},
        "entity_kind": "the_c_entity",
    }]
    builtin_parser = EntityParserMock(
        {t_0: builtin_ents, t_1: builtin_ents})
    custom_parser = EntityParserMock(
        {t_0: custom_ents, t_1: custom_ents})

    vectorizer = CooccurrenceVectorizer(
        config,
        builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser,
        resources=resources)
    # Bypass fit: inject the state transform reads directly
    vectorizer._language = "en"
    vectorizer._word_pairs = {
        ("THE_SNIPS_E_ENTITY", "f"): 0,
        ("a", "THE_C_ENTITY"): 1,
        ("a", "THE_SNIPS_E_ENTITY"): 2,
        ("b", "THE_SNIPS_E_ENTITY"): 3,
        ("yo", "yo"): 4,
        ("d", "THE_SNIPS_E_ENTITY"): 5,
    }

    # When
    features = vectorizer.transform(utterances)

    # Then
    expected = [
        [1, 1, 1, 0, 0, 0],
        [0, 1, 1, 0, 0, 0],
    ]
    self.assertEqual(expected, features.todense().tolist())