def test_limit_vocabulary(self):
    # Given
    vectorizer = TfidfVectorizer()
    dataset = get_empty_dataset("en")
    utterances = [
        text_to_utterance("5 55 6 66 666"),
        text_to_utterance("55 66")
    ]

    voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
    kept_unigrams = ["5", "6", "666"]
    vectorizer.fit(utterances, dataset)
    self.assertDictEqual(voca, vectorizer.vocabulary)
    diag = vectorizer.idf_diag.copy()

    # When
    vectorizer.limit_vocabulary(kept_unigrams)

    # Then
    expected_voca = {"5": 0, "6": 1, "666": 2}
    self.assertDictEqual(expected_voca, vectorizer.vocabulary)

    expected_diag = diag[[voca[u] for u in kept_unigrams]].tolist()
    self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())
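
# Illustrative sketch (not part of the test suite): the behaviour exercised
# above amounts to re-indexing the kept unigrams and slicing the idf diagonal
# to match. This assumes a scikit-learn-style vocabulary dict and a numpy idf
# vector; the helper name and signature are hypothetical, not the library's
# internals.
def limit_vocabulary_sketch(vocabulary, idf, kept_unigrams):
    import numpy as np

    unknown = set(kept_unigrams) - set(vocabulary)
    if unknown:
        # Mirrors the ValueError checked in the next test
        raise ValueError("Unknown unigrams: %s" % sorted(unknown))
    # Kept unigrams get fresh, contiguous indexes
    new_vocabulary = {u: i for i, u in enumerate(kept_unigrams)}
    # The idf weights of the kept unigrams are preserved, in the new order
    new_idf = np.asarray(idf)[[vocabulary[u] for u in kept_unigrams]]
    return new_vocabulary, new_idf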
def test_limit_vocabulary_should_raise(self):
    # Given
    vectorizer = TfidfVectorizer()
    dataset = {"language": "en", "entities": dict(), "intents": dict()}
    utterances = [text_to_utterance("5 55 6 66 666")]
    vectorizer.fit(utterances, dataset)

    # When / Then
    # These unigrams are not in the fitted vocabulary
    kept_unigrams = ["7", "8"]
    with self.assertRaises(ValueError):
        vectorizer.limit_vocabulary(kept_unigrams)
def test_training_should_be_reproducible(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    x = [text_to_utterance("please make me two hots cups of tea")]
    shared = self.get_shared_data(dataset)
    shared["random_state"] = 42

    # When
    vectorizer1 = TfidfVectorizer(**shared)
    vectorizer1.fit(x, dataset)

    vectorizer2 = TfidfVectorizer(**shared)
    vectorizer2.fit(x, dataset)

    # Then
    with temp_dir() as tmp_dir:
        dir_vectorizer1 = tmp_dir / "vectorizer1"
        dir_vectorizer2 = tmp_dir / "vectorizer2"
        vectorizer1.persist(dir_vectorizer1)
        vectorizer2.persist(dir_vectorizer2)
        hash1 = dirhash(str(dir_vectorizer1), 'sha256')
        hash2 = dirhash(str(dir_vectorizer2), 'sha256')
        self.assertEqual(hash1, hash2)
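
# The reproducibility check above relies on byte-identical persisted
# artifacts. A minimal sketch of the same pattern in isolation; the
# fit_and_persist callable is a hypothetical stand-in for any training step
# seeded with a fixed random_state.
def assert_reproducible_sketch(fit_and_persist):
    import tempfile
    from pathlib import Path

    from dirhash import dirhash

    with tempfile.TemporaryDirectory() as tmp:
        dir1, dir2 = Path(tmp) / "run1", Path(tmp) / "run2"
        fit_and_persist(dir1)
        fit_and_persist(dir2)
        # Two runs with the same seed must produce identical directory hashes
        assert dirhash(str(dir1), "sha256") == dirhash(str(dir2), "sha256")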
def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
        text_to_utterance("Bird birdy"),
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language
    vectorizer.builtin_entity_scope = {"snips/number"}

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {"data": [{"text": "hello world entity_2"}]}
    u_1 = {"data": [{"text": "beauty world ent 1"}]}
    u_2 = {"data": [{"text": "bird bird"}]}
    u_3 = {"data": [{"text": "bird bird"}]}

    ent_0 = {
        "entity_kind": "entity_2",
        "value": "entity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "range": {
            "start": 23,
            "end": 24
        },
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        },
    }

    expected_data = [
        (u_0, [num_0], [ent_0], []),
        (u_1, [num_1], [ent_11, ent_12], ["cluster_1", "cluster_3"]),
        (u_2, [], [], []),
        (u_3, [], [], ["cluster_2"])
    ]

    self.assertSequenceEqual(expected_data, processed_data)
def test_enrich_utterance(self):
    # Given
    utterances = [
        {
            "data": [
                {
                    "text": "one",
                    "entity": "snips/number"
                },
                {
                    "text": "beauty world",
                },
                {
                    "text": "ent 1",
                    "entity": "dummy_entity_1"
                },
            ]
        },
        text_to_utterance("one beauty world ent 1"),
        text_to_utterance("hello world entity_2"),
        text_to_utterance("bird bird"),
    ]

    builtin_ents = [
        [
            {
                "value": "one",
                "resolved_value": 1,
                "range": {
                    "start": 0,
                    "end": 3
                },
                "entity_kind": "snips/number"
            }
        ],
        [
            {
                "value": "one",
                "resolved_value": 1,
                "range": {
                    "start": 0,
                    "end": 3
                },
                "entity_kind": "snips/number"
            },
            {
                "value": "1",
                "resolved_value": 1,
                "range": {
                    "start": 27,
                    "end": 28
                },
                "entity_kind": "snips/number"
            }
        ],
        [
            {
                "value": "2",
                "resolved_value": 2,
                "range": {
                    "start": 19,
                    "end": 20
                },
                "entity_kind": "snips/number"
            }
        ],
        []
    ]

    custom_ents = [
        [
            {
                "value": "ent 1",
                "resolved_value": "entity 1",
                "range": {
                    "start": 20,
                    "end": 28
                },
                "entity_kind": "dummy_entity_1"
            }
        ],
        [
            {
                "value": "ent 1",
                "resolved_value": "entity 1",
                "range": {
                    "start": 20,
                    "end": 28
                },
                "entity_kind": "dummy_entity_1"
            }
        ],
        [
            {
                "value": "entity_2",
                "resolved_value": "Éntity_2",
                "range": {
                    "start": 12,
                    "end": 20
                },
                "entity_kind": "dummy_entity_2"
            }
        ],
        []
    ]

    w_clusters = [
        ["111", "112"],
        ["111", "112"],
        [],
        []
    ]

    vectorizer = TfidfVectorizer()
    vectorizer._language = "en"

    # When
    enriched_utterances = [
        vectorizer._enrich_utterance(*data)
        for data in zip(utterances, builtin_ents, custom_ents, w_clusters)
    ]

    # Then
    expected_u0 = "beauty world ent 1 " \
                  "builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_1 111 112"
    expected_u1 = "one beauty world ent 1 " \
                  "builtinentityfeaturesnipsnumber " \
                  "builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_1 111 112"
    expected_u2 = "hello world entity_2 builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_2"
    expected_u3 = "bird bird"

    expected_utterances = [
        expected_u0,
        expected_u1,
        expected_u2,
        expected_u3
    ]

    self.assertEqual(expected_utterances, enriched_utterances)
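
# The placeholder tokens asserted above follow a simple naming scheme: each
# entity match is appended as a single collision-free pseudo-word so tf-idf
# can weight it like any other term. The helpers below are a rough
# reconstruction inferred from the expected strings in this test, not the
# library's actual feature-name functions.
def builtin_entity_feature_sketch(entity_kind):
    # "snips/number" -> "builtinentityfeaturesnipsnumber"
    return "builtinentityfeature" + "".join(
        c for c in entity_kind.lower() if c.isalnum())

def custom_entity_feature_sketch(entity_kind):
    # "dummy_entity_1" -> "entityfeaturedummy_entity_1"
    return "entityfeature" + entity_kind.lower()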
def test_preprocess_for_training(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    utterances = [
        {
            "data": [
                {"text": "hÉllo wOrld "},
                {"text": " yo "},
                {"text": " yo "},
                {"text": "yo "},
                {"text": "Éntity_2", "entity": "entity_2"},
                {"text": " "},
                {"text": "Éntity_2", "entity": "entity_2"}
            ]
        },
        {
            "data": [
                {"text": "beauTiful World "},
                {"text": "entity 1", "entity": "entity_1"},
                {"text": " "},
                {"text": "2", "entity": "snips/number"}
            ]
        },
        {
            "data": [{"text": "Bird bïrdy"}]
        },
        {
            "data": [{"text": "Bird birdy"}]
        }
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances, training=True)
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {
        "data": [
            {"text": "hello world"},
            {"text": "yo"},
            {"text": "yo"},
            {"text": "yo"},
            {"text": "entity_2", "entity": "entity_2"},
            {"text": ""},
            {"text": "entity_2", "entity": "entity_2"}
        ]
    }
    u_1 = {
        "data": [
            {"text": "beauty world"},
            {"text": "ent 1", "entity": "entity_1"},
            {"text": ""},
            {"text": "2", "entity": "snips/number"}
        ]
    }
    u_2 = {"data": [{"text": "bird bird"}]}

    ent_00 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 23,
            "end": 31
        }
    }
    ent_01 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 32,
            "end": 40
        }
    }
    ent_1 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "2",
        "range": {
            "start": 25,
            "end": 26
        }
    }

    expected_data = [
        (u_0, [], [ent_00, ent_01], []),
        (u_1, [num_1], [ent_1], ["cluster_1", "cluster_3"]),
        (u_2, [], [], []),
        (u_2, [], [], ["cluster_2"])
    ]

    self.assertSequenceEqual(expected_data, processed_data)
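
# Taken together, the two _preprocess tests pin down the per-utterance
# pipeline: normalize and stem each chunk's text, run the builtin and custom
# entity parsers, and collect word-cluster features. Below is a rough
# standalone sketch of the stemming and cluster-lookup steps, consistent with
# the expectations above (lowercasing without accent stripping for cluster
# lookup, and omitting the accent normalization the real pipeline also
# applies to utterance text); the function names are illustrative.
def stem_text_sketch(text, stems):
    # "beauTiful World" -> "beauty world" with STEMS {"beautiful": "beauty"}
    return " ".join(stems.get(t, t) for t in text.lower().split())

def word_cluster_features_sketch(text, clusters):
    # "Bird birdy" -> ["cluster_2"]; the accented "bïrdy" finds no cluster
    return [clusters[t] for t in text.lower().split() if t in clusters]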