def test_ngram_factory_with_gazetteer(self, mock_get_gazetteer):
    """The bigram feature should replace words missing from the
    common-words gazetteer with the rare-word placeholder."""
    # Given
    factory_config = {
        "factory_name": "ngram",
        "args": {
            "n": 2,
            "use_stemming": False,
            "common_words_gazetteer_name": "mocked_gazetteer"
        },
        "offsets": [0]
    }
    mock_get_gazetteer.return_value = {"hello", "beautiful", "world"}
    # "foobar" is deliberately absent from the mocked gazetteer
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello beautiful foobar world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)
    features = factory.build_features()

    # When
    bigram_value = features[0].compute(1, token_cache)

    # Then
    self.assertIsInstance(factory, NgramFactory)
    self.assertEqual(features[0].base_name, "ngram_2")
    self.assertEqual(bigram_value, "beautiful rare_word")
def test_builtin_entity_match_factory(self, mock_supported_entities):
    """BILOU tags produced by the builtin entity match features on an
    utterance containing both a number and a datetime."""
    # Given
    def fake_supported_entities(language):
        if language == LANGUAGE_EN:
            return {SNIPS_NUMBER, SNIPS_DATETIME}
        return set()

    mock_supported_entities.side_effect = fake_supported_entities
    factory_config = {
        "factory_name": "builtin_entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)

    # When
    features = sorted(factory.build_features(), key=lambda f: f.base_name)
    # One feature per builtin entity, sorted: datetime first, number second
    datetime_tags = [features[0].compute(i, token_cache) for i in range(5)]
    number_tags = [features[1].compute(i, token_cache) for i in range(5)]

    # Then
    self.assertIsInstance(factory, BuiltinEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name,
                     "builtin_entity_match_snips/datetime")
    self.assertEqual(features[1].base_name,
                     "builtin_entity_match_snips/number")
    self.assertEqual(
        datetime_tags,
        [UNIT_PREFIX, None, BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX])
    self.assertEqual(number_tags, [UNIT_PREFIX, None, None, None, None])
def test_builtin_entity_match_factory(self, mock_supported_entities):
    """BILOU tagging of builtin entity matches: "one" is a lone number,
    "tomorrow at 2pm" is a multi-token datetime span."""
    # Given
    def fake_supported_entities(language):
        if language == LANGUAGE_EN:
            return {SNIPS_NUMBER, SNIPS_DATETIME}
        return set()

    mock_supported_entities.side_effect = fake_supported_entities
    factory_config = {
        "factory_name": "builtin_entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)

    # When
    features = sorted(factory.build_features(), key=lambda f: f.base_name)
    datetime_feature, number_feature = features[0], features[1]
    datetime_tags = [datetime_feature.compute(i, token_cache)
                     for i in range(5)]
    number_tags = [number_feature.compute(i, token_cache)
                   for i in range(5)]

    # Then
    self.assertIsInstance(factory, BuiltinEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(datetime_feature.base_name,
                     "builtin_entity_match_snips/datetime")
    self.assertEqual(number_feature.base_name,
                     "builtin_entity_match_snips/number")
    self.assertEqual(
        datetime_tags,
        [UNIT_PREFIX, None, BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX])
    self.assertEqual(number_tags, [UNIT_PREFIX, None, None, None, None])
def get_required_resources(self):
    """Merge the resources required by the data augmentation config with
    those required by every configured feature factory."""
    # Imported locally to avoid a circular import at module load time
    from snips_nlu.slot_filler.feature_factory import get_feature_factory

    merged = self.data_augmentation_config.get_required_resources()
    for factory_config in self.feature_factory_configs:
        factory = get_feature_factory(factory_config)
        merged = merge_required_resources(
            merged, factory.get_required_resources())
    return merged
def test_entity_match_factory(self):
    """Custom entity match features with stemming: BILOU tags for the two
    dummy entities of the sample dataset."""
    # Given
    factory_config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": True
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("2 dummy a had dummy_c", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    dataset = validate_and_format_dataset(deepcopy(SAMPLE_DATASET))
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS)
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = sorted(
        factory.build_features(custom_entity_parser=custom_entity_parser),
        key=lambda f: f.base_name)
    entity_1_tags = [features[0].compute(i, token_cache) for i in range(5)]
    entity_2_tags = [features[1].compute(i, token_cache) for i in range(5)]

    # Then
    self.assertIsInstance(factory, CustomEntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    self.assertEqual(
        entity_1_tags,
        [BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX, None, None])
    self.assertEqual(entity_2_tags, [None, None, None, None, UNIT_PREFIX])
def __init__(self, config=None):
    """The CRF slot filler can be configured by passing a
    :class:`.CRFSlotFillerConfig`"""
    if config is None:
        # Fall back to the default configuration for this slot filler
        config = self.config_type()
    super(CRFSlotFiller, self).__init__(config)
    # Populated during fit(); None until then
    self.crf_model = None
    self.features_factories = [
        get_feature_factory(factory_config)
        for factory_config in self.config.feature_factory_configs
    ]
    self._features = None
    self.language = None
    self.intent = None
    self.slot_name_mapping = None
def __init__(self, config=None):
    """The CRF slot filler can be configured by passing a
    :class:`.CRFSlotFillerConfig`"""
    effective_config = self.config_type() if config is None else config
    super(CRFSlotFiller, self).__init__(effective_config)
    # Feature factories are built eagerly from the config; everything
    # else is filled in during fit()
    self.crf_model = None
    self.features_factories = [
        get_feature_factory(c) for c in self.config.feature_factory_configs
    ]
    self._features = None
    self.language = None
    self.intent = None
    self.slot_name_mapping = None
def test_entity_match_factory(self):
    """Entity match features without stemming: BILOU tags for the two
    dummy entities of the sample dataset."""
    # Given
    factory_config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": False
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    dataset = validate_and_format_dataset(deepcopy(SAMPLE_DATASET))
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = sorted(factory.build_features(),
                      key=lambda f: f.base_name)
    entity_1_tags = [features[0].compute(i, token_cache) for i in range(5)]
    entity_2_tags = [features[1].compute(i, token_cache) for i in range(5)]

    # Then
    self.assertIsInstance(factory, EntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    self.assertEqual(
        entity_1_tags,
        [BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX, None, None])
    self.assertEqual(entity_2_tags, [None, None, None, None, UNIT_PREFIX])
def test_length_factory(self):
    """The length feature should yield the token's character count as a
    string ("world" -> "5")."""
    # Given
    factory_config = {"factory_name": "length", "args": {}, "offsets": [0]}
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello beautiful world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    length_value = features[0].compute(2, token_cache)

    # Then
    self.assertIsInstance(factory, LengthFactory)
    self.assertEqual(features[0].base_name, "length")
    self.assertEqual(length_value, "5")
def test_is_first_factory(self):
    """The is_first feature fires ("1") only on the first token."""
    # Given
    factory_config = {"factory_name": "is_first", "args": {},
                      "offsets": [0]}
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello beautiful world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    first_token_value = features[0].compute(0, token_cache)
    second_token_value = features[0].compute(1, token_cache)

    # Then
    self.assertIsInstance(factory, IsFirstFactory)
    self.assertEqual(features[0].base_name, "is_first")
    self.assertEqual(first_token_value, "1")
    self.assertEqual(second_token_value, None)
def test_length_factory(self):
    """The length feature should yield the token's character count
    ("world" -> 5)."""
    # Given
    factory_config = {
        "factory_name": "length",
        "args": {},
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello beautiful world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    length_value = features[0].compute(2, token_cache)

    # Then
    self.assertIsInstance(factory, LengthFactory)
    self.assertEqual(features[0].base_name, "length")
    self.assertEqual(length_value, 5)
def test_word_cluster_factory(self, mock_get_word_clusters):
    """The word cluster feature maps each token to its cluster id, or
    None for words absent from the cluster."""
    # Given
    def fake_word_clusters(language):
        if language == LANGUAGE_EN:
            return {"mocked_cluster": {"word1": "00", "word2": "11"}}
        return dict()

    mock_get_word_clusters.side_effect = fake_word_clusters
    factory_config = {
        "factory_name": "word_cluster",
        "args": {
            "cluster_name": "mocked_cluster",
            "use_stemming": False
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello word1 word2", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)
    features = factory.build_features()

    # When
    cluster_values = [features[0].compute(i, token_cache)
                      for i in range(3)]

    # Then
    self.assertIsInstance(factory, WordClusterFactory)
    self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
    self.assertEqual(cluster_values, [None, "00", "11"])
def test_is_first_factory(self):
    """Only token index 0 should trigger the is_first feature."""
    # Given
    factory_config = {
        "factory_name": "is_first",
        "args": {},
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello beautiful world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit(None, None)
    features = factory.build_features()

    # When
    value_at_0 = features[0].compute(0, token_cache)
    value_at_1 = features[0].compute(1, token_cache)

    # Then
    self.assertIsInstance(factory, IsFirstFactory)
    self.assertEqual(features[0].base_name, "is_first")
    self.assertEqual(value_at_0, "1")
    self.assertEqual(value_at_1, None)
def test_word_cluster_factory(self, mock_get_word_clusters):
    """Tokens present in the mocked cluster get their cluster id; other
    tokens get None."""
    # Given
    def fake_word_clusters(language):
        return ({"mocked_cluster": {"word1": "00", "word2": "11"}}
                if language == LANGUAGE_EN else dict())

    mock_get_word_clusters.side_effect = fake_word_clusters
    factory_config = {
        "factory_name": "word_cluster",
        "args": {
            "cluster_name": "mocked_cluster",
            "use_stemming": False
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello word1 word2", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)
    features = factory.build_features()

    # When
    value_at_0 = features[0].compute(0, token_cache)
    value_at_1 = features[0].compute(1, token_cache)
    value_at_2 = features[0].compute(2, token_cache)

    # Then
    self.assertIsInstance(factory, WordClusterFactory)
    self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
    self.assertEqual(value_at_0, None)
    self.assertEqual(value_at_1, "00")
    self.assertEqual(value_at_2, "11")
def test_shape_ngram_factory(self):
    """The shape trigram at index 1 covers "Beautiful foObar world",
    whose shapes are "Xxx xX xxx"."""
    # Given
    factory_config = {
        "factory_name": "shape_ngram",
        "args": {
            "n": 3,
        },
        "offsets": [0]
    }
    token_cache = [
        {TOKEN_NAME: t}
        for t in tokenize("hello Beautiful foObar world", LANGUAGE_EN)
    ]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)
    features = factory.build_features()

    # When
    shape_value = features[0].compute(1, token_cache)

    # Then
    self.assertIsInstance(factory, ShapeNgramFactory)
    self.assertEqual(features[0].base_name, "shape_ngram_3")
    self.assertEqual(shape_value, "Xxx xX xxx")
def test_shape_ngram_factory(self):
    """Shape trigram feature: mixed-case tokens are mapped to their
    character-shape patterns."""
    # Given
    factory_config = {
        "factory_name": "shape_ngram",
        "args": {
            "n": 3,
        },
        "offsets": [0]
    }
    tokens = tokenize("hello Beautiful foObar world", LANGUAGE_EN)
    token_cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(factory_config)
    factory.fit({"language": "en"}, None)
    trigram_features = factory.build_features()
    shape_feature = trigram_features[0]

    # When
    shape_value = shape_feature.compute(1, token_cache)

    # Then
    self.assertIsInstance(factory, ShapeNgramFactory)
    self.assertEqual(shape_feature.base_name, "shape_ngram_3")
    self.assertEqual(shape_value, "Xxx xX xxx")