def fit(self, dataset):
    """Train the intent classifier on a Snips dataset

    The dataset is validated and formatted, converted into training
    utterances and classes, featurized, and finally used to fit an
    ``SGDClassifier``. Training stops early, leaving the instance without a
    freshly fitted classifier, when at most one intent is available or when
    the featurizer's ``fit`` returns ``None``.

    Args:
        dataset: The dataset to train on; it is passed through
            ``validate_and_format_dataset`` before use.

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    logger.debug("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    random_state = check_random_state(self.config.random_seed)
    augmentation_config = self.config.data_augmentation_config

    training_data = build_training_data(
        dataset, language, augmentation_config, random_state)
    utterances, classes, self.intent_list = training_data

    # A classifier is pointless with fewer than two entries to tell apart.
    if len(self.intent_list) < 2:
        return self

    unknown_words_string = \
        augmentation_config.unknown_words_replacement_string
    self.featurizer = Featurizer(
        language, unknown_words_string, self.config.featurizer_config)
    self.featurizer = self.featurizer.fit(dataset, utterances, classes)
    if self.featurizer is None:
        return self

    features = self.featurizer.transform(utterances)
    alpha = get_regularization_factor(dataset)
    self.classifier = SGDClassifier(
        random_state=random_state, alpha=alpha, **LOG_REG_ARGS)
    self.classifier.fit(features, classes)

    # The message wrapper receives the callable itself, not its result.
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
def fit(self, dataset):
    """Fits the intent classifier with a valid Snips dataset

    The dataset is validated, the resources and entity parsers are prepared,
    training data is built, and a featurizer plus an ``SGDClassifier`` with
    balanced class weights are fitted. The weight of the last class (the
    noise class) is additionally scaled by ``config.noise_reweight_factor``.

    Training stops early, leaving the classifier untouched, when at most one
    intent is available or when the featurizer reports that the dataset has
    no non-empty utterances (in which case ``self.featurizer`` is reset to
    ``None``).

    Args:
        dataset: The dataset to train on; it is passed through
            ``validate_and_format_dataset`` before use.

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils import compute_class_weight

    logger.info("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    self.load_resources_if_needed(language)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, self.resources,
        self.random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        config=self.config.featurizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources,
        random_state=self.random_state,
    )
    self.featurizer.language = language

    # The highest class index is the one handed to the featurizer as the
    # "none" class and re-weighted below as the noise class.
    none_class = max(classes)
    try:
        x = self.featurizer.fit_transform(
            dataset, utterances, classes, none_class)
    except _EmptyDatasetUtterancesError:
        logger.warning("No (non-empty) utterances found in dataset")
        self.featurizer = None
        return self

    alpha = get_regularization_factor(dataset)

    # `classes` and `y` are keyword-only in recent scikit-learn releases and
    # `classes` is documented as an ndarray, so pass both explicitly; the
    # keyword names are also accepted by older releases.
    class_weights_arr = compute_class_weight(
        class_weight="balanced",
        classes=np.arange(none_class + 1),
        y=classes,
    )
    # Re-weight the noise class
    class_weights_arr[-1] *= self.config.noise_reweight_factor
    class_weight = dict(enumerate(class_weights_arr))

    self.classifier = SGDClassifier(
        random_state=self.random_state, alpha=alpha,
        class_weight=class_weight, **LOG_REG_ARGS)
    self.classifier.fit(x, classes)

    # The message wrapper receives the callable itself, not its result.
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
def test_should_build_training_data_with_noise(self,
                                               mocked_augment_utterances,
                                               mocked_get_noise):
    """Check the training data built when noise is requested.

    The expected utterances are the dataset utterances followed by noise
    utterances drawn from the mocked noise list, and the expected intent
    mapping is the sorted intent names followed by ``None``.

    The two mock arguments are presumably injected by patch decorators that
    are not visible in this view.
    """
    # Given
    mocked_noises = ["mocked_noise_%s" % idx for idx in range(100)]
    mocked_get_noise.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances

    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3

    fake_text = " ".join(["1"] * utterances_length)
    fake_utterance = {"data": [{"text": fake_text}]}

    intents = {}
    for intent_idx in range(num_intents):
        intents[str(intent_idx)] = {
            "utterances": [fake_utterance] * num_queries_per_intent
        }
    dataset = {"intents": intents}

    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor,
        unknown_word_prob=0,
        unknown_words_replacement_string=None)

    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        expected_utterances.extend(intent[UTTERANCES])

    np.random.seed(42)
    noise_size = int(
        min(noise_factor * num_queries_per_intent, len(mocked_noises)))
    # The same random state that was handed to build_training_data is
    # reused here, so the statement order must be kept.
    noise_it = get_noise_it(
        mocked_noises, utterances_length, 0, random_state)
    for _ in range(noise_size):
        expected_utterances.append(text_to_utterance(next(noise_it)))

    expected_intent_mapping = sorted(dataset["intents"]) + [None]

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)
def test_should_build_training_data_with_noise(
        self, mocked_augment_utterances, mocked_get_noises):
    """Check the training data built when noise is requested.

    The expected utterances are the texts of the dataset utterances followed
    by raw noise strings drawn from the mocked noise list, and the expected
    intent mapping is the sorted intent names followed by ``None``.

    The two mock arguments are presumably injected by patch decorators that
    are not visible in this view.
    """
    # Given
    mocked_noises = ["mocked_noise_%s" % idx for idx in range(100)]
    mocked_get_noises.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances

    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3

    fake_text = " ".join(["1"] * utterances_length)
    fake_utterance = {"data": [{"text": fake_text}]}

    intents = {}
    for intent_idx in range(num_intents):
        intents[str(intent_idx)] = {
            "utterances": [fake_utterance] * num_queries_per_intent
        }
    dataset = {"intents": intents}

    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor,
        unknown_word_prob=0,
        unknown_words_replacement_string=None)

    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        for utterance in intent[UTTERANCES]:
            expected_utterances.append(
                get_text_from_chunks(utterance[DATA]))

    np.random.seed(42)
    noise_size = int(
        min(noise_factor * num_queries_per_intent, len(mocked_noises)))
    # The same random state that was handed to build_training_data is
    # reused here, so the statement order must be kept.
    noise_it = get_noise_it(
        mocked_noises, utterances_length, 0, random_state)
    for _ in range(noise_size):
        expected_utterances.append(next(noise_it))

    expected_intent_mapping = sorted(dataset["intents"]) + [None]

    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)
def test_should_build_training_data_with_no_data(self):
    """Check that an empty dataset yields no utterances and no intents."""
    # Given
    language = LANGUAGE_EN
    empty_dataset = get_empty_dataset(language)
    dataset = validate_and_format_dataset(empty_dataset)
    random_state = np.random.RandomState(1)

    # When
    classifier_config = LogRegIntentClassifierConfig()
    augmentation_config = classifier_config.data_augmentation_config
    utterances, _, intent_mapping = build_training_data(
        dataset, language, augmentation_config, random_state)

    # Then
    self.assertListEqual(utterances, [])
    self.assertListEqual(intent_mapping, [])
def fit(self, dataset):
    """Fits the intent classifier with a valid Snips dataset

    The dataset is validated, the resources and entity parsers are prepared,
    training data is built, and a featurizer plus an ``SGDClassifier`` are
    fitted.

    Training stops early, leaving the classifier untouched, when at most one
    intent is available or when the featurizer reports that the dataset has
    no non-empty utterances (in which case a warning is logged and
    ``self.featurizer`` is reset to ``None``).

    Args:
        dataset: The dataset to train on; it is passed through
            ``validate_and_format_dataset`` before use.

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    logger.info("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    self.load_resources_if_needed(language)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, self.resources,
        self.random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        config=self.config.featurizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources,
        random_state=self.random_state,
    )
    self.featurizer.language = language

    # The highest class index is handed to the featurizer as the "none"
    # class.
    none_class = max(classes)
    try:
        x = self.featurizer.fit_transform(
            dataset, utterances, classes, none_class)
    except _EmptyDatasetUtterancesError:
        # Do not swallow this silently: the caller ends up with an untrained
        # classifier and should be able to see why in the logs.
        logger.warning("No (non-empty) utterances found in dataset")
        self.featurizer = None
        return self

    alpha = get_regularization_factor(dataset)
    self.classifier = SGDClassifier(
        random_state=self.random_state, alpha=alpha, **LOG_REG_ARGS)
    self.classifier.fit(x, classes)

    # The message wrapper receives the callable itself, not its result.
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
def test_should_build_training_data_with_no_stemming_no_noise(
        self, mocked_augment_utterances):
    """Check the training data built with a noise factor of zero.

    The expected utterances are exactly the utterances of the validated
    sample dataset, and the expected intent mapping lists the two dummy
    intents.

    The mock argument is presumably injected by a patch decorator that is
    not visible in this view.
    """
    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        expected_utterances.extend(intent[UTTERANCES])
    expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)
def test_should_build_training_data_with_no_stemming_no_noise(
        self, mocked_augment_utterances):
    """Check the training data built with a noise factor of zero.

    The expected utterances are the texts of the sample dataset utterances,
    and the expected intent mapping lists the two dummy intents.

    The mock argument is presumably injected by a patch decorator that is
    not visible in this view.
    """
    # Given
    dataset = SAMPLE_DATASET
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        for utterance in intent[UTTERANCES]:
            expected_utterances.append(
                get_text_from_chunks(utterance[DATA]))
    expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']

    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)
def test_should_build_training_data_with_no_noise(
        self, mocked_augment_utterances):
    """Check the training data built with a noise factor of zero.

    A two-intent dataset is loaded from an in-memory YAML stream. The
    expected utterances are the dataset utterances taken intent by intent in
    sorted intent-name order, and the expected intent mapping lists the two
    intent names.

    The mock argument is presumably injected by a patch decorator that is
    not visible in this view.
    """
    # Given
    # NOTE(review): the YAML literal is reproduced exactly as it appears in
    # this view, where its line breaks look collapsed into spaces -- confirm
    # the intended multi-line layout against the original file.
    yaml_stream = io.StringIO(""" --- type: intent name: my_first_intent utterances: - how are you - hello how are you? - what's up --- type: intent name: my_second_intent utterances: - what is the weather today ? - does it rain - will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    resources = self.get_resources(dataset[LANGUAGE])
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, resources, random_state)

    # Then
    expected_utterances = []
    for _name, intent in sorted(iteritems(dataset[INTENTS])):
        expected_utterances.extend(intent[UTTERANCES])
    expected_intent_mapping = ["my_first_intent", "my_second_intent"]

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)