def test_generate_noise_utterances(self, mocked_get_noise):
    # Checks that every utterance returned by generate_noise_utterances
    # equals text_to_utterance applied to the space-joined noise words.

    # Given
    language = LANGUAGE_EN
    num_intents = 2
    noise_factor = 1
    utterances_length = 5
    num_utterances = 10

    noise = [str(i) for i in range(utterances_length)]
    mocked_get_noise.return_value = noise

    # One utterance whose text is "0 1 2 3 4", repeated num_utterances
    # times (the list holds the same dict object at every position).
    utterance_text = " ".join(map("{}".format, range(utterances_length)))
    single_utterance = {"data": [{"text": utterance_text}]}
    augmented_utterances = [single_utterance] * num_utterances

    random_state = np.random.RandomState(1)
    config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor)

    # When
    noise_utterances = generate_noise_utterances(
        augmented_utterances, noise, num_intents, config, language,
        random_state)

    # Then
    joined_noise = text_to_utterance(" ".join(noise))
    for noise_utterance in noise_utterances:
        self.assertEqual(noise_utterance, joined_noise)
def test_should_build_training_data_with_noise(
        self, mocked_augment_utterances, mocked_get_noise):
    # Checks that build_training_data returns the dataset utterances
    # followed by the noise utterances, and an intent mapping made of the
    # sorted intent names followed by None.

    # Given
    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3

    mocked_noises = ["mocked_noise_%s" % idx for idx in range(100)]
    mocked_get_noise.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances

    utterance_text = " ".join(["1"] * utterances_length)
    fake_utterance = {"data": [{"text": utterance_text}]}

    dataset = {"intents": {}}
    for intent_idx in range(num_intents):
        dataset["intents"][str(intent_idx)] = {
            "utterances": [fake_utterance] * num_queries_per_intent
        }

    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor,
        unknown_word_prob=0,
        unknown_words_replacement_string=None)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        expected_utterances.extend(intent[UTTERANCES])

    np.random.seed(42)
    noise_size = int(
        min(noise_factor * num_queries_per_intent, len(mocked_noises)))
    # The same random_state object that was handed to build_training_data
    # is reused here, after that call has returned.
    noise_it = get_noise_it(
        mocked_noises, utterances_length, 0, random_state)
    for _ in range(noise_size):
        expected_utterances.append(text_to_utterance(next(noise_it)))

    expected_intent_mapping = sorted(dataset["intents"]) + [None]

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)
def test_intent_classifier_data_augmentation_config(self):
    # Checks that a config dict survives a from_dict -> to_dict round trip
    # through IntentClassifierDataAugmentationConfig unchanged.

    # Given
    config_dict = {
        "min_utterances": 3,
        "noise_factor": 2,
        "unknown_word_prob": 0.1,
        "unknown_words_replacement_string": "foobar",
    }

    # When
    round_tripped = IntentClassifierDataAugmentationConfig.from_dict(
        config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_intent_classifier_config(self):
    # Checks that a config dict survives a from_dict -> to_dict round trip
    # through LogRegIntentClassifierConfig unchanged.

    # Given
    unit_name = LogRegIntentClassifier.unit_name
    data_augmentation_dict = \
        IntentClassifierDataAugmentationConfig().to_dict()
    featurizer_dict = FeaturizerConfig().to_dict()
    config_dict = {
        "unit_name": unit_name,
        "data_augmentation_config": data_augmentation_dict,
        "featurizer_config": featurizer_dict,
        "random_seed": 42
    }

    # When
    round_tripped = LogRegIntentClassifierConfig.from_dict(
        config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_should_build_training_data_with_no_stemming_no_noise(
        self, mocked_augment_utterances):
    # Checks that, with noise_factor=0, build_training_data returns exactly
    # the utterances of the dataset and a mapping holding the two intent
    # names.

    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, random_state)

    # Then
    expected_utterances = []
    for intent in itervalues(dataset[INTENTS]):
        expected_utterances.extend(intent[UTTERANCES])
    expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)
def test_should_build_training_data_with_no_noise(
        self, mocked_augment_utterances):
    # Checks that, with noise_factor=0, build_training_data returns the
    # dataset utterances in intent-name order and a mapping holding the two
    # intent names declared in the YAML below.

    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up
---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    resources = self.get_resources(dataset[LANGUAGE])
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, resources,
        random_state)

    # Then
    expected_utterances = []
    for _, intent in sorted(iteritems(dataset[INTENTS])):
        expected_utterances.extend(intent[UTTERANCES])
    expected_intent_mapping = ["my_first_intent", "my_second_intent"]

    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)