예제 #1
0
    def test_generate_noise_utterances(self, mocked_get_noise):
        # Given
        language = LANGUAGE_EN
        num_intents = 2
        noise_factor = 1
        utterances_length = 5

        noise = [str(i) for i in range(utterances_length)]
        mocked_get_noise.return_value = noise

        augmented_utterances = [
            {
                "data": [
                    {
                        "text": " ".join(
                            "{}".format(i) for i in range(utterances_length))
                    }
                ]
            }
        ]
        num_utterances = 10
        random_state = np.random.RandomState(1)

        augmented_utterances = augmented_utterances * num_utterances
        config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor)
        # When
        noise_utterances = generate_noise_utterances(
            augmented_utterances, noise, num_intents, config, language,
            random_state)

        # Then
        joined_noise = text_to_utterance(" ".join(noise))
        for u in noise_utterances:
            self.assertEqual(u, joined_noise)
    def test_should_build_training_data_with_noise(self,
                                                   mocked_augment_utterances,
                                                   mocked_get_noise):
        # Given
        mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
        mocked_get_noise.return_value = mocked_noises
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances

        num_intents = 3
        utterances_length = 5
        num_queries_per_intent = 3
        fake_utterance = {
            "data": [{
                "text": " ".join("1" for _ in range(utterances_length))
            }]
        }
        dataset = {
            "intents": {
                str(i): {
                    "utterances": [fake_utterance] * num_queries_per_intent
                }
                for i in range(num_intents)
            }
        }
        random_state = np.random.RandomState(1)

        # When
        np.random.seed(42)
        noise_factor = 2
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor,
            unknown_word_prob=0,
            unknown_words_replacement_string=None)
        utterances, _, intent_mapping = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, random_state)

        # Then
        expected_utterances = [
            utterance for intent in itervalues(dataset[INTENTS])
            for utterance in intent[UTTERANCES]
        ]
        np.random.seed(42)
        noise = list(mocked_noises)
        noise_size = int(min(noise_factor * num_queries_per_intent,
                             len(noise)))
        noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                                random_state)
        noisy_utterances = [
            text_to_utterance(next(noise_it)) for _ in range(noise_size)
        ]
        expected_utterances += noisy_utterances
        expected_intent_mapping = sorted(dataset["intents"])
        expected_intent_mapping.append(None)
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)
예제 #3
0
    def test_intent_classifier_data_augmentation_config(self):
        # Given
        config_dict = {
            "min_utterances": 3,
            "noise_factor": 2,
            "unknown_word_prob": 0.1,
            "unknown_words_replacement_string": "foobar",
        }

        # When
        config = IntentClassifierDataAugmentationConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
예제 #4
0
    def test_intent_classifier_config(self):
        # Given
        config_dict = {
            "unit_name": LogRegIntentClassifier.unit_name,
            "data_augmentation_config":
                IntentClassifierDataAugmentationConfig().to_dict(),
            "featurizer_config": FeaturizerConfig().to_dict(),
            "random_seed": 42
        }

        # When
        config = LogRegIntentClassifierConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
예제 #5
0
    def test_should_build_training_data_with_no_stemming_no_noise(
            self, mocked_augment_utterances):
        # Given
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances
        random_state = np.random.RandomState(1)

        # When
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=0)
        utterances, _, intent_mapping = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, random_state)

        # Then
        expected_utterances = [utterance for intent
                               in itervalues(dataset[INTENTS])
                               for utterance in intent[UTTERANCES]]
        expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(expected_intent_mapping, intent_mapping)
    def test_should_build_training_data_with_no_noise(
            self, mocked_augment_utterances):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up

---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        resources = self.get_resources(dataset[LANGUAGE])
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances
        random_state = np.random.RandomState(1)

        # When
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=0)
        utterances, _, intent_mapping = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, resources,
            random_state)

        # Then
        expected_utterances = [
            utterance for _, intent in sorted(iteritems(dataset[INTENTS]))
            for utterance in intent[UTTERANCES]
        ]
        expected_intent_mapping = ["my_first_intent", "my_second_intent"]
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(expected_intent_mapping, intent_mapping)