Пример #1
0
def generate_noise_utterances(augmented_utterances, num_intents,
                              data_augmentation_config, language,
                              random_state):
    """Build the noise utterances matching a set of augmented utterances.

    The number of generated utterances is
    ``int(noise_factor * len(augmented_utterances) / num_intents)``, capped
    by the number of available noise items.

    Args:
        augmented_utterances: utterances whose ``DATA`` entry is turned into
            text to measure token counts (and to build the vocabulary when a
            replacement string is configured).
        num_intents: number of intents used to average the utterance count.
        data_augmentation_config: object exposing ``noise_factor`` and
            ``unknown_words_replacement_string``.
        language: forwarded to the noise and tokenization helpers.
        random_state: forwarded to ``get_noise_it``.

    Returns:
        list: one ``text_to_utterance`` result per generated noise text;
        empty when ``augmented_utterances`` or ``num_intents`` is falsy.
    """
    if not (augmented_utterances and num_intents):
        return []

    utterances_per_intent = len(augmented_utterances) / float(num_intents)

    # With a replacement string configured, noise words that are absent from
    # the utterances are masked; otherwise the raw language noise is used.
    replacement = data_augmentation_config.unknown_words_replacement_string
    if replacement is None:
        noise = get_noise(language)
    else:
        noise = generate_smart_noise(
            augmented_utterances, replacement, language)

    requested_size = int(
        data_augmentation_config.noise_factor * utterances_per_intent)
    noise_size = min(requested_size, len(noise))

    token_counts = []
    for utterance in augmented_utterances:
        text = get_text_from_chunks(utterance[DATA])
        token_counts.append(len(tokenize_light(text, language)))

    # The noise iterator is parameterized by the mean and standard deviation
    # of the token counts of the augmented utterances.
    noise_it = get_noise_it(
        noise, np.mean(token_counts), np.std(token_counts), random_state)

    noisy_utterances = []
    for _ in range(noise_size):
        raw_text = next(noise_it)
        # Remove duplicate 'unknownword unknownword'
        cleaned_text = UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, raw_text)
        noisy_utterances.append(text_to_utterance(cleaned_text))
    return noisy_utterances
Пример #2
0
def generate_smart_noise(augmented_utterances, replacement_string, language):
    """Return the language noise with out-of-vocabulary items masked.

    The vocabulary is the set of tokens that ``tokenize_light`` yields for
    the text of every utterance in ``augmented_utterances``. Each item of
    ``get_noise(language)`` is kept when it belongs to that vocabulary and
    is replaced by ``replacement_string`` otherwise, so the result has the
    same length as the original noise.
    """
    texts = []
    for utterance in augmented_utterances:
        texts.append(get_text_from_chunks(utterance[DATA]))

    known_words = {
        token for text in texts for token in tokenize_light(text, language)
    }

    return [
        word if word in known_words else replacement_string
        for word in get_noise(language)
    ]
def get_dataset_specific_noise(dataset, language):
    """Return a noise list that excludes the dataset entity values.

    For every entity of ``dataset[ENTITIES]`` that is not a builtin entity,
    both the keys and the values of its ``UTTERANCES`` mapping are excluded
    from ``get_noise(language)``. If that filtering would leave nothing, the
    unfiltered noise is returned instead.
    """
    excluded = set()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if not is_builtin_entity(entity_name):
            for key, value in iteritems(entity[UTTERANCES]):
                excluded.update((key, value))

    base_noise = get_noise(language)
    filtered_noise = [item for item in base_noise if item not in excluded]
    # Avoid returning an empty noise
    return filtered_noise if filtered_noise else base_noise
Пример #4
0
def build_training_data(dataset, language, data_augmentation_config, resources,
                        random_state):
    """Assemble utterances, class labels and the class-to-intent mapping.

    Intents of ``dataset[INTENTS]`` are numbered from 0 in sorted-name
    order, and the noise class takes the next index. Each intent is
    augmented to at least ``data_augmentation_config.min_utterances``
    utterances (or its current count when that is larger). Noise utterances
    are then appended and labelled with the noise class.

    Returns:
        tuple: ``(augmented_utterances, classes, intent_mapping)`` where
        ``classes`` is a numpy array holding one class index per utterance
        and ``intent_mapping[i]`` is the intent name of class ``i``
        (``None`` for the noise class).
    """
    import numpy as np

    intents = dataset[INTENTS]

    # Create class mapping
    classes_mapping = {
        name: index for index, name in enumerate(sorted(intents))
    }
    noise_class = len(classes_mapping)

    augmented_utterances = []
    utterance_classes = []
    for intent_name, intent_data in sorted(iteritems(intents)):
        existing_count = len(intent_data[UTTERANCES])
        target_count = max(
            data_augmentation_config.min_utterances, existing_count)
        intent_utterances = augment_utterances(
            dataset,
            intent_name,
            language=language,
            min_utterances=target_count,
            capitalization_ratio=0.0,
            add_builtin_entities_examples=(
                data_augmentation_config.add_builtin_entities_examples),
            resources=resources,
            random_state=random_state)
        augmented_utterances.extend(intent_utterances)
        utterance_classes.extend(
            [classes_mapping[intent_name]] * len(intent_utterances))

    replacement = data_augmentation_config.unknown_words_replacement_string
    if replacement is not None:
        augmented_utterances = add_unknown_word_to_utterances(
            augmented_utterances,
            replacement,
            data_augmentation_config.unknown_word_prob,
            data_augmentation_config.max_unknown_words,
            random_state)

    # Adding noise
    # NOTE(review): this call passes the precomputed noise as the second
    # positional argument (six arguments in total) -- confirm it matches the
    # signature of generate_noise_utterances used by this module.
    noise = get_noise(resources)
    noisy_utterances = generate_noise_utterances(
        augmented_utterances, noise, len(intents), data_augmentation_config,
        language, random_state)

    augmented_utterances += noisy_utterances
    utterance_classes.extend(noise_class for _ in noisy_utterances)
    # The noise class is only registered when noise was actually generated.
    if noisy_utterances:
        classes_mapping[NOISE_NAME] = noise_class

    nb_classes = len(set(itervalues(classes_mapping)))
    intent_mapping = [None] * nb_classes
    for name, class_index in iteritems(classes_mapping):
        # The noise class has no intent name and maps to None.
        intent_mapping[class_index] = None if name == NOISE_NAME else name

    return augmented_utterances, np.array(utterance_classes), intent_mapping