# Module-level imports. The remaining helpers and constants (get_noise,
# get_noise_it, tokenize_light, get_text_from_chunks, text_to_utterance,
# augment_utterances, add_unknown_word_to_utterances, is_builtin_entity,
# DATA, ENTITIES, INTENTS, UTTERANCES, NOISE_NAME, UNKNOWNWORD,
# UNKNOWNWORD_REGEX) are defined elsewhere in the package.
import numpy as np
from future.utils import iteritems, itervalues


def generate_noise_utterances(augmented_utterances, noise, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        # Map out-of-vocabulary noise words onto the replacement string
        noise = generate_smart_noise(
            noise, augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)

    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Collapse duplicated "unknownword unknownword" runs into a single token
    return [
        text_to_utterance(UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it)))
        for _ in range(noise_size)]
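
# Noise sizing example (illustrative numbers): with noise_factor=5 and 100
# augmented utterances spread over 4 intents (avg_num_utterances=25), up to
# min(int(5 * 25), len(noise)) = 125 noise utterances are generated, each
# assembled by get_noise_it with a length fitted on the mean and standard
# deviation of the training utterance lengths.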
def generate_smart_noise(noise, augmented_utterances, replacement_string,
                         language):
    text_utterances = [
        get_text_from_chunks(u[DATA]) for u in augmented_utterances]
    vocab = set(
        w for u in text_utterances for w in tokenize_light(u, language))
    # Keep noise words that appear in the training vocabulary; map every
    # other word onto the unknown-word replacement string
    return [w if w in vocab else replacement_string for w in noise]
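
# Example (illustrative): if the training vocabulary is {"turn", "on", "the",
# "light"}, the noise word "the" is kept as-is while "weather" is mapped to
# the replacement string, so all unseen noise words collapse onto a single
# unknown-word token.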
def get_dataset_specific_noise(dataset, resources):
    """Return a noise list that excludes the dataset entity values"""
    entities_values = set()
    for ent_name, ent in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(ent_name):
            continue
        for k, v in iteritems(ent[UTTERANCES]):
            entities_values.add(k)
            entities_values.add(v)
    original_noise = get_noise(resources)
    specific_noise = [n for n in original_noise if n not in entities_values]
    if not specific_noise:  # Avoid returning an empty noise list
        return original_noise
    return specific_noise
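
# Example (hypothetical): if the dataset defines a custom "beverage" entity
# with the value "coffee", the returned list is the language noise with
# "coffee" filtered out, so noise utterances cannot collide with entity
# values.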
def build_training_data(dataset, language, data_augmentation_config,
                        resources, random_state):
    # Create class mapping
    intents = dataset[INTENTS]
    intent_index = 0
    classes_mapping = dict()
    for intent in sorted(intents):
        classes_mapping[intent] = intent_index
        intent_index += 1

    noise_class = intent_index

    # Augment each intent up to its minimum number of utterances
    augmented_utterances = []
    utterance_classes = []
    for intent_name, intent_data in sorted(iteritems(intents)):
        nb_utterances = len(intent_data[UTTERANCES])
        min_utterances_to_generate = max(
            data_augmentation_config.min_utterances, nb_utterances)
        utterances = augment_utterances(
            dataset, intent_name, language=language,
            min_utterances=min_utterances_to_generate,
            capitalization_ratio=0.0,
            add_builtin_entities_examples=(
                data_augmentation_config.add_builtin_entities_examples),
            resources=resources,
            random_state=random_state)
        augmented_utterances += utterances
        utterance_classes += [
            classes_mapping[intent_name] for _ in range(len(utterances))]
    if data_augmentation_config.unknown_words_replacement_string is not None:
        augmented_utterances = add_unknown_word_to_utterances(
            augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            data_augmentation_config.unknown_word_prob,
            data_augmentation_config.max_unknown_words,
            random_state)

    # Adding noise
    noise = get_noise(resources)
    noisy_utterances = generate_noise_utterances(
        augmented_utterances, noise, len(intents), data_augmentation_config,
        language, random_state)

    augmented_utterances += noisy_utterances
    utterance_classes += [noise_class for _ in noisy_utterances]
    if noisy_utterances:
        classes_mapping[NOISE_NAME] = noise_class

    # Build the reverse mapping from class index to intent name, with None
    # marking the noise class
    nb_classes = len(set(itervalues(classes_mapping)))
    intent_mapping = [None for _ in range(nb_classes)]
    for intent, intent_class in iteritems(classes_mapping):
        if intent == NOISE_NAME:
            intent_mapping[intent_class] = None
        else:
            intent_mapping[intent_class] = intent

    return augmented_utterances, np.array(utterance_classes), intent_mapping
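
# Usage sketch (hypothetical values; assumes a validated snips-nlu dataset
# dict, a data augmentation config such as the one carried by
# LogRegIntentClassifierConfig, and loaded language resources):
#
#     utterances, classes, intent_mapping = build_training_data(
#         dataset, "en", config.data_augmentation_config, resources,
#         np.random.RandomState(42))
#
# classes[i] is the class index of utterances[i], and intent_mapping maps
# each class index back to its intent name, with None for the noise class.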