def fit(self, dataset, intent):
    """Train the slot filler on one intent of the dataset

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): Name of the intent whose utterances are used to train
            the slot filler

    Returns:
        :class:`CRFSlotFiller`: This very instance, once trained
    """
    logger.debug("Fitting %s slot filler...", intent)
    dataset = validate_and_format_dataset(dataset)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    self.language = dataset[LANGUAGE]
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)

    # An intent without any slot has nothing to tag, so the CRF training is
    # skipped altogether
    if not self.slot_name_mapping:
        return self

    rng = check_random_state(self.config.random_seed)
    augmentation_kwargs = self.config.data_augmentation_config.to_dict()
    utterances = augment_utterances(
        dataset, self.intent, language=self.language, random_state=rng,
        **augmentation_kwargs)

    samples = [
        utterance_to_sample(
            utterance[DATA], self.config.tagging_scheme, self.language)
        for utterance in utterances
    ]

    # The feature factories must be fitted before any feature is computed
    for feature_factory in self.features_factories:
        feature_factory.fit(dataset, intent)

    features = [
        self.compute_features(sample[TOKENS], drop_out=True)
        for sample in samples
    ]
    labels = [list(sample[TAGS]) for sample in samples]

    # Make the training data safe and make sure the OUTSIDE label is learnt,
    # in order to avoid a segfault at inference time
    features, labels = _ensure_safe(features, labels)

    # The tags handed over to the CRF must be ascii
    labels = [
        [_encode_tag(tag) for tag in sample_labels]
        for sample_labels in labels
    ]

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(features, labels)

    logger.debug("Most relevant features for %s:\n%s", self.intent,
                 DifferedLoggingMessage(self.log_weights))
    return self
def build_training_data(dataset, language, data_augmentation_config,
                        resources, random_state):
    """Build the augmented utterances and class labels used for training

    Args:
        dataset (dict): Dataset whose ``INTENTS`` entry maps each intent name
            to its data
        language (str): Language forwarded to the augmentation and noise
            generation helpers
        data_augmentation_config: Configuration object providing
            ``min_utterances``, ``add_builtin_entities_examples``,
            ``unknown_words_replacement_string``, ``unknown_word_prob`` and
            ``max_unknown_words``
        resources: Resources forwarded to the augmentation and noise helpers
        random_state: Random state forwarded to every randomized helper

    Returns:
        tuple: ``(augmented_utterances, classes, intent_mapping)`` where
        ``classes`` is a numpy array holding one class index per utterance
        and ``intent_mapping`` is a list giving, for each class index, the
        intent name (``None`` for the noise class)
    """
    intents = dataset[INTENTS]

    # Classes are numbered following the alphabetical order of the intents;
    # the noise class takes the first index left unused
    classes_mapping = {
        intent_name: class_index
        for class_index, intent_name in enumerate(sorted(intents))
    }
    noise_class = len(classes_mapping)

    # Dataset statistics, computed up front for every intent
    utterance_counts = {
        intent_name: len(intent_data[UTTERANCES])
        for intent_name, intent_data in iteritems(intents)
    }

    augmented_utterances = []
    utterance_classes = []
    for intent_name in intents:
        # Never generate fewer utterances than the intent already has
        min_utterances_to_generate = max(
            data_augmentation_config.min_utterances,
            utterance_counts[intent_name])
        utterances = augment_utterances(
            dataset, intent_name, language=language,
            min_utterances=min_utterances_to_generate,
            capitalization_ratio=0.0,
            add_builtin_entities_examples=
            data_augmentation_config.add_builtin_entities_examples,
            resources=resources, random_state=random_state)
        augmented_utterances.extend(utterances)
        utterance_classes.extend(
            [classes_mapping[intent_name]] * len(utterances))

    replacement_string = \
        data_augmentation_config.unknown_words_replacement_string
    if replacement_string is not None:
        augmented_utterances = add_unknown_word_to_utterances(
            augmented_utterances,
            replacement_string,
            data_augmentation_config.unknown_word_prob,
            data_augmentation_config.max_unknown_words,
            random_state
        )

    # Adding noise
    noise = get_dataset_specific_noise(dataset, resources)
    noisy_utterances = generate_noise_utterances(
        augmented_utterances, noise, len(intents), data_augmentation_config,
        language, random_state)
    augmented_utterances += noisy_utterances
    utterance_classes.extend(noise_class for _ in noisy_utterances)

    # The noise class is only registered when some noise was generated
    if noisy_utterances:
        classes_mapping[NOISE_NAME] = noise_class

    nb_classes = len(set(itervalues(classes_mapping)))
    intent_mapping = [None] * nb_classes
    for name, class_index in iteritems(classes_mapping):
        intent_mapping[class_index] = None if name == NOISE_NAME else name

    return augmented_utterances, np.array(utterance_classes), intent_mapping
def build_training_data(dataset, language, data_augmentation_config,
                        random_state):
    """Build the augmented utterances and class labels used for training

    Args:
        dataset (dict): Dataset whose ``INTENTS`` entry maps each intent name
            to its data
        language (str): Language forwarded to the augmentation and noise
            generation helpers
        data_augmentation_config: Configuration object providing
            ``min_utterances``, ``unknown_words_replacement_string`` and
            ``unknown_word_prob``
        random_state: Random state forwarded to every randomized helper

    Returns:
        tuple: ``(augmented_utterances, classes, intent_mapping)`` where
        ``augmented_utterances`` holds the output of ``get_text_from_chunks``
        for each augmented utterance followed by the noise utterances,
        ``classes`` is a numpy array holding one class index per utterance
        and ``intent_mapping`` is a list giving, for each class index, the
        intent name (``None`` for the noise class)
    """
    intents = dataset[INTENTS]

    # Classes are numbered following the alphabetical order of the intents;
    # the noise class takes the first index left unused
    classes_mapping = dict()
    for class_index, intent_name in enumerate(sorted(intents)):
        classes_mapping[intent_name] = class_index
    noise_class = len(classes_mapping)

    # Dataset statistics, computed up front for every intent
    utterance_counts = [
        len(intent_data[UTTERANCES]) for intent_data in itervalues(intents)
    ]

    augmented_utterances = []
    utterance_classes = []
    for utterance_count, intent_name in zip(utterance_counts, intents):
        # Never generate fewer utterances than the intent already has
        min_utterances_to_generate = max(
            data_augmentation_config.min_utterances, utterance_count)
        utterances = augment_utterances(
            dataset, intent_name, language=language,
            min_utterances=min_utterances_to_generate,
            capitalization_ratio=0.0, random_state=random_state)
        augmented_utterances.extend(utterances)
        utterance_classes.extend(
            [classes_mapping[intent_name]] * len(utterances))

    augmented_utterances = add_unknown_word_to_utterances(
        augmented_utterances,
        data_augmentation_config.unknown_words_replacement_string,
        data_augmentation_config.unknown_word_prob,
        random_state
    )

    # Adding noise: the noise is generated from the chunked utterances,
    # before they get passed through get_text_from_chunks
    noisy_utterances = generate_noise_utterances(
        augmented_utterances, len(intents), data_augmentation_config,
        language, random_state)

    augmented_utterances = [
        get_text_from_chunks(utterance[DATA])
        for utterance in augmented_utterances
    ]
    augmented_utterances.extend(noisy_utterances)
    utterance_classes.extend(noise_class for _ in noisy_utterances)

    # The noise class is only registered when some noise was generated
    if noisy_utterances:
        classes_mapping[NOISE_NAME] = noise_class

    nb_classes = len(set(itervalues(classes_mapping)))
    intent_mapping = [None] * nb_classes
    for name, class_index in iteritems(classes_mapping):
        intent_mapping[class_index] = None if name == NOISE_NAME else name

    return augmented_utterances, np.array(utterance_classes), intent_mapping
def fit(self, dataset, intent, verbose=False):
    """Train the slot filler on one intent of the dataset

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): Name of the intent whose utterances are used to train
            the slot filler
        verbose (bool, optional): When *True*, the weights of the CRF are
            printed once the training is over

    Returns:
        :class:`CRFSlotFiller`: This very instance, once trained
    """
    dataset = validate_and_format_dataset(dataset)
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
    self.language = dataset[LANGUAGE]

    rng = check_random_state(self.config.random_seed)
    augmentation_kwargs = self.config.data_augmentation_config.to_dict()
    utterances = augment_utterances(
        dataset, self.intent, language=self.language, random_state=rng,
        **augmentation_kwargs)

    samples = [
        utterance_to_sample(
            utterance[DATA], self.config.tagging_scheme, self.language)
        for utterance in utterances
    ]

    # The feature factories must be fitted before any feature is computed
    for feature_factory in self.features_factories:
        feature_factory.fit(dataset, intent)

    features = [
        self.compute_features(sample[TOKENS], drop_out=True)
        for sample in samples
    ]
    # The tags handed over to the CRF must be ascii
    labels = [
        [_encode_tag(tag) for tag in sample[TAGS]] for sample in samples
    ]

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(features, labels)

    if verbose:
        self.print_weights()
    return self
def fit(self, dataset, intent):
    """Train the slot filler on one intent of the dataset

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): Name of the intent whose utterances are used to train
            the slot filler

    Returns:
        :class:`CRFSlotFiller`: This very instance, once trained
    """
    logger.debug("Fitting %s slot filler...", intent)
    dataset = validate_and_format_dataset(dataset)
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
    self.language = dataset[LANGUAGE]

    rng = check_random_state(self.config.random_seed)
    augmentation_kwargs = self.config.data_augmentation_config.to_dict()
    utterances = augment_utterances(
        dataset, self.intent, language=self.language, random_state=rng,
        **augmentation_kwargs)

    samples = [
        utterance_to_sample(
            utterance[DATA], self.config.tagging_scheme, self.language)
        for utterance in utterances
    ]

    # The feature factories must be fitted before any feature is computed
    for feature_factory in self.features_factories:
        feature_factory.fit(dataset, intent)

    features = [
        self.compute_features(sample[TOKENS], drop_out=True)
        for sample in samples
    ]
    # The tags handed over to the CRF must be ascii
    labels = [
        [_encode_tag(tag) for tag in sample[TAGS]] for sample in samples
    ]

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(features, labels)

    logger.debug("Most relevant features for %s:\n%s", self.intent,
                 DifferedLoggingMessage(self.log_weights))
    return self
def fit(self, dataset, intent, verbose=False):
    """Train the slot filler on one intent of the dataset

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): Name of the intent whose utterances are used to train
            the slot filler
        verbose (bool, optional): When *True*, the weights of the CRF are
            printed once the training is over

    Returns:
        :class:`CRFSlotFiller`: This very instance, once trained
    """
    dataset = validate_and_format_dataset(dataset)
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
    self.language = dataset[LANGUAGE]

    seeded_random_state = check_random_state(self.config.random_seed)
    augmentation_options = self.config.data_augmentation_config.to_dict()
    training_utterances = augment_utterances(
        dataset, self.intent, language=self.language,
        random_state=seeded_random_state, **augmentation_options)

    training_samples = [
        utterance_to_sample(
            utterance[DATA], self.config.tagging_scheme, self.language)
        for utterance in training_utterances
    ]

    # Features can only be computed once every factory has been fitted
    for feature_factory in self.features_factories:
        feature_factory.fit(dataset, intent)

    token_features = [
        self.compute_features(sample[TOKENS], drop_out=True)
        for sample in training_samples
    ]
    # Tags are encoded so that the CRF only ever sees ascii labels
    encoded_tags = [
        [_encode_tag(tag) for tag in sample[TAGS]]
        for sample in training_samples
    ]

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(token_features, encoded_tags)

    if verbose:
        self.print_weights()
    return self