def _prepare_training_data(self,
                           filename,
                           max_history,
                           augmentation_factor,
                           max_training_samples=None,
                           max_number_of_trackers=2000,
                           remove_duplicates=True):
    """Read training data from a file and prepare it for training."""

    from rasa_core.training import extract_training_data_from_file

    if filename:
        training_data = extract_training_data_from_file(
                filename,
                self.domain,
                self.featurizer,
                interpreter=RegexInterpreter(),
                augmentation_factor=augmentation_factor,
                max_history=max_history,
                remove_duplicates=remove_duplicates,
                max_number_of_trackers=max_number_of_trackers)
        if max_training_samples is not None:
            training_data.limit_training_data_to(max_training_samples)
        return training_data
    else:
        return DialogueTrainingData.empty(self.domain)
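# A minimal, self-contained sketch (illustration only, not from the source)
# of the pattern above: load training data when a file is given, optionally
# cap the number of samples, and fall back to an empty dataset otherwise.
# `load_examples` and the list-based "dataset" are hypothetical stand-ins
# for extract_training_data_from_file and DialogueTrainingData.
def _prepare_examples_sketch(filename, max_training_samples=None):
    def load_examples(fname):
        # stand-in for extract_training_data_from_file(...)
        return ["example-{}".format(i) for i in range(10)]

    if not filename:
        return []  # stand-in for DialogueTrainingData.empty(domain)
    examples = load_examples(filename)
    if max_training_samples is not None:
        # stand-in for training_data.limit_training_data_to(...)
        examples = examples[:max_training_samples]
    return examples

assert len(_prepare_examples_sketch("stories.md", 3)) == 3
assert _prepare_examples_sketch(None) == []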
def featurize_trackers(self,
                       trackers: List[DialogueStateTracker],
                       domain: Domain) -> DialogueTrainingData:
    """Create training data."""

    self.state_featurizer.prepare_from_domain(domain)

    (trackers_as_states,
     trackers_as_actions) = self.training_states_and_actions(trackers,
                                                             domain)

    # noinspection PyPep8Naming
    X, true_lengths = self._featurize_states(trackers_as_states)
    y = self._featurize_labels(trackers_as_actions, domain)

    return DialogueTrainingData(X, y, true_lengths)
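# A sketch (an assumption, not from the source) of what the featurized
# output plausibly looks like for a max-history featurizer: each tracker's
# state sequence is padded to a fixed `max_history`, and `true_lengths`
# records the unpadded lengths. Shapes and padding scheme here are
# illustrative only.
import numpy as np

def _featurize_states_sketch(trackers_as_states, max_history, n_features):
    num_trackers = len(trackers_as_states)
    X = np.zeros((num_trackers, max_history, n_features))
    true_lengths = np.zeros(num_trackers, dtype=int)
    for i, states in enumerate(trackers_as_states):
        # keep only the most recent `max_history` states per tracker
        for j, state in enumerate(states[-max_history:]):
            X[i, j] = state  # each state is already a feature vector here
        true_lengths[i] = min(len(states), max_history)
    return X, true_lengths

states = [[np.ones(4)], [np.ones(4), np.zeros(4), np.ones(4)]]
X, true_lengths = _featurize_states_sketch(states, max_history=2, n_features=4)
assert X.shape == (2, 2, 4) and list(true_lengths) == [1, 2]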
def _fit_example(self, X, y, domain):
    # Learn the newly labelled example by repeatedly (for `epochs`
    # iterations) sampling `batch_size - 1` examples from the existing
    # training data, appending the new example to the batch and training
    # on it. This lets us ask the network to fit the example without
    # overemphasising its importance (and therefore throwing off the
    # biases).
    num_samples = self.batch_size - 1
    for _ in range(self.epochs):
        sampled_X, sampled_y = self.train_data.random_samples(num_samples)
        batch_X = np.vstack((sampled_X, X))
        batch_y = np.hstack((sampled_y, y))
        data = DialogueTrainingData(batch_X, batch_y)
        self.continue_training(data, domain)
    self.train_data.append(X, y)
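# A self-contained numpy sketch (illustration only) of the online-learning
# trick used above: mix the single new example into `epochs` random batches
# of existing data, so the model sees it repeatedly without ever training
# on it in isolation. `train_on_batch` is a hypothetical stand-in for
# continue_training.
import numpy as np

def train_on_batch(batch_X, batch_y):
    # stand-in for one training step on the mixed batch
    print("batch:", batch_X.shape, batch_y.shape)

def fit_single_example(train_X, train_y, new_x, new_y,
                       batch_size=8, epochs=4, rng=None):
    rng = rng or np.random.default_rng(0)
    for _ in range(epochs):
        idx = rng.choice(len(train_X), size=batch_size - 1, replace=False)
        batch_X = np.vstack((train_X[idx], new_x))
        batch_y = np.hstack((train_y[idx], new_y))
        train_on_batch(batch_X, batch_y)
    # finally make the new example part of the training data
    return np.vstack((train_X, new_x)), np.hstack((train_y, new_y))

train_X, train_y = np.random.rand(20, 3), np.zeros(20, dtype=int)
new_x, new_y = np.random.rand(1, 3), np.array([1])
train_X, train_y = fit_single_example(train_X, train_y, new_x, new_y)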
def featurize_trackers(self,
                       trackers,  # type: List[DialogueStateTracker]
                       domain  # type: Domain
                       ):
    # type: (...) -> DialogueTrainingData
    """Create training data."""

    self.state_featurizer.prepare_from_domain(domain)

    (trackers_as_states,
     trackers_as_actions) = self.training_states_and_actions(trackers,
                                                             domain)

    X, true_lengths = self._featurize_states(trackers_as_states)
    y = self._featurize_labels(trackers_as_actions, domain)

    return DialogueTrainingData(X, y, true_lengths)
def generate(self):
    # type: () -> DialogueTrainingData

    self._mark_first_action_in_story_steps_as_unpredictable()

    all_features = []  # type: List[ndarray]
    all_actions = []  # type: List[int]
    unused_checkpoints = set()  # type: Set[Text]
    used_checkpoints = set()  # type: Set[Text]

    init_tracker = FeaturizedTracker.from_domain(self.domain,
                                                 self.config.max_history,
                                                 self.config.tracker_limit)
    active_trackers = defaultdict(list)
    active_trackers[STORY_START].append(init_tracker)

    finished_trackers = []

    phases = self._phase_names()

    for i, phase_name in enumerate(phases):
        num_trackers = self._count_trackers(active_trackers)

        logger.debug("Starting {} (phase {} of {})... (using {} trackers)"
                     "".format(phase_name, i + 1, len(phases),
                               num_trackers))

        pbar = tqdm(self.story_graph.ordered_steps(),
                    desc="Processed Story Blocks")
        for step in pbar:
            incoming_trackers = []
            for start in step.start_checkpoints:
                if not active_trackers[start.name]:
                    # need to skip - there was no previous step that
                    # had this start checkpoint as an end checkpoint
                    unused_checkpoints.add(start.name)
                else:
                    ts = start.filter_trackers(active_trackers[start.name])
                    incoming_trackers.extend(ts)
                    used_checkpoints.add(start.name)

            if incoming_trackers:
                # these are the trackers that reached this story step
                # and that need to handle all events of the step
                incoming_trackers = self._subsample_trackers(
                        incoming_trackers, phase_idx=i)

                features, labels, trackers = self._process_step(
                        step, incoming_trackers)

                # collect all the training samples created while
                # processing the step's events with the trackers
                all_features.extend(features)
                all_actions.extend(labels)

                # update progress bar
                pbar.set_postfix({"# trackers": len(incoming_trackers),
                                  "samples": len(all_actions)})

                # update our tracker dictionary with the trackers that
                # handled the events of the step and that can now be
                # used for further story steps that start with the
                # checkpoint this step ended with
                for end in step.end_checkpoints:
                    active_trackers[end.name].extend(trackers)

                if not step.end_checkpoints:
                    active_trackers[STORY_END].extend(trackers)

        # trackers that reached the end of a story
        completed = [t.tracker for t in active_trackers[STORY_END]]
        finished_trackers.extend(completed)

        active_trackers = self._create_start_trackers(active_trackers)

        logger.debug("Finished phase. ({} training samples found)"
                     "".format(len(all_actions)))

    unused_checkpoints -= used_checkpoints
    self._issue_unused_checkpoint_notification(unused_checkpoints)

    logger.debug("Found {} action examples.".format(len(all_actions)))

    X = np.array(all_features)
    y = np.array(all_actions)

    metadata = {"events": self.events_metadata,
                "trackers": finished_trackers}

    if self.config.remove_duplicates:
        X_unique, y_unique = self._deduplicate_training_data(X, y)
        logger.debug("Deduplicated to {} unique action examples."
                     "".format(y_unique.shape[0]))
        return DialogueTrainingData(X_unique, y_unique, metadata)
    else:
        return DialogueTrainingData(X, y, metadata)
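# A toy, runnable sketch (not from the source) of the checkpoint mechanics
# above: trackers accumulated under a step's end checkpoints become the
# incoming trackers of any later step that starts with those checkpoints,
# and checkpoints that no step ever produced are reported as unused. The
# step structure and names are hypothetical.
from collections import defaultdict

STORY_START, STORY_END = "STORY_START", "STORY_END"

steps = [
    {"starts": [STORY_START], "ends": ["greeted"], "actions": ["utter_greet"]},
    {"starts": ["greeted"], "ends": [], "actions": ["utter_goodbye"]},
    {"starts": ["never_produced"], "ends": [], "actions": ["utter_other"]},
]

active_trackers = defaultdict(list)
active_trackers[STORY_START].append([])  # one empty tracker (a list of actions)
unused_checkpoints, used_checkpoints = set(), set()

for step in steps:
    incoming = []
    for start in step["starts"]:
        if not active_trackers[start]:
            # no previous step ended with this checkpoint
            unused_checkpoints.add(start)
        else:
            incoming.extend(active_trackers[start])
            used_checkpoints.add(start)
    # each incoming tracker "handles" the step's events
    outgoing = [tracker + step["actions"] for tracker in incoming]
    for end in step["ends"] or [STORY_END]:
        active_trackers[end].extend(outgoing)

print("finished:", active_trackers[STORY_END])
print("unused:", unused_checkpoints - used_checkpoints)
# finished: [['utter_greet', 'utter_goodbye']]
# unused: {'never_produced'}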