Example #1
    def _prepare_training_data(self,
                               filename,
                               max_history,
                               augmentation_factor,
                               max_training_samples=None,
                               max_number_of_trackers=2000,
                               remove_duplicates=True):
        """Reads training data from file and prepares it for the training."""

        from rasa_core.training import extract_training_data_from_file

        if filename:
            training_data = extract_training_data_from_file(
                filename,
                self.domain,
                self.featurizer,
                interpreter=RegexInterpreter(),
                augmentation_factor=augmentation_factor,
                max_history=max_history,
                remove_duplicates=remove_duplicates,
                max_number_of_trackers=max_number_of_trackers)
            if max_training_samples is not None:
                training_data.limit_training_data_to(max_training_samples)
            return training_data
        else:
            return DialogueTrainingData.empty(self.domain)
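
In Example #1, `limit_training_data_to` caps the number of samples when `max_training_samples` is set. The snippet does not show how the cap is applied; the standalone sketch below assumes simple random subsampling and uses only NumPy, so it is illustrative rather than a reproduction of rasa_core internals.

import numpy as np

# standalone sketch, not rasa_core code: cap the number of training samples
# by random subsampling (the exact strategy of limit_training_data_to is an
# assumption, it is not shown in the snippet above)
rng = np.random.default_rng(0)
X = rng.random((1000, 6))              # featurized dialogue states
y = rng.integers(0, 10, size=1000)     # action labels

max_training_samples = 200
keep = rng.choice(len(X), size=max_training_samples, replace=False)
X, y = X[keep], y[keep]                # at most max_training_samples remain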
Example #2
    def featurize_trackers(self, trackers: List[DialogueStateTracker],
                           domain: Domain) -> DialogueTrainingData:
        """Create training data"""
        self.state_featurizer.prepare_from_domain(domain)

        (trackers_as_states,
         trackers_as_actions) = self.training_states_and_actions(
             trackers, domain)

        # noinspection PyPep8Naming
        X, true_lengths = self._featurize_states(trackers_as_states)
        y = self._featurize_labels(trackers_as_actions, domain)

        return DialogueTrainingData(X, y, true_lengths)
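
The `X, true_lengths` pair returned in Example #2 suggests padded state sequences plus each dialogue's original length. The standalone sketch below shows that padding pattern with plain NumPy; the actual behaviour of `_featurize_states` is an assumption, not reproduced from the snippet.

import numpy as np

# standalone sketch, not rasa_core code: pad variable-length sequences of
# state features into one array and keep each dialogue's true length
dialogues = [np.ones((2, 4)), np.ones((5, 4)), np.ones((3, 4))]

max_len = max(len(d) for d in dialogues)
X = np.zeros((len(dialogues), max_len, 4))   # (n_dialogues, max_len, n_feats)
true_lengths = np.array([len(d) for d in dialogues])
for i, d in enumerate(dialogues):
    X[i, :len(d)] = d                        # zero-padding at the end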
Example #3
    def _fit_example(self, X, y, domain):
        # Learns the newly labelled example by taking `epochs` random samples
        # of `batch_size - 1` items from the existing training data, appending
        # the new example to each batch and training on it. This lets the
        # network fit the example without overemphasising its importance
        # (and therefore throwing off the biases).
        num_samples = self.batch_size - 1
        for _ in range(self.epochs):
            sampled_X, sampled_y = self.train_data.random_samples(num_samples)
            batch_X = np.vstack((sampled_X, X))
            batch_y = np.hstack((sampled_y, y))
            data = DialogueTrainingData(batch_X, batch_y)
            self.continue_training(data, domain)
        self.train_data.append(X, y)
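
The batching idea in Example #3 can be shown with plain NumPy. The sketch below mirrors the loop above but stands alone; it does not call `continue_training` or use real rasa_core data.

import numpy as np

# standalone sketch: blend one new example into randomly sampled
# mini-batches so it is learned without dominating the updates
rng = np.random.default_rng(0)
train_X = rng.random((100, 8))           # existing featurized states
train_y = rng.integers(0, 5, size=100)   # existing action labels

new_X = rng.random((1, 8))               # the single new example
new_y = np.array([2])

batch_size, epochs = 16, 3
for _ in range(epochs):
    idx = rng.choice(len(train_X), size=batch_size - 1, replace=False)
    batch_X = np.vstack((train_X[idx], new_X))
    batch_y = np.hstack((train_y[idx], new_y))
    # a real policy would train on (batch_X, batch_y) here

# finally the new example is appended to the stored training data
train_X = np.vstack((train_X, new_X))
train_y = np.hstack((train_y, new_y))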
Example #4
    def featurize_trackers(self,
                           trackers,  # type: List[DialogueStateTracker]
                           domain  # type: Domain
                           ):
        # type: (...) -> DialogueTrainingData
        """Create training data"""
        self.state_featurizer.prepare_from_domain(domain)

        (trackers_as_states,
         trackers_as_actions) = self.training_states_and_actions(trackers,
                                                                 domain)

        X, true_lengths = self._featurize_states(trackers_as_states)
        y = self._featurize_labels(trackers_as_actions, domain)

        return DialogueTrainingData(X, y, true_lengths)
Example #5
    def generate(self):
        # type: () -> DialogueTrainingData

        self._mark_first_action_in_story_steps_as_unpredictable()

        all_features = []  # type: List[ndarray]
        all_actions = []  # type: List[int]
        unused_checkpoints = set()  # type: Set[Text]
        used_checkpoints = set()  # type: Set[Text]

        init_tracker = FeaturizedTracker.from_domain(self.domain,
                                                     self.config.max_history,
                                                     self.config.tracker_limit)
        active_trackers = defaultdict(list)
        active_trackers[STORY_START].append(init_tracker)
        finished_trackers = []

        phases = self._phase_names()

        for i, phase_name in enumerate(phases):
            num_trackers = self._count_trackers(active_trackers)

            logger.debug("Starting {} (phase {} of {})... (using {} trackers)"
                         "".format(phase_name, i + 1, len(phases),
                                   num_trackers))

            pbar = tqdm(self.story_graph.ordered_steps(),
                        desc="Processed Story Blocks")
            for step in pbar:
                incoming_trackers = []
                for start in step.start_checkpoints:
                    if not active_trackers[start.name]:
                        # need to skip - there was no previous step that
                        # had this start checkpoint as an end checkpoint
                        unused_checkpoints.add(start.name)
                    else:
                        ts = start.filter_trackers(active_trackers[start.name])
                        incoming_trackers.extend(ts)
                        used_checkpoints.add(start.name)

                if incoming_trackers:
                    # these are the trackers that reached this story
                    # step and that need to handle all events of the step
                    incoming_trackers = self._subsample_trackers(
                        incoming_trackers, phase_idx=i)

                    features, labels, trackers = self._process_step(
                        step, incoming_trackers)

                    # collect all the training samples created while
                    # processing the step's events with the trackers
                    all_features.extend(features)
                    all_actions.extend(labels)

                    # update progress bar
                    pbar.set_postfix({
                        "# trackers": len(incoming_trackers),
                        "samples": len(all_actions)
                    })

                    # update our tracker dictionary with the trackers
                    # that handled the events of the step and
                    # that can now be used for further story steps
                    # that start with the checkpoint this step ended with
                    for end in step.end_checkpoints:
                        active_trackers[end.name].extend(trackers)

                    if not step.end_checkpoints:
                        active_trackers[STORY_END].extend(trackers)

            # trackers that reached the end of a story
            completed = [t.tracker for t in active_trackers[STORY_END]]
            finished_trackers.extend(completed)
            active_trackers = self._create_start_trackers(active_trackers)
            logger.debug("Finished phase. ({} training samples found)".format(
                len(all_actions)))

        unused_checkpoints -= used_checkpoints
        self._issue_unused_checkpoint_notification(unused_checkpoints)
        logger.debug("Found {} action examples.".format(len(all_actions)))

        X = np.array(all_features)
        y = np.array(all_actions)

        metadata = {
            "events": self.events_metadata,
            "trackers": finished_trackers
        }

        if self.config.remove_duplicates:
            X_unique, y_unique = self._deduplicate_training_data(X, y)
            logger.debug("Deduplicated to {} unique action examples.".format(
                y_unique.shape[0]))
            return DialogueTrainingData(X_unique, y_unique, metadata)
        else:
            return DialogueTrainingData(X, y, metadata)
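
The `remove_duplicates` branch at the end of Example #5 relies on `_deduplicate_training_data`, which is not shown. The standalone sketch below illustrates one way to drop duplicate (features, label) pairs; the actual implementation is an assumption, not reproduced from the snippet.

import numpy as np

# standalone sketch: keep only the first occurrence of each
# (features, label) pair
X = np.array([[0, 1], [0, 1], [1, 0]])
y = np.array([3, 3, 1])

pairs = np.hstack((X.reshape(len(X), -1), y.reshape(-1, 1)))
_, unique_idx = np.unique(pairs, axis=0, return_index=True)
keep = np.sort(unique_idx)
X_unique, y_unique = X[keep], y[keep]    # duplicates removed, order kept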