示例#1
0
    def load(cls, path: Text) -> "TEDPolicy":
        """Restores a TED policy from the files stored under ``path``.

        The tracker featurizer is always loaded. If no pickled data example
        is found next to it, a policy holding only that featurizer is
        returned; otherwise the stored model is loaded and its prediction
        graph is built.

        Args:
            path: Directory the policy files are read from.

        Returns:
            The policy created from the stored files.

        Raises:
            Exception: If ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise Exception(f"Failed to load TED policy model. Path "
                            f"'{os.path.abspath(path)}' doesn't exist.")

        featurizer = TrackerFeaturizer.load(path)

        model_dir = Path(path)
        data_example_file = (
            model_dir / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl")
        if not data_example_file.is_file():
            # without a stored data example only the featurizer is restored
            return cls(featurizer=featurizer)

        stored_example = io_utils.json_unpickle(data_example_file)
        label_data = io_utils.json_unpickle(
            model_dir / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl")
        config = io_utils.pickle_load(
            model_dir / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
        priority = io_utils.json_unpickle(
            model_dir / f"{SAVE_MODEL_FILE_NAME}.priority.pkl")

        model_data_example = RasaModelData(label_key=LABEL_IDS,
                                           data=stored_example)
        config = train_utils.update_similarity_type(config)
        uses_max_history = isinstance(featurizer,
                                      MaxHistoryTrackerFeaturizer)

        model = TED.load(
            str(model_dir / f"{SAVE_MODEL_FILE_NAME}.tf_model"),
            model_data_example,
            data_signature=model_data_example.get_signature(),
            config=config,
            max_history_tracker_featurizer_used=uses_max_history,
            label_data=label_data,
        )

        # the prediction graph is built from the dialogue features only
        dialogue_features = {}
        for feature_name, features in model_data_example.items():
            if DIALOGUE in feature_name:
                dialogue_features[feature_name] = features
        model.build_for_predict(
            RasaModelData(label_key=LABEL_IDS, data=dialogue_features))

        return cls(featurizer=featurizer,
                   priority=priority,
                   model=model,
                   **config)
示例#2
0
    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
        """Turns ``training_data`` into the model data used for training.

        If a retrieval intent is configured, the training data is first
        restricted to that intent. The label mapping, its inverse and the
        label data are stored on the instance as a side effect.

        Args:
            training_data: The training data to prepare.

        Returns:
            The prepared model data, or an empty ``RasaModelData`` when no
            labels could be found.
        """
        if self.retrieval_intent:
            training_data = training_data.filter_by_intent(self.retrieval_intent)

        label_to_index = self._label_id_index_mapping(
            training_data, attribute=RESPONSE
        )
        if not label_to_index:
            # nothing to train on without labels
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(
            training_data, label_to_index, attribute=RESPONSE
        )

        prepared = self._create_model_data(
            training_data.intent_examples, label_to_index, label_attribute=RESPONSE,
        )
        self._check_input_dimension_consistency(prepared)
        return prepared
示例#3
0
    def _assemble_label_data(self, attribute_data: Data,
                             domain: Domain) -> RasaModelData:
        """Builds the label-side model data.

        The attribute features are added under keys prefixed with the label
        key, sequence lengths are added for the action-text label key, and
        the numerical label ids (one per action of the domain) are stored
        under the label key.

        Args:
            attribute_data: Feature data for all labels.
            domain: Domain of the assistant.

        Returns:
            Features of labels ready to be fed to the model.
        """
        action_text_key = f"{LABEL}_{ACTION_TEXT}"

        assembled = RasaModelData()
        assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
        assembled.add_lengths(action_text_key, SEQUENCE_LENGTH,
                              action_text_key, SEQUENCE)

        # ids 0..num_actions-1 as a column, i.e. with a trailing axis added
        ids_column = np.expand_dims(np.arange(domain.num_actions), -1)
        id_features = FeatureArray(ids_column, number_of_dimensions=2)
        assembled.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])
        return assembled
示例#4
0
    def _assemble_label_data(
        self, attribute_data: Data, domain: Domain
    ) -> RasaModelData:
        """Builds the label-side model data for intent labels.

        The attribute features are added under keys prefixed with the label
        key, sequence lengths are added for the intent label key, and the
        numerical label ids (one per intent of the domain) are stored under
        the label key.

        Args:
            attribute_data: Feature data for all intent labels.
            domain: Domain of the assistant.

        Returns:
            Features of labels ready to be fed to the model.
        """
        intent_key = f"{LABEL}_{INTENT}"

        assembled = RasaModelData()
        assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
        assembled.add_lengths(intent_key, SEQUENCE_LENGTH, intent_key, SEQUENCE)

        # ids 0..len(intents)-1 as a column, i.e. with a trailing axis added
        ids_column = np.expand_dims(np.arange(len(domain.intents)), -1)
        id_features = FeatureArray(ids_column, number_of_dimensions=2)
        assembled.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])
        return assembled
示例#5
0
    def _load_model_class(
        cls,
        tf_model_file: Text,
        model_data_example: RasaModelData,
        label_data: RasaModelData,
        entity_tag_specs: List[EntityTagSpec],
        config: Dict[Text, Any],
        finetune_mode: bool = False,
    ) -> "RasaModel":
        """Loads the model selected by the config from ``tf_model_file``.

        Args:
            tf_model_file: Location of the stored model.
            model_data_example: Data example the model is loaded with.
            label_data: Label data handed to the model.
            entity_tag_specs: Entity tag specifications handed to the model.
            config: Model configuration; a deep copy is passed on.
            finetune_mode: Forwarded unchanged to the model's ``load``.

        Returns:
            The loaded model.
        """
        # prediction data keeps only features whose name contains TEXT
        text_features = {}
        for feature_name, features in model_data_example.items():
            if TEXT in feature_name:
                text_features[feature_name] = features
        predict_data_example = RasaModelData(
            label_key=model_data_example.label_key, data=text_features
        )

        model_cls = cls.model_class(config[USE_TEXT_AS_LABEL])
        return model_cls.load(
            tf_model_file,
            model_data_example,
            predict_data_example,
            data_signature=model_data_example.get_signature(),
            label_data=label_data,
            entity_tag_specs=entity_tag_specs,
            config=copy.deepcopy(config),
            finetune_mode=finetune_mode,
        )
示例#6
0
    def _create_label_data(self, domain: Domain) -> RasaModelData:
        """Encodes all actions of ``domain`` and wraps them as label data.

        Args:
            domain: Domain whose actions are encoded.

        Returns:
            Model data holding the float32 encodings under the label
            features key.
        """
        # the policy's state featurizer provides the encoding of all actions
        encoded_actions = (
            self.featurizer.state_featurizer.create_encoded_all_actions(domain)
        )
        float_labels = encoded_actions.astype(np.float32)

        wrapped = RasaModelData()
        wrapped.add_features(LABEL_FEATURES, [float_labels])
        return wrapped
示例#7
0
 def _construct_model_initialization_data(
     cls, loaded_data: Dict[Text, Dict[Text, List[FeatureArray]]]
 ) -> Tuple[RasaModelData, RasaModelData]:
     """Creates the data examples used to initialise a loaded model.

     Args:
         loaded_data: The stored data example.

     Returns:
         A tuple of the full model data example and a prediction data
         example restricted to the features listed in PREDICTION_FEATURES.
     """
     model_data_example = RasaModelData(label_key=LABEL_KEY,
                                        label_sub_key=LABEL_SUB_KEY,
                                        data=loaded_data)

     # label features, if present, must not be part of the prediction data
     prediction_only = {}
     for feature_name, features in model_data_example.items():
         if feature_name in PREDICTION_FEATURES:
             prediction_only[feature_name] = features

     predict_data_example = RasaModelData(label_key=LABEL_KEY,
                                          label_sub_key=LABEL_SUB_KEY,
                                          data=prediction_only)
     return model_data_example, predict_data_example
示例#8
0
def test_not_balance_model_data(model_data: RasaModelData):
    """Checks that `_balanced_data` returns the entity tag ids unchanged.

    The model data is rebuilt with `entities` / `tag_ids` as label keys and
    `_balanced_data` is called with the arguments `2` and `False`.
    """
    key, sub_key = "entities", "tag_ids"
    test_model_data = RasaModelData(label_key=key,
                                    label_sub_key=sub_key,
                                    data=model_data.data)

    balanced = test_model_data._balanced_data(test_model_data.data, 2, False)
    actual_tag_ids = balanced[key][sub_key]
    expected_tag_ids = test_model_data.get(key, sub_key)

    assert np.all(actual_tag_ids == expected_tag_ids)
示例#9
0
async def model_data() -> RasaModelData:
    """Creates model data with five examples of randomly generated features.

    The data contains dense and sparse sentence features for the text,
    sentence features for the intent, intent ids and entity tag ids. The
    label key is `intent` with sub key `ids`.

    The per-example matrices have different numbers of rows. They are
    therefore stored in explicitly created 1-D object arrays: building such
    ragged arrays implicitly with `np.array([...])` is deprecated since
    NumPy 1.19 and raises a `ValueError` from NumPy 1.24 on.

    Returns:
        The model data holding the generated features.
    """
    # number of rows of the feature matrix of each of the five examples
    rows_per_example = (5, 2, 3, 1, 3)

    def _object_array(items):
        # one slot per example; filling slot by slot keeps every matrix
        # intact as a single element, independent of the NumPy version
        container = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            container[position] = item
        return container

    return RasaModelData(
        label_key="intent",
        label_sub_key="ids",
        data={
            "text_features": {
                "sentence": [
                    _object_array([
                        np.random.rand(rows, 14)
                        for rows in rows_per_example
                    ]),
                    _object_array([
                        scipy.sparse.csr_matrix(
                            np.random.randint(5, size=(rows, 10)))
                        for rows in rows_per_example
                    ]),
                ]
            },
            "intent_features": {
                "sentence": [
                    _object_array([
                        np.random.randint(2, size=(rows, 10))
                        for rows in rows_per_example
                    ])
                ]
            },
            "intent": {
                "ids": [np.array([0, 1, 0, 1, 1])]
            },
            "entities": {
                "tag_ids": [
                    _object_array([
                        np.array([[0], [1], [1], [0], [2]]),
                        np.array([[2], [0]]),
                        np.array([[0], [1], [1]]),
                        np.array([[0], [1]]),
                        np.array([[0], [0], [0]]),
                    ])
                ]
            },
        },
    )
示例#10
0
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """Checks the shapes of the outputs returned by `run_inference`.

    `batch_predict` of a fresh `RasaModel` is replaced by a stub that echoes
    its first input and additionally returns a constant tensor, so that only
    the `run_inference` interface is exercised.
    """

    def _batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # first entry echoes the input, second one does not depend on it
        return {
            "dummy_output": batch_in[0],
            "non_input_affected_output": tf.constant(
                np.array([[1, 2]]), dtype=tf.int32
            ),
        }

    model = RasaModel()
    # swap in the stub so that no real prediction is needed
    model.batch_predict = _batch_predict

    # dummy input: one two-dimensional sentence feature per data point
    sentence_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2,
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [sentence_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # every data point sent as input must show up in the echoed output
    assert output["dummy_output"].shape[0] == number_of_data_points

    # the constant output contributes one row of two values per stub call,
    # so its first dimension is expected to equal the number of batches
    expected_shape = (expected_number_of_batch_iterations, 2)
    assert output["non_input_affected_output"].shape == expected_shape
示例#11
0
    def _load_model(
        cls,
        index_label_id_mapping: Dict[int, Text],
        index_tag_id_mapping: Dict[int, Text],
        label_data: RasaModelData,
        meta: Dict[Text, Any],
        data_example: Dict[Text, List[np.ndarray]],
        model_dir: Text,
    ):
        """Loads the stored model and builds its prediction graph.

        Args:
            index_label_id_mapping: Mapping handed to the model's ``load``.
            index_tag_id_mapping: Mapping handed to the model's ``load``.
            label_data: Label data handed to the model's ``load``.
            meta: Metadata; provides the file name under the key "file" and
                is passed on as the model config.
            data_example: Data example the model is loaded with.
            model_dir: Directory containing the stored model file.

        Returns:
            The loaded model with ``build_for_predict`` already called.
        """
        tf_model_file = os.path.join(model_dir, meta.get("file") + ".tf_model")

        # a label key is only set when intent classification is enabled
        if meta[INTENT_CLASSIFICATION]:
            label_key = LABEL_IDS
        else:
            label_key = None

        model_data_example = RasaModelData(label_key=label_key,
                                           data=data_example)
        model_cls = cls.model_class(meta)
        model = model_cls.load(
            tf_model_file,
            model_data_example,
            data_signature=model_data_example.get_signature(),
            label_data=label_data,
            index_label_id_mapping=index_label_id_mapping,
            index_tag_id_mapping=index_tag_id_mapping,
            config=meta,
        )

        # the prediction graph only gets features whose name contains TEXT
        text_features = {}
        for feature_name, features in model_data_example.items():
            if TEXT in feature_name:
                text_features[feature_name] = features
        model.build_for_predict(
            RasaModelData(label_key=label_key, data=text_features))

        return model
示例#12
0
    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
        """Turns ``training_data`` into the model data used for training.

        All retrieval intents of the unfiltered data, the responses, the
        inverted label mapping and the label data are stored on the instance
        as a side effect.

        Args:
            training_data: The training data to be preprocessed.

        Returns:
            The prepared model data, or an empty ``RasaModelData`` when no
            labels could be found.
        """
        # must happen before filtering so that no retrieval intent is lost
        self.all_retrieval_intents = list(training_data.retrieval_intents)

        if not self.retrieval_intent:
            # retrieval intent was left to its default value
            logger.info(
                "Retrieval intent parameter was left to its default value. This "
                "response selector will be trained on training examples combining "
                "all retrieval intents."
            )
        else:
            training_data = training_data.filter_training_examples(
                lambda example: self.retrieval_intent == example.get(INTENT)
            )

        if self.use_text_as_label:
            label_attribute = RESPONSE
        else:
            label_attribute = INTENT_RESPONSE_KEY

        label_to_index = self._label_id_index_mapping(
            training_data, attribute=label_attribute
        )

        self.responses = training_data.responses

        if not label_to_index:
            # nothing to train on without labels
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(
            training_data, label_to_index, attribute=label_attribute
        )

        prepared = self._create_model_data(
            training_data.intent_examples,
            label_to_index,
            label_attribute=label_attribute,
        )
        self._check_input_dimension_consistency(prepared)
        return prepared
示例#13
0
    def _prepare_for_training(
        self,
        training_trackers: List[TrackerWithCachedStates],
        domain: Domain,
        interpreter: NaturalLanguageInterpreter,
        **kwargs: Any,
    ) -> Tuple[RasaModelData, np.ndarray]:
        """Featurizes the trackers and assembles the training model data.

        The label data, one data example and, if entity recognition is
        enabled, the entity tag specs are stored on the instance as a side
        effect.

        Args:
            training_trackers: List of training trackers to be featurized.
            domain: Domain of the assistant.
            interpreter: NLU interpreter to be used for featurizing states.
            **kwargs: Any other arguments.

        Returns:
            Featurized data to be fed to the model and corresponding label
            ids. The model data is empty if featurization yields no state
            features.
        """
        trackers = self._get_trackers_for_training(training_trackers)

        state_features, label_ids, entity_tags = self._featurize_for_training(
            trackers,
            domain,
            interpreter,
            bilou_tagging=self.config[BILOU_FLAG],
            **kwargs,
        )
        if not state_features:
            return RasaModelData(), label_ids

        label_data, encoded_all_labels = self._create_label_data(
            domain, interpreter)
        self._label_data = label_data

        # the actual training data fed to the model
        model_data = self._create_model_data(
            state_features, label_ids, entity_tags, encoded_all_labels)

        if self.config[ENTITY_RECOGNITION]:
            state_featurizer = self.featurizer.state_featurizer
            self._entity_tag_specs = state_featurizer.entity_tag_specs

        # one example is kept for persisting and loading
        self.data_example = model_data.first_data_example()

        return model_data, label_ids
示例#14
0
    def _create_label_data(
        self, domain: Domain, interpreter: NaturalLanguageInterpreter
    ) -> Tuple[RasaModelData, List[Dict[Text, List["Features"]]]]:
        """Encodes all actions of ``domain`` and assembles the label data.

        Args:
            domain: Domain whose actions are encoded.
            interpreter: Interpreter passed on to the state featurizer.

        Returns:
            The label data (attribute features under keys prefixed with the
            label key, plus the label ids) and the encoded labels themselves.
        """
        # the policy's state featurizer provides the encoding of all actions
        encoded_all_labels = self.featurizer.state_featurizer.encode_all_actions(
            domain, interpreter
        )
        attribute_data = convert_to_data_format(encoded_all_labels)[0]

        assembled = RasaModelData()
        assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")

        # ids 0..num_actions-1 with a trailing axis added
        ids_column = np.expand_dims(np.arange(domain.num_actions), -1)
        assembled.add_features(LABEL_KEY, LABEL_SUB_KEY, [ids_column])

        return assembled, encoded_all_labels
示例#15
0
    def _compile_and_fit(
        self, data_example: Dict[Text, Dict[Text, List[FeatureArray]]]
    ) -> None:
        """Compiles the model and fits it on the given data example.

        The model is compiled with an Adam optimizer using the configured
        learning rate and fitted with a batch size of 1.

        Args:
            data_example: a data example that is stored with the ML component.
        """
        optimizer = tf.keras.optimizers.Adam(self.config[LEARNING_RATE])
        self.compile(optimizer=optimizer)

        # label keys are only set when intent classification is enabled
        label_key = None
        label_sub_key = None
        if self.config[INTENT_CLASSIFICATION]:
            label_key = LABEL_KEY
            label_sub_key = LABEL_SUB_KEY

        example_data = RasaModelData(
            label_key=label_key, label_sub_key=label_sub_key, data=data_example
        )
        self._update_data_signatures(example_data)

        self.fit(RasaBatchDataGenerator(example_data, batch_size=1), verbose=False)
示例#16
0
    def _load_model_utilities(cls, model_path: Path) -> Dict[Text, Any]:
        """Reads a model's stored utility attributes from ``model_path``.

        Args:
            model_path: Directory the utility files are read from.

        Returns:
            A dictionary with the keys "tf_model_file", "loaded_data",
            "fake_features", "label_data", "meta", "priority" and
            "entity_tag_specs".
        """

        def _stored_file(suffix: Text) -> Path:
            # every file shares the metadata file name as its prefix
            return model_path / f"{cls._metadata_filename()}.{suffix}"

        tf_model_file = _stored_file("tf_model")
        loaded_data = io_utils.pickle_load(_stored_file("data_example.pkl"))
        raw_label_data = io_utils.pickle_load(_stored_file("label_data.pkl"))
        fake_features = io_utils.pickle_load(_stored_file("fake_features.pkl"))
        label_data = RasaModelData(data=raw_label_data)
        meta = io_utils.pickle_load(_stored_file("meta.pkl"))
        priority = io_utils.json_unpickle(_stored_file("priority.pkl"))
        raw_tag_specs = rasa.shared.utils.io.read_json_file(
            _stored_file("entity_tag_specs.json"))

        # ids are converted back to integers after being read from JSON
        entity_tag_specs = []
        for tag_spec in raw_tag_specs:
            ids_to_tags = {}
            for tag_id, tag in tag_spec["ids_to_tags"].items():
                ids_to_tags[int(tag_id)] = tag
            tags_to_ids = {}
            for tag, tag_id in tag_spec["tags_to_ids"].items():
                tags_to_ids[tag] = int(tag_id)
            entity_tag_specs.append(
                EntityTagSpec(
                    tag_name=tag_spec["tag_name"],
                    ids_to_tags=ids_to_tags,
                    tags_to_ids=tags_to_ids,
                    num_tags=tag_spec["num_tags"],
                ))

        utilities = {}
        utilities["tf_model_file"] = tf_model_file
        utilities["loaded_data"] = loaded_data
        utilities["fake_features"] = fake_features
        utilities["label_data"] = label_data
        utilities["meta"] = meta
        utilities["priority"] = priority
        utilities["entity_tag_specs"] = entity_tag_specs
        return utilities
示例#17
0
    def _create_model_data(
        self,
        tracker_state_features: List[List[Dict[Text, List["Features"]]]],
        label_ids: Optional[np.ndarray] = None,
        encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
    ) -> RasaModelData:
        """Collects all model related data in a single RasaModelData.

        When both ``label_ids`` and ``encoded_all_labels`` are given, the
        label ids are added to the model data and the zero state features
        returned by the conversion are stored on the instance. Otherwise the
        stored zero state features are reused for the conversion.

        Args:
            tracker_state_features: features of the attributes (INTENT, TEXT,
                ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) for
                all dialogue turns in all trackers
            label_ids: the label ids (e.g. action ids) for every dialogue
                turn in all training trackers
            encoded_all_labels: a list of dictionaries containing attribute
                features for label ids

        Returns:
            RasaModelData
        """
        model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

        labels_available = label_ids is not None and encoded_all_labels is not None
        if not labels_available:
            # method is called during prediction
            attribute_data, _ = convert_to_data_format(
                tracker_state_features, self.zero_state_features
            )
        else:
            # add a trailing axis to the label ids of every sequence
            expanded_ids = [np.expand_dims(sequence_ids, -1) for sequence_ids in label_ids]
            label_ids = np.array(expanded_ids)
            model_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [label_ids])

            attribute_data, self.zero_state_features = convert_to_data_format(
                tracker_state_features
            )

        model_data.add_data(attribute_data)

        # the dialogue lengths are taken from the mask of the first attribute
        first_attribute = next(iter(attribute_data.keys()))
        model_data.add_lengths(DIALOGUE, LENGTH, first_attribute, MASK)

        return model_data
示例#18
0
    def _create_label_data(
        self,
        training_data: TrainingData,
        label_id_dict: Dict[Text, int],
        attribute: Text,
    ) -> RasaModelData:
        """Creates the label data: label features, label ids and a mask.

        One training example is looked up per label. If label features
        already exist for these examples they are extracted, otherwise
        default label features are computed.

        Args:
            training_data: Training data whose intent examples are searched.
            label_id_dict: Mapping from label name to label index.
            attribute: Attribute the labels are taken from.

        Returns:
            The label data, ordered by label index.
        """
        # one (index, example) pair per label, ordered by label index
        indexed_examples = [
            (idx,
             self._find_example_for_label(
                 label_name, training_data.intent_examples, attribute))
            for label_name, idx in label_id_dict.items()
        ]
        indexed_examples.sort(key=lambda pair: pair[0])
        examples = [example for _, example in indexed_examples]

        # use precomputed features if they exist, else compute defaults
        if not self._check_labels_features_exist(examples, attribute):
            features = self._compute_default_label_features(examples)
        else:
            features = self._extract_labels_precomputed_features(
                examples, attribute)

        label_data = RasaModelData()
        label_data.add_features(LABEL_FEATURES, features)

        # a trailing axis is added explicitly to the label ids
        # to track dynamic sequences correctly
        label_ids = np.array([idx for idx, _ in indexed_examples])
        label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])

        label_data.add_mask(LABEL_MASK, LABEL_FEATURES)

        return label_data
示例#19
0
    def preprocess_train_data(self,
                              training_data: TrainingData) -> RasaModelData:
        """Turns ``training_data`` into the model data used for training.

        The retrieval intent mapping, the inverted label mapping and the
        label data are stored on the instance as a side effect.

        Args:
            training_data: The training data to prepare.

        Returns:
            The prepared model data, or an empty ``RasaModelData`` when no
            labels could be found.
        """
        if not self.retrieval_intent:
            # retrieval intent was left to its default value
            logger.info(
                "Retrieval intent parameter was left to its default value. This "
                "response selector will be trained on training examples combining "
                "all retrieval intents.")
        else:
            training_data = training_data.filter_by_intent(
                self.retrieval_intent)

        label_to_index = self._label_id_index_mapping(training_data,
                                                      attribute=RESPONSE)
        self.retrieval_intent_mapping = self._create_retrieval_intent_mapping(
            training_data)

        if not label_to_index:
            # nothing to train on without labels
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(training_data,
                                                   label_to_index,
                                                   attribute=RESPONSE)

        prepared = self._create_model_data(
            training_data.intent_examples,
            label_to_index,
            label_attribute=RESPONSE,
        )
        self._check_input_dimension_consistency(prepared)
        return prepared
示例#20
0
    def preprocess_train_data(self,
                              training_data: TrainingData) -> RasaModelData:
        """Turns ``training_data`` into the model data used for training.

        If the BILOU flag is set in the component config, the BILOU schema
        is applied to the training data first. The inverted label and tag
        mappings, the label data and the number of tags are stored on the
        instance as a side effect.

        Args:
            training_data: The training data to prepare.

        Returns:
            The prepared model data, or an empty ``RasaModelData`` when no
            labels could be found.
        """
        if self.component_config[BILOU_FLAG]:
            bilou_utils.apply_bilou_schema(training_data)

        label_to_index = self._label_id_index_mapping(training_data,
                                                      attribute=INTENT)
        if not label_to_index:
            # nothing to train on without labels
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(training_data,
                                                   label_to_index,
                                                   attribute=INTENT)

        tag_to_index = self._tag_id_index_mapping(training_data)
        self.index_tag_id_mapping = self._invert_mapping(tag_to_index)

        # intents only serve as labels when intent classification is enabled
        if self.component_config[INTENT_CLASSIFICATION]:
            label_attribute = INTENT
        else:
            label_attribute = None

        prepared = self._create_model_data(
            training_data.training_examples,
            label_to_index,
            tag_to_index,
            label_attribute=label_attribute,
        )

        self.num_tags = len(self.index_tag_id_mapping)

        self._check_input_dimension_consistency(prepared)
        return prepared
示例#21
0
    def _prepare_data_for_prediction(model_data: RasaModelData) -> RasaModelData:
        """Reduces training model data to the features used for prediction.

        Only entries whose key is listed in PREDICTION_FEATURES are kept.
        Features that are not useful at prediction time have to be dropped
        because the prediction signature will not contain them and
        prediction would break otherwise.

        Args:
            model_data: Data used during model training.

        Returns:
            Transformed data usable for making predictions.
        """
        filtered_data: Dict[Text, Dict[Text, Any]] = {}
        for key, features in model_data.data.items():
            if key in PREDICTION_FEATURES:
                filtered_data[key] = features
        return RasaModelData(data=filtered_data)
示例#22
0
    def _create_model_data(
        self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
    ) -> RasaModelData:
        """Collects dialogue features, label features and label ids.

        Args:
            data_X: Dialogue features.
            data_Y: Optional data the label ids and label features are
                derived from; without it empty arrays are used for both.

        Returns:
            The model data with the label ids as label key.
        """
        if data_Y is None:
            label_ids = np.array([])
            Y = np.array([])
        else:
            raw_label_ids = self._label_ids_for_Y(data_Y)
            Y = self._label_features_for_Y(raw_label_ids)
            # a trailing axis is added explicitly to the label ids
            # to track dynamic sequences correctly
            label_ids = np.expand_dims(raw_label_ids, -1)

        model_data = RasaModelData(label_key=LABEL_IDS)
        for key, feature in (
            (DIALOGUE_FEATURES, data_X),
            (LABEL_FEATURES, Y),
            (LABEL_IDS, label_ids),
        ):
            model_data.add_features(key, [feature])

        return model_data
示例#23
0
def test_not_balance_model_data(model_data: RasaModelData):
    """Checks that `_balanced_data` returns the tag ids unchanged.

    The model data is rebuilt with `tag_ids` as label key and
    `_balanced_data` is called with the arguments `2` and `False`.
    """
    key = "tag_ids"
    test_model_data = RasaModelData(label_key=key, data=model_data.data)

    balanced = test_model_data._balanced_data(test_model_data.data, 2, False)
    actual_tag_ids = balanced.get(key)
    expected_tag_ids = test_model_data.get(key)

    assert np.all(actual_tag_ids == expected_tag_ids)
示例#24
0
async def model_data() -> RasaModelData:
    """Builds a small randomised `RasaModelData` with five examples.

    The data contains the attributes "text", "action_text", "dialogue",
    "label" and "entities"; "label"/"ids" is configured as the label.
    Feature values are drawn from `np.random`, so they differ between runs
    unless the caller seeds the generator.

    Returns:
        The populated `RasaModelData`.
    """
    # Number of rows of each per-example feature matrix.
    sentence_rows = [5, 2, 3, 1, 3]
    # Number of rows of every matrix inside each (ragged) example.
    sequence_rows = [[5, 2, 3, 1, 3], [5, 2], [5, 1, 3], [3], [3, 1, 7]]
    # Entity tag id columns, one (n x 1) column per example.
    tag_columns = [
        [[0], [1], [1], [0], [2]],
        [[2], [0]],
        [[0], [1], [1]],
        [[0], [1]],
        [[0], [0], [0]],
    ]

    def dense(rows: int) -> np.ndarray:
        # dense random features with 14 columns
        return np.random.rand(rows, 14)

    def sparse(rows: int):
        # sparse random integer features with 10 columns
        return scipy.sparse.csr_matrix(np.random.randint(5, size=(rows, 10)))

    # NOTE: the feature lists are created in the same order in which they
    # are placed into the data dict, so the order of random draws is fixed.
    text_sentence = [
        FeatureArray(
            np.array([dense(rows) for rows in sentence_rows]),
            number_of_dimensions=3,
        ),
        FeatureArray(
            np.array([sparse(rows) for rows in sentence_rows]),
            number_of_dimensions=3,
        ),
    ]

    action_text_sequence = [
        FeatureArray(
            np.array([[sparse(rows) for rows in example]
                      for example in sequence_rows]),
            number_of_dimensions=4,
        ),
        FeatureArray(
            np.array([[dense(rows) for rows in example]
                      for example in sequence_rows]),
            number_of_dimensions=4,
        ),
    ]

    dialogue_sentence = [
        FeatureArray(
            np.array([
                np.random.randint(2, size=(rows, 10))
                for rows in sentence_rows
            ]),
            number_of_dimensions=3,
        )
    ]

    label_ids = [
        FeatureArray(np.array([0, 1, 0, 1, 1]), number_of_dimensions=1)
    ]

    entity_tag_ids = [
        FeatureArray(
            np.array([np.array(column) for column in tag_columns]),
            number_of_dimensions=3,
        )
    ]

    return RasaModelData(
        label_key="label",
        label_sub_key="ids",
        data={
            "text": {"sentence": text_sentence},
            "action_text": {"sequence": action_text_sequence},
            "dialogue": {"sentence": dialogue_sentence},
            "label": {"ids": label_ids},
            "entities": {"tag_ids": entity_tag_ids},
        },
    )
示例#25
0
    def _create_model_data(
        self,
        tracker_state_features: List[List[Dict[Text, List["Features"]]]],
        label_ids: Optional[np.ndarray] = None,
        entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
        encoded_all_labels: Optional[List[Dict[Text,
                                               List["Features"]]]] = None,
    ) -> RasaModelData:
        """Assemble all model related data into a single RasaModelData.

        When both `label_ids` and `encoded_all_labels` are given, the label
        ids and entity tag data are added as well and `self.fake_features`
        is (re)assigned. Otherwise the method is treated as being called
        during prediction and the stored `self.fake_features` are passed on.

        Args:
            tracker_state_features: a dictionary of attributes
                (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS,
                ACTIVE_LOOP) to a list of features for all dialogue turns in
                all training trackers
            label_ids: the label ids (e.g. action ids) for every dialogue turn
                in all training trackers
            entity_tags: a dictionary of entity type (ENTITY_TAGS) to a list of
                features containing entity tag ids for text user inputs,
                otherwise empty dict, for all dialogue turns in all training
                trackers
            encoded_all_labels: a list of dictionaries containing attribute
                features for label ids

        Returns:
            The sorted RasaModelData, including sequence and dialogue lengths.
        """
        model_data = RasaModelData(label_key=LABEL_KEY,
                                   label_sub_key=LABEL_SUB_KEY)
        is_training = label_ids is not None and encoded_all_labels is not None

        if not is_training:
            # method is called during prediction
            attribute_data, _ = convert_to_data_format(
                tracker_state_features,
                self.fake_features,
                featurizers=self.config[FEATURIZERS],
            )
        else:
            # give every label id sequence a trailing axis before storing it
            expanded_label_ids = np.array(
                [np.expand_dims(ids, -1) for ids in label_ids])
            label_feature_array = FeatureArray(expanded_label_ids,
                                               number_of_dimensions=3)
            model_data.add_features(LABEL_KEY, LABEL_SUB_KEY,
                                    [label_feature_array])

            attribute_data, self.fake_features = convert_to_data_format(
                tracker_state_features, featurizers=self.config[FEATURIZERS])

            entity_tags_data = self._create_data_for_entities(entity_tags)
            if entity_tags_data is not None:
                model_data.add_data(entity_tags_data)

        model_data.add_data(attribute_data)
        for attribute in (TEXT, ACTION_TEXT):
            model_data.add_lengths(attribute, SEQUENCE_LENGTH, attribute,
                                   SEQUENCE)

        # add the dialogue lengths, derived from the mask of the first
        # attribute present in the converted data
        first_attribute = next(iter(attribute_data.keys()))
        attribute_masks = model_data.data[first_attribute][MASK][0]
        dialogue_lengths = np.array(
            [np.size(np.squeeze(mask, -1)) for mask in attribute_masks])
        model_data.data[DIALOGUE][LENGTH] = [
            FeatureArray(dialogue_lengths, number_of_dimensions=1)
        ]

        # make sure all keys are in the same order during training and prediction
        model_data.sort()

        return model_data
示例#26
0
    def _create_model_data(
        self,
        training_data: List[Message],
        label_id_dict: Optional[Dict[Text, int]] = None,
        tag_id_dict: Optional[Dict[Text, int]] = None,
        label_attribute: Optional[Text] = None,
    ) -> RasaModelData:
        """Collect features, label ids and tag ids into a RasaModelData object.

        Args:
            training_data: The messages to extract features from.
            label_id_dict: Optional mapping from label value to label id.
            tag_id_dict: Optional mapping from entity tag to tag id.
            label_attribute: Optional message attribute holding the label.
                When set, text features are only collected for messages that
                carry this attribute.

        Returns:
            The RasaModelData with text/label features, label ids, tag ids
            and the text/label masks added.
        """
        text_sparse, text_dense = [], []
        label_sparse, label_dense = [], []
        collected_label_ids = []
        collected_tag_ids = []

        for example in training_data:
            if label_attribute is None or example.get(label_attribute):
                sparse, dense = self._extract_features(example, TEXT)
                if sparse is not None:
                    text_sparse.append(sparse)
                if dense is not None:
                    text_dense.append(dense)

            if example.get(label_attribute):
                sparse, dense = self._extract_features(example, label_attribute)
                if sparse is not None:
                    label_sparse.append(sparse)
                if dense is not None:
                    label_dense.append(dense)

                if label_id_dict:
                    collected_label_ids.append(
                        label_id_dict[example.get(label_attribute)])

            if self.component_config.get(ENTITY_RECOGNITION) and tag_id_dict:
                if self.component_config[BILOU_FLAG]:
                    token_tag_ids = bilou_utils.tags_to_ids(example, tag_id_dict)
                else:
                    token_tag_ids = [
                        tag_id_dict[determine_token_labels(
                            token, example.get(ENTITIES), None)]
                        for token in example.get(TOKENS_NAMES[TEXT])
                    ]
                # transpose to have seq_len x 1
                collected_tag_ids.append(np.array([token_tag_ids]).T)

        text_sparse_array = np.array(text_sparse)
        text_dense_array = np.array(text_dense)
        label_sparse_array = np.array(label_sparse)
        label_dense_array = np.array(label_dense)
        label_id_array = np.array(collected_label_ids)
        tag_id_array = np.array(collected_tag_ids)

        model_data = RasaModelData(label_key=self.label_key)
        model_data.add_features(TEXT_FEATURES,
                                [text_sparse_array, text_dense_array])
        model_data.add_features(LABEL_FEATURES,
                                [label_sparse_array, label_dense_array])

        labels_missing = label_attribute and model_data.feature_not_exist(
            LABEL_FEATURES)
        if labels_missing:
            # no label features are present, get default features from _label_data
            default_label_features = self._use_default_label_features(
                label_id_array)
            model_data.add_features(LABEL_FEATURES, default_label_features)

        # explicitly add last dimension to label_ids
        # to track correctly dynamic sequences
        model_data.add_features(LABEL_IDS,
                                [np.expand_dims(label_id_array, -1)])
        model_data.add_features(TAG_IDS, [tag_id_array])

        model_data.add_mask(TEXT_MASK, TEXT_FEATURES)
        model_data.add_mask(LABEL_MASK, LABEL_FEATURES)

        return model_data
示例#27
0
    def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
        """Loads a policy from the storage.

        **Needs to load its featurizer**

        Args:
            path: Directory the policy was persisted to.

        Returns:
            `None` (after logging an error) when `path` does not exist; a
            policy holding only the featurizer when no data example file is
            found; otherwise the policy with its loaded model, priority,
            zero state features and the persisted meta configuration.
        """
        model_path = Path(path)

        if not model_path.exists():
            logger.error(
                f"Failed to load TED policy model. Path "
                f"'{model_path.absolute()}' doesn't exist."
            )
            return None

        tf_model_file = model_path / f"{SAVE_MODEL_FILE_NAME}.tf_model"
        data_example_file = model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl"

        featurizer = TrackerFeaturizer.load(path)

        if not data_example_file.is_file():
            # nothing but the featurizer was persisted
            return cls(featurizer=featurizer)

        # NOTE(review): the files below are unpickled; only load model
        # directories that come from a trusted source.
        loaded_data = io_utils.pickle_load(data_example_file)
        raw_label_data = io_utils.pickle_load(
            model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl"
        )
        zero_state_features = io_utils.pickle_load(
            model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl"
        )
        label_data = RasaModelData(data=raw_label_data)
        meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
        priority = io_utils.json_unpickle(
            model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
        )

        model_data_example = RasaModelData(
            label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
        )
        meta = train_utils.update_similarity_type(meta)

        uses_max_history = isinstance(featurizer, MaxHistoryTrackerFeaturizer)
        model = TED.load(
            str(tf_model_file),
            model_data_example,
            data_signature=model_data_example.get_signature(),
            config=meta,
            max_history_tracker_featurizer_used=uses_max_history,
            label_data=label_data,
        )

        # build the graph for prediction from the prediction-relevant features
        prediction_keys = STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE]
        prediction_features = {
            feature_name: features
            for feature_name, features in model_data_example.items()
            if feature_name in prediction_keys
        }
        predict_data_example = RasaModelData(
            label_key=LABEL_KEY,
            label_sub_key=LABEL_SUB_KEY,
            data=prediction_features,
        )
        model.build_for_predict(predict_data_example)

        return cls(
            featurizer=featurizer,
            priority=priority,
            model=model,
            zero_state_features=zero_state_features,
            **meta,
        )