Exemplo n.º 1
0
    def _load_model_class(
        cls,
        tf_model_file: Text,
        model_data_example: RasaModelData,
        label_data: RasaModelData,
        entity_tag_specs: List[EntityTagSpec],
        config: Dict[Text, Any],
        finetune_mode: bool = False,
    ) -> "RasaModel":
        """Load the model selected by ``cls.model_class`` from ``tf_model_file``.

        Args:
            tf_model_file: Location passed on to the model's ``load``.
            model_data_example: Example data; provides the data signature and
                the features the prediction example is derived from.
            label_data: Label data forwarded to ``load``.
            entity_tag_specs: Entity tag specifications forwarded to ``load``.
            config: Configuration. ``config[USE_TEXT_AS_LABEL]`` selects the
                model class and a deep copy of the dict is forwarded to ``load``.
            finetune_mode: Forwarded unchanged to ``load``.

        Returns:
            Whatever the selected model class' ``load`` returns.
        """
        label_key = model_data_example.label_key

        # The prediction example only keeps features whose name contains TEXT.
        text_features = {}
        for name, feats in model_data_example.items():
            if TEXT in name:
                text_features[name] = feats

        predict_data_example = RasaModelData(label_key=label_key, data=text_features)

        model_cls = cls.model_class(config[USE_TEXT_AS_LABEL])
        return model_cls.load(
            tf_model_file,
            model_data_example,
            predict_data_example,
            data_signature=model_data_example.get_signature(),
            label_data=label_data,
            entity_tag_specs=entity_tag_specs,
            # deep copy so the loaded model does not share the caller's dict
            config=copy.deepcopy(config),
            finetune_mode=finetune_mode,
        )
Exemplo n.º 2
0
    def __init__(
        self,
        name: Text,
        config: Dict[Text, Any],
        data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
        label_data: RasaModelData,
    ) -> None:
        """Initialise the model from its configuration and signatures.

        Args:
            name: Model name passed to the parent constructor.
            config: Configuration; the random seed, tensorboard settings and the
                checkpoint flag are read from it for the parent constructor.
            data_signature: Signature of the model data, stored on the instance.
            label_data: Label data; its signature is stored and one prepared
                batch of it is converted via ``batch_to_model_data_format``.
        """
        super().__init__(
            name=name,
            random_seed=config[RANDOM_SEED],
            tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
            tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
            checkpoint_model=config[CHECKPOINT_MODEL],
        )

        self.config = config
        self.data_signature = data_signature
        self.label_signature = label_data.get_signature()

        # runs after config and signatures are set
        # (presumably _check_data validates them - TODO confirm)
        self._check_data()

        # convert one prepared batch of the label data into model data format
        label_batch = label_data.prepare_batch()
        self.tf_label_data = self.batch_to_model_data_format(
            label_batch, self.label_signature)

        # set up tf layers
        self._tf_layers: Dict[Text, tf.keras.layers.Layer] = {}
Exemplo n.º 3
0
def test_train_val_split(model_data: RasaModelData):
    """``split(2, 42)`` keeps layout and dtypes; yields 3 train / 2 test rows."""
    train_model_data, test_model_data = model_data.split(2, 42)

    for key, attribute_data in model_data.items():
        # same number of sub keys in both splits
        assert len(attribute_data) == len(train_model_data.get(key))
        assert len(attribute_data) == len(test_model_data.get(key))

        for sub_key, features in attribute_data.items():
            # same number of feature arrays per sub key
            assert len(features) == len(train_model_data.get(key, sub_key))
            assert len(features) == len(test_model_data.get(key, sub_key))

            for index, feature in enumerate(features):
                original_first = feature[0]
                train_first = train_model_data.get(key, sub_key)[index][0]
                if isinstance(original_first, list):
                    # list-valued examples: compare the dtype one level deeper
                    original_first = original_first[0]
                    train_first = train_first[0]
                assert original_first.dtype == train_first.dtype

    # every feature array must have the expected number of examples
    for split_data, expected_examples in ((train_model_data, 3), (test_model_data, 2)):
        for attribute_data in split_data.values():
            for features in attribute_data.values():
                for feature in features:
                    assert np.array(feature).shape[0] == expected_examples
Exemplo n.º 4
0
def test_update_key(model_data: RasaModelData):
    """Moving ("label", "ids") to ("intent", "ids") removes the old key."""
    old_location = ("label", "ids")
    new_location = ("intent", "ids")

    assert model_data.does_feature_exist(*old_location)

    model_data.update_key(*old_location, *new_location)

    assert not model_data.does_feature_exist(*old_location)
    assert model_data.does_feature_exist(*new_location)
    # the emptied top-level key must be gone as well
    assert "label" not in model_data.data
Exemplo n.º 5
0
def test_split_data_by_label(model_data: RasaModelData):
    """Splitting by the intent ids 0 and 1 yields two single-label parts."""
    data = model_data.data
    intent_ids = model_data.get("intent", "ids")[0]
    unique_ids = np.array([0, 1])

    split_model_data = model_data._split_by_label_ids(data, intent_ids, unique_ids)

    assert len(split_model_data) == 2
    for part in split_model_data:
        ids_in_part = set(part.get("intent", "ids")[0])
        assert len(ids_in_part) == 1
Exemplo n.º 6
0
def test_not_balance_model_data(model_data: RasaModelData):
    """``_balanced_data`` leaves the entity tag ids equal to the stored ones."""
    label_key, label_sub_key = "entities", "tag_ids"
    test_model_data = RasaModelData(
        label_key=label_key, label_sub_key=label_sub_key, data=model_data.data
    )

    data = test_model_data._balanced_data(test_model_data.data, 2, False)

    balanced_tags = data["entities"]["tag_ids"]
    stored_tags = test_model_data.get("entities", "tag_ids")
    assert np.all(balanced_tags == stored_tags)
Exemplo n.º 7
0
    def _create_label_data(self, domain: Domain) -> RasaModelData:
        """Build label data holding the encoded actions of ``domain``.

        Args:
            domain: Domain handed to the state featurizer's
                ``create_encoded_all_actions``.

        Returns:
            Model data with the float32 encoding stored under LABEL_FEATURES.
        """
        # encode all label_ids with policies' featurizer
        encoder = self.featurizer.state_featurizer
        encoded_actions = encoder.create_encoded_all_actions(domain).astype(np.float32)

        label_data = RasaModelData()
        label_data.add_features(LABEL_FEATURES, [encoded_actions])
        return label_data
Exemplo n.º 8
0
    def load(cls, path: Text) -> "TEDPolicy":
        """Load a policy, including its featurizer, from ``path``.

        Args:
            path: Directory the policy was persisted to.

        Returns:
            A policy holding only the featurizer when no data example file is
            present, otherwise a policy with the loaded model, priority and
            meta parameters.

        Raises:
            Exception: If ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise Exception(
                f"Failed to load TED policy model. Path "
                f"'{os.path.abspath(path)}' doesn't exist."
            )

        model_dir = Path(path)
        tf_model_file = model_dir / f"{SAVE_MODEL_FILE_NAME}.tf_model"
        data_example_file = model_dir / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl"

        featurizer = TrackerFeaturizer.load(path)

        # Without a persisted data example, return a policy without a model.
        if not data_example_file.is_file():
            return cls(featurizer=featurizer)

        loaded_data = io_utils.json_unpickle(data_example_file)
        label_data = io_utils.json_unpickle(
            model_dir / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl"
        )
        # NOTE(review): pickle_load presumably unpickles the file; only load
        # model directories from trusted sources - confirm.
        meta = io_utils.pickle_load(model_dir / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
        priority = io_utils.json_unpickle(
            model_dir / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
        )

        model_data_example = RasaModelData(label_key=LABEL_IDS, data=loaded_data)
        meta = train_utils.update_similarity_type(meta)

        uses_max_history = isinstance(featurizer, MaxHistoryTrackerFeaturizer)
        model = TED.load(
            str(tf_model_file),
            model_data_example,
            data_signature=model_data_example.get_signature(),
            config=meta,
            max_history_tracker_featurizer_used=uses_max_history,
            label_data=label_data,
        )

        # build the graph for prediction; only features whose name contains
        # DIALOGUE are part of the prediction example
        dialogue_features = {}
        for name, feats in model_data_example.items():
            if DIALOGUE in name:
                dialogue_features[name] = feats
        model.build_for_predict(
            RasaModelData(label_key=LABEL_IDS, data=dialogue_features)
        )

        return cls(featurizer=featurizer, priority=priority, model=model, **meta)
Exemplo n.º 9
0
def test_session_data_for_ids(model_data: RasaModelData):
    """Selecting ids 0 and 1 keeps exactly those two examples, unchanged."""
    selected_ids = np.array([0, 1])
    filtered_data = model_data._data_for_ids(model_data.data, selected_ids)

    for features in filtered_data.values():
        for feature in features:
            assert feature.shape[0] == 2

    first_key = list(model_data.keys())[0]

    # the two selected examples must equal the first two original examples
    for example_idx in (0, 1):
        filtered_example = np.array(filtered_data[first_key][0][example_idx])
        original_example = np.array(model_data.get(first_key)[0][example_idx])
        assert np.all(filtered_example == original_example)
Exemplo n.º 10
0
    def _check_input_dimension_consistency(self,
                                           model_data: RasaModelData) -> None:
        """Ensure text and label features share a dimension when layers are shared.

        Args:
            model_data: Data whose TEXT_FEATURES and LABEL_FEATURES dimensions
                are compared.

        Raises:
            ValueError: If SHARE_HIDDEN_LAYERS is enabled in the component
                config and the two feature dimensions differ.
        """
        if not self.component_config.get(SHARE_HIDDEN_LAYERS):
            # nothing to verify when hidden layers are not shared
            return

        text_dim = model_data.feature_dimension(TEXT_FEATURES)
        label_dim = model_data.feature_dimension(LABEL_FEATURES)
        if text_dim == label_dim:
            return

        raise ValueError(
            "If embeddings are shared text features and label features "
            "must coincide. Check the output dimensions of previous components."
        )
Exemplo n.º 11
0
def test_split_data_by_label(model_data: RasaModelData):
    """Splitting by the label ids 0 and 1 yields two single-label parts."""
    data = model_data.data
    label_ids = model_data.get("label", "ids")[0]
    unique_ids = np.array([0, 1])

    split_model_data = model_data._split_by_label_ids(data, label_ids, unique_ids)

    assert len(split_model_data) == 2
    for part in split_model_data:
        ids_in_part = set(part.get("label", "ids")[0])
        assert len(ids_in_part) == 1

    # the first part keeps the feature layout and holds two examples
    first_part = split_model_data[0]
    for key, attribute_data in first_part.items():
        for sub_key, features in attribute_data.items():
            original_features = model_data.data[key][sub_key]
            assert len(features) == len(original_features)
            assert len(features[0]) == 2
Exemplo n.º 12
0
def test_shuffle_session_data(model_data: RasaModelData):
    """Checks that ``_shuffled_data`` returns new data and leaves the input alone."""
    # NOTE(review): copy.copy is a shallow copy, so `before` may share its
    # underlying containers with `model_data` - confirm that the "didn't
    # change" check below can actually detect an in-place shuffle.
    before = copy.copy(model_data)

    # precondition
    assert np.all(
        np.array(list(before.values())) == np.array(list(model_data.values())))

    data = model_data._shuffled_data(model_data.data)

    # check that original data didn't change
    assert np.all(
        np.array(list(before.values())) == np.array(list(model_data.values())))
    # check that new data is different
    # NOTE(review): unlike the checks above, `.values()` is not wrapped in
    # `list(...)` here, so np.array receives the view objects themselves.
    # Presumably this compares two distinct view objects instead of the
    # contained arrays, making the assertion pass regardless of shuffling -
    # confirm and consider comparing `list(...)` of the values instead.
    assert np.all(np.array(model_data.values()) != np.array(data.values()))
Exemplo n.º 13
0
def test_split_data_by_none_label(model_data: RasaModelData):
    """With ``label_key`` unset, ``split(2, 42)`` still yields a 3 / 2 split."""
    model_data.label_key = None

    split_model_data = model_data.split(2, 42)
    assert len(split_model_data) == 2

    train_data, test_data = split_model_data

    # three examples end up in the training part ...
    assert len(train_data.get("intent_ids")[0]) == 3
    # ... and two in the test part
    assert len(test_data.get("intent_ids")[0]) == 2
Exemplo n.º 14
0
    def _assemble_label_data(self, attribute_data: Data,
                             domain: Domain) -> RasaModelData:
        """Builds the label-related model data that is fed to the model.

        The result always holds the `label` key with the numerical label ids.
        It may additionally hold `label_action_name` and/or `label_action_text`,
        which carry the sequence, sentence and mask features of the
        corresponding labels.

        Args:
            attribute_data: Feature data for all labels.
            domain: Domain of the assistant.

        Returns:
            Features of labels ready to be fed to the model.
        """
        label_data = RasaModelData()
        label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")

        action_text_key = f"{LABEL}_{ACTION_TEXT}"
        label_data.add_lengths(
            action_text_key, SEQUENCE_LENGTH, action_text_key, SEQUENCE
        )

        # one id per action, expanded to a column (shape: num_actions x 1)
        id_column = np.expand_dims(np.arange(domain.num_actions), -1)
        label_id_features = [FeatureArray(id_column, number_of_dimensions=2)]
        label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, label_id_features)

        return label_data
Exemplo n.º 15
0
    def _assemble_label_data(
        self, attribute_data: Data, domain: Domain
    ) -> RasaModelData:
        """Builds the label-related model data that is fed to the model.

        The result should hold the keys `label_intent` and `label`:
        `label_intent` carries the sequence, sentence and mask features of all
        intent labels, `label` carries the numerical label ids.

        Args:
            attribute_data: Feature data for all intent labels.
            domain: Domain of the assistant.

        Returns:
            Features of labels ready to be fed to the model.
        """
        label_data = RasaModelData()
        label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")

        intent_key = f"{LABEL}_{INTENT}"
        label_data.add_lengths(intent_key, SEQUENCE_LENGTH, intent_key, SEQUENCE)

        # one id per intent, expanded to a column (shape: num_intents x 1)
        id_column = np.expand_dims(np.arange(len(domain.intents)), -1)
        label_id_features = [FeatureArray(id_column, number_of_dimensions=2)]
        label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, label_id_features)

        return label_data
Exemplo n.º 16
0
def test_train_val_split(model_data: RasaModelData):
    """``split(2, 42)`` keeps layout and dtypes; yields 3 train / 2 test rows."""
    train_model_data, test_model_data = model_data.split(2, 42)

    for name, features in model_data.items():
        assert len(features) == len(train_model_data.get(name))
        assert len(features) == len(test_model_data.get(name))
        for index, feature in enumerate(features):
            train_feature = train_model_data.get(name)[index]
            assert feature[0].dtype == train_feature[0].dtype

    # every feature array must have the expected number of examples
    for split_data, expected_examples in ((train_model_data, 3), (test_model_data, 2)):
        for features in split_data.values():
            for feature in features:
                assert feature.shape[0] == expected_examples
Exemplo n.º 17
0
 def _instantiate_model_class(self, model_data: RasaModelData) -> "RasaModel":
     """Create a fresh model of the class chosen by ``use_text_as_label``.

     Args:
         model_data: Data whose signature is handed to the model.

     Returns:
         The newly constructed model instance.
     """
     model_cls = self.model_class(self.use_text_as_label)
     init_kwargs = dict(
         data_signature=model_data.get_signature(),
         label_data=self._label_data,
         entity_tag_specs=self._entity_tag_specs,
         config=self.component_config,
     )
     return model_cls(**init_kwargs)
Exemplo n.º 18
0
    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
        """Turns training data into model data for the RESPONSE attribute.

        Args:
            training_data: Training data; filtered by ``self.retrieval_intent``
                first when that is set.

        Returns:
            An empty ``RasaModelData`` when no label mapping could be built,
            otherwise the model data created from the intent examples.
        """
        if self.retrieval_intent:
            training_data = training_data.filter_by_intent(self.retrieval_intent)

        label_to_index = self._label_id_index_mapping(
            training_data, attribute=RESPONSE
        )
        if not label_to_index:
            # no labels are present to train
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(
            training_data, label_to_index, attribute=RESPONSE
        )

        model_data = self._create_model_data(
            training_data.intent_examples, label_to_index, label_attribute=RESPONSE
        )
        self._check_input_dimension_consistency(model_data)
        return model_data
Exemplo n.º 19
0
 def _update_data_signatures(self, model_data: RasaModelData) -> None:
     """Store the data signature and its TEXT-only subset for prediction.

     Args:
         model_data: Data whose signature is stored on the instance.
     """
     self.data_signature = model_data.get_signature()

     # prediction only uses signature entries whose name contains TEXT
     text_signature = {}
     for name, signature in self.data_signature.items():
         if TEXT in name:
             text_signature[name] = signature
     self.predict_data_signature = text_signature
Exemplo n.º 20
0
def test_session_data_for_ids(model_data: RasaModelData):
    """Selecting ids 0 and 1 keeps exactly those two examples, unchanged."""
    selected_ids = np.array([0, 1])
    filtered_data = model_data._data_for_ids(model_data.data, selected_ids)

    for attribute_data in filtered_data.values():
        for features in attribute_data.values():
            for feature in features:
                assert feature.shape[0] == 2

    key = model_data.keys()[0]
    sub_key = model_data.keys(key)[0]

    # the two selected examples must equal the first two original examples
    for example_idx in (0, 1):
        filtered_example = np.array(filtered_data[key][sub_key][0][example_idx])
        original_example = np.array(model_data.get(key, sub_key)[0][example_idx])
        assert np.all(filtered_example == original_example)
Exemplo n.º 21
0
    def __init__(
        self,
        data_signature: Dict[Text, List[FeatureSignature]],
        config: Dict[Text, Any],
        max_history_tracker_featurizer_used: bool,
        label_data: RasaModelData,
    ) -> None:
        """Initialise the TED model.

        Args:
            data_signature: Signature of the model data; entries whose name
                contains DIALOGUE also form the prediction signature.
            config: Configuration; the random seed and tensorboard settings are
                read from it for the parent constructor.
            max_history_tracker_featurizer_used: Stored on the instance as is.
            label_data: Label data; one prepared batch of it is converted via
                ``batch_to_model_data_format`` and kept as ``tf_label_data``.
        """
        super().__init__(
            name="TED",
            random_seed=config[RANDOM_SEED],
            tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
            tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
        )

        self.config = config
        self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used

        # data
        self.data_signature = data_signature
        self._check_data()

        self.predict_data_signature = {
            feature_name: features
            for feature_name, features in data_signature.items()
            if DIALOGUE in feature_name
        }

        # optimizer
        self.optimizer = tf.keras.optimizers.Adam()

        self.all_labels_embed = None

        label_batch = label_data.prepare_batch()
        self.tf_label_data = self.batch_to_model_data_format(
            label_batch, label_data.get_signature()
        )

        # metrics
        self.action_loss = tf.keras.metrics.Mean(name="loss")
        self.action_acc = tf.keras.metrics.Mean(name="acc")
        self.metrics_to_log += ["loss", "acc"]

        # set up tf layers
        # (annotation fixed: `Dict[Text : Layer]` subscripted with a slice,
        # which is not a valid two-parameter generic)
        self._tf_layers: Dict[Text, tf.keras.layers.Layer] = {}
        self._prepare_layers()
Exemplo n.º 22
0
def test_sort(model_data: RasaModelData):
    """``sort()`` reorders the top-level keys of the data alphabetically."""
    keys_before_sort = ["text", "action_text", "dialogue", "label", "entities"]
    keys_after_sort = ["action_text", "dialogue", "entities", "label", "text"]

    assert list(model_data.data.keys()) == keys_before_sort

    model_data.sort()

    assert list(model_data.data.keys()) == keys_after_sort
Exemplo n.º 23
0
 def _construct_model_initialization_data(
     cls, loaded_data: Dict[Text, Dict[Text, List[FeatureArray]]]
 ) -> Tuple[RasaModelData, RasaModelData]:
     """Build the full and the prediction-only data example from loaded data.

     Args:
         loaded_data: Feature data keyed by feature name and sub key.

     Returns:
         A tuple ``(model_data_example, predict_data_example)``.
     """
     model_data_example = RasaModelData(
         label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
     )

     # we need to remove label features for prediction if they are present,
     # so only features listed in PREDICTION_FEATURES are carried over
     prediction_features = {}
     for name, feats in model_data_example.items():
         if name in PREDICTION_FEATURES:
             prediction_features[name] = feats

     predict_data_example = RasaModelData(
         label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=prediction_features
     )
     return model_data_example, predict_data_example
Exemplo n.º 24
0
    def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
        """Run the prediction function on one batch built from ``predict_data``.

        Args:
            predict_data: Data to predict on; also used to build the prediction
                graph when none exists yet.

        Returns:
            The output of the prediction function.
        """
        graph_missing = self._predict_function is None
        if graph_missing:
            logger.debug("There is no tensorflow prediction graph.")
            self.build_for_predict(predict_data)

        # Prepare a single batch of the size of the input
        whole_input_batch = predict_data.prepare_batch()

        self._training = False  # needed for eager mode
        return self._predict_function(whole_input_batch)
Exemplo n.º 25
0
    def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
        """Run the prediction function on the first batch of ``predict_data``.

        Args:
            predict_data: Data to predict on; also used to build the prediction
                graph when none exists yet.

        Returns:
            The output of the prediction function.
        """
        graph_missing = self._predict_function is None
        if graph_missing:
            logger.debug("There is no tensorflow prediction graph.")
            self.build_for_predict(predict_data)

        # take the first batch of a dataset created with batch size 1
        batches = iter(predict_data.as_tf_dataset(batch_size=1))
        first_batch = next(batches)

        self._training = False  # needed for eager mode
        return self._predict_function(first_batch)
Exemplo n.º 26
0
async def model_data() -> RasaModelData:
    """Build model data with five examples of differing sequence lengths.

    The per-example arrays differ in their first dimension, so each feature is
    stored as a one-dimensional object array. ``dtype=object`` is passed
    explicitly: NumPy deprecated implicit ragged array creation in 1.19 and
    raises ``ValueError`` for it from 1.24 on.

    Returns:
        Model data with ``label_key="intent"`` and ``label_sub_key="ids"``.
    """
    return RasaModelData(
        label_key="intent",
        label_sub_key="ids",
        data={
            "text_features": {
                "sentence": [
                    # dense features, one (length x 14) array per example
                    np.array(
                        [
                            np.random.rand(5, 14),
                            np.random.rand(2, 14),
                            np.random.rand(3, 14),
                            np.random.rand(1, 14),
                            np.random.rand(3, 14),
                        ],
                        dtype=object,
                    ),
                    # sparse features, one (length x 10) matrix per example
                    np.array(
                        [
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(5, 10))),
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(2, 10))),
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(3, 10))),
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(1, 10))),
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(3, 10))),
                        ],
                        dtype=object,
                    ),
                ]
            },
            "intent_features": {
                "sentence": [
                    np.array(
                        [
                            np.random.randint(2, size=(5, 10)),
                            np.random.randint(2, size=(2, 10)),
                            np.random.randint(2, size=(3, 10)),
                            np.random.randint(2, size=(1, 10)),
                            np.random.randint(2, size=(3, 10)),
                        ],
                        dtype=object,
                    )
                ]
            },
            "intent": {
                "ids": [np.array([0, 1, 0, 1, 1])]
            },
            "entities": {
                "tag_ids": [
                    np.array(
                        [
                            np.array([[0], [1], [1], [0], [2]]),
                            np.array([[2], [0]]),
                            np.array([[0], [1], [1]]),
                            np.array([[0], [1]]),
                            np.array([[0], [0], [0]]),
                        ],
                        dtype=object,
                    )
                ]
            },
        },
    )
Exemplo n.º 27
0
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """``run_inference`` merges per-batch outputs over all data points."""
    model = RasaModel()

    def _batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # echo the first input and add an output independent of the input
        echoed_input = batch_in[0]
        constant_output = tf.constant(np.array([[1, 2]]), dtype=tf.int32)
        return {
            "dummy_output": echoed_input,
            "non_input_affected_output": constant_output,
        }

    # Monkeypatch batch predict so that run_inference interface can be tested
    model.batch_predict = _batch_predict

    # Create dummy model data to pass to model
    sentence_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [sentence_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # Every data point sent as input must show up in dummy_output.
    assert output["dummy_output"].shape[0] == number_of_data_points

    # The input-independent output gets one row per batch passed to the
    # model, so its first dimension equals the number of batch iterations.
    expected_shape = (expected_number_of_batch_iterations, 2)
    assert output["non_input_affected_output"].shape == expected_shape
Exemplo n.º 28
0
    def _load_model(
        cls,
        index_label_id_mapping: Dict[int, Text],
        index_tag_id_mapping: Dict[int, Text],
        label_data: RasaModelData,
        meta: Dict[Text, Any],
        data_example: Dict[Text, List[np.ndarray]],
        model_dir: Text,
    ):
        """Load the tensorflow model and build its prediction graph.

        Args:
            index_label_id_mapping: Forwarded to the model's ``load``.
            index_tag_id_mapping: Forwarded to the model's ``load``.
            label_data: Label data forwarded to the model's ``load``.
            meta: Meta data; provides the file name (key ``"file"``), the
                INTENT_CLASSIFICATION flag and the model config.
            data_example: Example data used to create the data signature.
            model_dir: Directory containing the persisted model file.

        Returns:
            The loaded model after ``build_for_predict`` was called on it.
        """
        file_name = meta.get("file")
        tf_model_file = os.path.join(model_dir, file_name + ".tf_model")

        # a label key is only set when intent classification is enabled
        label_key = LABEL_IDS if meta[INTENT_CLASSIFICATION] else None
        model_data_example = RasaModelData(label_key=label_key, data=data_example)

        model_cls = cls.model_class(meta)
        model = model_cls.load(
            tf_model_file,
            model_data_example,
            data_signature=model_data_example.get_signature(),
            label_data=label_data,
            index_label_id_mapping=index_label_id_mapping,
            index_tag_id_mapping=index_tag_id_mapping,
            config=meta,
        )

        # build the graph for prediction; only features whose name contains
        # TEXT are part of the prediction example
        text_features = {}
        for name, feats in model_data_example.items():
            if TEXT in name:
                text_features[name] = feats
        model.build_for_predict(RasaModelData(label_key=label_key, data=text_features))

        return model
Exemplo n.º 29
0
    def __init__(
        self,
        data_signature: Dict[Text, List[FeatureSignature]],
        label_data: RasaModelData,
        index_label_id_mapping: Optional[Dict[int, Text]],
        index_tag_id_mapping: Optional[Dict[int, Text]],
        config: Dict[Text, Any],
    ) -> None:
        """Initialise the "CRFTransformer" model.

        Args:
            data_signature: Signature of the model data; entries whose name
                contains TEXT also form the prediction signature.
            label_data: Label data; one prepared batch of it is converted via
                ``batch_to_model_data_format`` and kept as ``tf_label_data``.
            index_label_id_mapping: Its length is the number of intents
                (0 when ``None``).
            index_tag_id_mapping: Its length is the number of tags
                (0 when ``None``).
            config: Configuration; provides the random seed, the tensorboard
                settings and the learning rate.
        """
        super().__init__(
            name="CRFTransformer",
            random_seed=config[RANDOM_SEED],
            tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
            tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
        )

        self.config = config
        self.data_signature = data_signature
        self._check_data()

        # prediction only uses signature entries whose name contains TEXT
        text_signature = {}
        for name, signature in data_signature.items():
            if TEXT in name:
                text_signature[name] = signature
        self.predict_data_signature = text_signature

        label_batch = label_data.prepare_batch()
        label_signature = label_data.get_signature()
        self.tf_label_data = self.batch_to_model_data_format(
            label_batch, label_signature
        )

        # a missing mapping counts as zero intents / tags
        self._num_intents = (
            0 if index_label_id_mapping is None else len(index_label_id_mapping)
        )
        self._num_tags = (
            0 if index_tag_id_mapping is None else len(index_tag_id_mapping)
        )

        # tf objects, training
        self._prepare_layers()
        self._set_optimizer(tf.keras.optimizers.Adam(config[LEARNING_RATE]))
        self._create_metrics()
        self._update_metrics_to_log()
Exemplo n.º 30
0
    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
        """Turns training data into model data for the response selector.

        Args:
            training_data: Training data to be preprocessed; restricted to
                ``self.retrieval_intent`` first when that is set.

        Returns:
            An empty ``RasaModelData`` when no label mapping could be built,
            otherwise the model data created from the intent examples.
        """
        # Collect all retrieval intents present in the data before filtering
        self.all_retrieval_intents = list(training_data.retrieval_intents)

        if not self.retrieval_intent:
            # retrieval intent was left to its default value
            logger.info(
                "Retrieval intent parameter was left to its default value. This "
                "response selector will be trained on training examples combining "
                "all retrieval intents."
            )
        else:

            def _matches_retrieval_intent(ex):
                return self.retrieval_intent == ex.get(INTENT)

            training_data = training_data.filter_training_examples(
                _matches_retrieval_intent
            )

        if self.use_text_as_label:
            label_attribute = RESPONSE
        else:
            label_attribute = INTENT_RESPONSE_KEY

        label_to_index = self._label_id_index_mapping(
            training_data, attribute=label_attribute
        )

        # responses are stored even when no labels are available below
        self.responses = training_data.responses

        if not label_to_index:
            # no labels are present to train
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_to_index)
        self._label_data = self._create_label_data(
            training_data, label_to_index, attribute=label_attribute
        )

        model_data = self._create_model_data(
            training_data.intent_examples,
            label_to_index,
            label_attribute=label_attribute,
        )
        self._check_input_dimension_consistency(model_data)
        return model_data