예제 #1
0
    def _assemble_label_data(
        self, attribute_data: Data, domain: Domain
    ) -> RasaModelData:
        """Builds the label-side model data for all intents of the domain.

        The returned container holds the given label features stored under
        keys prefixed with `{LABEL_KEY}_`, sequence lengths for the
        `{LABEL}_{INTENT}` entry, and one numerical id per intent stored under
        `LABEL_KEY` / `LABEL_SUB_KEY`.

        Args:
            attribute_data: Feature data for all intent labels.
            domain: Domain of the assistant; only `domain.intents` is read here.

        Returns:
            Features of labels ready to be fed to the model.
        """
        intent_label_key = f"{LABEL}_{INTENT}"

        label_data = RasaModelData()
        label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
        label_data.add_lengths(
            intent_label_key, SEQUENCE_LENGTH, intent_label_key, SEQUENCE
        )

        # Ids are simply 0..n-1 in the order of `domain.intents`; a trailing
        # axis is appended so that the array is two-dimensional.
        intent_ids = np.expand_dims(np.arange(len(domain.intents)), -1)
        id_features = FeatureArray(intent_ids, number_of_dimensions=2)
        label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])

        return label_data
예제 #2
0
    def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray:
        """Zero-pad dense examples of varying sequence length to a common length.

        Padding is appended at the end of each sequence. Four-dimensional data
        is delegated to `RasaDataGenerator._pad_4d_dense_data`, and data whose
        examples have fewer than two dimensions is returned without padding.

        Args:
            array_of_dense: The array to pad.

        Returns:
            The padded array, cast to `np.float32` in the non-4D branches.
        """
        if array_of_dense.number_of_dimensions == 4:
            return RasaDataGenerator._pad_4d_dense_data(array_of_dense)

        first_example = array_of_dense[0]
        if first_example.ndim < 2:
            # no sequence axis present, so there is nothing to pad
            return array_of_dense.astype(np.float32)

        longest_sequence = max(example.shape[0] for example in array_of_dense)
        padded_shape = [
            len(array_of_dense),
            longest_sequence,
            first_example.shape[-1],
        ]
        padded = np.zeros(padded_shape, dtype=first_example.dtype)

        # Copy each example into the leading rows of its slot; the remaining
        # rows keep the zeros they were initialised with.
        for row, example in enumerate(array_of_dense):
            padded[row, : example.shape[0], :] = example

        return padded.astype(np.float32)
예제 #3
0
    def _assemble_label_data(self, attribute_data: Data,
                             domain: Domain) -> RasaModelData:
        """Builds the label-side model data for all actions of the domain.

        The returned container holds the given label features stored under
        keys prefixed with `{LABEL_KEY}_`, sequence lengths for the
        `{LABEL}_{ACTION_TEXT}` entry, and one numerical id per action stored
        under `LABEL_KEY` / `LABEL_SUB_KEY`.

        Args:
            attribute_data: Feature data for all labels.
            domain: Domain of the assistant; only `domain.num_actions` is read
                here.

        Returns:
            Features of labels ready to be fed to the model.
        """
        action_text_label_key = f"{LABEL}_{ACTION_TEXT}"

        label_data = RasaModelData()
        label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
        label_data.add_lengths(
            action_text_label_key,
            SEQUENCE_LENGTH,
            action_text_label_key,
            SEQUENCE,
        )

        # Ids are 0..num_actions-1; a trailing axis is appended so that the
        # array is two-dimensional.
        action_ids = np.expand_dims(np.arange(domain.num_actions), -1)
        id_features = FeatureArray(action_ids, number_of_dimensions=2)
        label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])

        return label_data
예제 #4
0
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """Check how `RasaModel.run_inference` merges per-batch outputs.

    `batch_predict` is replaced by a stub that returns the first input tensor
    unchanged plus a constant `(1, 2)` tensor, so the merged output shapes
    reveal how many data points and how many batches were processed.
    """
    model = RasaModel()

    def _batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # Echo the first input and add one output that does not depend on it.
        return {
            "dummy_output": batch_in[0],
            "non_input_affected_output": tf.constant(
                np.array([[1, 2]]), dtype=tf.int32
            ),
        }

    # Patch the stub in so that only the run_inference plumbing is exercised.
    model.batch_predict = _batch_predict

    # Minimal model data: one two-dimensional sentence feature for TEXT.
    sentence_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [sentence_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # Every data point sent in must come back out in the echoed output.
    echoed_rows = output["dummy_output"].shape[0]
    assert echoed_rows == number_of_data_points

    # The constant output contributes one row per processed batch, so its
    # leading dimension must equal the number of batch iterations.
    per_batch_shape = output["non_input_affected_output"].shape
    assert per_batch_shape == (expected_number_of_batch_iterations, 2)
예제 #5
0
def _feature_arrays_for_attribute(
    attribute: Text,
    absent_features: List[Any],
    attribute_to_features: Dict[Text, List[List[List["Features"]]]],
    training: bool,
    fake_features: Dict[Text, List["Features"]],
    consider_dialogue_dimension: bool,
) -> Dict[Text, List[FeatureArray]]:
    """Convert the per-example features of one attribute into feature arrays.

    Args:
        attribute: the attribute of Message to be featurized
        absent_features: used as the features when `attribute_to_features`
            does not contain `attribute`
        attribute_to_features: features for every example
        training: whether we are currently training; if so, the entry for
            `attribute` in `fake_features` is (re)computed and stored
        fake_features: placeholder features per attribute, passed on to
            `_extract_features`; updated in place when `training` is true
        consider_dialogue_dimension: if false, only the first entry of every
            example is kept and the arrays are declared three-dimensional
            instead of four-dimensional

    Returns:
        A dictionary of feature type to actual features for the given
        attribute. It always contains the `MASK` entry; for every other
        feature type the sparse array (if any) precedes the dense one.
    """
    if attribute in attribute_to_features:
        features = attribute_to_features[attribute]
    else:
        features = absent_features

    # During training the placeholders used for missing features are derived
    # from the data itself; at prediction time the stored ones are reused.
    if training:
        fake_features[attribute] = _create_fake_features(features)

    attribute_masks, raw_dense, raw_sparse = _extract_features(
        features, fake_features[attribute], attribute
    )

    def _as_feature_arrays(raw: Dict[Text, Any]) -> Dict[Text, FeatureArray]:
        # Wrap every value list, optionally dropping the dialogue dimension.
        if consider_dialogue_dimension:
            return {
                name: FeatureArray(np.array(per_example), number_of_dimensions=4)
                for name, per_example in raw.items()
            }
        return {
            name: FeatureArray(
                np.array([turns[0] for turns in per_example]),
                number_of_dimensions=3,
            )
            for name, per_example in raw.items()
        }

    sparse_features = _as_feature_arrays(raw_sparse)
    dense_features = _as_feature_arrays(raw_dense)

    attribute_to_feature_arrays = {
        MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=3)]
    }

    feature_types = set()
    feature_types.update(dense_features.keys())
    feature_types.update(sparse_features.keys())

    for feature_type in feature_types:
        arrays = []
        if feature_type in sparse_features:
            arrays.append(sparse_features[feature_type])
        if feature_type in dense_features:
            arrays.append(dense_features[feature_type])
        attribute_to_feature_arrays[feature_type] = arrays

    return attribute_to_feature_arrays
예제 #6
0
    assert len(data_generator) == len(expected_batch_sizes)

    for i in range(len(data_generator)):
        batch, _ = next(iterator)
        assert len(batch) == 11
        assert len(batch[0]) == expected_batch_sizes[i]

    with pytest.raises(StopIteration):
        next(iterator)


@pytest.mark.parametrize(
    "incoming_data, expected_shape",
    [
        (FeatureArray(np.random.rand(7, 12), number_of_dimensions=2), (7, 12)),
        (FeatureArray(np.random.rand(7), number_of_dimensions=1), (7, )),
        (
            FeatureArray(
                np.array([
                    np.random.rand(1, 10),
                    np.random.rand(3, 10),
                    np.random.rand(7, 10),
                    np.random.rand(1, 10),
                ]),
                number_of_dimensions=3,
            ),
            (4, 7, 10),
        ),
        (
            FeatureArray(
예제 #7
0
    def _create_model_data(
        self,
        tracker_state_features: List[List[Dict[Text, List["Features"]]]],
        label_ids: Optional[np.ndarray] = None,
        entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
        encoded_all_labels: Optional[List[Dict[Text,
                                               List["Features"]]]] = None,
    ) -> RasaModelData:
        """Combine all model related data into RasaModelData.

        When both `label_ids` and `encoded_all_labels` are given, the label
        ids and entity tags are added as well and `self.fake_features` is
        replaced by the value returned from `convert_to_data_format`.
        Otherwise the previously stored `self.fake_features` are passed to
        `convert_to_data_format` and no labels are added.

        Args:
            tracker_state_features: per tracker, per dialogue turn, a mapping
                of attribute name to the list of features for that turn
            label_ids: the label ids (e.g. action ids) for every dialogue turn
                in all training trackers
            entity_tags: per tracker, per dialogue turn, a mapping of entity
                tag type to features; forwarded to
                `self._create_data_for_entities`
            encoded_all_labels: a list of dictionaries containing attribute
                features for label ids; here it is only checked for `None`

        Returns:
            RasaModelData
        """
        model_data = RasaModelData(label_key=LABEL_KEY,
                                   label_sub_key=LABEL_SUB_KEY)

        labels_missing = label_ids is None or encoded_all_labels is None
        if labels_missing:
            # no labels available: reuse the stored fake features
            attribute_data, _ = convert_to_data_format(
                tracker_state_features,
                self.fake_features,
                featurizers=self.config[FEATURIZERS],
            )
        else:
            # append a trailing axis to the ids of every sequence
            expanded_label_ids = np.array(
                [np.expand_dims(sequence_ids, -1) for sequence_ids in label_ids]
            )
            model_data.add_features(
                LABEL_KEY,
                LABEL_SUB_KEY,
                [FeatureArray(expanded_label_ids, number_of_dimensions=3)],
            )

            attribute_data, self.fake_features = convert_to_data_format(
                tracker_state_features, featurizers=self.config[FEATURIZERS]
            )

            entity_tags_data = self._create_data_for_entities(entity_tags)
            if entity_tags_data is not None:
                model_data.add_data(entity_tags_data)

        model_data.add_data(attribute_data)
        for text_attribute in (TEXT, ACTION_TEXT):
            model_data.add_lengths(
                text_attribute, SEQUENCE_LENGTH, text_attribute, SEQUENCE
            )

        # The dialogue lengths are read off the mask of the first attribute
        # present: one length per entry of that mask array.
        first_attribute = next(iter(attribute_data.keys()))
        masks = model_data.data[first_attribute][MASK][0]
        dialogue_lengths = np.array(
            [np.size(np.squeeze(mask, -1)) for mask in masks]
        )
        model_data.data[DIALOGUE][LENGTH] = [
            FeatureArray(dialogue_lengths, number_of_dimensions=1)
        ]

        # make sure all keys are in the same order during training and prediction
        model_data.sort()

        return model_data
예제 #8
0
async def model_data() -> RasaModelData:
    """Build a small `RasaModelData` instance with five examples for tests.

    The examples deliberately have different sequence lengths, so most of the
    arrays below are ragged. Ragged inputs are passed to `np.array` with an
    explicit `dtype=object`: NumPy deprecated the implicit creation of object
    arrays from ragged sequences and newer releases raise a `ValueError`
    instead. The resulting arrays are the same one-dimensional object arrays
    that older NumPy versions produced implicitly.

    Returns:
        Model data with `text`, `action_text`, `dialogue`, `label` and
        `entities` entries, using `label` / `ids` as label key and sub-key.
    """

    def dense(num_rows):
        # random dense features with 14 columns
        return np.random.rand(num_rows, 14)

    def sparse(num_rows):
        # random sparse features with 10 columns
        return scipy.sparse.csr_matrix(
            np.random.randint(5, size=(num_rows, 10)))

    def ragged(items):
        # explicit object array; required for inputs of differing lengths
        return np.array(items, dtype=object)

    return RasaModelData(
        label_key="label",
        label_sub_key="ids",
        data={
            "text": {
                "sentence": [
                    FeatureArray(
                        ragged([
                            dense(5),
                            dense(2),
                            dense(3),
                            dense(1),
                            dense(3),
                        ]),
                        number_of_dimensions=3,
                    ),
                    FeatureArray(
                        # a flat list of sparse matrices is left exactly as
                        # it was; only the ragged constructions are changed
                        np.array([
                            sparse(5),
                            sparse(2),
                            sparse(3),
                            sparse(1),
                            sparse(3),
                        ]),
                        number_of_dimensions=3,
                    ),
                ]
            },
            "action_text": {
                "sequence": [
                    FeatureArray(
                        ragged([
                            [sparse(5), sparse(2), sparse(3), sparse(1),
                             sparse(3)],
                            [sparse(5), sparse(2)],
                            [sparse(5), sparse(1), sparse(3)],
                            [sparse(3)],
                            [sparse(3), sparse(1), sparse(7)],
                        ]),
                        number_of_dimensions=4,
                    ),
                    FeatureArray(
                        ragged([
                            [dense(5), dense(2), dense(3), dense(1),
                             dense(3)],
                            [dense(5), dense(2)],
                            [dense(5), dense(1), dense(3)],
                            [dense(3)],
                            [dense(3), dense(1), dense(7)],
                        ]),
                        number_of_dimensions=4,
                    ),
                ]
            },
            "dialogue": {
                "sentence": [
                    FeatureArray(
                        ragged([
                            np.random.randint(2, size=(5, 10)),
                            np.random.randint(2, size=(2, 10)),
                            np.random.randint(2, size=(3, 10)),
                            np.random.randint(2, size=(1, 10)),
                            np.random.randint(2, size=(3, 10)),
                        ]),
                        number_of_dimensions=3,
                    )
                ]
            },
            "label": {
                "ids": [
                    FeatureArray(np.array([0, 1, 0, 1, 1]),
                                 number_of_dimensions=1)
                ]
            },
            "entities": {
                "tag_ids": [
                    FeatureArray(
                        ragged([
                            np.array([[0], [1], [1], [0], [2]]),
                            np.array([[2], [0]]),
                            np.array([[0], [1], [1]]),
                            np.array([[0], [1]]),
                            np.array([[0], [0], [0]]),
                        ]),
                        number_of_dimensions=3,
                    )
                ]
            },
        },
    )