def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Builds the label-side model data for all intent labels.

    The returned model data should hold the keys `label_intent` and `label`:
    `label_intent` carries the sequence, sentence and mask features of all
    intent labels, while `label` carries the numerical label ids.

    Args:
        attribute_data: Feature data for all intent labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    intent_label_key = f"{LABEL}_{INTENT}"

    assembled = RasaModelData()
    assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    assembled.add_lengths(
        intent_label_key, SEQUENCE_LENGTH, intent_label_key, SEQUENCE
    )

    # One id per intent of the domain, stored as a column of shape (n, 1).
    intent_id_column = np.expand_dims(np.arange(len(domain.intents)), -1)
    assembled.add_features(
        LABEL_KEY,
        LABEL_SUB_KEY,
        [FeatureArray(intent_id_column, number_of_dimensions=2)],
    )
    return assembled
def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray:
    """Zero-pad dense examples of different lengths into a single array.

    Sequential data is padded with zeros, which are appended after the real
    data of each example. The result is always cast to `float32`.

    Args:
        array_of_dense: The array to pad.

    Returns:
        The padded array.
    """
    # 4D data has its own dedicated padding routine.
    if array_of_dense.number_of_dimensions == 4:
        return RasaDataGenerator._pad_4d_dense_data(array_of_dense)

    first_example = array_of_dense[0]
    if first_example.ndim < 2:
        # data doesn't contain a sequence
        return array_of_dense.astype(np.float32)

    number_of_examples = len(array_of_dense)
    longest_sequence = max([example.shape[0] for example in array_of_dense])

    padded = np.zeros(
        [number_of_examples, longest_sequence, first_example.shape[-1]],
        dtype=first_example.dtype,
    )
    for row in range(number_of_examples):
        example = array_of_dense[row]
        # Only the leading rows are overwritten; the tail stays zero.
        padded[row, : example.shape[0], :] = example

    return padded.astype(np.float32)
def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Builds the label-side model data for all action labels.

    The returned model data can contain one or both of the keys
    `label_action_name` and `label_action_text`, and will always contain the
    `label` key. `label_action_*` carries the sequence, sentence and mask
    features of the corresponding labels, while `label` carries the numerical
    label ids.

    Args:
        attribute_data: Feature data for all labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    action_text_label_key = f"{LABEL}_{ACTION_TEXT}"

    assembled = RasaModelData()
    assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    assembled.add_lengths(
        action_text_label_key, SEQUENCE_LENGTH, action_text_label_key, SEQUENCE
    )

    # One id per action of the domain, stored as a column of shape (n, 1).
    action_id_column = np.expand_dims(np.arange(domain.num_actions), -1)
    assembled.add_features(
        LABEL_KEY,
        LABEL_SUB_KEY,
        [FeatureArray(action_id_column, number_of_dimensions=2)],
    )
    return assembled
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """Checks the shapes of the outputs `run_inference` returns for batches."""

    def _batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # Echo the first input back and add a second output whose content
        # does not depend on the batch that was passed in.
        return {
            "dummy_output": batch_in[0],
            "non_input_affected_output": tf.constant(
                np.array([[1, 2]]), dtype=tf.int32
            ),
        }

    model = RasaModel()
    # Monkeypatch batch predict so that run_inference interface can be tested
    model.batch_predict = _batch_predict

    # Create dummy model data to pass to model
    sentence_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [sentence_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # Firstly, the number of data points in dummy_output should be equal
    # to the number of data points sent as input.
    assert output["dummy_output"].shape[0] == number_of_data_points

    # Secondly, the input-independent output should contain one row per batch
    # passed to the model, because every batch passed as input contributes
    # exactly one such entry.
    expected_shape = (expected_number_of_batch_iterations, 2)
    assert output["non_input_affected_output"].shape == expected_shape
def _feature_arrays_for_attribute(
    attribute: Text,
    absent_features: List[Any],
    attribute_to_features: Dict[Text, List[List[List["Features"]]]],
    training: bool,
    fake_features: Dict[Text, List["Features"]],
    consider_dialogue_dimension: bool,
) -> Dict[Text, List[FeatureArray]]:
    """Build the feature arrays of one attribute from the features of all examples.

    Args:
        attribute: the attribute of Message to be featurized
        absent_features: list of Nones, used as features if
            `attribute_to_features` does not contain the `attribute`
        attribute_to_features: features for every example
        training: boolean indicating whether we are currently in training or not
        fake_features: zero features; the entry for `attribute` is
            (re)computed and stored in this dict when `training` is set
        consider_dialogue_dimension: If set to false the dialogue dimension will
            be removed from the resulting sequence features.

    Returns:
        A dictionary of feature type to actual features for the given attribute.
    """

    def _as_feature_array(values: Any) -> FeatureArray:
        # With the dialogue dimension the values are kept as they are (4D);
        # without it only the first entry of every example is used (3D).
        if consider_dialogue_dimension:
            return FeatureArray(np.array(values), number_of_dimensions=4)
        return FeatureArray(
            np.array([value[0] for value in values]), number_of_dimensions=3
        )

    features = attribute_to_features.get(attribute, absent_features)

    # in case some features for a specific attribute are
    # missing, replace them with a feature vector of zeros
    if training:
        fake_features[attribute] = _create_fake_features(features)

    attribute_masks, extracted_dense, extracted_sparse = _extract_features(
        features, fake_features[attribute], attribute
    )

    sparse_features = {
        key: _as_feature_array(values) for key, values in extracted_sparse.items()
    }
    dense_features = {
        key: _as_feature_array(values) for key, values in extracted_dense.items()
    }

    attribute_to_feature_arrays = {
        MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=3)]
    }

    feature_types = set()
    for features_by_type in (dense_features, sparse_features):
        feature_types.update(features_by_type.keys())

    for feature_type in feature_types:
        # Sparse features always come before dense ones for a feature type.
        attribute_to_feature_arrays[feature_type] = [
            features_by_type[feature_type]
            for features_by_type in (sparse_features, dense_features)
            if feature_type in features_by_type
        ]

    return attribute_to_feature_arrays
assert len(data_generator) == len(expected_batch_sizes) for i in range(len(data_generator)): batch, _ = next(iterator) assert len(batch) == 11 assert len(batch[0]) == expected_batch_sizes[i] with pytest.raises(StopIteration): next(iterator) @pytest.mark.parametrize( "incoming_data, expected_shape", [ (FeatureArray(np.random.rand(7, 12), number_of_dimensions=2), (7, 12)), (FeatureArray(np.random.rand(7), number_of_dimensions=1), (7, )), ( FeatureArray( np.array([ np.random.rand(1, 10), np.random.rand(3, 10), np.random.rand(7, 10), np.random.rand(1, 10), ]), number_of_dimensions=3, ), (4, 7, 10), ), ( FeatureArray(
def _create_model_data(
    self,
    tracker_state_features: List[List[Dict[Text, List["Features"]]]],
    label_ids: Optional[np.ndarray] = None,
    entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
    encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    When both `label_ids` and `encoded_all_labels` are given, the label ids
    and entity data are added as well and `self.fake_features` is refreshed.
    Otherwise the previously stored `self.fake_features` are reused.

    Args:
        tracker_state_features: per tracker and dialogue turn, a mapping of
            attributes (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES,
            SLOTS, ACTIVE_LOOP) to a list of features
        label_ids: the label ids (e.g. action ids) for every dialogue turn in
            all training trackers
        entity_tags: per tracker and dialogue turn, a mapping of entity type
            (ENTITY_TAGS) to a list of features containing entity tag ids for
            text user inputs, otherwise an empty dict
        encoded_all_labels: a list of dictionaries containing attribute
            features for label ids

    Returns:
        RasaModelData
    """
    model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

    has_labels = label_ids is not None and encoded_all_labels is not None
    if not has_labels:
        # method is called during prediction
        attribute_data, _ = convert_to_data_format(
            tracker_state_features,
            self.fake_features,
            featurizers=self.config[FEATURIZERS],
        )
    else:
        # Give every label id its own trailing axis before storing it.
        expanded_label_ids = np.array(
            [np.expand_dims(sequence_ids, -1) for sequence_ids in label_ids]
        )
        model_data.add_features(
            LABEL_KEY,
            LABEL_SUB_KEY,
            [FeatureArray(expanded_label_ids, number_of_dimensions=3)],
        )

        attribute_data, self.fake_features = convert_to_data_format(
            tracker_state_features, featurizers=self.config[FEATURIZERS]
        )

        entity_tags_data = self._create_data_for_entities(entity_tags)
        if entity_tags_data is not None:
            model_data.add_data(entity_tags_data)

    model_data.add_data(attribute_data)
    for text_attribute in (TEXT, ACTION_TEXT):
        model_data.add_lengths(
            text_attribute, SEQUENCE_LENGTH, text_attribute, SEQUENCE
        )

    # add the dialogue lengths, derived from the mask of the first attribute
    first_attribute = next(iter(attribute_data))
    attribute_masks = model_data.data[first_attribute][MASK][0]
    dialogue_lengths = np.array(
        [np.size(np.squeeze(mask, -1)) for mask in attribute_masks]
    )
    model_data.data[DIALOGUE][LENGTH] = [
        FeatureArray(dialogue_lengths, number_of_dimensions=1)
    ]

    # make sure all keys are in the same order during training and prediction
    model_data.sort()

    return model_data
async def model_data() -> RasaModelData:
    """Create a `RasaModelData` instance filled with random example features.

    The data holds five examples with the keys `text`, `action_text`,
    `dialogue`, `label` and `entities`. Dense features have 14 columns and
    sparse features have 10 columns; the number of rows varies per example.
    """

    def _dense(rows: int) -> np.ndarray:
        return np.random.rand(rows, 14)

    def _sparse(rows: int) -> scipy.sparse.csr_matrix:
        return scipy.sparse.csr_matrix(np.random.randint(5, size=(rows, 10)))

    # Number of rows of every example.
    rows_per_example = (5, 2, 3, 1, 3)
    # Number of rows of every dialogue turn, grouped by example.
    rows_per_turn = ((5, 2, 3, 1, 3), (5, 2), (5, 1, 3), (3,), (3, 1, 7))

    # The arrays are created in a fixed order (text, action_text, dialogue)
    # so that the random number generator is always consumed the same way.
    text_dense = FeatureArray(
        np.array([_dense(rows) for rows in rows_per_example]),
        number_of_dimensions=3,
    )
    text_sparse = FeatureArray(
        np.array([_sparse(rows) for rows in rows_per_example]),
        number_of_dimensions=3,
    )

    action_text_sparse = FeatureArray(
        np.array([[_sparse(rows) for rows in turns] for turns in rows_per_turn]),
        number_of_dimensions=4,
    )
    action_text_dense = FeatureArray(
        np.array([[_dense(rows) for rows in turns] for turns in rows_per_turn]),
        number_of_dimensions=4,
    )

    dialogue_features = FeatureArray(
        np.array(
            [np.random.randint(2, size=(rows, 10)) for rows in rows_per_example]
        ),
        number_of_dimensions=3,
    )

    label_ids = FeatureArray(np.array([0, 1, 0, 1, 1]), number_of_dimensions=1)

    entity_tag_ids = FeatureArray(
        np.array(
            [
                np.array([[0], [1], [1], [0], [2]]),
                np.array([[2], [0]]),
                np.array([[0], [1], [1]]),
                np.array([[0], [1]]),
                np.array([[0], [0], [0]]),
            ]
        ),
        number_of_dimensions=3,
    )

    return RasaModelData(
        label_key="label",
        label_sub_key="ids",
        data={
            "text": {"sentence": [text_dense, text_sparse]},
            "action_text": {"sequence": [action_text_sparse, action_text_dense]},
            "dialogue": {"sentence": [dialogue_features]},
            "label": {"ids": [label_ids]},
            "entities": {"tag_ids": [entity_tag_ids]},
        },
    )