def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Construct label-side model data from intent label features.

    The returned model data holds the sequence, sentence and mask features
    of all intent labels under the `label_intent` key and the numerical
    label ids under the `label` key.

    Args:
        attribute_data: Feature data for all intent labels.
        domain: Domain of the assistant.

    Returns:
        Label features ready to be fed to the model.
    """
    intent_ids = np.arange(len(domain.intents))
    id_features = FeatureArray(
        np.expand_dims(intent_ids, -1), number_of_dimensions=2
    )

    labels = RasaModelData()
    labels.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    label_intent_key = f"{LABEL}_{INTENT}"
    labels.add_lengths(
        label_intent_key, SEQUENCE_LENGTH, label_intent_key, SEQUENCE
    )
    labels.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])
    return labels
def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Construct label-side model data from action label features.

    The result can contain one or both of the keys `label_action_name` and
    `label_action_text` (sequence, sentence and mask features of the
    corresponding labels) but always contains the `label` key with the
    numerical label ids.

    Args:
        attribute_data: Feature data for all labels.
        domain: Domain of the assistant.

    Returns:
        Label features ready to be fed to the model.
    """
    action_ids = np.arange(domain.num_actions)
    id_features = FeatureArray(
        np.expand_dims(action_ids, -1), number_of_dimensions=2
    )

    labels = RasaModelData()
    labels.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    action_text_key = f"{LABEL}_{ACTION_TEXT}"
    labels.add_lengths(
        action_text_key, SEQUENCE_LENGTH, action_text_key, SEQUENCE
    )
    labels.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_features])
    return labels
def _create_label_data(self, domain: Domain) -> RasaModelData:
    """Encode all action labels with the policy featurizer's state featurizer.

    Args:
        domain: Domain of the assistant.

    Returns:
        Model data holding the float32 features of every action label.
    """
    encoded = self.featurizer.state_featurizer.create_encoded_all_actions(
        domain
    ).astype(np.float32)

    label_data = RasaModelData()
    label_data.add_features(LABEL_FEATURES, [encoded])
    return label_data
def _create_label_data(
    self, domain: Domain, interpreter: NaturalLanguageInterpreter
) -> Tuple[RasaModelData, List[Dict[Text, List["Features"]]]]:
    """Featurize all action labels with the policy's state featurizer.

    Args:
        domain: Domain of the assistant.
        interpreter: Interpreter used to encode the action labels.

    Returns:
        Model data with the label features plus the raw encoded labels.
    """
    encoded_all_labels = self.featurizer.state_featurizer.encode_all_actions(
        domain, interpreter
    )
    attribute_data, _ = convert_to_data_format(encoded_all_labels)

    label_data = RasaModelData()
    label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")

    ids = np.arange(domain.num_actions)
    label_data.add_features(
        LABEL_KEY, LABEL_SUB_KEY, [np.expand_dims(ids, -1)]
    )
    return label_data, encoded_all_labels
def _create_model_data(
    self,
    tracker_state_features: List[List[Dict[Text, List["Features"]]]],
    label_ids: Optional[np.ndarray] = None,
    encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    Args:
        tracker_state_features: a dictionary of attributes (INTENT, TEXT,
            ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) to a list
            of features for all dialogue turns in all training trackers
        label_ids: the label ids (e.g. action ids) for every dialogue turn
            in all training trackers
        encoded_all_labels: a list of dictionaries containing attribute
            features for labels ids

    Returns:
        RasaModelData
    """
    model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

    if label_ids is not None and encoded_all_labels is not None:
        # training: add label ids with an explicit trailing dimension so
        # dynamic sequences are tracked correctly, and remember the zero
        # state features for use during prediction
        label_ids = np.array(
            [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids]
        )
        model_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [label_ids])

        attribute_data, self.zero_state_features = convert_to_data_format(
            tracker_state_features
        )
    else:
        # method is called during prediction
        attribute_data, _ = convert_to_data_format(
            tracker_state_features, self.zero_state_features
        )

    model_data.add_data(attribute_data)
    # any attribute key works for the dialogue lengths; take the first one
    # directly instead of materializing the whole key list
    model_data.add_lengths(DIALOGUE, LENGTH, next(iter(attribute_data)), MASK)

    return model_data
def _create_label_data(
    self,
    training_data: TrainingData,
    label_id_dict: Dict[Text, int],
    attribute: Text,
) -> RasaModelData:
    """Create matrix with label_ids encoded in rows as bag of words.

    One representative training example is looked up per label. If the
    examples already carry precomputed features for `attribute`, those are
    used; otherwise a one hot encoding of the label serves as the feature
    vector.
    """
    # one (label_idx, example) pair per label, ordered by label index
    indexed_examples = sorted(
        (
            (
                idx,
                self._find_example_for_label(
                    name, training_data.intent_examples, attribute
                ),
            )
            for name, idx in label_id_dict.items()
        ),
        key=lambda pair: pair[0],
    )
    examples = [example for _, example in indexed_examples]

    if self._check_labels_features_exist(examples, attribute):
        features = self._extract_labels_precomputed_features(examples, attribute)
    else:
        features = self._compute_default_label_features(examples)

    label_data = RasaModelData()
    label_data.add_features(LABEL_FEATURES, features)

    ids = np.array([idx for idx, _ in indexed_examples])
    # explicitly add last dimension to label_ids
    # to track correctly dynamic sequences
    label_data.add_features(LABEL_IDS, [np.expand_dims(ids, -1)])
    label_data.add_mask(LABEL_MASK, LABEL_FEATURES)

    return label_data
def _create_model_data(
    self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
) -> RasaModelData:
    """Combine all model related data into RasaModelData."""
    if data_Y is None:
        # prediction time: no labels available
        label_ids = np.array([])
        Y = np.array([])
    else:
        label_ids = self._label_ids_for_Y(data_Y)
        Y = self._label_features_for_Y(label_ids)
        # explicitly add last dimension to label_ids
        # to track correctly dynamic sequences
        label_ids = np.expand_dims(label_ids, -1)

    model_data = RasaModelData(label_key=LABEL_IDS)
    model_data.add_features(DIALOGUE_FEATURES, [data_X])
    model_data.add_features(LABEL_FEATURES, [Y])
    model_data.add_features(LABEL_IDS, [label_ids])

    return model_data
def _create_model_data(
    self,
    training_data: List[Message],
    label_id_dict: Optional[Dict[Text, int]] = None,
    tag_id_dict: Optional[Dict[Text, int]] = None,
    label_attribute: Optional[Text] = None,
) -> RasaModelData:
    """Prepare data for training and create a RasaModelData object"""
    # sparse/dense text features, sparse/dense label features,
    # numerical label ids and per-token entity tag ids
    X_sparse = []
    X_dense = []
    Y_sparse = []
    Y_dense = []
    label_ids = []
    tag_ids = []

    for e in training_data:
        # with no label_attribute (prediction) every message is featurized;
        # during training only messages that carry the label attribute are
        if label_attribute is None or e.get(label_attribute):
            _sparse, _dense = self._extract_features(e, TEXT)
            if _sparse is not None:
                X_sparse.append(_sparse)
            if _dense is not None:
                X_dense.append(_dense)

        if e.get(label_attribute):
            _sparse, _dense = self._extract_features(e, label_attribute)
            if _sparse is not None:
                Y_sparse.append(_sparse)
            if _dense is not None:
                Y_dense.append(_dense)

            if label_id_dict:
                label_ids.append(label_id_dict[e.get(label_attribute)])

        if self.component_config.get(ENTITY_RECOGNITION) and tag_id_dict:
            if self.component_config[BILOU_FLAG]:
                _tags = bilou_utils.tags_to_ids(e, tag_id_dict)
            else:
                # map each token to the id of its entity tag
                _tags = []
                for t in e.get(TOKENS_NAMES[TEXT]):
                    _tag = determine_token_labels(t, e.get(ENTITIES), None)
                    _tags.append(tag_id_dict[_tag])
            # transpose to have seq_len x 1
            tag_ids.append(np.array([_tags]).T)

    X_sparse = np.array(X_sparse)
    X_dense = np.array(X_dense)
    Y_sparse = np.array(Y_sparse)
    Y_dense = np.array(Y_dense)
    label_ids = np.array(label_ids)
    tag_ids = np.array(tag_ids)

    model_data = RasaModelData(label_key=self.label_key)
    model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense])
    model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense])
    if label_attribute and model_data.feature_not_exist(LABEL_FEATURES):
        # no label features are present, get default features from _label_data
        model_data.add_features(
            LABEL_FEATURES, self._use_default_label_features(label_ids)
        )

    # explicitly add last dimension to label_ids
    # to track correctly dynamic sequences
    model_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])
    model_data.add_features(TAG_IDS, [tag_ids])

    model_data.add_mask(TEXT_MASK, TEXT_FEATURES)
    model_data.add_mask(LABEL_MASK, LABEL_FEATURES)

    return model_data
def _create_model_data(
    self,
    tracker_state_features: List[List[Dict[Text, List["Features"]]]],
    label_ids: Optional[np.ndarray] = None,
    entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
    encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    Args:
        tracker_state_features: a dictionary of attributes (INTENT, TEXT,
            ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) to a list
            of features for all dialogue turns in all training trackers
        label_ids: the label ids (e.g. action ids) for every dialogue turn
            in all training trackers
        entity_tags: a dictionary of entity type (ENTITY_TAGS) to a list of
            features containing entity tag ids for text user inputs otherwise
            empty dict for all dialogue turns in all training trackers
        encoded_all_labels: a list of dictionaries containing attribute
            features for label ids

    Returns:
        RasaModelData
    """
    model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

    if label_ids is not None and encoded_all_labels is not None:
        # training: add the label ids (with an explicit trailing dimension
        # so dynamic sequences are tracked correctly) and remember the fake
        # features so prediction can pad unseen attributes consistently
        label_ids = np.array(
            [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids]
        )
        model_data.add_features(
            LABEL_KEY,
            LABEL_SUB_KEY,
            [FeatureArray(label_ids, number_of_dimensions=3)],
        )

        attribute_data, self.fake_features = convert_to_data_format(
            tracker_state_features, featurizers=self.config[FEATURIZERS]
        )

        entity_tags_data = self._create_data_for_entities(entity_tags)
        if entity_tags_data is not None:
            model_data.add_data(entity_tags_data)
    else:
        # method is called during prediction
        attribute_data, _ = convert_to_data_format(
            tracker_state_features,
            self.fake_features,
            featurizers=self.config[FEATURIZERS],
        )

    model_data.add_data(attribute_data)
    model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
    model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE)

    # add the dialogue lengths; any attribute present works since each one
    # carries a mask per dialogue turn — take the first key directly instead
    # of materializing the whole key list
    attribute_present = next(iter(attribute_data))
    dialogue_lengths = np.array(
        [
            np.size(np.squeeze(f, -1))
            for f in model_data.data[attribute_present][MASK][0]
        ]
    )
    model_data.data[DIALOGUE][LENGTH] = [
        FeatureArray(dialogue_lengths, number_of_dimensions=1)
    ]

    # make sure all keys are in the same order during training and prediction
    model_data.sort()

    return model_data