def load(cls, path: Text) -> "TEDPolicy":
    """Loads a policy from the storage.

    **Needs to load its featurizer**

    Args:
        path: Directory that holds the persisted policy files.

    Returns:
        The loaded policy. If no data example file is present in `path`, a
        policy carrying only the loaded featurizer is returned.

    Raises:
        Exception: If `path` does not exist.
    """
    if not os.path.exists(path):
        raise Exception(
            f"Failed to load TED policy model. Path "
            f"'{os.path.abspath(path)}' doesn't exist."
        )

    model_path = Path(path)

    def _stored_file(suffix: Text) -> Path:
        # all persisted artifacts share the same file name stem
        return model_path / f"{SAVE_MODEL_FILE_NAME}.{suffix}"

    tf_model_file = _stored_file("tf_model")
    featurizer = TrackerFeaturizer.load(path)

    data_example_file = _stored_file("data_example.pkl")
    if not data_example_file.is_file():
        # nothing was trained, only the featurizer can be restored
        return cls(featurizer=featurizer)

    loaded_data = io_utils.json_unpickle(data_example_file)
    label_data = io_utils.json_unpickle(_stored_file("label_data.pkl"))
    meta = io_utils.pickle_load(_stored_file("meta.pkl"))
    priority = io_utils.json_unpickle(_stored_file("priority.pkl"))

    model_data_example = RasaModelData(label_key=LABEL_IDS, data=loaded_data)
    meta = train_utils.update_similarity_type(meta)

    uses_max_history = isinstance(featurizer, MaxHistoryTrackerFeaturizer)
    model = TED.load(
        str(tf_model_file),
        model_data_example,
        data_signature=model_data_example.get_signature(),
        config=meta,
        max_history_tracker_featurizer_used=uses_max_history,
        label_data=label_data,
    )

    # build the graph for prediction
    dialogue_features = {}
    for feature_name, features in model_data_example.items():
        if DIALOGUE in feature_name:
            dialogue_features[feature_name] = features
    predict_data_example = RasaModelData(
        label_key=LABEL_IDS, data=dialogue_features
    )
    model.build_for_predict(predict_data_example)

    return cls(featurizer=featurizer, priority=priority, model=model, **meta)
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: The training data to convert.

    Returns:
        The model data built from the intent examples, or an empty
        `RasaModelData` when no label mapping could be created.
    """
    if self.retrieval_intent:
        training_data = training_data.filter_by_intent(self.retrieval_intent)

    label_to_index = self._label_id_index_mapping(training_data, attribute=RESPONSE)
    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=RESPONSE
    )

    prepared = self._create_model_data(
        training_data.intent_examples, label_to_index, label_attribute=RESPONSE
    )
    self._check_input_dimension_consistency(prepared)

    return prepared
def _assemble_label_data(self, attribute_data: Data, domain: Domain) -> RasaModelData:
    """Constructs data regarding labels to be fed to the model.

    The resultant model data can possibly contain one or both of the
    keys - [`label_action_name`, `label_action_text`] but will definitely
    contain the `label` key.
    `label_action_*` will contain the sequence, sentence and mask features
    for corresponding labels and `label` will contain the numerical label ids.

    Args:
        attribute_data: Feature data for all labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    action_text_key = f"{LABEL}_{ACTION_TEXT}"

    assembled = RasaModelData()
    assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    assembled.add_lengths(
        action_text_key, SEQUENCE_LENGTH, action_text_key, SEQUENCE
    )

    # one id per action of the domain, stored as a column
    id_column = np.expand_dims(np.arange(domain.num_actions), -1)
    assembled.add_features(
        LABEL_KEY,
        LABEL_SUB_KEY,
        [FeatureArray(id_column, number_of_dimensions=2)],
    )
    return assembled
def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Constructs data regarding labels to be fed to the model.

    The resultant model data should contain the keys `label_intent`, `label`.
    `label_intent` will contain the sequence, sentence and mask features
    for all intent labels and `label` will contain the numerical label ids.

    Args:
        attribute_data: Feature data for all intent labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    intent_key = f"{LABEL}_{INTENT}"

    assembled = RasaModelData()
    assembled.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    assembled.add_lengths(intent_key, SEQUENCE_LENGTH, intent_key, SEQUENCE)

    # one id per intent of the domain, stored as a column
    id_column = np.expand_dims(np.arange(len(domain.intents)), -1)
    assembled.add_features(
        LABEL_KEY,
        LABEL_SUB_KEY,
        [FeatureArray(id_column, number_of_dimensions=2)],
    )
    return assembled
def _load_model_class(
    cls,
    tf_model_file: Text,
    model_data_example: RasaModelData,
    label_data: RasaModelData,
    entity_tag_specs: List[EntityTagSpec],
    config: Dict[Text, Any],
    finetune_mode: bool = False,
) -> "RasaModel":
    """Loads the model selected by `config[USE_TEXT_AS_LABEL]`.

    Args:
        tf_model_file: Location of the stored model.
        model_data_example: Example of the data the model was trained with.
        label_data: Label data handed on to the model loader.
        entity_tag_specs: Entity tag specifications handed on to the loader.
        config: Model configuration; a deep copy is passed to the loader.
        finetune_mode: Forwarded unchanged to the loader.

    Returns:
        The model returned by the model class' `load`.
    """
    # at prediction time only features whose name contains TEXT are used
    text_only_features = {}
    for feature_name, features in model_data_example.items():
        if TEXT in feature_name:
            text_only_features[feature_name] = features
    predict_data_example = RasaModelData(
        label_key=model_data_example.label_key, data=text_only_features
    )

    model_class = cls.model_class(config[USE_TEXT_AS_LABEL])
    return model_class.load(
        tf_model_file,
        model_data_example,
        predict_data_example,
        data_signature=model_data_example.get_signature(),
        label_data=label_data,
        entity_tag_specs=entity_tag_specs,
        config=copy.deepcopy(config),
        finetune_mode=finetune_mode,
    )
def _create_label_data(self, domain: Domain) -> RasaModelData:
    """Encodes all actions of the domain and stores them as label features.

    Args:
        domain: Domain of the assistant.

    Returns:
        Model data holding the encoded actions under `LABEL_FEATURES`.
    """
    # encode all label_ids with policies' featurizer
    encoded_actions = self.featurizer.state_featurizer.create_encoded_all_actions(
        domain
    )

    label_data = RasaModelData()
    label_data.add_features(LABEL_FEATURES, [encoded_actions.astype(np.float32)])
    return label_data
def _construct_model_initialization_data(
    cls, loaded_data: Dict[Text, Dict[Text, List[FeatureArray]]]
) -> Tuple[RasaModelData, RasaModelData]:
    """Builds the data examples needed to initialize a loaded model.

    Args:
        loaded_data: The data example that was stored with the model.

    Returns:
        A tuple of the full model data example and the reduced example that
        only contains the features listed in `PREDICTION_FEATURES`.
    """
    model_data_example = RasaModelData(
        label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
    )

    # we need to remove label features for prediction if they are present,
    # hence only features listed in PREDICTION_FEATURES are kept
    prediction_features = {
        feature_name: features
        for feature_name, features in model_data_example.items()
        if feature_name in PREDICTION_FEATURES
    }
    predict_data_example = RasaModelData(
        label_key=LABEL_KEY,
        label_sub_key=LABEL_SUB_KEY,
        data=prediction_features,
    )

    return model_data_example, predict_data_example
def test_not_balance_model_data(model_data: RasaModelData):
    """Checks that `_balanced_data` leaves the entity tag ids as they were."""
    label_key, label_sub_key = "entities", "tag_ids"
    test_model_data = RasaModelData(
        label_key=label_key, label_sub_key=label_sub_key, data=model_data.data
    )

    balanced = test_model_data._balanced_data(test_model_data.data, 2, False)
    expected = test_model_data.get(label_key, label_sub_key)

    assert np.all(balanced[label_key][label_sub_key] == expected)
async def model_data() -> RasaModelData:
    """Creates randomly filled model data for five examples.

    The examples have 5, 2, 3, 1 and 3 rows respectively. Because the row
    counts differ, the per-example arrays are collected in object arrays.

    Returns:
        Model data with text features (dense and sparse), intent features,
        intent ids and entity tag ids.
    """
    example_lengths = (5, 2, 3, 1, 3)

    def _ragged(items: list) -> np.ndarray:
        # The per-example arrays have different first dimensions. NumPy only
        # builds such ragged arrays when `dtype=object` is given explicitly
        # (implicit creation is deprecated since 1.19 and an error since 1.24).
        return np.array(items, dtype=object)

    # NOTE: the random calls happen in the same order as the keys below, so
    # seeding the generator yields the same values as before.
    return RasaModelData(
        label_key="intent",
        label_sub_key="ids",
        data={
            "text_features": {
                "sentence": [
                    _ragged([np.random.rand(rows, 14) for rows in example_lengths]),
                    _ragged(
                        [
                            scipy.sparse.csr_matrix(
                                np.random.randint(5, size=(rows, 10))
                            )
                            for rows in example_lengths
                        ]
                    ),
                ]
            },
            "intent_features": {
                "sentence": [
                    _ragged(
                        [
                            np.random.randint(2, size=(rows, 10))
                            for rows in example_lengths
                        ]
                    )
                ]
            },
            "intent": {"ids": [np.array([0, 1, 0, 1, 1])]},
            "entities": {
                "tag_ids": [
                    _ragged(
                        [
                            np.array([[0], [1], [1], [0], [2]]),
                            np.array([[2], [0]]),
                            np.array([[0], [1], [1]]),
                            np.array([[0], [1]]),
                            np.array([[0], [0], [0]]),
                        ]
                    )
                ]
            },
        },
    )
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """Checks how `run_inference` combines the outputs of the batches."""
    model = RasaModel()

    def _batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # the first output mirrors the input, the second one ignores it
        return {
            "dummy_output": batch_in[0],
            "non_input_affected_output": tf.constant(
                np.array([[1, 2]]), dtype=tf.int32
            ),
        }

    # Monkeypatch batch predict so that run_inference interface can be tested
    model.batch_predict = _batch_predict

    # Create dummy model data to pass to model
    random_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [random_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # Firstly, the number of data points in dummy_output should be equal
    # to the number of data points sent as input.
    assert output["dummy_output"].shape[0] == number_of_data_points

    # Secondly, the number of data points inside diagnostic_data should be
    # equal to the number of batches passed to the model because for every
    # batch passed as input, it would have created a
    # corresponding diagnostic data entry.
    expected_shape = (expected_number_of_batch_iterations, 2)
    assert output["non_input_affected_output"].shape == expected_shape
def _load_model(
    cls,
    index_label_id_mapping: Dict[int, Text],
    index_tag_id_mapping: Dict[int, Text],
    label_data: RasaModelData,
    meta: Dict[Text, Any],
    data_example: Dict[Text, List[np.ndarray]],
    model_dir: Text,
):
    """Loads the stored model and builds its graph for prediction.

    Args:
        index_label_id_mapping: Mapping handed on to the model loader.
        index_tag_id_mapping: Mapping handed on to the model loader.
        label_data: Label data handed on to the model loader.
        meta: Component meta data; its "file" entry names the model file and
            the whole dict is used as model config.
        data_example: Example of the data the model was trained with.
        model_dir: Directory that contains the model file.

    Returns:
        The loaded model, prepared for prediction.
    """
    tf_model_file = os.path.join(model_dir, meta.get("file") + ".tf_model")

    # labels are only part of the data if intent classification is enabled
    label_key = None
    if meta[INTENT_CLASSIFICATION]:
        label_key = LABEL_IDS

    model_data_example = RasaModelData(label_key=label_key, data=data_example)

    model_class = cls.model_class(meta)
    model = model_class.load(
        tf_model_file,
        model_data_example,
        data_signature=model_data_example.get_signature(),
        label_data=label_data,
        index_label_id_mapping=index_label_id_mapping,
        index_tag_id_mapping=index_tag_id_mapping,
        config=meta,
    )

    # build the graph for prediction
    text_features = {}
    for feature_name, features in model_data_example.items():
        if TEXT in feature_name:
            text_features[feature_name] = features
    predict_data_example = RasaModelData(label_key=label_key, data=text_features)
    model.build_for_predict(predict_data_example)

    return model
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: training data to be preprocessed.

    Returns:
        The model data built from the intent examples, or an empty
        `RasaModelData` when no label mapping could be created.
    """
    # Collect all retrieval intents present in the data before filtering
    self.all_retrieval_intents = list(training_data.retrieval_intents)

    if not self.retrieval_intent:
        # retrieval intent was left to its default value
        logger.info(
            "Retrieval intent parameter was left to its default value. This "
            "response selector will be trained on training examples combining "
            "all retrieval intents."
        )
    else:

        def _has_retrieval_intent(ex) -> bool:
            return self.retrieval_intent == ex.get(INTENT)

        training_data = training_data.filter_training_examples(
            _has_retrieval_intent
        )

    if self.use_text_as_label:
        label_attribute = RESPONSE
    else:
        label_attribute = INTENT_RESPONSE_KEY

    label_to_index = self._label_id_index_mapping(
        training_data, attribute=label_attribute
    )

    # responses are stored even if there turns out to be nothing to train on
    self.responses = training_data.responses

    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=label_attribute
    )

    prepared = self._create_model_data(
        training_data.intent_examples,
        label_to_index,
        label_attribute=label_attribute,
    )
    self._check_input_dimension_consistency(prepared)

    return prepared
def _prepare_for_training(
    self,
    training_trackers: List[TrackerWithCachedStates],
    domain: Domain,
    interpreter: NaturalLanguageInterpreter,
    **kwargs: Any,
) -> Tuple[RasaModelData, np.ndarray]:
    """Prepares data to be fed into the model.

    Besides returning the data, this stores the label data, optionally the
    entity tag specs, and one data example on the instance.

    Args:
        training_trackers: List of training trackers to be featurized.
        domain: Domain of the assistant.
        interpreter: NLU interpreter to be used for featurizing states.
        **kwargs: Any other arguments.

    Returns:
        Featurized data to be fed to the model and corresponding label ids.
    """
    trackers = self._get_trackers_for_training(training_trackers)

    # dealing with training data
    featurized = self._featurize_for_training(
        trackers,
        domain,
        interpreter,
        bilou_tagging=self.config[BILOU_FLAG],
        **kwargs,
    )
    tracker_state_features, label_ids, entity_tags = featurized

    if not tracker_state_features:
        # nothing was featurized, so there is nothing to train on
        return RasaModelData(), label_ids

    label_data, encoded_all_labels = self._create_label_data(domain, interpreter)
    self._label_data = label_data

    # extract actual training data to feed to model
    model_data = self._create_model_data(
        tracker_state_features, label_ids, entity_tags, encoded_all_labels
    )

    if self.config[ENTITY_RECOGNITION]:
        self._entity_tag_specs = self.featurizer.state_featurizer.entity_tag_specs

    # keep one example for persisting and loading
    self.data_example = model_data.first_data_example()

    return model_data, label_ids
def _create_label_data(
    self, domain: Domain, interpreter: NaturalLanguageInterpreter
) -> Tuple[RasaModelData, List[Dict[Text, List["Features"]]]]:
    """Encodes all actions of the domain and collects them as label data.

    Args:
        domain: Domain of the assistant.
        interpreter: NLU interpreter passed to the state featurizer.

    Returns:
        The label data and the encoded actions it was created from.
    """
    # encode all label_ids with policies' featurizer
    encoded_all_labels = self.featurizer.state_featurizer.encode_all_actions(
        domain, interpreter
    )
    attribute_data, _ = convert_to_data_format(encoded_all_labels)

    label_data = RasaModelData()
    label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")

    # one id per action of the domain, stored as a column
    id_column = np.expand_dims(np.arange(domain.num_actions), -1)
    label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [id_column])

    return label_data, encoded_all_labels
def _compile_and_fit(
    self, data_example: Dict[Text, Dict[Text, List[FeatureArray]]]
) -> None:
    """Compiles modified model and fits a sample data on it.

    Args:
        data_example: a data example that is stored with the ML component.
    """
    optimizer = tf.keras.optimizers.Adam(self.config[LEARNING_RATE])
    self.compile(optimizer=optimizer)

    # label keys are only set when intent classification is enabled
    if self.config[INTENT_CLASSIFICATION]:
        label_key, label_sub_key = LABEL_KEY, LABEL_SUB_KEY
    else:
        label_key, label_sub_key = None, None

    model_data = RasaModelData(
        label_key=label_key, label_sub_key=label_sub_key, data=data_example
    )
    self._update_data_signatures(model_data)

    # fit on the single stored example, one example per batch
    self.fit(RasaBatchDataGenerator(model_data, batch_size=1), verbose=False)
def _load_model_utilities(cls, model_path: Path) -> Dict[Text, Any]:
    """Loads model's utility attributes.

    Args:
        model_path: Directory the model files are read from.

    Returns:
        A dictionary with the keys `tf_model_file`, `loaded_data`,
        `fake_features`, `label_data`, `meta`, `priority` and
        `entity_tag_specs`.
    """

    def _stored_file(suffix: Text) -> Path:
        # all artifacts share the file name stem given by the class
        return model_path / f"{cls._metadata_filename()}.{suffix}"

    tf_model_file = _stored_file("tf_model")

    # NOTE(review): `pickle_load` presumably unpickles these files - models
    # should only be loaded from trusted sources.
    loaded_data = io_utils.pickle_load(_stored_file("data_example.pkl"))
    raw_label_data = io_utils.pickle_load(_stored_file("label_data.pkl"))
    fake_features = io_utils.pickle_load(_stored_file("fake_features.pkl"))
    label_data = RasaModelData(data=raw_label_data)
    meta = io_utils.pickle_load(_stored_file("meta.pkl"))
    priority = io_utils.json_unpickle(_stored_file("priority.pkl"))

    stored_tag_specs = rasa.shared.utils.io.read_json_file(
        _stored_file("entity_tag_specs.json")
    )

    entity_tag_specs = []
    for tag_spec in stored_tag_specs:
        # ids are converted back to int after being read from JSON
        ids_to_tags = {
            int(key): value for key, value in tag_spec["ids_to_tags"].items()
        }
        tags_to_ids = {
            key: int(value) for key, value in tag_spec["tags_to_ids"].items()
        }
        entity_tag_specs.append(
            EntityTagSpec(
                tag_name=tag_spec["tag_name"],
                ids_to_tags=ids_to_tags,
                tags_to_ids=tags_to_ids,
                num_tags=tag_spec["num_tags"],
            )
        )

    return {
        "tf_model_file": tf_model_file,
        "loaded_data": loaded_data,
        "fake_features": fake_features,
        "label_data": label_data,
        "meta": meta,
        "priority": priority,
        "entity_tag_specs": entity_tag_specs,
    }
def _create_model_data(
    self,
    tracker_state_features: List[List[Dict[Text, List["Features"]]]],
    label_ids: Optional[np.ndarray] = None,
    encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    Args:
        tracker_state_features: a dictionary of attributes (INTENT, TEXT,
            ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) to a list
            of features for all dialogue turns in all training trackers
        label_ids: the label ids (e.g. action ids) for every dialogue turn in
            all training trackers
        encoded_all_labels: a list of dictionaries containing attribute
            features for labels ids

    Returns:
        RasaModelData
    """
    model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

    is_training = label_ids is not None and encoded_all_labels is not None
    if is_training:
        # give every label id sequence a trailing dimension
        expanded_label_ids = np.array(
            [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids]
        )
        model_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [expanded_label_ids])

        # the zero state features are kept so they can be reused at prediction
        attribute_data, self.zero_state_features = convert_to_data_format(
            tracker_state_features
        )
    else:
        # method is called during prediction
        attribute_data, _ = convert_to_data_format(
            tracker_state_features, self.zero_state_features
        )

    model_data.add_data(attribute_data)

    # dialogue lengths are taken from the mask of the first attribute
    first_attribute = next(iter(attribute_data))
    model_data.add_lengths(DIALOGUE, LENGTH, first_attribute, MASK)

    return model_data
def _create_label_data(
    self,
    training_data: TrainingData,
    label_id_dict: Dict[Text, int],
    attribute: Text,
) -> RasaModelData:
    """Create matrix with label_ids encoded in rows as bag of words.

    Find a training example for each label and get the encoded features
    from the corresponding Message object.
    If the features are already computed, fetch them from the message object
    else compute a one hot encoding for the label as the feature vector.

    Args:
        training_data: Training data whose intent examples are searched.
        label_id_dict: Mapping from label name to label index.
        attribute: Attribute of the examples that holds the label.

    Returns:
        Model data with label features, label ids and a label mask.
    """
    # Collect one example for each label
    indexed_examples = [
        (
            idx,
            self._find_example_for_label(
                label_name, training_data.intent_examples, attribute
            ),
        )
        for label_name, idx in label_id_dict.items()
    ]

    # Sort the list of tuples based on label_idx
    indexed_examples.sort(key=lambda pair: pair[0])
    examples = [example for _, example in indexed_examples]

    # Collect features, precomputed if they exist, else compute on the fly
    if not self._check_labels_features_exist(examples, attribute):
        features = self._compute_default_label_features(examples)
    else:
        features = self._extract_labels_precomputed_features(examples, attribute)

    label_data = RasaModelData()
    label_data.add_features(LABEL_FEATURES, features)

    sorted_ids = np.array([idx for idx, _ in indexed_examples])
    # explicitly add last dimension to label_ids
    # to track correctly dynamic sequences
    label_data.add_features(LABEL_IDS, [np.expand_dims(sorted_ids, -1)])

    label_data.add_mask(LABEL_MASK, LABEL_FEATURES)

    return label_data
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: The training data to convert.

    Returns:
        The model data built from the intent examples, or an empty
        `RasaModelData` when no label mapping could be created.
    """
    if not self.retrieval_intent:
        # retrieval intent was left to its default value
        logger.info(
            "Retrieval intent parameter was left to its default value. This "
            "response selector will be trained on training examples combining "
            "all retrieval intents."
        )
    else:
        training_data = training_data.filter_by_intent(self.retrieval_intent)

    label_to_index = self._label_id_index_mapping(training_data, attribute=RESPONSE)

    # the mapping is stored even if there turns out to be nothing to train on
    self.retrieval_intent_mapping = self._create_retrieval_intent_mapping(
        training_data
    )

    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=RESPONSE
    )

    prepared = self._create_model_data(
        training_data.intent_examples, label_to_index, label_attribute=RESPONSE
    )
    self._check_input_dimension_consistency(prepared)

    return prepared
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: The training data to convert.

    Returns:
        The model data built from the training examples, or an empty
        `RasaModelData` when no label mapping could be created.
    """
    config = self.component_config

    if config[BILOU_FLAG]:
        bilou_utils.apply_bilou_schema(training_data)

    label_to_index = self._label_id_index_mapping(training_data, attribute=INTENT)
    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=INTENT
    )

    tag_to_index = self._tag_id_index_mapping(training_data)
    self.index_tag_id_mapping = self._invert_mapping(tag_to_index)

    # without intent classification no label attribute is passed on
    label_attribute = None
    if config[INTENT_CLASSIFICATION]:
        label_attribute = INTENT

    prepared = self._create_model_data(
        training_data.training_examples,
        label_to_index,
        tag_to_index,
        label_attribute=label_attribute,
    )

    self.num_tags = len(self.index_tag_id_mapping)

    self._check_input_dimension_consistency(prepared)

    return prepared
def _prepare_data_for_prediction(model_data: RasaModelData) -> RasaModelData:
    """Transforms training model data to data usable for making model predictions.

    Transformation involves filtering out all features which
    are not useful at prediction time. This is important
    because the prediction signature will not contain these
    attributes and hence prediction will break.

    Args:
        model_data: Data used during model training.

    Returns:
        Transformed data usable for making predictions.
    """
    filtered_data: Dict[Text, Dict[Text, Any]] = {}
    for key, features in model_data.data.items():
        # keep only what is listed in PREDICTION_FEATURES
        if key in PREDICTION_FEATURES:
            filtered_data[key] = features

    return RasaModelData(data=filtered_data)
def _create_model_data(
    self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    Args:
        data_X: Stored as dialogue features.
        data_Y: Optional label data; when missing, empty arrays are stored
            for the label features and label ids.

    Returns:
        Model data with dialogue features, label features and label ids.
    """
    if data_Y is None:
        label_ids = np.array([])
        Y = np.array([])
    else:
        raw_label_ids = self._label_ids_for_Y(data_Y)
        Y = self._label_features_for_Y(raw_label_ids)
        # explicitly add last dimension to label_ids
        # to track correctly dynamic sequences
        label_ids = np.expand_dims(raw_label_ids, -1)

    model_data = RasaModelData(label_key=LABEL_IDS)
    for key, features in (
        (DIALOGUE_FEATURES, data_X),
        (LABEL_FEATURES, Y),
        (LABEL_IDS, label_ids),
    ):
        model_data.add_features(key, [features])

    return model_data
def test_not_balance_model_data(model_data: RasaModelData):
    """Checks that `_balanced_data` leaves the tag ids as they were."""
    label_key = "tag_ids"
    test_model_data = RasaModelData(label_key=label_key, data=model_data.data)

    balanced = test_model_data._balanced_data(test_model_data.data, 2, False)
    expected = test_model_data.get(label_key)

    assert np.all(balanced.get(label_key) == expected)
async def model_data() -> RasaModelData:
    """Creates randomly filled model data for five examples.

    The sentence-level features have 5, 2, 3, 1 and 3 rows per example; the
    `action_text` sequence features additionally hold a varying number of
    entries per example. Because of these differing sizes the arrays are
    created as object arrays.

    Returns:
        Model data with `text`, `action_text`, `dialogue`, `label` and
        `entities` features wrapped in `FeatureArray`s.
    """
    example_lengths = (5, 2, 3, 1, 3)
    nested_lengths = ((5, 2, 3, 1, 3), (5, 2), (5, 1, 3), (3,), (3, 1, 7))

    def _dense(rows: int) -> np.ndarray:
        return np.random.rand(rows, 14)

    def _sparse(rows: int) -> scipy.sparse.csr_matrix:
        return scipy.sparse.csr_matrix(np.random.randint(5, size=(rows, 10)))

    def _ragged(items: list) -> np.ndarray:
        # The entries differ in size. NumPy only builds such ragged arrays
        # when `dtype=object` is given explicitly (implicit creation is
        # deprecated since 1.19 and an error since 1.24).
        return np.array(items, dtype=object)

    # NOTE: the random calls happen in the same order as the keys below, so
    # seeding the generator yields the same values as before.
    return RasaModelData(
        label_key="label",
        label_sub_key="ids",
        data={
            "text": {
                "sentence": [
                    FeatureArray(
                        _ragged([_dense(rows) for rows in example_lengths]),
                        number_of_dimensions=3,
                    ),
                    FeatureArray(
                        _ragged([_sparse(rows) for rows in example_lengths]),
                        number_of_dimensions=3,
                    ),
                ]
            },
            "action_text": {
                "sequence": [
                    FeatureArray(
                        _ragged(
                            [
                                [_sparse(rows) for rows in lengths]
                                for lengths in nested_lengths
                            ]
                        ),
                        number_of_dimensions=4,
                    ),
                    FeatureArray(
                        _ragged(
                            [
                                [_dense(rows) for rows in lengths]
                                for lengths in nested_lengths
                            ]
                        ),
                        number_of_dimensions=4,
                    ),
                ]
            },
            "dialogue": {
                "sentence": [
                    FeatureArray(
                        _ragged(
                            [
                                np.random.randint(2, size=(rows, 10))
                                for rows in example_lengths
                            ]
                        ),
                        number_of_dimensions=3,
                    )
                ]
            },
            "label": {
                "ids": [
                    FeatureArray(np.array([0, 1, 0, 1, 1]), number_of_dimensions=1)
                ]
            },
            "entities": {
                "tag_ids": [
                    FeatureArray(
                        _ragged(
                            [
                                np.array([[0], [1], [1], [0], [2]]),
                                np.array([[2], [0]]),
                                np.array([[0], [1], [1]]),
                                np.array([[0], [1]]),
                                np.array([[0], [0], [0]]),
                            ]
                        ),
                        number_of_dimensions=3,
                    )
                ]
            },
        },
    )
def _create_model_data(
    self,
    tracker_state_features: List[List[Dict[Text, List["Features"]]]],
    label_ids: Optional[np.ndarray] = None,
    entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
    encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
) -> RasaModelData:
    """Combine all model related data into RasaModelData.

    Args:
        tracker_state_features: a dictionary of attributes
            (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS,
            ACTIVE_LOOP) to a list of features for all dialogue turns in
            all training trackers
        label_ids: the label ids (e.g. action ids) for every dialogue turn in
            all training trackers
        entity_tags: a dictionary of entity type (ENTITY_TAGS) to a list of
            features containing entity tag ids for text user inputs otherwise
            empty dict for all dialogue turns in all training trackers
        encoded_all_labels: a list of dictionaries containing attribute
            features for label ids

    Returns:
        RasaModelData
    """
    model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)

    is_training = label_ids is not None and encoded_all_labels is not None
    if is_training:
        # give every label id sequence a trailing dimension
        expanded_label_ids = np.array(
            [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids]
        )
        model_data.add_features(
            LABEL_KEY,
            LABEL_SUB_KEY,
            [FeatureArray(expanded_label_ids, number_of_dimensions=3)],
        )

        # the fake features are kept so they can be reused at prediction
        attribute_data, self.fake_features = convert_to_data_format(
            tracker_state_features, featurizers=self.config[FEATURIZERS]
        )

        entity_tags_data = self._create_data_for_entities(entity_tags)
        if entity_tags_data is not None:
            model_data.add_data(entity_tags_data)
    else:
        # method is called during prediction
        attribute_data, _ = convert_to_data_format(
            tracker_state_features,
            self.fake_features,
            featurizers=self.config[FEATURIZERS],
        )

    model_data.add_data(attribute_data)
    for text_attribute in (TEXT, ACTION_TEXT):
        model_data.add_lengths(
            text_attribute, SEQUENCE_LENGTH, text_attribute, SEQUENCE
        )

    # add the dialogue lengths
    first_attribute = next(iter(attribute_data))
    masks = model_data.data[first_attribute][MASK][0]
    dialogue_lengths = np.array(
        [np.size(np.squeeze(mask, -1)) for mask in masks]
    )
    model_data.data[DIALOGUE][LENGTH] = [
        FeatureArray(dialogue_lengths, number_of_dimensions=1)
    ]

    # make sure all keys are in the same order during training and prediction
    model_data.sort()

    return model_data
def _create_model_data(
    self,
    training_data: List[Message],
    label_id_dict: Optional[Dict[Text, int]] = None,
    tag_id_dict: Optional[Dict[Text, int]] = None,
    label_attribute: Optional[Text] = None,
) -> RasaModelData:
    """Prepare data for training and create a RasaModelData object

    Args:
        training_data: The messages to extract features from.
        label_id_dict: Optional mapping from label to label id.
        tag_id_dict: Optional mapping from entity tag to tag id.
        label_attribute: Attribute of the messages that holds the label.

    Returns:
        Model data with text features, label features, label ids, tag ids
        and the masks for text and label features.
    """
    text_sparse, text_dense = [], []
    label_sparse, label_dense = [], []
    collected_label_ids = []
    collected_tag_ids = []

    for message in training_data:
        # text features are taken from every message that carries a label,
        # or from all messages if no label attribute is given
        if label_attribute is None or message.get(label_attribute):
            sparse, dense = self._extract_features(message, TEXT)
            if sparse is not None:
                text_sparse.append(sparse)
            if dense is not None:
                text_dense.append(dense)

        if message.get(label_attribute):
            sparse, dense = self._extract_features(message, label_attribute)
            if sparse is not None:
                label_sparse.append(sparse)
            if dense is not None:
                label_dense.append(dense)

            if label_id_dict:
                collected_label_ids.append(
                    label_id_dict[message.get(label_attribute)]
                )

        if self.component_config.get(ENTITY_RECOGNITION) and tag_id_dict:
            if self.component_config[BILOU_FLAG]:
                tags = bilou_utils.tags_to_ids(message, tag_id_dict)
            else:
                tags = [
                    tag_id_dict[
                        determine_token_labels(token, message.get(ENTITIES), None)
                    ]
                    for token in message.get(TOKENS_NAMES[TEXT])
                ]
            # transpose to have seq_len x 1
            collected_tag_ids.append(np.array([tags]).T)

    label_ids = np.array(collected_label_ids)

    model_data = RasaModelData(label_key=self.label_key)
    model_data.add_features(
        TEXT_FEATURES, [np.array(text_sparse), np.array(text_dense)]
    )
    model_data.add_features(
        LABEL_FEATURES, [np.array(label_sparse), np.array(label_dense)]
    )
    if label_attribute and model_data.feature_not_exist(LABEL_FEATURES):
        # no label features are present, get default features from _label_data
        model_data.add_features(
            LABEL_FEATURES, self._use_default_label_features(label_ids)
        )

    # explicitly add last dimension to label_ids
    # to track correctly dynamic sequences
    model_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])
    model_data.add_features(TAG_IDS, [np.array(collected_tag_ids)])

    model_data.add_mask(TEXT_MASK, TEXT_FEATURES)
    model_data.add_mask(LABEL_MASK, LABEL_FEATURES)

    return model_data
def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
    """Loads a policy from the storage.

    **Needs to load its featurizer**

    Args:
        path: Directory that holds the persisted policy files.

    Returns:
        The loaded policy. If no data example file is present in `path`, a
        policy carrying only the loaded featurizer is returned. If `path`
        does not exist, an error is logged and `None` is returned.
    """
    model_path = Path(path)

    if not model_path.exists():
        logger.error(
            f"Failed to load TED policy model. Path "
            f"'{model_path.absolute()}' doesn't exist."
        )
        return

    def _stored_file(suffix: Text) -> Path:
        # all persisted artifacts share the same file name stem
        return model_path / f"{SAVE_MODEL_FILE_NAME}.{suffix}"

    tf_model_file = _stored_file("tf_model")
    featurizer = TrackerFeaturizer.load(path)

    data_example_file = _stored_file("data_example.pkl")
    if not data_example_file.is_file():
        # nothing was trained, only the featurizer can be restored
        return cls(featurizer=featurizer)

    loaded_data = io_utils.pickle_load(data_example_file)
    raw_label_data = io_utils.pickle_load(_stored_file("label_data.pkl"))
    zero_state_features = io_utils.pickle_load(
        _stored_file("zero_state_features.pkl")
    )
    label_data = RasaModelData(data=raw_label_data)
    meta = io_utils.pickle_load(_stored_file("meta.pkl"))
    priority = io_utils.json_unpickle(_stored_file("priority.pkl"))

    model_data_example = RasaModelData(
        label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
    )
    meta = train_utils.update_similarity_type(meta)

    uses_max_history = isinstance(featurizer, MaxHistoryTrackerFeaturizer)
    model = TED.load(
        str(tf_model_file),
        model_data_example,
        data_signature=model_data_example.get_signature(),
        config=meta,
        max_history_tracker_featurizer_used=uses_max_history,
        label_data=label_data,
    )

    # build the graph for prediction
    prediction_features = {}
    for feature_name, features in model_data_example.items():
        if feature_name in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE]:
            prediction_features[feature_name] = features
    predict_data_example = RasaModelData(
        label_key=LABEL_KEY,
        label_sub_key=LABEL_SUB_KEY,
        data=prediction_features,
    )
    model.build_for_predict(predict_data_example)

    return cls(
        featurizer=featurizer,
        priority=priority,
        model=model,
        zero_state_features=zero_state_features,
        **meta,
    )