def test_load_multi_file_training_data(domain: Domain):
    """Check that single-file and multi-file story data featurize identically.

    Loads ``data/test_yaml_stories/stories.yml`` and the directory
    ``data/test_multifile_yaml_stories`` without augmentation and compares
    the serialized states/labels, the surfaced attribute features and the
    label ids produced for both.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = training.load_data(
        "data/test_yaml_stories/stories.yml", domain, augmentation_factor=0
    )
    # sort by sender id so both tracker lists are walked in the same order
    trackers = sorted(trackers, key=lambda t: t.sender_id)

    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_labels(trackers, domain)
    hashed = sorted(
        (
            json.dumps(sts + acts, sort_keys=True)
            for sts, acts in zip(tr_as_sts, tr_as_acts)
        ),
        reverse=True,
    )

    data, label_ids, _ = featurizer.featurize_trackers(
        trackers, domain, precomputations=None
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = training.load_data(
        "data/test_multifile_yaml_stories", domain, augmentation_factor=0
    )
    trackers_mul = sorted(trackers_mul, key=lambda t: t.sender_id)

    # use the featurizer dedicated to the multi-file data (the original code
    # reused `featurizer` here, leaving `featurizer_mul` only half used)
    (tr_as_sts_mul, tr_as_acts_mul) = featurizer_mul.training_states_and_labels(
        trackers_mul, domain
    )
    hashed_mul = sorted(
        (
            json.dumps(sts_mul + acts_mul, sort_keys=True)
            for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul)
        ),
        reverse=True,
    )

    data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers(
        trackers_mul, domain, precomputations=None
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = _surface_attributes(data)
    data_mul = _surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue

        features = data[attribute]
        features_mul = data_mul[attribute]
        assert len(features) == len(features_mul)

        for idx_tracker, tracker_features in enumerate(features):
            for idx_dialogue, f1 in enumerate(tracker_features):
                f2 = features_mul[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    # a missing entry must be missing on both sides
                    assert f1 == f2
                    continue
                for idx_turn, turn_features in enumerate(f1):
                    # `.data` is read from the element-wise comparison result
                    assert np.all((turn_features == f2[idx_turn]).data)

    assert np.all(label_ids == label_ids_mul)
def test_can_read_test_story_with_entities(domain: Domain):
    """Check the trailing events of both trackers loaded from the or/entities story."""
    story_file = "data/test_yaml_stories/story_with_or_and_entities.yml"
    trackers = training.load_data(
        story_file,
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(trackers) == 2

    # first tracker: greet without entities
    expected_plain_greet = UserUttered(
        intent={"name": "greet", "confidence": 1.0},
        parse_data={
            "text": "/greet",
            "intent_ranking": [{"confidence": 1.0, "name": "greet"}],
            "intent": {"confidence": 1.0, "name": "greet"},
            "entities": [],
        },
    )
    events_plain = trackers[0].events
    assert events_plain[-3] == expected_plain_greet
    assert events_plain[-2] == ActionExecuted("utter_greet")
    assert events_plain[-1] == ActionExecuted("action_listen")

    # second tracker: greet carrying the `name` entity, followed by a slot set
    expected_entity_greet = UserUttered(
        intent={"name": "greet", "confidence": 1.0},
        entities=[{"entity": "name", "value": "peter"}],
        parse_data={
            "text": "/greet",
            "intent_ranking": [{"confidence": 1.0, "name": "greet"}],
            "intent": {"confidence": 1.0, "name": "greet"},
            "entities": [{"entity": "name", "value": "peter"}],
        },
    )
    events_entity = trackers[1].events
    assert events_entity[-4] == expected_entity_greet
    assert events_entity[-3] == SlotSet(key="name", value="peter")
    assert events_entity[-2] == ActionExecuted("utter_greet")
    assert events_entity[-1] == ActionExecuted("action_listen")
def train_trackers(
    domain: Domain, stories_file: Text, augmentation_factor: int = 20
) -> List[TrackerWithCachedStates]:
    """Load the trackers for ``stories_file`` through ``training.load_data``.

    Args:
        domain: Domain passed on to the loader.
        stories_file: Resource handed to the loader.
        augmentation_factor: Forwarded unchanged; defaults to 20.

    Returns:
        Whatever ``training.load_data`` returns for these arguments.
    """
    trackers = training.load_data(
        stories_file, domain, augmentation_factor=augmentation_factor
    )
    return trackers
def test_generate_training_data_with_unused_checkpoints(domain: Domain):
    """Check how many trackers result from stories with unused checkpoints."""
    stories_file = "data/test_yaml_stories/stories_unused_checkpoints.yml"
    training_trackers = training.load_data(stories_file, domain)

    # there are 3 training stories:
    #   2 with unused end checkpoints -> training_trackers
    #   1 with unused start checkpoints -> ignored
    expected_tracker_count = 2
    assert len(training_trackers) == expected_tracker_count
def test_can_read_test_story(domain: Domain):
    """Check the tracker count and the events of the five-event tracker."""
    trackers = training.load_data(
        "data/test_yaml_stories/stories.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(trackers) == 7

    # this should be the story simple_story_with_only_end -> show_it_all
    # the generated stories are in a non stable order - therefore we need to
    # do some trickery to find the one we want to test
    five_event_trackers = [t for t in trackers if len(t.events) == 5]
    tracker = five_event_trackers[0]

    expected_events = [
        ActionExecuted("action_listen"),
        UserUttered(
            intent={INTENT_NAME_KEY: "simple", "confidence": 1.0},
            parse_data={
                "text": "/simple",
                "intent_ranking": [{"confidence": 1.0, INTENT_NAME_KEY: "simple"}],
                "intent": {"confidence": 1.0, INTENT_NAME_KEY: "simple"},
                "entities": [],
            },
        ),
        ActionExecuted("utter_default"),
        ActionExecuted("utter_greet"),
        ActionExecuted("action_listen"),
    ]
    # compare position by position, tracker event on the left-hand side
    for position, expected_event in enumerate(expected_events):
        assert tracker.events[position] == expected_event
def test_parsing_of_e2e_stories(domain: Domain):
    """Check the full event sequence parsed from the hybrid end-to-end story file."""
    trackers = training.load_data(
        "data/test_yaml_stories/stories_hybrid_e2e.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(trackers) == 1

    # user turns given as text carry `{"name": None}` as intent, bot turns
    # given as text use an empty action name plus `action_text`
    kenyan_entity = {"start": 19, "end": 25, "value": "Kenyan", "entity": "cuisine"}
    expected = [
        ActionExecuted(ACTION_LISTEN_NAME),
        UserUttered(intent={"name": "simple"}),
        ActionExecuted("utter_greet"),
        ActionExecuted(ACTION_LISTEN_NAME),
        UserUttered(
            "I am looking for a Kenyan restaurant",
            {"name": None},
            entities=[kenyan_entity],
        ),
        ActionExecuted("", action_text="good for you"),
        ActionExecuted(ACTION_LISTEN_NAME),
        UserUttered(intent={"name": "goodbye"}),
        ActionExecuted("utter_goodbye"),
        ActionExecuted(ACTION_LISTEN_NAME),
        UserUttered("One more thing", {"name": None}),
        ActionExecuted("", action_text="What?"),
        ActionExecuted(ACTION_LISTEN_NAME),
    ]

    actual = list(trackers[0].events)
    assert actual == expected
def test_can_read_test_story_with_checkpoint_after_or(domain: Domain):
    """Check that the checkpoint-after-or story file yields two trackers."""
    story_file = "data/test_yaml_stories/stories_checkpoint_after_or.yml"
    loader_options = {
        "use_story_concatenation": False,
        "tracker_limit": 1000,
        "remove_duplicates": False,
    }
    trackers = training.load_data(story_file, domain, **loader_options)

    assert len(trackers) == 2
def test_or_statement_story_with_or_slot_was_set(domain: Domain):
    """Check that each tracker of the or/slot_was_set story sets a different name."""
    story_file = "data/test_yaml_stories/story_with_or_slot_was_set.yml"
    training_trackers = training.load_data(
        story_file,
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(training_trackers) == 2

    # event index 3 holds the slot event in both trackers
    expected_names = ["peter", "bob"]
    for tracker_index, expected_name in enumerate(expected_names):
        slot_event = training_trackers[tracker_index].events[3]
        assert slot_event == SlotSet(key="name", value=expected_name)
def test_yaml_wrong_yaml_format_warning(domain: Domain):
    """Check that loading the malformed YAML file raises ``YamlSyntaxException``."""
    malformed_file = "data/test_wrong_yaml_stories/wrong_yaml.yml"

    with pytest.raises(YamlSyntaxException):
        # the return value is irrelevant; the call itself has to raise
        training.load_data(
            malformed_file,
            domain,
            use_story_concatenation=False,
            tracker_limit=1000,
            remove_duplicates=False,
        )
def test_can_read_test_story_with_slots(domain: Domain):
    """Check that the single loaded tracker ends with a slot event and a listen action."""
    story_file = "data/test_yaml_stories/simple_story_with_only_end.yml"
    trackers = training.load_data(
        story_file,
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(trackers) == 1

    events = trackers[0].events
    assert events[-2] == SlotSet(key="name", value="peter")
    assert events[-1] == ActionExecuted("action_listen")
def test_yaml_slot_without_value_is_parsed(domain: Domain):
    """Check that the ``name`` slot event carries ``DEFAULT_VALUE_TEXT_SLOTS``."""
    trackers = training.load_data(
        "data/test_yaml_stories/story_with_slot_was_set.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )

    # the slot event is expected right before the final event
    expected_slot = SlotSet(key="name", value=DEFAULT_VALUE_TEXT_SLOTS)
    assert trackers[0].events[-2] == expected_slot
def test_yaml_slot_different_types(domain: Domain):
    """Check that list, bool and text slot values are parsed from the story file.

    Warnings emitted while loading are captured and deliberately not asserted
    on, matching what ``pytest.warns(None)`` used to do.
    """
    # function-scope import so this block does not depend on the file header
    import warnings

    # `pytest.warns(None)` is deprecated since pytest 7 and rejected by
    # pytest 8; recording warnings with the stdlib keeps the same
    # "capture, but do not require or forbid warnings" behaviour
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        tracker = training.load_data(
            "data/test_yaml_stories/story_slot_different_types.yml",
            domain,
            use_story_concatenation=False,
            tracker_limit=1000,
            remove_duplicates=False,
        )

    events = tracker[0].events
    assert events[3] == SlotSet(key="list_slot", value=["value1", "value2"])
    assert events[4] == SlotSet(key="bool_slot", value=True)
    assert events[5] == SlotSet(key="text_slot", value="some_text")
def trained_ted(
    tmp_path_factory: TempPathFactory, moodbot_domain_path: Path,
) -> TEDPolicyGraphComponent:
    """Create a ``TEDPolicyGraphComponent`` and train it on the moodbot stories.

    The policy uses its default config with ``EPOCHS`` overridden to 1 and a
    ``LocalModelStorage`` created in a temporary "storage" directory.

    Returns:
        The policy after ``train`` has been called on the loaded trackers.
    """
    training_files = "data/test_moodbot/data/stories.yml"
    domain = Domain.load(moodbot_domain_path)
    trackers = training.load_data(str(training_files), domain)

    config = {**TEDPolicyGraphComponent.get_default_config(), EPOCHS: 1}
    model_storage = LocalModelStorage.create(tmp_path_factory.mktemp("storage"))
    policy = TEDPolicyGraphComponent.create(
        config,
        model_storage,
        Resource("ted"),
        ExecutionContext(GraphSchema({})),
    )

    policy.train(trackers, domain)
    return policy
def test_yaml_intent_with_leading_slash_warning(domain: Domain):
    """Check that loading the leading-slash intent file warns exactly once."""
    story_file = "data/test_wrong_yaml_stories/intent_with_leading_slash.yml"

    with pytest.warns(UserWarning) as record:
        trackers = training.load_data(
            story_file,
            domain,
            use_story_concatenation=False,
            tracker_limit=1000,
            remove_duplicates=False,
        )

    # one for leading slash
    assert len(record) == 1

    expected_message = UserUttered(intent={"name": "simple"})
    assert trackers[0].latest_message == expected_message
def test_generate_training_data_original_and_augmented_trackers(domain: Domain):
    """Check the number of original and total trackers with augmentation factor 3."""
    training_trackers = training.load_data(
        "data/test_yaml_stories/stories_defaultdomain.yml",
        domain,
        augmentation_factor=3,
    )

    # the assertions below expect four original trackers;
    # an augmentation factor of 3 indicates a max of 3*10 augmented stories,
    # so the total should not exceed augmented + original = 34
    original_trackers = [
        t for t in training_trackers if not getattr(t, "is_augmented", False)
    ]

    assert len(original_trackers) == 4
    assert len(training_trackers) <= 34
def load_data(
    self,
    training_resource: Union[Text, TrainingDataImporter],
    remove_duplicates: bool = True,
    unique_last_num_states: Optional[int] = None,
    augmentation_factor: int = 50,
    tracker_limit: Optional[int] = None,
    use_story_concatenation: bool = True,
    debug_plots: bool = False,
    exclusion_percentage: Optional[int] = None,
) -> List["TrackerWithCachedStates"]:
    """Load training data from a resource.

    If ``unique_last_num_states`` is not given and
    ``self._are_all_featurizers_using_a_max_history()`` is true, it is set to
    ``self._max_history()``. If it is given but smaller than that max
    history, a warning about possible data loss is raised. All arguments are
    then forwarded, together with ``self.domain``, to ``training.load_data``.
    """
    max_history = self._max_history()

    if unique_last_num_states is not None:
        if unique_last_num_states < max_history:
            # possibility of data loss
            rasa.shared.utils.io.raise_warning(
                f"unique_last_num_states={unique_last_num_states} but maximum "
                f"max_history={max_history}. Possibility of data loss. "
                "It is recommended to set unique_last_num_states to "
                "at least maximum max_history."
            )
    elif self._are_all_featurizers_using_a_max_history():
        # for speed up of data generation, automatically detect
        # unique_last_num_states if it was not set and all featurizers
        # are MaxHistoryTrackerFeaturizer
        unique_last_num_states = max_history

    trackers = training.load_data(
        training_resource,
        self.domain,
        remove_duplicates,
        unique_last_num_states,
        augmentation_factor,
        tracker_limit,
        use_story_concatenation,
        debug_plots,
        exclusion_percentage=exclusion_percentage,
    )
    return trackers
def test_generate_training_data_with_cycles(domain: Domain):
    """Check tracker count and label-id distribution for stories containing a cycle."""
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)

    training_trackers = training.load_data(
        "data/test_yaml_stories/stories_with_cycle.yml",
        domain,
        augmentation_factor=0,
    )

    _, label_ids, _ = featurizer.featurize_trackers(
        training_trackers, domain, precomputations=None
    )

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    tracker_count = len(training_trackers)
    assert tracker_count in (3, 4)

    # with 4 trackers there is one more example for the label counted by
    # `num_tens` (label id 14 in the expected counts below)
    num_tens = tracker_count - 1

    # if new default actions are added the keys of the actions will be changed
    all_label_ids = [
        label_id for tracker_labels in label_ids for label_id in tracker_labels
    ]
    assert Counter(all_label_ids) == {0: 6, 15: 3, 14: num_tens, 1: 2, 16: 1}
def load_data(
    self,
    training_resource: Union[Text, TrainingDataImporter],
    remove_duplicates: bool = True,
    unique_last_num_states: Optional[int] = None,
    augmentation_factor: int = 50,
    tracker_limit: Optional[int] = None,
    use_story_concatenation: bool = True,
    debug_plots: bool = False,
    exclusion_percentage: Optional[int] = None,
) -> List["TrackerWithCachedStates"]:
    """Load training data from a resource.

    Delegates to ``training.load_data`` with ``self.domain`` and passes every
    argument through unchanged.
    """
    # options handed to the loader by keyword
    keyword_options = {
        "augmentation_factor": augmentation_factor,
        "tracker_limit": tracker_limit,
        "use_story_concatenation": use_story_concatenation,
        "debug_plots": debug_plots,
        "exclusion_percentage": exclusion_percentage,
    }
    return training.load_data(
        training_resource,
        self.domain,
        remove_duplicates,
        unique_last_num_states,
        **keyword_options,
    )
def test_load_training_data_reader_not_found_throws(tmp_path: Path, domain: Domain):
    """Check that loading a directory holding only an empty ``file`` raises."""
    # create an empty file named "file" (no extension) inside the directory
    empty_file = tmp_path.joinpath("file")
    empty_file.touch()

    resource = str(tmp_path)
    with pytest.raises(Exception):
        training.load_data(resource, domain)