async def test_load_training_data_handles_hidden_files(tmpdir, default_domain):
    """Load training data from a directory holding only empty files.

    The directory contains one empty dot-file and one empty regular file;
    the test expects the featurized result to contain no examples.
    """
    # First the hidden file, then the normal one (both empty).
    for file_name in (".hidden", "normal_file"):
        Path(tmpdir / file_name).touch()

    state_featurizer = BinarySingleStateFeaturizer()
    tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer, max_history=2)

    loaded_trackers = await training.load_data(tmpdir.strpath, default_domain)
    featurized = tracker_featurizer.featurize_trackers(loaded_trackers, default_domain)

    assert len(featurized.X) == 0
    assert len(featurized.y) == 0
async def test_FullDialogueTrackerFeaturizer():
    """Run FullDialogueTrackerFeaturizer over the project stories and print the result.

    Loads the slot-enabled domain and the markdown stories from ``prj_dir``,
    extracts training states and actions, and hands them to
    ``print_data_training``. The test makes no assertions of its own.
    """
    domain_path = "{}/data/domain_with_slots.yml".format(prj_dir)
    default_domain = Domain.load(domain_path)
    # Debugging aid (disabled): viz_domain(default_domain)

    stories_file = "{}/data/stories.md".format(prj_dir)
    trackers = await training.load_data(
        stories_file,
        default_domain,
        augmentation_factor=0,
        debug_plots=False,
    )
    # Debugging aid (disabled): viz_trackers(trackers)

    state_featurizer = BinarySingleStateFeaturizer()
    featurizer = FullDialogueTrackerFeaturizer(state_featurizer=state_featurizer)

    # Debugging aid (disabled): print_title("START TRAINING STATES")
    states_and_actions = featurizer.training_states_and_actions(
        trackers, default_domain
    )
    trackers_as_states, trackers_as_actions = states_and_actions
    print_data_training(trackers_as_states, trackers_as_actions)
async def test_load_multi_file_training_data(default_domain):
    """Check that multi-file story data matches the single-file original.

    The stories in ``data/test_multifile_stories`` are the same as in
    ``data/test_stories/stories.md``, but split across multiple files.
    Both sources are loaded without augmentation and must yield the same
    set of (states, actions) pairs and the same featurized ``X`` / ``y``
    once the example order is normalised.
    """
    featurizer = MaxHistoryTrackerFeaturizer(
        BinarySingleStateFeaturizer(), max_history=2
    )
    trackers = await training.load_data(
        "data/test_stories/stories.md", default_domain, augmentation_factor=0
    )
    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_actions(
        trackers, default_domain
    )
    # Serialise every example to a canonical JSON string so that the two
    # sources can be compared independently of the order they were loaded in.
    hashed = sorted(
        (
            json.dumps(sts + acts, sort_keys=True)
            for sts, acts in zip(tr_as_sts, tr_as_acts)
        ),
        reverse=True,
    )
    data = featurizer.featurize_trackers(trackers, default_domain)

    featurizer_mul = MaxHistoryTrackerFeaturizer(
        BinarySingleStateFeaturizer(), max_history=2
    )
    trackers_mul = await training.load_data(
        "data/test_multifile_stories", default_domain, augmentation_factor=0
    )
    # Use the featurizer dedicated to the multi-file data, matching the
    # featurize_trackers call below.
    (tr_as_sts_mul, tr_as_acts_mul) = featurizer_mul.training_states_and_actions(
        trackers_mul, default_domain
    )
    hashed_mul = sorted(
        (
            json.dumps(sts_mul + acts_mul, sort_keys=True)
            for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul)
        ),
        reverse=True,
    )
    data_mul = featurizer_mul.featurize_trackers(trackers_mul, default_domain)

    assert hashed == hashed_mul

    # ndarray.sort() sorts in place and returns None, so comparing its
    # return values is always true. np.sort returns sorted copies, and
    # np.array_equal also fails cleanly when the shapes differ.
    assert np.array_equal(np.sort(data.X, axis=0), np.sort(data_mul.X, axis=0))
    assert np.array_equal(np.sort(data.y, axis=0), np.sort(data_mul.y, axis=0))
async def test_load_training_data_handles_hidden_files(tmpdir, default_domain):
    """Load training data from a directory holding only empty files.

    An empty dot-file and an empty regular file are created in ``tmpdir``;
    the test expects the featurized result to contain no examples.
    """
    directory = tmpdir.strpath

    # Opening in append mode creates each file without writing anything:
    # first the hidden file, then the normal one.
    for file_name in (".hidden", "normal_file"):
        with open(os.path.join(directory, file_name), "a"):
            pass

    state_featurizer = BinarySingleStateFeaturizer()
    tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer, max_history=2)

    loaded_trackers = await training.load_data(directory, default_domain)
    featurized = tracker_featurizer.featurize_trackers(loaded_trackers, default_domain)

    assert len(featurized.X) == 0
    assert len(featurized.y) == 0
def test_BinarySingleStateFeaturizer():
    """Encode two state dicts and compare against the expected vectors.

    The state map is set by hand; the key "e" is absent from it and the
    expected vectors carry no entry for it.
    """
    state_featurizer = BinarySingleStateFeaturizer()
    # Positions by index: "a" "d" "c" "b"
    state_featurizer.input_state_map = {"a": 0, "b": 3, "c": 2, "d": 1}
    state_featurizer.num_features = len(state_featurizer.input_state_map)

    cases = [
        ({"a": 1.0, "b": 1.0, "c": 0.0, "e": 1.0}, [1, 0, 0, 1]),
        ({"a": 1.0, "b": 0.1, "c": 0.2, "e": 1.0}, [1.0, 0.0, 0.2, 0.1]),
    ]
    for states, expected in cases:
        encoded = state_featurizer.encode(states)
        assert is_numpy(encoded)
        assert list(encoded) == expected
async def test_generate_training_data_with_cycles(tmpdir, default_domain):
    """Featurize the cyclic story file and check the label distribution."""
    tracker_featurizer = MaxHistoryTrackerFeaturizer(
        BinarySingleStateFeaturizer(), max_history=4
    )
    trackers = await training.load_data(
        "data/test_stories/stories_with_cycle.md",
        default_domain,
        augmentation_factor=0,
    )
    featurized = tracker_featurizer.featurize_trackers(trackers, default_domain)
    labels = featurized.y.argmax(axis=-1)

    # The story graph is not built deterministically, so the number of
    # trackers varies, but it should always be 3 or 4.
    tracker_count = len(trackers)
    assert tracker_count in (3, 4)

    # One of the label counts (key 8 below) grows with the tracker count:
    # with 4 trackers there is one more example for it than with 3.
    num_threes = tracker_count - 1

    # NOTE: the label keys shift whenever new default actions are added.
    expected_counts = {0: 6, 1: 2, 8: num_threes, 9: 1, 10: 3}
    assert Counter(labels) == expected_counts
def _standard_featurizer():
    """Build a MaxHistoryTrackerFeaturizer around a BinarySingleStateFeaturizer.

    No further constructor arguments are passed, so the tracker featurizer
    uses its own defaults.
    """
    state_featurizer = BinarySingleStateFeaturizer()
    return MaxHistoryTrackerFeaturizer(state_featurizer)
def test_binary_featurizer_handles_on_non_existing_features():
    """Encode a state dict containing a key ("e") missing from the state map."""
    state_featurizer = BinarySingleStateFeaturizer()
    state_featurizer.input_state_map = {"a": 0, "b": 3, "c": 2, "d": 1}
    state_featurizer.num_features = len(state_featurizer.input_state_map)

    states = {"a": 1.0, "b": 1.0, "c": 0.0, "e": 1.0}
    encoded = state_featurizer.encode(states)

    expected = np.array([1, 0, 0, 1])
    assert (encoded == expected).all()
def test_binary_featurizer_handles_probabilistic_intents():
    """Encode fractional values and expect them unchanged at their mapped indices."""
    state_featurizer = BinarySingleStateFeaturizer()
    state_featurizer.input_state_map = {
        "intent_a": 0,
        "b": 3,
        "intent_c": 2,
        "d": 1,
    }
    state_featurizer.num_features = len(state_featurizer.input_state_map)

    states = {"intent_a": 0.5, "b": 0.2, "intent_c": 1.0}
    encoded = state_featurizer.encode(states)

    expected = np.array([0.5, 0, 1.0, 0.2])
    assert (encoded == expected).all()
def test_binary_featurizer_uses_correct_dtype_float():
    """Encode a state dict with a fractional value and expect a float64 array."""
    state_featurizer = BinarySingleStateFeaturizer()
    state_featurizer.input_state_map = {"a": 0, "b": 3, "c": 2, "d": 1}
    state_featurizer.num_features = len(state_featurizer.input_state_map)

    states = {"a": 1.0, "b": 0.2, "c": 0.0}
    encoded = state_featurizer.encode(states)

    assert encoded.dtype == np.float64
def featurizer(self):
    """Return a new MaxHistoryTrackerFeaturizer limited to ``self.max_history``."""
    state_featurizer = BinarySingleStateFeaturizer()
    return MaxHistoryTrackerFeaturizer(
        state_featurizer, max_history=self.max_history
    )
def _standard_featurizer(max_history=None) -> MaxHistoryTrackerFeaturizer:
    """Build a MaxHistoryTrackerFeaturizer around a BinarySingleStateFeaturizer.

    Args:
        max_history: Forwarded unchanged as the tracker featurizer's
            ``max_history`` argument (``None`` when omitted).

    Returns:
        The newly constructed tracker featurizer.
    """
    state_featurizer = BinarySingleStateFeaturizer()
    tracker_featurizer = MaxHistoryTrackerFeaturizer(
        state_featurizer, max_history=max_history
    )
    return tracker_featurizer