def test_different_seeds_for_subset_of_trials(self):
    """Shuffled subset reads with different seeds should differ in order."""
    n_trials = 25
    cols = ["col_" + str(i) for i in range(4)]
    frames = [
        pd.DataFrame(np.random.rand(10, len(cols)), columns=cols)
        for _ in range(n_trials)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    first_dataset = dataset.read_dataset(self.hdf_path,
                                         shuffle=True,
                                         n_trials=4,
                                         seed=90)
    second_dataset = dataset.read_dataset(self.hdf_path,
                                          shuffle=True,
                                          n_trials=4,
                                          seed=0)
    # At least one pair of corresponding trials must differ.
    self.assertTrue(
        any(not left.equals(right)
            for left, right in zip(first_dataset, second_dataset)))
def test_cols_that_do_not_exist(self):
    """Requesting columns absent from the stored trials raises KeyError."""
    trial_count = 5
    frames = [pd.DataFrame(np.random.rand(10, 4)) for _ in range(trial_count)]
    self.write_dataframes_to_file(frames, self.hdf_path)
    missing_cols = ["inexisting_col_a", "inexisting_col_b"]
    with self.assertRaises(KeyError):
        dataset.read_dataset(self.hdf_path,
                             cols=missing_cols,
                             add_class_columns=False)
def test_n_trials_is_none(self):
    """Omitting n_trials reads every trial stored in the file."""
    total_trials = 5
    frames = [pd.DataFrame(np.random.rand(10, 4)) for _ in range(total_trials)]
    self.write_dataframes_to_file(frames, self.hdf_path)
    result = dataset.read_dataset(self.hdf_path)
    self.assertEqual(total_trials, len(result))
def test_n_trials_larger_than_available(self):
    """Asking for more trials than stored caps the result at what exists."""
    total_trials = 5
    frames = [pd.DataFrame(np.random.rand(10, 4)) for _ in range(total_trials)]
    self.write_dataframes_to_file(frames, self.hdf_path)
    # Request five times as many trials as were written.
    result = dataset.read_dataset(self.hdf_path, n_trials=total_trials * 5)
    self.assertEqual(total_trials, len(result))
def test_datasets_sizes(self):
    """Each generated split contains the requested number of trials."""
    gps.create_passive_datasets_for_training(**self.args)
    split_paths = [self.train_hdf_path, self.val_hdf_path, self.test_hdf_path]
    expected_counts = [
        self.args["n_simulations_train"],
        self.args["n_simulations_val"],
        self.args["n_simulations_test"],
    ]
    for path, expected in zip(split_paths, expected_counts):
        loaded = read_dataset(path)
        self.assertEqual(len(loaded), expected)
def test_reading_all_trials_with_different_seeds_but_without_shuffle(self):
    """With shuffle disabled the seed must have no effect on trial order."""
    trial_count = 25
    cols = ["col_" + str(i) for i in range(4)]
    frames = [
        pd.DataFrame(np.random.rand(10, len(cols)), columns=cols)
        for _ in range(trial_count)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    # shuffle=False (or leaving it unspecified) should make read_dataset
    # ignore the seed argument entirely.
    read_a = dataset.read_dataset(self.hdf_path, shuffle=False, seed=90)
    read_b = dataset.read_dataset(self.hdf_path, shuffle=False, seed=0)
    for left, right in zip(read_a, read_b):
        pd.testing.assert_frame_equal(left, right)
def test_reproducibility_for_subset_of_trials(self):
    """Shuffled subset reads with identical seeds return identical trials."""
    trial_count = 25
    cols = ["col_" + str(i) for i in range(4)]
    frames = [
        pd.DataFrame(np.random.rand(10, len(cols)), columns=cols)
        for _ in range(trial_count)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    read_a = dataset.read_dataset(self.hdf_path,
                                  shuffle=True,
                                  n_trials=4,
                                  seed=90)
    read_b = dataset.read_dataset(self.hdf_path,
                                  shuffle=True,
                                  n_trials=4,
                                  seed=90)
    for left, right in zip(read_a, read_b):
        pd.testing.assert_frame_equal(left, right)
def test_cols_is_none(self):
    """cols=None loads all columns stored in each trial."""
    trial_count = 5
    cols = ["col_" + str(i) for i in range(4)]
    frames = [
        pd.DataFrame(np.random.rand(10, 4), columns=cols)
        for _ in range(trial_count)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    result = dataset.read_dataset(self.hdf_path,
                                  cols=None,
                                  add_class_columns=False)
    self.assertTrue(self.are_columns_in_all_trials(result, cols))
def test_add_class_columns(self):
    """add_class_columns=True appends the class columns to the requested cols."""
    trial_count = 5
    cols = ["col_" + str(i) for i in range(4)]
    # Trials are written with both the plain columns and the class columns.
    cols_to_write = cols + list(MASS_CLASS_COLS) + list(FORCE_CLASS_COLS)
    frames = [
        pd.DataFrame(np.random.rand(10, len(cols_to_write)),
                     columns=cols_to_write)
        for _ in range(trial_count)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    result = dataset.read_dataset(self.hdf_path,
                                  cols=cols,
                                  add_class_columns=True)
    self.assertTrue(self.are_columns_in_all_trials(result, cols_to_write))
def test_duplicated_cols_are_only_read_once(self):
    """Duplicate entries in cols must not duplicate columns in the result."""
    n_trials = 5
    cols = ["col_" + str(i) for i in range(4)]
    dfs = [
        pd.DataFrame(np.random.rand(10, 4), columns=cols)
        for _ in range(n_trials)
    ]
    self.write_dataframes_to_file(dfs, self.hdf_path)
    # Pass every column twice; read_dataset should deduplicate.
    read_dataset = dataset.read_dataset(self.hdf_path,
                                        cols=cols + cols,
                                        add_class_columns=False)
    self.assertTrue(self.are_columns_in_all_trials(read_dataset, cols))
    # all() replaces the original reduce(lambda x, y: x and y, [...]):
    # same result, clearer intent, and it short-circuits on first failure.
    self.assertTrue(all(len(df.columns) == len(cols) for df in read_dataset))
def test_empty_dataset(self):
    """A split requesting zero trials must produce no file at all."""
    local_args = self.args.copy()
    local_args["n_simulations_train"] = 0
    gps.create_passive_datasets_for_training(**local_args)
    split_paths = [self.train_hdf_path, self.val_hdf_path, self.test_hdf_path]
    expected_counts = [
        local_args["n_simulations_train"],
        local_args["n_simulations_val"],
        local_args["n_simulations_test"],
    ]
    for path, expected in zip(split_paths, expected_counts):
        if expected > 0:
            # Non-empty splits are written and hold the requested trials.
            self.assertEqual(len(read_dataset(path)), expected)
        else:
            # An empty split must not leave a file behind.
            self.assertFalse(os.path.exists(path))
def test_do_not_add_class_columns_when_cols_unspecified(self):
    """add_class_columns=True with cols=None must not demand class columns."""
    trial_count = 5
    cols = ["col_" + str(i) for i in range(4)]
    frames = [
        pd.DataFrame(np.random.rand(10, len(cols)), columns=cols)
        for _ in range(trial_count)
    ]
    self.write_dataframes_to_file(frames, self.hdf_path)
    # The trials carry no class columns; reading must still succeed.
    try:
        result = dataset.read_dataset(self.hdf_path,
                                      cols=None,
                                      add_class_columns=True)
    except KeyError:
        self.fail("read_dataset failed unexpectedly")
    self.assertTrue(self.are_columns_in_all_trials(result, cols))
def test_datasets_sizes_with_previously_existing_dataset(self):
    """Regenerating datasets must replace, not extend, existing files."""
    # Build an initial set of datasets with the default key_prefix.
    gps.create_passive_datasets_for_training(**self.args)
    # Regenerate with a different key_prefix and fewer trials. If the old
    # dataset were not deleted we would observe either extra trials
    # (appended to the same path) or an incorrect trial count.
    local_args = self.args.copy()
    local_args["trial_hdf_key_prefix"] = "newtrials_"
    local_args["n_simulations_train"] = 1
    local_args["n_simulations_val"] = 1
    local_args["n_simulations_test"] = 1
    gps.create_passive_datasets_for_training(**local_args)
    split_paths = [self.train_hdf_path, self.val_hdf_path, self.test_hdf_path]
    expected_counts = [
        local_args["n_simulations_train"],
        local_args["n_simulations_val"],
        local_args["n_simulations_test"],
    ]
    for path, expected in zip(split_paths, expected_counts):
        self.assertEqual(len(read_dataset(path)), expected)