def test_removing_rows():
    """Rows deleted via delete_row() must disappear from X and Y alike."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    def verify_state(expected_X, expected_Y, expected_row_labels):
        # Column labels are never affected by row removal.
        assert DatasetMatrix.sparse_equal(expected_X, dm.X) is True
        assert DatasetMatrix.sparse_equal(expected_Y, dm.Y) is True
        assert expected_row_labels == dm.row_labels
        assert default_column_labels_X() == dm.column_labels_X
        assert default_column_labels_Y() == dm.column_labels_Y

    # Remove the third row; X and Y lose it simultaneously.
    dm.delete_row(2)
    verify_state(
        scipy.sparse.csr_matrix(numpy.array([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
            [13, 14, 15, 16]])),
        scipy.sparse.csr_matrix(numpy.array([
            [101, 102],
            [201, 202],
            [401, 402]])),
        ["row0", "row1", "row3"])

    # Remove what is now the first row.
    dm.delete_row(0)
    verify_state(
        scipy.sparse.csr_matrix(numpy.array([
            [5, 6, 7, 8],
            [13, 14, 15, 16]])),
        scipy.sparse.csr_matrix(numpy.array([
            [201, 202],
            [401, 402]])),
        ["row1", "row3"])

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_rows')
    check_saving_and_loading(dm, folder)

    dm.unfinalize()

    # Remove both remaining rows, leaving empty matrices behind.
    dm.delete_row(0)
    dm.delete_row(0)
    verify_state(
        scipy.sparse.csr_matrix((0, 4)),
        scipy.sparse.csr_matrix((0, 2)),
        [])

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_rows')
    check_saving_and_loading(dm, folder)
def test_removing_columns_Y():
    """Deleting Y columns must leave X, the row labels and X labels untouched."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # Drop the first Y column; only Y and its column labels change.
    dm.delete_column_Y(0)
    remaining_Y = scipy.sparse.csr_matrix(numpy.array([
        [102],
        [202],
        [302],
        [402]]))
    assert DatasetMatrix.sparse_equal(default_matrix_X(), dm.X) is True
    assert DatasetMatrix.sparse_equal(remaining_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert ['coly1'] == dm.column_labels_Y

    # Drop the last remaining Y column, leaving an empty Y.
    dm.delete_column_Y(0)
    empty_Y = scipy.sparse.csr_matrix((4, 0))
    assert DatasetMatrix.sparse_equal(default_matrix_X(), dm.X) is True
    assert DatasetMatrix.sparse_equal(empty_Y, dm.Y) is True
    assert default_row_labels() == dm.row_labels
    assert default_column_labels_X() == dm.column_labels_X
    assert [] == dm.column_labels_Y

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_columns_Y')
    check_saving_and_loading(dm, folder)
def test_saving_and_loading():
    """A freshly configured DatasetMatrix must survive a save/load round-trip."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)
    check_saving_and_loading(
        dm, testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__save_load'))
def test_exds_saving_and_loading():
    """Each matrix of a built ExDs must be individually loadable from disk."""
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_saving_and_loading')
    exds = default_exds_definition(folder).create_exds()

    # The definition produced by default_exds_definition() requests that the
    # exds be saved right after building.
    exds.build()

    # Building must have finalized all three matrices.
    assert exds.matrix.final is True
    assert exds.matrix_train.final is True
    assert exds.matrix_test.final is True

    # Each matrix must load individually from the saved
    # ModelBuildingExperimentalDataset and compare equal to the in-memory one.
    for label, original in [('dataset', exds.matrix),
                            ('dataset_train', exds.matrix_train),
                            ('dataset_test', exds.matrix_test)]:
        loaded = DatasetMatrix(label)
        loaded.load(exds.definition.path)
        assert original == loaded
def test_feature_removal__thresholds_on_train():
    """Thresholds on 'train' must select (and, when enabled, remove) features."""
    subfolder_name = 'test_binary_exds_repository__test_feature_removal__train'

    # Phase 1: build WITHOUT removing features, but with thresholds configured,
    # so we can inspect what the internal method
    # BinaryExperimentalDataset.determine_thresholded_features_to_remove()
    # would choose for removal.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(subfolder_name))
    definition.options['remove_features_by_p_thresholds'] = False
    definition.options['remove_objectives_by_p_thresholds'] = False
    definition.options['probability_thresholds__features'] = {'train': (0.1, 0.8)}
    definition.options['probability_thresholds__objectives'] = {}
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 4)

    # Thresholds were given for 'train' only, so only the train matrix yields
    # candidates for removal.
    expected_features_to_remove = {
        3: 'galaxy',
        4: 'oxygen',
        5: 'polyrhythm',
        6: 'python',
        7: 'rocket'
    }
    assertThresholdedFeaturesToRemove(exds, expected_features_to_remove, ['train'])
    assertThresholdedObjectivesToRemove(exds, {}, [])

    # Phase 2: rebuild with feature removal enabled; the candidates must be gone.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(subfolder_name))
    definition.options['probability_thresholds__features'] = {'train': (0.1, 0.8)}
    definition.options['probability_thresholds__objectives'] = {}
    definition.options['remove_features_by_p_thresholds'] = True
    definition.options['remove_objectives_by_p_thresholds'] = True
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 3, 4)
    assertFeaturesNotInExDs(exds, expected_features_to_remove.keys())
def test_objective_removal__thresholds_on_full():
    """Thresholds on 'full' must select (and, when enabled, remove) objectives."""
    subfolder_name = 'test_binary_exds_repository__test_objective_removal__full'

    # Phase 1: build WITHOUT removing objectives, but with thresholds
    # configured, so we can inspect what the internal method
    # BinaryExperimentalDataset.determine_thresholded_objectives_to_remove()
    # would choose for removal.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(subfolder_name))
    definition.options['remove_features_by_p_thresholds'] = False
    definition.options['remove_objectives_by_p_thresholds'] = False
    definition.options['probability_thresholds__features'] = {}
    definition.options['probability_thresholds__objectives'] = {'full': (0.1, 0.9)}
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 4)

    # Thresholds were given for 'full' only, so only the full matrix yields
    # candidates for removal.
    expected_objectives_to_remove = {2: 'sidereal', 3: 'unknown'}
    assertThresholdedFeaturesToRemove(exds, {}, [])
    assertThresholdedObjectivesToRemove(exds, expected_objectives_to_remove, ['full'])

    # Phase 2: rebuild with objective removal enabled; the candidates must be gone.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(subfolder_name))
    definition.options['probability_thresholds__features'] = {}
    definition.options['probability_thresholds__objectives'] = {'full': (0.1, 0.9)}
    definition.options['remove_features_by_p_thresholds'] = True
    definition.options['remove_objectives_by_p_thresholds'] = True
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 2)
    assertObjectivesNotInExDs(exds, expected_objectives_to_remove.keys())
def test_exds_build():
    """build() must split the 16 rows into disjoint train/test sets.

    Verifies the 0.25/0.75 split sizes, that train_rows and test_rows
    partition the full index set, and that every row of exds.matrix appears
    at the recorded position in exds.matrix_train or exds.matrix_test.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_build')
    definition = default_exds_definition(folder)
    exds = definition.create_exds()
    exds.build(finalize_and_save=False)

    # Make sure 'training_set_size = 0.25' has been properly taken into
    # account: 16 rows -> 4 train rows, 12 test rows.
    assert 16 == exds.total_row_count
    assert 4 == len(exds.train_rows)
    assert 12 == len(exds.test_rows)
    assert 4 == exds.matrix_train.X.get_shape()[0]
    assert 4 == exds.matrix_train.Y.get_shape()[0]
    assert 12 == exds.matrix_test.X.get_shape()[0]
    assert 12 == exds.matrix_test.Y.get_shape()[0]

    # Reconstruct the list of all row indices, to make sure the split is
    # consistent: a disjoint partition of range(16).
    all_rows = set(exds.train_rows) | set(exds.test_rows)
    assert set(range(16)) == all_rows
    assert 0 == len(set(exds.train_rows) & set(exds.test_rows))

    # Ensure that any row of exds.matrix is found either in
    # exds.matrix_train or exds.matrix_test.
    # BUGFIX: both loops previously iterated range(15), skipping the last of
    # the 16 rows, and the second loop compared X matrices again instead of Y.
    # First check X.
    for row in range(16):
        original_row = exds.matrix.X.getrow(row)
        if row in exds.train_rows:
            train_row = exds.matrix_train.X.getrow(exds.train_rows.index(row))
            assert DatasetMatrix.sparse_equal(original_row, train_row) is True
        elif row in exds.test_rows:
            test_row = exds.matrix_test.X.getrow(exds.test_rows.index(row))
            assert DatasetMatrix.sparse_equal(original_row, test_row) is True
        else:
            raise AssertionError(
                "Row {} not found in neither train nor test X matrices".format(
                    row))

    # Do the same for Y.
    for row in range(16):
        original_row = exds.matrix.Y.getrow(row)
        if row in exds.train_rows:
            train_row = exds.matrix_train.Y.getrow(exds.train_rows.index(row))
            assert DatasetMatrix.sparse_equal(original_row, train_row) is True
        elif row in exds.test_rows:
            test_row = exds.matrix_test.Y.getrow(exds.test_rows.index(row))
            assert DatasetMatrix.sparse_equal(original_row, test_row) is True
        else:
            raise AssertionError(
                "Row {} not found in neither train nor test Y matrices".format(
                    row))
def testfolders():
    """Create the profiling scratch subfolders; return them keyed by name."""
    root = testutil.ensure_empty_tmp_subfolder('profiling')
    folders = dict()
    for name in ('dofCache', 'jht', 'dynamic_adtrees', 'adtrees'):
        subfolder = root / name
        subfolder.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder
    folders['root'] = root
    return folders
def testfolders():
    """Create the IAMB-test scratch subfolders; return them keyed by name."""
    root = testutil.ensure_empty_tmp_subfolder('test_iamb_with_gtests')
    folders = dict()
    for name in ('ci_test_results', 'heuristic_results'):
        subfolder = root / name
        subfolder.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder
    folders['root'] = root
    return folders
def testfolders():
    """Create the IPC-MB-demo scratch subfolders; return them keyed by name."""
    root = testutil.ensure_empty_tmp_subfolder('test_demo_ipcmb_with_gtests')
    folders = dict()
    for name in ('dofCache', 'ci_test_results', 'jht', 'adtrees',
                 'dynamic_adtrees'):
        subfolder = root / name
        subfolder.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder
    folders['root'] = root
    return folders
def test_feature_removal__no_thresholds():
    """Removal flags without any thresholds must leave the ExDs untouched."""
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_binary_exds_repository__test_feature_removal__no_thresholds')
    definition = default_exds_definition(folder)
    definition.options.update({
        'remove_features_by_p_thresholds': True,
        'remove_objectives_by_p_thresholds': True,
        'probability_thresholds__features': {},
        'probability_thresholds__objectives': {},
    })
    exds = definition.create_exds()

    # Even though 'remove_features_by_p_thresholds' and
    # 'remove_objectives_by_p_thresholds' are both True, the empty threshold
    # dicts mean nothing is removed from exds.matrix, exds.matrix_train or
    # exds.matrix_test.
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 4)
def test_keeping_rows():
    """keep_rows() must retain only the requested rows, addressed by current index."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # An empty selection is rejected outright.
    with pytest.raises(ValueError):
        dm.keep_rows([])

    def verify_state(x_rows, y_rows, expected_row_labels):
        # Column labels are never affected by keeping rows.
        assert DatasetMatrix.sparse_equal(
            scipy.sparse.csr_matrix(numpy.array(x_rows)), dm.X) is True
        assert DatasetMatrix.sparse_equal(
            scipy.sparse.csr_matrix(numpy.array(y_rows)), dm.Y) is True
        assert expected_row_labels == dm.row_labels
        assert default_column_labels_X() == dm.column_labels_X
        assert default_column_labels_Y() == dm.column_labels_Y

    # Keep rows 1 and 3 of the original four.
    dm.keep_rows([1, 3])
    verify_state([[5, 6, 7, 8], [13, 14, 15, 16]],
                 [[201, 202], [401, 402]],
                 ['row1', 'row3'])

    # Of the remaining two rows, keep row 0 (labeled 'row1').
    dm.keep_rows([0])
    verify_state([[5, 6, 7, 8]],
                 [[201, 202]],
                 ['row1'])

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__keeping_rows')
    check_saving_and_loading(dm, folder)
def test_removing_columns_X():
    """Deleting X columns must leave Y, the row labels and Y labels untouched."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    def verify_state(x_rows, expected_x_labels):
        # Y, the row labels and the Y column labels never change here.
        assert DatasetMatrix.sparse_equal(
            scipy.sparse.csr_matrix(numpy.array(x_rows)), dm.X) is True
        assert DatasetMatrix.sparse_equal(default_matrix_Y(), dm.Y) is True
        assert default_row_labels() == dm.row_labels
        assert expected_x_labels == dm.column_labels_X
        assert default_column_labels_Y() == dm.column_labels_Y

    # Drop the third X column.
    dm.delete_column_X(2)
    verify_state([[1, 2, 4], [5, 6, 8], [9, 10, 12], [13, 14, 16]],
                 ['colx0', 'colx1', 'colx3'])

    # Drop the column now at index 2 (formerly the last X column).
    dm.delete_column_X(2)
    verify_state([[1, 2], [5, 6], [9, 10], [13, 14]],
                 ['colx0', 'colx1'])

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__removing_columns_X')
    check_saving_and_loading(dm, folder)
def test_selecting_columns_X():
    """select_columns_X() must return a new matrix restricted to the chosen columns."""
    dm = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(dm)

    # An empty selection is rejected outright.
    with pytest.raises(ValueError):
        dm.select_columns_X([])

    def verify_state(matrix, x_rows, expected_x_labels):
        # Y, the row labels and the Y column labels carry over unchanged.
        assert DatasetMatrix.sparse_equal(
            scipy.sparse.csr_matrix(numpy.array(x_rows)), matrix.X) is True
        assert DatasetMatrix.sparse_equal(default_matrix_Y(), matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert expected_x_labels == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    # Build a new datasetmatrix where X keeps only columns 1 and 2.
    dm = dm.select_columns_X([1, 2], 'test_matrix_selected_colsX')
    verify_state(dm,
                 [[2, 3], [6, 7], [10, 11], [14, 15]],
                 ['colx1', 'colx2'])

    # From that result, select X column 0 (labeled 'colx1').
    dm = dm.select_columns_X([0], 'test_matrix_selected_colsX_2')
    verify_state(dm,
                 [[2], [6], [10], [14]],
                 ['colx1'])

    folder = testutil.ensure_empty_tmp_subfolder('test_datasetmatrix__selecting_colsX')
    check_saving_and_loading(dm, folder)
def test_experiment_run__simple():
    """An experiment run must emit logs/datapoints and lock its folder when done."""
    # Scratch folders for the ExDs and for the experiment itself.
    exds_folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__experiment_run')
    experiments_folder = testutil.ensure_empty_tmp_subfolder(
        'test_experiment_repository__experiment_run')

    # Build the ExDs the experiment will run against.
    exds_definition = default_exds_definition(exds_folder, 'test_exds_experiment_run')
    exds = exds_definition.create_exds()
    exds.build()

    # Parameters for each AlgorithmRun, in the order in which they should run.
    algrun_id_format = '{}__test_AlgorithmIGt_BernoulliNB__Q{}_Obj{}'
    algorithm_run_parameters = [
        {'ID': algrun_id_format.format(index, q, 0), 'Q': q, 'objective_index': 0}
        for index, q in enumerate([2, 4, 6, 8])
    ]

    # Prepare and run the experiment.
    experiment_definition = default_experiment_definition(
        experiments_folder, exds_folder, algorithm_run_parameters)
    experiment_run = experiment_definition.create_experiment_run()
    experiment_run.run()

    # The run must have produced one log file per AlgorithmRun.
    log_folder = experiments_folder / 'test_experiment_run' / 'algorithm_run_logs' / 'main'
    assert Path(log_folder).exists() is True
    expected_log_files = [
        '0__test_AlgorithmIGt_BernoulliNB__Q2_Obj0.log',
        '1__test_AlgorithmIGt_BernoulliNB__Q4_Obj0.log',
        '2__test_AlgorithmIGt_BernoulliNB__Q6_Obj0.log',
        '3__test_AlgorithmIGt_BernoulliNB__Q8_Obj0.log'
    ]
    created_log_files = sorted(
        [f.name for f in log_folder.iterdir() if f.is_file()])
    assert expected_log_files == created_log_files

    # And one pickle file per AlgorithmRunDatapoint.
    datapoints_folder = experiments_folder / 'test_experiment_run' / 'algorithm_run_datapoints' / 'main'
    assert datapoints_folder.exists() is True
    expected_datapoints_files = [
        '0__test_AlgorithmIGt_BernoulliNB__Q2_Obj0.pickle',
        '1__test_AlgorithmIGt_BernoulliNB__Q4_Obj0.pickle',
        '2__test_AlgorithmIGt_BernoulliNB__Q6_Obj0.pickle',
        '3__test_AlgorithmIGt_BernoulliNB__Q8_Obj0.pickle'
    ]
    created_datapoints_files = sorted(
        [f.name for f in datapoints_folder.iterdir() if f.is_file()])
    assert expected_datapoints_files == created_datapoints_files

    # Finishing the experiment locks its folder, so a second run must fail.
    assert experiment_run.definition.folder_is_locked() is True
    with pytest.raises(ExperimentFolderLockedException):
        experiment_run.run()

    # After unlocking and deleting logs and datapoint files, the experiment
    # should run again normally.
    experiment_run.definition.unlock_folder()
    assert experiment_run.definition.folder_is_locked() is False
    experiment_run.definition.delete_subfolder('algorithm_run_logs')
    experiment_run.definition.delete_subfolder('algorithm_run_datapoints')
    experiment_run.run()

    # The same logs and datapoint files must reappear...
    created_log_files = sorted(
        [f.name for f in log_folder.iterdir() if f.is_file()])
    created_datapoints_files = sorted(
        [f.name for f in datapoints_folder.iterdir() if f.is_file()])
    assert expected_log_files == created_log_files
    assert expected_datapoints_files == created_datapoints_files

    # ...and the folder must be locked once more.
    assert experiment_run.definition.folder_is_locked() is True
def testfolder():
    """Return an empty scratch folder for the exds build tests."""
    return testutil.ensure_empty_tmp_subfolder('test_exds_repository__test_build')