예제 #1
0
def test_removing_rows():
    """Delete rows from a DatasetMatrix step by step, down to zero rows.

    After each deletion step X, Y and the row labels are compared with
    the expected values, while the column labels of X and Y must still
    equal the defaults. The matrix is handed to
    check_saving_and_loading() once with two rows left and once with no
    rows left.
    """
    def as_sparse(rows):
        # Build a CSR matrix from a dense nested list.
        return scipy.sparse.csr_matrix(numpy.array(rows))

    def verify(matrix, wanted_X, wanted_Y, wanted_row_labels):
        # Row deletions must never touch the column labels.
        assert DatasetMatrix.sparse_equal(wanted_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(wanted_Y, matrix.Y) is True
        assert wanted_row_labels == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    tmp_name = 'test_datasetmatrix__removing_rows'

    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Step 1: drop the row at index 2; X and Y lose it together.
    matrix.delete_row(2)
    verify(
        matrix,
        as_sparse([
            [1, 2, 3, 4],
            [5, 6, 7, 8],
            [13, 14, 15, 16]]),
        as_sparse([
            [101, 102],
            [201, 202],
            [401, 402]]),
        ["row0", "row1", "row3"])

    # Step 2: drop the row at index 0.
    matrix.delete_row(0)
    verify(
        matrix,
        as_sparse([
            [5, 6, 7, 8],
            [13, 14, 15, 16]]),
        as_sparse([
            [201, 202],
            [401, 402]]),
        ["row1", "row3"])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(tmp_name)
    check_saving_and_loading(matrix, tmp_folder)

    # Presumably needed because the save/load check leaves the matrix
    # finalized -- TODO confirm against check_saving_and_loading().
    matrix.unfinalize()

    # Step 3: drop the two rows that are left; only the shapes remain.
    for _ in range(2):
        matrix.delete_row(0)
    verify(
        matrix,
        scipy.sparse.csr_matrix((0, 4)),
        scipy.sparse.csr_matrix((0, 2)),
        [])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(tmp_name)
    check_saving_and_loading(matrix, tmp_folder)
예제 #2
0
def test_removing_columns_Y():
    """Delete both columns of Y, one after the other.

    X, the row labels and the X column labels must keep their default
    values throughout; only Y and its column labels change. The
    resulting matrix (Y with zero columns) is then handed to
    check_saving_and_loading().
    """
    def verify(matrix, wanted_Y, wanted_labels_Y):
        # Everything except Y is compared against the defaults.
        wanted_X = default_matrix_X()
        assert DatasetMatrix.sparse_equal(wanted_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(wanted_Y, matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert wanted_labels_Y == matrix.column_labels_Y

    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Drop Y column 0; a single column labeled 'coly1' is left.
    matrix.delete_column_Y(0)
    single_column = numpy.array([
        [102],
        [202],
        [302],
        [402]])
    verify(matrix, scipy.sparse.csr_matrix(single_column), ['coly1'])

    # Drop the only remaining Y column; Y keeps 4 rows but no columns.
    matrix.delete_column_Y(0)
    verify(matrix, scipy.sparse.csr_matrix((4, 0)), [])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_Y')
    check_saving_and_loading(matrix, tmp_folder)
예제 #3
0
def test_saving_and_loading():
    """Run check_saving_and_loading() on an unmodified default matrix."""
    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Start from an empty temporary folder for this test.
    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__save_load')
    check_saving_and_loading(matrix, tmp_folder)
예제 #4
0
def test_exds_saving_and_loading():
    """Build the default ExDs and reload each of its matrices from disk.

    The full, train and test matrices must all be final after build(),
    and each must compare equal to a DatasetMatrix loaded from
    exds.definition.path under the names 'dataset', 'dataset_train' and
    'dataset_test' respectively.
    """
    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_saving_and_loading')
    exds = default_exds_definition(tmp_folder).create_exds()

    # With the definition from default_exds_definition(), building the
    # exds also saves it.
    exds.build()

    matrix_attributes = ('matrix', 'matrix_train', 'matrix_test')
    saved_names = ('dataset', 'dataset_train', 'dataset_test')

    # All three matrices must have been finalized by the build.
    for attribute in matrix_attributes:
        assert getattr(exds, attribute).final is True

    # Each matrix must be loadable on its own from the saved exds:
    # original first, then training, then test.
    for saved_name, attribute in zip(saved_names, matrix_attributes):
        reloaded = DatasetMatrix(saved_name)
        reloaded.load(exds.definition.path)
        assert getattr(exds, attribute) == reloaded
예제 #5
0
def test_feature_removal__thresholds_on_train():
    """Check feature removal driven by thresholds set only for 'train'.

    The exds is built twice in the same (re-emptied) folder: first with
    removal disabled, to inspect which features would be removed, then
    with removal enabled, to verify that those features are gone.
    """
    tmp_name = 'test_binary_exds_repository__test_feature_removal__train'

    # Pass 1: removal is switched OFF, but feature thresholds are still
    # configured so that the would-be removals can be inspected.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(tmp_name))
    inspection_options = (
        ('remove_features_by_p_thresholds', False),
        ('remove_objectives_by_p_thresholds', False),
        ('probability_thresholds__features', {'train': (0.1, 0.8)}),
        ('probability_thresholds__objectives', {}),
    )
    for option, value in inspection_options:
        definition.options[option] = value
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 4)

    # Thresholds were given for 'train' only, so features to remove are
    # expected only for 'train'; no objectives are expected at all.
    expected_features_to_remove = {
        3: 'galaxy',
        4: 'oxygen',
        5: 'polyrhythm',
        6: 'python',
        7: 'rocket'
    }
    assertThresholdedFeaturesToRemove(
        exds, expected_features_to_remove, ['train'])
    assertThresholdedObjectivesToRemove(exds, {}, [])

    # Pass 2: same thresholds, removal switched ON, from a fresh folder
    # and a fresh definition.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(tmp_name))
    removal_options = (
        ('probability_thresholds__features', {'train': (0.1, 0.8)}),
        ('probability_thresholds__objectives', {}),
        ('remove_features_by_p_thresholds', True),
        ('remove_objectives_by_p_thresholds', True),
    )
    for option, value in removal_options:
        definition.options[option] = value
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 3, 4)
    assertFeaturesNotInExDs(exds, expected_features_to_remove.keys())
예제 #6
0
def test_objective_removal__thresholds_on_full():
    """Check objective removal driven by thresholds set only for 'full'.

    The exds is built twice in the same (re-emptied) folder: first with
    removal disabled, to inspect which objectives would be removed, then
    with removal enabled, to verify that those objectives are gone.
    """
    tmp_name = 'test_binary_exds_repository__test_objective_removal__full'

    # Pass 1: removal is switched OFF, but objective thresholds are
    # still configured so that the would-be removals can be inspected.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(tmp_name))
    inspection_options = (
        ('remove_features_by_p_thresholds', False),
        ('remove_objectives_by_p_thresholds', False),
        ('probability_thresholds__features', {}),
        ('probability_thresholds__objectives', {'full': (0.1, 0.9)}),
    )
    for option, value in inspection_options:
        definition.options[option] = value
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 4)

    # Thresholds were given for 'full' only, so objectives to remove are
    # expected only for 'full'; no features are expected at all.
    expected_objectives_to_remove = {2: 'sidereal', 3: 'unknown'}
    assertThresholdedFeaturesToRemove(exds, {}, [])
    assertThresholdedObjectivesToRemove(
        exds, expected_objectives_to_remove, ['full'])

    # Pass 2: same thresholds, both removal flags switched ON, from a
    # fresh folder and a fresh definition.
    definition = default_exds_definition(
        testutil.ensure_empty_tmp_subfolder(tmp_name))
    removal_options = (
        ('probability_thresholds__features', {}),
        ('probability_thresholds__objectives', {'full': (0.1, 0.9)}),
        ('remove_features_by_p_thresholds', True),
        ('remove_objectives_by_p_thresholds', True),
    )
    for option, value in removal_options:
        definition.options[option] = value
    exds = definition.create_exds()
    exds.build()
    assertExDsDimensions(exds, 25, 75, 8, 2)
    assertObjectivesNotInExDs(exds, expected_objectives_to_remove.keys())
예제 #7
0
def test_exds_build():
    """Build the default ExDs without saving and verify the train/test split.

    Checks that:
    - the row counts of the train and test matrices match the expected
      4 / 12 split of 16 rows;
    - train_rows and test_rows are disjoint and together cover every
      row index;
    - every row of the full X and Y matrices is found, unchanged, at the
      corresponding position of the train or test matrix.
    """
    folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__test_build')
    definition = default_exds_definition(folder)
    exds = definition.create_exds()

    exds.build(finalize_and_save=False)

    # Make sure 'training_set_size = 0.25' has been properly taken into
    # account.
    assert 16 == exds.total_row_count
    assert 4 == len(exds.train_rows)
    assert 12 == len(exds.test_rows)
    assert 4 == exds.matrix_train.X.get_shape()[0]
    assert 4 == exds.matrix_train.Y.get_shape()[0]
    assert 12 == exds.matrix_test.X.get_shape()[0]
    assert 12 == exds.matrix_test.Y.get_shape()[0]

    # Reconstruct the list of all row indices, to make sure the split is
    # consistent.
    all_rows = set(exds.train_rows) | set(exds.test_rows)
    assert set(range(16)) == all_rows
    assert 0 == len(set(exds.train_rows) & set(exds.test_rows))

    # Ensure that any row of exds.matrix is found either in
    # exds.matrix_train or exds.matrix_test, for X and for Y alike.
    # Every row index is visited (including the last one), and the Y
    # pass really compares the Y matrices.
    matrices = (
        ('X', exds.matrix.X, exds.matrix_train.X, exds.matrix_test.X),
        ('Y', exds.matrix.Y, exds.matrix_train.Y, exds.matrix_test.Y),
    )
    for name, full, train, test in matrices:
        for row in range(exds.total_row_count):
            original_row = full.getrow(row)
            if row in exds.train_rows:
                # The position of the index within train_rows is the row
                # number inside the train matrix.
                split_row = train.getrow(exds.train_rows.index(row))
            elif row in exds.test_rows:
                split_row = test.getrow(exds.test_rows.index(row))
            else:
                raise AssertionError(
                    "Row {} not found in neither train nor test {} "
                    "matrices".format(row, name))
            assert DatasetMatrix.sparse_equal(original_row, split_row) is True
예제 #8
0
def testfolders():
    """Create the 'profiling' temp folder and its working subfolders.

    Returns a dict that maps each subfolder name to its path, plus the
    key 'root' for the top-level folder itself.
    """
    root = testutil.ensure_empty_tmp_subfolder('profiling')

    folders = {}
    for name in ('dofCache', 'jht', 'dynamic_adtrees', 'adtrees'):
        subfolder_path = root / name
        subfolder_path.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder_path

    folders['root'] = root
    return folders
def testfolders():
    """Create the 'test_iamb_with_gtests' temp folder and its subfolders.

    Returns a dict that maps each subfolder name to its path, plus the
    key 'root' for the top-level folder itself.
    """
    root = testutil.ensure_empty_tmp_subfolder('test_iamb_with_gtests')

    folders = {}
    for name in ('ci_test_results', 'heuristic_results'):
        subfolder_path = root / name
        subfolder_path.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder_path

    folders['root'] = root
    return folders
예제 #10
0
def testfolders():
    """Create the 'test_demo_ipcmb_with_gtests' temp folder and subfolders.

    Returns a dict that maps each subfolder name to its path, plus the
    key 'root' for the top-level folder itself.
    """
    root = testutil.ensure_empty_tmp_subfolder('test_demo_ipcmb_with_gtests')

    folders = {}
    names = ('dofCache', 'ci_test_results', 'jht', 'adtrees', 'dynamic_adtrees')
    for name in names:
        subfolder_path = root / name
        subfolder_path.mkdir(parents=True, exist_ok=True)
        folders[name] = subfolder_path

    folders['root'] = root
    return folders
예제 #11
0
def test_feature_removal__no_thresholds():
    """Enable threshold-based removal without configuring any thresholds.

    Both removal flags are True, but both threshold dicts are empty, so
    the built exds is expected to keep its original dimensions.
    """
    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_binary_exds_repository__test_feature_removal__no_thresholds')
    definition = default_exds_definition(tmp_folder)

    options = (
        ('remove_features_by_p_thresholds', True),
        ('remove_objectives_by_p_thresholds', True),
        ('probability_thresholds__features', {}),
        ('probability_thresholds__objectives', {}),
    )
    for option, value in options:
        definition.options[option] = value
    exds = definition.create_exds()

    # With no thresholds given, the removal flags alone must not change
    # exds.matrix, exds.matrix_train or exds.matrix_test.
    exds.build()

    assertExDsDimensions(exds, 25, 75, 8, 4)
예제 #12
0
def test_keeping_rows():
    """Narrow a DatasetMatrix down with keep_rows() in two steps.

    An empty selection must raise ValueError. After each valid
    selection X, Y and the row labels are compared with the expected
    values, while the column labels must still equal the defaults. The
    final one-row matrix is handed to check_saving_and_loading().
    """
    def as_sparse(rows):
        # Build a CSR matrix from a dense nested list.
        return scipy.sparse.csr_matrix(numpy.array(rows))

    def verify(matrix, wanted_X, wanted_Y, wanted_row_labels):
        # Row selection must never touch the column labels.
        assert DatasetMatrix.sparse_equal(wanted_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(wanted_Y, matrix.Y) is True
        assert wanted_row_labels == matrix.row_labels
        assert default_column_labels_X() == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Keeping nothing is rejected.
    with pytest.raises(ValueError):
        matrix.keep_rows([])

    # Step 1: keep the rows at indices 1 and 3.
    matrix.keep_rows([1, 3])
    verify(
        matrix,
        as_sparse([
            [5, 6, 7, 8],
            [13, 14, 15, 16]]),
        as_sparse([
            [201, 202],
            [401, 402]]),
        ['row1', 'row3'])

    # Step 2: of the two rows left, keep index 0 (labeled 'row1').
    matrix.keep_rows([0])
    verify(
        matrix,
        as_sparse([[5, 6, 7, 8]]),
        as_sparse([[201, 202]]),
        ['row1'])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__keeping_rows')
    check_saving_and_loading(matrix, tmp_folder)
예제 #13
0
def test_removing_columns_X():
    """Delete two columns of X, one after the other.

    Y, the row labels and the Y column labels must keep their default
    values throughout; only X and its column labels change. The
    resulting two-column matrix is handed to check_saving_and_loading().
    """
    def verify(matrix, wanted_X, wanted_labels_X):
        # Everything except X is compared against the defaults.
        wanted_Y = default_matrix_Y()
        assert DatasetMatrix.sparse_equal(wanted_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(wanted_Y, matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert wanted_labels_X == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    matrix = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(matrix)

    # Drop X column 2 (the third column).
    matrix.delete_column_X(2)
    three_columns = numpy.array([
        [1, 2, 4],
        [5, 6, 8],
        [9, 10, 12],
        [13, 14, 16]])
    verify(
        matrix,
        scipy.sparse.csr_matrix(three_columns),
        ['colx0', 'colx1', 'colx3'])

    # Drop X column 2 again, which is now the last column.
    matrix.delete_column_X(2)
    two_columns = numpy.array([
        [1, 2],
        [5, 6],
        [9, 10],
        [13, 14]])
    verify(
        matrix,
        scipy.sparse.csr_matrix(two_columns),
        ['colx0', 'colx1'])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__removing_columns_X')
    check_saving_and_loading(matrix, tmp_folder)
예제 #14
0
def test_selecting_columns_X():
    """Derive new matrices with select_columns_X() in two steps.

    An empty selection must raise ValueError. Each selection returns a
    new matrix whose X holds only the chosen columns, while Y, the row
    labels and the Y column labels must still equal the defaults. The
    final single-column matrix is handed to check_saving_and_loading().
    """
    def verify(matrix, wanted_X, wanted_labels_X):
        # Everything except X is compared against the defaults.
        wanted_Y = default_matrix_Y()
        assert DatasetMatrix.sparse_equal(wanted_X, matrix.X) is True
        assert DatasetMatrix.sparse_equal(wanted_Y, matrix.Y) is True
        assert default_row_labels() == matrix.row_labels
        assert wanted_labels_X == matrix.column_labels_X
        assert default_column_labels_Y() == matrix.column_labels_Y

    source = DatasetMatrix('testmatrix')
    configure_default_datasetmatrix(source)

    # Selecting nothing is rejected.
    with pytest.raises(ValueError):
        source.select_columns_X([])

    # Step 1: new matrix whose X has only columns 1 and 2.
    selected = source.select_columns_X([1, 2], 'test_matrix_selected_colsX')
    two_columns = numpy.array([
        [2, 3],
        [6, 7],
        [10, 11],
        [14, 15]])
    verify(selected, scipy.sparse.csr_matrix(two_columns), ['colx1', 'colx2'])

    # Step 2: from that result, select X column 0 (labeled 'colx1').
    selected = selected.select_columns_X([0], 'test_matrix_selected_colsX_2')
    one_column = numpy.array([
        [2],
        [6],
        [10],
        [14]])
    verify(selected, scipy.sparse.csr_matrix(one_column), ['colx1'])

    tmp_folder = testutil.ensure_empty_tmp_subfolder(
        'test_datasetmatrix__selecting_colsX')
    check_saving_and_loading(selected, tmp_folder)
예제 #15
0
def test_experiment_run__simple():
    """Run a four-step experiment and verify its output files and locking.

    The experiment must write one log file and one pickle file per
    AlgorithmRun, lock its folder when done, refuse to run while locked,
    and run normally again after the folder is unlocked and the previous
    outputs are deleted.
    """
    def file_names_in(directory):
        # Sorted names of the regular files directly inside `directory`.
        return sorted(
            [entry.name for entry in directory.iterdir() if entry.is_file()])

    # Folders for the ExDs and for the experiment outputs.
    exds_folder = testutil.ensure_empty_tmp_subfolder(
        'test_exds_repository__experiment_run')
    experiments_folder = testutil.ensure_empty_tmp_subfolder(
        'test_experiment_repository__experiment_run')

    # Build the ExDs the experiment will use.
    exds_definition = default_exds_definition(exds_folder,
                                              'test_exds_experiment_run')
    exds = exds_definition.create_exds()
    exds.build()

    # One parameter set per AlgorithmRun, in execution order: Q takes
    # the values 2, 4, 6, 8 and the objective index is always 0.
    algrun_id_format = '{}__test_AlgorithmIGt_BernoulliNB__Q{}_Obj{}'
    algorithm_run_parameters = [
        {
            'ID': algrun_id_format.format(position, q_value, 0),
            'Q': q_value,
            'objective_index': 0
        }
        for position, q_value in enumerate((2, 4, 6, 8))
    ]

    # Create and run the experiment.
    experiment_definition = default_experiment_definition(
        experiments_folder, exds_folder, algorithm_run_parameters)
    experiment_run = experiment_definition.create_experiment_run()
    experiment_run.run()

    run_root = experiments_folder / 'test_experiment_run'

    # One log file per AlgorithmRun is expected.
    log_folder = run_root / 'algorithm_run_logs' / 'main'
    assert Path(log_folder).exists() is True
    expected_log_files = [
        '0__test_AlgorithmIGt_BernoulliNB__Q2_Obj0.log',
        '1__test_AlgorithmIGt_BernoulliNB__Q4_Obj0.log',
        '2__test_AlgorithmIGt_BernoulliNB__Q6_Obj0.log',
        '3__test_AlgorithmIGt_BernoulliNB__Q8_Obj0.log'
    ]
    assert expected_log_files == file_names_in(log_folder)

    # One pickle file per AlgorithmRunDatapoint is expected.
    datapoints_folder = run_root / 'algorithm_run_datapoints' / 'main'
    assert datapoints_folder.exists() is True
    expected_datapoints_files = [
        '0__test_AlgorithmIGt_BernoulliNB__Q2_Obj0.pickle',
        '1__test_AlgorithmIGt_BernoulliNB__Q4_Obj0.pickle',
        '2__test_AlgorithmIGt_BernoulliNB__Q6_Obj0.pickle',
        '3__test_AlgorithmIGt_BernoulliNB__Q8_Obj0.pickle'
    ]
    assert expected_datapoints_files == file_names_in(datapoints_folder)

    # A finished experiment locks its folder, so a second run must be
    # refused.
    assert experiment_run.definition.folder_is_locked() is True
    with pytest.raises(ExperimentFolderLockedException):
        experiment_run.run()

    # Unlock, wipe the previous outputs, and run once more.
    experiment_run.definition.unlock_folder()
    assert experiment_run.definition.folder_is_locked() is False
    for subfolder in ('algorithm_run_logs', 'algorithm_run_datapoints'):
        experiment_run.definition.delete_subfolder(subfolder)

    experiment_run.run()

    # The same logs and datapoint files must have been produced again.
    log_files_after_rerun = file_names_in(log_folder)
    datapoints_files_after_rerun = file_names_in(datapoints_folder)
    assert expected_log_files == log_files_after_rerun
    assert expected_datapoints_files == datapoints_files_after_rerun

    # And the folder must be locked once more.
    assert experiment_run.definition.folder_is_locked() is True
예제 #16
0
def testfolder():
    """Return the emptied temp subfolder 'test_exds_repository__test_build'."""
    folder_name = 'test_exds_repository__test_build'
    return testutil.ensure_empty_tmp_subfolder(folder_name)