Example #1
def load_and_filter_processed_data(info):
    """
    :param info:
    :return:
    """
    assert isinstance(info, dict)
    file_name = '%s_processed.pickle' % info['data_name']
    file_name = data_dir / file_name
    data, cvindices = load_processed_data(file_name = file_name)
    if 'fold_id' in info:
        if 'fold_num' in info:
            data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = info['fold_num'])
        else:
            data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = 0)

    print_log('loaded dataset %s' % file_name)
    return data
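
A minimal usage sketch (hypothetical values; assumes `data_dir` and the helpers above are in scope, and follows the fold-naming convention described in Example #2):

info = {'data_name': 'breastcancer', 'fold_id': 'K05N01', 'fold_num': 1}
data = load_and_filter_processed_data(info)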
Example #2
def test_solution_pool():

    s_pool = SolutionPool()

    data_name = 'breastcancer'
    data_file_name = '%s/%s_processed.csv' % (data_dir, data_name)
    data_file_name = Path(data_file_name)

    random_seed = 1337
    np.random.seed(seed=random_seed)

    ## load data from disk
    data, cvindices = load_processed_data(
        file_name=data_file_name.with_suffix('.pickle'))
    data = filter_data_to_fold(data,
                               cvindices=cvindices,
                               fold_id='K05N01',
                               fold_num=1)

    # about the folds
    # fold_id = K[total # of folds]N[replicate number]
    # 'K05N01' has 5 total folds, N01 means this is the first replicate
    # fold_num = 1 means that we use fold # 1/5 as the test set
    # fold_num = 2 means that we use fold # 2/5 as the test set
    # fold_num = 0 means that we don't use a fold as the test set (i.e., so filtering with fold_num = 0 just returns the full training dataset)
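    # illustrative parse of the convention above (hypothetical, not a codebase helper):
    #   n_folds, replicate = int('K05N01'[1:3]), int('K05N01'[4:6])   # -> (5, 1)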

    mip = ZeroOneLossMIP(data)
    mip.print_flag = True
    mip.set_parallel(True)
    out = mip.solve(time_limit=10)

    # add solutions manually
    s_pool.add(solution=[1, 2, 3],
               coefs=[1, 2],
               objval=5,
               lowerbound=4,
               prediction_constraint=('x', 'y'))
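    # passing equal-length lists adds several solutions in one call (two here):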
    s_pool.add(solution=[[0, 0, 0], [1, 1, 1]],
               coefs=[[1, 0], [1, 1]],
               objval=[0, 1],
               lowerbound=[2, 3])
    preds = s_pool.get_preds([-2, 3])
    small_pool = s_pool.get_solutions_with_pred([-2, 3], 1)

    assert all(preds == [1, -1, 1])
    assert small_pool._df.shape[0] == 2
    s_pool.clear()

    s_pool.add_from_mip(mip)
    print(s_pool)
    s_pool.clear()

    s_pool.add_from_mip(mip, add_full_solution_pool=True)
    print(s_pool)
Example #3
def get_compas_subgroups(info, outcome="arrest"):
    compas_file = data_dir / ("compas_%s_processed.pickle" % outcome)
    with open(compas_file, "rb") as f:
        data = dill.load(f)
        data = filter_data_to_fold(data['data'], data['cvindices'], info['fold_id'], fold_num=1)
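        # 'race_is_causasian' (sic) below matches the spelling used in the dataset's
        # variable names (the same spelling appears in the feature list in Example #11)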
        race_is_caucasian = data['X'][:, data['variable_names'].index('race_is_causasian')]
        race_is_african_american = data['X'][:, data['variable_names'].index('race_is_african_american')]
        race_is_hispanic = data['X'][:, data['variable_names'].index('race_is_hispanic')]
        race_is_other = data['X'][:, data['variable_names'].index('race_is_other')]
        subgroups = [
            'caucasian' if race_is_caucasian[i] else
            'african_american' if race_is_african_american[i] else
            'hispanic' if race_is_hispanic[i] else
            'other' for i in range(len(data['X']))
            ]
        return np.array(subgroups)
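
The chained conditional above collapses mutually exclusive one-hot columns into a single label vector. np.select expresses the same mapping without a Python-level loop (a sketch over the same arrays):

subgroups = np.select(
    [race_is_caucasian > 0, race_is_african_american > 0, race_is_hispanic > 0],
    ['caucasian', 'african_american', 'hispanic'],
    default='other')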
Example #4
def get_summary_row(info):
    # set up directories
    working_dir = output_dir / info['data_name']
    processed_file = working_dir / get_processed_file_name(info)
    discrepancy_file = working_dir / get_discrepancy_file_name(info)
    data_file = data_dir / (info['data_name'] + "_processed.pickle")

    # get the baseline and flipped models
    with open(processed_file, "rb") as f:
        processed = dill.load(f)['results_df']

    # get the discrepancy models
    with open(discrepancy_file, "rb") as f:
        discrepancy = dill.load(f)['results_df']

    # load data
    with open(data_file, "rb") as f:
        data = dill.load(f)
        data = filter_data_to_fold(data['data'],
                                   data['cvindices'],
                                   fold_id=info['fold_id'],
                                   fold_num=1,
                                   include_validation=True)

    X, Y = data['X'], data['Y']
    X_test, Y_test = data['X_validation'], data['Y_validation']

    # make a classifier for each model in the df
    processed['clf'] = processed['coefficients'].apply(
        get_classifier_from_coefficients)
    discrepancy['clf'] = discrepancy['coefficients'].apply(
        get_classifier_from_coefficients)

    # get the baseline classifier
    baseline = processed.query("model_type == 'baseline'").iloc[0]
    baseline_clf = baseline['clf']
    baseline_metrics = get_metrics(baseline_clf, X, Y, X_test, Y_test)

    baseline_metrics.update({"Dataset": info['data_name']})

    return baseline_metrics
Example #5
def get_overview_table_row(info):
    """
    creates a row of the overview data frame
    :param info: dictionary containing data_name, fold_id, fold_num
    :return: dictionary containing all fields for the overview data frame
    """

    ## setup file names
    output_dir = results_dir / info['data_name']

    # file names
    file_names = {
        'data': '%s/%s_processed.pickle' % (data_dir, info['data_name']),
        'baseline': output_dir / get_baseline_file_name(info),
        'discrepancy': output_dir / get_discrepancy_file_name(info),
        'flipped': output_dir / get_processed_file_name(info),
    }

    # load data
    data, cvindices = load_processed_data(file_name=file_names['data'])
    data = filter_data_to_fold(data,
                               cvindices=cvindices,
                               fold_id=info['fold_id'],
                               fold_num=info['fold_num'],
                               include_validation=True)

    XY = np.hstack([data['X'], data['Y'][:, None]])
    XY_test = np.hstack([data['X_validation'], data['Y_validation'][:, None]])
    U = np.unique(XY, axis=0)
    U_test = np.unique(XY_test, axis=0)
    observed_idx = get_common_row_indices(U, U_test)
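    # (get_common_row_indices presumably returns the indices of (x, y) rows common to
    #  U and U_test, so 'n_xy_unobs_test' below counts test points never seen in training)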

    # data-related fields
    row = {
        'data_name': info['data_name'],
        'fold_id': info['fold_id'],
        'fold_num': info['fold_num'],
        #
        'n': data['X'].shape[0],
        'd': data['X'].shape[1] - 1,
        'n_pos': np.sum(data['Y'] == 1),
        'n_neg': np.sum(data['Y'] == -1),
        'n_xy_unique': U.shape[0],
        'n_xy_unobs_test': U_test.shape[0] - len(observed_idx),
        #
        'n_test': data['X_validation'].shape[0],
        'n_test_pos': np.sum(data['Y_validation'] == 1),
        'n_test_neg': np.sum(data['Y_validation'] == -1),
        #
        'has_baseline_results': file_names['baseline'].exists(),
        'has_flipped_results': file_names['flipped'].exists(),
        'has_discrepancy_results': file_names['discrepancy'].exists(),
    }

    # fields from baseline results
    baseline_fields = [
        'baseline_train_error', 'baseline_test_error', 'baseline_ub',
        'baseline_lb', 'baseline_gap', 'baseline_n_equivalent',
        'baseline_time_limit'
    ]
    row.update({k: float('nan') for k in baseline_fields})

    if file_names['baseline'].exists():

        with open(file_names['baseline'], 'rb') as infile:
            baseline_results = dill.load(infile)

        baseline_coefs = baseline_results['pool_df'].query(
            'model_type=="baseline"')['coefficients'].values[0]
        out = baseline_results['baseline_output']
        preds_test = np.sign(data['X_validation'].dot(baseline_coefs))

        row.update({
            'baseline_train_error':
            out['upperbound'],
            'baseline_test_error':
            np.not_equal(preds_test, data['Y_validation']).sum(),
            'baseline_n_equivalent':
            len(baseline_results['equivalent_output']),
            'baseline_time_limit':
            baseline_results['info']['time_limit'],
            'baseline_ub':
            out['upperbound'],
            'baseline_lb':
            out['lowerbound'],
            'baseline_gap':
            out['gap'],
        })

    # initialize discrepancy as 'nan'
    disc_fields = [
        'disc_instances', 'disc_nnz_instances', 'disc_instances_gap_eq_0',
        'disc_instances_gap_leq_0.1', 'disc_instances_gap_leq_0.5',
        'disc_instances_gap_gt_0.5', 'disc_min_epsilon',
        'disc_discrepancy_ratio_min', 'disc_discrepancy_ratio_med',
        'disc_discrepancy_ratio_max', 'disc_discrepancy_min',
        'disc_discrepancy_med', 'disc_discrepancy_max'
    ]
    row.update({k: float('nan') for k in disc_fields})

    if file_names['discrepancy'].exists():
        with open(file_names['discrepancy'], 'rb') as infile:
            discrepancy_results = dill.load(infile)

        n_instances = len(discrepancy_results['epsilon_values'])
        disc_df = discrepancy_results['results_df']

        nnz_df = disc_df.query('total_discrepancy > 0')
        n_nnz = len(nnz_df)
        if len(nnz_df) > 0:
            ratio = nnz_df['total_discrepancy'] / nnz_df['epsilon']
            row.update({
                'disc_min_epsilon': nnz_df['epsilon'].min(),  # smallest epsilon with nonzero discrepancy
                'disc_discrepancy_ratio_min': np.nanmin(ratio),
                'disc_discrepancy_ratio_med': np.nanmedian(ratio),
                'disc_discrepancy_ratio_max': np.nanmax(ratio),
            })

        # stats
        row.update({
            'disc_instances':
            n_instances,
            'disc_nnz_instances':
            n_nnz,
            #
            'disc_instances_gap_eq_0':
            len(disc_df.query('gap == 0.0')),
            'disc_instances_gap_leq_0.1':
            len(disc_df.query('gap <= 0.1')),
            'disc_instances_gap_leq_0.5':
            len(disc_df.query('gap <= 0.5')),
            'disc_instances_gap_gt_0.5':
            len(disc_df.query('gap > 0.5')),
            #
            'disc_discrepancy_min':
            np.nanmin(disc_df['total_discrepancy']),
            'disc_discrepancy_med':
            np.nanmedian(disc_df['total_discrepancy']),
            'disc_discrepancy_max':
            np.nanmax(disc_df['total_discrepancy']),
        })

    #initialize flipped fields
    flipped_fields = [
        'flipped_instances', 'flipped_instances_missing',
        'flipped_instances_gap_eq_0',
        'flipped_instances_gap_leq_0.1', 'flipped_instances_gap_leq_0.5',
        'flipped_instances_gap_gt_0.5', 'flipped_change_in_error_min',
        'flipped_change_in_error_med', 'flipped_change_in_error_max',
        'flipped_change_in_test_error_min', 'flipped_change_in_test_error_med',
        'flipped_change_in_test_error_max'
    ]
    row.update({k: float('nan') for k in flipped_fields})

    if file_names['flipped'].exists():

        with open(file_names['flipped'], 'rb') as infile:
            flipped_results = dill.load(infile)

        flip_df = flipped_results['results_df']
        baseline = flip_df.query('model_type == "baseline"')
        flipped = flip_df.query('model_type == "flipped"')
        change_in_train_error = flipped['train_error'] - baseline[
            'train_error'].values[0]
        change_in_test_error = flipped['validation_error'] - baseline[
            'validation_error'].values[0]

        row.update({
            #
            'flipped_instances':
            len(flipped),
            'flipped_instances_missing':
            flipped_results['n_missing'],
            'flipped_instances_gap_eq_0':
            len(flipped.query('gap == 0.0')),
            'flipped_instances_gap_leq_0.1':
            len(flipped.query('gap <= 0.1')),
            'flipped_instances_gap_leq_0.5':
            len(flipped.query('gap <= 0.5')),
            'flipped_instances_gap_gt_0.5':
            len(flipped.query('gap > 0.5')),
            #
            'flipped_change_in_error_min':
            np.nanmin(change_in_train_error),
            'flipped_change_in_error_med':
            np.nanmedian(change_in_train_error),
            'flipped_change_in_error_max':
            np.nanmax(change_in_train_error),
            #
            'flipped_change_in_test_error_min':
            np.nanmin(change_in_test_error),
            'flipped_change_in_test_error_med':
            np.nanmedian(change_in_test_error),
            'flipped_change_in_test_error_max':
            np.nanmax(change_in_test_error),
        })

    return row
Example #6
time_limit_instance = 60
initialize_mip = True
custom_mip_params = True
search_global_equivalent = False
max_iterations = 1e8  # iteration cap (lower this value to shorten loops during development)
fold_id = 'K01N01'
fold_num = 1

# setup seed
np.random.seed(seed = random_seed)

## load data from disk
data_file_name = '%s/%s_processed.csv' % (data_dir, data_name)
data_file_name = Path(data_file_name)
data, cvindices = load_processed_data(file_name = data_file_name.with_suffix('.pickle'))
data = filter_data_to_fold(data, cvindices = cvindices, fold_id = fold_id, fold_num = fold_num, include_validation=True)

# compress dataset into distinct feature vectors and counts
compressed = compress_data(data)
U, N_pos, N_neg, x_to_u_idx, u_to_x_idx = tuple(compressed[var] for var in ('U', 'N_pos', 'N_neg', 'x_to_u_idx', 'u_to_x_idx'))
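# U holds the distinct feature vectors; N_pos/N_neg count the y = +1 / y = -1
# labels per row of U; x_to_u_idx/u_to_x_idx are index maps between rows of X and rows of U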

selection = pd.DataFrame({'n_pos': N_pos, 'n_neg': N_neg, 'idx': x_to_u_idx})

# solve zero-one loss MIP
mip = ZeroOneLossMIP(data, print_flag = True, parallel_flag = True, random_seed = random_seed)
if custom_mip_params:
    mip.mip = set_mip_parameters(mip.mip, CPX_MIP_PARAMETERS)
out = mip.solve(time_limit = time_limit_instance)
mip.check_solution()
global_coefs = mip.coefficients
global_lb = out['lowerbound']
Example #7
def aggregate_baseline_and_flipped_results(info):
    """
    :param data_name:
    :param fold_id:
    :param fold_num:
    :return:
    """
    assert isinstance(info, dict)
    assert 'data_name' in info and 'fold_id' in info and 'fold_num' in info

    # setup file names
    output_dir = results_dir / info['data_name']
    baseline_results_file = output_dir / get_baseline_file_name(info)
    data_file_name = Path('%s/%s_processed.pickle' % (data_dir, info['data_name']))

    # load baseline results
    assert baseline_results_file.exists()
    with open(baseline_results_file, 'rb') as f:
        baseline = dill.load(f)

    # load data from disk
    assert data_file_name.exists()
    data, cvindices = load_processed_data(file_name = data_file_name)
    data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = info['fold_num'], include_validation = True)

    # load flipped results
    flipped_raw_results_files = [f for f in output_dir.iterdir() if f.suffix == '.pickle' and 'flipped_raw_results' in f.name]
    all_partitions = get_part_id_helper(flipped_raw_results_files)
    flipped_partition = select_partition_for_flipped_training(all_partitions)
    flipped_results_files = flipped_partition['matched_files']
    print_log('partition contains %d raw results files.' % len(flipped_results_files))

    # load files
    flipped = []
    for fname in flipped_results_files:
        with open(fname, 'rb') as f:
            flipped.append(dill.load(f))

    # concatenate data frames
    if len(flipped):
        flipped_df = pd.concat([r['flipped_df'] for r in flipped], sort = False)
        flipped_df['model_type'] = 'flipped'
        results_df = pd.concat([baseline['pool_df'], flipped_df], sort=False).reset_index(drop=True)
    else:
        print("Warning: no flipped model files found during results aggregation.")
        results_df = baseline['pool_df']

    # initialize error metrics
    results_df['train_error'] = float('nan')
    results_df['validation_error'] = float('nan')

    # compute error metrics
    W = np.stack(results_df['coefficients'].values)
    results_df['train_error'] = compute_error_rate_from_coefficients(W = W, X = data['X'], Y = data['Y'])
    if 'X_validation' in data and 'Y_validation' in data:
        results_df['validation_error'] = compute_error_rate_from_coefficients(W = W, X = data['X_validation'], Y = data['Y_validation'])

    # info to return
    now = datetime.now()
    out = {
        #
        'date': now.strftime("%y_%m_%d_%H_%M"),
        'info': info,
        'training_info': {'baseline': baseline['info'], 'flipped_info': [f['info'] for f in flipped]},
        #
        'data_file': baseline['data_file'],
        'baseline_results_file': baseline_results_file,
        'flipped_results_files': flipped_results_files,
        'results_df': results_df,
        'n_missing': len(flipped_partition['missing_parts']),
        #
        }

    return out
Example #8
def train_discrepancy_classifier(info):
    """
    :param info:
    :return:
    """

    print_log('entered train_discrepancy_classifier')

    # dashboard
    for k, v in DISCREPANCY_TRAINING_SETTINGS.items():
        if k not in info:
            info[k] = v

    for k, v in info.items():
        print("info['%s'] = %r" % (k, info[k]))

    # setup files
    output_dir = results_dir / info['data_name']
    output_dir.mkdir(exist_ok = True)

    results_file_name = output_dir / get_discrepancy_file_name(info)
    baseline_file_name = output_dir / get_baseline_file_name(info)
    processed_file_name = output_dir / get_processed_file_name(info)

    # print file names
    print_log('baseline_file_name: %s' % baseline_file_name)
    print_log('results_file_name: %s' % results_file_name)

    # load baseline file
    assert baseline_file_name.exists()
    with open(baseline_file_name, 'rb') as infile:
        baseline_results = dill.load(infile)
    print_log('loaded baseline file %s' % baseline_file_name)

    # setup seed
    np.random.seed(seed = info['random_seed'])

    # load data from disk
    data_file_name = '%s_processed.csv' % info['data_name']
    data_file_name = data_dir / data_file_name
    data, cvindices = load_processed_data(file_name = data_file_name.with_suffix('.pickle'))
    data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = info['fold_num'])
    print_log('loaded dataset %s' % data_file_name)

    # load baseline
    initial_pool = baseline_results['pool_df']
    baseline_stats = baseline_results['baseline_output']
    baseline_coefs = initial_pool.query('model_type == "baseline"')['coefficients'].values[0]

    n_samples = data['X'].shape[0]
    baseline_ub = baseline_stats['upperbound']
    baseline_lb = baseline_stats['lowerbound']
    loss_values = np.arange(baseline_ub, n_samples // 2, step=info['epsilon_step'])
    epsilon_values = sorted(np.array(loss_values - baseline_ub, dtype = int).tolist())
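    # each epsilon is an integer slack: the number of extra training mistakes
    # allowed relative to the baseline optimum (baseline_ub)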
    print_log('%1.0f values of epsilon in {%1.0f,...,%1.0f}' % (len(epsilon_values), min(epsilon_values), max(epsilon_values)))

    # load existing results file if it exists
    if info['load_from_disk'] and results_file_name.exists():

        # load from disk
        with open(results_file_name, 'rb') as infile:
            results = dill.load(infile)

        for k in ['data_name', 'fold_id', 'fold_num']:
            assert info[k] == results[k], 'mismatch in loaded results'
        assert np.isclose(baseline_coefs, results['baseline_coefs']).all()
        assert np.isclose(epsilon_values, results['epsilon_values']).all()
        print_log('loaded existing results from disk %s' % results_file_name)

    else:

        results = {
            'data_name': info['data_name'],
            'fold_id': info['fold_id'],
            'fold_num': info['fold_num'],
            'baseline_ub': baseline_ub,
            'baseline_lb': baseline_lb,
            'baseline_coefs': baseline_coefs,
            'epsilon_values': epsilon_values,
            }

        results['output'] = {e: None for e in results['epsilon_values']}
        print_log('did not find file with existing results on disk: %s' % results_file_name)

    initial_pool = build_coefficient_pool(info)  # overwrites initial pool above with superset (also different format)
    initial_coefs = np.vstack(initial_pool['coefficients'])
    initial_errors = initial_pool['error_ub']

    # build discrepancy MIP
    mip = DiscrepancyMIP(data, baseline_coefs, baseline_stats, print_flag = info['print_flag'], parallel_flag = True, random_seed = info['random_seed'])
    mip.mip = set_mip_parameters(mip.mip, CPX_MIP_PARAMETERS)
    mip.bound_error_gap(lb = int(np.floor(baseline_lb - baseline_ub)))

    # record date and output file
    now = datetime.now()
    results['results_file'] = results_file_name

    # compute baseline classifier gap
    g = mip.get_classifier(coefs = mip.baseline_coefs)
    G = g.predict(data['X'])
    err_g = np.not_equal(data['Y'], G).sum()

    # run training
    epsilon_values = results['epsilon_values']
    total_iterations = len(epsilon_values)
    training_epsilon_values = sorted([e for (e, out) in results['output'].items() if out is None])
    remaining_time = info['time_limit']
    for e in training_epsilon_values:

        i = epsilon_values.index(e) + 1  # 1-based for progress messages

        # record start time
        start_time = time.time()
        print_log('=' * 70)
        print_log('started training for epsilon = %1.0f, iteration %d/%d' % (e, i, total_iterations))
        print("\n")

        # setup MIP
        mip.bound_error_gap(ub = e)

        # add initial solutions that are valid
        if info['initialize']:
            keep_idx = np.less_equal(initial_errors, e + err_g)
            feasible_coefs = initial_coefs[keep_idx, :]
            print_log('initializing with %d solutions' % feasible_coefs.shape[0])
            for w in feasible_coefs:
                mip.add_initial_solution_with_coefficients(coefs = w)

        train_time = min(remaining_time, info['instance_time_limit'])
        out = mip.solve(time_limit = train_time)

        # compute current classifier stats
        h = mip.get_classifier()
        H = h.predict(data['X'])
        err_h = np.not_equal(H, data['Y']).sum()

        # record stats
        out['epsilon'] = e
        out['coefficients'] = mip.coefficients
        out['total_error_gap'] = mip.solution.get_values(mip.names['total_error_gap'])
        out['total_discrepancy'] = np.not_equal(H, G).sum()
        out['total_agreement'] = mip.solution.get_values(mip.names['total_agreement'])

        # check solution
        print_log(('=' * 20) + ' WARNINGS ' + ('=' * 20))
        mip.check_solution()

        # store solution
        results['output'][e] = out

        # print details about solution
        msg = [('=' * 20) + ' SUMMARY ' + ('=' * 20),
               '-' * 70,
               'mistakes',
               'R(h): %1.0f' % err_h,
               'R(g): %1.0f' % err_g,
               'R(h)-R(g): %1.0f' % (err_h - err_g),
               '-' * 70,
               'alignment',
               '#[h(x)==g(x)]: %1.0f' % np.equal(H, G).sum(),
               '#[h(x)!=g(x)]: %1.0f' % np.not_equal(H, G).sum(),
               '-' * 70,
               'mip output',
               'max error gap (=epsilon): %1.0f' % e,
               'total error gap: %1.0f' % out['total_error_gap'],
               'total discrepancy (objval): %1.0f' % out['total_discrepancy'],
               'total agreement (objval): %1.0f' % out['total_agreement'],
               'objval ub: %1.0f' % out['upperbound'],
               'objval lb: %1.0f' % out['lowerbound'],
               'objval gap: %1.2f%%' % (100.0 * out['gap']),
               '=' * 70,
               ]
        msg = '\n'.join(msg)
        print_log(msg)

        # print completion message
        print_log('completed training for epsilon = %1.0f, iteration %d/%d' % (e, i, total_iterations))

        # save file
        results['date'] = now.strftime("%y_%m_%d_%H_%M")
        results_df = [out for out in results['output'].values() if out is not None]
        results_df = pd.DataFrame(results_df)
        results_df = results_df.set_index('epsilon')
        results['results_df'] = results_df
        save_results_to_disk(results, file_name = results_file_name)

        # check whether to stop
        time_elapsed = time.time() - start_time
        remaining_time = remaining_time - time_elapsed
        if remaining_time < 30.0 + info['instance_time_limit']:
            print_log('STOPPING TRAINING: out of time')
            print(results_df[['epsilon', 'gap', 'total_discrepancy', 'total_agreement']])
            break

    # print final results
    print_log('leaving train_discrepancy_classifier')
    return results
Example #9
def train_flipped_classifiers(info):
    """
    :param info:
    :return:
    """

    print_log('entered train_flipped_classifiers')
    print_log('-' * 50)

    # dashboard
    for k, v in FLIPPED_TRAINING_SETTINGS.items():
        if k not in info:
            info[k] = v

    for k, v in info.items():
        print("info['%s'] = %r" % (k, info[k]))

    assert "part_id" in info
    output_dir = results_dir / info['data_name']
    output_dir.mkdir(exist_ok = True)

    baseline_file_name = output_dir / get_baseline_file_name(info)
    results_file_name = output_dir / get_flipped_file_name(info)
    print_log('baseline_file_name: %s' % baseline_file_name)
    print_log('results_file_name: %s' % results_file_name)

    # load baseline file
    assert baseline_file_name.exists()
    print_log('loading results from %s' % baseline_file_name)
    with open(baseline_file_name, 'rb') as infile:
        baseline_results = dill.load(infile)
    print_log('loaded results from %s' % baseline_file_name)

    # setup pool
    pool = SolutionPool(df = baseline_results['pool_df'])

    # setup baseline classifier
    coefs = baseline_results['pool_df'].query('model_type == "baseline"')['coefficients'].values[0]
    h = ClassificationModel(predict_handle = lambda X: np.sign(X.dot(coefs)),  # assumes the first column of X is the intercept
                            model_info = {'coefficients': coefs[1:], 'intercept': coefs[0]},
                            model_type = ClassificationModel.LINEAR_MODEL_TYPE)

    # load data from disk
    data_file_name = '%s_processed.csv' % info['data_name']
    data_file_name = data_dir / data_file_name
    data, cvindices = load_processed_data(file_name = data_file_name.with_suffix('.pickle'))
    data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = info['fold_num'])
    print_log('loaded dataset %s' % data_file_name)

    # compress dataset into distinct feature vectors and counts
    compressed = compress_data(data)
    for k, v in compressed.items():
        compressed[k] = filter_indices_to_part(v, part_id = info['part_id'])
    U, N_pos, N_neg, x_to_u_idx, u_to_x_idx = tuple(compressed[var] for var in ('U', 'N_pos', 'N_neg', 'x_to_u_idx', 'u_to_x_idx'))

    # setup MIP
    mip = ZeroOneLossMIP(data, print_flag = True, parallel_flag = True, random_seed = info['random_seed'])
    mip.mip = set_mip_parameters(mip.mip, CPX_MIP_PARAMETERS)
    mip.mip.parameters.emphasis.mip.set(3)
    mip.set_total_mistakes(lb = baseline_results['baseline_output']['lowerbound'])

    # solve flipped versions
    results = []
    total_iterations = U.shape[0]
    start_time = time.process_time()

    #### START INITIALIZATION
    # pre-initialize the mip from disk
    if info['load_from_disk']:

        # load saved coefficients from all methods
        initial_pool = build_coefficient_pool(info)
        if initial_pool.shape[0] > 0:

            print_log('initializing with %d solutions' % len(initial_pool))

            # use the best lowerbound we have
            max_error_lb = np.ceil(initial_pool['error_lb'].max())
            if np.isfinite(max_error_lb):
                mip.set_total_mistakes(lb=max_error_lb)

            # supply all possible initializations
            for k, row in initial_pool.iterrows():
                mip.add_initial_solution_with_coefficients(coefs=row['coefficients'])

    #### END INITIALIZATION

    for k, x in enumerate(U):

        print_log('iteration %d/%d' % (k + 1, total_iterations))
        print_log('solution pool size: %d' % pool.size)

        yhat = int(h.predict(x[None, :]))

        # adjust prediction constraints
        mip.clear_prediction_constraints()
        mip.add_prediction_constraint(x = x, yhat = -yhat, name = 'pred_constraint')
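        # (this constraint forces the trained model to flip the baseline prediction on x)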


        # initialize model
        good_pool = pool.get_solutions_with_pred(x, -yhat)
        if good_pool.size > 0:
            s = good_pool.get_best_solution()
            mip.add_initial_solution(solution = s['solution'], objval = s['objval'], name = 'init_from_pred_cons')
            mip.add_initial_solution_with_coefficients(coefs = s['coefficients'])
            print_log('initialized\nobjval:{}'.format(s['objval']))

        # solve MIP
        out = mip.solve(time_limit = info['time_limit_flipped'])
        mip.check_solution()

        # update solution pool
        pool.add_from_mip(mip, prediction_constraint = (x, yhat))

        # update out
        out.update(
                {
                    'k': k,
                    'i': np.flatnonzero(u_to_x_idx == k).tolist(),
                    'x': x,
                    'n_pos': N_pos[k],
                    'n_neg': N_neg[k],
                    'coefficients': mip.coefficients,
                    'elapsed_time': time.process_time() - start_time,
                    'prediction_constraint': (x, yhat),
                    'solution': list(out['values']),
                    }
                )
        results.append(out)

    results_df = pd.DataFrame(results)
    results_df.drop(['upperbound', 'values'], axis=1, inplace=True)

    # create output dictionary
    now = datetime.now()
    results = {
        'date': now.strftime("%y_%m_%d_%H_%M"),
        'info': info,
        'part_id': info['part_id'],
        'data_file': data_file_name,
        'baseline_file': baseline_file_name,
        'results_file': results_file_name,
        'flipped_df': results_df,
        }

    print_log('leaving train_flipped_classifiers')
    return results
Example #10
def train_baseline_classifier(info):
    """
    trains the baseline classifier via zero-one loss minimization
    :param info:
    :return:
    """
    print_log('entered train_baseline_classifier')

    # dashboard
    for k, v in BASELINE_TRAINING_SETTINGS.items():
        if k not in info:
            info[k] = v

    print_log('settings')
    print_log('-' * 50)
    for k, v in info.items():
        print_log("info['%s'] = %r" % (k, info[k]))

    # name and create output directory
    output_dir = results_dir / info['data_name']
    output_dir.mkdir(exist_ok = True)

    # set output results file
    results_file_name = output_dir / get_baseline_file_name(info)
    print_log('-' * 50)
    print_log('saving results in %s' % results_file_name)

    # load dataset
    data_file_name = '%s_processed.pickle' % info['data_name']
    data_file_name = data_dir / data_file_name
    data, cvindices = load_processed_data(file_name = data_file_name)
    data = filter_data_to_fold(data, cvindices = cvindices, fold_id = info['fold_id'], fold_num = info['fold_num'])
    print_log('loaded dataset %s' % data_file_name)

    # solve zero-one loss MIP
    mip = ZeroOneLossMIP(data, print_flag = info['print_flag'], parallel_flag = True, random_seed = info['random_seed'], error_constraint_type = info['error_constraint_type'])
    mip.mip = set_mip_parameters(mip.mip, CPX_MIP_PARAMETERS)

    # load initializations from disk
    if info['load_from_disk']:

        # load saved coefficients from all methods
        initial_pool = build_coefficient_pool(info)
        if initial_pool.shape[0] > 0:

            print_log('initializing with %d solutions' % len(initial_pool))

            # use the best lowerbound we have
            max_error_lb = np.ceil(initial_pool['error_lb'].max())
            if np.isfinite(max_error_lb):
                mip.set_total_mistakes(lb=max_error_lb)

            # supply all possible initializations
            for k, row in initial_pool.iterrows():
                mip.add_initial_solution_with_coefficients(coefs=row['coefficients'])

            # report expected objective value
            print_log('upperbound should be at most %d' % min(initial_pool['error_ub']))

    out = mip.solve(time_limit = info['time_limit'])
    baseline_output = mip.solution_info
    print_log(str(baseline_output))
    mip.check_solution(debug_flag = True)

    base_df = {
        'solution': list(out['values']),
        'objval': out['objval'],
        'coefficients': mip.coefficients,
        'lowerbound': out['lowerbound'],
        'prediction_constraint': None,
        'model_type': 'baseline'
        }

    # initialize solution pool
    pool = SolutionPool(mip = mip)

    # search for equivalent models
    if info['equivalent_time_limit'] > 0:
        equivalent_output, pool = mip.enumerate_equivalent_solutions(pool, time_limit = info['equivalent_time_limit'])
        print_log('{} global equivalent models found.'.format(len(equivalent_output)))
    else:
        equivalent_output = []

    # generate additional solutions using populate
    if info['populate_time_limit'] > 0:
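        # two passes: max_gap = 0 collects optimal (equivalent) solutions; the
        # second pass admits solutions with an objective gap of up to n/2 mistakes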
        mip.populate(max_gap = 0, time_limit = info['populate_time_limit'])
        mip.populate(max_gap = data['X'].shape[0] // 2, time_limit = info['populate_time_limit'])
        pool.add_from_mip(mip)

    # get alternative models
    pool_df = pool.get_df()
    pool_df['model_type'] = 'alternative'
    pool_df = pd.concat([pool_df, pd.DataFrame([base_df])], sort = False, ignore_index = True)  # DataFrame.append was removed in pandas 2.x

    # get time
    now = datetime.now()

    results = {
        'date': now.strftime("%y_%m_%d_%H_%M"),
        'info': info,
        'data_file': data_file_name,
        'results_file': results_file_name,
        'pool_df': pool_df,
        'baseline_output': baseline_output,
        'equivalent_output': equivalent_output,
        }

    print_log('leaving train_baseline_classifier')
    return results
Example #11
processed_file = output_dir / get_processed_file_name(info)
discrepancy_file = output_dir / get_discrepancy_file_name(info)
data_file = data_dir / (altered_info['data_name'] + "_processed.pickle")

# get the models
with open(processed_file, "rb") as f:
    processed = dill.load(f)
with open(discrepancy_file, "rb") as f:
    discrepancy = dill.load(f)

# get the data
with open(data_file, "rb") as f:
    data = dill.load(f)
    data = filter_data_to_fold(data['data'],
                               data['cvindices'],
                               info['fold_id'],
                               fold_num=1,
                               include_validation=True)

# get the unbalanced data
unbalanced = load_data_from_csv(data_file.with_suffix(".csv"))

#
if "compas" in data_name and "_small" in data_name:
    for feat in [
            'race_is_causasian', 'race_is_african_american',
            'race_is_hispanic', 'race_is_other'
    ]:
        if feat in data['variable_names']:
            data = remove_variable(data, feat)
        if feat in unbalanced['variable_names']:
            unbalanced = remove_variable(unbalanced, feat)
Example #12
def multiplicity_table(info):

    # set up directories
    working_dir = output_dir / info['data_name']
    processed_file = working_dir / get_processed_file_name(info)
    discrepancy_file = working_dir / get_discrepancy_file_name(info)

    # get the baseline and flipped models
    with open(processed_file, "rb") as f:
        processed = dill.load(f)['results_df']

    # get the discrepancy models
    with open(discrepancy_file, "rb") as f:
        discrepancy = dill.load(f)['results_df']

    # load data
    with open(info['data_file'], "rb") as f:
        data = dill.load(f)
        data = filter_data_to_fold(data['data'], data['cvindices'], fold_id=info['fold_id'], fold_num=1, include_validation=True)

    X, Y = data['X'], data['Y']
    X_test, Y_test = data['X_validation'], data['Y_validation']

    # make a classifier for each model in the df
    processed['clf'] = processed['coefficients'].apply(get_classifier_from_coefficients)
    discrepancy['clf'] = discrepancy['coefficients'].apply(get_classifier_from_coefficients)

    # get the baseline classifier
    baseline = processed.query("model_type == 'baseline'").iloc[0]
    baseline_clf = baseline['clf']
    baseline_scores = baseline_clf.score(X)
    baseline_train_preds = baseline_clf.predict(X)
    baseline_test_preds = baseline_clf.predict(X_test)
    baseline_train_error = np.sum(baseline_train_preds != Y)
    baseline_train_error_rate = np.mean(baseline_train_preds != Y)
    baseline_test_error = np.sum(baseline_test_preds != Y_test)
    baseline_test_error_rate = np.mean(baseline_test_preds != Y_test)

    # get the equivalent classifier
    epsilon = discrepancy.query("epsilon <= %s" % int(0.01 * X.shape[0]))['epsilon'].max()
    equivalent = discrepancy.query("epsilon == %s" % epsilon).iloc[0]
    equivalent_clf = equivalent['clf']
    equivalent_train_preds = equivalent_clf.predict(X)
    equivalent_test_preds = equivalent_clf.predict(X_test)
    equivalent_train_error = np.sum(equivalent_train_preds != Y)
    equivalent_train_error_rate = np.mean(equivalent_train_preds != Y)
    equivalent_test_error = np.sum(equivalent_test_preds != Y_test)
    equivalent_test_error_rate = np.mean(equivalent_test_preds != Y_test)
    equivalent_train_discrepancy = np.sum(equivalent_train_preds != baseline_train_preds)
    equivalent_test_discrepancy = np.sum(equivalent_test_preds != baseline_test_preds)
    equivalent_train_discrepancy_rate = np.mean(equivalent_train_preds != baseline_train_preds)
    equivalent_test_discrepancy_rate = np.mean(equivalent_test_preds != baseline_test_preds)

    is_flipped = np.logical_and(baseline_train_preds == 1, equivalent_train_preds == -1)
    is_flipped_nonzero = np.logical_and(is_flipped, abs(baseline_scores) > 0.001)
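    # instances that the baseline labels +1 and the near-equivalent model flips to -1,
    # restricted to points off the baseline decision boundary (|score| > 0.001)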


    flipped_train_errors = []
    for instance in np.unique(X[is_flipped_nonzero], axis=0):
        # get the train error of the best flipped model for each instance
        instance = np.expand_dims(instance, 0)
        baseline_pred = baseline_clf.predict(instance)[0]
        preds = processed.apply(df_predict_handle, axis=1, x=instance)
        min_flipped_train_error = processed[preds != baseline_pred]['objval'].min()
        flipped_model = processed[preds != baseline_pred].query('objval == %s' % min_flipped_train_error).iloc[0]
        flipped_train_errors.append({'min_flipped_train_error': min_flipped_train_error,
                                     'instance': instance,
                                     'model': flipped_model})

    # choose the instance to show
    flipped_info = min(flipped_train_errors, key=lambda x: x["min_flipped_train_error"])
    x_i = flipped_info['instance']

    # get the flipped classifier
    flipped = flipped_info['model']
    flipped_clf = flipped['clf']
    flipped_train_preds = flipped_clf.predict(X)
    flipped_test_preds = flipped_clf.predict(X_test)
    flipped_train_error = np.sum(flipped_train_preds != Y)
    flipped_train_error_rate = np.mean(flipped_train_preds != Y)
    flipped_test_error = np.sum(flipped_test_preds != Y_test)
    flipped_test_error_rate = np.mean(flipped_test_preds != Y_test)
    flipped_train_discrepancy = np.sum(flipped_train_preds != baseline_train_preds)
    flipped_test_discrepancy = np.sum(flipped_test_preds != baseline_test_preds)
    flipped_train_discrepancy_rate = np.mean(flipped_train_preds != baseline_train_preds)
    flipped_test_discrepancy_rate = np.mean(flipped_test_preds != baseline_test_preds)


    # build one row per model and construct the frame once (DataFrame.append was removed in pandas 2.x)
    rows = [{'model_type': 'baseline',
             'coefficients': baseline['coefficients'],
             'train_error_rate': baseline_train_error_rate,
             'test_error_rate': baseline_test_error_rate,
             'train_discrepancy_rate': 0,
             'test_discrepancy_rate': 0,
             'score_xi': baseline_clf.score(x_i)[0],
             'prediction_xi': baseline_clf.predict(x_i)[0]},
            {'model_type': 'discrepancy',
             'coefficients': equivalent['coefficients'],
             'train_error_rate': equivalent_train_error_rate,
             'test_error_rate': equivalent_test_error_rate,
             'train_discrepancy_rate': equivalent_train_discrepancy_rate,
             'test_discrepancy_rate': equivalent_test_discrepancy_rate,
             'score_xi': equivalent_clf.score(x_i)[0],
             'prediction_xi': equivalent_clf.predict(x_i)[0]},
            {'model_type': 'flipped',
             'coefficients': flipped['coefficients'],
             'train_error_rate': flipped_train_error_rate,
             'test_error_rate': flipped_test_error_rate,
             'train_discrepancy_rate': flipped_train_discrepancy_rate,
             'test_discrepancy_rate': flipped_test_discrepancy_rate,
             'score_xi': flipped_clf.score(x_i)[0],
             'prediction_xi': flipped_clf.predict(x_i)[0]}]

    results_df = pd.DataFrame(rows, columns=['model_type', 'coefficients',
                                             'train_error_rate', 'test_error_rate',
                                             'train_discrepancy_rate', 'test_discrepancy_rate',
                                             'score_xi', 'prediction_xi'])


    x_i_features = pd.Series(x_i[0], index=data['variable_names'])
    results = {'results_df': results_df, 'x_i': x_i_features}
    return results
Example #13
def subgroup_ambiguity_analysis(info, subgroup_getter):

    # set up directories
    output_dir = paper_dir / info['data_name']
    processed_file = output_dir / get_processed_file_name(info)
    data_file = data_dir / (info['data_name'] + "_processed.pickle")

    # get the models
    with open(processed_file, "rb") as f:
        processed = dill.load(f)

    # get the data
    with open(data_file, "rb") as f:
        data = dill.load(f)
        data = filter_data_to_fold(data['data'], data['cvindices'], info['fold_id'], fold_num=1)

    # make a classifier for each model in the df
    results = processed['results_df']
    results['clf'] = results['coefficients'].apply(get_classifier_from_coefficients)

    # get the baseline classifier
    assert results.query("model_type == 'baseline'").shape[0] == 1
    baseline = results.query("model_type == 'baseline'").iloc[0]
    baseline_clf = baseline['clf']
    baseline_train_error = baseline['train_error']

    # compress data and get subgroups vector for analysis (e.g. race)
    U, u_first_x_idx, counts = np.unique(data['X'], axis=0, return_counts=True, return_index=True)
    subgroups = subgroup_getter()
    subgroups = subgroups[u_first_x_idx]  # one subgroup label per distinct feature vector

    # get the train error of the best flipped model for each instance
    flipped_train_errors = []
    for i, instance in enumerate(U):
        instance = np.expand_dims(instance, 0)
        baseline_pred = baseline_clf.predict(instance)[0]
        preds = results.apply(df_predict_handle, axis=1, x=instance)

        min_flipped_train_error = results[preds != baseline_pred]['train_error'].min()
        flipped_train_errors.append(min_flipped_train_error)

    # subgroup analysis dataframe
    df = pd.DataFrame({
        'subgroup': subgroups,
        'flipped_error': flipped_train_errors,
        'num_instances': counts,
        })
    # get number of instances per subgroup
    group_counts = {grp: df.query("subgroup == '%s'" % grp)['num_instances'].sum() for grp in np.unique(df['subgroup'])}

    # check the percent of epsilon-flippable instances per subgroup
    epsilon = 0.01
    subgroup_multiplicity = {}
    for sgroup in np.unique(df['subgroup']):
        grp = df.query("subgroup == '%s'" % sgroup)
        flippable = grp['flipped_error'] <= baseline_train_error + epsilon
        num_flippable = grp[flippable]['num_instances'].sum()
        pct_flippable = np.round(num_flippable / group_counts[sgroup], 4)

        subgroup_multiplicity[sgroup] = {'num_flippable': num_flippable,
                                         'pct_flippable': pct_flippable,
                                         'num_instances': group_counts[sgroup],
                                         }
    return subgroup_multiplicity
    return subgroup_multiplicity
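
A hypothetical invocation, reusing the subgroup helper from Example #3 (the lambda adapts it to the zero-argument subgroup_getter interface):

info = {'data_name': 'compas_arrest', 'fold_id': 'K05N01'}
multiplicity = subgroup_ambiguity_analysis(info, lambda: get_compas_subgroups(info, outcome='arrest'))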
Example #14
def subgroup_disparity_analysis(info, subgroup_getter, protected_subgroup, epsilon=0.01):

    # set up directories
    output_dir = paper_dir / info['data_name']
    processed_file = output_dir / get_processed_file_name(info)
    discrepancy_file = output_dir / get_discrepancy_file_name(info)
    data_file = data_dir / (info['data_name'] + "_processed.pickle")

    # get the models
    with open(processed_file, "rb") as f:
        processed = dill.load(f)
    with open(discrepancy_file, "rb") as f:
        discrepancy = dill.load(f)

    # get the data
    with open(data_file, "rb") as f:
        data = dill.load(f)
        data = filter_data_to_fold(data['data'], data['cvindices'], info['fold_id'], fold_num=1, include_validation=True)

    X, Y = data['X'], data['Y']
    X_test, Y_test = data['X_validation'], data['Y_validation']
    subgroups = subgroup_getter()

    # make a classifier for each model in the df
    proc_results = processed['results_df']
    disc_results = discrepancy['results_df']

    proc_results['clf'] = proc_results['coefficients'].apply(get_classifier_from_coefficients)
    disc_results['clf'] = disc_results['coefficients'].apply(get_classifier_from_coefficients)

    # get the baseline classifier
    assert proc_results.query("model_type == 'baseline'").shape[0] == 1
    baseline = proc_results.query("model_type == 'baseline'").iloc[0]
    baseline_clf = baseline['clf']
    baseline_train_error = baseline['train_error']
    baseline_disparity = get_disparity(baseline_clf, X=X, subgroups=subgroups, protected_name=protected_subgroup)

    proc_metrics = pd.DataFrame.from_records(proc_results['clf'].apply(get_metrics, X=X, Y=Y, X_test=X_test, Y_test=Y_test, metrics=["train_error", "test_error"]))
    proc_results = pd.concat([proc_results, proc_metrics], axis=1)

    disc_metrics = pd.DataFrame.from_records(disc_results['clf'].apply(get_metrics, X=X, Y=Y, X_test=X_test, Y_test=Y_test, metrics=["train_error", "test_error"]))
    disc_results = pd.concat([disc_results, disc_metrics], axis=1)

    proc_level_set = proc_results.query('train_error <= %s' % (baseline_train_error + epsilon))
    disc_level_set = disc_results.query('train_error <= %s' % (baseline_train_error + epsilon))
    level_set_clfs = pd.concat([proc_level_set['clf'], disc_level_set['clf']])  # Series.append was removed in pandas 2.x

    all_metrics = pd.concat([proc_metrics, disc_metrics], axis=0).reset_index(drop=True)
    all_metrics_no_baseline = all_metrics.query("train_error != %s" % all_metrics['train_error'].min())
    train_error_of_best_test = all_metrics_no_baseline['train_error'][all_metrics_no_baseline['test_error'].idxmin()]
    empirical_epsilon = train_error_of_best_test - baseline_train_error

    disparities = level_set_clfs.apply(get_disparity, X=X, subgroups=subgroups, protected_name=protected_subgroup)
    # keep the disparity that is smallest in absolute value, with its own sign
    smallest_disparity = disparities.values[np.argmin(np.abs(disparities.values))]

    results = {"baseline_disparity": baseline_disparity,
               "best_disparity": smallest_disparity,
               "difference": baseline_disparity - smallest_disparity,
               "epsilon": epsilon,
               "empirical_epsilon": empirical_epsilon,
               "level_set_size": len(disparities),
               "error_&_disparity_level_set_size": sum(abs(disparities - baseline_disparity) <= epsilon),
               }

    results = {k: num_to_signed(v) for k, v in results.items()}
    return results