예제 #1
0
def evaluation_phase(args_in,
                     model,
                     data_mtxs,
                     x_data,
                     x_data_clean,
                     target_errors_cell,
                     data_loader,
                     iterations=1):
    """Collect outlier-detection and repair metrics for the current model state.

    Args:
        args_in: run configuration; ``dataset_defs`` and ``l21_method`` are
            read here and the whole object is forwarded to ``repair_phase``.
        model: model object; its ``eval()`` method is called before scoring.
        data_mtxs: mapping providing at least the ``'S'`` and ``'LD'`` entries.
        x_data: data handed to ``error_detection`` and ``repair_phase``.
        x_data_clean: forwarded unchanged to ``repair_phase``.
        target_errors_cell: tensor of per-cell error flags; a row is treated
            as erroneous when the sum of its flags is positive.
        data_loader: not used by this function.
        iterations: not used by this function.

    Returns:
        ``(outlier_metrics, repair_metrics)``: an ``OrderedDict`` with the raw
        scores plus AUC / AVPR values, and whatever ``repair_phase`` returns.
    """

    def _to_numpy(tensor):
        # bring a tensor to host memory and expose it as a numpy array
        return tensor.cpu().numpy()

    model.eval()

    # Cell- and row-level outlier scores, in the "S" and "recon" variants
    # produced by error_detection
    score_cell_S, score_row_S, score_cell_recon, score_row_recon = error_detection(
        args_in.dataset_defs, data_mtxs['S'], x_data, data_mtxs['LD'],
        args_in.l21_method)

    score_cell_S = _to_numpy(score_cell_S)
    score_cell_recon = _to_numpy(score_cell_recon)
    score_row_S = _to_numpy(score_row_S)
    score_row_recon = _to_numpy(score_row_recon)

    # Ground truth as uint8 arrays: per cell, and per row (any flagged cell)
    cell_truth = _to_numpy(target_errors_cell).astype(np.uint8)
    row_truth = (cell_truth.sum(axis=1) > 0).astype(np.uint8)

    # Cell-level metrics (aggregate value + per-feature vector)
    auc_cell_S, auc_feats_S = get_auc_metrics(cell_truth, score_cell_S)
    avpr_cell_S, avpr_feats_S = get_avpr_metrics(cell_truth, score_cell_S)
    auc_cell_recon, auc_feats_recon = get_auc_metrics(cell_truth,
                                                      score_cell_recon)
    avpr_cell_recon, avpr_feats_recon = get_avpr_metrics(cell_truth,
                                                         score_cell_recon)

    # Fill the result mapping in its fixed key order; row-level metrics are
    # computed on the fly as they are inserted
    outlier_metrics = OrderedDict()
    outlier_metrics['score_cell_S'] = score_cell_S
    outlier_metrics['score_row_S'] = score_row_S
    outlier_metrics['score_cell_recon'] = score_cell_recon
    outlier_metrics['score_row_recon'] = score_row_recon
    outlier_metrics['auc_cell_S'] = auc_cell_S
    outlier_metrics['avpr_cell_S'] = avpr_cell_S
    outlier_metrics['auc_cell_recon'] = auc_cell_recon
    outlier_metrics['avpr_cell_recon'] = avpr_cell_recon
    outlier_metrics['auc_row_S'] = auc_compute(row_truth, score_row_S)
    outlier_metrics['avpr_row_S'] = avpr_compute(row_truth, score_row_S)
    outlier_metrics['auc_row_recon'] = auc_compute(row_truth, score_row_recon)
    outlier_metrics['avpr_row_recon'] = avpr_compute(row_truth,
                                                     score_row_recon)
    outlier_metrics['avpr_per_feature'] = [avpr_feats_S, avpr_feats_recon]
    outlier_metrics['auc_per_feature'] = [auc_feats_S, auc_feats_recon]

    # Repair analysis (receives the numpy version of the cell flags)
    repair_metrics = repair_phase(args_in, model, data_mtxs, x_data,
                                  x_data_clean, cell_truth)

    return outlier_metrics, repair_metrics
예제 #2
0
def get_auc_metrics(target_matrix, score_matrix):
    """Compute per-feature AUC values and their macro average.

    Args:
        target_matrix: 2-D array-like of error flags, indexed ``[row, feature]``.
        score_matrix: 2-D array-like of outlier scores with the same layout.

    Returns:
        ``(macro_auc, auc_feats)``. ``auc_feats`` is a float array with one
        entry per feature: the value returned by ``auc_compute`` when the
        feature has at least one flagged cell, and the sentinel ``-10.``
        otherwise. ``macro_auc`` is the mean over the non-negative entries of
        ``auc_feats``, or NaN when no feature could be scored.
    """

    n_feats = target_matrix.shape[1]
    auc_feats = np.zeros(n_feats)

    for ii in range(n_feats):
        feat_targets = target_matrix[:, ii]
        if feat_targets.any():
            auc_feats[ii] = auc_compute(feat_targets, score_matrix[:, ii])
        else:
            # no positive label in this feature: mark it as "not scored"
            auc_feats[ii] = -10.

    # macro average of auc through feature set (sentinels are excluded)
    scored = auc_feats[auc_feats >= 0]
    if scored.size == 0:
        # Averaging an empty selection would emit a RuntimeWarning on its way
        # to NaN; return NaN directly instead.
        return np.nan, auc_feats

    return scored.mean(), auc_feats
예제 #3
0
def main(args):
    """Fit a One-Class SVM on the training split and report row-level metrics.

    Reads ``data_folder``, ``batch_size``, ``is_one_hot`` and ``save_on`` from
    ``args`` (plus ``output_folder`` and ``outlier_model`` when saving).
    Prints row AUC / AVPR on the training data and, when ``args.save_on`` is
    truthy, writes them to
    ``<output_folder>/<outlier_model>/train_epochs_data.csv``.
    """

    # Only the feature matrix and the cell-level error flags are needed.
    # NOTE: the test split was only used for hyper-parameter selection and is
    # not loaded here.
    _, X_train, target_errors_train, _, _ = utils.load_data(
        args.data_folder,
        args.batch_size,
        is_train=True,
        is_one_hot=args.is_one_hot)

    # Best parameters from CV
    ocsvm = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    ocsvm.fit(X_train)

    # A row is labelled as an outlier when at least one of its cells is flagged
    row_labels = (target_errors_train.sum(dim=1) > 0).numpy()

    # Negated so that a larger value means "more outlying"
    row_scores = -ocsvm.score_samples(X_train)

    auc_value = auc_compute(row_labels, row_scores)
    avpr_value = avpr_compute(row_labels, row_scores)

    print(''.join(['OC-SVM Train - AUC: ', str(auc_value),
                   ', AVPR: ', str(avpr_value)]))

    if not args.save_on:
        return

    # Save results into csv; create the experiment folder if necessary
    out_dir = args.output_folder + "/" + args.outlier_model
    try:
        os.makedirs(out_dir)
    except OSError as err:
        # an already existing folder is fine, anything else is a real failure
        if err.errno != errno.EEXIST:
            raise

    summary = pd.DataFrame(data={'AUC': [auc_value], 'AVPR': [avpr_value]},
                           columns=['AUC', 'AVPR'])
    summary.index.name = "Epochs"
    summary.to_csv(out_dir + "/train_epochs_data.csv")
예제 #4
0
def row_metrics(target_errors, outlier_score_mat, weights=False):
    """Compute row-level AUC and AVPR from cell-level flags and scores.

    Args:
        target_errors: tensor of per-cell error flags, indexed ``[row, feature]``;
            a row is positive when the sum of its flags is greater than zero.
        outlier_score_mat: tensor of per-cell values with the same layout.
        weights: when truthy, ``outlier_score_mat`` is assumed to hold pi's and
            the row score is ``-sum(log(pi))``; otherwise it is assumed to
            hold ``-log p(x | ...)`` and the row score is the plain row sum.

    Returns:
        ``(auc_row, avpr_row)`` as returned by ``auc_compute`` /
        ``avpr_compute``. When no row is flagged both metrics are undefined
        and the scalar sentinel ``-10.`` is returned for each, so the result
        has the same scalar form in both cases.
    """

    target_errors = target_errors.cpu()
    outlier_score_mat = outlier_score_mat.cpu()

    target_row = (target_errors.sum(dim=1) > 0).numpy()

    if weights:  # assumes outlier_score_mat is pi's
        outlier_score_row = -outlier_score_mat.log().sum(dim=1)
    else:  # assumes outlier_score_mat is -log p(x | ... )
        outlier_score_row = outlier_score_mat.sum(dim=1)
    outlier_score_row = outlier_score_row.numpy()

    if not target_row.any():
        # No positive row: return one sentinel per metric rather than an
        # array with one entry per row.
        return -10., -10.

    auc_row = auc_compute(target_row, outlier_score_row)
    avpr_row = avpr_compute(target_row, outlier_score_row)

    return auc_row, avpr_row
예제 #5
0
def main(args):
    """Evaluate outlier detection and repair using per-feature marginals.

    Reads ``data_folder``, ``batch_size`` and ``save_on`` from ``args`` (plus
    ``output_folder`` and ``outlier_model`` when saving). The probability
    matrix returned by ``get_prob_matrix`` is turned into negative
    log-probabilities, which serve as cell scores (and, summed per row, as row
    scores). Cell / row AUC and AVPR are printed, and when ``args.save_on`` is
    truthy the summary, the per-feature metrics and the per-feature repair
    errors are written as CSV files under ``<output_folder>/<outlier_model>``.
    """

    # Dirty training split: error flags, dataset object and attribute names
    _, _, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder, args.batch_size, is_train=True)
    # Clean training split: data tensor and dataset object
    _, X_train_clean, _, dataset_obj_clean, _ = utils.load_data(
        args.data_folder, args.batch_size,
        is_train=True, is_clean=True, stdize_dirty=True)

    df_train = dataset_obj_train.df_dataset_instance

    # Warnings raised while building the probability matrix are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, dict_densities, _, repair_mat = get_prob_matrix(
            df_train, dataset_obj_train.cat_cols, n_comp_max=40)

    # Repair error, once with the error flags and once with their complement
    clean_values = X_train_clean.detach().numpy()
    mean_error_dirty, features_errors_dirty = error_computation(
        dataset_obj_clean, clean_values, repair_mat, dict_densities,
        target_errors_train.detach().numpy())
    mean_error_clean, features_errors_clean = error_computation(
        dataset_obj_clean, clean_values, repair_mat, dict_densities,
        (1-target_errors_train).detach().numpy())

    # Small constant keeps the logarithm finite for zero probabilities
    log_probs = np.log(p_mat_train + 1e-9)

    row_labels = (target_errors_train.sum(dim=1)>0).numpy()

    # Uses the NLL score as outlier score (just like VAE outlier score)
    cell_scores = -log_probs
    row_scores = -log_probs.sum(axis=1)

    ## Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train, cell_scores)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train, cell_scores)

    print("AVPR per feature")
    print(avpr_feats)
    print("AUC per feature")
    print(auc_feats)

    ## Row metrics
    auc_row_train = auc_compute(row_labels, row_scores)
    avpr_row_train = avpr_compute(row_labels, row_scores)

    print('Marginals Prob. Train - Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'.format(
        auc_cell_train, avpr_cell_train, auc_row_train, avpr_row_train))

    if not args.save_on:
        return

    # Save results into csv; create the experiment folder if necessary
    out_dir = args.output_folder + "/" + args.outlier_model
    try:
        os.makedirs(out_dir)
    except OSError as err:
        # an already existing folder is fine, anything else is a real failure
        if err.errno != errno.EEXIST:
            raise

    summary = pd.DataFrame(
        data={'AUC row': [auc_row_train], 'AVPR row': [avpr_row_train],
              'AUC cell': [auc_cell_train], 'AVPR cell': [avpr_cell_train],
              'Error repair on dirty pos': [mean_error_dirty],
              'Error repair on clean pos': [mean_error_clean]},
        columns=['AUC row','AVPR row','AUC cell','AVPR cell',
                 'Error repair on dirty pos', 'Error repair on clean pos'])
    summary.index.name = "Epochs"
    summary.to_csv(out_dir + "/train_epochs_data.csv")

    # Per-feature AVPR and AUC (cell only), one single-row table each
    for row_label, feat_values, file_name in (
            ('AVPR', avpr_feats, "/train_avpr_features.csv"),
            ('AUC', auc_feats, "/train_auc_features.csv")):
        feat_table = pd.DataFrame([], index=[row_label], columns=attributes)
        feat_table.loc[row_label] = feat_values
        feat_table.to_csv(out_dir + file_name)

    # Per-feature repair errors, one row per set of cells
    repair_table = pd.DataFrame(
        [], index=['error_repair_dirtycells','error_repair_cleancells'],
        columns=attributes)
    repair_table.loc['error_repair_dirtycells'] = features_errors_dirty
    repair_table.loc['error_repair_cleancells'] = features_errors_clean
    repair_table.to_csv(out_dir + "/train_error_repair_features.csv")
예제 #6
0
def test(args_in, model, optim_proc, data_mtxs, x_data, x_data_clean,
         admm_iter):
    """Run one ADMM pass on the test data and record detection/repair metrics.

    Args:
        args_in: run configuration. Read here: ``test_loader``, ``eps_bar_X``,
            ``eps_bar_diff_iter``, ``l1_method``, ``l21_method``,
            ``lambda_param``, ``dataset_defs`` and ``target_errors_test``.
        model: callable model; evaluated on ``data_mtxs['LD']`` in eval mode.
        optim_proc: forwarded to ``run_iteration_ADMM``.
        data_mtxs: mapping providing at least the ``'LD'`` and ``'S'`` entries.
        x_data: data forwarded to the ADMM step, ``error_detection`` and
            ``repair_phase``.
        x_data_clean: forwarded to ``repair_phase``.
        admm_iter: forwarded to ``store_metrics_iter`` (presumably the index
            of the current ADMM iteration).

    Returns:
        bool: ``True`` when ``c1 < args_in.eps_bar_X`` or
        ``c2 < args_in.eps_bar_diff_iter``, ``False`` otherwise.

    Raises:
        ValueError: if neither ``args_in.l1_method`` nor
            ``args_in.l21_method`` is truthy, because no RAE loss can then be
            selected.
    """

    converged_flag = False

    # Run ADMM iteration
    c1, c2 = run_iteration_ADMM(args_in,
                                model,
                                optim_proc,
                                x_data,
                                args_in.test_loader,
                                data_mtxs,
                                AE_train_mode=False)

    print("\nTest Data:")

    # Print convergence values (akin to losses, lower is better)
    print("\n\t\t(test) c1 value: {}".format(c1))
    print("\t\t(test) c2 value: {}\n\n".format(c2))

    # Check convergence (either criterion is sufficient)
    if c1 < args_in.eps_bar_X or c2 < args_in.eps_bar_diff_iter:
        converged_flag = True

    # Print DeepRPCA cost (is minimized by RAE -- DeepRPCA)
    model.eval()
    with torch.no_grad():
        LD_test_recon = model(data_mtxs['LD'])

    if args_in.l1_method:
        loss_RAE = loss_function_RAE_opt_l1(LD_test_recon, data_mtxs['LD'],
                                            data_mtxs['S'],
                                            args_in.lambda_param).item()
    elif args_in.l21_method:
        loss_RAE = loss_function_RAE_opt_l21(LD_test_recon, data_mtxs['LD'],
                                             data_mtxs['S'],
                                             args_in.lambda_param).item()
    else:
        # Without this branch loss_RAE would be unbound and the print below
        # would fail with an UnboundLocalError that hides the real cause.
        raise ValueError(
            "either args_in.l1_method or args_in.l21_method must be set "
            "to compute the RAE loss")

    print(
        "\n\t\t(test) Loss for RAE cost (opt via ADMM): {}; Lambda val: {} \n".
        format(loss_RAE, args_in.lambda_param))

    # Check outlier metrics
    outlier_score_cell_S, outlier_score_row_S, outlier_score_cell_recon, outlier_score_row_recon = \
        error_detection(args_in.dataset_defs, data_mtxs['S'], x_data, data_mtxs['LD'], args_in.l21_method)

    outlier_score_cell_S = outlier_score_cell_S.cpu().numpy()
    outlier_score_cell_recon = outlier_score_cell_recon.cpu().numpy()
    outlier_score_row_S = outlier_score_row_S.cpu().numpy()
    outlier_score_row_recon = outlier_score_row_recon.cpu().numpy()

    # Ground truth as uint8 arrays: per cell, and per row (any flagged cell)
    target_errors_test_cell = args_in.target_errors_test
    target_errors_test_cell = target_errors_test_cell.cpu().numpy().astype(
        np.uint8)
    target_errors_test_row = (target_errors_test_cell.sum(axis=1) > 0).astype(
        np.uint8)

    ## Cell metrics (per-feature AUC values are not kept here)
    auc_cell_test_S, _ = get_auc_metrics(target_errors_test_cell,
                                         outlier_score_cell_S)
    avpr_cell_test_S, avpr_feats_S = get_avpr_metrics(target_errors_test_cell,
                                                      outlier_score_cell_S)

    auc_cell_test_recon, _ = get_auc_metrics(target_errors_test_cell,
                                             outlier_score_cell_recon)
    avpr_cell_test_recon, avpr_feats_recon = get_avpr_metrics(
        target_errors_test_cell, outlier_score_cell_recon)

    ## Row metrics
    auc_row_test_S = auc_compute(target_errors_test_row, outlier_score_row_S)
    avpr_row_test_S = avpr_compute(target_errors_test_row, outlier_score_row_S)

    auc_row_test_recon = auc_compute(target_errors_test_row,
                                     outlier_score_row_recon)
    avpr_row_test_recon = avpr_compute(target_errors_test_row,
                                       outlier_score_row_recon)

    outlier_metrics = OrderedDict([('auc_cell_S', auc_cell_test_S),
                                   ('avpr_cell_S', avpr_cell_test_S),
                                   ('auc_cell_recon', auc_cell_test_recon),
                                   ('avpr_cell_recon', avpr_cell_test_recon),
                                   ('auc_row_S', auc_row_test_S),
                                   ('avpr_row_S', avpr_row_test_S),
                                   ('auc_row_recon', auc_row_test_recon),
                                   ('avpr_row_recon', avpr_row_test_recon),
                                   ('avpr_per_feature',
                                    [avpr_feats_S, avpr_feats_recon])])

    print(
        'Test (S) -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {} \n\n'
        .format(auc_cell_test_S, avpr_cell_test_S, auc_row_test_S,
                avpr_row_test_S))

    print(
        'Test (Recon.) -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}\n'
        .format(auc_cell_test_recon, avpr_cell_test_recon, auc_row_test_recon,
                avpr_row_test_recon))

    # Repair analysis (receives the numpy version of the cell flags)
    repair_metrics = repair_phase(args_in, model, data_mtxs, x_data,
                                  x_data_clean, target_errors_test_cell)

    store_metrics_iter('test', args_in, loss_RAE, outlier_metrics,
                       repair_metrics, c1, c2, admm_iter)

    return converged_flag
def main(args):
    """Combine calibrated OC-SVM row scores with marginals for cell detection.

    Reads ``data_folder``, ``batch_size``, ``is_one_hot`` and ``save_on`` from
    ``args`` (plus ``output_folder`` and ``outlier_model`` when saving).

    Steps:
        1. Negative log-probabilities per cell from ``get_prob_matrix``.
        2. One-Class SVM fitted on the training data; negated
           ``score_samples`` used as row outlier score for train and test.
        3. Logistic regression fitted on the test scores/labels (Platt
           scaling) and applied to the training scores.
        4. Row NLL (broadcast over columns) added to the cell NLL to obtain
           the final cell scores.

    Row and cell AUC / AVPR are printed; when ``args.save_on`` is truthy the
    summary and the per-feature metrics are written as CSV files under
    ``<output_folder>/<outlier_model>``.
    """

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder,
        args.batch_size,
        is_train=True,
        is_one_hot=args.is_one_hot)
    # Same encoding as the training split: the OC-SVM fitted on X_train is
    # also applied to X_test, so both must share one feature layout.
    test_loader, X_test, target_errors_test, _, _ = utils.load_data(
        args.data_folder,
        args.batch_size,
        is_train=False,
        is_one_hot=args.is_one_hot)

    df_data_train = dataset_obj_train.df_dataset_instance

    # Run Marginals to obtain cell log probs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, _, _, _ = get_prob_matrix(df_data_train,
                                               dataset_obj_train.cat_cols,
                                               n_comp_max=40)
    # small constant keeps the logarithm finite for zero probabilities
    nll_marginal_cell = -np.log(p_mat_train + 1e-8)

    # A row is labelled as an outlier when at least one of its cells is flagged
    target_errors_row_train = (target_errors_train.sum(dim=1) > 0)
    target_row_train = target_errors_row_train.numpy()

    target_errors_row_test = (target_errors_test.sum(dim=1) > 0)
    target_row_test = target_errors_row_test.numpy()

    # Run OCSVM row outlier detection
    clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    # negated so that a larger value means "more outlying"
    outlier_score_row_train = -clf.score_samples(X_train)
    outlier_score_row_test = -clf.score_samples(X_test)

    # Platt Scaling (uses Logistic Regression) of OCSVM scores; the
    # calibrator is fitted on the test split and applied to the train split
    lr_calib = LogisticRegression(solver='lbfgs')
    lr_calib.fit(outlier_score_row_test.reshape(-1, 1), target_row_test)
    # column 0 of predict_proba is the first class, i.e. the un-flagged rows
    p_inlier_train = lr_calib.predict_proba(
        outlier_score_row_train.reshape(-1, 1))[:, 0]
    nll_inlier_row_train = -np.log(p_inlier_train + 1e-8)  # -log (p_inlier)

    # Row metrics
    auc_row_train = auc_compute(target_row_train, outlier_score_row_train)
    avpr_row_train = avpr_compute(target_row_train, outlier_score_row_train)
    # NOTE(review): the raw OC-SVM scores passed here are not probabilities;
    # this value only serves as the pre-calibration reference -- confirm
    # log_loss accepts them in the installed scikit-learn version.
    ll_row_train = log_loss(target_row_train, outlier_score_row_train)

    auc_row_train_calibed = auc_compute(target_row_train, nll_inlier_row_train)
    avpr_row_train_calibed = avpr_compute(target_row_train,
                                          nll_inlier_row_train)
    ll_row_train_calibed = log_loss(target_row_train, 1. - p_inlier_train)

    print("AUC Prev. Calib.: {}".format(auc_row_train))
    print("AVPR Prev. Calib.: {}".format(avpr_row_train))
    print("Cross-Entropy Prev. Calib. {}".format(ll_row_train))

    # Re-check score is still good after calibration (AVPR and AUC should be same);
    # then Cross-Entropy should drop !!
    print("AUC Post. Calib.: {}".format(auc_row_train_calibed))
    print("AVPR Post. Calib.: {}".format(avpr_row_train_calibed))
    print("Cross-Entropy Post. Calib. {}".format(ll_row_train_calibed))

    # combine calibrated OCSVM and Marginals for cell outlier detection
    # (the row NLL is a column vector and broadcasts over all features)
    nll_cells_final_train = nll_inlier_row_train.reshape(-1,
                                                         1) + nll_marginal_cell

    # Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train,
                                                nll_cells_final_train)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train,
                                                   nll_cells_final_train)

    print(
        'Combined: OCSVM + Marginals Train -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'
        .format(auc_cell_train, avpr_cell_train, auc_row_train,
                avpr_row_train))

    #Save results into csv
    if args.save_on:

        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model

        try:
            os.makedirs(folder_output)
        except OSError as e:
            # an already existing folder is fine, anything else is re-raised
            if e.errno != errno.EEXIST:
                raise

        columns = ['AUC row', 'AVPR row', 'AUC cell', 'AVPR cell']
        results = {
            'AUC row': [auc_row_train],
            'AVPR row': [avpr_row_train],
            'AUC cell': [auc_cell_train],
            'AVPR cell': [avpr_cell_train]
        }

        #Dataframe
        df_out = pd.DataFrame(data=results, columns=columns)
        df_out.index.name = "Epochs"
        df_out.to_csv(folder_output + "/train_epochs_data.csv")

        # store AVPR for features (cell only)
        df_avpr_feat_cell = pd.DataFrame([],
                                         index=['AVPR'],
                                         columns=attributes)
        df_avpr_feat_cell.loc['AVPR'] = avpr_feats
        df_avpr_feat_cell.to_csv(folder_output + "/train_avpr_features.csv")

        # store AUC for features (cell only)
        df_auc_feat_cell = pd.DataFrame([], index=['AUC'], columns=attributes)
        df_auc_feat_cell.loc['AUC'] = auc_feats
        df_auc_feat_cell.to_csv(folder_output + "/train_auc_features.csv")