def evaluation_phase(args_in, model, data_mtxs, x_data, x_data_clean,
                     target_errors_cell, data_loader, iterations=1):
    """Compute outlier-detection and repair metrics for the current model.

    The model is switched to eval mode, cell/row outlier scores are obtained
    from ``error_detection`` and compared against the ground-truth error mask,
    and ``repair_phase`` is then run with the same inputs.

    Args:
        args_in: run configuration; ``dataset_defs`` and ``l21_method`` are
            read here and the whole object is forwarded to ``repair_phase``.
        model: network being evaluated (put in eval mode).
        data_mtxs: mapping that holds at least the ``'S'`` and ``'LD'`` entries.
        x_data: data handed to ``error_detection`` and ``repair_phase``.
        x_data_clean: clean data, only forwarded to ``repair_phase``.
        target_errors_cell: tensor mask of erroneous cells; presumably shaped
            (rows, features) -- TODO confirm against the caller.
        data_loader: not used inside this function.
        iterations: not used inside this function.

    Returns:
        ``(outlier_metrics, repair_metrics)``: an ``OrderedDict`` with the raw
        scores plus AUC / AVPR values, and whatever ``repair_phase`` returns.
    """
    model.eval()

    # Outlier scores: two cell-level and two row-level variants ("S" / "recon")
    detection_out = error_detection(args_in.dataset_defs, data_mtxs['S'],
                                    x_data, data_mtxs['LD'],
                                    args_in.l21_method)
    score_cell_S, score_row_S, score_cell_recon, score_row_recon = (
        score.cpu().numpy() for score in detection_out)

    # Ground truth as uint8 arrays; a row is an outlier if any cell is dirty
    target_errors_cell = target_errors_cell.cpu().numpy().astype(np.uint8)
    target_errors_row = (target_errors_cell.sum(axis=1) > 0).astype(np.uint8)

    ## Cell metrics
    auc_cell_S, auc_feats_S = get_auc_metrics(target_errors_cell, score_cell_S)
    avpr_cell_S, avpr_feats_S = get_avpr_metrics(target_errors_cell,
                                                 score_cell_S)
    auc_cell_recon, auc_feats_recon = get_auc_metrics(target_errors_cell,
                                                      score_cell_recon)
    avpr_cell_recon, avpr_feats_recon = get_avpr_metrics(target_errors_cell,
                                                         score_cell_recon)

    ## Row metrics
    auc_row_S = auc_compute(target_errors_row, score_row_S)
    avpr_row_S = avpr_compute(target_errors_row, score_row_S)
    auc_row_recon = auc_compute(target_errors_row, score_row_recon)
    avpr_row_recon = avpr_compute(target_errors_row, score_row_recon)

    outlier_metrics = OrderedDict()
    outlier_metrics['score_cell_S'] = score_cell_S
    outlier_metrics['score_row_S'] = score_row_S
    outlier_metrics['score_cell_recon'] = score_cell_recon
    outlier_metrics['score_row_recon'] = score_row_recon
    outlier_metrics['auc_cell_S'] = auc_cell_S
    outlier_metrics['avpr_cell_S'] = avpr_cell_S
    outlier_metrics['auc_cell_recon'] = auc_cell_recon
    outlier_metrics['avpr_cell_recon'] = avpr_cell_recon
    outlier_metrics['auc_row_S'] = auc_row_S
    outlier_metrics['avpr_row_S'] = avpr_row_S
    outlier_metrics['auc_row_recon'] = auc_row_recon
    outlier_metrics['avpr_row_recon'] = avpr_row_recon
    outlier_metrics['avpr_per_feature'] = [avpr_feats_S, avpr_feats_recon]
    outlier_metrics['auc_per_feature'] = [auc_feats_S, auc_feats_recon]

    # Repair analysis (receives the numpy version of the cell error mask)
    repair_metrics = repair_phase(args_in, model, data_mtxs, x_data,
                                  x_data_clean, target_errors_cell)

    return outlier_metrics, repair_metrics
def get_avpr_metrics(target_matrix, score_matrix):
    """Return the macro-averaged AVPR and the per-feature AVPR values.

    Each column of ``target_matrix`` is scored against the matching column of
    ``score_matrix`` with ``avpr_compute``. Columns without any positive
    target are marked with the sentinel ``-10.`` and left out of the average.

    Args:
        target_matrix: 2-D mask of ground-truth errors, one column per feature.
        score_matrix: 2-D outlier scores, indexed like ``target_matrix``.

    Returns:
        ``(macro_avpr, avpr_feats)``: the mean over the non-negative entries
        of ``avpr_feats`` (NaN when every column got the sentinel), and a
        float array with one entry per feature.
    """
    n_feats = target_matrix.shape[1]

    # Start from the "no positives" sentinel and overwrite scorable columns
    avpr_feats = np.full(n_feats, -10.)
    for col in range(n_feats):
        col_targets = target_matrix[:, col]
        if col_targets.any():
            avpr_feats[col] = avpr_compute(col_targets, score_matrix[:, col])

    # macro average of avpr through feature set (sentinel entries excluded)
    scorable = avpr_feats >= 0
    macro_avpr = avpr_feats[scorable].mean()

    return macro_avpr, avpr_feats
def main(args):
    """Run the One-Class SVM baseline on the training split.

    Fits an RBF One-Class SVM on the training features, uses the negated
    ``score_samples`` output as row outlier score, prints row-level AUC and
    AVPR, and optionally writes them to ``train_epochs_data.csv``.

    Args:
        args: parsed options; reads ``data_folder``, ``batch_size``,
            ``is_one_hot``, ``save_on``, ``output_folder`` and
            ``outlier_model``.
    """
    # Load datasets (only the features and the cell error mask are needed)
    loaded = utils.load_data(args.data_folder, args.batch_size,
                             is_train=True, is_one_hot=args.is_one_hot)
    _, X_train, target_errors_train, _, _ = loaded
    # NOTE: the test split was only used in hyper-parameter selection

    # Best parameters from CV
    ocsvm = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    ocsvm.fit(X_train)

    # A row counts as an outlier when at least one of its cells is dirty
    target_row = (target_errors_train.sum(dim=1) > 0).numpy()
    # Negated so that larger values mean "more outlying"
    outlier_score_row = -ocsvm.score_samples(X_train)

    auc_row = auc_compute(target_row, outlier_score_row)
    avpr_row = avpr_compute(target_row, outlier_score_row)
    print('OC-SVM Train - AUC: ' + str(auc_row) + ', AVPR: ' + str(avpr_row))

    # Save results into csv
    if not args.save_on:
        return

    # create folder for saving experiment data (if necessary)
    folder_output = args.output_folder + "/" + args.outlier_model
    try:
        os.makedirs(folder_output)
    except OSError as err:
        # an already existing folder is fine, anything else is a real error
        if err.errno != errno.EEXIST:
            raise

    # Dataframe
    df_out = pd.DataFrame(data={'AUC': [auc_row], 'AVPR': [avpr_row]},
                          columns=['AUC', 'AVPR'])
    df_out.index.name = "Epochs"
    df_out.to_csv(folder_output + "/train_epochs_data.csv")
def row_metrics(target_errors, outlier_score_mat, weights=False):
    """Compute row-level AUC and AVPR from cell-level scores.

    Args:
        target_errors: tensor mask of erroneous cells; a row is labelled as
            an outlier when the sum over ``dim=1`` is positive.
        outlier_score_mat: tensor of per-cell values, reduced over ``dim=1``.
        weights: if True, ``outlier_score_mat`` is assumed to hold the pi's
            and the row score is ``-sum(log(pi))``; otherwise it is assumed
            to hold ``-log p(x | ...)`` and the row score is the plain sum.

    Returns:
        ``(auc_row, avpr_row)`` as returned by ``auc_compute`` and
        ``avpr_compute``. When no row is labelled as an outlier, both are
        arrays shaped like the row labels and filled with ``-10.``.
    """
    cell_targets = target_errors.cpu()
    cell_scores = outlier_score_mat.cpu()

    if weights:
        # assumes outlier_score_mat is pi's
        row_scores = -cell_scores.log().sum(dim=1)
    else:
        # assumes outlier_score_mat is -log p(x | ... )
        row_scores = cell_scores.sum(dim=1)

    target_row = (cell_targets.sum(dim=1) > 0).numpy()
    outlier_score_row = row_scores.numpy()

    # Without any positive row the metrics are undefined: return sentinels
    if not target_row.any():
        return np.full(target_row.shape, -10.), np.full(target_row.shape, -10.)

    auc_row = auc_compute(target_row, outlier_score_row)
    avpr_row = avpr_compute(target_row, outlier_score_row)
    return auc_row, avpr_row
def main(args):
    """Run the marginals baseline on the training split.

    Obtains per-cell probabilities and repair values from ``get_prob_matrix``,
    evaluates repair error on dirty and on clean cells, uses the negative log
    probability as outlier score for cell and row metrics, prints the results
    and optionally stores them as CSV files.

    Args:
        args: parsed options; reads ``data_folder``, ``batch_size``,
            ``save_on``, ``output_folder`` and ``outlier_model``.
    """
    # Load datasets (dirty version, then the clean counterpart)
    _, _, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder, args.batch_size, is_train=True)
    _, X_train_clean, _, dataset_obj_clean, _ = utils.load_data(
        args.data_folder, args.batch_size, is_train=True, is_clean=True,
        stdize_dirty=True)

    dataset_obj = dataset_obj_train
    df_data_train = dataset_obj_train.df_dataset_instance

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, dict_densities, _, repair_mat = get_prob_matrix(
            df_data_train, dataset_obj.cat_cols, n_comp_max=40)

    # Repair error, first restricted to dirty cells and then to clean cells
    clean_values = X_train_clean.detach().numpy()
    dirty_mask = target_errors_train.detach().numpy()
    clean_mask = (1 - target_errors_train).detach().numpy()
    mean_error_dirty, features_errors_dirty = error_computation(
        dataset_obj_clean, clean_values, repair_mat, dict_densities,
        dirty_mask)
    mean_error_clean, features_errors_clean = error_computation(
        dataset_obj_clean, clean_values, repair_mat, dict_densities,
        clean_mask)

    # Small constant keeps the log finite for zero probabilities
    logp_mat_train = np.log(p_mat_train + 1e-9)
    target_row_train = (target_errors_train.sum(dim=1) > 0).numpy()

    # Uses the NLL score as outlier score (just like VAE outlier score)
    outlier_score_cell_train = -logp_mat_train
    outlier_score_row_train = -logp_mat_train.sum(axis=1)

    ## Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train,
                                                outlier_score_cell_train)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train,
                                                   outlier_score_cell_train)
    print("AVPR per feature")
    print(avpr_feats)
    print("AUC per feature")
    print(auc_feats)

    ## Row metrics
    auc_row_train = auc_compute(target_row_train, outlier_score_row_train)
    avpr_row_train = avpr_compute(target_row_train, outlier_score_row_train)

    summary = 'Marginals Prob. Train - Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'
    print(summary.format(auc_cell_train, avpr_cell_train,
                         auc_row_train, avpr_row_train))

    # Save results into csv
    if not args.save_on:
        return

    # create folder for saving experiment data (if necessary)
    folder_output = args.output_folder + "/" + args.outlier_model
    try:
        os.makedirs(folder_output)
    except OSError as err:
        # an already existing folder is fine, anything else is a real error
        if err.errno != errno.EEXIST:
            raise

    columns = ['AUC row', 'AVPR row', 'AUC cell', 'AVPR cell',
               'Error repair on dirty pos', 'Error repair on clean pos']
    values = [auc_row_train, avpr_row_train, auc_cell_train, avpr_cell_train,
              mean_error_dirty, mean_error_clean]
    results = {name: [value] for name, value in zip(columns, values)}

    # Dataframe
    df_out = pd.DataFrame(data=results, columns=columns)
    df_out.index.name = "Epochs"
    df_out.to_csv(folder_output + "/train_epochs_data.csv")

    # store AVPR and AUC for features (cell only), one file per metric
    per_feature = (('AVPR', avpr_feats, "/train_avpr_features.csv"),
                   ('AUC', auc_feats, "/train_auc_features.csv"))
    for row_label, feat_values, file_name in per_feature:
        df_feat = pd.DataFrame([], index=[row_label], columns=attributes)
        df_feat.loc[row_label] = feat_values
        df_feat.to_csv(folder_output + file_name)

    # store per-feature repair errors on dirty and on clean cells
    df_errors_repair = pd.DataFrame(
        [], index=['error_repair_dirtycells', 'error_repair_cleancells'],
        columns=attributes)
    df_errors_repair.loc['error_repair_dirtycells'] = features_errors_dirty
    df_errors_repair.loc['error_repair_cleancells'] = features_errors_clean
    df_errors_repair.to_csv(folder_output + "/train_error_repair_features.csv")
def test(args_in, model, optim_proc, data_mtxs, x_data, x_data_clean, admm_iter):
    """Run one ADMM iteration on the test data and report/store its metrics.

    Steps: run ``run_iteration_ADMM`` with ``AE_train_mode=False``, print the
    convergence values, compute the RAE cost for the configured norm, compute
    cell/row outlier metrics against ``args_in.target_errors_test``, run
    ``repair_phase`` and hand everything to ``store_metrics_iter``.

    Args:
        args_in: run configuration; reads ``test_loader``, ``eps_bar_X``,
            ``eps_bar_diff_iter``, ``l1_method``, ``l21_method``,
            ``lambda_param``, ``dataset_defs`` and ``target_errors_test``.
        model: autoencoder being evaluated (put in eval mode here).
        optim_proc: optimizer object forwarded to ``run_iteration_ADMM``.
        data_mtxs: mapping holding at least the ``'LD'`` and ``'S'`` entries.
        x_data: test data forwarded to the ADMM step, ``error_detection`` and
            ``repair_phase``.
        x_data_clean: clean test data, only forwarded to ``repair_phase``.
        admm_iter: current ADMM iteration index, forwarded to
            ``store_metrics_iter``.

    Returns:
        bool: True when ``c1 < args_in.eps_bar_X`` or
        ``c2 < args_in.eps_bar_diff_iter``.

    Raises:
        ValueError: if neither ``args_in.l1_method`` nor
            ``args_in.l21_method`` is set, since no RAE cost can be computed.
    """
    converged_flag = False

    # Run ADMM iteration
    c1, c2 = run_iteration_ADMM(args_in, model, optim_proc, x_data,
                                args_in.test_loader, data_mtxs,
                                AE_train_mode=False)

    print("\nTest Data:")

    # Print convergence values (akin to losses, lower is better)
    print("\n\t\t(test) c1 value: {}".format(c1))
    print("\t\t(test) c2 value: {}\n\n".format(c2))

    # Check convergence (either criterion is sufficient)
    if c1 < args_in.eps_bar_X or c2 < args_in.eps_bar_diff_iter:
        converged_flag = True

    # Print DeepRPCA cost (is minimized by RAE -- DeepRPCA)
    model.eval()
    with torch.no_grad():
        LD_test_recon = model(data_mtxs['LD'])

        if args_in.l1_method:
            loss_RAE = loss_function_RAE_opt_l1(
                LD_test_recon, data_mtxs['LD'], data_mtxs['S'],
                args_in.lambda_param).item()
        elif args_in.l21_method:
            loss_RAE = loss_function_RAE_opt_l21(
                LD_test_recon, data_mtxs['LD'], data_mtxs['S'],
                args_in.lambda_param).item()
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on loss_RAE; fail with a clear message.
            raise ValueError(
                "Cannot compute the RAE cost: neither args_in.l1_method nor "
                "args_in.l21_method is enabled.")

    print(
        "\n\t\t(test) Loss for RAE cost (opt via ADMM): {}; Lambda val: {} \n".
        format(loss_RAE, args_in.lambda_param))

    # Check outlier metrics
    outlier_score_cell_S, outlier_score_row_S, outlier_score_cell_recon, outlier_score_row_recon = \
        error_detection(args_in.dataset_defs, data_mtxs['S'], x_data,
                        data_mtxs['LD'], args_in.l21_method)

    outlier_score_cell_S = outlier_score_cell_S.cpu().numpy()
    outlier_score_cell_recon = outlier_score_cell_recon.cpu().numpy()
    outlier_score_row_S = outlier_score_row_S.cpu().numpy()
    outlier_score_row_recon = outlier_score_row_recon.cpu().numpy()

    # Ground truth as uint8 arrays; a row is an outlier if any cell is dirty
    target_errors_test_cell = args_in.target_errors_test
    target_errors_test_cell = target_errors_test_cell.cpu().numpy().astype(
        np.uint8)
    target_errors_test_row = (target_errors_test_cell.sum(axis=1) > 0).astype(
        np.uint8)

    ## Cell metrics
    auc_cell_test_S, _ = get_auc_metrics(target_errors_test_cell,
                                         outlier_score_cell_S)
    avpr_cell_test_S, avpr_feats_S = get_avpr_metrics(target_errors_test_cell,
                                                      outlier_score_cell_S)

    auc_cell_test_recon, _ = get_auc_metrics(target_errors_test_cell,
                                             outlier_score_cell_recon)
    avpr_cell_test_recon, avpr_feats_recon = get_avpr_metrics(
        target_errors_test_cell, outlier_score_cell_recon)

    ## Row metrics
    auc_row_test_S = auc_compute(target_errors_test_row, outlier_score_row_S)
    avpr_row_test_S = avpr_compute(target_errors_test_row, outlier_score_row_S)

    auc_row_test_recon = auc_compute(target_errors_test_row,
                                     outlier_score_row_recon)
    avpr_row_test_recon = avpr_compute(target_errors_test_row,
                                       outlier_score_row_recon)

    outlier_metrics = OrderedDict([
        ('auc_cell_S', auc_cell_test_S),
        ('avpr_cell_S', avpr_cell_test_S),
        ('auc_cell_recon', auc_cell_test_recon),
        ('avpr_cell_recon', avpr_cell_test_recon),
        ('auc_row_S', auc_row_test_S),
        ('avpr_row_S', avpr_row_test_S),
        ('auc_row_recon', auc_row_test_recon),
        ('avpr_row_recon', avpr_row_test_recon),
        ('avpr_per_feature', [avpr_feats_S, avpr_feats_recon]),
    ])

    print(
        'Test (S) -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {} \n\n'
        .format(auc_cell_test_S, avpr_cell_test_S, auc_row_test_S,
                avpr_row_test_S))

    print(
        'Test (Recon.) -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}\n'
        .format(auc_cell_test_recon, avpr_cell_test_recon, auc_row_test_recon,
                avpr_row_test_recon))

    # Repair analysis
    repair_metrics = repair_phase(args_in, model, data_mtxs, x_data,
                                  x_data_clean, target_errors_test_cell)

    store_metrics_iter('test', args_in, loss_RAE, outlier_metrics,
                       repair_metrics, c1, c2, admm_iter)

    return converged_flag
def main(args):
    """Run the combined OC-SVM + marginals baseline for cell outlier detection.

    Per-cell negative log probabilities from ``get_prob_matrix`` are added to
    a per-row negative log inlier probability, obtained by Platt-scaling the
    One-Class SVM row scores. Row and cell AUC / AVPR on the training split
    are printed and optionally written to CSV files.

    Args:
        args: parsed options; reads ``data_folder``, ``batch_size``,
            ``is_one_hot``, ``save_on``, ``output_folder`` and
            ``outlier_model``.
    """
    # Load datasets. Both splits must use the same feature encoding because
    # the OC-SVM fitted on X_train is also used to score X_test.
    _, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder,
        args.batch_size,
        is_train=True,
        is_one_hot=args.is_one_hot)
    _, X_test, target_errors_test, _, _ = utils.load_data(
        args.data_folder,
        args.batch_size,
        is_train=False,
        is_one_hot=args.is_one_hot)

    df_data_train = dataset_obj_train.df_dataset_instance

    # Run Marginals to obtain cell log probs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, _, _, _ = get_prob_matrix(df_data_train,
                                               dataset_obj_train.cat_cols,
                                               n_comp_max=40)
    # Small constant keeps the log finite for zero probabilities
    nll_marginal_cell = -np.log(p_mat_train + 1e-8)

    # A row counts as an outlier when at least one of its cells is dirty
    target_row_train = (target_errors_train.sum(dim=1) > 0).numpy()
    target_row_test = (target_errors_test.sum(dim=1) > 0).numpy()

    # Run OCSVM row outlier detection (negated: larger = more outlying)
    clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    clf.fit(X_train)
    outlier_score_row_train = -clf.score_samples(X_train)
    outlier_score_row_test = -clf.score_samples(X_test)

    # Platt Scaling (uses Logistic Regression) of OCSVM scores; the calibrator
    # is fitted on the test scores/labels and applied to the train scores.
    lr_calib = LogisticRegression(solver='lbfgs')
    lr_calib.fit(outlier_score_row_test.reshape(-1, 1), target_row_test)
    # Column 0 is the first entry of lr_calib.classes_, i.e. the
    # "not an outlier" label when both labels occur in target_row_test.
    p_inlier_train = lr_calib.predict_proba(
        outlier_score_row_train.reshape(-1, 1))[:, 0]
    nll_inlier_row_train = -np.log(p_inlier_train + 1e-8)  # -log (p_inlier)

    # Row metrics
    auc_row_train = auc_compute(target_row_train, outlier_score_row_train)
    avpr_row_train = avpr_compute(target_row_train, outlier_score_row_train)
    ll_row_train = log_loss(target_row_train, outlier_score_row_train)

    auc_row_train_calibed = auc_compute(target_row_train, nll_inlier_row_train)
    avpr_row_train_calibed = avpr_compute(target_row_train,
                                          nll_inlier_row_train)
    ll_row_train_calibed = log_loss(target_row_train, 1. - p_inlier_train)

    print("AUC Prev. Calib.: {}".format(auc_row_train))
    print("AVPR Prev. Calib.: {}".format(avpr_row_train))
    print("Cross-Entropy Prev. Calib. {}".format(ll_row_train))

    # Re-check score is still good after calibration (AVPR and AUC should be same);
    # then Cross-Entropy should drop !!
    print("AUC Post. Calib.: {}".format(auc_row_train_calibed))
    print("AVPR Post. Calib.: {}".format(avpr_row_train_calibed))
    print("Cross-Entropy Post. Calib. {}".format(ll_row_train_calibed))

    # combine calibrated OCSVM and Marginals for cell outlier detection
    # (the row term is broadcast over all cells of the row)
    nll_cells_final_train = nll_inlier_row_train.reshape(-1, 1) + nll_marginal_cell

    # Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train,
                                                nll_cells_final_train)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train,
                                                   nll_cells_final_train)

    print(
        'Combined: OCSVM + Marginals Train -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'
        .format(auc_cell_train, avpr_cell_train, auc_row_train,
                avpr_row_train))

    # Save results into csv
    if args.save_on:

        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model
        os.makedirs(folder_output, exist_ok=True)

        columns = ['AUC row', 'AVPR row', 'AUC cell', 'AVPR cell']
        results = {
            'AUC row': [auc_row_train],
            'AVPR row': [avpr_row_train],
            'AUC cell': [auc_cell_train],
            'AVPR cell': [avpr_cell_train]
        }

        # Dataframe
        df_out = pd.DataFrame(data=results, columns=columns)
        df_out.index.name = "Epochs"
        df_out.to_csv(folder_output + "/train_epochs_data.csv")

        # store AVPR for features (cell only)
        df_avpr_feat_cell = pd.DataFrame([], index=['AVPR'], columns=attributes)
        df_avpr_feat_cell.loc['AVPR'] = avpr_feats
        df_avpr_feat_cell.to_csv(folder_output + "/train_avpr_features.csv")

        # store AUC for features (cell only)
        df_auc_feat_cell = pd.DataFrame([], index=['AUC'], columns=attributes)
        df_auc_feat_cell.loc['AUC'] = auc_feats
        df_auc_feat_cell.to_csv(folder_output + "/train_auc_features.csv")