def _get_dataset_obj(self):
    self.train_loader, \
        self.X_train, \
        self.target_errors_train, \
        dataset_obj, \
        self.attributes = utils.load_data(self.args.data_path, self.args.batch_size,
                                          is_train=True, get_data_idxs=False)

    self.test_loader, self.X_test, self.target_errors_test, _, _ = utils.load_data(
        self.args.data_path, self.args.batch_size, is_train=False
    )

    # -- clean versions for evaluation
    _, self.X_train_clean, _, _, _ = utils.load_data(
        self.args.data_path, self.args.batch_size, is_train=True,
        is_clean=True, stdize_dirty=True
    )

    _, self.X_test_clean, _, _, _ = utils.load_data(
        self.args.data_path, self.args.batch_size, is_train=False,
        is_clean=True, stdize_dirty=True
    )

    return dataset_obj
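# A hedged illustration of what `stdize_dirty=True` means above (the RAE/CondPred
# runners below state it explicitly: the clean version is "standardized according
# to the dirty data statistics"). Toy arrays are illustrative only.
import numpy as np

X_dirty = np.array([[1.0], [2.0], [100.0]])   # an outlier inflates mean/std
X_clean = np.array([[1.0], [2.0], [3.0]])

mu, sd = X_dirty.mean(axis=0), X_dirty.std(axis=0)
X_clean_std = (X_clean - mu) / sd             # clean data on the dirty data's scale,
                                              # so repair errors are comparable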
def main(args):

    # Load datasets
    _, X_train, target_errors_train, _, _ = utils.load_data(args.data_folder, args.batch_size,
                                                            is_train=True, is_one_hot=args.is_one_hot)

    # _, X_test, target_errors_test, _, _ = utils.load_data(folder_path, args.batch_size, is_train=False)
    # NOTE: used in hyper-parameter selection

    # Best parameters from CV
    clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    target_row = (target_errors_train.sum(dim=1) > 0).numpy()
    outlier_score_row = -clf.score_samples(X_train)

    auc_row = auc_compute(target_row, outlier_score_row)
    avpr_row = avpr_compute(target_row, outlier_score_row)

    print('OC-SVM Train - AUC: ' + str(auc_row) + ', AVPR: ' + str(avpr_row))

    # Save results into csv
    if args.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model
        try:
            os.makedirs(folder_output)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        columns = ['AUC', 'AVPR']
        results = {'AUC': [auc_row], 'AVPR': [avpr_row]}

        # Dataframe
        df_out = pd.DataFrame(data=results, columns=columns)
        df_out.index.name = "Epochs"
        df_out.to_csv(folder_output + "/train_epochs_data.csv")
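# Hedged sketch of the sign convention used above: sklearn's
# OneClassSVM.score_samples returns larger values for inliers, so negating it
# gives an outlier score where larger = more anomalous. Toy data below is
# illustrative, not from this repo.
import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
X_toy = np.vstack([rng.randn(100, 2), rng.randn(5, 2) * 6.0])  # 100 inliers + 5 scattered rows

clf_toy = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
clf_toy.fit(X_toy)
outlier_score_toy = -clf_toy.score_samples(X_toy)  # larger = more anomalous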
def main(args_in):

    #### MAIN ####

    # saving data of experiment to folder is on
    if args_in.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args_in.output_folder + "/"
        try:
            os.makedirs(folder_output)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # structs for saving data
    losses_save = {"train": {}, "test": {}}

    # dtype definitions for running
    if args_in.cuda_on:
        dtype_float = torch.cuda.FloatTensor
        dtype_byte = torch.cuda.ByteTensor
    else:
        dtype_float = torch.FloatTensor
        dtype_byte = torch.ByteTensor

    print(args_in)

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=True, get_data_idxs=True)
    args_in.dataset_defs = dataset_obj_train
    X_train = X_train.type(dtype_float)

    test_loader, X_test, target_errors_test, dataset_obj_test, _ = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=False)
    X_test = X_test.type(dtype_float)

    # -- clean versions for data repair evaluation (standardized according to the dirty data statistics)
    train_loader_clean, X_train_clean, _, dataset_obj_clean, _ = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=True, is_clean=True, stdize_dirty=True)
    X_train_clean = X_train_clean.type(dtype_float)

    test_loader_clean, X_test_clean, _, _, _ = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=False, is_clean=True, stdize_dirty=True)
    X_test_clean = X_test_clean.type(dtype_float)

    ### Run CondPred Model ###
    model = CondPred(args_in.dataset_defs, args_in)

    if args_in.cuda_on:
        model.cuda()

    # define optimizers for each cond pred model
    optimizer_dict = OrderedDict()
    for col_name, col_type, col_size in args_in.dataset_defs.feat_info:
        if args_in.base_type == 'linear':
            optimizer_dict[col_name] = optim.SGD(model.cond_models[col_name].parameters(),
                                                 lr=args_in.lr,
                                                 weight_decay=args_in.l2_reg,
                                                 nesterov=args_in.nest_mom,  # default: False
                                                 momentum=args_in.mom_val)
        else:
            optimizer_dict[col_name] = optim.Adam(model.cond_models[col_name].parameters(),
                                                  lr=args_in.lr,
                                                  weight_decay=args_in.l2_reg)

    # Run epochs
    for epoch in range(1, args_in.number_epochs + 1):
        training_phase(args_in, model, optimizer_dict, train_loader, epoch)

        # Train set evaluation
        evaluation_phase(args_in, model, X_train, X_train_clean, target_errors_train,
                         losses_save, epoch, mode='train')

        # Test set evaluation
        evaluation_phase(args_in, model, X_test, X_test_clean, target_errors_test,
                         losses_save, epoch, mode='test')

    if args_in.save_on:

        ### Train Data
        outlier_metrics_train, repair_metrics_train, outlier_scores_train = \
            evaluation_phase(args_in, model, X_train, X_train_clean, target_errors_train,
                             [], -1, mode='train')

        # (outlier_score_cells_train, outlier_score_rows_train)
        store_metrics_final('train', outlier_scores_train, args_in.dataset_defs, attributes,
                            outlier_metrics_train, repair_metrics_train, target_errors_train,
                            losses_save, folder_output)

        ### Test Data
        outlier_metrics_test, repair_metrics_test, outlier_scores_test = \
            evaluation_phase(args_in, model, X_test, X_test_clean, target_errors_test,
                             [], -1, mode='test')

        store_metrics_final('test', outlier_scores_test, args_in.dataset_defs, attributes,
                            outlier_metrics_test, repair_metrics_test, target_errors_test,
                            losses_save, folder_output)

        # save model parameters
        model.cpu()
        torch.save(model.state_dict(), folder_output + "/model_params.pth")

        # remove non-serializable stuff
        del args_in.dataset_defs

        # save to .json file the args that were used for running the model
        with open(folder_output + "/args_run.json", "w") as outfile:
            json.dump(vars(args_in), outfile, indent=4, sort_keys=True)
def main(args):

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder, args.batch_size, is_train=True)

    train_loader_clean, X_train_clean, _, dataset_obj_clean, _ = utils.load_data(
        args.data_folder, args.batch_size, is_train=True, is_clean=True, stdize_dirty=True)

    dataset_obj = dataset_obj_train
    df_data_train = dataset_obj_train.df_dataset_instance

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, dict_densities, _, repair_mat = get_prob_matrix(
            df_data_train, dataset_obj.cat_cols, n_comp_max=40)

    mean_error_dirty, features_errors_dirty = error_computation(
        dataset_obj_clean, X_train_clean.detach().numpy(), repair_mat, dict_densities,
        target_errors_train.detach().numpy())

    mean_error_clean, features_errors_clean = error_computation(
        dataset_obj_clean, X_train_clean.detach().numpy(), repair_mat, dict_densities,
        (1 - target_errors_train).detach().numpy())

    # print(features_errors)

    logp_mat_train = np.log(p_mat_train + 1e-9)

    target_row_train = (target_errors_train.sum(dim=1) > 0).numpy()

    # Uses the NLL score as outlier score (just like VAE outlier score)
    outlier_score_cell_train = -logp_mat_train
    outlier_score_row_train = -logp_mat_train.sum(axis=1)

    ## Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train, outlier_score_cell_train)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train, outlier_score_cell_train)

    print("AVPR per feature")
    print(avpr_feats)
    print("AUC per feature")
    print(auc_feats)

    ## Row metrics
    auc_row_train = auc_compute(target_row_train, outlier_score_row_train)
    avpr_row_train = avpr_compute(target_row_train, outlier_score_row_train)

    print('Marginals Prob. Train - Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'.format(
        auc_cell_train, avpr_cell_train, auc_row_train, avpr_row_train))

    # Save results into csv
    if args.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model
        try:
            os.makedirs(folder_output)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        columns = ['AUC row', 'AVPR row', 'AUC cell', 'AVPR cell',
                   'Error repair on dirty pos', 'Error repair on clean pos']
        results = {'AUC row': [auc_row_train], 'AVPR row': [avpr_row_train],
                   'AUC cell': [auc_cell_train], 'AVPR cell': [avpr_cell_train],
                   'Error repair on dirty pos': [mean_error_dirty],
                   'Error repair on clean pos': [mean_error_clean]}

        # Dataframe
        df_out = pd.DataFrame(data=results, columns=columns)
        df_out.index.name = "Epochs"
        df_out.to_csv(folder_output + "/train_epochs_data.csv")

        # store AVPR for features (cell only)
        df_avpr_feat_cell = pd.DataFrame([], index=['AVPR'], columns=attributes)
        df_avpr_feat_cell.loc['AVPR'] = avpr_feats
        df_avpr_feat_cell.to_csv(folder_output + "/train_avpr_features.csv")

        # store AUC for features (cell only)
        df_auc_feat_cell = pd.DataFrame([], index=['AUC'], columns=attributes)
        df_auc_feat_cell.loc['AUC'] = auc_feats
        df_auc_feat_cell.to_csv(folder_output + "/train_auc_features.csv")

        df_errors_repair = pd.DataFrame([], index=['error_repair_dirtycells', 'error_repair_cleancells'],
                                        columns=attributes)
        df_errors_repair.loc['error_repair_dirtycells'] = features_errors_dirty
        df_errors_repair.loc['error_repair_cleancells'] = features_errors_clean
        df_errors_repair.to_csv(folder_output + "/train_error_repair_features.csv")
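# Hedged sketch of the marginal NLL scoring used above: given per-cell
# marginal probabilities, the cell score is -log p and the row score is the
# sum of cell scores. The p_mat values here are made up for illustration.
import numpy as np

p_mat_toy = np.array([[0.90, 0.80],
                      [0.01, 0.95]])                 # row 1 has one very unlikely cell
outlier_score_cell_toy = -np.log(p_mat_toy + 1e-9)   # larger = more anomalous cell
outlier_score_row_toy = outlier_score_cell_toy.sum(axis=1)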
def get_dataset(data_pars, task_type="train"):
    """
    :param data_pars: dict holding a nested "data_pars" dict with 'data_path',
                      'batch_size' and (optionally) 'clean' entries
    :param task_type: one of 'train', 'test', 'predict', 'pred_encode', 'pred_decode'
    :return: loaders / tensors for the requested split
    """
    clean = data_pars["data_pars"].get('clean', True)
    data_path = data_pars["data_pars"]["data_path"]
    batch_size = data_pars["data_pars"]["batch_size"]

    if task_type == 'pred_encode':
        train_loader, X_train, target_errors_train, dataset_obj, attributes = utils.load_data(
            data_path, batch_size, is_train=True, get_data_idxs=False)
        return X_train

    elif task_type == 'pred_decode':
        train_loader, X_train, target_errors_train, dataset_obj, attributes = utils.load_data(
            data_path, batch_size, is_train=True, get_data_idxs=False)
        return target_errors_train

    if not clean:
        if task_type == 'train':
            train_loader, X_train, target_errors_train, dataset_obj, attributes = utils.load_data(
                data_path, batch_size, is_train=True, get_data_idxs=False)
            return train_loader, X_train, target_errors_train, dataset_obj, attributes

        elif task_type == 'test':
            test_loader, X_test, target_errors_test, _, _ = utils.load_data(
                data_path, batch_size, is_train=False)
            return test_loader, X_test, target_errors_test

        elif task_type == 'predict':
            train_loader, _, _, _, _ = utils.load_data(data_path, batch_size,
                                                       is_train=True, get_data_idxs=False)
            return train_loader

    # -- clean versions for evaluation
    else:
        if task_type == 'train':
            _, X_train_clean, _, _, _ = utils.load_data(
                data_path, batch_size, is_train=True, is_clean=True, stdize_dirty=True)
            return X_train_clean

        elif task_type == 'test':
            _, X_test_clean, _, _, _ = utils.load_data(
                data_path, batch_size, is_train=False, is_clean=True, stdize_dirty=True)
            return X_test_clean
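# Hedged usage sketch for get_dataset, assuming the nested dict layout the
# function reads ('data_pars' -> 'data_path' / 'batch_size' / 'clean'). The
# path below is a placeholder, not a real dataset location.
data_pars_example = {
    "data_pars": {
        "data_path": "path/to/dataset",  # placeholder
        "batch_size": 128,
        "clean": False,                  # dirty split -> full 5-tuple for 'train'
    }
}
train_loader, X_train, target_errors_train, dataset_obj, attributes = get_dataset(
    data_pars_example, task_type="train")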
def main(args_in):

    #### MAIN ####

    # saving data of experiment to folder is on
    if args_in.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args_in.output_folder
        args_in.folder_output = folder_output

        # structs for saving data
        args_in.losses_save = {"train": {}, "test": {}}

        try:
            os.makedirs(folder_output + '/')
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # dtype definitions for running
    if args_in.cuda_on:
        dtype_float = torch.cuda.FloatTensor
        dtype_byte = torch.cuda.ByteTensor
    else:
        dtype_float = torch.FloatTensor
        dtype_byte = torch.ByteTensor

    # only one type of prior assumption on errors / outliers
    if (not args_in.l1_method) and (not args_in.l21_method):
        args_in.l21_method = True
    elif args_in.l1_method and args_in.l21_method:
        args_in.l21_method = False

    # Choose dataset to run on
    folder_path = args_in.data_folder

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        folder_path, args_in.batch_size, is_train=True, get_data_idxs=True, is_one_hot=True)
    args_in.dataset_defs = dataset_obj_train
    args_in.train_loader = train_loader
    args_in.target_errors_train = target_errors_train.type(dtype_byte)
    X_train = X_train.type(dtype_float)

    test_loader, X_test, target_errors_test, dataset_obj_test, _ = utils.load_data(
        folder_path, args_in.batch_size, is_train=False, get_data_idxs=True, is_one_hot=True)
    args_in.test_loader = test_loader
    args_in.target_errors_test = target_errors_test.type(dtype_byte)
    X_test = X_test.type(dtype_float)

    # -- clean versions for data repair evaluation (standardized according to the dirty data statistics)
    train_loader_clean, X_train_clean, _, dataset_obj_clean, _ = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=True, is_clean=True,
        is_one_hot=True, stdize_dirty=True)
    args_in.train_loader_clean = train_loader_clean
    X_train_clean = X_train_clean.type(dtype_float)

    test_loader_clean, X_test_clean, _, _, _ = utils.load_data(
        args_in.data_folder, args_in.batch_size, is_train=False, is_clean=True,
        is_one_hot=True, stdize_dirty=True)
    args_in.test_loader_clean = test_loader_clean
    X_test_clean = X_test_clean.type(dtype_float)

    # RAE model matrices
    rae_data_train = dict()
    rae_data_test = dict()

    rae_data_train['LD'] = torch.zeros_like(X_train).type(dtype_float)
    rae_data_test['LD'] = torch.zeros_like(X_test).type(dtype_float)

    rae_data_train['LS'] = X_train.clone()
    rae_data_test['LS'] = X_test.clone()

    rae_data_train['S'] = torch.zeros_like(X_train).type(dtype_float)
    rae_data_test['S'] = torch.zeros_like(X_test).type(dtype_float)

    # Run RAE model
    model = RAE(args_in.dataset_defs, args_in)

    if args_in.cuda_on:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=args_in.lr, weight_decay=args_in.l2_reg)

    for admm_iter in range(1, args_in.number_ADMM_iters + 1):
        # train
        converged_train = train(args_in, model, optimizer, rae_data_train,
                                X_train, X_train_clean, admm_iter)
        if converged_train:
            print("--> RAE for train data has converged!")

        # validation
        if args_in.turn_on_validation:
            test(args_in, model, optimizer, rae_data_test, X_test, X_test_clean, admm_iter)

    if args_in.save_on:

        ### Train Data
        outlier_metrics_train, repair_metrics_train = evaluation_phase(
            args_in, model, rae_data_train, X_train, X_train_clean,
            target_errors_train, train_loader)

        store_metrics_final('train', args_in.dataset_defs, attributes, outlier_metrics_train,
                            repair_metrics_train, target_errors_train, rae_data_train['S'], args_in)

        ### Test Data
        outlier_metrics_test, repair_metrics_test = evaluation_phase(
            args_in, model, rae_data_test, X_test, X_test_clean,
            target_errors_test, test_loader)

        store_metrics_final('test', args_in.dataset_defs, attributes, outlier_metrics_test,
                            repair_metrics_test, target_errors_test, rae_data_test['S'], args_in)

        # save model parameters
        model.cpu()
        torch.save(model.state_dict(), folder_output + "/model_params.pth")

        # remove non-serializable stuff
        del args_in.dataset_defs
        del args_in.train_loader
        del args_in.target_errors_train
        del args_in.test_loader
        del args_in.target_errors_test
        del args_in.train_loader_clean
        del args_in.test_loader_clean
        del args_in.folder_output
        del args_in.losses_save

        # save to .json file the args that were used for running the model
        with open(folder_output + "/args_run.json", "w") as outfile:
            json.dump(vars(args_in), outfile, indent=4, sort_keys=True)
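# The l1_method / l21_method flags above select the sparsity prior on the
# outlier matrix S. Below is a hedged numpy sketch of the standard proximal
# (shrinkage) operators used in robust-autoencoder ADMM updates; it is not
# code from this repo, and the row-wise grouping for the l2,1 case is an
# assumption.
import numpy as np

def prox_l1(S, lam):
    # element-wise soft thresholding: shrinks every entry toward zero
    return np.sign(S) * np.maximum(np.abs(S) - lam, 0.0)

def prox_l21(S, lam):
    # group (row-wise) soft thresholding: zeroes out whole rows with small norm
    norms = np.linalg.norm(S, axis=1, keepdims=True)
    return S * np.maximum(1.0 - lam / (norms + 1e-12), 0.0)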
def main(args):

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj_train, attributes = utils.load_data(
        args.data_folder, args.batch_size, is_train=True, is_one_hot=args.is_one_hot)

    test_loader, X_test, target_errors_test, _, _ = utils.load_data(
        args.data_folder, args.batch_size, is_train=False)

    df_data_train = dataset_obj_train.df_dataset_instance

    # Run Marginals to obtain cell log probs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        p_mat_train, _, _, _ = get_prob_matrix(df_data_train, dataset_obj_train.cat_cols,
                                               n_comp_max=40)

    nll_marginal_cell = -np.log(p_mat_train + 1e-8)

    target_errors_row_train = (target_errors_train.sum(dim=1) > 0)
    target_row_train = target_errors_row_train.numpy()

    target_errors_row_test = (target_errors_test.sum(dim=1) > 0)
    target_row_test = target_errors_row_test.numpy()

    # Run OCSVM row outlier detection
    clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    outlier_score_row_train = -clf.score_samples(X_train)
    outlier_score_row_test = -clf.score_samples(X_test)

    # Platt Scaling (uses Logistic Regression) of OCSVM scores
    lr_calib = LogisticRegression(solver='lbfgs')
    lr_calib.fit(outlier_score_row_test.reshape(-1, 1), target_row_test)

    p_inlier_train = lr_calib.predict_proba(outlier_score_row_train.reshape(-1, 1))[:, 0]
    nll_inlier_row_train = -np.log(p_inlier_train + 1e-8)  # -log(p_inlier)

    # Row metrics
    auc_row_train = auc_compute(target_row_train, outlier_score_row_train)
    avpr_row_train = avpr_compute(target_row_train, outlier_score_row_train)
    ll_row_train = log_loss(target_row_train, outlier_score_row_train)

    auc_row_train_calibed = auc_compute(target_row_train, nll_inlier_row_train)
    avpr_row_train_calibed = avpr_compute(target_row_train, nll_inlier_row_train)
    ll_row_train_calibed = log_loss(target_row_train, 1. - p_inlier_train)

    print("AUC Prev. Calib.: {}".format(auc_row_train))
    print("AVPR Prev. Calib.: {}".format(avpr_row_train))
    print("Cross-Entropy Prev. Calib. {}".format(ll_row_train))

    # Re-check score is still good after calibration (AVPR and AUC should be the same);
    # then Cross-Entropy should drop!
    print("AUC Post. Calib.: {}".format(auc_row_train_calibed))
    print("AVPR Post. Calib.: {}".format(avpr_row_train_calibed))
    print("Cross-Entropy Post. Calib. {}".format(ll_row_train_calibed))

    # combine calibrated OCSVM and Marginals for cell outlier detection
    nll_cells_final_train = nll_inlier_row_train.reshape(-1, 1) + nll_marginal_cell

    # Cell metrics
    auc_cell_train, auc_feats = get_auc_metrics(target_errors_train, nll_cells_final_train)
    avpr_cell_train, avpr_feats = get_avpr_metrics(target_errors_train, nll_cells_final_train)

    print('Combined: OCSVM + Marginals Train -- Cell AUC: {}, Cell AVPR: {}, Row AUC: {}, Row AVPR: {}'
          .format(auc_cell_train, avpr_cell_train, auc_row_train, avpr_row_train))

    # Save results into csv
    if args.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model
        try:
            os.makedirs(folder_output)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        columns = ['AUC row', 'AVPR row', 'AUC cell', 'AVPR cell']
        results = {'AUC row': [auc_row_train],
                   'AVPR row': [avpr_row_train],
                   'AUC cell': [auc_cell_train],
                   'AVPR cell': [avpr_cell_train]}

        # Dataframe
        df_out = pd.DataFrame(data=results, columns=columns)
        df_out.index.name = "Epochs"
        df_out.to_csv(folder_output + "/train_epochs_data.csv")

        # store AVPR for features (cell only)
        df_avpr_feat_cell = pd.DataFrame([], index=['AVPR'], columns=attributes)
        df_avpr_feat_cell.loc['AVPR'] = avpr_feats
        df_avpr_feat_cell.to_csv(folder_output + "/train_avpr_features.csv")

        # store AUC for features (cell only)
        df_auc_feat_cell = pd.DataFrame([], index=['AUC'], columns=attributes)
        df_auc_feat_cell.loc['AUC'] = auc_feats
        df_auc_feat_cell.to_csv(folder_output + "/train_auc_features.csv")
def main(args):

    # Load datasets
    train_loader, X_train, target_errors_train, dataset_obj, attributes = utils.load_data(
        args.data_folder, args.batch_size, is_train=True, get_data_idxs=False)

    test_loader, X_test, target_errors_test, _, _ = utils.load_data(
        args.data_folder, args.batch_size, is_train=False)

    # -- clean versions for evaluation
    _, X_train_clean, _, _, _ = utils.load_data(args.data_folder, args.batch_size,
                                                is_train=True, is_clean=True, stdize_dirty=True)

    _, X_test_clean, _, _, _ = utils.load_data(args.data_folder, args.batch_size,
                                               is_train=False, is_clean=True, stdize_dirty=True)

    # if running on gpu, then load data there
    if args.cuda_on:
        X_test = X_test.cuda()
        target_errors_test = target_errors_test.cuda()
        X_train_clean = X_train_clean.cuda()
        X_test_clean = X_test_clean.cuda()
        target_errors_train = target_errors_train.cuda()
        X_train = X_train.cuda()

    # for checking w (pi) raw convergence
    logit_pi_prev_train = torch.tensor([])
    logit_pi_prev_test = torch.tensor([])

    # Import the model from the correct file
    outlier_model = __import__(args.outlier_model)
    model = outlier_model.VAE(dataset_obj, args)

    if args.load_model:
        model.load_state_dict(torch.load(args.load_model_path))

    print(args)

    if args.cuda_on:
        model.cuda()

    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=args.lr, weight_decay=args.l2_reg)  # excludes frozen params / layers

    # structs for saving data
    losses_save = {"train": {}, "test": {}, "train_per_feature": {}, "test_per_feature": {}}

    # Run epochs
    for epoch in range(1, args.number_epochs + 1):

        # Training Phase
        _train_loader, _dataset_obj = train_loader, dataset_obj
        training_phase(model, optimizer, _train_loader, args, epoch)

        # Compute all the losses and metrics per epoch (Train set)
        compute_metrics(model, X_train, _dataset_obj, args, epoch, losses_save,
                        logit_pi_prev_train, X_train_clean, target_errors_train, mode="train")

        # Test Phase
        compute_metrics(model, X_test, dataset_obj, args, epoch, losses_save,
                        logit_pi_prev_test, X_test_clean, target_errors_test, mode="test")

    # save to folder AVPR / AUC per feature
    if args.save_on:
        # create folder for saving experiment data (if necessary)
        folder_output = args.output_folder + "/" + args.outlier_model

        ### Train Data
        save_to_csv(model, X_train, X_train_clean, target_errors_train, attributes,
                    losses_save, dataset_obj, folder_output, args, epoch, mode='train')

        ### Test Data
        save_to_csv(model, X_test, X_test_clean, target_errors_test, attributes,
                    losses_save, dataset_obj, folder_output, args, epoch, mode='test')

        # save model parameters
        model.cpu()
        torch.save(model.state_dict(), folder_output + "/model_params.pth")

        # save to .json file the args that were used for running the model
        with open(folder_output + "/args_run.json", "w") as outfile:
            json.dump(vars(args), outfile, indent=4, sort_keys=True)
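# Hedged sketch of the frozen-parameter filtering used in the Adam call above:
# parameters with requires_grad=False are excluded from the optimizer, which
# is how pre-trained / frozen layers stay fixed. The toy module below is
# illustrative only.
import torch.nn as nn
import torch.optim as optim

net = nn.Sequential(nn.Linear(8, 4), nn.Linear(4, 1))
for p in net[0].parameters():
    p.requires_grad = False  # freeze the first layer

opt = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                 lr=1e-3, weight_decay=1e-4)  # only the second layer is updated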