def getFinalResultsForTask(self, setting_dict):
    # When users are treated as tasks, restrict the data to the single user named in
    # setting_dict; otherwise train on the full dataframe for the named label.
    if self.users_as_tasks:
        task_df = self.data_df[self.data_df['user_id'] == setting_dict['task_name']]
        target_label = [self.target_label]
    else:
        task_df = self.data_df
        target_label = [helper.getOfficialLabelName(setting_dict['task_name'])]

    # Rebuild the network with the best hyperparameter settings and retrain it.
    self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats),
                                       target_label, verbose=False,
                                       val_type=self.val_type)
    self.net.setParams(l2_beta=setting_dict['l2_beta'],
                       initial_learning_rate=setting_dict['learning_rate'],
                       decay=setting_dict['decay'],
                       decay_steps=setting_dict['decay_steps'],
                       decay_rate=setting_dict['decay_rate'],
                       batch_size=setting_dict['batch_size'],
                       optimizer=setting_dict['optimizer'],
                       dropout=setting_dict['dropout'])
    self.constructNetwork(setting_dict['hidden_layers'])
    self.net.setUpGraph()
    preds = self.net.runGraph(self.test_steps, print_test=True, return_test_preds=True)

    # Save the held-out test predictions for this task to a csv file.
    preds_df = self.net.get_preds_for_df()
    label_name = setting_dict['task_name']
    preds_path = self.results_path + "Preds-" + self.save_prefix + label_name + '.csv'
    preds_df.to_csv(preds_path)
    print "Preds df saved to", preds_path

    return self.net.final_test_results['acc'], self.net.final_test_results['auc'], preds
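# Illustrative only: the setting_dict passed to getFinalResultsForTask is a flat dictionary
# of the hyperparameters chosen during validation. The keys below are taken from the method
# body above; every value (and the task name) is a made-up placeholder, not a real setting.
example_setting_dict = {
    'task_name': 'ExampleLabelOrUserID',  # a label name, or a user_id when users_as_tasks is True
    'l2_beta': 1e-4,
    'learning_rate': 1e-3,
    'decay': True,
    'decay_steps': 1000,
    'decay_rate': 0.95,
    'batch_size': 50,
    'optimizer': 'adam',
    'dropout': True,
    'hidden_layers': [100, 50],
}
# acc, auc, preds = wrapper.getFinalResultsForTask(example_setting_dict)  # 'wrapper' is an assumed instance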
def getFinalResultsAndSave(self, results_dict):
    print "\nRetraining on full training data with the best settings..."
    self.drop20 = False
    self.initializeAndTrainMTMKL(self.train_tasks, results_dict['C'], results_dict['beta'],
                                 results_dict['kernel'], results_dict['v'],
                                 results_dict['regularizer'], verbose=True)

    print "\nEvaluating results on held-out test set!! ..."
    all_preds = []
    all_true_y = []
    per_task_accs = [np.nan] * self.n_tasks
    per_task_aucs = [np.nan] * self.n_tasks
    per_task_f1 = [np.nan] * self.n_tasks
    per_task_precision = [np.nan] * self.n_tasks
    per_task_recall = [np.nan] * self.n_tasks
    for t in range(self.n_tasks):
        preds = self.classifier.predictOneTask(self.test_tasks, t)
        true_y = list(self.test_tasks[t]['Y'].flatten())
        if len(preds) == 0 or len(true_y) == 0:
            print "no y for task", t, "... skipping"
            continue
        all_preds.extend(preds)
        all_true_y.extend(true_y)

        # save the per-task results
        t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
        per_task_accs[t] = t_acc
        per_task_aucs[t] = t_auc
        per_task_f1[t] = t_f1
        per_task_precision[t] = t_precision
        per_task_recall[t] = t_recall

    print "\nPlotting cool stuff about the final model..."
    self.saveImagePlot(self.classifier.eta, 'Etas')
    pd.DataFrame(self.classifier.eta).to_csv(self.etas_path + self.save_prefix + "-etas.csv")

    print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
    acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y)
    print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

    print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
    avg_acc = np.nanmean(per_task_accs)
    avg_auc = np.nanmean(per_task_aucs)
    avg_f1 = np.nanmean(per_task_f1)
    avg_precision = np.nanmean(per_task_precision)
    avg_recall = np.nanmean(per_task_recall)
    print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

    print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
    if not self.users_as_tasks:
        for t in range(self.n_tasks):
            task_name = helper.getFriendlyLabelName(self.test_tasks[t]['Name'])
            print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], \
                'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t]

    if self.test_csv_filename is not None:
        print "\tSAVING HELD OUT PREDICTIONS"
        if 'Big5GenderKMeansCluster' in self.file_prefix:
            task_column = 'Big5GenderKMeansCluster'
            tasks_are_ints = True
            label_name = helper.getFriendlyLabelName(self.file_prefix)
            wanted_label = helper.getOfficialLabelName(label_name)
            predictions_df = helper.get_test_predictions_for_df_with_task_column(
                self.classifier.predict_01, self.test_csv_filename, task_column,
                self.test_tasks, wanted_label=wanted_label,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                label_name=label_name, tasks_are_ints=tasks_are_ints)
        elif not self.users_as_tasks:
            predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                self.classifier.predict_01, self.test_csv_filename, self.test_tasks,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
        else:
            print "Error! Cannot determine what type of model you are training and therefore cannot save predictions."
            return
        predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
    else:
        print "Uh oh, the test csv filename was not set, can't save test preds"
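# A minimal sketch of the metric bundle used above, assuming binary 0/1 predictions. This is
# NOT the repo's helper.computeAllMetricsForPreds; it only illustrates the
# (acc, auc, f1, precision, recall) tuple order the calling code relies on, and why tasks
# without both classes can end up as NaN entries that np.nanmean then skips.
import numpy as np
from sklearn import metrics

def compute_all_metrics_sketch(preds, true_y):
    preds = np.asarray(preds)
    true_y = np.asarray(true_y)
    acc = metrics.accuracy_score(true_y, preds)
    # AUC is undefined when only one class is present in true_y (and is computed here on
    # hard 0/1 predictions rather than scores); return NaN so task averaging ignores it.
    auc = metrics.roc_auc_score(true_y, preds) if len(np.unique(true_y)) > 1 else np.nan
    f1 = metrics.f1_score(true_y, preds)
    precision = metrics.precision_score(true_y, preds)
    recall = metrics.recall_score(true_y, preds)
    return acc, auc, f1, precision, recall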
def retrainAndPlot(self, setting_dict):
    print "\nRETRAINING WITH THE BEST SETTINGS:"
    # Re-apply the best hyperparameters, rebuild the graph, and retrain.
    self.net.verbose = True
    self.net.setParams(l2_beta=setting_dict['l2_beta'],
                       initial_learning_rate=setting_dict['learning_rate'],
                       decay=setting_dict['decay'],
                       decay_steps=setting_dict['decay_steps'],
                       decay_rate=setting_dict['decay_rate'],
                       batch_size=setting_dict['batch_size'],
                       optimizer=setting_dict['optimizer'],
                       dropout=setting_dict['dropout'])
    self.constructNetwork(setting_dict['hidden_layers'])
    self.net.setUpGraph()
    self.net.runGraph(self.test_steps, print_test=True)

    # Plot validation curves, either per label or for the single target.
    if self.multilabel:
        for label in self.optimize_labels:
            friendly_label = helper.getFriendlyLabelName(label)
            self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label)
            self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label)
            print "Final validation results for", friendly_label, "... Acc:", \
                self.net.training_val_results_per_task['acc'][label][-1], \
                "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
    elif self.print_per_task:
        for label in self.wanted_labels:
            friendly_label = helper.getFriendlyLabelName(label)
            self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label)
            self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label)
            print "Final validation results for", friendly_label, "... Acc:", \
                self.net.training_val_results_per_task['acc'][label][-1], \
                "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
    else:
        self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.eps')
        self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.png')
        print "Final AUC:", self.net.training_val_results['auc'][-1]

    if self.test_csv_filename is not None:
        if self.multitask:
            # Guess the task column from the dataset name before building the predictions df.
            task_column = None
            if 'Cluster' in self.dataset_name:
                print "Guessing the task column is Big5GenderKMeansCluster - if this is incorrect expect errors"
                task_column = 'Big5GenderKMeansCluster'
                tasks_are_ints = True
            if 'User' in self.dataset_name:
                print "Guessing the task column is user_id - if this is incorrect expect errors"
                task_column = 'user_id'
                tasks_are_ints = False
            if task_column is not None:
                label_name = helper.getFriendlyLabelName(self.dataset_name)
                wanted_label = helper.getOfficialLabelName(label_name)
                test_preds_df = helper.get_test_predictions_for_df_with_task_column(
                    self.net.predict, self.test_csv_filename, task_column,
                    self.net.test_tasks, wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1],
                    label_name=label_name, tasks_are_ints=tasks_are_ints)
            else:
                test_preds_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.net.predict, self.test_csv_filename, self.net.test_tasks,
                    num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1])
        else:
            test_preds_df = self.net.get_preds_for_df()
        print "Got a test preds df! Saving it to:", self.results_path + "Preds-" + self.val_output_prefix + '.csv'
        test_preds_df.to_csv(self.results_path + 'Preds-' + self.val_output_prefix + '.csv')
    else:
        print "Uh oh, the test csv filename was not set, can't save test preds"

    print "Saving a copy of the final model!"
    self.net.save_model(self.val_output_prefix, self.results_path)
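# Hedged sketch of the task-column guess used in retrainAndPlot: the dataset name is the only
# hint available, so 'Cluster' maps to the Big5GenderKMeansCluster column (integer task ids)
# and 'User' maps to user_id (string task ids). The column names come from the code above;
# the standalone function itself is hypothetical and not part of the repo.
def guess_task_column_sketch(dataset_name):
    if 'Cluster' in dataset_name:
        return 'Big5GenderKMeansCluster', True   # tasks_are_ints
    if 'User' in dataset_name:
        return 'user_id', False
    return None, None                            # fall back to the no-task-column path

# e.g. guess_task_column_sketch('UserLabeledDataset') -> ('user_id', False)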
def getFinalResultsAndSave(self, setting_dict):
    if self.val_type == 'cross':
        print "\nPlotting cross-validation results for best settings..."
        self.getCrossValidationResults(dict(), setting_dict['tau10'], setting_dict['tau20'],
                                       setting_dict['sigma_multiplier'],
                                       setting_dict['mu_multiplier'], save_plots=True)

    print "\nRetraining on training data with the best settings..."
    self.initializeHBLRModel(self.train_tasks)
    self.classifier.verbose = True
    self.setClassifierToSetting(setting_dict['tau10'], setting_dict['tau20'],
                                setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'])
    self.classifier.trainUntilConverged()

    print "\nPlotting and saving cool stuff about the final model..."
    self.saveImagePlot(self.classifier.phi, 'Phi')
    pd.DataFrame(self.classifier.phi).to_csv(self.results_path + self.save_prefix + "-phi.csv")
    self.saveConvergencePlots()

    print "\nEvaluating results on held-out test set!! ..."
    all_preds = []
    all_true_y = []
    all_X_data = []
    per_task_accs = [np.nan] * self.n_tasks
    per_task_aucs = [np.nan] * self.n_tasks
    per_task_f1 = [np.nan] * self.n_tasks
    per_task_precision = [np.nan] * self.n_tasks
    per_task_recall = [np.nan] * self.n_tasks
    for t in range(self.n_tasks):
        preds = self.classifier.predictBinary(self.test_tasks[t]['X'], t)
        true_y = list(self.test_tasks[t]['Y'].flatten())
        if len(preds) == 0 or len(true_y) == 0:
            continue
        all_preds.extend(preds)
        all_true_y.extend(true_y)
        all_X_data.extend(self.test_tasks[t]['X'])

        # save the per-task results
        t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
        per_task_accs[t] = t_acc
        per_task_aucs[t] = t_auc
        per_task_f1[t] = t_f1
        per_task_precision[t] = t_precision
        per_task_recall[t] = t_recall

    print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
    acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y)
    print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

    print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
    avg_acc = np.nanmean(per_task_accs)
    avg_auc = np.nanmean(per_task_aucs)
    avg_f1 = np.nanmean(per_task_f1)
    avg_precision = np.nanmean(per_task_precision)
    avg_recall = np.nanmean(per_task_recall)
    print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

    print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
    if not self.users_as_tasks:
        for t in range(self.n_tasks):
            task_name = helper.getFriendlyLabelName(self.test_tasks[t]['Name'])
            print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], \
                'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t]

    if self.test_csv_filename is not None:
        print "\tSAVING HELD OUT PREDICTIONS"
        if self.users_as_tasks:
            task_column = 'user_id'
            label_name = helper.getFriendlyLabelName(self.file_prefix)
            wanted_label = helper.getOfficialLabelName(label_name)
            predictions_df = helper.get_test_predictions_for_df_with_task_column(
                self.classifier.predictBinary, self.test_csv_filename, task_column,
                self.test_tasks, wanted_label=wanted_label,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                label_name=label_name, tasks_are_ints=False)
        else:
            predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                self.classifier.predictBinary, self.test_csv_filename, self.test_tasks,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
        predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
    else:
        print "Uh oh, the test csv filename was not set, can't save test preds"

    print "\t SAVING CLASSIFIER"
    # Write the pickle in binary mode so it is portable across platforms.
    with open(self.results_path + "PickledModel-" + self.save_prefix + '.p', "wb") as f:
        pickle.dump(self.classifier, f)
def get_final_results(self, optimize_for='val_acc'):
    if self.users_as_tasks and not self.check_test:
        print "check_test is set to False; will not evaluate performance on held-out test set."
        return
    print "\nAbout to evaluate results on held-out test set!!"
    print "Will use the settings that produced the best", optimize_for

    all_preds = []
    all_true_y = []
    per_task_accs = []
    per_task_aucs = []
    per_task_f1 = []
    per_task_precision = []
    per_task_recall = []
    for t in range(self.n_tasks):
        # Look up the hyperparameter setting that scored best on validation for this task.
        task_settings = self.find_best_setting_for_task(t, optimize_for=optimize_for)
        assert task_settings['task_num'] == t
        if not self.users_as_tasks:
            print "\nBEST SETTING FOR TASK", t, "-", task_settings['task_name']
            print "The highest", optimize_for, "of", task_settings[optimize_for], "was found with the following settings:"
            print task_settings

        task_settings = self.convert_param_dict_for_use(task_settings)
        preds, true_y = self.get_preds_true_for_task(self.train_tasks, self.test_tasks, task_settings)
        if preds is None or true_y is None:
            continue
        all_preds.extend(preds)
        all_true_y.extend(true_y)

        # save the per-task results
        t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
        per_task_accs.append(t_acc)
        per_task_aucs.append(t_auc)
        per_task_f1.append(t_f1)
        per_task_precision.append(t_precision)
        per_task_recall.append(t_recall)

        if not self.users_as_tasks:
            print "\nFINAL TEST RESULTS FOR", helper.getFriendlyLabelName(self.train_tasks[t]['Name'])
            print 'Acc:', t_acc, 'AUC:', t_auc, 'F1:', t_f1, 'Precision:', t_precision, 'Recall:', t_recall

    print "\nHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
    avg_acc = np.nanmean(per_task_accs)
    avg_auc = np.nanmean(per_task_aucs)
    avg_f1 = np.nanmean(per_task_f1)
    avg_precision = np.nanmean(per_task_precision)
    avg_recall = np.nanmean(per_task_recall)
    print 'Acc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

    if self.test_csv_filename is not None:
        print "\tSAVING HELD OUT PREDICTIONS"
        if self.users_as_tasks:
            task_column = 'user_id'
            label_name = helper.getFriendlyLabelName(self.file_prefix)
            wanted_label = helper.getOfficialLabelName(label_name)
            predictions_df = helper.get_test_predictions_for_df_with_task_column(
                self.predict_task, self.test_csv_filename, task_column, self.test_tasks,
                wanted_label=wanted_label,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                label_name=label_name, tasks_are_ints=False)
        else:
            predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                self.predict_task, self.test_csv_filename, self.test_tasks,
                num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
        predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
    else:
        print "Uh oh, the test csv filename was not set, can't save test preds"
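# Hedged sketch of the per-task selection that find_best_setting_for_task performs above:
# among all validation results recorded for task t, keep the entry with the highest value of
# the chosen metric (e.g. 'val_acc'). The shape of the results list and its extra keys are
# assumptions for illustration; only 'task_num' and the optimize_for key appear in the code above.
def find_best_setting_sketch(validation_results, task_num, optimize_for='val_acc'):
    task_results = [r for r in validation_results if r['task_num'] == task_num]
    return max(task_results, key=lambda r: r[optimize_for])

# e.g. find_best_setting_sketch([{'task_num': 0, 'val_acc': 0.62},
#                                {'task_num': 0, 'val_acc': 0.71}], 0)
# -> {'task_num': 0, 'val_acc': 0.71}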