    def getValidationResults(self, results_dict, C, beta, kernel, v,
                             regularizer):
        converged = self.initializeAndTrainMTMKL(self.train_tasks, C, beta,
                                                 kernel, v, regularizer)

        if self.users_as_tasks:
            if not converged:
                val_acc = np.nan
                val_auc = np.nan
            else:
                val_acc, val_auc = self.classifier.getAccuracyAucAllTasks(
                    self.val_tasks)
            results_dict['val_acc'] = val_acc
            results_dict['val_auc'] = val_auc
        else:
            accs = []
            aucs = []
            for t in range(self.n_tasks):
                if not converged:
                    acc = np.nan
                    auc = np.nan
                else:
                    acc, auc = self.classifier.getAccuracyAucOnOneTask(
                        self.val_tasks, t)
                task_name = self.val_tasks[t]['Name']
                results_dict['TaskAcc-' +
                             helper.getFriendlyLabelName(task_name)] = acc
                results_dict['TaskAuc-' +
                             helper.getFriendlyLabelName(task_name)] = auc
                if self.cluster_users or task_name in self.optimize_labels:
                    accs.append(acc)
                    aucs.append(auc)
            # nanmean: tasks that did not converge (or produced no score) are NaN
            results_dict['val_acc'] = np.nanmean(accs)
            results_dict['val_auc'] = np.nanmean(aucs)
        return results_dict
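
# A quick aside (not part of the wrapper above): the per-task averages in these
# wrappers use np.nanmean so that a task whose model failed to converge (and
# therefore reports NaN) does not wipe out the aggregate the way np.mean would.
# Minimal standalone illustration with made-up accuracies:
import numpy as np

accs = [0.71, np.nan, 0.64]  # hypothetical per-task validation accuracies
print(np.mean(accs))         # nan -- a single unconverged task hides the rest
print(np.nanmean(accs))      # 0.675 -- averages over the tasks that scored
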
	def run(self):
		print "\nYou have chosen to test a total of", self.num_settings, "settings for each task"
		print "There are", self.n_tasks, "tasks, meaning you are training a total of..."
		print "\t", self.num_settings * self.n_tasks, "neural networks!!"
		sys.stdout.flush()

		if self.users_as_tasks:
			tasks = self.data_df['user_id'].unique()
		else:
			tasks = [helper.getFriendlyLabelName(x) for x in self.wanted_labels]

		i = 0
		for t in range(len(tasks)):
			if self.users_as_tasks:
				self.runOneTask(tasks[i], self.target_label)
			else:
				self.runOneTask(tasks[i], self.wanted_labels[i])
			if self.test_run and i > 2:
				break
			i += 1
		self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv')

		if self.users_as_tasks:
			print "\n\nFINAL RESULTS - Averaging individual models:"
			print "\tValidation set: Accuracy =", np.nanmean(self.val_results_df['val_acc']), "AUC = ", np.nanmean(self.val_results_df['val_auc'])
			print "\tTest set: Accuracy =", np.nanmean(self.val_results_df['test_acc']), "AUC = ", np.nanmean(self.val_results_df['test_auc'])
			print ""
			print "FINAL RESULTS - Aggregating predictions of individual models"
			agg_auc = helper.computeAuc(self.cumulative_test_preds, self.cumulative_test_true)
			agg_acc = helper.getBinaryAccuracy(self.cumulative_test_preds, self.cumulative_test_true)
			print "\tTest set: Accuracy =", agg_acc, "AUC = ", agg_auc
    def getValidationResults(self, results_dict):
        self.classifier.trainUntilConverged()
        results_dict['num_clusters'] = self.classifier.K

        if self.users_as_tasks:
            val_acc, val_auc = self.getAccuracyAucOnAllTasks(self.val_tasks)
            results_dict['val_acc'] = val_acc
            results_dict['val_auc'] = val_auc
        else:
            accs = []
            aucs = []
            for t in range(self.n_tasks):
                acc, auc = self.getAccuracyAucOnOneTask(self.val_tasks, t)
                task_name = self.val_tasks[t]['Name']
                results_dict['TaskAcc-' +
                             helper.getFriendlyLabelName(task_name)] = acc
                results_dict['TaskAuc-' +
                             helper.getFriendlyLabelName(task_name)] = auc
                if task_name in self.optimize_labels:
                    accs.append(acc)
                    aucs.append(auc)
            results_dict['val_acc'] = np.nanmean(accs)
            results_dict['val_auc'] = np.nanmean(aucs)
        return results_dict
    def getSavePrefix(self, file_name, target_label, replace=False):
        if '/' in file_name:
            slash_loc = file_name.find('/')
            file_name = file_name[slash_loc:]
        dash_loc = file_name.find('-')
        if self.users_as_tasks:
            task_name = "tfSTLUsers"
            label_name = '-' + helper.getFriendlyLabelName(target_label)
        else:
            task_name = "tfSTLWellbeing"
            label_name = ""
        prefix = task_name + file_name[dash_loc:-4] + label_name
        if not replace:
            while os.path.exists(self.results_path + prefix + '.csv'):
                prefix = prefix + '2'
        return prefix
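
# getSavePrefix above avoids clobbering old results by appending '2' until the
# candidate file name is unused. The same idea in isolation, with a set standing
# in for the results directory listing (the names here are purely illustrative):
def unique_prefix(prefix, existing_names):
    # Mirrors the os.path.exists loop above.
    while prefix in existing_names:
        prefix = prefix + '2'
    return prefix

already_saved = {'tfSTLWellbeing-example', 'tfSTLWellbeing-example2'}
print(unique_prefix('tfSTLWellbeing-example', already_saved))  # tfSTLWellbeing-example22
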
	def get_preds_for_df(self):
		X = self.data_df[self.wanted_feats].as_matrix()
		preds = self.predict(X)
		assert len(preds) == len(self.data_df)
		preds_df = copy.deepcopy(self.data_df)

		for i,wanted_label in enumerate(self.wanted_labels):
			label_name = helper.getFriendlyLabelName(wanted_label)
			preds_df['test_pred_'+label_name] = preds

			test_df = preds_df[preds_df['dataset']=='Test']
			test_df = test_df.dropna(subset=[wanted_label], how='any')
			all_preds = test_df['test_pred_'+label_name].tolist()
			all_true = test_df[wanted_label].tolist()
			print "FINAL METRICS ON TEST SET for label", label_name, ":", helper.computeAllMetricsForPreds(all_preds, all_true)

		print "Predictions have been computed and are stored in dataframe."
		return preds_df
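
# get_preds_for_df evaluates only rows from the test split whose label is
# present. A small pandas sketch of that filtering step on a toy frame (the
# column names here are illustrative, not taken from the real dataset):
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({
    'dataset': ['Train', 'Test', 'Test', 'Test'],
    'happiness_Label': [1.0, 0.0, np.nan, 1.0],
    'test_pred_happiness': [0.9, 0.1, 0.8, 0.7],
})

test_df = toy_df[toy_df['dataset'] == 'Test']
test_df = test_df.dropna(subset=['happiness_Label'], how='any')
print(test_df[['happiness_Label', 'test_pred_happiness']])  # two usable test rows
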
    def getFinalResultsAndSave(self, results_dict):
        print "\nRetraining on full training data with the best settings..."
        self.drop20 = False
        self.initializeAndTrainMTMKL(self.train_tasks,
                                     results_dict['C'],
                                     results_dict['beta'],
                                     results_dict['kernel'],
                                     results_dict['v'],
                                     results_dict['regularizer'],
                                     verbose=True)

        print "\nEvaluating results on held-out test set!! ..."
        all_preds = []
        all_true_y = []
        per_task_accs = [np.nan] * self.n_tasks
        per_task_aucs = [np.nan] * self.n_tasks
        per_task_f1 = [np.nan] * self.n_tasks
        per_task_precision = [np.nan] * self.n_tasks
        per_task_recall = [np.nan] * self.n_tasks
        for t in range(self.n_tasks):
            preds = self.classifier.predictOneTask(self.test_tasks, t)
            true_y = list(self.test_tasks[t]['Y'].flatten())

            if len(preds) == 0 or len(true_y) == 0:
                print "no y for task", t, "... skipping"
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs[t] = t_acc
            per_task_aucs[t] = t_auc
            per_task_f1[t] = t_f1
            per_task_precision[t] = t_precision
            per_task_recall[t] = t_recall

        print "\nPlotting cool stuff about the final model..."
        self.saveImagePlot(self.classifier.eta, 'Etas')
        pd.DataFrame(
            self.classifier.eta).to_csv(self.etas_path + self.save_prefix +
                                        "-etas.csv")

        print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
        acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
            all_preds, all_true_y)
        print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

        print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = self.test_tasks[t]['Name']
                task_name = helper.getFriendlyLabelName(task_name)
                print "\t\t", task_name, "- Acc:", per_task_accs[
                    t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[
                        t], 'Precision:', per_task_precision[
                            t], 'Recall:', per_task_recall[t]

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if 'Big5GenderKMeansCluster' in self.file_prefix:
                task_column = 'Big5GenderKMeansCluster'
                tasks_are_ints = True
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.classifier.predict_01,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=tasks_are_ints)
            elif not self.users_as_tasks:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.classifier.predict_01,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            else:
                print "Error! Cannot determine what type of model you are training and therefore cannot save predictions."
                return
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"
    def getCrossValidationResults(self,
                                  results_dict,
                                  C,
                                  beta,
                                  kernel,
                                  v,
                                  regularizer,
                                  save_plots=False,
                                  print_per_fold=True):
        all_acc = []
        all_auc = []
        all_f1 = []
        all_precision = []
        all_recall = []
        if not self.users_as_tasks:
            per_task_accs = [[] for i in range(self.n_tasks)]
            per_task_aucs = [[] for i in range(self.n_tasks)]
            per_task_f1 = [[] for i in range(self.n_tasks)]
            per_task_precision = [[] for i in range(self.n_tasks)]
            per_task_recall = [[] for i in range(self.n_tasks)]

        for f in range(self.num_cross_folds):
            train_tasks, val_tasks = helper.loadCrossValData(
                self.datasets_path,
                self.file_prefix,
                f,
                reshape=False,
                fix_y=True)
            converged = self.initializeAndTrainMTMKL(train_tasks, C, beta,
                                                     kernel, v, regularizer)
            if not converged:
                all_acc.append(np.nan)
                all_auc.append(np.nan)
                all_f1.append(np.nan)
                all_precision.append(np.nan)
                all_recall.append(np.nan)
                continue

            # Get results!
            fold_preds = []
            fold_true_y = []
            for t in range(self.n_tasks):
                preds = self.classifier.predictOneTask(val_tasks, t)
                true_y = list(val_tasks[t]['Y'].flatten())

                if not self.users_as_tasks:
                    # save the per-task results
                    t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                        preds, true_y)
                    per_task_accs[t].append(t_acc)
                    per_task_aucs[t].append(t_auc)
                    per_task_f1[t].append(t_f1)
                    per_task_precision[t].append(t_precision)
                    per_task_recall[t].append(t_recall)
                    if print_per_fold:
                        print "Fold", f, "Task", val_tasks[t][
                            'Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision", t_precision, "recall", t_recall

                fold_preds.extend(preds)
                fold_true_y.extend(true_y)

            acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
                fold_preds, fold_true_y)
            all_acc.append(acc)
            all_auc.append(auc)
            all_f1.append(f1)
            all_precision.append(precision)
            all_recall.append(recall)
            if print_per_fold:
                print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision", precision, "recall", recall

        print "accs for all folds", all_acc
        print "aucs for all folds", all_auc

        # Add results to the dictionary
        results_dict['val_acc'] = np.nanmean(all_acc)
        results_dict['val_auc'] = np.nanmean(all_auc)
        results_dict['val_f1'] = np.nanmean(all_f1)
        results_dict['val_precision'] = np.nanmean(all_precision)
        results_dict['val_recall'] = np.nanmean(all_recall)

        # Add per-task results to the dictionary
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = val_tasks[t]['Name']
                results_dict[
                    'TaskAcc-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_accs[t])
                results_dict[
                    'TaskAuc-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_aucs[t])
                results_dict[
                    'TaskF1-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_f1[t])
                results_dict[
                    'TaskPrecision-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_precision[t])
                results_dict[
                    'TaskRecall-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_recall[t])

        return results_dict
    def retrainAndPlot(self, setting_dict):
        print "\nRETRAINING WITH THE BEST SETTINGS:"

        self.net.verbose = True
        self.net.setParams(l2_beta=setting_dict['l2_beta'],
                           initial_learning_rate=setting_dict['learning_rate'],
                           decay=setting_dict['decay'],
                           decay_steps=setting_dict['decay_steps'],
                           decay_rate=setting_dict['decay_rate'],
                           batch_size=setting_dict['batch_size'],
                           optimizer=setting_dict['optimizer'],
                           dropout=setting_dict['dropout'])
        self.constructNetwork(setting_dict['hidden_layers'])

        self.net.setUpGraph()
        self.net.runGraph(self.test_steps, print_test=True)

        if self.multilabel:
            for label in self.optimize_labels:
                friendly_label = helper.getFriendlyLabelName(label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.eps',
                                        label=label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.png',
                                        label=label)
                print "Final validation results for", friendly_label,"... Acc:", \
                  self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
        elif self.print_per_task:
            for label in self.wanted_labels:
                friendly_label = helper.getFriendlyLabelName(label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.eps',
                                        label=label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.png',
                                        label=label)
                print "Final validation results for", friendly_label,"... Acc:", \
                 self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
        else:
            self.net.plotValResults(save_path=self.figures_path +
                                    self.val_output_prefix + '.eps')
            self.net.plotValResults(save_path=self.figures_path +
                                    self.val_output_prefix + '.png')
            print "Final AUC:", self.net.training_val_results['auc'][-1]

        if self.test_csv_filename is not None:
            if self.multitask:
                task_column = None
                if 'Cluster' in self.dataset_name:
                    print "Guessing the task column is Big5GenderKMeansCluster - if this is incorrect expect errors"
                    task_column = 'Big5GenderKMeansCluster'
                    tasks_are_ints = True

                if 'User' in self.dataset_name:
                    print "Guessing the task column is user_id - if this is incorrect expect errors"
                    task_column = 'user_id'
                    tasks_are_ints = False

                if task_column is not None:
                    label_name = helper.getFriendlyLabelName(self.dataset_name)
                    wanted_label = helper.getOfficialLabelName(label_name)
                    test_preds_df = helper.get_test_predictions_for_df_with_task_column(
                        self.net.predict,
                        self.test_csv_filename,
                        task_column,
                        self.net.test_tasks,
                        wanted_label=wanted_label,
                        num_feats_expected=np.shape(
                            self.net.test_tasks[0]['X'])[1],
                        label_name=label_name,
                        tasks_are_ints=tasks_are_ints)
                else:
                    test_preds_df = helper.get_test_predictions_for_df_with_no_task_column(
                        self.net.predict,
                        self.test_csv_filename,
                        self.net.test_tasks,
                        num_feats_expected=np.shape(
                            self.net.test_tasks[0]['X'])[1])
            else:
                test_preds_df = self.net.get_preds_for_df()
            print "Got a test preds df! Saving it to:", self.results_path + "Preds-" + self.val_output_prefix + '.csv'
            test_preds_df.to_csv(self.results_path + 'Preds-' +
                                 self.val_output_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"

        print "Saving a copy of the final model!"
        self.net.save_model(self.val_output_prefix, self.results_path)
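
# retrainAndPlot guesses the task column from substrings of the dataset name.
# The same inference as a standalone helper (the substrings and column names
# come from the code above; the example dataset names are made up). As above,
# 'User' overrides 'Cluster' when both substrings match.
def guess_task_column(dataset_name):
    task_column, tasks_are_ints = None, None
    if 'Cluster' in dataset_name:
        task_column, tasks_are_ints = 'Big5GenderKMeansCluster', True
    if 'User' in dataset_name:
        task_column, tasks_are_ints = 'user_id', False
    return task_column, tasks_are_ints

print(guess_task_column('datasetUserTaskList-example'))  # ('user_id', False)
print(guess_task_column('datasetClusterTasks-example'))  # ('Big5GenderKMeansCluster', True)
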
    def __init__(self,
                 dataset_name,
                 target_label=None,
                 trial_name=None,
                 multilabel=False,
                 multitask=False,
                 print_per_task=False,
                 test_steps=9001,
                 results_path=DEFAULT_RESULTS_PATH,
                 datasets_path=DEFAULT_DATASETS_PATH,
                 figures_path=DEFAULT_FIGURES_PATH,
                 val_output_file=None,
                 val_type=DEFAULT_VAL_TYPE,
                 cont=False,
                 architectures=None,
                 test_csv_filename=None):
        assert not (multilabel and multitask)

        self.multilabel = multilabel
        self.multitask = multitask
        self.results_path = results_path
        self.figures_path = figures_path
        self.datasets_path = datasets_path
        self.dataset_name = dataset_name
        self.test_steps = test_steps
        self.val_type = val_type
        self.cont = cont
        self.print_per_task = print_per_task
        if test_csv_filename is not None:
            self.test_csv_filename = self.datasets_path + test_csv_filename
        else:
            self.test_csv_filename = None
        if cont:
            replace = True
        else:
            replace = False
        if trial_name is None and target_label is not None:
            trial_name = helper.getFriendlyLabelName(target_label)
        self.trial_name = trial_name
        self.val_output_prefix = self.getValOutputName(val_output_file,
                                                       dataset_name,
                                                       trial_name,
                                                       replace=replace)

        #dataset stuff
        if multitask:
            train_tasks = pickle.load(
                open(self.datasets_path + dataset_name + "Train.p", "rb"))
            val_tasks = pickle.load(
                open(self.datasets_path + dataset_name + "Val.p", "rb"))
            test_tasks = pickle.load(
                open(self.datasets_path + dataset_name + "Test.p", "rb"))

            self.net = mtltf.TensorFlowNetworkMTL(
                train_tasks,
                val_tasks,
                test_tasks,
                verbose=False,
                val_type=self.val_type,
                print_per_task=print_per_task)
            self.wanted_labels = self.net.optimize_labels
        else:
            self.data_df = pd.DataFrame.from_csv(self.datasets_path +
                                                 self.dataset_name)
            self.wanted_feats = [
                x for x in self.data_df.columns.values if x != 'user_id'
                and x != 'timestamp' and x != 'dataset' and '_Label' not in x
            ]
            if self.multilabel:
                self.wanted_labels = [
                    x for x in self.data_df.columns.values
                    if '_Label' in x and 'tomorrow_' in x and 'Evening' in x
                    and 'Alertness' not in x and 'Energy' not in x
                ]
                self.optimize_labels = [
                    x for x in self.wanted_labels
                    if 'tomorrow_' in x and 'Evening_' in x
                ]
            else:
                self.wanted_labels = [target_label]

            #actual network
            self.net = tfnet.TensorFlowNetwork(
                self.data_df,
                self.wanted_feats,
                self.wanted_labels,
                optimize_labels=self.wanted_labels,
                multilabel=self.multilabel,
                verbose=False,
                val_type=self.val_type)

        #parameters that can be tuned:
        self.l2_regularizers = [1e-2, 1e-4]
        self.dropout = [True, False]
        self.decay = [True]
        self.decay_steps = [1000]
        self.decay_rates = [0.95]
        self.optimizers = [
            tf.train.AdamOptimizer
        ]  #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer]
        self.train_steps = [5001]
        if multitask:
            self.batch_sizes = [20]
            self.learning_rates = [.01, .001, .0001]
            self.architectures = [[500, 50], [300, 20, 10]
                                  ] if architectures is None else architectures
        else:
            self.batch_sizes = [50, 75]
            self.learning_rates = [.01, .001, .0001]
            self.architectures = [[1024, 256], [500, 50], [1024]
                                  ] if architectures is None else architectures

        #storing the results
        self.time_sum = 0
        if cont:
            self.val_results_df = pd.DataFrame.from_csv(
                self.results_path + self.val_output_prefix + '.csv')
            print '\nPrevious validation results df loaded. It has', len(
                self.val_results_df), "rows"
            self.started_from = len(self.val_results_df)
        else:
            self.val_results_df = pd.DataFrame()
            self.started_from = 0
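
# The lists at the end of __init__ define a hyperparameter grid, and run()
# reports num_settings * n_tasks as the total number of networks to train. How
# the wrapper enumerates the grid is not shown here; a minimal sketch with
# itertools.product over the single-task defaults above:
import itertools

l2_regularizers = [1e-2, 1e-4]
dropout = [True, False]
batch_sizes = [50, 75]
learning_rates = [.01, .001, .0001]
architectures = [[1024, 256], [500, 50], [1024]]

grid = list(itertools.product(l2_regularizers, dropout, batch_sizes,
                              learning_rates, architectures))
print(len(grid))  # 2 * 2 * 2 * 3 * 3 = 72 candidate settings

for l2_beta, use_dropout, bsize, lrate, layers in grid[:2]:
    print((l2_beta, use_dropout, bsize, lrate, layers))
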
    def testOneSetting(self, hidden_layers, l2_beta, lrate, dropout, decay,
                       dsteps, drate, bsize, opt, tsteps, num_settings):
        print "Testing setting with layers", hidden_layers, "beta", l2_beta, "lrate", lrate, "dropout", dropout, "decay", decay, "dsteps", dsteps, "drate", drate, "bsize", bsize, "opt", opt, "tsteps", tsteps
        if self.cont:
            if self.settingAlreadyDone(hidden_layers, l2_beta, lrate, dropout,
                                       decay, dsteps, drate, bsize, opt,
                                       tsteps):
                return

        t0 = time()
        self.net.setParams(l2_beta=l2_beta,
                           initial_learning_rate=lrate,
                           decay=decay,
                           decay_steps=dsteps,
                           decay_rate=drate,
                           batch_size=bsize,
                           optimizer=opt,
                           n_steps=tsteps,
                           dropout=dropout)
        self.constructNetwork(hidden_layers)
        if self.val_type == 'cross':
            acc, auc, f1, precision, recall = self.net.trainAndCrossValidate()
        else:
            acc, auc, f1, precision, recall = self.net.trainAndValidate()

        results_dict = {
            'hidden_layers': hidden_layers,
            'l2_beta': l2_beta,
            'learning_rate': lrate,
            'dropout': dropout,
            'decay': decay,
            'decay_steps': dsteps,
            'decay_rate': drate,
            'batch_size': bsize,
            'optimizer': opt,
            'val_acc': acc,
            'val_auc': auc,
            'val_f1': f1,
            'val_precision': precision,
            'val_recall': recall
        }
        if self.multitask:
            results_dict['train_nan_percent'] = self.net.train_nan_percent[-1]
            results_dict['val_nan_percent'] = self.net.val_nan_percent[-1]

        if self.multilabel or self.print_per_task:
            for label in self.wanted_labels:
                friendly_label = helper.getFriendlyLabelName(label)
                results_dict[friendly_label +
                             '_acc'] = self.net.training_val_results_per_task[
                                 'acc'][label][-1]
                results_dict[friendly_label +
                             '_auc'] = self.net.training_val_results_per_task[
                                 'auc'][label][-1]
                results_dict[friendly_label +
                             '_f1'] = self.net.training_val_results_per_task[
                                 'f1'][label][-1]
                results_dict[
                    friendly_label +
                    '_precision'] = self.net.training_val_results_per_task[
                        'precision'][label][-1]
                results_dict[
                    friendly_label +
                    '_recall'] = self.net.training_val_results_per_task[
                        'recall'][label][-1]
        self.val_results_df = self.val_results_df.append(results_dict,
                                                         ignore_index=True)

        print self.val_results_df.tail(n=1)
        t1 = time()
        this_time = t1 - t0
        print "It took", this_time, "seconds to obtain this result"

        self.time_sum = self.time_sum + this_time

        self.printTimeEstimate(
            len(self.val_results_df) - self.started_from, num_settings)
        sys.stdout.flush()

        #output the file every few iterations for safekeeping
        if len(self.val_results_df) % OUTPUT_EVERY_NTH == 0:
            self.val_results_df.to_csv(self.results_path +
                                       self.val_output_prefix + '.csv')
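
# testOneSetting appends one row of results per setting and rewrites the CSV
# every OUTPUT_EVERY_NTH rows as a checkpoint. The same pattern in isolation;
# pd.concat is used here because DataFrame.append was removed in recent pandas,
# and the constant, path, and result values are placeholders:
import pandas as pd

OUTPUT_EVERY_NTH = 3
results_df = pd.DataFrame()

for step in range(7):
    row = {'setting': step, 'val_acc': 0.5 + 0.01 * step}  # placeholder result
    results_df = pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)

    # Periodically flush the accumulated results to disk for safekeeping.
    if len(results_df) % OUTPUT_EVERY_NTH == 0:
        results_df.to_csv('validation_results_checkpoint.csv')
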
def getUserTaskListFromDataset(datafile,
                               target_label,
                               suppress_output=False,
                               group_on='user_id',
                               subdivide_phys=False):
    """Partitions a .csv file into a task-dict-list pickle file by separating
	different individuals (users) into the different tasks."""
    df = pd.DataFrame.from_csv(datafile)
    wanted_feats = [
        x for x in df.columns.values if x != 'user_id' and x != 'timestamp'
        and x != 'dataset' and x != 'classifier_friendly_ppt_id'
        and 'Cluster' not in x and '_Label' not in x
    ]

    df = helper.normalizeAndFillDataDf(df,
                                       wanted_feats, [target_label],
                                       suppress_output=True)
    df = df.reindex(np.random.permutation(df.index))

    dataset_name, datapath = getDatasetCoreNameAndPath(datafile)
    label_name = helper.getFriendlyLabelName(target_label)

    modality_dict = getModalityDict(wanted_feats,
                                    subdivide_phys=subdivide_phys)

    train_task_dict_list = []
    val_task_dict_list = []
    test_task_dict_list = []
    for user in df[group_on].unique():
        if not suppress_output:
            print("Processing task", user)
        mini_df = df[df[group_on] == user]

        train_task_dict_list.append(
            constructTaskDict(user, mini_df, wanted_feats, target_label,
                              modality_dict, 'Train'))
        val_task_dict_list.append(
            constructTaskDict(user, mini_df, wanted_feats, target_label,
                              modality_dict, 'Val'))
        test_task_dict_list.append(
            constructTaskDict(user, mini_df, wanted_feats, target_label,
                              modality_dict, 'Test'))

    if group_on == 'user_id':
        dataset_prefix = "datasetUserTaskList-"
    elif group_on == 'Cluster':
        dataset_prefix = 'datasetClusterTasks-'
    else:
        dataset_prefix = group_on
    pickle.dump(
        train_task_dict_list,
        open(
            datapath + dataset_prefix + dataset_name + "-" + label_name +
            "_Train.p", "wb"))
    pickle.dump(
        val_task_dict_list,
        open(
            datapath + dataset_prefix + dataset_name + "-" + label_name +
            "_Val.p", "wb"))
    pickle.dump(
        test_task_dict_list,
        open(
            datapath + dataset_prefix + dataset_name + "-" + label_name +
            "_Test.p", "wb"))

    return dataset_prefix + dataset_name + "-" + label_name
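
# getUserTaskListFromDataset pickles three lists of per-user task dicts. Judging
# from how the rest of these wrappers index tasks, each dict carries at least
# 'Name', 'X', and 'Y'. A minimal sketch of loading one of the generated files
# back and inspecting it (the file name below is illustrative):
import pickle

with open("datasetUserTaskList-example-happiness_Train.p", "rb") as f:
    train_task_dict_list = pickle.load(f)

for task in train_task_dict_list[:3]:
    print((task['Name'], task['X'].shape, len(task['Y'])))
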
    def getFinalResultsAndSave(self, setting_dict):
        if self.val_type == 'cross':
            print "\nPlotting cross-validation results for best settings..."
            self.getCrossValidationResults(dict(),
                                           setting_dict['tau10'],
                                           setting_dict['tau20'],
                                           setting_dict['sigma_multiplier'],
                                           setting_dict['mu_multiplier'],
                                           save_plots=True)

        print "\nRetraining on training data with the best settings..."
        self.initializeHBLRModel(self.train_tasks)
        self.classifier.verbose = True
        self.setClassifierToSetting(setting_dict['tau10'],
                                    setting_dict['tau20'],
                                    setting_dict['sigma_multiplier'],
                                    setting_dict['mu_multiplier'])
        self.classifier.trainUntilConverged()

        print "\nPlotting and saving cool stuff about the final model..."
        self.saveImagePlot(self.classifier.phi, 'Phi')
        pd.DataFrame(self.classifier.phi).to_csv(self.results_path +
                                                 self.save_prefix + "-phi.csv")
        self.saveConvergencePlots()

        print "\nEvaluating results on held-out test set!! ..."
        all_preds = []
        all_true_y = []
        all_X_data = []
        per_task_accs = [np.nan] * self.n_tasks
        per_task_aucs = [np.nan] * self.n_tasks
        per_task_f1 = [np.nan] * self.n_tasks
        per_task_precision = [np.nan] * self.n_tasks
        per_task_recall = [np.nan] * self.n_tasks
        for t in range(self.n_tasks):
            preds = self.classifier.predictBinary(self.test_tasks[t]['X'], t)
            true_y = list(self.test_tasks[t]['Y'].flatten())

            if len(preds) == 0 or len(true_y) == 0:
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)
            all_X_data.extend(self.test_tasks[t]['X'])

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs[t] = t_acc
            per_task_aucs[t] = t_auc
            per_task_f1[t] = t_f1
            per_task_precision[t] = t_precision
            per_task_recall[t] = t_recall

        print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
        acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
            all_preds, all_true_y)
        print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

        print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = self.test_tasks[t]['Name']
                task_name = helper.getFriendlyLabelName(task_name)
                print "\t\t", task_name, "- Acc:", per_task_accs[
                    t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[
                        t], 'Precision:', per_task_precision[
                            t], 'Recall:', per_task_recall[t]

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if self.users_as_tasks:
                task_column = 'user_id'
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.classifier.predictBinary,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=False)
            else:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.classifier.predictBinary,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"

        print "\t SAVING CLASSIFIER"
        with open(
                self.results_path + "PickledModel-" + self.save_prefix + '.p',
                "w") as f:
            pickle.dump(self.classifier, f)
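
# The classifier pickled above can be restored later without retraining. A
# matching sketch for loading it back; results_path and save_prefix are
# placeholders for whatever was used at training time:
import pickle

results_path = "./results/"  # placeholder
save_prefix = "MTL-example"  # placeholder

with open(results_path + "PickledModel-" + save_prefix + '.p', "rb") as f:
    classifier = pickle.load(f)
# The restored object exposes the same interface used above, e.g. predictBinary.
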
    def getCrossValidationResults(self,
                                  results_dict,
                                  tau10,
                                  tau20,
                                  sigma_mult,
                                  mu_mult,
                                  save_plots=False,
                                  print_per_fold=False):
        if save_plots:
            same_task_matrix = np.zeros((self.n_tasks, self.n_tasks))

        clusters = [0] * self.num_cross_folds

        all_acc = []
        all_auc = []
        all_f1 = []
        all_precision = []
        all_recall = []
        if not self.users_as_tasks:
            per_task_accs = [[] for i in range(self.n_tasks)]
            per_task_aucs = [[] for i in range(self.n_tasks)]
            per_task_f1 = [[] for i in range(self.n_tasks)]
            per_task_precision = [[] for i in range(self.n_tasks)]
            per_task_recall = [[] for i in range(self.n_tasks)]

        for f in range(self.num_cross_folds):
            train_tasks, val_tasks = helper.loadCrossValData(
                self.datasets_path, self.file_prefix, f, reshape=True)

            self.initializeHBLRModel(train_tasks)
            self.setClassifierToSetting(tau10, tau20, sigma_mult, mu_mult)
            self.classifier.trainUntilConverged()

            clusters[f] = self.classifier.K

            if save_plots:
                same_task_matrix = self.updateSameTaskMatrix(same_task_matrix)

            # Get results!
            fold_preds = []
            fold_true_y = []
            for t in range(self.n_tasks):
                preds = self.classifier.predictBinary(val_tasks[t]['X'], t)
                true_y = list(val_tasks[t]['Y'].flatten())

                if len(preds) == 0 or len(true_y) == 0:
                    continue

                if not self.users_as_tasks:
                    # save the per-task results
                    t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                        preds, true_y)
                    per_task_accs[t].append(t_acc)
                    per_task_aucs[t].append(t_auc)
                    per_task_f1[t].append(t_f1)
                    per_task_precision[t].append(t_precision)
                    per_task_recall[t].append(t_recall)
                    if print_per_fold:
                        print "Fold", f, "Task", val_tasks[t][
                            'Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision", t_precision, "recall", t_recall

                fold_preds.extend(preds)
                fold_true_y.extend(true_y)

            acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
                fold_preds, fold_true_y)
            all_acc.append(acc)
            all_auc.append(auc)
            all_f1.append(f1)
            all_precision.append(precision)
            all_recall.append(recall)
            if print_per_fold:
                print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision", precision, "recall", recall

        print "accs for all folds", all_acc
        print "aucs for all folds", all_auc
        print "clusters for all folds", clusters

        if save_plots:
            self.plotAccuracyAucAndClusters(all_acc, all_auc, clusters)
            self.saveHintonPlot(same_task_matrix, self.num_cross_folds)
            pd.DataFrame(same_task_matrix).to_csv(self.results_path +
                                                  self.save_prefix +
                                                  "-same_task_matrix.csv")

        # Add results to the dictionary
        results_dict['val_acc'] = np.nanmean(all_acc)
        results_dict['val_auc'] = np.nanmean(all_auc)
        results_dict['val_f1'] = np.nanmean(all_f1)
        results_dict['val_precision'] = np.nanmean(all_precision)
        results_dict['val_recall'] = np.nanmean(all_recall)
        results_dict['num_clusters'] = np.nanmean(clusters)

        # Add per-task results to the dictionary
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = val_tasks[t]['Name']
                results_dict[
                    'TaskAcc-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_accs[t])
                results_dict[
                    'TaskAuc-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_aucs[t])
                results_dict[
                    'TaskF1-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_f1[t])
                results_dict[
                    'TaskPrecision-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_precision[t])
                results_dict[
                    'TaskRecall-' +
                    helper.getFriendlyLabelName(task_name)] = np.nanmean(
                        per_task_recall[t])

        return results_dict
    def get_final_results(self, optimize_for='val_acc'):
        if self.users_as_tasks and not self.check_test:
            print "check_test is set to false, Will not evaluate performance on held-out test set."
            return
        print "\nAbout to evaluate results on held-out test set!!"
        print "Will use the settings that produced the best", optimize_for

        all_preds = []
        all_true_y = []
        per_task_accs = []
        per_task_aucs = []
        per_task_f1 = []
        per_task_precision = []
        per_task_recall = []

        for t in range(self.n_tasks):
            task_settings = self.find_best_setting_for_task(
                t, optimize_for=optimize_for)
            assert (task_settings['task_num'] == t)
            if not self.users_as_tasks:
                print "\nBEST SETTING FOR TASK", t, "-", task_settings[
                    'task_name']
                print "The highest", optimize_for, "of", task_settings[
                    optimize_for], "was found with the following settings:"
                print task_settings

            task_settings = self.convert_param_dict_for_use(task_settings)
            preds, true_y = self.get_preds_true_for_task(
                self.train_tasks, self.test_tasks, task_settings)
            if preds is None or true_y is None:
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs.append(t_acc)
            per_task_aucs.append(t_auc)
            per_task_f1.append(t_f1)
            per_task_precision.append(t_precision)
            per_task_recall.append(t_recall)

            if not self.users_as_tasks:
                print "\nFINAL TEST RESULTS FOR", helper.getFriendlyLabelName(
                    self.train_tasks[t]['Name'])
                print 'Acc:', t_acc, 'AUC:', t_auc, 'F1:', t_f1, 'Precision:', t_precision, 'Recall:', t_recall

        print "\nHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print 'Acc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if self.users_as_tasks:
                task_column = 'user_id'
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.predict_task,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=False)
            else:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.predict_task,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"