def load_data(self):
     """Reload the pickled Test and Train task lists and refresh n_tasks.

     Reads both splits via helper.loadPickledTaskList (with fix_y=True)
     and sets self.test_tasks / self.train_tasks accordingly.
     """
     for split in ("Test", "Train"):
         tasks = helper.loadPickledTaskList(self.datasets_path,
                                            self.file_prefix,
                                            split,
                                            fix_y=True)
         setattr(self, split.lower() + "_tasks", tasks)
     # NOTE(review): n_tasks is counted from the TRAIN split here, but the
     # __init__ elsewhere in this file counts it from test_tasks — confirm
     # which is intended.
     self.n_tasks = len(self.train_tasks)
    def __init__(self,
                 file_prefix,
                 users_as_tasks,
                 user_clusters=True,
                 eta_filename=None,
                 regularizers=REGULARIZERS,
                 tolerance=.0001,
                 max_iter=100,
                 val_type=VALIDATION_TYPE,
                 c_vals=C_VALS,
                 beta_vals=B_VALS,
                 v_vals=V_VALS,
                 kernels=KERNELS,
                 print_iters=False,
                 optimize_labels=None,
                 cont=False,
                 test_run=False,
                 results_path=DEFAULT_RESULTS_PATH,
                 figures_path=DEFAULT_FIGURES_PATH,
                 datasets_path=DEFAULT_DATASETS_PATH,
                 etas_path=DEFAULT_ETAS_PATH,
                 num_cross_folds=DEFAULT_NUM_CROSS_FOLDS,
                 drop20=False,
                 test_csv_filename=None):
        """Initialize an MTMKL experiment wrapper.

        Loads the pickled Train/Test task lists named by file_prefix from
        datasets_path (plus a Val list unless val_type == 'cross'), records
        the hyperparameter grids to sweep (c_vals, beta_vals, v_vals,
        kernels, regularizers), and either resumes a previous run from its
        validation-results csv (cont=True) or starts a fresh dataframe.

        Args:
            file_prefix: Prefix identifying the pickled task-list files.
            users_as_tasks: Stored on self; presumably each user becomes
                its own task — confirm at call sites.
            user_clusters: Stored as self.cluster_users.
            eta_filename: NOTE(review): never stored or used in this
                constructor — confirm it is consumed elsewhere.
            regularizers, c_vals, beta_vals, v_vals, kernels: Grids of
                settings searched during validation.
            tolerance, max_iter, print_iters: Optimizer controls passed
                through to the model.
            val_type: If 'cross', cross-validation fold files are
                generated instead of loading a fixed Val split.
            optimize_labels: Label columns to optimize for; defaults to
                the three tomorrow_Group_*_Evening_Label names below.
            cont: Resume from a previously saved validation results csv.
            test_run: Shrink all grids to a single cheap setting.
            results_path, figures_path, datasets_path, etas_path: I/O
                directories.
            num_cross_folds: Fold count when val_type == 'cross'.
            drop20: Stored on self; semantics not visible here — confirm.
            test_csv_filename: Optional csv (inside datasets_path) for
                final test output; None disables it.
        """
        self.results_path = results_path
        self.figures_path = figures_path
        self.datasets_path = datasets_path
        self.etas_path = etas_path
        self.file_prefix = file_prefix
        self.cont = cont
        self.val_type = val_type
        self.users_as_tasks = users_as_tasks
        self.cluster_users = user_clusters
        self.drop20 = drop20
        # The test csv is looked up relative to the datasets directory.
        if test_csv_filename is not None:
            self.test_csv_filename = self.datasets_path + test_csv_filename
        else:
            self.test_csv_filename = None
        # replace=cont: presumably reuses/overwrites the prior run's prefix
        # when resuming — confirm in getSavePrefix.
        self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)

        self.test_tasks = helper.loadPickledTaskList(datasets_path,
                                                     file_prefix,
                                                     "Test",
                                                     fix_y=True)
        self.train_tasks = helper.loadPickledTaskList(datasets_path,
                                                      file_prefix,
                                                      "Train",
                                                      fix_y=True)
        # A fixed Val split only exists when not doing cross-validation.
        if self.val_type != 'cross':
            self.val_tasks = helper.loadPickledTaskList(datasets_path,
                                                        file_prefix,
                                                        "Val",
                                                        fix_y=True)

        # print dataset sizes
        print "Num train points:", sum([len(t['Y']) for t in self.train_tasks])
        if self.val_type != 'cross':
            print "Num val points:", sum([len(t['Y']) for t in self.val_tasks])
        print "Num test points:", sum([len(t['Y']) for t in self.test_tasks])

        # With a fixed split the model can be built up front; under
        # cross-validation the classifier is constructed later (per fold).
        if self.val_type != 'cross':
            self.initializeMTMKLModel(self.train_tasks)
        else:
            self.classifier = None

        self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
        # NOTE(review): n_tasks is counted from test_tasks here, but
        # load_data() elsewhere in this file counts train_tasks — confirm
        # which is intended.
        self.n_tasks = len(self.test_tasks)

        if optimize_labels is None:
            self.optimize_labels = [
                'tomorrow_Group_Happiness_Evening_Label',
                'tomorrow_Group_Health_Evening_Label',
                'tomorrow_Group_Calmness_Evening_Label'
            ]
        else:
            self.optimize_labels = optimize_labels

        # Hyperparameter grids swept during validation.
        self.c_vals = c_vals
        self.v_vals = v_vals
        self.kernels = kernels
        self.beta_vals = beta_vals
        self.regularizers = regularizers

        self.tolerance = tolerance
        self.max_iter = max_iter
        self.print_iters = print_iters

        # Collapse every grid to one cheap setting for a smoke-test run.
        if test_run:
            print "This is only a testing run. Using cheap settings to make it faster"
            self.c_vals = [100]
            self.beta_vals = [.01]
            self.kernels = ['linear']
            self.v_vals = [1.0]
            self.regularizers = ['L1']
            self.max_iter = 1

        self.calcNumSettingsDesired()

        #storing the results
        self.time_sum = 0
        if cont:
            # Resume: reload the partially-filled validation results so the
            # sweep can continue where it left off.
            # NOTE(review): DataFrame.from_csv is deprecated/removed in
            # newer pandas (use pd.read_csv with index_col=0) — fine for
            # the pandas version this py2 code is pinned to.
            self.val_results_df = pd.DataFrame.from_csv(self.results_path +
                                                        self.save_prefix +
                                                        '.csv')
            print '\nPrevious validation results df loaded. It has', len(
                self.val_results_df), "rows"
            self.started_from = len(self.val_results_df)
        else:
            self.val_results_df = pd.DataFrame()
            self.started_from = 0

        self.num_cross_folds = num_cross_folds
        # Pre-generate the per-fold pickle files used by cross-validation.
        if self.val_type == 'cross':
            helper.generateCrossValPickleFiles(self.datasets_path,
                                               self.file_prefix,
                                               self.num_cross_folds)
    def __init__(self,
                 file_prefix,
                 users_as_tasks=False,
                 num_cross_folds=DEFAULT_NUM_CROSS_FOLDS,
                 cont=False,
                 results_path=DEFAULT_RESULTS_PATH,
                 figures_path=DEFAULT_FIGURES_PATH,
                 datasets_path=DEFAULT_DATASETS_PATH,
                 test_run=False,
                 max_iters=DEFAULT_MAX_ITERS,
                 val_type=DEFAULT_VALIDATION_TYPE,
                 optimize_labels=None,
                 test_csv_filename=None):
        """Initialize an HBLR experiment wrapper.

        Loads the pickled Train/Test task lists named by file_prefix from
        datasets_path (plus a Val list unless val_type == 'cross'),
        records the HBLR hyperparameter grids (tau10s, tau20s,
        sigma_multipliers, mu_multipliers), and either resumes a previous
        run from its validation-results csv (cont=True) or starts fresh.

        Args:
            file_prefix: Prefix identifying the pickled task-list files.
            users_as_tasks: If True, K is capped at 25 instead of using
                one entry per test task.
            num_cross_folds: Fold count when val_type == 'cross'.
            cont: Resume from a previously saved validation results csv.
            results_path, figures_path, datasets_path: I/O directories.
            test_run: Shrink all grids/iterations to cheap settings.
            max_iters: Maximum training iterations for the model.
            val_type: If 'cross', cross-validation fold files are
                generated instead of loading a fixed Val split.
            optimize_labels: Label columns to optimize for; defaults to
                the three tomorrow_Group_*_Evening_Label names below.
            test_csv_filename: Optional csv (inside datasets_path) for
                final test output; None disables it.
        """
        self.results_path = results_path
        self.figures_path = figures_path
        self.datasets_path = datasets_path
        # replace=cont: presumably reuses/overwrites the prior run's prefix
        # when resuming — confirm in getSavePrefix.
        self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)
        self.cont = cont
        self.max_iters = max_iters
        self.val_type = val_type
        self.users_as_tasks = users_as_tasks
        self.file_prefix = file_prefix
        # The test csv is looked up relative to the datasets directory.
        if test_csv_filename is not None:
            self.test_csv_filename = self.datasets_path + test_csv_filename
        else:
            self.test_csv_filename = None
        # NOTE(review): unlike the other constructor in this file, these
        # loads omit fix_y=True — confirm that is intentional for HBLR.
        self.test_tasks = helper.loadPickledTaskList(datasets_path,
                                                     file_prefix, "Test")
        self.train_tasks = helper.loadPickledTaskList(datasets_path,
                                                      file_prefix, "Train")
        # With a fixed split the model can be built up front; under
        # cross-validation the classifier is constructed later (per fold).
        if self.val_type != 'cross':
            self.val_tasks = helper.loadPickledTaskList(
                datasets_path, file_prefix, "Val")
            self.initializeHBLRModel(self.train_tasks)
        else:
            self.classifier = None

        # K: capped at 25 when users are tasks, otherwise one per test
        # task — presumably the number of task clusters; confirm against
        # the HBLR model's usage of K.
        if users_as_tasks:
            self.K = 25
        else:
            self.K = len(self.test_tasks)
        self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
        self.n_tasks = len(self.test_tasks)

        if optimize_labels is None:
            self.optimize_labels = [
                'tomorrow_Group_Happiness_Evening_Label',
                'tomorrow_Group_Health_Evening_Label',
                'tomorrow_Group_Calmness_Evening_Label'
            ]
        else:
            self.optimize_labels = optimize_labels

        #parameters that can be tuned
        # (hyperparameter grids swept during validation)
        self.tau10s = [10, 1, 0.05, 0.01]
        self.tau20s = [1.0, 0.05, 0.01]
        self.sigma_multipliers = [.01, 0.1, 1]
        self.mu_multipliers = [0.0]

        # Collapse every grid to one cheap setting for a smoke-test run.
        if test_run:
            print "This is only a testing run. Using cheap settings to make it faster"
            self.K = 2
            self.max_iters = 5
            self.n_tasks = 2
            self.tau10s = [1]
            self.tau20s = [.1]
            self.sigma_multipliers = [.01]
            self.mu_multipliers = [0]

        self.calcNumSettingsDesired()

        #storing the results
        self.time_sum = 0
        if cont:
            # Resume: reload the partially-filled validation results so the
            # sweep can continue where it left off.
            # NOTE(review): DataFrame.from_csv is deprecated/removed in
            # newer pandas (use pd.read_csv with index_col=0) — fine for
            # the pandas version this py2 code is pinned to.
            self.val_results_df = pd.DataFrame.from_csv(self.results_path +
                                                        self.save_prefix +
                                                        '.csv')
            print '\nPrevious validation results df loaded. It has', len(
                self.val_results_df), "rows"
            self.started_from = len(self.val_results_df)
        else:
            self.val_results_df = pd.DataFrame()
            self.started_from = 0

        self.num_cross_folds = num_cross_folds
        # Pre-generate the per-fold pickle files used by cross-validation.
        if self.val_type == 'cross':
            helper.generateCrossValPickleFiles(self.datasets_path,
                                               self.file_prefix,
                                               self.num_cross_folds)