def load_data(self):
    """Reload the pickled Train/Test task lists from disk.

    Refreshes self.test_tasks and self.train_tasks from the pickle files
    under self.datasets_path, and resets self.n_tasks.
    """
    path = self.datasets_path
    prefix = self.file_prefix
    self.test_tasks = helper.loadPickledTaskList(path, prefix, "Test", fix_y=True)
    self.train_tasks = helper.loadPickledTaskList(path, prefix, "Train", fix_y=True)
    # NOTE(review): n_tasks is taken from the *train* split here, while the
    # __init__ methods in this file use len(self.test_tasks) -- confirm which
    # count is intended.
    self.n_tasks = len(self.train_tasks)
def __init__(self, file_prefix, users_as_tasks, user_clusters=True,
             eta_filename=None, regularizers=REGULARIZERS, tolerance=.0001,
             max_iter=100, val_type=VALIDATION_TYPE, c_vals=C_VALS,
             beta_vals=B_VALS, v_vals=V_VALS, kernels=KERNELS,
             print_iters=False, optimize_labels=None, cont=False,
             test_run=False, results_path=DEFAULT_RESULTS_PATH,
             figures_path=DEFAULT_FIGURES_PATH,
             datasets_path=DEFAULT_DATASETS_PATH,
             etas_path=DEFAULT_ETAS_PATH,
             num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, drop20=False,
             test_csv_filename=None):
    """Set up an MTMKL hyperparameter-sweep wrapper.

    Loads the pickled Train/Test (and, unless cross-validating, Val) task
    lists, records the hyperparameter grids to sweep (C, beta, V, kernels,
    regularizers), and prepares the validation-results DataFrame --
    optionally resuming from a previously saved CSV when `cont` is True.

    Args:
        file_prefix: Prefix identifying the pickled dataset files.
        users_as_tasks: Whether each user is treated as a separate task.
        user_clusters: Stored as self.cluster_users.
        eta_filename: NOTE(review): accepted but never used in this body --
            confirm whether it should be stored or dropped.
        regularizers, c_vals, beta_vals, v_vals, kernels: Hyperparameter
            grids searched over.
        tolerance, max_iter, print_iters: Optimizer settings passed through.
        val_type: Validation scheme; 'cross' triggers cross-val pickle
            generation and skips loading a Val split.
        optimize_labels: Label columns to optimize; defaults to the three
            tomorrow_Group_*_Evening_Label columns.
        cont: Resume from an existing results CSV.
        test_run: Shrink every grid to one cheap setting for a smoke test.
        results_path, figures_path, datasets_path, etas_path: Output/input
            directories.
        num_cross_folds: Number of folds when val_type == 'cross'.
        drop20: Stored as self.drop20.
        test_csv_filename: Optional test CSV name, resolved relative to
            datasets_path.
    """
    self.results_path = results_path
    self.figures_path = figures_path
    self.datasets_path = datasets_path
    self.etas_path = etas_path
    self.file_prefix = file_prefix
    self.cont = cont
    self.val_type = val_type
    self.users_as_tasks = users_as_tasks
    self.cluster_users = user_clusters
    self.drop20 = drop20
    if test_csv_filename is not None:
        self.test_csv_filename = self.datasets_path + test_csv_filename
    else:
        self.test_csv_filename = None
    self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)

    # Load the pickled task lists; a separate Val split only exists when
    # not doing cross-validation.
    self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix,
                                                 "Test", fix_y=True)
    self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix,
                                                  "Train", fix_y=True)
    if self.val_type != 'cross':
        self.val_tasks = helper.loadPickledTaskList(datasets_path,
                                                    file_prefix, "Val",
                                                    fix_y=True)

    # Print dataset sizes.
    print "Num train points:", sum([len(t['Y']) for t in self.train_tasks])
    if self.val_type != 'cross':
        print "Num val points:", sum([len(t['Y']) for t in self.val_tasks])
    print "Num test points:", sum([len(t['Y']) for t in self.test_tasks])

    # With cross-validation the model is built per-fold later, so no
    # classifier is constructed up front.
    if self.val_type != 'cross':
        self.initializeMTMKLModel(self.train_tasks)
    else:
        self.classifier = None

    self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
    # NOTE(review): task count comes from the test split here, but
    # load_data() uses len(self.train_tasks) -- confirm which is intended.
    self.n_tasks = len(self.test_tasks)

    if optimize_labels is None:
        self.optimize_labels = [
            'tomorrow_Group_Happiness_Evening_Label',
            'tomorrow_Group_Health_Evening_Label',
            'tomorrow_Group_Calmness_Evening_Label'
        ]
    else:
        self.optimize_labels = optimize_labels

    # Hyperparameter grids to sweep.
    self.c_vals = c_vals
    self.v_vals = v_vals
    self.kernels = kernels
    self.beta_vals = beta_vals
    self.regularizers = regularizers
    self.tolerance = tolerance
    self.max_iter = max_iter
    self.print_iters = print_iters

    if test_run:
        # Collapse every grid to a single cheap setting for a smoke test.
        print "This is only a testing run. Using cheap settings to make it faster"
        self.c_vals = [100]
        self.beta_vals = [.01]
        self.kernels = ['linear']
        self.v_vals = [1.0]
        self.regularizers = ['L1']
        self.max_iter = 1

    self.calcNumSettingsDesired()

    # Storing the results; resume from a saved CSV when continuing.
    self.time_sum = 0
    if cont:
        # NOTE(review): DataFrame.from_csv was deprecated and later removed
        # from pandas; fine for the old pandas this code targets, but use
        # pd.read_csv(..., index_col=0) if the environment is upgraded.
        self.val_results_df = pd.DataFrame.from_csv(self.results_path +
                                                    self.save_prefix + '.csv')
        print '\nPrevious validation results df loaded. It has', len(
            self.val_results_df), "rows"
        self.started_from = len(self.val_results_df)
    else:
        self.val_results_df = pd.DataFrame()
        self.started_from = 0

    self.num_cross_folds = num_cross_folds
    if self.val_type == 'cross':
        helper.generateCrossValPickleFiles(self.datasets_path,
                                           self.file_prefix,
                                           self.num_cross_folds)
def __init__(self, file_prefix, users_as_tasks=False,
             num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, cont=False,
             results_path=DEFAULT_RESULTS_PATH,
             figures_path=DEFAULT_FIGURES_PATH,
             datasets_path=DEFAULT_DATASETS_PATH, test_run=False,
             max_iters=DEFAULT_MAX_ITERS, val_type=DEFAULT_VALIDATION_TYPE,
             optimize_labels=None, test_csv_filename=None):
    """Set up an HBLR hyperparameter-sweep wrapper.

    Loads the pickled Train/Test (and, unless cross-validating, Val) task
    lists, chooses the number of clusters K, records the tau/sigma/mu
    hyperparameter grids, and prepares the validation-results DataFrame --
    optionally resuming from a previously saved CSV when `cont` is True.

    Args:
        file_prefix: Prefix identifying the pickled dataset files.
        users_as_tasks: If True, K is capped at 25; otherwise K equals the
            number of test tasks.
        num_cross_folds: Number of folds when val_type == 'cross'.
        cont: Resume from an existing results CSV.
        results_path, figures_path, datasets_path: Output/input directories.
        test_run: Shrink every grid to one cheap setting for a smoke test.
        max_iters: Maximum training iterations, stored as self.max_iters.
        val_type: Validation scheme; 'cross' triggers cross-val pickle
            generation and skips loading a Val split.
        optimize_labels: Label columns to optimize; defaults to the three
            tomorrow_Group_*_Evening_Label columns.
        test_csv_filename: Optional test CSV name, resolved relative to
            datasets_path.
    """
    self.results_path = results_path
    self.figures_path = figures_path
    self.datasets_path = datasets_path
    self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)
    self.cont = cont
    self.max_iters = max_iters
    self.val_type = val_type
    self.users_as_tasks = users_as_tasks
    self.file_prefix = file_prefix
    if test_csv_filename is not None:
        self.test_csv_filename = self.datasets_path + test_csv_filename
    else:
        self.test_csv_filename = None

    # Load the pickled task lists; a separate Val split (and an up-front
    # model) only exists when not doing cross-validation.
    self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix,
                                                 "Test")
    self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix,
                                                  "Train")
    if self.val_type != 'cross':
        self.val_tasks = helper.loadPickledTaskList(
            datasets_path, file_prefix, "Val")
        self.initializeHBLRModel(self.train_tasks)
    else:
        self.classifier = None

    # Number of clusters: capped at 25 in the users-as-tasks setting,
    # otherwise one per test task.
    if users_as_tasks:
        self.K = 25
    else:
        self.K = len(self.test_tasks)

    self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
    self.n_tasks = len(self.test_tasks)

    if optimize_labels is None:
        self.optimize_labels = [
            'tomorrow_Group_Happiness_Evening_Label',
            'tomorrow_Group_Health_Evening_Label',
            'tomorrow_Group_Calmness_Evening_Label'
        ]
    else:
        self.optimize_labels = optimize_labels

    # Parameters that can be tuned.
    self.tau10s = [10, 1, 0.05, 0.01]
    self.tau20s = [1.0, 0.05, 0.01]
    self.sigma_multipliers = [.01, 0.1, 1]
    self.mu_multipliers = [0.0]

    if test_run:
        # Collapse every grid to a single cheap setting for a smoke test.
        print "This is only a testing run. Using cheap settings to make it faster"
        self.K = 2
        self.max_iters = 5
        self.n_tasks = 2
        self.tau10s = [1]
        self.tau20s = [.1]
        self.sigma_multipliers = [.01]
        self.mu_multipliers = [0]

    self.calcNumSettingsDesired()

    # Storing the results; resume from a saved CSV when continuing.
    self.time_sum = 0
    if cont:
        # NOTE(review): DataFrame.from_csv was deprecated and later removed
        # from pandas; fine for the old pandas this code targets, but use
        # pd.read_csv(..., index_col=0) if the environment is upgraded.
        self.val_results_df = pd.DataFrame.from_csv(self.results_path +
                                                    self.save_prefix + '.csv')
        print '\nPrevious validation results df loaded. It has', len(
            self.val_results_df), "rows"
        self.started_from = len(self.val_results_df)
    else:
        self.val_results_df = pd.DataFrame()
        self.started_from = 0

    self.num_cross_folds = num_cross_folds
    if self.val_type == 'cross':
        helper.generateCrossValPickleFiles(self.datasets_path,
                                           self.file_prefix,
                                           self.num_cross_folds)