def model_param(self):
    print_section('PARAMETERS')
    print("learning rate:", self.nu)
    print("Lambda:", self.Lambda)
    print("maximum iteration:", self.maxiter)
    print("kernel function:", self.kernel)
    print("method:", self.method)
    return
def input_summary(self):
    print_section('SUMMARY')
    print("Analysis type:", self.problem)
    print("input folder:", self.input_folder)
    print("output folder:", self.output_folder)
    print("number of training samples:", self.Ntrain)
    print("number of testing samples:", self.Ntest)
    print("number of pathways:", self.Ngroup)
    print("number of gene predictors:", self.Npred)
    print("number of clinical predictors:", self.Npred_clin)
    return
def proc_input(self):
    """Load pathway, predictor, response, clinical, and weight data."""
    print_section('LOAD DATA')
    # make output folder
    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)
    # training data
    # (pd.Series.from_csv / pd.DataFrame.from_csv were removed in pandas 1.0;
    #  use pd.read_csv with an explicit index column instead)
    thisfile = self.input_folder + "/" + self.group_file
    have_file(thisfile)
    self.pred_sets = pd.read_csv(thisfile, header=None, index_col=0).squeeze("columns")
    thisfile = self.input_folder + "/" + self.train_predictor_file
    have_file(thisfile)
    self.train_predictors = pd.read_csv(thisfile, index_col=0)
    thisfile = self.input_folder + "/" + self.train_response_file
    have_file(thisfile)
    self.train_response = pd.read_csv(thisfile, index_col=0)
    # clinical file
    if self.hasClinical:
        thisfile = self.input_folder + "/" + self.clinical_file
        have_file(thisfile)
        self.train_clinical = pd.read_csv(thisfile, index_col=0)
        self.clin_names = self.train_clinical.columns
    # weights data
    if self.hasWeights:
        thisfile = self.input_folder + "/" + self.weights_file
        have_file(thisfile)
        # (the squeeze= keyword was removed from pd.read_csv in pandas 2.0)
        raw_weights = pd.read_csv(thisfile, header=None, index_col=0).squeeze("columns")
        self.proc_weight(raw_weights)
    # data summary
    self.Ntrain = self.train_predictors.shape[0]
    self.Ngroup = self.pred_sets.shape[0]
    self.Npred = self.train_predictors.shape[1]
    if self.hasClinical:
        self.Npred_clin = self.train_clinical.shape[1]
    self.group_names = self.pred_sets.index
    # change loaded indicator
    self.loaded = True
    return
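# --- Example (illustrative, not part of PKB): the pd.read_csv pattern used in
# proc_input() above. The file contents below are hypothetical; the real pipeline
# reads the files configured in the input folder. A pathway file maps a pathway
# name to a space-separated gene list, loaded into a pandas Series.
import io
import pandas as pd

_demo_groups = io.StringIO("pathway1,TP53 BRCA1 EGFR\npathway2,KRAS MYC\n")
_pred_sets = pd.read_csv(_demo_groups, header=None, index_col=0).squeeze("columns")
print(_pred_sets["pathway1"].split(" "))  # ['TP53', 'BRCA1', 'EGFR']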
def data_preprocessing(self, center=False, norm=False):
    """Center/normalize predictors, prune pathways, and add an intercept column."""
    print_section('PROCESS DATA')
    if not self.loaded:
        print("No data loaded. Cannot preprocess.")
        return
    # center genomic data
    # (sklearn's scale() is not guaranteed to modify a DataFrame in place,
    #  so assign the scaled values back explicitly)
    if center:
        print('Centering data.')
        self.train_predictors.loc[:, :] = scale(self.train_predictors.values,
                                                with_std=False)
    # normalize data
    if norm:
        print("Normalizing data.")
        self.train_predictors.loc[:, :] = scale(self.train_predictors.values,
                                                with_mean=False)
    # check groups: drop pathways sharing no genes with the predictor matrix
    print("Checking groups.")
    to_drop = []
    for i in range(len(self.pred_sets)):
        genes = self.pred_sets.values[i].split(" ")
        shared = np.intersect1d(self.train_predictors.columns.values, genes)
        if len(shared) == 0:
            print("Drop group:", self.pred_sets.index[i])
            to_drop.append(i)
        else:
            # keep only the genes actually present in the predictors
            self.pred_sets.iat[i] = ' '.join(shared)
    if len(to_drop) > 0:
        self.pred_sets = self.pred_sets.drop(self.pred_sets.index[to_drop])
        self.group_names = self.pred_sets.index
    # add intercept column to clinical data
    intercept_col = pd.DataFrame({'intercept': np.ones(self.Ntrain)},
                                 index=self.train_predictors.index)
    if self.hasClinical:
        if self.problem != "survival":
            self.train_clinical = pd.concat([self.train_clinical, intercept_col],
                                            axis=1)
    else:
        self.train_clinical = intercept_col
    # calculate summary
    self.Ngroup = len(self.pred_sets)
    return
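# --- Example (illustrative): how the pathway check above prunes gene sets.
# Toy data; the gene and pathway names are hypothetical. Pathways with no
# overlap with the predictor columns are dropped; the rest are restricted to
# the shared genes.
import numpy as np
import pandas as pd

_columns = np.array(["TP53", "KRAS", "MYC"])
_sets = pd.Series({"pathway1": "TP53 BRCA1", "pathway2": "EGFR"})
for _name in _sets.index:
    _shared = np.intersect1d(_columns, _sets[_name].split(" "))
    print(_name, "->", "dropped" if len(_shared) == 0 else " ".join(_shared))
# pathway1 -> TP53
# pathway2 -> dropped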
def data_split(self):
    """Split off the test samples listed in the test-label file."""
    if not self.hasTest:
        return
    print_section('SPLIT DATA')
    print("Using test label:", self.test_file)
    # load test file (one sample ID per line)
    thisfile = self.input_folder + '/' + self.test_file
    with open(thisfile, 'r') as f:
        test_ind = [x.strip() for x in f]
    # split data
    self.test_predictors = self.train_predictors.loc[test_ind]
    self.test_response = self.train_response.loc[test_ind]
    self.test_clinical = self.train_clinical.loc[test_ind]
    train_ind = np.setdiff1d(self.train_predictors.index.values,
                             np.array(test_ind))
    self.train_predictors = self.train_predictors.loc[train_ind]
    self.train_response = self.train_response.loc[train_ind]
    self.train_clinical = self.train_clinical.loc[train_ind]
    # update summary
    self.Ntest = len(self.test_response)
    self.Ntrain = len(self.train_response)
    return
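# --- Example (illustrative): the label-based split performed above, on toy
# data. The sample IDs here are hypothetical; the real test IDs come from
# self.test_file, one ID per line.
import numpy as np
import pandas as pd

_X = pd.DataFrame({"g1": [0.1, 0.2, 0.3, 0.4]},
                  index=["s1", "s2", "s3", "s4"])
_test_ind = ["s2", "s4"]
_train_ind = np.setdiff1d(_X.index.values, np.array(_test_ind))
print(_X.loc[_test_ind].shape, _X.loc[_train_ind].shape)  # (2, 1) (2, 1)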
def CV_PKB(inputs, K_train, Lambda, nfold=3, ESTOP=50, parallel=False,
           gr_sub=False, plot=False):
    """Select the optimal boosting iteration number by nfold cross-validation."""
    ########## split data ###############
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    if inputs.problem == "classification":
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[0], nfold)
    elif inputs.problem == 'survival':
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[1], nfold)
    elif inputs.problem == "regression":
        test_inds = simple_subsamp(inputs.train_response, nfold)
    folds = []
    for i in range(nfold):
        folds.append([temp[test_inds[i]].values,
                      np.setdiff1d(temp.values, temp[test_inds[i]].values)])

    ########## initiate model for each fold ###############
    Ztrain_ls = [inputs.train_clinical.values[folds[i][1], :] for i in range(nfold)]
    Ztest_ls = [inputs.train_clinical.values[folds[i][0], :] for i in range(nfold)]
    K_train_ls = [K_train[np.ix_(folds[i][1], folds[i][1])] for i in range(nfold)]
    K_test_ls = [K_train[np.ix_(folds[i][1], folds[i][0])] for i in range(nfold)]
    if inputs.problem == "classification":
        ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
                     for i in range(nfold)]
        ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
                    for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i])
                        for i in range(nfold)]
        models = [assist.Classification.PKB_Classification(
                      inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    elif inputs.problem == 'survival':
        ytrain_ls = [inputs.train_response.iloc[folds[i][1]].values
                     for i in range(nfold)]
        ytest_ls = [inputs.train_response.iloc[folds[i][0]].values
                    for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i])
                        for i in range(nfold)]
        models = [assist.Survival.PKB_Survival(
                      inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    elif inputs.problem == "regression":
        ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
                     for i in range(nfold)]
        ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
                    for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i])
                        for i in range(nfold)]
        models = [assist.Regression.PKB_Regression(
                      inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    for x in models:
        x.init_F()

    ########## boosting for each fold ###############
    opt_iter = 0
    min_loss = prev_loss = np.mean([x.test_loss[0] for x in models])
    ave_loss = [prev_loss]
    print_section('Cross-Validation')
    print("{:>9}\t{:>14}\t{:>24}".format("iteration", "Mean test loss",
                                         "time (if no E-stop)"))
    time0 = time.time()
    for t in range(1, inputs.maxiter + 1):
        # one iteration
        for k in range(nfold):
            if inputs.method == 'L2':
                [m, beta, gamma] = oneiter_L2(K_train_ls[k], Ztrain_ls[k],
                                              models[k], Lambda=Lambda,
                                              parallel=parallel,
                                              group_subset=gr_sub)
            if inputs.method == 'L1':
                [m, beta, gamma] = oneiter_L1(K_train_ls[k], Ztrain_ls[k],
                                              models[k], Lambda=Lambda,
                                              parallel=parallel,
                                              group_subset=gr_sub)
            # line search
            x = line_search(K_train_ls[k], Ztrain_ls[k], models[k],
                            [m, beta, gamma])
            beta *= x
            gamma *= x
            # update model
            models[k].update([m, beta, gamma], K_train_ls[k][:, :, m],
                             K_test_ls[k][:, :, m], Ztrain_ls[k], Ztest_ls[k],
                             inputs.nu)
        # save iteration
        cur_loss = np.mean([x.test_loss[-1] for x in models])
        # update best loss
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_loss.append(cur_loss)
        # print report
        if t % 10 == 0:
            iter_persec = t / (time.time() - time0)  # iterations per second
            rem_time = (inputs.maxiter - t) / iter_persec  # remaining time (seconds)
            print("{:9.0f}\t{:14.4f}\t{:24.4f}".format(t, cur_loss, rem_time / 60))
        # detect early stop
        if t - opt_iter >= ESTOP:
            print('Early stop criterion satisfied: break CV.')
            break
    # print the number of iterations used
    print('using iteration number:', opt_iter)
    # visualization
    if plot:
        folder = inputs.output_folder
        f = plt.figure()
        plt.plot(ave_loss)
        plt.xlabel("iterations")
        plt.ylabel("CV loss")
        f.savefig(folder + '/CV_loss.pdf')
    return opt_iter
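# --- Example (illustrative): the np.ix_ indexing that CV_PKB uses to carve
# fold-specific kernel blocks out of the full (N x N x M) kernel array K_train.
# The shapes below are toy values, not from a real run; with two index arrays,
# np.ix_ forms an open mesh over the first two axes and leaves the pathway
# axis intact.
import numpy as np

_N, _M = 6, 2                        # 6 samples, 2 pathway kernels
_K = np.random.rand(_N, _N, _M)
_train_idx = np.array([0, 2, 3, 5])  # rows used for fitting in this fold
_test_idx = np.array([1, 4])         # held-out rows
print(_K[np.ix_(_train_idx, _train_idx)].shape)  # (4, 4, 2)
print(_K[np.ix_(_train_idx, _test_idx)].shape)   # (4, 2, 2)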