def kfold_prediction(self, k=10):
    """Predict class probabilities for every sample with repeated k-fold CV.

    For each of ``self.total_iter`` rounds the data are re-shuffled with
    ``futils.randomize`` and split into ``k`` folds. The classifier is fit
    on the training part of every fold and queried with ``predict_proba``
    on the held-out part. Held-out probabilities are summed per sample
    and, at the end, averaged over the number of rounds.

    Params
    ------
    k : int (10)
        Number of cross-validation folds

    Returns
    -------
    onco_prob, tsg_prob, other_prob : pd.Series
        Averaged probabilities, indexed like ``self.y`` was when the
        method was called. ``other_prob`` is ``1 - (onco_prob + tsg_prob)``.
    """
    self.num_pred = 0  # number of completed cross-validation rounds

    # Float accumulators keyed by sample label. They are created before any
    # shuffling, so the returned order does not depend on the random rounds.
    onco_prob = pd.Series(0.0, index=self.y.index)
    tsg_prob = pd.Series(0.0, index=self.y.index)

    for _ in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # Plain KFold does not stratify by class, so a training fold may
        # contain no member of one of the classes.
        k_fold = KFold(n_splits=k).split(self.y)

        # obtain predictions from a single round of k-fold validation
        for train_ix, test_ix in k_fold:
            # translate row numbers into index labels
            tmp_train_ix = self.x.iloc[train_ix].index
            tmp_test_ix = self.x.iloc[test_ix].index

            x_train = self.x.loc[tmp_train_ix].copy()
            y_train = self.y.loc[tmp_train_ix].copy()

            if self.is_weighted_sample:
                # Every class receives the same total weight: each member
                # is weighted by 1 / (class size in this training fold).
                sample_weight = np.zeros(len(train_ix))
                for label in (self.onco_num, self.tsg_num, self.other_num):
                    members = np.asarray(y_train == label)
                    n_members = int(members.sum())
                    if n_members:  # skip absent classes (avoids 1/0)
                        sample_weight[members] = 1. / n_members
                self.clf.fit(x_train, y_train, sample_weight=sample_weight)
            else:
                # do training without weighting
                self.clf.fit(x_train, y_train)

            # predict the held-out samples and accumulate by label
            tmp_prob = self.clf.predict_proba(self.x.loc[tmp_test_ix])
            onco_prob.loc[tmp_test_ix] += tmp_prob[:, self.onco_num]
            tsg_prob.loc[tmp_test_ix] += tmp_prob[:, self.tsg_num]

        self.num_pred += 1

    # turn the per-round sums into averages
    onco_prob /= self.num_pred
    tsg_prob /= self.num_pred
    other_prob = 1 - (onco_prob + tsg_prob)

    return onco_prob, tsg_prob, other_prob
def kfold_prediction(self, k=10):
    """Predict class probabilities for every sample with repeated k-fold CV.

    Each of the ``self.total_iter`` rounds re-shuffles the data with
    ``futils.randomize``, builds a stratified k-fold split, fits the
    classifier on each training fold and sums the ``predict_proba``
    output of the held-out fold per sample. The sums are averaged over
    the number of rounds before being returned.

    Params
    ------
    k : int (10)
        Number of cross-validation folds

    Returns
    -------
    onco_prob, tsg_prob, other_prob : pd.Series
        Averaged probabilities, indexed like ``self.y`` was when the
        method was called. ``other_prob`` is ``1 - (onco_prob + tsg_prob)``.
    """
    self.num_pred = 0  # number of completed cross-validation rounds

    # per-sample running sums, created with an explicit float dtype
    onco_prob = pd.Series(0.0, index=self.y.index)
    tsg_prob = pd.Series(0.0, index=self.y.index)

    class_labels = (self.onco_num, self.tsg_num, self.other_num)

    for _ in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # set up stratified kfold iterator
        # NOTE(review): this is the legacy scikit-learn call signature;
        # confirm the installed version still provides it.
        k_fold = cross_validation.StratifiedKFold(self.y, n_folds=k)

        # obtain predictions from a single round of k-fold validation
        for train_ix, test_ix in k_fold:
            # translate row numbers into index labels
            tmp_train_ix = self.x.iloc[train_ix].index
            tmp_test_ix = self.x.iloc[test_ix].index

            # All lookups below are by label, hence .loc (the former .ix
            # indexer is no longer available in current pandas).
            train_x = self.x.loc[tmp_train_ix].copy()
            train_y = self.y.loc[tmp_train_ix].copy()

            if self.is_weighted_sample:
                # weight each sample by 1 / (size of its class in the fold)
                sample_weight = np.zeros(len(train_ix))
                for label in class_labels:
                    in_class = np.asarray(train_y == label)
                    class_size = int(in_class.sum())
                    if class_size:  # an absent class would mean 1/0
                        sample_weight[in_class] = 1. / class_size
                # do training with sample weighting
                self.clf.fit(train_x, train_y, sample_weight=sample_weight)
            else:
                # do training without weighting
                self.clf.fit(train_x, train_y)

            # predict test data in kfold validation
            tmp_prob = self.clf.predict_proba(self.x.loc[tmp_test_ix])
            onco_prob.loc[tmp_test_ix] += tmp_prob[:, self.onco_num]
            tsg_prob.loc[tmp_test_ix] += tmp_prob[:, self.tsg_num]

        self.num_pred += 1

    # convert the accumulated sums into per-round averages
    onco_prob /= self.num_pred
    tsg_prob /= self.num_pred
    other_prob = 1 - (onco_prob + tsg_prob)

    return onco_prob, tsg_prob, other_prob
def train_cv(self, k=10):
    """Train the classifier fold by fold in repeated k-fold cross-validation.

    For each of ``self.total_iter`` rounds the data are re-shuffled and
    split into ``k`` folds. The classifier is fit on every training fold,
    and ``self.clf.append_fold_result()`` is called after each fit. After
    a full round ``self.clf.append_cv_result()`` is called. The test fold
    each sample fell into is recorded in ``self.test_fold_df`` (one column
    per round, numbered from 1, fold numbers starting at 1) and handed to
    ``self.clf.set_cv_fold`` at the end.

    Params
    ------
    k : int (10)
        Number of cross-validation folds
    """
    self.num_pred = 0  # number of completed cross-validation rounds

    # one column per round; 0 means "no fold assigned yet"
    self.test_fold_df = pd.DataFrame(
        {round_num: 0 for round_num in range(1, self.total_iter + 1)},
        index=self.x.index)

    for i in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # Plain KFold does not stratify by class, so a training fold may
        # contain no member of one of the classes.
        k_fold = KFold(n_splits=k).split(self.y)

        for nfold, (train_ix, test_ix) in enumerate(k_fold):
            # translate row numbers into index labels
            tmp_train_ix = self.x.iloc[train_ix].index

            # save which samples are in the test fold of this round
            tmp_test_ix = self.x.iloc[test_ix].index
            self.test_fold_df.loc[tmp_test_ix, i + 1] = nfold + 1

            x_train = self.x.loc[tmp_train_ix].copy()
            y_train = self.y.loc[tmp_train_ix].copy()

            if self.is_weighted_sample:
                # Every class receives the same total weight: each member
                # is weighted by 1 / (class size in this training fold).
                sample_weight = np.zeros(len(train_ix))
                for label in (self.onco_num, self.tsg_num, self.other_num):
                    members = np.asarray(y_train == label)
                    n_members = int(members.sum())
                    if n_members:  # skip absent classes (avoids 1/0)
                        sample_weight[members] = 1. / n_members
                # do training with sample weighting
                self.clf.fit(x_train, y_train, sample_weight=sample_weight)
            else:
                # do training without weighting
                self.clf.fit(x_train, y_train)

            # add the training result from each fold
            self.clf.append_fold_result()

        # add the training result for a single CV round
        self.clf.append_cv_result()
        self.num_pred += 1

    self.clf.set_cv_fold(self.test_fold_df)
def train_cv(self, k=10):
    """Train the classifier fold by fold in repeated k-fold cross-validation.

    Runs ``self.total_iter`` rounds. Each round re-shuffles the data,
    builds a stratified k-fold split and fits the classifier on every
    training fold, calling ``self.clf.append_fold_result()`` after each
    fit and ``self.clf.append_cv_result()`` once the round is complete.
    ``self.test_fold_df`` records, per round (columns numbered from 1),
    the 1-based test fold of every sample. It is passed to
    ``self.clf.set_cv_fold`` when all rounds are finished.

    Params
    ------
    k : int (10)
        Number of cross-validation folds
    """
    self.num_pred = 0  # number of completed cross-validation rounds

    # one column per round; 0 means "no fold assigned yet"
    self.test_fold_df = pd.DataFrame(
        {round_num: 0 for round_num in range(1, self.total_iter + 1)},
        index=self.x.index)

    class_labels = (self.onco_num, self.tsg_num, self.other_num)

    for i in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # set up stratified kfold iterator
        # NOTE(review): this is the legacy scikit-learn call signature;
        # confirm the installed version still provides it.
        k_fold = cross_validation.StratifiedKFold(self.y, n_folds=k)

        for nfold, (train_ix, test_ix) in enumerate(k_fold):
            # translate row numbers into index labels
            tmp_train_ix = self.x.iloc[train_ix].index

            # save which samples are in the test fold of this round
            tmp_test_ix = self.x.iloc[test_ix].index
            self.test_fold_df.loc[tmp_test_ix, i + 1] = nfold + 1

            # Label-based lookups, hence .loc (the former .ix indexer is
            # no longer available in current pandas).
            train_x = self.x.loc[tmp_train_ix].copy()
            train_y = self.y.loc[tmp_train_ix].copy()

            if self.is_weighted_sample:
                # weight each sample by 1 / (size of its class in the fold)
                sample_weight = np.zeros(len(train_ix))
                for label in class_labels:
                    in_class = np.asarray(train_y == label)
                    class_size = int(in_class.sum())
                    if class_size:  # an absent class would mean 1/0
                        sample_weight[in_class] = 1. / class_size
                # do training with sample weighting
                self.clf.fit(train_x, train_y, sample_weight=sample_weight)
            else:
                # do training without weighting
                self.clf.fit(train_x, train_y)

            # add the training result from each fold
            self.clf.append_fold_result()

        # add the training result for a single CV round
        self.clf.append_cv_result()
        self.num_pred += 1

    self.clf.set_cv_fold(self.test_fold_df)
def train(self):
    """Fit ``self.clf`` once on the complete, freshly shuffled data set."""
    shuffled = futils.randomize(self.x, self.prng)
    self.x, self.y = shuffled

    # warn user if not 3 classes
    futils.check_num_classes(self.y)

    self.clf.fit(self.x, self.y)
def kfold_validation(self, k=10):
    """Run repeated stratified k-fold cross-validation and record metrics.

    For each of ``self.total_iter`` rounds the data are re-shuffled, the
    classifier is trained on every training fold, and the held-out
    predictions and probabilities are collected for all samples. The
    collected arrays are then passed to ``_update_onco_metrics``,
    ``_update_tsg_metrics`` and ``_update_metrics``. ``_on_finish`` is
    called once after the last round.

    Params
    ------
    k : int (10)
        Number of cross-validation folds
    """
    self.num_pred = 0  # number of completed cross-validation rounds

    for _ in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # initialize predicted results variables (filled by row position)
        num_genes = len(self.y)
        onco_pred = np.zeros(num_genes)
        onco_prob = np.zeros(num_genes)
        tsg_pred = np.zeros(num_genes)
        tsg_prob = np.zeros(num_genes)
        overall_pred = np.zeros(num_genes)

        # set up stratified kfold iterator
        # NOTE(review): this is the legacy scikit-learn call signature;
        # confirm the installed version still provides it.
        k_fold = cross_validation.StratifiedKFold(self.y, n_folds=k)

        # evaluate k-fold cross validation
        for train_ix, test_ix in k_fold:
            # train_ix / test_ix are row positions, so use .iloc everywhere.
            # Plain self.y[train_ix] would be interpreted as labels when
            # the index is integer-valued.
            x_train = self.x.iloc[train_ix].copy()
            y_train = self.y.iloc[train_ix].copy()

            if self.is_weighted_sample:
                # weight each sample by 1 / (size of its class in the fold)
                sample_weight = np.zeros(len(train_ix))
                for label in (self.onco_num, self.tsg_num, self.other_num):
                    members = np.asarray(y_train == label)
                    n_members = int(members.sum())
                    if n_members:  # skip absent classes (avoids 1/0)
                        sample_weight[members] = 1. / n_members
                # do training
                self.clf.fit(x_train, y_train, sample_weight=sample_weight)
            else:
                # do training without sample weights
                self.clf.fit(x_train, y_train)

            # do prediction on the held-out fold
            x_test = self.x.iloc[test_ix]
            y_pred = self.clf.predict(x_test)
            proba_ = self.clf.predict_proba(x_test)

            # update information
            overall_pred[test_ix] = y_pred  # prediction including all classes
            onco_pred[test_ix] = (y_pred == self.onco_num).astype(int)  # predicted oncogenes
            onco_prob[test_ix] = proba_[:, self.onco_num]  # oncogene probability
            tsg_pred[test_ix] = (y_pred == self.tsg_num).astype(int)  # predicted TSGs
            tsg_prob[test_ix] = proba_[:, self.tsg_num]  # TSG probability

        # hand the out-of-fold results of this round to the metric hooks
        true_onco = (self.y == self.onco_num).astype(int)  # true oncogenes
        self._update_onco_metrics(true_onco, onco_pred, onco_prob)
        true_tsg = (self.y == self.tsg_num).astype(int)  # true TSGs
        self._update_tsg_metrics(true_tsg, tsg_pred, tsg_prob)
        self._update_metrics(self.y, overall_pred, onco_prob, tsg_prob)

        self.num_pred += 1

    self._on_finish()  # update info for kfold cross-validation
def kfold_validation(self, k=10):
    """Run repeated stratified k-fold cross-validation and record metrics.

    Each of the ``self.total_iter`` rounds re-shuffles the data, fits the
    classifier on every training fold and stores the held-out class
    predictions and probabilities by row position. After a round the
    arrays are forwarded to ``_update_onco_metrics``,
    ``_update_tsg_metrics`` and ``_update_metrics``. ``_on_finish`` runs
    once all rounds are done.

    Params
    ------
    k : int (10)
        Number of cross-validation folds
    """
    self.num_pred = 0  # number of completed cross-validation rounds
    class_labels = (self.onco_num, self.tsg_num, self.other_num)

    for _ in range(self.total_iter):
        # randomize for another round
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes

        # per-sample result buffers, addressed by row position
        num_genes = len(self.y)
        onco_pred = np.zeros(num_genes)
        onco_prob = np.zeros(num_genes)
        tsg_pred = np.zeros(num_genes)
        tsg_prob = np.zeros(num_genes)
        overall_pred = np.zeros(num_genes)

        # set up stratified kfold iterator
        # NOTE(review): this is the legacy scikit-learn call signature;
        # confirm the installed version still provides it.
        k_fold = cross_validation.StratifiedKFold(self.y, n_folds=k)

        # evaluate k-fold cross validation
        for train_ix, test_ix in k_fold:
            # Fold indices are row positions. Select with .iloc so the
            # weights are derived from exactly the rows used for training
            # (self.y[train_ix] is label-based on an integer index).
            train_x = self.x.iloc[train_ix].copy()
            train_y = self.y.iloc[train_ix].copy()

            if self.is_weighted_sample:
                # weight classes by using sample weights: 1 / class size
                sample_weight = np.zeros(len(train_ix))
                for label in class_labels:
                    in_class = np.asarray(train_y == label)
                    class_size = int(in_class.sum())
                    if class_size:  # an absent class would mean 1/0
                        sample_weight[in_class] = 1. / class_size
                # do training
                self.clf.fit(train_x, train_y, sample_weight=sample_weight)
            else:
                # do training without sample weights
                self.clf.fit(train_x, train_y)

            # do prediction on the held-out fold
            test_x = self.x.iloc[test_ix]
            y_pred = self.clf.predict(test_x)
            proba_ = self.clf.predict_proba(test_x)

            # update information
            overall_pred[test_ix] = y_pred  # prediction including all classes
            onco_pred[test_ix] = (y_pred == self.onco_num).astype(int)  # predicted oncogenes
            onco_prob[test_ix] = proba_[:, self.onco_num]  # oncogene probability
            tsg_pred[test_ix] = (y_pred == self.tsg_num).astype(int)  # predicted TSGs
            tsg_prob[test_ix] = proba_[:, self.tsg_num]  # TSG probability

        # forward the out-of-fold results of this round to the metric hooks
        true_onco = (self.y == self.onco_num).astype(int)  # true oncogenes
        self._update_onco_metrics(true_onco, onco_pred, onco_prob)
        true_tsg = (self.y == self.tsg_num).astype(int)  # true TSGs
        self._update_tsg_metrics(true_tsg, tsg_pred, tsg_prob)
        self._update_metrics(self.y, overall_pred, onco_prob, tsg_prob)

        self.num_pred += 1

    self._on_finish()  # update info for kfold cross-validation