def fit_and_score_features(X, y):
    """Score each column of ``X`` with its own single-feature Coxnet fit.

    For every column index ``j`` the slice ``X[:, j:j + 1]`` (kept 2-D) is
    used both to fit a default ``CoxnetSurvivalAnalysis`` and to evaluate
    its ``score`` against ``y``.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, a:b]`` slicing.
    y : survival target accepted by ``CoxnetSurvivalAnalysis.fit``.

    Returns
    -------
    numpy.ndarray of length ``X.shape[1]`` holding one score per column.
    """
    column_count = X.shape[1]
    per_column_score = np.empty(column_count)
    # One estimator instance is reused; every ``fit`` call refits it.
    model = CoxnetSurvivalAnalysis()
    for col in range(column_count):
        single_feature = X[:, col:col + 1]
        model.fit(single_feature, y)
        per_column_score[col] = model.score(single_feature, y)
    return per_column_score
def COX(X, y, best_features, oversampling, undersampling, aggregation):
    """Run ``execute_survival`` with an elastic-net Cox model factory.

    The factory builds ``CoxnetSurvivalAnalysis(l1_ratio=0.1)`` with 100
    alphas when ``aggregation == True`` and 200 alphas otherwise.

    Returns
    -------
    tuple ``(results, features, model)`` where ``features`` is
    ``model.coef_`` when ``model`` is truthy and ``False`` otherwise.
    """
    path_length = 100 if aggregation == True else 200

    def make_model():
        return CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=path_length)

    results, model = execute_survival(
        X, y, best_features, make_model, oversampling, undersampling)
    features = model.coef_ if model else False
    return results, features, model
def optimizePenalty(self):
    """Grid-search the Coxnet penalty and store the winner on ``self.model``.

    Steps, in order:
      1. ``self.model`` is wrapped in a pipeline and fitted on
         ``self.data.values`` / ``self.data.tags``.
      2. 50 candidate alphas, ``10 ** linspace(-2, 3, 50)``, are evaluated
         with a shuffled 5-fold ``GridSearchCV`` over a fresh LASSO
         (``l1_ratio=1.0``) Coxnet model.
      3. The best alpha is printed and written to ``self.model`` through
         ``set_params(alphas=[best])``; the model is not refitted here.

    Side effect: ``ConvergenceWarning`` is ignored process-wide from the
    moment this method runs.
    """
    import warnings
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import KFold
    from sklearn.model_selection import GridSearchCV

    current_model_pipeline = make_pipeline(self.model)
    warnings.simplefilter("ignore", ConvergenceWarning)
    current_model_pipeline.fit(self.data.values, self.data.tags)

    candidate_alphas = 10. ** np.linspace(-2, 3, 50)
    folds = KFold(n_splits=5, shuffle=True)
    search_space = {
        "coxnetsurvivalanalysis__alphas": [[a] for a in candidate_alphas],
    }
    lasso_pipeline = make_pipeline(
        CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=1000000))
    search = GridSearchCV(
        lasso_pipeline,
        param_grid=search_space,
        cv=folds,
        error_score=0.5,
        n_jobs=-1,
    )
    search = search.fit(self.data.values, self.data.tags)

    bestAlpha = search.best_params_["coxnetsurvivalanalysis__alphas"][0]
    print("El mejor pare!", bestAlpha)
    self.model.set_params(alphas=[bestAlpha])
def LASSO_COX_bootstrap(fp, num=False):
    """Bootstrap the training score of a LASSO-Cox model and print a 95% CI.

    The CSV at ``fp`` is read with its first column as index. For each of
    100 iterations, half as many rows as the file has are drawn with
    ``resample``. A ``CoxnetSurvivalAnalysis(l1_ratio=1, alphas=[0.001])``
    is fitted on the sample and scored on that same sample. The median and
    the 2.5 / 97.5 percentiles of the scores (clipped to [0, 1]) are printed.

    Parameters
    ----------
    fp : str
        Path to the CSV file. The text before the first ``_`` of the file
        name is used as the label in the printed line.
    num : bool
        When true, source column 42 is added as a 21st feature.
        (presumably the number of metastases, per the original comment)

    Returns
    -------
    None (the result is only printed).

    Notes
    -----
    Source columns 0-19 are the base features. Columns 40 and 41 are
    selected by position but then read by name as 'Event' and 'Time'.
    NOTE(review): confirm those are the names of columns 40/41.
    """
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    if num:
        source_cols = np.r_[:20, 40, 41, 42]
        # After the column subset the frame has only 23 columns, so source
        # column 42 lives at position 22. Indexing position 42 (as before)
        # is out of bounds.
        feature_cols = np.r_[:20, 22]
    else:
        source_cols = np.r_[:20, 40, 41]
        feature_cols = np.r_[:20]
    population = df.iloc[:, source_cols]

    # calculate population of statistics
    metrics = []
    for _ in range(n_iterations):
        sample = resample(population, n_samples=n_size)
        X = sample.iloc[:, feature_cols].copy().to_numpy()
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y = y.to_records(index=False)
        estimator = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=[0.001])
        estimator.fit(X, y)
        metrics.append(estimator.score(X, y))

    # calculate confidence interval
    alpha = 0.95
    lower_pct = ((1.0 - alpha) / 2.0) * 100
    upper_pct = (alpha + ((1.0 - alpha) / 2.0)) * 100
    lower = max(0.0, np.percentile(metrics, lower_pct))
    upper = min(1.0, np.percentile(metrics, upper_pct))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    name = fp.split('/')[-1].split('_')[0]
    if num:
        name += ' + NumMets'
    print(name, 'Lasso-Cox', '%.3f (%.3f-%.3f)' % (med, lower, upper))
    return None
def functionToOptimize(**params):
    """Objective: negative mean cross-validated AUC for one alpha.

    ``params['alphas']`` arrives as an exponent; the model is configured
    with the single alpha ``2 ** params['alphas']``. Any other keyword is
    forwarded unchanged to ``set_params``. The model is evaluated with an
    unshuffled 4-fold split of ``self.data``. For each fold the mean
    cumulative/dynamic AUC is computed at 15 time points, which are the
    5th to 81st percentiles of the fold's test ``Time_in_days``.

    Side effects: increments ``self.counter`` and prints a progress line.

    Returns
    -------
    float: ``-mean(fold AUCs)``.
    """
    self.counter += 1
    print(f"Bayesian Optimization model: {2 ** params['alphas']}; time: {self.counter}")

    estimator = CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=1000000)
    params["alphas"] = [2 ** params["alphas"]]
    estimator.set_params(**params)

    def fold_mean_auc(trainIndex, testIndex):
        # Column-vector indexing followed by a flatten, exactly as the
        # targets were selected originally.
        fit_X = self.data.values[trainIndex,]
        fit_y = np.reshape(self.data.tags[trainIndex[:, None],], -1)
        eval_X = self.data.values[testIndex,:]
        eval_y = np.reshape(self.data.tags[testIndex[:, None],], -1)

        estimator.fit(fit_X, fit_y)
        eval_times = np.percentile(eval_y["Time_in_days"], np.linspace(5, 81, 15))
        # NOTE(review): the test targets are passed as both the "train" and
        # the "test" survival argument; confirm this is intended rather
        # than (fit_y, eval_y).
        _, mean_auc = cumulative_dynamic_auc(
            eval_y, eval_y, estimator.predict(eval_X), eval_times)
        return mean_auc

    splitter = KFold(n_splits=4)
    fold_aucs = [
        fold_mean_auc(train_rows, test_rows)
        for train_rows, test_rows in splitter.split(self.data.values)
    ]
    return -np.mean(fold_aucs)
def execute_survival(X, y, k, headers, survival, aggregation):
    """Cross-validate an elastic-net Cox model and return the mean C-index.

    Features are first reduced with ``pearson_fs``. A 5-fold
    ``StratifiedKFold`` is built on the first field of every target record.
    It uses the legacy constructor that takes the labels directly and
    supports ``len(cv)``. Each fold fits
    ``CoxnetSurvivalAnalysis(l1_ratio=0.1)`` (100 alphas when
    ``aggregation == True``, otherwise 200) and is scored with
    ``concordance_index_censored`` on the held-out rows.

    Returns
    -------
    float: sum of the fold concordance indices divided by ``len(cv)``.
    The value is also printed.
    """
    new_X, best_features = pearson_fs(
        X, y, headers, k, feature_selection=True, survival=survival)

    stratify_labels = np.array([record[0] for record in y])
    cv = StratifiedKFold(stratify_labels, n_folds=5)  # x-validation

    alpha_count = 100 if aggregation == True else 200
    clf = CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=alpha_count)

    fold_cindex = []
    print(' ...performing x-validation')
    for fold_no, (train, test) in enumerate(cv):
        print(' ...', fold_no + 1)
        fitted = clf.fit(new_X[train], y[train])

        held_out = y[test]
        event_indicators = [record[0] for record in held_out]
        event_times = [record[1] for record in held_out]
        risk = [value for value in fitted.predict(new_X[test])]

        outcome = concordance_index_censored(
            np.array(event_indicators),
            np.array(event_times),
            np.array(risk).reshape(-1))
        fold_cindex.append(outcome[0])

    # TODO fix metrics
    avgCIscore = sum(fold_cindex) / len(cv)
    print(avgCIscore)
    return avgCIscore
def _fit_with_python(self, matrix_test, get_proba=False, return_nonzero_features=False, l1_ratio=0.5):
    """Fit a Coxnet model on ``self.matrix`` and derive a result for ``matrix_test``.

    The model is refitted on every call, using ``self.isdead`` and
    ``self.nbdays`` as the (event, time) target.

    Parameters
    ----------
    matrix_test : feature matrix passed to ``predict``.
    get_proba : if true, return
        ``self._get_proba_from_prediction(predictions)``.
    return_nonzero_features : if true (and ``get_proba`` is false), return
        the ``np.nonzero`` indices of the first coefficient column along
        the alpha path whose sum is non-zero. When ``self.metadata_mat`` is
        set, its trailing columns are excluded.
    l1_ratio : elastic-net mixing parameter forwarded to the estimator.

    Returns
    -------
    Probabilities, a ``np.nonzero`` tuple, or cluster labels from
    ``self._fit_and_dichotomise``, depending on the flags.

    Raises
    ------
    Exception
        If no coefficient column has a non-zero sum, or if only the
        metadata coefficients do.
    """
    from sksurv.linear_model import CoxnetSurvivalAnalysis

    # ``np.bool`` / ``np.int`` were plain aliases of the builtins and were
    # removed in NumPy 1.24; the builtins give the same dtype everywhere.
    Y = np.asarray(
        [(bool(a), b) for a, b in zip(self.isdead, self.nbdays)],
        dtype=[("event", bool), ("time", int)])

    self.coxph_python = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio,
                                               fit_baseline_model=False)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.coxph_python.fit(self.matrix, Y)

    predictions = self.coxph_python.predict(matrix_test)

    if get_proba:
        return self._get_proba_from_prediction(predictions)

    if return_nonzero_features:
        # Walk the alpha path; stop at the first column with a non-zero sum.
        for coef in self.coxph_python.coef_.T:
            if coef.sum() != 0:
                break

        if coef.sum() == 0:
            raise Exception("All features Coefficient are 0!")

        if self.metadata_mat is not None:
            n_meta = self.metadata_mat.shape[1]
            if coef[:-n_meta].sum() == 0:
                raise Exception("Only metadata features are non zero")
            return np.nonzero(coef[:-n_meta])
        else:
            return np.nonzero(coef)

    return self._fit_and_dichotomise(predictions, n_clusters=self.n_clusters)
def test():
    """Smoke-run the glmnet wrapper and sksurv Coxnet on the same dummy data.

    Both models are only executed; nothing is asserted or returned.
    """
    #### Compare glmnet with sksurv CoxnetSurvivalAnalysis
    from sksurv.linear_model import CoxnetSurvivalAnalysis

    ######################################################
    ################ DUMMY DATA ##########################
    isdead = [0, 1, 1, 1, 0, 1, 0, 0, 1, 0]
    nbdays = [24, 10, 25, 50, 14, 10, 100, 10, 50, 10]
    matrix = np.array([[0, 1, 1, 0, 1, 2, 0, 1, 0, 0],
                       [0, 1, 1, 0, 1, 3, 0, 1, 0, 0]]).T
    ######################################################

    res = predict_with_coxph_glmnet(matrix, isdead, nbdays, matrix)

    coxph = CoxnetSurvivalAnalysis()
    # Builtin ``bool`` / ``int`` replace the ``np.bool`` / ``np.int``
    # aliases, which were removed in NumPy 1.24.
    Y = np.asarray(
        [(bool(a), b) for a, b in zip(isdead, nbdays)],
        dtype=[("event", bool), ("time", int)])
    coxph.fit(matrix, Y)
def run_coxnet(l1_ratio, n_alphas, x_train, y_train, x_test, y_test):
    """Fit a Coxnet model on the training split and evaluate it on the test split.

    Returns
    -------
    tuple ``(outputs, score)``: the model's predictions for ``x_test`` and
    its ``score`` on ``(x_test, y_test)``.
    """
    model = CoxnetSurvivalAnalysis(n_alphas=n_alphas, l1_ratio=l1_ratio)
    model.fit(x_train, y_train)
    test_predictions = model.predict(x_test)
    test_score = model.score(x_test, y_test)
    return test_predictions, test_score
# Event indicator: True exactly where LapseIndicator equals 1.
E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False)
# ``T`` (the time column) is defined earlier in the script, outside this excerpt.
df2['E'] = E
df2['T'] = T
# Split into the feature frame and the structured (event, time) target.
X, y = get_x_y(df2, ['E', 'T'], pos_label=True)
# Every column except 'AGE AT DOC' is treated as categorical before encoding.
for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')
# NOTE(review): the result is used with ``.values`` / ``.columns`` below, so
# this is presumably sksurv's DataFrame-returning OneHotEncoder - confirm.
data_x_numeric = OneHotEncoder().fit_transform(X)
#%%
# Fit the full elastic-net Cox path on all encoded features.
estimator = CoxnetSurvivalAnalysis(verbose=True)
estimator.fit(data_x_numeric, y)
#%%
# Training-set score of the full model, then one score per single feature,
# printed from best to worst.
print(estimator.score(data_x_numeric, y))
print()
scores = fit_and_score_features(data_x_numeric.values, y)
print(
    pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))
#%%
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
#df=merge_frames(df1,df2) #Converting the integer to 0 and 1 to boolean for python df["Status"] = df["Status"].astype(bool) #data contains the time and status column and X will have all the mutation present or absent corresponding to each gene data = df.iloc[0:, 1:3] X = df.iloc[0:, 3:] #storing the value used to store status and time in tuple Y = data.to_records(index=False) X = OneHotEncoder().fit_transform(X) #Running the module for 50 randomly generated penalty values estimator = CoxnetSurvivalAnalysis(n_alphas=100, l1_ratio=1, alpha_min_ratio=0.01, max_iter=10000) estimator.fit(X, Y) #Making the dataframe for the coefficients of each genes corresponding to that alpha value coefficients_lasso = pd.DataFrame(estimator.coef_, index=X.columns, columns=np.round(estimator.alphas_, 5)) alphas = estimator.alphas_ print(coefficients_lasso) #Sending parameters to the function to plot the alpha vs coefficient graph for all the genes, with the 10 mostly divergent genes as hightlights plot_coefficients(coefficients_lasso, n_highlight=10) alphas = coefficients_lasso.columns
def train_cox(x, outer_split=leave_two_out, inner_split=leave_two_out, num_folds=None, meas_key=None, key='metabs'):
    """Nested cross-validation of a LASSO Cox model over an alpha path.

    For each outer split, the features are filtered with
    ``filter_by_train_set`` and an alpha path is taken from a fit on the
    outer training set. Every alpha on that path is evaluated on the inner
    splits, and the best one is used to predict the outer test rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns plus ``'week'`` (time) and ``'outcome'`` (event).
    outer_split, inner_split : callable
        Called as ``split(frame, frame['outcome'], num_folds=100)``; must
        yield ``(train_index, test_index)`` positional index pairs.
    num_folds : int or None
        Only printed. NOTE(review): both split calls hard-code
        ``num_folds=100``; confirm that this is intentional.
    meas_key, key :
        Forwarded to ``filter_by_train_set``.

    Returns
    -------
    dict with keys ``'score'``, ``'model'`` (per-fold coefficient frames),
    ``'hazards'``, ``'event_times'``, ``'event_outcomes'`` and
    ``'lambdas'`` (per-fold alpha selection details).

    Notes
    -----
    Seeds NumPy's global RNG with 5 on every call.
    """
    if num_folds is None:
        print('none')
    else:
        print(num_folds)
    np.random.seed(5)

    survival_dtype = [('e.tdm', '?'), ('t.tdm', '<f8')]
    target_cols = ['week', 'outcome']

    hazards = []
    event_times = []
    event_outcomes = []
    score_vec = []
    model_out_dict = {}
    ix_inner = outer_split(x, x['outcome'], num_folds=100)
    lambda_dict = {}
    for ic_in, ix_in in enumerate(ix_inner):
        train_index, test_index = ix_in
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        week = x_train['week']
        outcome = x_train['outcome']

        # Data that already contains negative values is not log-transformed.
        log_transform = not (x_train < 0).any().any()
        x_train_, x_test_ = filter_by_train_set(
            x_train.drop(target_cols, axis=1),
            x_test.drop(target_cols, axis=1),
            meas_key,
            key=key,
            log_transform=log_transform)

        # Re-attach time/event as the LAST two columns; the coefficient
        # bookkeeping below relies on ``columns[:-2]`` being the features.
        temp = x_train_.copy()
        temp['week'], temp['outcome'] = x_train['week'], x_train['outcome']
        x_train = temp.copy()
        temp = x_test_.copy()
        temp['week'], temp['outcome'] = x_test['week'], x_test['outcome']
        x_test = temp.copy()

        # Skip outer folds whose test rows contain no event.
        if np.sum(x_test['outcome'].values) < 1:
            continue

        x_train_ = x_train.drop(target_cols, axis=1)
        y_arr = np.array(list(zip(outcome, week)), dtype=survival_dtype)

        ix_inner2 = inner_split(x_train, x_train['outcome'], num_folds=100)
        model2 = CoxnetSurvivalAnalysis(l1_ratio=1)
        hazards_dict = {}
        e_times_dict = {}
        e_outcomes_dict = {}
        score_dict = {}

        # The alpha path is defined once, on the full outer training set.
        coxnet_pipe = CoxnetSurvivalAnalysis(l1_ratio=1,
                                             alpha_min_ratio=0.001,
                                             n_alphas=300)
        coxnet_pipe.fit(x_train_, y_arr)
        alphas = coxnet_pipe.alphas_

        for ic_in2, ix_in2 in enumerate(ix_inner2):
            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
            if np.sum(x_tr2['outcome'].values) < 1:
                continue
            y_test_arr = np.array(list(zip(x_ts2['outcome'], x_ts2['week'])),
                                  dtype=survival_dtype)
            # Skip inner test sets that contain duplicated (event, time) rows.
            if len(np.unique(y_test_arr)) < len(test_ix):
                continue

            week = x_tr2['week']
            outcome = x_tr2['outcome']
            if (outcome == 0).all():
                continue
            x_tr2_ = x_tr2.drop(target_cols, axis=1)
            y_arr2 = np.array(list(zip(outcome, week)), dtype=survival_dtype)

            model2.set_params(alphas=alphas)
            try:
                model2.fit(x_tr2_, y_arr2)
            except Exception:
                # The fit failed on the current path: drop the leading
                # alpha and retry until it fits or too few alphas remain.
                print('removed alpha ' + str(alphas[0]))
                alphas_n = np.delete(alphas, 0)
                model2.set_params(alphas=alphas_n)
                while True:
                    try:
                        model2.fit(x_tr2_, y_arr2)
                        alphas = alphas_n
                        break
                    except Exception:
                        print('removed alpha ' + str(alphas_n[0]))
                        # Shrink the candidate list itself. Deleting from
                        # the unchanged ``alphas`` rebuilt the same list
                        # every time and could loop forever.
                        alphas_n = np.delete(alphas_n, 0)
                        model2.set_params(alphas=alphas_n)
                    if len(alphas_n) <= 2:
                        break
                if len(alphas_n) <= 2:
                    continue

            x_ts2_ = x_ts2.drop(target_cols, axis=1)
            for i, alpha in enumerate(alphas):
                if i not in hazards_dict:
                    hazards_dict[i] = {}
                    e_times_dict[i] = {}
                    e_outcomes_dict[i] = {}
                    score_dict[i] = {}
                risk_scores = model2.predict(x_ts2_, alpha=alpha)
                hazards_dict[i][ic_in2] = risk_scores
                e_times_dict[i][ic_in2] = x_ts2['week']
                e_outcomes_dict[i][ic_in2] = x_ts2['outcome']
                if len(test_ix) >= 2:
                    try:
                        ci = concordance_index_censored(
                            e_outcomes_dict[i][ic_in2].astype(bool),
                            e_times_dict[i][ic_in2],
                            hazards_dict[i][ic_in2])[0]
                    except Exception:
                        print('debug')
                        print(x_ts2['outcome'])
                        print(x_ts2['week'])
                        print('')
                        continue
                    if not np.isnan(ci):
                        score_dict[i][ic_in2] = ci

        # NOTE(review): ``i`` is the last alpha index left over from the
        # loop above; only that alpha's score count decides which branch
        # runs. Confirm that this is the intended test.
        if len(score_dict[i]) > 0:
            # Mean per-fold concordance, keyed by alpha index.
            scores = {
                i: sum(score_dict[i].values()) / len(score_dict[i].values())
                for i in score_dict.keys()
            }
        else:
            # No per-fold scores: pool all inner predictions per alpha and
            # compute one concordance index, keyed by the alpha value.
            scores = {}
            for a_ix in hazards_dict.keys():
                alpha_num = alphas[a_ix]
                scores[alpha_num] = concordance_index_censored(
                    np.array(np.concatenate(list(
                        e_outcomes_dict[a_ix].values()))).astype(bool),
                    np.array(np.concatenate(list(
                        e_times_dict[a_ix].values()))),
                    np.array(np.concatenate(list(
                        hazards_dict[a_ix].values()))))[0]

        lambdas, aucs_in = list(zip(*scores.items()))
        ix_max = np.argmax(aucs_in)
        best_lamb = alphas[ix_max]
        lambda_dict[ic_in] = {
            'best_lambda': best_lamb,
            'scores': scores,
            'event_outcomes': event_outcomes,
            'times': event_times,
            'hazards': hazards,
            'lambdas_tested': alphas
        }

        # Refit on the whole outer training set and predict the outer test
        # rows at the selected alpha.
        model_out = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=alphas)
        model_out.fit(x_train_, y_arr)
        risk_scores = model_out.predict(x_test.drop(target_cols, axis=1),
                                        alpha=best_lamb)
        hazards.append(risk_scores)
        event_times.append(x_test['week'])
        event_outcomes.append(x_test['outcome'])

        coefs = model_out.coef_[:, ix_max]
        out_df = pd.DataFrame({'odds_ratio': np.zeros(x.shape[1])},
                              index=x.columns.values)
        out_df.loc[x_train.columns.values[:-2]] = np.expand_dims(coefs, 1)
        model_out_dict[ic_in] = out_df

        if len(test_index) > 1:
            ci = concordance_index_censored(x_test['outcome'].astype(bool),
                                            x_test['week'], risk_scores)[0]
            if not np.isnan(ci):
                score_vec.append(ci)

    if len(score_vec) > 1:
        score = sum(score_vec) / len(score_vec)
    else:
        # Too few per-fold scores: pool all outer test predictions.
        score = concordance_index_censored(
            np.array(np.concatenate(event_outcomes)).astype(bool),
            np.array(np.concatenate(event_times)),
            np.array(np.concatenate(hazards)))[0]

    final_dict = {}
    final_dict['score'] = score
    final_dict['model'] = model_out_dict
    final_dict['hazards'] = hazards
    final_dict['event_times'] = event_times
    final_dict['event_outcomes'] = event_outcomes
    final_dict['lambdas'] = lambda_dict
    return final_dict
def train_survival(X_train, X_test, y_train, alphas, l1_ratios, seed, n_folds=4, max_iter=1000, fit_ridge=False, output_fn=False, debug_info=None):
    """
    Grid-search a regularized Cox model and return its predictions.

    Arguments
    ---------
    X_train: pandas DataFrame of feature matrix for training data
    X_test: pandas DataFrame of feature matrix for testing data
    y_train: pandas DataFrame of processed y matrix, containing 'status' =
             False if right-censored else True, 'time_in_days' = survival time
    alphas: list of alphas to cross-validate over; if None, the alpha path
            of a CoxnetSurvivalAnalysis(alpha_min_ratio=0.01, n_alphas=100)
            fit on the training data is used
    l1_ratios: list of l1 mixing parameters to cross-validate over
    seed: accepted for interface compatibility; not read in this function
    n_folds: int, number of cross-validation folds
    max_iter: maximum number of optimizer iterations
    fit_ridge: if True, use the ridge-regularized CoxPHSurvivalAnalysis.
               The search then runs over alphas only and l1_ratios is
               ignored.
    output_fn: forwarded as ``fit_baseline_model`` to the Coxnet estimator
    debug_info: optional dict with keys 'prefix', 'signal' and 'fold_no';
                when given, the grid of mean test scores is written to
                '<prefix>_<signal>_fold<fold_no>_grid.tsv'

    Returns
    ------
    (fitted GridSearchCV, training predictions, test predictions,
    cross-validated training predictions)
    """
    # sometimes we want to use sksurv to compute the alpha path
    if alphas is None:
        path_model = CoxnetSurvivalAnalysis(alpha_min_ratio=0.01, n_alphas=100)
        path_model.fit(X_train, _y_df_to_struct(y_train))
        alphas = path_model.alphas_

    # set up the cross-validation parameters
    if fit_ridge:
        survival_step = CoxPHSurvivalAnalysis(n_iter=max_iter, tol=1e-5)
        surv_parameters = {"survival__alpha": alphas}
    else:
        survival_step = CoxnetSurvivalAnalysis(
            max_iter=max_iter,
            tol=1e-5,
            fit_baseline_model=output_fn,
        )
        surv_parameters = {
            "survival__alphas": [[a] for a in alphas],
            "survival__l1_ratio": l1_ratios,
        }
    estimator = Pipeline(steps=[("survival", survival_step)])

    cv_pipeline = GridSearchCV(
        estimator=estimator,
        param_grid=surv_parameters,
        n_jobs=-1,
        cv=n_folds,
        error_score=0.5,
        return_train_score=True,
    )

    # fit the model
    cv_pipeline.fit(X=X_train, y=_y_df_to_struct(y_train))

    if debug_info is not None:
        # NOTE(review): the column labels assume the (alphas x l1_ratios)
        # grid; in ridge mode the grid has a single column - confirm that
        # debug output is only requested for the elastic-net path.
        mean_scores = cv_pipeline.cv_results_['mean_test_score']
        grid_mean_df = pd.DataFrame(
            mean_scores.reshape(len(alphas), -1),
            columns=l1_ratios,
            index=alphas,
        )
        grid_path = '{}_{}_fold{}_grid.tsv'.format(debug_info['prefix'],
                                                   debug_info['signal'],
                                                   debug_info['fold_no'])
        grid_mean_df.to_csv(grid_path, sep='\t')

    # Obtain cross validation results
    y_cv = cross_val_predict(
        cv_pipeline.best_estimator_,
        X=X_train,
        y=_y_df_to_struct(y_train),
        cv=n_folds,
        method="predict",
    )

    # get predictions
    y_predict_train = cv_pipeline.predict(X_train)
    y_predict_test = cv_pipeline.predict(X_test)

    return cv_pipeline, y_predict_train, y_predict_test, y_cv
plt.legend()
plt.plot()
# Convert the label iterables into structured arrays with a boolean field
# followed by a float32 field (NumPy names them f0/f1).
_train_l = numpy.array(list(_train_l), dtype='bool,f4')
_test_l = numpy.array(list(_test_l), dtype='bool,f4')
# create ph model
estimator = CoxPHSurvivalAnalysis()
estimator.fit(_train_d, _train_l)
# create the cox model (short 5-alpha path, loose tolerance)
clf = CoxnetSurvivalAnalysis(n_alphas=5, tol=0.1)
# train model
clf.fit(_train_d, _train_l)
result = []
# evaluate for every alpha: predict at that alpha and keep the full
# concordance_index_censored result tuple
# (``tft`` / ``timet`` are defined earlier in the script, outside this excerpt)
for v in clf.alphas_:
    res = clf.predict(_test_d, alpha=[v])
    result.append(concordance_index_censored(tft, timet, res))
# calculate precision
# NOTE(review): the first predict call discards its result; only the second is kept.
clf.predict(_test_d)
res = clf.predict(_test_d)
# print out some results
def train_with_inner_folds(x, num_folds=5):
    """Leave-two-out evaluation of a LASSO Cox model with inner alpha search.

    For every ``leave_two_out`` split a 300-alpha LASSO path is fitted on
    the training rows. A stratified grid search then picks one alpha from
    that path, using a ``StandardScaler`` + Coxnet pipeline. The unscaled
    path model is scored on the two held-out rows, both at the selected
    alpha and with its default ``score``. If the grid search or the
    scoring fails, the default score is used for both.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns plus ``'week'`` (time) and ``'outcome'`` (event).
    num_folds : int
        Upper bound on the inner CV folds; capped at the number of events
        in the training rows.

    Returns
    -------
    dict with per-fold lists ``'grid_search_model'`` (``None`` for folds
    where the search itself failed), ``'best_model'``, ``'best_alpha'``,
    ``'alphas'``, and the means ``'score'`` and ``'score_default'``.
    """
    survival_dtype = [('e.tdm', '?'), ('t.tdm', '<f8')]
    target_cols = ['week', 'outcome']

    final_res_dict = {}
    scores = []
    score_d = []
    ix_inner = leave_two_out(x, x['outcome'], num_folds=None)
    final_res_dict['grid_search_model'] = []
    final_res_dict['best_model'] = []
    final_res_dict['best_alpha'] = []
    final_res_dict['alphas'] = []
    for train_index, test_index in ix_inner:
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        # NOTE(review): this skips every test set that contains at least
        # one row with outcome < 1 (i.e. any censored row). If the intent
        # was "skip when there is no event", the comparison belongs
        # outside the sum: ``np.sum(values) < 1``. Left unchanged because
        # it alters which folds are evaluated.
        if np.sum(x_test['outcome'].values < 1):
            continue
        y_test_arr = np.array(list(zip(x_test['outcome'], x_test['week'])),
                              dtype=survival_dtype)
        if len(np.unique(y_test_arr)) == 1:
            continue

        model2 = CoxnetSurvivalAnalysis(l1_ratio=1,
                                        n_alphas=300,
                                        alpha_min_ratio='auto')
        week = x_train['week']
        outcome = x_train['outcome']
        x_train_ = x_train.drop(target_cols, axis=1)
        x_test_ = x_test.drop(target_cols, axis=1)
        y_arr = np.array(list(zip(outcome, week)), dtype=survival_dtype)
        model2.fit(x_train_, y_arr)

        # Never request more stratified folds than there are events.
        num_rec = np.sum(outcome)
        nf_inner = int(num_rec) if num_folds > num_rec else int(num_folds)
        cv = StratifiedKFold(n_splits=nf_inner,
                             shuffle=True,
                             random_state=0)
        alphas = model2.alphas_

        # Reset per fold so a failed search can neither leave ``gcv``
        # unbound nor leak the previous fold's search into this record.
        gcv = None
        try:
            gcv = GridSearchCV(make_pipeline(
                StandardScaler(),
                CoxnetSurvivalAnalysis(l1_ratio=1,
                                       n_alphas=300,
                                       alpha_min_ratio='auto',
                                       max_iter=100)),
                               param_grid={
                                   "coxnetsurvivalanalysis__alphas":
                                   [[v] for v in alphas]
                               },
                               cv=cv,
                               error_score=0.5,
                               n_jobs=4).fit(x_train_, y_arr)
            best_model = gcv.best_estimator_.named_steps[
                "coxnetsurvivalanalysis"]
            best_alpha = best_model.alphas
            pred = model2.predict(x_test_, alpha=best_alpha)
            score_default = model2.score(x_test_, y_test_arr)
            score = concordance_index_censored(x_test['outcome'].astype(bool),
                                               x_test['week'], pred)[0]
        except Exception:
            # Best-effort fallback: use the path model's default score.
            score_default = model2.score(x_test_, y_test_arr)
            score = score_default
            best_model = model2
            best_alpha = alphas[-1]

        if not np.isnan(score):
            scores.append(score)
        if not np.isnan(score_default):
            score_d.append(score_default)
        final_res_dict['grid_search_model'].append(gcv)
        final_res_dict['best_model'].append(best_model)
        final_res_dict['best_alpha'].append(best_alpha)
        final_res_dict['alphas'].append(alphas)

    final_res_dict['score'] = np.mean(scores)
    final_res_dict['score_default'] = np.mean(score_d)
    return final_res_dict
def train_with_folds(x, num_folds=5):
    """Stratified K-fold evaluation of a LASSO Cox model with inner alpha search.

    For each outer fold a 300-alpha LASSO path is fitted. A grid search
    then picks one alpha from that path, using a ``StandardScaler`` +
    Coxnet pipeline, and the winner is scored on the held-out rows. If the
    grid search fails, the path model is used instead, with its last-alpha
    coefficients.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns plus ``'week'`` (time) and ``'outcome'`` (event).
    num_folds : int
        Requested fold count; capped at the number of events.

    Returns
    -------
    dict mapping a running fold counter to a dict with ``'score'``,
    ``'best_model'`` (the Coxnet estimator), ``'best_coefs'`` and
    ``'train_test'``. Folds whose test targets are all identical are
    skipped and consume no counter value.

    Notes
    -----
    Installs a process-wide ``warnings.simplefilter("ignore")``.
    """
    survival_dtype = [('e.tdm', '?'), ('t.tdm', '<f8')]
    target_cols = ['week', 'outcome']

    num_rec = np.sum(x['outcome'])
    if num_folds > num_rec:
        # ``n_splits`` must be an int; the event count may be a float or a
        # NumPy scalar.
        num_folds = int(num_rec)
    skf = StratifiedKFold(n_splits=num_folds)
    splits = skf.split(x, x['outcome'])

    final_res_dict = {}
    fold = 0
    for train_index, test_index in splits:
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]

        outcome = x_train['outcome']
        x_train_ = x_train.drop(target_cols, axis=1)
        y_arr = np.array(list(zip(outcome, x_train['week'])),
                         dtype=survival_dtype)

        x_test_ = x_test.drop(target_cols, axis=1)
        y_test_arr = np.array(list(zip(x_test['outcome'], x_test['week'])),
                              dtype=survival_dtype)
        if len(np.unique(y_test_arr)) == 1:
            continue

        model2 = CoxnetSurvivalAnalysis(l1_ratio=1,
                                        alpha_min_ratio='auto',
                                        n_alphas=300)
        warnings.simplefilter("ignore")
        model2.fit(x_train_, y_arr)
        estimated_alphas = model2.alphas_

        # Never request more stratified folds than there are events.
        num_rec = np.sum(outcome)
        nf_inner = int(num_rec) if num_folds > num_rec else int(num_folds)
        cv = StratifiedKFold(n_splits=nf_inner, shuffle=True, random_state=0)

        try:
            gcv = GridSearchCV(make_pipeline(
                StandardScaler(), CoxnetSurvivalAnalysis(l1_ratio=1)),
                               param_grid={
                                   "coxnetsurvivalanalysis__alphas":
                                   [[v] for v in estimated_alphas]
                               },
                               cv=cv,
                               error_score=0.5,
                               n_jobs=4).fit(x_train_, y_arr)
            best_pipeline = gcv.best_estimator_
            best_model = best_pipeline.named_steps["coxnetsurvivalanalysis"]
            best_coefs = pd.DataFrame(best_model.coef_,
                                      index=x_train_.columns,
                                      columns=["coefficient"])
            # The Coxnet step was trained on standardized features, so it
            # must be scored through the pipeline. Scoring the bare step
            # on raw test data feeds it inputs on the wrong scale.
            scorer = best_pipeline
        except Exception:
            # Best-effort fallback: the unscaled path model, last alpha.
            best_model = model2
            best_coefs = pd.DataFrame(best_model.coef_[:, -1],
                                      index=x_train_.columns,
                                      columns=["coefficient"])
            scorer = model2

        score_ix = scorer.score(x_test_, y_test_arr)

        final_res_dict[fold] = {}
        final_res_dict[fold]['score'] = score_ix
        final_res_dict[fold]['best_model'] = best_model
        final_res_dict[fold]['best_coefs'] = best_coefs
        final_res_dict[fold]['train_test'] = (x_train, x_test)
        fold += 1
    return final_res_dict
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    Grid-search a fixed hyper-parameter grid for the requested model.

    Input:
        X: training set
        Y: labels of training set (for survival models, records whose
           first field is used for stratification)
        splits: number of stratified cross-validation folds
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox',
               'survSVM', 'gb'
        survival: when ``== True``, stratify on the first field of Y and
                  score with ``make_scorer(CI)``; otherwise stratify on Y
                  and score with 'roc_auc'
    Output:
        (clf.best_params_, clf): the best parameter dictionary and the
        fitted GridSearchCV object, to use: params['kernel']
    Raises:
        ValueError: if ``model`` is not one of the names above
    """
    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()
        tuned_parameters = {
            'C': ([0.01, 1, 10]),
            'kernel': (['rbf', 'linear']),
            'shrinking': ([False, True]),
        }
    elif model == 'cart':
        clf = tree.DecisionTreeClassifier()
        tuned_parameters = {
            'criterion': (['gini', 'entropy']),
            'max_depth': ([10, 20]),
            'min_samples_split': ([2, 3, 5]),
            'min_samples_leaf': ([2, 3, 5]),
        }
    elif model == 'rf':
        clf = ensemble.RandomForestClassifier()
        tuned_parameters = {
            'n_estimators': ([200, 500, 1000]),
            'max_depth': ([10, 20]),
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    elif model == 'xgboost':
        clf = XGBClassifier()
        tuned_parameters = {
            'booster': (['gbtree']),
            'max_depth': ([5, 10, 20]),
            'reg_lambda': ([0, 1]),
            'reg_alpha': ([0, 1]),
            'subsample': ([0.5, 1])
        }
    elif model == 'lr':
        clf = linear_model.LogisticRegression()
        tuned_parameters = {'solver': (['liblinear', 'sag', 'saga'])}
    elif model == 'cox':
        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': ([50, 100, 200]),
            'l1_ratio': ([0.1, 0.5, 1]),
        }
    elif model == 'survSVM':
        clf = FastSurvivalSVM()
        tuned_parameters = {
            'alpha': ([0.5, 1]),
            'rank_ratio': ([0.5, 1]),
            'max_iter': ([20, 40, 80]),
            'optimizer': (['rbtree', 'avltree']),
        }
    elif model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()
        tuned_parameters = {
            'learning_rate': ([0.1, 0.3]),
            'n_estimators': ([100, 200, 400]),
            'max_depth': ([3, 6, 12])
        }
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound ``clf``; fail early with a clear message instead.
        raise ValueError("Unknown model: %r" % (model,))

    if survival == True:
        # Survival targets are records and cannot be scored with ROC AUC;
        # use the concordance scorer, which used to be built but never
        # passed to the search.
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = StratifiedKFold(y_for_cv, n_folds=splits)  # x-validation
    else:
        scoring = 'roc_auc'
        cv = StratifiedKFold(Y, n_folds=splits)  # x-validation

    print(' ...performing x-validation')
    clf = GridSearchCV(clf,
                       tuned_parameters,
                       scoring=scoring,
                       cv=cv,
                       verbose=10)
    clf.fit(X, Y)
    end_svm = time.time()
    print("Total time to process: ", end_svm - start_svm)
    return (clf.best_params_, clf)
def train_survival_model(
        x,
        y,
        *,
        outer_cv_splits,
        inner_cv_splits,
        param_grid,
):
    """Evaluate an elastic-net Cox model with nested cross-validation.

    The outer loop (shuffled K-fold) holds out a test split. The inner
    loop (unshuffled K-fold inside ``GridSearchCV``) tunes the
    hyper-parameters in ``param_grid`` on the remaining rows. The best
    inner model predicts the held-out rows, which are scored with
    Harrell's concordance index.

    :param pandas.core.frame.DataFrame x: feature frame (per the original
        note: ssGSEA NES with controls and incomplete patients removed)
    :param numpy.ndarray y: structured array with the fields ``'status'``
        and ``'days_to_death'``
    :param int outer_cv_splits: number of folds of the outer loop
    :param int inner_cv_splits: number of folds of the inner loop
    :param dict param_grid: parameter names and values for the grid search
    :return: list with one dict per outer split (c-index, pair counts,
        selected l1-ratio, split number); the list is also printed
    """
    outer_cv = KFold(n_splits=outer_cv_splits, shuffle=True)
    inner_cv = KFold(n_splits=inner_cv_splits)
    concordance_scores = []

    # Progress bar over the outer CV steps.
    outer_steps = tqdm(outer_cv.split(x, y))
    for split_no, (train_indexes, test_indexes) in enumerate(outer_steps):
        # Training and test data for this outer step.
        x_train = x.iloc[train_indexes]
        x_test = x.iloc[test_indexes]
        y_train = np.asarray([y[row] for row in train_indexes])
        held_out = [y[row] for row in test_indexes]

        # Inner loop: grid search over a fresh Coxnet estimator.
        search = GridSearchCV(estimator=CoxnetSurvivalAnalysis(),
                              param_grid=param_grid,
                              cv=inner_cv,
                              return_train_score=True)
        search.fit(x_train, y_train)
        best_coxnet = search.best_estimator_

        # Score the best inner model on the held-out rows.
        prediction = best_coxnet.predict(x_test)
        cindex, concordant, discordant, tied_risk, tied_time = concordance_index_censored(
            [record['status'] for record in held_out],
            [record['days_to_death'] for record in held_out],
            prediction,
        )

        print('best c-index: {}'.format(cindex))
        print('best parameter: {}'.format(search.best_params_))

        concordance_scores.append({
            "c-index": cindex,
            "number of concordant pairs": concordant,
            "number of discordant pairs": discordant,
            "tied_risk": tied_risk,
            "tied_time": tied_time,
            "l1-ratio": search.best_estimator_.l1_ratio,
            "split": split_no,
        })

    print(concordance_scores)
    return concordance_scores
class ClusterWithSurvival(object):
    """Assign samples to risk clusters from Cox-model predictions.

    A Cox model is fitted on the stored training matrix, either sksurv's
    ``CoxnetSurvivalAnalysis`` or the glmnet wrapper. Its predictions for
    a test matrix are then turned into cluster labels or probabilities.
    """

    def __init__(self,
                 isdead,
                 nbdays,
                 n_clusters=2,
                 metadata_mat=None,
                 use_gaussian_to_dichotomize=False,
                 use_sksurv=True):
        """Store the survival data and clustering options.

        Parameters
        ----------
        isdead : sequence of event indicators (converted with ``bool``).
        nbdays : sequence of follow-up times.
        n_clusters : number of clusters to produce.
        metadata_mat : optional matrix appended to the features in
            ``get_nonzero_features``.
        use_gaussian_to_dichotomize : cluster with a Gaussian mixture
            instead of percentile thresholds.
        use_sksurv : use the sksurv backend (True) or the glmnet one.
        """
        self.use_sksurv = use_sksurv
        self.coxph_python = None
        self.isdead = isdead
        self.nbdays = nbdays
        self.n_clusters = n_clusters
        self.metadata_mat = metadata_mat
        self.matrix = None
        self._glm = None
        self._labels = None
        self._use_gaussian_to_dichotomize = use_gaussian_to_dichotomize

    def get_nonzero_features(self, matrix):
        """Get the non-zero features using a LASSO Cox-PH fit.

        When metadata is present it is stacked after ``matrix`` and the
        result is robust-scaled. The stored matrix is overwritten.
        """
        if self.metadata_mat is not None:
            self.matrix = hstack([matrix, self.metadata_mat])
            rbs = RobustScaler()
            self.matrix = rbs.fit_transform(self.matrix)
        else:
            self.matrix = matrix

        return self._fit_with_python(self.matrix,
                                     l1_ratio=1.0,
                                     return_nonzero_features=True)

    def fit(self, matrix):
        """Store the training matrix; the model is fitted at predict time."""
        self.matrix = matrix

    def predict(self, matrix_test):
        """Return cluster labels for ``matrix_test``."""
        if self.use_sksurv:
            return self._fit_with_python(matrix_test)
        else:
            return self._fit_with_glm(matrix_test)

    def predict_proba(self, matrix_test):
        """Return probabilities for ``matrix_test``."""
        if self.use_sksurv:
            return self._fit_with_python(matrix_test, get_proba=True)
        else:
            return self._fit_with_glm(matrix_test, get_proba=True)

    def _fit_with_python(self,
                         matrix_test,
                         get_proba=False,
                         return_nonzero_features=False,
                         l1_ratio=0.5):
        """Fit sksurv's Coxnet on ``self.matrix`` and derive a result.

        Returns probabilities when ``get_proba`` is set. Returns the
        non-zero feature indices (excluding trailing metadata columns)
        when ``return_nonzero_features`` is set. Otherwise returns cluster
        labels.

        Raises ``Exception`` if no coefficient column along the alpha path
        has a non-zero sum, or if only the metadata coefficients do.
        """
        from sksurv.linear_model import CoxnetSurvivalAnalysis

        # ``np.bool`` / ``np.int`` were plain aliases of the builtins and
        # were removed in NumPy 1.24.
        Y = np.asarray(
            [(bool(a), b) for a, b in zip(self.isdead, self.nbdays)],
            dtype=[("event", bool), ("time", int)])

        self.coxph_python = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio,
                                                   fit_baseline_model=False)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.coxph_python.fit(self.matrix, Y)

        predictions = self.coxph_python.predict(matrix_test)

        if get_proba:
            return self._get_proba_from_prediction(predictions)

        if return_nonzero_features:
            # Walk the alpha path; stop at the first column whose sum is
            # non-zero.
            for coef in self.coxph_python.coef_.T:
                if coef.sum() != 0:
                    break

            if coef.sum() == 0:
                raise Exception("All features Coefficient are 0!")

            if self.metadata_mat is not None:
                n_meta = self.metadata_mat.shape[1]
                if coef[:-n_meta].sum() == 0:
                    raise Exception("Only metadata features are non zero")
                return np.nonzero(coef[:-n_meta])
            else:
                return np.nonzero(coef)

        return self._fit_and_dichotomise(predictions,
                                         n_clusters=self.n_clusters)

    def _fit_with_glm(self, matrix_test, get_proba=False):
        """Same as ``_fit_with_python`` but using the glmnet wrapper."""
        predictions = predict_with_coxph_glmnet(self.matrix, self.isdead,
                                                self.nbdays, matrix_test)

        if get_proba:
            return self._get_proba_from_prediction(predictions)

        return self._fit_and_dichotomise(predictions,
                                         n_clusters=self.n_clusters)

    def _fit_and_dichotomise(self, predicted_time, n_clusters=2):
        """Turn predictions into cluster labels.

        Zeros in ``predicted_time`` are replaced by ``inf`` IN PLACE. With
        the Gaussian option a mixture model is fitted and kept in
        ``self._glm``. Otherwise labels are assigned by successive
        percentile thresholds (0%, 50%, 66.7%, ...), with higher
        predictions receiving lower label values.
        """
        labels = np.zeros(predicted_time.shape)
        predicted_time[predicted_time == 0] = np.inf

        if self._use_gaussian_to_dichotomize:
            glm = GaussianMixture(n_components=n_clusters)
            self._labels = glm.fit_predict(predicted_time.reshape(1, -1).T)
            self._glm = glm
            return self._labels

        for cluster in range(n_clusters):
            percentile = 100 * (1.0 - 1.0 / (cluster + 1.0))
            value = np.percentile(predicted_time, percentile)
            labels[predicted_time >= value] = n_clusters - cluster

        return labels

    def _get_proba_from_prediction(self,
                                   predicted_time,
                                   time_of_following=None):
        """Convert predictions into values scaled by a follow-up time.

        If a Gaussian mixture was fitted, its ``predict_proba`` is used.
        Otherwise the result is ``predicted_time / time_of_following``,
        with zero predictions replaced by ``time_of_following`` (so they
        map to 1). When ``time_of_following`` is falsy it defaults to the
        max plus the standard deviation of the non-zero predictions.
        """
        if self._glm is not None:
            return self._glm.predict_proba(predicted_time.reshape(1, -1).T)

        predicted_time = predicted_time.astype("float32")

        if not time_of_following:
            nonzero = predicted_time[predicted_time != 0]
            time_of_following = np.max(nonzero) + np.std(nonzero)

        predicted_time[predicted_time == 0] = time_of_following

        return predicted_time / time_of_following
# Standardize the features; columns with zero variance produce NaN and are
# set to 0.
normed_features = (feature_matrix - feature_means) / feature_stds
normed_features = normed_features.fillna(0.0)

# In[ ]:

from sksurv.datasets import get_x_y

# Join the response table to the normalized features on ``lab_id`` and
# split into features and the structured (vitalStatus, overallSurvival)
# target; 'Dead' marks an event.
full_dataset = pd.read_csv('training/response.csv').set_index('lab_id').join(
    normed_features)
X, Y = get_x_y(full_dataset, ['vitalStatus', 'overallSurvival'],
               pos_label='Dead')

# In[ ]:

from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# This package allows general elastic net tuning, but by setting
# l1_ratio=1, we restrict to LASSO.
# ``max_iter`` is an iteration count, so it is given as an int (was 3e5).
regr = CoxnetSurvivalAnalysis(l1_ratio=1, alpha_min_ratio=0.05, max_iter=300000)

# ``n_folds`` must match the number of CV splits, because it is the sample
# size of the standard error below. It used to be 10 while the CV ran with
# 5 splits.
n_folds = 5
alphas = np.logspace(-1.3, 0, num=100)

cv = KFold(n_splits=n_folds, shuffle=True, random_state=328)
gcv = GridSearchCV(regr, {"alphas": [[v] for v in alphas]}, cv=cv).fit(X, Y)

#In[ ]:

import matplotlib.pyplot as plt

scores = gcv.cv_results_['mean_test_score']
scores_std = gcv.cv_results_['std_test_score']
std_error = scores_std / np.sqrt(n_folds)

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)
ls='--', label=('Best alpha, CI = %0.3f' % gcv.best_score_)) plt.legend() plt.title('Cross Validation Concordance Index') def score_survival_model(model, X, y): prediction = model.predict(X) result = concordance_index_censored(y['vitalStatus'], y['overallSurvival'], prediction) return result[0] # In[ ]: # This package allows general elastic net tuning, but by setting l1_ratio = 1, we restrict to LASSO. regr = CoxnetSurvivalAnalysis(l1_ratio=0.8, alpha_min_ratio=0.1, max_iter=3e5) n_folds = 10 alphas = np.logspace(-1.3, 1.5, num=50) cv = KFold(n_splits=5, shuffle=True, random_state=0) gcv = GridSearchCV(regr, { "alphas": [[v] for v in alphas] }, cv=cv, n_jobs=-1).fit(X, Y) plot_gridcv_results(gcv, alphas) regr_best = CoxnetSurvivalAnalysis(alphas=gcv.best_params_["alphas"], l1_ratio=0.8, alpha_min_ratio=0.1, max_iter=3e5).fit(X, Y) y_regr = regr_best.predict(X_lb)
def train_cox(x, outer_split=leave_two_out, inner_split=leave_two_out,
              num_folds=None):
    """Nested cross-validation of a LASSO Cox model (l1_ratio=1).

    For every outer split the alpha path is taken from a fit on the outer
    training rows, each alpha is scored on the inner splits, the best alpha
    is selected, and a model refitted on the outer training rows predicts
    the outer test rows with that alpha.

    Parameters
    ----------
    x : table indexed with ``.iloc`` that has 'week' (time) and 'outcome'
        (event) columns; all other columns are used as features.
    outer_split, inner_split : callables invoked as
        ``split(frame, frame['outcome'], num_folds=num_folds)`` that yield
        ``(train_index, test_index)`` pairs.
    num_folds : forwarded to both split callables.

    Returns
    -------
    dict with keys 'score', 'model', 'hazards', 'event_times',
    'event_outcomes' and 'lambdas'.

    Raises
    ------
    ValueError
        If no inner fold could be evaluated for an outer fold, so no alpha
        can be selected.

    Notes
    -----
    Fixes relative to the previous implementation:
    * after a failed fit the alpha path is now really shortened on every
      retry (it used to be recomputed from the untrimmed path, which could
      loop forever);
    * inner scores are keyed by the alpha value, so 'best_lambda' is always
      an alpha and never a positional index passed to ``predict``;
    * only ``Exception`` is caught around fitting, so KeyboardInterrupt and
      SystemExit propagate.
    """
    if num_folds is None:
        print('none')
    else:
        print(num_folds)

    hazards = []
    event_times = []
    event_outcomes = []
    score_vec = []
    model_out_dict = {}
    lambda_dict = {}

    outer_folds = outer_split(x, x['outcome'], num_folds=num_folds)
    for ic_in, (train_index, test_index) in enumerate(outer_folds):
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        x_train_ = x_train.drop(['week', 'outcome'], axis=1)
        y_arr = _survival_array(x_train['outcome'], x_train['week'])

        inner_folds = inner_split(x_train, x_train['outcome'],
                                  num_folds=num_folds)

        # The candidate alpha path comes from a fit on the whole outer
        # training set.
        coxnet_pipe = CoxnetSurvivalAnalysis(l1_ratio=1,
                                             alpha_min_ratio=0.001,
                                             n_alphas=300)
        coxnet_pipe.fit(x_train_, y_arr)
        alphas = coxnet_pipe.alphas_

        model2 = CoxnetSurvivalAnalysis(l1_ratio=1)
        # All four dicts map alpha value -> {inner fold number -> data}.
        hazards_dict = {}
        e_times_dict = {}
        e_outcomes_dict = {}
        score_dict = {}

        for ic_in2, (train_ix, test_ix) in enumerate(inner_folds):
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]

            # Skip folds whose held-out rows repeat an (event, time) pair.
            y_test_arr = _survival_array(x_ts2['outcome'], x_ts2['week'])
            if len(np.unique(y_test_arr)) < len(test_ix):
                continue

            # A Cox model cannot be fitted without at least one event.
            outcome = x_tr2['outcome']
            if (outcome == 0).all():
                continue

            x_tr2_ = x_tr2.drop(['week', 'outcome'], axis=1)
            y_arr2 = _survival_array(outcome, x_tr2['week'])

            fitted_alphas = _fit_dropping_leading_alphas(
                model2, x_tr2_, y_arr2, alphas)
            if fitted_alphas is None:
                continue
            alphas = fitted_alphas

            x_ts2_ = x_ts2.drop(['week', 'outcome'], axis=1)
            for alpha in alphas:
                risk_scores = model2.predict(x_ts2_, alpha=alpha)
                hazards_dict.setdefault(alpha, {})[ic_in2] = risk_scores
                e_times_dict.setdefault(alpha, {})[ic_in2] = x_ts2['week']
                e_outcomes_dict.setdefault(alpha, {})[ic_in2] = \
                    x_ts2['outcome']
                fold_scores = score_dict.setdefault(alpha, {})
                if len(test_ix) >= 2:
                    fold_scores[ic_in2] = concordance_index_censored(
                        x_ts2['outcome'].astype(bool), x_ts2['week'],
                        risk_scores)[0]

        # Only alphas that survived trimming can be used by the final model.
        tested = [alpha for alpha in alphas if alpha in hazards_dict]
        if not tested:
            raise ValueError('no inner fold could be evaluated for outer '
                             'fold ' + str(ic_in))

        if all(score_dict[alpha] for alpha in tested):
            # Mean of the per-fold concordance indices.
            scores = {
                alpha: sum(score_dict[alpha].values()) / len(score_dict[alpha])
                for alpha in tested
            }
        else:
            # Folds too small to score on their own: pool all held-out
            # predictions per alpha and compute one concordance index.
            scores = {}
            for alpha in tested:
                scores[alpha] = concordance_index_censored(
                    np.concatenate(
                        list(e_outcomes_dict[alpha].values())).astype(bool),
                    np.concatenate(list(e_times_dict[alpha].values())),
                    np.concatenate(list(hazards_dict[alpha].values())))[0]

        lambdas, aucs_in = zip(*scores.items())
        best_lamb = lambdas[np.argmax(aucs_in)]

        # The three lists are stored by reference and keep growing as later
        # outer folds are processed (unchanged from the original behaviour).
        lambda_dict[ic_in] = {
            'best_lambda': best_lamb,
            'scores': scores,
            'event_outcomes': event_outcomes,
            'times': event_times,
            'hazards': hazards,
            'lambdas_tested': alphas
        }

        model_out = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=alphas)
        model_out.fit(x_train_, y_arr)
        risk_scores = model_out.predict(x_test.drop(['week', 'outcome'],
                                                    axis=1),
                                        alpha=best_lamb)
        hazards.append(risk_scores)
        event_times.append(x_test['week'])
        event_outcomes.append(x_test['outcome'])
        model_out_dict[ic_in] = model_out
        if len(test_index) > 1:
            score_vec.append(
                concordance_index_censored(x_test['outcome'].astype(bool),
                                           x_test['week'], risk_scores)[0])

    # As before, the size of the last outer test fold decides whether the
    # per-fold scores are averaged or all predictions are pooled.
    if len(test_index) > 1:
        score = sum(score_vec) / len(score_vec)
    else:
        score = concordance_index_censored(
            np.array(np.concatenate(event_outcomes)).astype(bool),
            np.array(np.concatenate(event_times)),
            np.array(np.concatenate(hazards)))[0]

    return {
        'score': score,
        'model': model_out_dict,
        'hazards': hazards,
        'event_times': event_times,
        'event_outcomes': event_outcomes,
        'lambdas': lambda_dict,
    }


def _survival_array(events, times):
    """Pack event flags and times into a structured (bool, float64) array."""
    return np.array(list(zip(events, times)),
                    dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])


def _fit_dropping_leading_alphas(model, features, target, alphas):
    """Fit ``model``, dropping the first alpha after every failed fit.

    Returns the alpha array the model was finally fitted with, or None when
    the path shrinks to two or fewer values without a successful fit.
    """
    remaining = alphas
    while True:
        model.set_params(alphas=remaining)
        try:
            model.fit(features, target)
        except Exception:
            print('removed alpha ' + str(remaining[0]))
            remaining = np.delete(remaining, 0)
            if len(remaining) <= 2:
                return None
        else:
            return remaining
def __init__(self, data):
    """Initialise the parent with ``data`` and attach a LASSO Cox model.

    The model is a CoxnetSurvivalAnalysis with l1_ratio=1.0 and an
    iteration cap of one million, stored on ``self.model``.
    """
    super().__init__(data)
    lasso_cox = CoxnetSurvivalAnalysis(max_iter=1_000_000, l1_ratio=1.0)
    self.model = lasso_cox
# Restrict both splits to the selected feature columns, then normalise them
# together with the project's preprocessing helper.
input_train = input_train[features]
input_test = input_test[features]
input_train, input_test = preprocessing.normalizing_input(
    input_train, input_test)
# Structured survival target built from the 'Event' and 'SurvivalTime'
# columns of output_train.
structured_y = Surv.from_dataframe('Event', 'SurvivalTime', output_train)

# Coxnet
# coxnet = CoxnetSurvivalAnalysis()
# print(cross_validate(coxnet, input_train, structured_y, cv=5))

# Randomised hyper-parameter search (not an exhaustive grid): 100 l1_ratio
# values times 20 n_alphas values give 2000 combinations, of which
# n_iter=1000 are sampled and scored with 5-fold cross-validation.
tuned_params = {
    "l1_ratio": np.linspace(0.01, 0.02, 100),
    "n_alphas": range(140, 160, 1),
}
grid_search = RandomizedSearchCV(CoxnetSurvivalAnalysis(), tuned_params, cv=5, n_jobs=4, n_iter=1000)
grid_search.fit(input_train, structured_y)
print(grid_search.best_score_)
best_params = grid_search.best_params_
print(best_params)


# Prediction
def predict(model, X, threshold=0.9):
    # NOTE(review): this definition continues beyond the visible chunk; how
    # `threshold` and `y_pred` are used cannot be seen from here.
    prediction = model.predict_survival_function(X)
    y_pred = []
    for pred in prediction:
# Convert the test labels into a structured (bool, float32) record array.
_test_l = numpy.array(list(_test_l), dtype='bool,f4')
'''plot some estimator stuff _event, _time = split_for_kaplan(_train_l, _train_d, 24) for i in range(0, len(_event)): x, y = kaplan_meier_estimator(_event[i], _time[i]) plt.step(x, y, where="post", label="CT_group= "+str(i)); plt.legend(); plt.plot(); plt.show();'''

# Build the elastic-net Cox model and fit it on the training split.
coxnet = CoxnetSurvivalAnalysis(
    n_alphas=100,
    l1_ratio=0.5,
    alpha_min_ratio=0.01,
    tol=0.1,
    fit_baseline_model=True,
)
clf = coxnet.fit(_train_d, _train_l)

# Separate the event flag and the event time of every test record.
event_indicator = []
event_time = []
for record in _test_l:
    event_indicator.append(record[0])
    event_time.append(record[1])

# Score the test split once for each alpha on the fitted path.
ccx = []
for val in clf.alphas_:
    res = clf.predict(_test_d, alpha=val)
    concordance = concordance_index_censored(event_indicator, event_time,
                                             estimate=res)
    ccx.append(concordance)

# Concordance index as a step curve over the alphas.
concordance_values = [entry[0] for entry in ccx]
plt.step(clf.alphas_, concordance_values, where="post")
plt.show()