def psurvival(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = []):
    """
    Fit a Cox proportional-hazards model for one feature and return its
    summary row.

    ``row`` is joined onto ``phenotype_df`` as an extra covariate, a patsy
    design matrix is built from that covariate, the duration/event columns
    and ``other_cols``, and a ``lifelines.CoxPHFitter`` is fitted on it.

    row: pandas Series of feature values; its (string) name becomes the
        covariate name. The Series itself is not modified.
    phenotype_df: DataFrame holding at least duration_col and event_col;
        joined with ``row`` on the index
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
        0 for no, 1 for yes
    other_cols: other variables to consider in the regression

    Returns the row of ``cph.summary`` belonging to the feature, labelled
    with the patsy-safe version of ``row.name``.
    """
    def _patsy_safe(name):
        # Spaces, dots and dashes would be parsed as operators by patsy.
        return name.replace(' ', '_').replace('.', '_').replace('-', '_')

    # Work on a local name instead of overwriting row.name, so the caller's
    # Series keeps its original label.
    feature_name = _patsy_safe(row.name)

    # join() returns a new frame, so the caller's phenotype_df is untouched.
    phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = _patsy_safe(duration_col)
    event_col = _patsy_safe(event_col)
    other_cols = [_patsy_safe(x) for x in (other_cols or [])]
    phenotype_df.columns = [_patsy_safe(x) for x in phenotype_df.columns]

    formula = ' + '.join([feature_name, duration_col, event_col] + other_cols)

    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    # CoxPHFitter must not be given the constant column patsy adds.
    X = X.drop(['Intercept'], axis = 1)

    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    return cph.summary.loc[feature_name]
def _cv_coxph_c(
    z,
    survival,
    penalty,
    duration_column="duration",
    observed_column="observed",
    cv_folds=5,
):
    """Return the per-fold concordance index of a penalised Cox model.

    ``survival`` and ``z`` are concatenated column-wise, rows containing
    missing values are dropped, and the result is split with
    ``KFold(cv_folds)``. For each split a ``CoxPHFitter(penalizer=penalty)``
    is fitted on the training part and scored on the held-out part.

    Returns a list with one concordance index per fold.
    Raises ImportError when ``lifelines`` is not installed.
    """
    try:
        import lifelines
        import lifelines.utils
    except ImportError:
        raise ImportError(
            "The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`."
        )

    fitter = lifelines.CoxPHFitter(penalizer=penalty)
    data = pd.concat([survival, z], axis=1, sort=False).dropna()

    def _fold_cindex(train_idx, test_idx):
        # Fit on the training rows, then score the held-out rows.
        train_part = data.iloc[train_idx]
        held_out = data.iloc[test_idx]
        fitter.fit(train_part, duration_column, observed_column)
        # Negated: a higher partial hazard should go with a shorter duration.
        risk = fitter.predict_partial_hazard(held_out)
        return lifelines.utils.concordance_index(
            held_out[duration_column],
            -risk,
            held_out[observed_column],
        )

    return [
        _fold_cindex(train_idx, test_idx)
        for train_idx, test_idx in KFold(cv_folds).split(data)
    ]
def train(self, trainMatrix):
    """Fit a Cox model on ``trainMatrix`` and cache the cumulative baseline hazard.

    Columns 3 onward of ``trainMatrix`` are treated as features and are
    mean-centred (the mean is stored in ``self._trainFMean``). Column 1 is
    passed to the fitter as the event column and column 2 as the duration
    column. ``self._bh`` ends up as an (n, 2) array holding the baseline
    hazard index and the running sum of the baseline hazard.
    """
    features = trainMatrix[:, 3:]
    self._trainFMean = features.mean(0)
    centred = features - self._trainFMean

    # Frame columns: 0 = event flag, 1 = duration, 2.. = centred features.
    frame = pd.DataFrame(
        np.concatenate((trainMatrix[:, 1:3], centred), axis=1))

    self._cf = ll.CoxPHFitter()
    self._cf.fit(frame, 1, event_col=0)

    baseline = self._cf.baseline_hazard_
    n_rows = np.shape(baseline)[0]
    self._bh = np.zeros((n_rows, 2))
    self._bh[:, 0] = np.asarray(list(baseline.index))
    self._bh[:, 1] = np.cumsum(np.asarray(baseline))
def _cph_coefs(z, survival, duration_column, observed_column, penalizer=0):
    """Fit one univariate CPH model per latent factor (column) of ``z``.

    Each column of ``z`` is attached to ``survival`` under the name ``LF``,
    rows with missing values are dropped, and a Cox model is fitted.

    Returns a DataFrame with one column per factor holding that factor's
    summary row (beta values, p values, confidence intervals).
    Raises ImportError when ``lifelines`` is not installed.
    """
    try:
        import lifelines
    except ImportError:
        raise ImportError('The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`.')

    per_factor = []
    for factor in z.columns:
        frame = survival.assign(LF=z.loc[:, factor]).dropna()
        model = lifelines.CoxPHFitter(penalizer=penalizer)
        model.fit(frame, duration_column, observed_column)
        per_factor.append(model.summary.loc['LF'].rename(factor))
    return pd.concat(per_factor, axis=1)
def fit_model():
    """Load the data set, fit a clustered Cox model, print and return it.

    Reads the frame through ``get_df(inpath, filename)`` and removes the
    'Concrete' and 'Urban' columns in place before fitting with 'duration'
    as the duration column, 'degraded_obs' as the event column and 'id' as
    the cluster column.
    """
    frame = get_df(inpath, filename)
    # dropping columns for reference categories
    frame.drop(['Concrete', 'Urban'], axis=1, inplace=True)

    fitted = ll.CoxPHFitter().fit(
        frame,
        duration_col='duration',
        event_col='degraded_obs',
        cluster_col='id',
        show_progress=True,
    )
    pprint(fitted.summary)
    return fitted
def _cv_coxph_c(z, survival, penalty, duration_column='duration', observed_column='observed', cv_folds=5):
    """Cross-validate a penalised Cox model with lifelines' own k-fold helper.

    ``survival`` and ``z`` are concatenated column-wise and rows containing
    missing values are dropped before the frame is handed to
    ``lifelines.utils.k_fold_cross_validation`` with ``k=cv_folds``.

    Returns whatever that helper returns (one score per fold).
    Raises ImportError when ``lifelines`` is not installed.
    """
    try:
        import lifelines
        import lifelines.utils
    except ImportError:
        raise ImportError('The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`.')

    model = lifelines.CoxPHFitter(penalizer=penalty)
    complete_rows = pd.concat([survival, z], axis=1, sort=False).dropna()
    return lifelines.utils.k_fold_cross_validation(
        model,
        complete_rows,
        duration_column,
        event_col=observed_column,
        k=cv_folds,
    )
def coxph_model(formula, data, time_col, event_col, **kwargs):
    """Fit a Cox proportional-hazards model from a patsy formula.

    The design matrix built from ``formula`` is joined with the
    ``time_col`` and ``event_col`` columns of ``data``, and the patsy
    'Intercept' column is removed before fitting.

    :param formula: patsy formula describing the covariates
    :param data: DataFrame containing the covariates, ``time_col`` and
        ``event_col``
    :param time_col: name of the duration column
    :param event_col: name of the event-indicator column
    :param kwargs: forwarded to ``ll.CoxPHFitter``; ``penalizer`` defaults
        to 0.1 and ``normalize`` to False when the caller does not supply
        them
    :return: the fitted ``CoxPHFitter`` (its summary is also printed)
    """
    # pylint: disable=no-member
    # pylint gets confused by dmatrix
    design = patsy.dmatrix(formula, data=data, return_type="dataframe")
    sdata = design.join(data[[time_col, event_col]])
    # .ix has been removed from pandas; a boolean column mask works with .loc.
    sdata = sdata.loc[:, sdata.columns != "Intercept"]

    # kwargs is a dict, so hasattr() never sees its keys; the old check
    # always overwrote caller-supplied values. setdefault only fills gaps.
    kwargs.setdefault("penalizer", 0.1)
    kwargs.setdefault("normalize", False)

    cf = ll.CoxPHFitter(**kwargs)
    cf.fit(sdata, time_col, event_col)
    cf.print_summary()
    return cf
def survival_npcs(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = []):
    """
    Fit a Cox proportional-hazards model on several components of one
    feature and return their summary rows.

    Every element of ``row`` is a sequence of component values. The
    sequences are expanded into columns named ``<row.name>_pc1``,
    ``<row.name>_pc2``, ... which are joined onto ``phenotype_df`` and used
    as covariates together with ``other_cols``.

    row: pandas Series whose elements are equally long sequences; its
        (string) name prefixes the component columns. The Series itself is
        not modified.
    phenotype_df: DataFrame holding at least duration_col and event_col;
        joined with the component columns on the index
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
        0 for no, 1 for yes
    other_cols: other variables to consider in the regression

    Returns the rows of ``cph.summary`` belonging to the component columns.
    """
    def _patsy_safe(name):
        # Spaces, dots and dashes would be parsed as operators by patsy.
        return name.replace(' ', '_').replace('.', '_').replace('-', '_')

    # Work on a local name instead of overwriting row.name, so the caller's
    # Series keeps its original label.
    feature_name = _patsy_safe(row.name)

    # .iloc[0] always means "the first element"; plain row[0] would be a
    # label lookup on an integer index and a positional one otherwise.
    n_components = len(row.iloc[0])
    columns_names = [feature_name + '_pc' + str(n + 1) for n in range(n_components)]

    row_npcs = pd.DataFrame(row.tolist(), index = row.index)
    row_npcs.columns = columns_names

    # join() returns a new frame, so the caller's phenotype_df is untouched.
    phenotype_df = phenotype_df.join(row_npcs.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = _patsy_safe(duration_col)
    event_col = _patsy_safe(event_col)
    other_cols = [_patsy_safe(x) for x in (other_cols or [])]
    phenotype_df.columns = [_patsy_safe(x) for x in phenotype_df.columns]

    formula = ' + '.join(columns_names + [duration_col, event_col] + other_cols)

    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    # CoxPHFitter must not be given the constant column patsy adds.
    X = X.drop(['Intercept'], axis = 1)

    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    return cph.summary.loc[columns_names]
def test_fit_kwargs(self):
    """Check that ``fit_kwargs`` reaches the underlying survival model.

    Fitting without ``fit_kwargs`` must emit a lifelines StatisticalWarning;
    fitting with ``fit_kwargs={'robust': True}`` must not.
    """
    statistical_warning = lifelines.exceptions.StatisticalWarning
    propensity_model = IPW(learner=LogisticRegression(max_iter=1000))
    model = WeightedStandardizedSurvival(
        survival_model=lifelines.CoxPHFitter(),
        weight_model=propensity_model,
    )
    fit_args = (self.X, self.a, self.t, self.y)

    # Without fit_kwargs - should raise StatisticalWarning with a suggestion
    # to pass robust=True in fit
    with self.assertWarns(statistical_warning):
        model.fit(*fit_args)

    # With fit_kwargs - should not raise StatisticalWarning (might raise
    # other warnings, though). There is no assertNotWarns, so the inner
    # assertWarns is expected to fail with an AssertionError.
    with self.assertRaises(AssertionError):
        with self.assertWarns(statistical_warning):
            model.fit(*fit_args, fit_kwargs={'robust': True})
def coxph(self, **kwargs):
    """
    CoxPH plot using baidutongji all_source dataframe as input.

    Cells of ``self.data_frame`` whose string form contains no number are
    replaced by NaN, the optional ``exclude`` columns are dropped, rows
    with any NaN are removed, and a Cox model with 'avg_visit_time' as
    duration column is fitted and plotted.

    :param kwargs:
        title -- plot title (required)
        path -- file the figure is saved to (required)
        exclude -- optional column label(s) to drop before fitting
    :return: None; the figure is written to ``path`` and all figures are
        closed afterwards
    """
    title = kwargs['title']
    path = kwargs['path']
    # Optional: a missing or empty 'exclude' simply keeps every column.
    exclude = kwargs.get('exclude')

    # Raw string so the backslash reaches the regex engine untouched.
    number_pattern = re.compile(r"[-+]?[0-9]*\.?[0-9]+")
    df = self.data_frame.applymap(
        lambda x: x if number_pattern.search(str(x)) else np.nan)

    if exclude:
        df = df.drop(exclude, axis=1)
    # Bound in every path now; previously df only existed when 'exclude'
    # was truthy, so the fit below failed with a NameError otherwise.
    df = df.dropna(how='any')

    _, ax = plt.subplots()
    cph = lifelines.CoxPHFitter()
    cph.fit(df, 'avg_visit_time')
    cph.plot(hazard_ratios=True, ax=ax)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path)
    plt.close('all')
def get_hazard_ratios(df_test):
    """Fit a Cox model on ``df_test`` and return its summary table.

    The module-level names ``TIME`` and ``OBSERVED`` select the duration
    and event columns.
    """
    fitted = lifelines.CoxPHFitter().fit(
        df_test,
        duration_col=TIME,
        event_col=OBSERVED,
    )
    return fitted.summary
def test_cox(self):
    """Smoke test: StandardizedSurvival fits and estimates with a Cox model."""
    model = StandardizedSurvival(survival_model=lifelines.CoxPHFitter())
    fixture = (self.X, self.a, self.t, self.y)

    model.fit(*fixture)
    # Only checks that the call completes; the outcome itself is discarded.
    model.estimate_population_outcome(*fixture)
def plot_single_SVR(prediction, mutation_data, label_type, survival=False, show_plots=False, alpha=0.95):
    """Score stored regression predictions against the true labels.

    Args:
        prediction: a pandas DataFrame, or a path to a file readable with
            ``pd.read_hdf``. Only the entry under its first key is used;
            that entry must provide 'classifiers', 'X_test', 'Y_test',
            'X_train' and 'patient_ID_test'.
        mutation_data: a dict with 'patient_IDs' and 'mutation_label', or a
            path that is loaded through ``gp.load_mutation_status``.
        label_type: name of the label to evaluate.
        survival: when true, the 'E' (event) and 'T' (time) labels are used
            as well to compute a concordance index and a Cox model on the
            predictions.
        show_plots: currently unused (plotting is still a TODO).
        alpha: passed on to ``compute_CI.compute_confidence``.

    Returns:
        dict mapping metric names to stringified values (single values for
        one cross-validation, confidence intervals otherwise), plus
        "Patient_MSE": a list of (patient ID, mean squared error) tuples
        sorted by ascending error.

    NOTE: this block uses Python 2 syntax (print statements, iteritems).
    """
    # Accept a file path instead of an already loaded frame.
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVRs = list()
    label = keys[0]
    SVRs = prediction[label]['classifiers']

    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    # NOTE(review): named Y_train but read from the 'X_train' entry; it is
    # only used below for its length (N_1) - confirm this is intended.
    Y_train = prediction[label]['X_train']

    if survival:
        # Also extract time to event and if event occurs from mutation data
        labels = [[label_type], ['E'], ['T']]
    else:
        labels = [[label_type]]

    # Accept a file path instead of an already loaded label dict.
    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            mutation_data = gp.load_mutation_status(mutation_data, labels)

    patient_IDs = mutation_data['patient_IDs']
    mutation_label = mutation_data['mutation_label']

    # Initialize scoring metrics
    r2score = list()
    MSE = list()
    coefICC = list()
    PearsonC = list()
    PearsonP = list()
    SpearmanC = list()
    SpearmanP = list()

    if survival:
        cindex = list()
        coxp = list()
        coxcoef = list()

    # Squared errors per patient ID, collected over all cross-validations.
    patient_MSE = dict()

    # One iteration per cross-validation split.
    for i in range(0, len(Y_test)):
        test_patient_IDs = prediction[label]['patient_ID_test'][i]

        # FIXME: Put some wrong patient IDs in test files
        # Strip the 'features_' prefix (9 characters) and the '__tpl.hdf5'
        # suffix (10 characters) when present.
        for num in range(0, len(test_patient_IDs)):
            if 'features_' in test_patient_IDs[num]:
                test_patient_IDs[num] = test_patient_IDs[num][9::]

            if '__tpl.hdf5' in test_patient_IDs[num]:
                test_patient_IDs[num] = test_patient_IDs[num][0:-10]

        test_patient_IDs = np.asarray(test_patient_IDs)

        X_temp = X_test[i]

        # Position of every test patient in the label data.
        test_indices = list()
        for i_ID in test_patient_IDs:
            # FIXME: Error in specific study
            if i_ID == '112_recurrence-preop':
                i_ID = '112_recurrence_preop'
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

        y_truth = [mutation_label[0][k][0] for k in test_indices]

        # Either one estimator per split or a single shared estimator.
        if type(SVRs) == list or type(SVRs) == tuple:
            estimator = SVRs[i]
        else:
            estimator = SVRs

        scaler = estimator.best_scaler
        # Fall back to the unscaled features when scaling/predicting raises
        # a ValueError.
        try:
            y_prediction = estimator.predict(scaler.transform(X_temp))
        except ValueError:
            y_prediction = estimator.predict(X_temp)

        y_truth = np.asarray(y_truth)

        # if survival:
        #     # Normalize the scores
        #     y_prediction = np.subtract(1.01, np.divide(y_prediction, np.max(y_prediction)))

        print "Truth: ", y_truth
        print "Prediction: ", y_prediction

        # Compute error per patient
        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction, test_patient_IDs):
            if i_test_ID not in patient_MSE.keys():
                patient_MSE[i_test_ID] = list()
            patient_MSE[i_test_ID].append((i_truth - i_predict)**2)

        # Compute evaluation metrics
        r2score.append(r2_score(y_truth, y_prediction))
        MSE.append(mean_squared_error(y_truth, y_prediction))
        coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
        C = pearsonr(y_prediction, y_truth)
        PearsonC.append(C[0])
        PearsonP.append(C[1])
        C = spearmanr(y_prediction, y_truth)
        SpearmanC.append(C.correlation)
        SpearmanP.append(C.pvalue)

        if survival:
            # Extract time to event and event from label data
            E_truth = np.asarray(
                [mutation_label[1][k][0] for k in test_indices])
            T_truth = np.asarray(
                [mutation_label[2][k][0] for k in test_indices])

            # Concordance index
            # Stored as one minus the value returned by lifelines.
            cindex.append(
                1 - ll.utils.concordance_index(T_truth, y_prediction, E_truth))

            # Fit Cox model using SVR output, time to event and event
            data = {'predict': y_prediction, 'E': E_truth, 'T': T_truth}
            data = pd.DataFrame(data=data, index=test_patient_IDs)

            cph = ll.CoxPHFitter()
            cph.fit(data, duration_col='T', event_col='E')

            coxcoef.append(cph.summary['coef']['predict'])
            coxp.append(cph.summary['p']['predict'])

    # Compute confidence intervals for given metrics
    # Sample sizes are taken from the first cross-validation split.
    N_1 = float(len(Y_train[0]))
    N_2 = float(len(Y_test[0]))

    if len(r2score) == 1:
        # No confidence intervals, just take the scores
        stats = dict()
        stats["r2_score:"] = str(r2score[0])
        stats["MSE:"] = str(MSE[0])
        stats["ICC:"] = str(coefICC[0])
        stats["PearsonC:"] = str(PearsonC[0])
        stats["SpearmanC: "] = str(SpearmanC[0])
        stats["PearsonP:"] = str(PearsonP[0])
        stats["SpearmanP: "] = str(SpearmanP[0])

        if survival:
            stats["Concordance:"] = str(cindex[0])
            stats["Cox coef.:"] = str(coxcoef[0])
            stats["Cox p:"] = str(coxp[0])
    else:
        # Compute confidence intervals from cross validations
        stats = dict()
        stats["r2_score 95%:"] = str(
            compute_CI.compute_confidence(r2score, N_1, N_2, alpha))
        stats["MSE 95%:"] = str(
            compute_CI.compute_confidence(MSE, N_1, N_2, alpha))
        stats["ICC 95%:"] = str(
            compute_CI.compute_confidence(coefICC, N_1, N_2, alpha))
        stats["PearsonC 95%:"] = str(
            compute_CI.compute_confidence(PearsonC, N_1, N_2, alpha))
        stats["SpearmanC 95%: "] = str(
            compute_CI.compute_confidence(SpearmanC, N_1, N_2, alpha))
        stats["PearsonP 95%:"] = str(
            compute_CI.compute_confidence(PearsonP, N_1, N_2, alpha))
        stats["SpearmanP 95%: "] = str(
            compute_CI.compute_confidence(SpearmanP, N_1, N_2, alpha))

        if survival:
            stats["Concordance 95%:"] = str(
                compute_CI.compute_confidence(cindex, N_1, N_2, alpha))
            stats["Cox coef. 95%:"] = str(
                compute_CI.compute_confidence(coxcoef, N_1, N_2, alpha))
            stats["Cox p 95%:"] = str(
                compute_CI.compute_confidence(coxp, N_1, N_2, alpha))

    for k, v in stats.iteritems():
        print k, v

    # Calculate and sort individual patient MSE
    patient_MSE = {k: np.mean(v) for k, v in patient_MSE.iteritems()}
    order = np.argsort(patient_MSE.values())
    sortedkeys = np.asarray(patient_MSE.keys())[order].tolist()
    sortedvalues = np.asarray(patient_MSE.values())[order].tolist()
    patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

    for p in patient_MSE:
        print p[0], p[1]

    stats["Patient_MSE"] = patient_MSE

    if show_plots:
        # TODO: Plot metrics, see also plot_SVM
        pass

    return stats
def readcrossval(feat_m1, config, sinkfolder, patientinfo, outputfolder, feat_m2=None, feat_m3=None, alpha=0.95, label_type=None, survival=False, n_classifiers=[1, 5, 10]):
    """Re-evaluate stored cross-validation results and save the statistics.

    Every file matching ``sinkfolder + 'RS*.hdf5'`` is read, the top-ranked
    parameter settings of the stored search object are refitted on the
    stored training data, and the stored test set is predicted. Metrics are
    accumulated over all files. Per-patient predictions are written to
    ``scores.csv`` and statistics plus parameter usage to
    ``performance_<n>.json``, both in ``outputfolder``.

    Args:
        feat_m1: feature files; when truthy the labels are loaded through
            ``readdata`` and ``feat_m1[0]`` is read for its feature labels,
            otherwise the labels come from ``findmutationdata``.
        config: passed to ``config_io.load_config``.
        sinkfolder: prefix that is concatenated directly with 'RS*.hdf5',
            so it needs to end with a path separator.
        patientinfo: passed on to the label-loading helper.
        outputfolder: directory receiving the CSV and JSON output.
        feat_m2, feat_m3: passed on to ``readdata``.
        alpha: passed on to ``CI``.
        label_type: label to evaluate; defaults to
            ``config['Genetics']['mutation_type']``.
        survival: when true, also compute a concordance index and a Cox
            model from the 'E' and 'T' labels.
        n_classifiers: NOTE(review): currently ignored, it is overwritten
            with ``[1]`` on the first line of the body.

    Returns:
        None.

    NOTE: this block uses Python 2 syntax and semantics (print statements,
    iteritems, list-returning keys()/values()/zip()).
    """
    # n_classifiers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20 ,25, 30, 40 , 50]
    n_classifiers = [1]

    config = config_io.load_config(config)
    sinks = glob.glob(sinkfolder + 'RS*.hdf5')

    # Sort sinks based on modification time (os.path.getmtime)
    sinktimes = [os.path.getmtime(f) for f in sinks]
    sinks = [s for _, s in sorted(zip(sinktimes, sinks))]

    if label_type is None:
        label_type = config['Genetics']['mutation_type']

    if survival:
        # Also extract time to event and if event occurs from mutation data
        # NOTE(review): label_type is not wrapped in a list here, unlike in
        # the else branch below - confirm which form the loaders expect.
        labels = [label_type, ['E'], ['T']]
    else:
        labels = [[label_type]]

    if feat_m1:
        label_data, _ =\
            readdata(feat_m1, feat_m2, feat_m3, patientinfo, labels)
    else:
        # No feature files found
        label_data, _ = findmutationdata(patientinfo, labels)

    for n_class in n_classifiers:
        output_json = os.path.join(
            outputfolder, ('performance_{}.json').format(str(n_class)))

        # Classification metrics, one entry per cross-validation file.
        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()
        patient_classification_list = dict()

        patient_IDs = label_data['patient_IDs']
        mutation_label = label_data['mutation_label']

        trained_classifiers = list()

        # NOTE(review): y_test and pid_test are never filled in the visible
        # code (the extend calls further down are commented out), so the
        # posterior gathering near the end iterates over nothing.
        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # For SVR
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

        if survival:
            cindex = list()
            coxp = list()
            coxcoef = list()

        # NOTE(review): never filled in this function, so "Patient_MSE" in
        # the regression statistics is always empty.
        patient_MSE = dict()

        csvfile = os.path.join(outputfolder, 'scores.csv')
        towrite = list()

        # One (initially empty) score slot per patient, in sorted ID order.
        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        towrite.append(["Patient"] + empty_scores.keys())
        params = dict()
        for num, s in enumerate(sinks):
            scores = empty_scores.copy()
            print("Processing {} / {}.").format(str(num + 1), str(len(sinks)))
            with open(s, 'r') as fp:
                sr = pd.read_hdf(fp)
            sr = sr['Constructed crossvalidation']
            t = sr.trained_classifier
            trained_classifiers.append(sr.trained_classifier)

            # Extract test info
            test_patient_IDs = sr.patient_ID_test
            X_test = sr.X_test
            Y_test = sr.Y_test

            # Extract sample size
            N_1 = float(len(sr.patient_ID_train))
            N_2 = float(len(sr.patient_ID_test))

            # Position of every test patient in the label data; also count
            # how often each patient ends up in a test set.
            test_indices = list()
            for i_ID in test_patient_IDs:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            y_truth = [mutation_label[0][k][0] for k in test_indices]

            # Predict using the top N classifiers
            results = t.cv_results_['rank_test_score']
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            feature_labels = pd.read_hdf(feat_m1[0]).feature_labels
            base_estimator = t.estimator
            X_train = [(x, feature_labels) for x in sr.X_train]
            y_train = sr.Y_train
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = t.scorer_
            # The same index set is passed as both train and test split, so
            # fit_and_score works on the complete training data.
            train = np.asarray(range(0, len(y_train)))
            test = train
            del sr  # Save some memory
            # cv_iter = list(t.cv.iter(X_train, y_train))

            # NOTE: need to build this in the SearchCVFastr Object
            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.").format(
                    str(i + 1), str(n_class))
                X_testtemp = X_test[:]

                # Get the parameters from the index
                parameters_est = t.cv_results_['params'][index]
                parameters_all = t.cv_results_['params_all'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                best_estimator = clone(base_estimator).set_params(
                    **parameters_est)

                ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                    fit_and_score(best_estimator, X_train, y_train, scorer, train, test, True, parameters_all, t.fit_params, t.return_train_score, True, True, True, t.error_score)

                # Apply whichever preprocessing steps fit_and_score returned
                # to both the training and the test features.
                X = [x[0] for x in X_train]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                if y_train is not None:
                    best_estimator.fit(X, y_train, **t.fit_params)
                else:
                    best_estimator.fit(X, **t.fit_params)

                # Predict the posteriors using the fitted classifier for the training set
                print("Evaluating performance on training set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X)
                    y_train_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X)
                    y_train_prediction[i, :] = probabilities[:]

                # Predict the posteriors using the fitted classifier for the test set
                print("Evaluating performance on test set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X_testtemp)
                    y_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X_testtemp)
                    y_prediction[i, :] = probabilities[:]

                if type(t.estimator) == sklearn.svm.classes.SVC:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)
                else:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)[:, 0]

                # Add number parameter settings
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, SelectModel, scaler, parameters_est, parameters_all, probabilities

            # Take mean over posteriors of top n
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is best way to compute AUC
            # This replaces the decision-function scores collected above.
            y_score = y_prediction_m

            if type(t.estimator) == sklearn.svm.classes.SVC:
                # Look for optimal F1 performance on training set
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                # NOTE(review): this loop rebinds `t`, which until here held
                # the trained search object; the later `t.estimator`
                # look-ups then see a threshold value instead - confirm and
                # rename the loop variable.
                for t in thresholds:
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > t:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0

                    f1_scores.append(
                        f1_score(y_train_prediction, y_train, average='weighted'))

                # Use best threshold to determine test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                # The threshold found above is immediately replaced by a
                # fixed 0.5, so the F1 search has no effect on the result.
                best_thresh = 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)

                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                # Regression: use the averaged prediction, clipped to [0, 1].
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            print "Truth: ", y_truth
            print "Prediction: ", y_prediction

            for k, v in zip(test_patient_IDs, y_prediction):
                scores[k] = v

            # for k, v in scores.iteritems():
            #     print k, v
            #
            # raise IOError
            # NOTE(review): str() without an argument is the empty string,
            # so every row is labelled "Iteration " - presumably the
            # iteration number was meant.
            towrite.append(["Iteration " + str()] + scores.values())

            if type(t.estimator) == sklearn.svm.classes.SVC:
                for i_truth, i_predict, i_test_ID in zip(
                        y_truth, y_prediction, test_patient_IDs):
                    if i_truth == i_predict:
                        patient_classification_list[i_test_ID][
                            'N_correct'] += 1
                    else:
                        patient_classification_list[i_test_ID]['N_wrong'] += 1

            if type(t.estimator) == sklearn.svm.classes.SVC:
                c_mat = confusion_matrix(y_truth, y_prediction)
                TN = c_mat[0, 0]
                FN = c_mat[1, 0]
                TP = c_mat[1, 1]
                FP = c_mat[0, 1]

                # Each ratio is set to 0 when its denominator would be 0.
                if FN == 0 and TP == 0:
                    sensitivity.append(0)
                else:
                    sensitivity.append(float(TP) / (TP + FN))
                if FP == 0 and TN == 0:
                    specificity.append(0)
                else:
                    specificity.append(float(TN) / (FP + TN))
                if TP == 0 and FP == 0:
                    precision.append(0)
                else:
                    precision.append(float(TP) / (TP + FP))
                accuracy.append(accuracy_score(y_truth, y_prediction))
                # y_score = t.decision_function(X_temp)
                auc.append(roc_auc_score(y_truth, y_score))
                f1_score_list.append(
                    f1_score(y_truth, y_prediction, average='weighted'))

            # elif type(t.estimator) == sklearn.svm.classes.SVR:
            else:
                # y_score.extend(svm[k].ix('svms')[0].predict_proba(X_test))
                # y_predict.extend(svm[k].ix('svms')[0].predict(X_test))
                # y_test.extend(Y_test)
                # pid_test.extend(pidt)
                r2score.append(r2_score(y_truth, y_prediction))
                MSE.append(mean_squared_error(y_truth, y_prediction))
                coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
                C = pearsonr(y_prediction, y_truth)
                PearsonC.append(C[0])
                PearsonP.append(C[1])
                C = spearmanr(y_prediction, y_truth)
                SpearmanC.append(C.correlation)
                SpearmanP.append(C.pvalue)

                if survival:
                    # Extract time to event and event from label data
                    E_truth = np.asarray(
                        [mutation_label[1][k][0] for k in test_indices])
                    T_truth = np.asarray(
                        [mutation_label[2][k][0] for k in test_indices])

                    # Concordance index
                    # Stored as one minus the value returned by lifelines.
                    cindex.append(1 - ll.utils.concordance_index(
                        T_truth, y_prediction, E_truth))

                    # Fit Cox model using SVR output, time to event and event
                    data = {
                        'predict': y_prediction,
                        'E': E_truth,
                        'T': T_truth
                    }
                    data = pd.DataFrame(data=data, index=test_patient_IDs)

                    # When the fit fails, placeholder values (coefficient 1,
                    # p-value 0) are recorded instead.
                    try:
                        cph = ll.CoxPHFitter()
                        cph.fit(data, duration_col='T', event_col='E')

                        coxcoef.append(cph.summary['coef']['predict'])
                        coxp.append(cph.summary['p']['predict'])
                    except ValueError:
                        # Convergence halted, delta contains nan values?
                        coxcoef.append(1)
                        coxp.append(0)
                    except np.linalg.LinAlgError:
                        # FIXME: Singular matrix
                        coxcoef.append(1)
                        coxp.append(0)

        # Transpose so that every CSV row holds one patient and every
        # column one iteration.
        towrite = zip(*towrite)
        with open(csvfile, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for w in towrite:
                writer.writerow(w)

        # print(N_1)
        # print(N_2)
        # N_1 / N_2 below are the sample sizes of the last file processed.
        if type(t.estimator) == sklearn.svm.classes.SVC:
            N_iterations = len(sinks)
            accuracy_mean = np.mean(accuracy)
            S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
                (accuracy_mean - accuracy)**2.0)

            # print Y_test
            # accuracy_var is computed but not used further in this function.
            accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
            # print(accuracy_var)
            # print(np.sqrt(1/N_iterations*S_uj))
            # print(st.sem(accuracy))
            stats = dict()
            stats["Accuracy 95%:"] = str(CI(accuracy, N_1, N_2, alpha))
            stats["AUC 95%:"] = str(CI(auc, N_1, N_2, alpha))
            stats["F1-score 95%:"] = str(CI(f1_score_list, N_1, N_2, alpha))
            stats["Precision 95%:"] = str(CI(precision, N_1, N_2, alpha))
            stats["Sensitivity 95%: "] = str(CI(sensitivity, N_1, N_2, alpha))
            stats["Specificity 95%:"] = str(CI(specificity, N_1, N_2, alpha))

            print("Accuracy 95%:" + str(CI(accuracy, N_1, N_2, alpha)))
            print("AUC 95%:" + str(CI(auc, N_1, N_2, alpha)))
            print("F1-score 95%:" + str(CI(f1_score_list, N_1, N_2, alpha)))
            print("Precision 95%:" + str(CI(precision, N_1, N_2, alpha)))
            print("Sensitivity 95%: " + str(CI(sensitivity, N_1, N_2, alpha)))
            print("Specificity 95%:" + str(CI(specificity, N_1, N_2, alpha)))

            # Patients that were classified correctly (or wrongly) in every
            # test set they appeared in, mapped to their true label.
            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID][
                    'N_correct'] / float(
                        patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append(('{} ({})').format(i_ID, label))
                    print(("Always Right: {}, label {}").format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append(('{} ({})').format(i_ID, label))
                    print(("Always Wrong: {}, label {}").format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong

            # Gather all scores for all patients and average
            # `posteriors` is built here but not stored or returned.
            pid_unique = list(set(pid_test))
            pid_unique = sorted(pid_unique)
            posteriors = dict()
            for pid in pid_unique:
                posteriors[pid] = list()
                counts = 0
                for num, allid in enumerate(pid_test):
                    if allid == pid:
                        counts += 1
                        posteriors[pid].append(y_score[num][0])
                        truelabel = y_test[num]
                posteriors[pid] = [np.mean(posteriors[pid]), truelabel, counts]

        # elif type(t.estimator) == sklearn.svm.classes.SVR:
        else:
            # Compute confidence intervals from cross validations
            stats = dict()
            stats["r2_score 95%:"] = str(CI(r2score, N_1, N_2, alpha))
            stats["MSE 95%:"] = str(CI(MSE, N_1, N_2, alpha))
            stats["ICC 95%:"] = str(CI(coefICC, N_1, N_2, alpha))
            stats["PearsonC 95%:"] = str(CI(PearsonC, N_1, N_2, alpha))
            stats["SpearmanC 95%: "] = str(CI(SpearmanC, N_1, N_2, alpha))
            stats["PearsonP 95%:"] = str(CI(PearsonP, N_1, N_2, alpha))
            stats["SpearmanP 95%: "] = str(CI(SpearmanP, N_1, N_2, alpha))

            if survival:
                stats["Concordance 95%:"] = str(CI(cindex, N_1, N_2, alpha))
                stats["Cox coef. 95%:"] = str(CI(coxcoef, N_1, N_2, alpha))
                stats["Cox p 95%:"] = str(CI(coxp, N_1, N_2, alpha))

            # Calculate and sort individual patient MSE
            patient_MSE = {k: np.mean(v) for k, v in patient_MSE.iteritems()}
            order = np.argsort(patient_MSE.values())
            sortedkeys = np.asarray(patient_MSE.keys())[order].tolist()
            sortedvalues = np.asarray(patient_MSE.values())[order].tolist()
            patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

            for p in patient_MSE:
                print p[0], p[1]

            stats["Patient_MSE"] = patient_MSE

        for k, v in stats.iteritems():
            print k, v

        # Check which parameters were most often used
        params = paracheck(params)
        # params = dict()
        # for num, classf in enumerate(trained_classifiers):
        #     params_temp = classf.best_params_
        #     if num == 0:
        #         for k in params_temp.keys():
        #             params[k] = list()
        #             params[k].append(params_temp[k])
        #     else:
        #         for k in params_temp.keys():
        #             params[k].append(params_temp[k])
        #
        # print params
        # # Make histograms or box plots of params
        # for k in params.keys():
        #     para = params[k]
        #     print k
        #     if type(para[0]) is unicode:
        #         letter_counts = Counter(para)
        #         values = letter_counts.values()
        #         keys = letter_counts.keys()
        #         print keys, values
        #         plt.bar(range(len(values)), values, align='center')
        #         plt.xticks(range(len(keys)), keys)
        #         plt.show()
        #     else:
        #         # Make a standard boxplot
        #         plt.figure()
        #         plt.boxplot(para, 0, 'gD')
        #         plt.show()

        # Save output
        savedict = dict()
        savedict["Statistics"] = stats
        savedict['Parameters'] = params

        if type(output_json) is list:
            output_json = ''.join(output_json)

        # Create the output directory on first use.
        if not os.path.exists(os.path.dirname(output_json)):
            os.makedirs(os.path.dirname(output_json))

        with open(output_json, 'w') as fp:
            json.dump(savedict, fp, indent=4)

        print("Saved data!")
def _run(self):
    """Fit a Cox model on ``self.df`` and keep the fitter in ``self._cf``.

    ``self.survival_col`` is used as the duration column and
    ``self.cens_col`` as the event column.
    """
    fitter = lifelines.CoxPHFitter()
    self._cf = fitter
    fitter.fit(
        self.df,
        self.survival_col,
        event_col=self.cens_col,
        include_likelihood=True,
    )
def time_between(r):
    """Return the observed duration of one loan row.

    The duration runs from 'Funded Date' to 'Loan Paid In Full Date'; when
    no pay-off date is recorded, 'Loan Maturity Date' is used as the end.
    """
    end_date = r['Loan Paid In Full Date']
    # Check only the field that matters instead of null-testing the whole row.
    if pd.isnull(end_date):
        end_date = r['Loan Maturity Date']
    return end_date - r['Funded Date']
    #if r.isnull()['Charge Off Date']:
    #    return r['Loan Maturity Date'] - r['Funded Date']
    #else:
    #    return r['Charge Off Date'] - r['Funded Date']


# Duration in whole days and event indicator for the survival model.
T = df.apply(time_between, axis=1).dt.days
E = df['Loan Status']

# survival analysis
cph = lifelines.CoxPHFitter()

# X is the same object as df at this point, so df gains the 'T' column too.
X = df
X['T'] = T

# One Hot encode the categorical features
cat_vars = ['Grade', 'Loan Purpose', 'Housing Status']
for var in cat_vars:
    # axis must be passed by keyword: pandas 2.x rejects it positionally.
    X = pd.concat((X, pd.get_dummies(X[var])), axis=1)
    X = X.drop(var, axis=1)

# remove unused datetime features
X = X.drop([
    'Charge Off Date', 'Funded Date', 'Loan Maturity Date',
    'Loan Paid In Full Date'
], axis=1)
# print(df.shape)
# df.head()

# Build the covariate design matrix from the categorical model spec and
# attach the duration / event columns next to it.
modelspec = 'manufacturer + capacity'
dft = pt.dmatrix(modelspec, df, return_type='dataframe')
design_info = dft.design_info
dft = dft.join(df[['maxhours', 'failed']])

## NOTE: CoxPHFitter expects reduced-rank design matrix WITHOUT intercept
## https://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf
dft = dft.drop('Intercept', axis=1)
dft.head().T

fit_options = {
    'duration_col': 'maxhours',
    'event_col': 'failed',
    'show_progress': True,
    'include_likelihood': True,
}
cx = sa.CoxPHFitter(normalize=False)
cx.fit(df=dft, **fit_options)

# Baseline cumulative hazard on the left, baseline survival on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, squeeze=False, sharex=True)
cx.baseline_cumulative_hazard_.plot(
    ax=axes[0, 0],
    legend=False,
    title='Baseline cumulative hazard rate',
)
cx.baseline_survival_.plot(
    ax=axes[0, 1],
    legend=False,
    title='Baseline survival rate',
)
cx.summary
def lsReg(M, params):
    """Fit a Cox proportional-hazards model on ``M`` and return the fitter.

    ``params['tmCol']`` names the duration column and ``params['ixCol']``
    the event column.
    """
    import lifelines as lf

    model = lf.CoxPHFitter()
    fit_columns = {
        'duration_col': params['tmCol'],
        'event_col': params['ixCol'],
    }
    return model.fit(M, **fit_columns)
# Define which cluster feature to examine.
# Options: [coexpression_cluster, functional_proteins_cluster, immunoregulatory_protein_cluster]
# Can either examine each one at a time or iterate through them.
cluster_choice = "coexpression_cluster"

# Turn the architecture distinction, originally dtype str, into integer
# category codes.
df["Architecture"] = df["Architecture"].astype("category").cat.codes

# Covariates shared by both models.
covariates = [cluster_choice, "grade", "age", "Architecture"]

# Create two separate feature matrices for recurrence and survival.
# Selecting the wanted columns already leaves the other outcome out, so no
# separate drop is needed.
recurrence_df = df[covariates + ["Recurrence", "Recurrence_time"]]
survival_df = df[covariates + ["Survival", "Survival_time"]]

# Define and fit Cox PH Fitter for recurrence
recurrence_cph = lifelines.CoxPHFitter()
recurrence_cph.fit(recurrence_df, duration_col='Recurrence_time', event_col='Recurrence')
# print_summary() only prints and returns None; keep the summary table
# itself in the variable so it can be used afterwards.
recurrence_cph.print_summary()
recurrence_summary = recurrence_cph.summary

# Define and fit Cox PH Fitter for survival
survival_cph = lifelines.CoxPHFitter()
survival_cph.fit(survival_df, duration_col='Survival_time', event_col='Survival')
survival_cph.print_summary()
survival_summary = survival_cph.summary