def main():
    """Screen simulated features one at a time with univariate Cox models.

    Random training data (100 samples x 50 features) and test data
    (25 samples x 50 features) are generated together with random
    observation times and event flags.  For every feature a
    ``CoxPHFitter`` is fitted on that single covariate; features whose
    p-value is above 0.05 are reported and skipped, the others are
    cross-validated (k=3) and scored on the test set with the
    concordance index.

    All results are printed; nothing is returned.  The random generator is
    not seeded, so every run produces different numbers.
    """
    # simulating a feature matrix for 100 samples with 50 features
    data = np.random.random((100, 50))
    # simulating time of observations (days); randint's upper bound is
    # exclusive, so values fall in [10, 2500)
    observed_time = np.random.randint(10, 2500, (100))
    # simulating the event (death): 0 did not occur, 1 occurred
    observed_event = np.random.randint(0, 2, (100))

    test_data = np.random.random((25, 50))
    test_observed_time = np.random.randint(10, 2500, (25))
    test_observed_event = np.random.randint(0, 2, (25))

    for feature_id, feature_vect in enumerate(data.T):
        feature_name = 'feature nb{0}'.format(feature_id)
        dataframe = pd.DataFrame({
            feature_name: feature_vect,
            'event': observed_event,
            'time': observed_time
        })

        # building a coxph model to see the significance of each
        # independent feature
        cox_model = CoxPHFitter()
        cox_model.fit(dataframe, duration_col='time', event_col='event')

        # The summary is indexed by covariate name, so the single row must be
        # taken by position explicitly; a bare ``[0]`` relies on the
        # deprecated integer-key fallback of label-indexed Series.
        pvalue = cox_model.summary['p'].iloc[0]
        print('pvalue: {0} for feature nb: {1}'.format(pvalue, feature_id))

        if pvalue > 0.05:
            print('feature nb {0} not overall significant!'.format(feature_id))
            continue

        # test the robustness: score close / higher to 0.7 is a good sign
        # NOTE(review): the metric returned by k_fold_cross_validation
        # depends on the installed lifelines version's default scoring --
        # confirm it is the c-index that the message below announces.
        scores = k_fold_cross_validation(cox_model,
                                         dataframe,
                                         duration_col='time',
                                         event_col='event',
                                         k=3)
        print('score (mean) (c-index) for {0}'.format(np.mean(scores)))

        # validate the feature on the test set
        test_dataframe = pd.DataFrame({
            feature_name: test_data.T[feature_id],
            'event': test_observed_event,
            'time': test_observed_time
        })
        inferred_time = cox_model.predict_expectation(test_dataframe)
        validation_c_index = concordance_index(test_observed_time,
                                               inferred_time,
                                               test_observed_event)
        print('validation c-index: {0}'.format(validation_c_index))
class CoxChurnModel:
    """Churn classifier backed by a lifelines ``CoxPHFitter``.

    A sample is flagged as churned when its expected duration, minus its
    ``recency``, exceeds the module-level ``predPeriodHours`` threshold.
    """

    def __init__(self):
        # The underlying Cox proportional-hazards estimator.
        self.cf = CoxPHFitter()

    def fit(self, dataset, pred_col='deltaNextHours', event_col='observed'):
        """Fit the Cox model using ``pred_col`` as duration and ``event_col`` as event flag."""
        self.cf.fit(dataset, pred_col, event_col=event_col)

    def predict(self, df):
        """Return a flat boolean array: expected duration minus recency > ``predPeriodHours``."""
        expected = self.cf.predict_expectation(df)
        recency_column = df.recency.values.reshape((-1, 1))
        remaining = expected - recency_column
        is_churned = remaining > predPeriodHours
        return is_churned.values.reshape(-1)

    def predict_proba(self, df):
        """Placeholder probabilities: one zero per row of ``df``."""
        n_rows = len(df)
        return np.zeros(n_rows)
def finalPrediction(image_features, radiomics, clinical_data, y, patient_id, pca=None, cox_model=None):
    """
    Apply the PCA and the Cox model to the features extracted from the image
    and to the other features.

    parameters :
        image_features, radiomics, clinical_data : feature blocks forwarded
            to ``applyPCA``.
        y : 2-D array stacked to the right of the features when the Cox
            model has to be fitted; its last two columns are used as
            duration and event.  Only read when ``cox_model`` is None.
        patient_id : 1-D array of patient identifiers, used as the index of
            the submission.
        pca : previously fitted PCA to reuse (dev / test set), or None.
        cox_model : previously fitted CoxPHFitter to reuse (dev / test set),
            or None to fit a new one on the (x, y) pair (train set).

    return :
        (submission, pca, cox_model) -- the submission DataFrame indexed by
        'PatientID' with columns 'SurvivalTime' (prediction) and 'Event'
        (all NaN), plus both models so they can be reused on other sets.
    """
    # Forward the caller's PCA.  The original hard-coded ``pca=None`` here,
    # so a PCA passed in for the dev/test set was silently ignored.
    x, pca = applyPCA(image_features, radiomics, clinical_data, pca=pca)

    # if the cox model is not given, fit it on the (x, y) pair
    # (i.e. we are on the train set)
    if cox_model is None:
        n_columns = x.shape[1] + y.shape[1]
        train_frame = pd.DataFrame(
            data=np.hstack((x, y)),
            columns=['col_' + str(i) for i in range(n_columns)])
        cox_model = CoxPHFitter()
        # The last two stacked columns come from y: duration, then event.
        cox_model.fit(train_frame,
                      duration_col='col_' + str(n_columns - 2),
                      event_col='col_' + str(n_columns - 1),
                      step_size=0.6)

    # then predict using the model, on the features only
    n_features = x.shape[1]
    final_data = pd.DataFrame(
        data=x, columns=['col_' + str(i) for i in range(n_features)])
    # NOTE(review): ``.values[:, 0]`` assumes predict_expectation returns a
    # 2-D DataFrame -- confirm against the installed lifelines version.
    prediction = cox_model.predict_expectation(final_data).values[:, 0]

    # put the prediction in a pandas DataFrame to submit or to evaluate with
    # the concordance index; the 'Event' column is left as NaN
    nans = np.nan * np.ones(patient_id.shape)
    submission = pd.DataFrame(np.vstack((patient_id, prediction, nans)).T)
    submission.columns = ['PatientID', 'SurvivalTime', 'Event']
    submission = submission.set_index(['PatientID'])

    # return the submission as well as both models, which might be used on
    # the dev or test set
    return submission, pca, cox_model
def predict(dataframe):
    """
    Function for returning the expected lifetime based on the Input Data

    A Cox proportional-hazards model is refitted on ``input/pbc.csv`` on
    every call and then applied to ``dataframe``.

    Args:
        dataframe: covariates to predict for; passed unchanged to
            ``CoxPHFitter.predict_expectation``.
            NOTE(review): it presumably needs the same one-hot encoded
            columns as the training frame -- confirm with the caller.

    Returns:
        Element ``[0]`` of the ``predict_expectation`` result.
    """
    ## Loading the Dataset ##
    input_path = "input/"
    df = pd.read_csv(os.path.join(input_path, "pbc.csv"))

    ## Some Pre-Processing ##
    # "f" -> 0, anything else -> 1, done in one vectorized step instead of a
    # per-row ``df.at`` loop.
    df['sex'] = (df['sex'] != "f").astype(int)

    ## Splitting the Dataset ##
    # Both splits are kept although only the training part is used: each
    # call consumes state from the seeded global RNG, so dropping the first
    # one would change which rows end up in the training set.
    np.random.seed(0)
    df_dev, _ = train_test_split(df, test_size=0.2)
    df_train, _ = train_test_split(df_dev, test_size=0.25)

    ## One-hot encoding of the categorical columns (training set only) ##
    to_encode = ["edema", "stage"]
    one_hot_train = pd.get_dummies(df_train,
                                   columns=to_encode,
                                   drop_first=True,
                                   dtype=np.float64)
    one_hot_train = one_hot_train.dropna()

    ## Fitting the Model ##
    # NOTE(review): 'age' is used as the duration column -- confirm this is
    # intended rather than a follow-up time column.
    cph = CoxPHFitter()
    cph.fit(one_hot_train,
            duration_col='age',
            event_col='status',
            step_size=0.1)
    return cph.predict_expectation(dataframe)[0]
def cross_val(pena, train_size=0.75, selection=False, features=None):
    """
    Hold out method.

    Splits the module-level ``x`` / ``y`` data with ``get_data_set``, fits a
    penalized Cox model on the training part and scores its predictions on
    the held-out part.

    Parameters
    ----------
    pena : float>0
        Penalization coefficient for the L2 penalization.
    train_size : 0<float<1, optional
        Set the size of the training set. The default is 0.75.
    selection : bool, optional
        Feature selection enabling. The default is False.
    features : pd.Index, optional
        The features to select. The default is None.

    Returns
    -------
    cph : CoxPHFitter
        The fitted Cox model (lifelines object).
    score : float
        Score from the metrics (``cindex`` of the held-out targets against
        the predictions).
    """
    x_train, x_test, y_train, y_test = get_data_set(x, y, train_size,
                                                    selection, features)
    cph = CoxPHFitter(penalizer=pena).fit(pd.concat([x_train, y_train],
                                                    axis=1),
                                          duration_col='SurvivalTime',
                                          event_col='Event')
    y_pred = cph_pred(cph.predict_expectation(x_test))

    predicted_times = y_pred.iloc[:, 0].values
    if not np.all(predicted_times):
        # For some reason the predicted lifetime is sometimes null: remove
        # those rows from both frames before scoring.  The labels are
        # computed once, before either frame is modified.
        null_rows = y_pred.index[predicted_times == 0]
        # ``axis`` is passed by keyword: it is keyword-only in pandas >= 2.0,
        # where the former positional ``drop(labels, 0)`` raises TypeError.
        y_test = y_test.drop(null_rows, axis=0)
        y_pred = y_pred.drop(null_rows, axis=0)
    return (cph, cindex(y_test, y_pred))
data_test = { 'T': y_test[:, 0], 'E': y_test[:, 1], '%s' % name[0]: X_test[:, ID[0] - 1], '%s' % name[1]: X_test[:, ID[1] - 1], '%s' % name[2]: X_test[:, ID[2] - 1], '%s' % name[3]: X_test[:, ID[3] - 1], '%s' % name[4]: X_test[:, ID[4] - 1], # '%s' % name[0]: X_test[:, ID[0]+1], # '%s' % name[1]: X_test[:, ID[1]+1], # '%s' % name[2]: X_test[:, ID[2]+1 ], # '%s' % name[3]: X_test[:, ID[3]+1], # '%s' % name[4]: X_test[:, ID[4]+1 ], } df_test = pd.DataFrame(data_test) predict = cph.predict_expectation(df_test) test_c_index.append(compute_C_index(predict, y_test)) train_c_index = np.asarray(train_c_index) print("Train_c_index_mean:", np.mean(train_c_index)) test_c_index = np.asarray(test_c_index) test_c_index = np.delete(test_c_index, 0) print("test:", test_c_index) print("Test_c_index_mean:", np.mean(test_c_index)) # 将选出的 # print(count) rank = Counter(count).most_common(7) print(rank, rank[0][0]) name = [] for i in (rank): id = i[0]
class ProportionalHazardRegressor_lfl(object):
    """
    Minimal adapter exposing lifelines' Cox proportional hazards fitter
    through the fit/predict interface used by CVmodel.

    Attributes:
        model_kwargs (dict): keyword arguments forwarded to CoxPHFitter's constructor
        model (CoxPHFitter): the fitted lifelines.CoxPHFitter, or None before fit

    """

    def __init__(self, **kwargs):
        """
        Store the constructor arguments; the lifelines.CoxPHFitter itself is
        only instantiated when fit is called.

        Args:
            kwargs (dict): kwargs forwarded to the constructor of
                lifelines.CoxPHFitter.

        Returns:
            ProportionalHazardRegressor_lfl

        """
        self.model = None
        self.model_kwargs = kwargs

    def fit(self, X_train, Y_train, X_validate=None, Y_validate=None,
            cv_param=0.0, **fit_kwargs):
        """
        Build a fresh CoxPHFitter and fit it on the training data.
        cv_param is passed to CoxPHFitter as its penalizer.

        Args:
            X_train (numpy.ndarray ~ (num_samples, num_units)): training data.
            Y_train (numpy.ndarray ~ (num_samples, 2)): training labels; the
                two columns are named 'time' and 'censor'.
            X_validate: validation data. Unused for this model.
            Y_validate: validation labels. Unused for this model.
            cv_param: the value of the hyperparameter optimized in CV
                (the penalizer term).
            fit_kwargs (dict): kwargs forwarded to CoxPHFitter.fit.

        Returns:
            None

        """
        self.model = CoxPHFitter(penalizer=cv_param, **self.model_kwargs)
        covariates = pd.DataFrame(X_train)
        targets = pd.DataFrame(Y_train, columns=['time', 'censor'])
        training_frame = pd.concat([covariates, targets], axis=1)
        self.model.fit(training_frame, 'time', 'censor', **fit_kwargs)

    def predict(self, X):
        """
        Return the expected survival times for X.

        Args:
            X (numpy.ndarray ~ (num_samples, num_features)): samples to
                predict survival times from.

        Returns:
            times: whatever CoxPHFitter.predict_expectation returns for X.

        """
        fitted = self.model
        assert fitted is not None, "Need to fit model first"
        return fitted.predict_expectation(X)
def train_cox(x_train0, ix_in, y_per_pt, y_int, metric = 'auc', feature_grid = None):
    """Select an L1 penalty by inner cross-validation, refit, and evaluate one outer split.

    Args:
        x_train0: DataFrame holding the covariates together with the 'week'
            (duration) and 'outcome' (event) columns.  Index labels are
            split on '-'; the first part is used as the patient key and the
            second part is parsed as a float time point.
        ix_in: pair ``(train_index, test_index)`` of positional indices into
            ``x_train0`` for the outer split.
        y_per_pt: mapping from patient key to outcome label; the value
            'Cleared' adds a '+' suffix to the reported actual time.
        y_int: label-indexable object passed (as an array) to
            ``get_class_weights`` -- presumably integer class labels; verify
            against the caller.
        metric: 'auc' or 'ci'; selects which per-penalty score dictionary is
            used to choose the best penalizer.
        feature_grid: iterable of penalizer values; defaults to
            ``np.logspace(7, 20, 14)``.

    Returns:
        dict with keys 'model', 'survival', 'survival_function',
        'prob_true', 'times_hazards_outcomes' and 'lambdas', or an empty
        dict when the final fit raises.
    """
    if feature_grid is None:
        feature_grid = np.logspace(7, 20, 14)
    survival = {}
    # for ic_in, ix_in in enumerate(ix_inner):
    train_index, test_index = ix_in
    x_train, x_test = x_train0.iloc[train_index, :], x_train0.iloc[test_index, :]

    # Scores per penalizer value, one dictionary per metric.
    lamb_dict = {}
    lamb_dict['auc'] = {}
    lamb_dict['ci'] = {}
    for il, lamb in enumerate(feature_grid):
        # Inner splits are regenerated for every penalizer and 10 of them are
        # drawn at random (needs at least 10 splits, since replace=False).
        ix_inner2 = leave_one_out_cv(x_train, x_train['outcome'], ddtype='all_data')
        ix_rand_samp = np.random.choice(np.arange(len(ix_inner2)), 10, replace=False)
        ix_inner2_samp = np.array(ix_inner2, dtype='object')[ix_rand_samp]
        # ix_inner2_rand_samp = np.random.choice(ix_inner2, 10, replace = False)
        counter = 0  # number of inner fits that raised
        start = time.time()
        hazards = []
        event_times = []
        event_outcomes = []
        probs_in = []
        true = []
        # l1_ratio=1. makes the penalty pure L1.
        model = CoxPHFitter(penalizer=lamb, l1_ratio=1.)
        for ic_in2, ix_in2 in enumerate(ix_inner2_samp):
            start_inner = time.time()
            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
            tmpts_in = [xx.split('-')[1] for xx in x_tr2.index.values]
            samp_weights = get_class_weights(np.array(y_int[x_tr2.index.values]), tmpts_in)
            # Non-positive weights are replaced by 1.
            samp_weights[samp_weights <= 0] = 1
            x_tr2.insert(x_tr2.shape[1], 'weights', samp_weights)
            try:
                model.fit(x_tr2, duration_col='week', event_col='outcome', weights_col='weights', robust=True, show_progress = False)
            except:
                # NOTE(review): bare except also swallows KeyboardInterrupt;
                # the failure is only counted, never reported.
                counter += 1
                continue
            # Probability of the event by time 4.0 for the first held-out row
            # (1 - survival at index label 4.0).
            pred_f = model.predict_survival_function(x_ts2.iloc[0, :])
            probs_in.append(1 - pred_f.loc[4.0].item())
            true.append(x_ts2['outcome'].iloc[-1])
            hazard = model.predict_partial_hazard(x_ts2)
            hazards.append(hazard)
            event_times.append(x_ts2['week'])
            event_outcomes.append(x_ts2['outcome'])
            end_inner = time.time()
            # print('Inner ix ' + str(ic_in2) + ' complete in ' + str(end_inner - start_inner))
        # if metric == 'CI':
        try:
            score = concordance_index(pd.concat(event_times), pd.concat(hazards), pd.concat(event_outcomes))
            lamb_dict['ci'][lamb] = score
            end_t = time.time()
            print(str(il) + ' complete')
            print((end_t - start)/60)
        except:
            # Reached e.g. when every inner fit failed and the lists are empty;
            # the AUC below is then skipped for this penalizer as well.
            print('No score available')
            continue
        # elif metric == 'auc':
        try:
            score = sklearn.metrics.roc_auc_score(true, probs_in)
            lamb_dict['auc'][lamb] = score
        except:
            continue

    # Pick the penalizer with the highest score for the requested metric
    # (the name ``aucs_in`` is used for either metric).
    # NOTE(review): if no penalizer produced a score the unpacking below
    # raises ValueError -- confirm whether that case can occur.
    lambdas, aucs_in = list(zip(*lamb_dict[metric].items()))
    ix_max = np.argmax(aucs_in)
    best_lamb = lambdas[ix_max]

    # Refit on the whole outer training split with the selected penalizer.
    model_out = CoxPHFitter(penalizer=best_lamb, l1_ratio=1.)
    tmpts_in = [xx.split('-')[1] for xx in x_train.index.values]
    samp_weights = get_class_weights(np.array(y_int[x_train.index.values]), tmpts_in)
    samp_weights[samp_weights<=0] = 1
    x_train.insert(x_train.shape[1], 'weights', samp_weights)
    # Assigns the same values again to the column inserted just above.
    x_train['weights'] = samp_weights
    try:
        model_out.fit(x_train, duration_col='week', event_col='outcome', weights_col='weights', robust=True)
    except:
        return {}

    pred_f = model_out.predict_survival_function(x_test.iloc[0, :])
    pt = x_test.index.values[0].split('-')[0]
    hazard_out = model_out.predict_partial_hazard(x_test)
    # NOTE(review): ``x`` is neither a parameter nor a local of this function;
    # presumably a module-level frame (possibly meant to be x_train0) -- confirm.
    pts = [ii.split('-')[0] for ii in x.index.values]
    tmpts = [ii.split('-')[1] for ii in x.index.values]
    # if pt not in survival.keys():
    #     survival[pt] = {}
    ixs = np.where(np.array(pts) == pt)[0]
    # Latest time point recorded for this patient, as a string.
    survival['actual'] = str(np.max([float(tmpt) for tmpt in np.array(tmpts)[ixs]]))
    if y_per_pt[pt] == 'Cleared':
        survival['actual'] = survival['actual'] + '+'
    probs_sm = 1 - pred_f.loc[4.0].item()
    y_pred_exp = model_out.predict_expectation(x_test.iloc[[0], :])
    survival['predicted'] = str(np.round(y_pred_exp.item(), 3))
    surv_func = pred_f
    # probs_df = pd.Series(probs_sm)
    # y_pp = y_per_pt.replace('Cleared', 0).replace('Recur', 1)
    # final_df = pd.concat([y_pp, probs_df], axis=1).dropna()

    final_dict = {}
    # final_dict['probability_df'] = final_df
    final_dict['model'] = model_out
    final_dict['survival'] = survival
    final_dict['survival_function'] = surv_func
    final_dict['prob_true'] = (probs_sm, y_per_pt[pt])
    final_dict['times_hazards_outcomes'] = (x_test['week'], hazard_out, x_test['outcome'])
    final_dict['lambdas'] = lamb_dict
    # final_dict['auc'] = sklearn.metrics.roc_auc_score(final_df[0], final_df[1])
    return final_dict
def __linear_small(self, is_death, train_data_path, basepath):
    """Univariate screening, bootstrap and k-fold evaluation of a Cox model on the small dataset.

    Steps, in order:
      1. read the CSV at ``train_data_path`` (first column as index), drop
         'patient_id' and 'name', and expand the frame with a patsy formula
         in which the columns listed in ``classify_attr`` are categorical;
      2. keep the duration/event pair selected by ``is_death`` and delete
         the other pair;
      3. fit one Cox model per remaining column and keep those with
         p < 0.05, writing their names to ``attr_file``;
      4. run 10000 resampling iterations, recording train / test / all
         c-indexes, coefficients and p-values, and write them to CSV;
      5. run 2000 rounds of 10-fold cross-validation and write the scores.

    Args:
        is_death: truthy -> 'survivaltime1'/'outcome1' are duration/event
            and the output files have no suffix; falsy ->
            'survivaltime2'/'outcome2' and the files get an '_e' suffix.
        train_data_path: path of the CSV to read.
        basepath: directory used only for the k-fold output file; every
            other file is opened relative to the current working directory.

    Returns:
        None; results are written to files and progress is printed.
    """
    small_dataset_file = train_data_path
    small_dataset = pandas.read_csv(small_dataset_file,
                                    encoding='UTF-8',
                                    index_col=[0])
    del small_dataset['patient_id']
    del small_dataset['name']
    # Dummy-variable handling
    formular = ''
    classify_attr = {
        'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
        'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant',
        'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic', 'LipidD', 'CaPB', 'NCaPB',
        'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease',
        'cerebrovasculardisease', 'bleeding', 'malignancy', 'ablocker',
        'bblocker'
    }
    # Build "C(a)+b+C(c)+..." : categorical columns are wrapped in C(...).
    for column in small_dataset.columns:
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    # Drop the trailing '+'.
    formular = formular[:-1]
    # The '-1' term removes the intercept column from the design matrix.
    small_dataset = patsy.dmatrix(formular + '-1',
                                  small_dataset,
                                  return_type='dataframe')
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                            'survivaltime2', 'outcome2')
        # var_file is assigned but not used in this method.
        attr_file, p632_file, var_file, kfold_file = (
            'lm_significant_attrs.txt', 'lm_stats632.csv', 'lm_statvar.txt',
            'lm_statskfold.csv')
        beta_file, p_file = ('lm_coef.csv', 'lm_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                            'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = (
            'lm_significant_attrs_e.txt', 'lm_stats632_e.csv',
            'lm_statvar_e.txt', 'lm_statskfold_e.csv')
        beta_file, p_file = ('lm_coef_e.csv', 'lm_p_e.csv')
    del small_dataset[T_false]
    del small_dataset[E_false]

    # Univariate screening: one Cox fit per covariate, keep p < 0.05.
    significant_attrs = list()
    for column in small_dataset.columns:
        # print('column', column)
        if column in {T_true, E_true}:
            continue
        subset = small_dataset[[column, T_true, E_true]]
        # print('subset', subset)
        try:
            cox = CoxPHFitter()
            cox.fit(subset, T_true, E_true)
            # print('cox.summary['p'][0]:', cox.summary['p'][0])
            if cox.summary['p'][0] < 0.05:
                significant_attrs.append(column)
        except Exception:
            # A covariate whose fit fails is silently left out.
            continue

    # The selected names are written out and immediately read back.
    # NOTE(review): attr_file is opened without ``basepath`` here -- confirm
    # this matches where other methods expect to find it.
    output = open(attr_file, mode='w')
    for attr in significant_attrs:
        output.write(attr + '\n')
    output.close()
    input = open(attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_small ## sign_attr : %d' % len(significant_attrs))
    small_dataset = small_dataset[significant_attrs]

    # 10000 times .632 bootstrap
    count = 0
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    while count < 10000:
        # Linear training
        # NOTE(review): any exception inside this try makes the loop retry
        # without incrementing ``count``; a failure that happens on every
        # iteration (e.g. ``.ix`` missing in newer pandas) would loop forever.
        try:
            # Resample rows with replacement; the test set is made of the
            # rows whose index label was not drawn.
            train_set = small_dataset.take(
                numpy.random.randint(0,
                                     len(small_dataset),
                                     size=len(small_dataset)))
            test_set = small_dataset.ix[set(
                small_dataset.index).difference(set(train_set.index))]
            train_set.index = range(len(train_set))
            test_set.index = range(len(test_set))
            cox = CoxPHFitter()
            cox.fit(train_set, T_true, E_true)
            # The partial hazard is negated before computing the c-index
            # against the durations.
            train_cindex = concordance_index(
                cox.durations,
                -cox.predict_partial_hazard(cox.data).values.ravel(),
                cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            # test_set
            test_actual_T = test_set[T_true].copy()
            test_actual_E = test_set[E_true].copy()
            test_variable = test_set[test_set.columns.difference(
                [T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # small_set
            all_actual_T = small_dataset[T_true].copy()
            all_actual_E = small_dataset[E_true].copy()
            all_variable = small_dataset[small_dataset.columns.difference(
                [T_true, E_true])]
            all_predictT = cox.predict_expectation(all_variable)
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT,
                                                test_actual_E)
                all_cindex = concordance_index(all_actual_T, all_predictT,
                                               all_actual_E)
            except Exception:
                # Fall back to a c-index computed without the event flags.
                test_cindex = concordance_index(test_actual_T, test_predictT)
                all_cindex = concordance_index(all_actual_T, all_predictT)
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception:
            continue

    # Written relative to the current working directory (no basepath).
    stats632_df = pandas.DataFrame(stats632, columns=['train', 'test', 'all'])
    stats632_df.to_csv(p632_file, encoding='UTF-8')
    statscoef_df = pandas.DataFrame(
        pandas.concat(statscoef, ignore_index=True))
    statscoef_df.to_csv(beta_file, encoding='UTF-8')
    statspvalue_df = pandas.DataFrame(
        pandas.concat(statspvalue, ignore_index=True))
    statspvalue_df.to_csv(p_file, encoding='UTF-8')

    # 2000 times 10-fold cross-validation
    count = 0
    statskfold = list()
    while count < 2000:
        try:
            cox = CoxPHFitter()
            scores = k_fold_cross_validation(cox, small_dataset, T_true,
                                             E_true, 10)
            statskfold.append(scores)
            count += 1
            print('k-fold -> %d' % count)
        except Exception:
            continue
    statskfold_df = pandas.DataFrame(statskfold)
    statskfold_df.to_csv(basepath + "/" + kfold_file, encoding='UTF-8')
def __linear_big(self, is_death, train_data_path, basepath):
    """Repeated-sampling evaluation of a Cox model on the big dataset.

    The CSV at ``train_data_path`` is read, identifier columns are dropped,
    the frame is expanded with a patsy formula (columns in
    ``classify_attr`` are categorical), the duration/event pair not
    selected by ``is_death`` is deleted, and a hard-coded list of columns
    is removed.  The significant attributes are then read from
    ``basepath + "/" + attr_file`` (the code that would produce that file is
    commented out) and 10000 fit/score iterations are run.

    Args:
        is_death: truthy -> 'survivaltime1'/'outcome1' are duration/event;
            falsy -> 'survivaltime2'/'outcome2' and '_e' file suffixes.
        train_data_path: path of the CSV to read.
        basepath: directory containing the significant-attribute file.  The
            result CSVs are written relative to the current working
            directory, not to ``basepath``.

    Returns:
        None; results are written to files and progress is printed.
    """
    big_dataset_file = train_data_path
    big_dataset = pandas.read_csv(big_dataset_file,
                                  encoding='UTF-8',
                                  index_col=[0])
    del big_dataset['patient_id']
    del big_dataset['name']
    del big_dataset['tx_id']
    # del big_dataset['tx_id.1']
    del big_dataset['tx_date']
    formular = ''
    # classify_attr = {'subject', 'treat_item', 'vascular_access_type',
    #                  'dialysis_machine', 'reuse_times', 'anticoagulation_scope',
    #                  'anticoagulation', 'protamine', 'replacement_way',
    #                  'take_food', 'fluid_infusion', 'blood_pressure_pos',
    #                  'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol', 'HBsAg', 'HBsAb',
    #                  'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic',
    #                  'LipidD', 'CaPB', 'NCaPB', 'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access',
    #                  'ESRDcause', 'hypertension', 'DM', 'cardiovasculardisease', 'cerebrovasculardisease',
    #                  'bleeding', 'malignancy', 'ablocker', 'bblocker'}
    classify_attr = {
        'subject', 'treat_item', 'vascular_access_type', 'dialysis_machine',
        'anticoagulation_scope', 'anticoagulation', 'protamine',
        'replacement_way', 'take_food', 'fluid_infusion',
        'blood_pressure_pos', 'gender', 'smoking', 'highflux', 'payment',
        'marital', 'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
        'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'blocker',
        'blocer', 'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
        'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease',
        'cerebrovasculardisease', 'bleeding', 'malignancy'
    }
    # u'\xa6\xc2blocker'
    # print('classify_attr.dtype:', classify_attr.shape)
    # Build "C(a)+b+C(c)+..." : categorical columns are wrapped in C(...).
    for column in big_dataset.columns:
        # print("column", column)
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    # print('formular:', formular)
    # Remove the trailing '+'
    # type(formular): <type 'unicode'>
    # NOTE(review): this encode is a Python 2 idiom; on Python 3 it yields
    # bytes and the ``formular + '-1'`` below would raise TypeError -- confirm
    # the target interpreter.
    formular = formular[:-1].encode('utf-8')
    # print('formular[:-1].type:', type(formular))
    # '-1' means: do not add the intercept column
    big_dataset = patsy.dmatrix(formular + '-1',
                                big_dataset,
                                return_type='dataframe')
    # print(type(big_dataset))
    # print(big_dataset.columns)
    # print('big_dataset:', big_dataset)
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                            'survivaltime2', 'outcome2')
        # var_file and kfold_file are assigned but not used in this method.
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs.txt', 'lb_stats632.csv', 'lb_statvar.txt',
            'lb_statskfold.csv')
        beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                            'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs_e.txt', 'lb_stats632_e.csv',
            'lb_statvar_e.txt', 'lb_statskfold_e.csv')
        beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
    del big_dataset[T_false]
    del big_dataset[E_false]
    significant_attrs = list()
    # Some fields are deleted because they caused errors
    del big_dataset['k_concentration']
    del big_dataset['SDUFR_x']
    del big_dataset['SDUFR_y']
    del big_dataset['SDUFR_y_v']
    del big_dataset['protamine_c']
    del big_dataset['k_concentration_c']
    # English for the string statement below: "If the statistically
    # significant risk factors have already been selected, the following
    # fragment that verifies their statistical significance does not need to
    # be executed."
    """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++
    # for column in big_dataset.columns:
    #     if column in {T_true, E_true}:
    #         continue
    #     subset = big_dataset[[column, T_true, E_true]]
    #     # print('subset', subset)
    #     try:
    #         # print('start fitting ')
    #         cox = CoxPHFitter()
    #         cox.fit(subset, T_true, E_true)
    #         help(cox)
    #         print('cox value:', cox.print_summary())
    #         print('p value:', cox.summary['p'][0])
    #         if cox.summary['p'][0] < 0.05:
    #             # print(column, cox.summary['p'][0])
    #             significant_attrs.append(column)
    #     except Exception:
    #         continue
    # output = open(basepath+"/"+attr_file, mode='w')
    # for attr in significant_attrs:
    #     output.write(attr + '\n')
    # output.close()
    #++++++++++++++++++++++++++++++++++++++++++++++++++++
    # The attribute list is read from a file that must already exist.
    input = open(basepath + "/" + attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_big ## sign_attr : %d' % len(significant_attrs))
    print(len(significant_attrs), T_true, E_true)
    big_dataset = big_dataset[significant_attrs]
    print(len(big_dataset.columns))
    # exit()

    # 10000 times .632 bootstrap
    count = 0
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    while count < 10000:
        print('count', count)
        try:
            # big_dataset = big_dataset.take(numpy.random.permutation(len(big_dataset)))
            # big_dataset.index = range(len(big_dataset))
            # percent = int(len(big_dataset) * 0.30)
            # train_set = big_dataset[:-percent]
            # test_set = big_dataset[-percent:]
            # train_set.index = range(len(train_set))
            # test_set.index = range(len(test_set))
            # Train and test sets are two independent draws of 1500 rows
            # without replacement, so the same row may appear in both.
            train_set = big_dataset.sample(1500, replace=False)
            test_set = big_dataset.sample(1500, replace=False)
            print('try fitting......', len(big_dataset), len(train_set),
                  len(test_set))
            cox = CoxPHFitter()
            cox.fit(train_set, T_true, E_true)
            # The partial hazard is negated before computing the c-index
            # against the durations.
            train_cindex = concordance_index(
                cox.durations,
                -cox.predict_partial_hazard(cox.data).values.ravel(),
                cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            print('try predicting......')
            # test_set
            test_actual_T = test_set[T_true]
            test_actual_E = test_set[E_true]
            test_variable = test_set[test_set.columns.difference(
                [T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # whole dataset
            all_actual_T = big_dataset[T_true]
            all_actual_E = big_dataset[E_true]
            all_variable = big_dataset[big_dataset.columns.difference(
                [T_true, E_true])]
            all_predictT = cox.predict_expectation(all_variable)
            print('try cindexing......')
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT,
                                                test_actual_E)
                all_cindex = concordance_index(all_actual_T, all_predictT,
                                               all_actual_E)
            except Exception:
                # Fall back to a c-index computed without the event flags.
                test_cindex = concordance_index(test_actual_T, test_predictT)
                all_cindex = concordance_index(all_actual_T, all_predictT)
            print(train_cindex, test_cindex, all_cindex)
            # Sample output recorded by the original author:
            # 0.5 0.5 0.5
            # 0.963726363744 0.965792024703 0.964552831227
            # 0.5 0.5 0.5
            # 0.5 0.5 0.5
            # 0.940458783243 0.939660104788 0.940145223899
            # 0.950570809577 0.946854258363 0.949067405671
            # 0.941352881629 0.941623634389 0.941462605414
            # 0.5 0.5 0.5
            stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception as e:
            # NOTE(review): ``e.message`` only exists on Python 2 exceptions;
            # on Python 3 this line itself raises AttributeError -- confirm
            # the target interpreter.  A failing iteration is retried without
            # incrementing ``count``.
            print(e.message)
            continue

    # Written relative to the current working directory (no basepath).
    stats632_df = pandas.DataFrame(stats632, columns=['train', 'test', 'all'])
    stats632_df.to_csv(p632_file, encoding='UTF-8')
    statscoef_df = pandas.DataFrame(
        pandas.concat(statscoef, ignore_index=True))
    statscoef_df.to_csv(beta_file, encoding='UTF-8')
    statspvalue_df = pandas.DataFrame(
        pandas.concat(statspvalue, ignore_index=True))
    statspvalue_df.to_csv(p_file, encoding='UTF-8')
    print('10000 times .632 bootstrap has done.')
def get_surv_curv(data, player):
    """Plot a player's predicted survival curve against the league baseline.

    A Cox model is fitted on ``data`` with 'NBA_Experience' as duration and
    'active' as event column.  The figure shows the model's baseline
    survival and the survival function predicted for ``player``; an
    annotation gives the percentile rank of the player's expected duration
    among all rows of ``data``.

    Args:
        data: DataFrame indexed by player, containing the covariates plus
            the 'NBA_Experience' and 'active' columns.
        player: index label of the player to plot.

    Returns:
        The plotly ``go.Figure``.
    """
    target_columns = ['NBA_Experience', 'active']

    fitter = CoxPHFitter()
    fitter.fit(data, 'NBA_Experience', event_col='active')

    # Survival curves: league baseline and the selected player.
    baseline_curve = fitter.baseline_survival_
    player_covariates = data.loc[[player]].drop(target_columns, axis=1)
    player_curve = fitter.predict_survival_function(player_covariates)

    # Percentile of the player's expected duration among all players.
    all_covariates = data.drop(target_columns, axis=1)
    expected_durations = fitter.predict_expectation(all_covariates)
    player_percentile = expected_durations.rank(pct=True).loc[player]
    annotation_text = 'Career Length Prediction Percentile: ' + str(
        round(player_percentile.values[0], 2))

    league_trace = go.Scatter(name='League Average',
                              x=baseline_curve.index,
                              y=baseline_curve['baseline survival'].values,
                              marker={'color': "#253046"})
    player_trace = go.Scatter(name=player,
                              x=player_curve.index,
                              y=player_curve[player].values,
                              marker={'color': '#B35E3B'})

    # The annotation is pinned at fixed plot coordinates (x=13, y=0.78).
    percentile_annotation = {
        'x': 13,
        'y': 0.78,
        'text': annotation_text,
        'showarrow': False,
        'font': {
            'size': 14,
            'color': '#253046'
        }
    }
    layout = go.Layout({
        "xaxis": {
            "title": "Years in the NBA",
            'color': '#253046'
        },
        "yaxis": {
            "title": "Probability of remaining in the NBA",
            'color': '#253046'
        },
        'paper_bgcolor': '#F8F3F1',
        'plot_bgcolor': '#F8F3F1',
        'margin': {
            't': 50,
            'r': 30
        },
        'annotations': [percentile_annotation],
        'legend': {
            'x': .8,
            'y': 1,
            'traceorder': 'normal'
        }
    })

    return go.Figure(data=[league_trace, player_trace], layout=layout)
# Keep only the selected radiomics features and append them to the feature
# frame defined earlier in the script.
rad_X_data = rad_X_data[r_features]
X_data = pd.concat([X_data, rad_X_data], axis=1)
# One-hot encode the remaining categorical columns, dropping the first level.
X_data = pd.get_dummies(X_data, drop_first=True)

# Training frame = features + targets; rows with any missing value are
# dropped (X_data itself keeps those rows).
full_data = pd.concat([X_data, y_data], axis=1)
full_data = full_data.dropna()
""" MODEL TRAIN """
model = CoxPHFitter()
model.fit(full_data, 'SurvivalTime', event_col='Event')
model.print_summary()

# Predict on every row of X_data, including the rows excluded from the fit.
p = model.predict_expectation(X_data)
print(p)

# Build a prediction frame aligned on the index of y_data.
p_df = pd.DataFrame(index=y_data.index)
p_df['SurvivalTime'] = p
p_df['Event'] = None
# Missing predictions are replaced by the mean prediction.
p_df.SurvivalTime = p_df.SurvivalTime.fillna(p_df.SurvivalTime.mean())
print(p_df.head())

# Score on the training data itself.
score = metric.cindex(y_data, p_df)
print(f'TRAIN : CScore = {score}')
# The summary is printed a second time here.
model.print_summary()
""" VALIDATION """
test_X_c_dir = './x_test/features/clinical_data.csv'
def __predict_individual(self, is_death, train_data_path, basepath):
    """Fit a Cox model on a 1500-row sample and inspect one randomly drawn row.

    The data preparation mirrors ``__linear_big``: read the CSV, drop
    identifier columns, expand with a patsy formula, keep the
    duration/event pair selected by ``is_death``, delete a hard-coded list
    of columns and restrict the frame to the attributes listed in
    ``basepath + "/" + attr_file``.  A single fit is then run and the
    survival function and log hazard of one sampled row are plotted and
    printed.

    Args:
        is_death: truthy -> 'survivaltime1'/'outcome1' are duration/event;
            falsy -> 'survivaltime2'/'outcome2' and '_e' file suffixes.
        train_data_path: path of the CSV to read.
        basepath: directory containing the significant-attribute file.

    Returns:
        None; output is printed / plotted.
    """
    big_dataset_file = train_data_path
    big_dataset = pd.read_csv(big_dataset_file,
                              encoding='UTF-8',
                              index_col=[0])
    del big_dataset['patient_id']
    del big_dataset['name']
    del big_dataset['tx_id']
    # del big_dataset['tx_id.1']
    del big_dataset['tx_date']
    formular = ''
    classify_attr = {
        'subject', 'treat_item', 'vascular_access_type', 'dialysis_machine',
        'anticoagulation_scope', 'anticoagulation', 'protamine',
        'replacement_way', 'take_food', 'fluid_infusion',
        'blood_pressure_pos', 'gender', 'smoking', 'highflux', 'payment',
        'marital', 'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
        'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'blocker',
        'blocer', 'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
        'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
        'hypertension', 'DM', 'cardiovasculardisease',
        'cerebrovasculardisease', 'bleeding', 'malignancy'
    }
    # Build "C(a)+b+C(c)+..." : categorical columns are wrapped in C(...).
    for column in big_dataset.columns:
        # print("column", column)
        if column in classify_attr:
            formular = formular + 'C(' + column + ')+'
        else:
            formular = formular + column + '+'
    # Drop the trailing '+'.
    # NOTE(review): the encode is a Python 2 idiom; on Python 3 it yields
    # bytes and ``formular + '-1'`` below would raise TypeError -- confirm
    # the target interpreter.
    formular = formular[:-1].encode('utf-8')
    # '-1' means: do not add the intercept column
    big_dataset = patsy.dmatrix(formular + '-1',
                                big_dataset,
                                return_type='dataframe')
    if is_death:
        T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                            'survivaltime2', 'outcome2')
        # Only attr_file is used below; the other file names are unused here.
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs.txt', 'lb_stats632.csv', 'lb_statvar.txt',
            'lb_statskfold.csv')
        beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
    else:
        T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                            'survivaltime1', 'outcome1')
        attr_file, p632_file, var_file, kfold_file = (
            'lb_significant_attrs_e.txt', 'lb_stats632_e.csv',
            'lb_statvar_e.txt', 'lb_statskfold_e.csv')
        beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
    del big_dataset[T_false]
    del big_dataset[E_false]
    significant_attrs = list()
    # Some fields are deleted because they caused errors
    del big_dataset['k_concentration']
    del big_dataset['SDUFR_x']
    del big_dataset['SDUFR_y']
    del big_dataset['SDUFR_y_v']
    del big_dataset['protamine_c']
    del big_dataset['k_concentration_c']
    # English for the string statement below: "If the statistically
    # significant risk factors have already been selected, the following
    # fragment that verifies their statistical significance does not need to
    # be executed."
    """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++
    # for column in big_dataset.columns:
    #     if column in {T_true, E_true}:
    #         continue
    #     subset = big_dataset[[column, T_true, E_true]]
    #     # print('subset', subset)
    #     try:
    #         # print('start fitting ')
    #         cox = CoxPHFitter()
    #         cox.fit(subset, T_true, E_true)
    #         help(cox)
    #         print('cox value:', cox.print_summary())
    #         print('p value:', cox.summary['p'][0])
    #         if cox.summary['p'][0] < 0.05:
    #             # print(column, cox.summary['p'][0])
    #             significant_attrs.append(column)
    #     except Exception:
    #         continue
    # output = open(basepath+"/"+attr_file, mode='w')
    # for attr in significant_attrs:
    #     output.write(attr + '\n')
    # output.close()
    #++++++++++++++++++++++++++++++++++++++++++++++++++++
    # The attribute list is read from a file that must already exist.
    input = open(basepath + "/" + attr_file)
    significant_attrs = [line.strip() for line in input.readlines()]
    input.close()
    significant_attrs.append(T_true)
    significant_attrs.append(E_true)
    print('linear_big ## sign_attr : %d' % len(significant_attrs))
    print(len(significant_attrs), T_true, E_true)
    big_dataset = big_dataset[significant_attrs]
    print(len(big_dataset.columns))

    # 10000 times .632 bootstrap
    # ``count`` starts at 9999, so the ``if`` below runs its body exactly once.
    count = 9999
    stats632 = list()
    statscoef = list()
    statspvalue = list()
    cox = CoxPHFitter()
    if count < 10000:
        print('count', count)
        try:
            # Fit on 1500 sampled rows; a single separately sampled row is
            # used for the individual prediction.
            train_set = big_dataset.sample(1500, replace=False)
            test_set = big_dataset.sample(1, replace=False)
            print('try fitting......', len(big_dataset), len(train_set),
                  len(test_set))
            # cox = CoxPHFitter()
            cox = cox.fit(train_set, T_true, E_true)
            print(test_set)
            cox.predict_survival_function(test_set).plot()
            print(cox.predict_log_hazard_relative_to_mean(test_set))
            # for t_index,t_item in test_set.iterrows:
            #     print(str(t_index)+"predict_survival_function")
            #     print(cox.predict_survival_function(t_item))
            #     cox.predict_survival_function(t_item).plot()
            #     print(str(t_index)+"predict_survival_function")
            #     print(cox.predict_survival_function(t_item))
            # The partial hazard is negated before computing the c-index
            # against the durations.
            train_cindex = concordance_index(
                cox.durations,
                -cox.predict_partial_hazard(cox.data).values.ravel(),
                cox.event_observed)
            statscoef.append(cox.summary[['coef']].T)
            statspvalue.append(cox.summary[['p']].T)
            print('try predicting......')
            # test_set
            test_actual_T = test_set[T_true]
            test_actual_E = test_set[E_true]
            test_variable = test_set[test_set.columns.difference(
                [T_true, E_true])]
            test_predictT = cox.predict_expectation(test_variable)
            # whole dataset
            # all_actual_T = big_dataset[T_true]
            # all_actual_E = big_dataset[E_true]
            # all_variable = big_dataset[big_dataset.columns.difference([T_true, E_true])]
            # all_predictT = cox.predict_expectation(all_variable)
            #
            # print('try cindexing......')
            try:
                test_cindex = concordance_index(test_actual_T, test_predictT,
                                                test_actual_E)
                # all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
            except Exception:
                # Fall back to a c-index computed without the event flag.
                test_cindex = concordance_index(test_actual_T, test_predictT)
                # all_cindex = concordance_index(all_actual_T, all_predictT)
            #
            # stats632.append([train_cindex, test_cindex, all_cindex])
            count += 1
            print('632 -> %d' % count)
        except Exception as e:
            # NOTE(review): ``e.message`` only exists on Python 2 exceptions;
            # on Python 3 this line itself raises AttributeError.
            print(e.message)
    # NOTE(review): ``test_set`` is only bound inside the try above; if the
    # sampling failed these lines raise NameError -- confirm intended flow.
    mean_patient = self.__filter_dt(test_set)
    print(cox.predict_log_hazard_relative_to_mean(test_set))
    # mean_hazard = cox.predict_expectation(mean_patient)
    # NOTE(review): ``mean_hazard`` is never assigned (its assignment is
    # commented out just above), so this print raises NameError as written.
    print(mean_hazard)