Exemplo n.º 1
0
def main():
    """Fit a Cox model on the full data set plus a few code-indicator columns.

    Loads the pre-processed arrays for ``hp.gender``, builds a DataFrame with
    the covariates, ``TIME`` and ``EVENT``, appends one indicator column per
    selected index code (first 10 rows with ``TYPE == 1`` and first 10 rows
    with ``TYPE == 0``), then fits and prints a ``CoxPHFitter`` summary.
    """
    print('Load data...')
    hp = Hyperparameters()
    pp_prefix = '../' + hp.data_pp_dir
    data = np.load(pp_prefix + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    covariates = data['x']
    durations = data['time']
    events = data['event']

    cols_list = load_obj(pp_prefix + 'cols_list.pkl')

    df = pd.DataFrame(covariates, columns=cols_list)
    df['TIME'] = durations
    df['EVENT'] = events

    ###################################################################

    print('Add additional columns...')
    addcodes_path = '../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather'
    all_codes = feather.read_dataframe(addcodes_path)
    top_type_one = all_codes[all_codes['TYPE'] == 1].head(10)
    top_type_zero = all_codes[all_codes['TYPE'] == 0].head(10)
    df_index_code = pd.concat([top_type_one, top_type_zero], sort=False)

    for description, index_code in zip(df_index_code['DESCRIPTION'],
                                       df_index_code['INDEX_CODE']):
        print(description)
        # Row-wise flag: does any entry of this row in data['codes'] equal the code?
        df[description] = (data['codes'] == index_code).max(axis=1)
        cols_list = cols_list + [description]

    ###################################################################

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df,
            duration_col='TIME',
            event_col='EVENT',
            show_progress=True,
            step_size=0.5)
    cph.print_summary()
    print('done')
Exemplo n.º 2
0
def main(data_df):
    """Binarise the thresholded columns and fit a Cox model on a fixed subset.

    Every column named in the module-level ``th_dict`` is replaced *in place*
    in ``data_df`` by a 0/1 indicator (1 when the value is >= its threshold).
    Missing values are zero-filled first, except for columns whose name
    contains ``"HU"``; there NaN is left alone and therefore maps to 0.

    Args:
        data_df: DataFrame holding the ``th_dict`` columns plus ``Duration``,
            ``Death`` and the covariates listed below. Mutated in place.
    """
    for key, threshold in th_dict.items():
        # `key.find("HU") > 0` was False for names that *start* with "HU"
        # (find() returns 0), so those columns were zero-filled by mistake.
        if "HU" not in key:
            data_df[key] = data_df[key].fillna(0)
        # NaN >= threshold is False, so unfilled NaNs become 0.
        data_df[key] = (data_df[key] >= threshold).astype(int)

    # Combined score of the two binarised imaging indicators: 0, 1 or 2.
    v_hu = (data_df['HU_of_consolidation'] +
            data_df['Volume_of_total_pneumonia_infection']).rename("V-HU")

    # White_blood_cell_count, Neutrophil_count and D-dimer were left out of
    # the model in the original code as well.
    model_columns = [
        "Duration",
        "Death",
        "Age",
        "Blood_Oxygen",
        "C-Reactive_protein",
        "Lymphocyte_count",
        "Cerebrovascular_Disease",
        "Sex",
        "Lactic_dehydrogenase",
    ]
    combinations_df = pd.concat([data_df[model_columns], v_hu], axis=1)

    cph = CoxPHFitter()
    cph.fit(combinations_df, "Duration", event_col="Death", step_size=0.01)

    cph.print_summary()
Exemplo n.º 3
0
def test_proportional_hazard_test_with_weights_and_strata():
    """
    Proportional-hazards test on a weighted, stratified, robust Cox fit.

    The expected statistic is presumably the output of this R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')

    """
    observations = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
        "s": [1, 1, 0, 0, 0],
    }
    # Every subject experiences the event.
    df = pd.DataFrame(observations).assign(E=True)

    cph = CoxPHFitter()
    cph.fit(df,
            duration_col="T",
            event_col="E",
            weights_col="w",
            strata="s",
            robust=True)

    results = stats.proportional_hazard_test(cph, df, time_transform="identity")
    cph.print_summary()

    observed_statistic = results.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed_statistic, 0.0283, rtol=1e-3)
Exemplo n.º 4
0
def coxph_coef(data, duration_col, event_col, silence=True):
    """Fit a Cox model and return the hazard ratio of the ``div`` covariate.

    Args:
        data: DataFrame with the duration column, the event column and the
            covariates; it must contain a covariate named ``div``.
        duration_col: Name of the duration column.
        event_col: Name of the event-indicator column.
        silence: When False, show fitting progress and print the summary.

    Returns:
        ``exp(coef)`` of ``div`` when its p-value is below 0.05,
        otherwise ``-1.0``.
    """
    cph = CoxPHFitter()
    cph.fit(data,
            duration_col=duration_col,
            event_col=event_col,
            show_progress=(not silence))
    if not silence:
        cph.print_summary()
    # Only the p-value of `div` decides. The old `.any()` over all
    # covariates returned div's ratio whenever *any* covariate was significant.
    if cph.summary.loc['div', 'p'] < 0.05:
        return np.exp(cph.hazards_['div']['coef'])
    # Sentinel for "div is not significant".
    return -1.0
Exemplo n.º 5
0
def do_baseline(foldnum, train, valid, exp_code, model_str):
    """Fit a Cox baseline on ``train`` and print log partial hazards for ``valid``.

    ``train.x`` holds the covariates, ``train.y`` the durations and
    ``train.c`` a flag that is turned into the event indicator (event = 1
    where the flag equals 0). Missing values are replaced by column means,
    computed separately for the training and validation frames.
    ``foldnum``, ``exp_code`` and ``model_str`` are not used here.
    """
    train_df = pd.DataFrame(train.x)
    print(train_df.shape)
    train_df['duration'] = train.y
    train_df['event'] = [int(flag == 0) for flag in train.c]
    train_df = train_df.fillna(train_df.mean())

    cph = CoxPHFitter()
    cph.fit(train_df, 'duration', event_col="event")
    cph.print_summary()

    valid_df = pd.DataFrame(valid.x)
    valid_df = valid_df.fillna(valid_df.mean())
    log_partial_hazard = cph.predict_log_partial_hazard(valid_df)
    print(log_partial_hazard)
Exemplo n.º 6
0
def f(train,threshold,test):
    """Score genes with ``h``, project the selected ones on one PCA axis, fit Cox on the test set.

    Genes whose score from ``h(train)`` exceeds 1 are selected; their columns
    are named ``V<index>``. The scaler and the one-component PCA are fitted on
    ``train`` and applied to ``test``. The projected value becomes covariate
    ``w`` of a Cox model fitted on ``test``.

    NOTE(review): ``threshold`` is never read; the cut-off is the literal 1.
    Returns whatever ``CoxPHFitter.print_summary()`` returns.
    """
    scores = pd.DataFrame(h(train), index=np.array(range(1, 21149)))
    selected_idx = scores.index[scores.iloc[:, 0] > 1].tolist()
    candidate_genes = ['V{0}'.format(gene_idx) for gene_idx in selected_idx]

    # Standardise with statistics learnt on the training set only.
    scaler = preprocessing.StandardScaler()
    train_scaled = scaler.fit_transform(train.loc[:, candidate_genes])
    test_scaled = scaler.transform(test.loc[:, candidate_genes])

    projector = sklearnPCA(n_components=1)
    X_train_pca = projector.fit_transform(train_scaled)
    X_test_pca = projector.transform(test_scaled)
    eigen_val = projector.explained_variance_  # read but not used below

    # assign() returns a new frame, so the column write below does not touch
    # the caller's `test`.
    placeholder = pd.Series(np.ones(len(test.patient_id)))
    test = test.assign(w=placeholder)
    test['w'] = X_test_pca
    survival_columns = ['event_free_survival_time_days', 'death', 'w']
    testset_surv = test[survival_columns]

    cph = CoxPHFitter()
    cph.fit(testset_surv,
            'event_free_survival_time_days',
            event_col='death')

    return cph.print_summary()
Exemplo n.º 7
0
def cox(d_male, d_female):
    """Fit a Cox model with ``sex`` (0 = male, 1 = female) as the only covariate.

    Both inputs are sequences of durations; every observation is treated as
    an observed event. Prints the two group sizes and the model summary.
    """
    groups = []
    for sex_code, durations in enumerate((d_male, d_female)):
        groups.append(
            pd.DataFrame({
                "time": durations,
                "event": 1,
                "sex": sex_code
            }))
    df = pd.concat(groups)

    print(len(d_male), len(d_female))
    cph = CoxPHFitter()
    cph.fit(df, duration_col="time", event_col="event")
    cph.print_summary()
Exemplo n.º 8
0
def test_cox(phenotype_dataset, survival_dataset, pheno_survival_integrated,
             filtered_list, filter_type, filter_na_by_rows):
    """Build a phenotype + survival frame, filter its columns and fit a Cox model.

    Written to run on both Python 2 and Python 3: single-argument ``print()``
    calls, ``dict.items()`` and keyword ``axis=`` in ``DataFrame.drop``.

    Args:
        phenotype_dataset: Table whose first row holds the phenotype headers.
        survival_dataset: Table whose first row holds the survival headers
            (columns from position 4 onwards are used).
        pheno_survival_integrated: Mapping id -> list of values matching the
            headers after the id column.
        filtered_list: Column names to keep (``FILTER_IN``) or drop
            (``FILTER_OUT``).
        filter_type: ``FILTER_IN`` or ``FILTER_OUT``.
        filter_na_by_rows: With ``FILTER_IN``, keep columns that contain NaN
            and drop the affected rows instead.

    Returns:
        The return value of ``CoxPHFitter.print_summary()``.
    """
    headers = phenotype_dataset[0][0:1] + phenotype_dataset[0][
        pheno_start:pheno_limit] + survival_dataset[0][4:]
    pandas.set_option("mode.use_inf_as_na", True)
    df = pandas.DataFrame(
        columns=headers,
        data=[[k] + v for k, v in pheno_survival_integrated.items()])
    decode_categorical_values(df)

    for cur_header in headers[1:]:
        if filter_type == FILTER_IN and cur_header not in filtered_list:
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it's not filtered in".format(
                cur_header))
            continue

        if filter_type == FILTER_OUT and cur_header in filtered_list:
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it has low variance".format(
                cur_header))
            continue

        has_nan = df[[cur_header]].isnull().values.any()
        if has_nan and (not filter_na_by_rows or filter_type == FILTER_OUT):
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it has NaN values".format(
                cur_header))
            continue

        try:
            df[[cur_header]] = df[[cur_header]].apply(pandas.to_numeric)
            print("column {} has numeric values".format(cur_header))
        except ValueError:
            # Non-numeric column: replace it with integer category codes.
            print("{} cannot be converted to numeric. converting to categorical instead".format(
                cur_header))
            df[cur_header] = df[cur_header].astype('category')
            df[cur_header] = df[cur_header].cat.codes

    if filter_na_by_rows and filter_type == FILTER_IN:
        print("remove NaN values by row")
        df.dropna(inplace=True)

    # The id column is not a covariate.
    df = df.drop(headers[0], axis=1)
    print("shape : {}".format(df.shape))

    cph = CoxPHFitter()
    cph.fit(df,
            duration_col='_OS',
            event_col='_OS_IND',
            show_progress=False,
            step_size=0.001)
    return cph.print_summary()  # access the results using cph.summary
Exemplo n.º 9
0
def test_cox(phenotype_dataset, gene_expression_top_var_headers_columns,
             survival_dataset, pheno_survival_integrated, filtered_list,
             filter_type, filter_na_by_rows):
    """Build a gene-expression + survival frame, filter its columns and fit a Cox model.

    Written to run on both Python 2 and Python 3: single-argument ``print()``
    calls, ``dict.items()`` and keyword ``axis=`` in ``DataFrame.drop``.

    Args:
        phenotype_dataset: Not used in this function.
        gene_expression_top_var_headers_columns: Gene column names; the first
            ``pheno_limit`` are used.
        survival_dataset: Table whose first row holds the survival headers
            (columns from position 4 onwards are used).
        pheno_survival_integrated: Mapping id -> list of values matching the
            headers after the id column.
        filtered_list: With ``FILTER_IN``, the part of a column name before
            the first "." must be in this list; with ``FILTER_OUT``, columns
            named in it are dropped.
        filter_type: ``FILTER_IN`` or ``FILTER_OUT``.
        filter_na_by_rows: With ``FILTER_IN``, keep columns that contain NaN
            and drop the affected rows instead.

    Returns:
        The return value of ``CoxPHFitter.print_summary()``.
    """
    headers = ["ids"] + list(
        gene_expression_top_var_headers_columns[:pheno_limit]) + list(
            survival_dataset[0][4:])
    pandas.set_option("mode.use_inf_as_na", True)
    df = pandas.DataFrame(
        columns=headers,
        data=[[k] + v for k, v in pheno_survival_integrated.items()])

    for cur_header in headers[1:]:
        if filter_type == FILTER_IN and cur_header.split(
                ".")[0] not in filtered_list:
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it's not filtered in".format(
                cur_header))
            continue

        if filter_type == FILTER_OUT and cur_header in filtered_list:
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it has low variance".format(
                cur_header))
            continue

        has_nan = df[[cur_header]].isnull().values.any()
        if has_nan and (not filter_na_by_rows or filter_type == FILTER_OUT):
            df = df.drop(cur_header, axis=1)
            print("column {} was dropped as it has NaN values".format(
                cur_header))
            continue

        try:
            df[[cur_header]] = df[[cur_header]].apply(pandas.to_numeric)
            print("column {} has numeric values".format(cur_header))
        except ValueError:
            # Non-numeric column: replace it with integer category codes.
            print("{} cannot be converted to numeric. converting to categorical instead".format(
                cur_header))
            df[cur_header] = df[cur_header].astype('category')
            df[cur_header] = df[cur_header].cat.codes

    if filter_na_by_rows and filter_type == FILTER_IN:
        print("remove NaN values by row")
        df.dropna(inplace=True)

    # The id column is not a covariate.
    df = df.drop(headers[0], axis=1)

    print("shape : {}".format(df.shape))
    cph = CoxPHFitter()
    cph.fit(df,
            duration_col='_OS',
            event_col='_OS_IND',
            show_progress=True,
            step_size=0.0001)
    return cph.print_summary()  # access the results using cph.summary
Exemplo n.º 10
0
 def Cox_Label_HR(self):
     """Fit a Cox model on ``self.survival_label`` and store HR and CI of the first covariate.

     Reads ``self.survival_label``, ``self.duration_column`` and
     ``self.observed_column``. On success sets ``self.cox_report_for_HR``,
     ``self.HR`` (first hazard ratio) and ``self.CI`` (first row of the
     confidence intervals). On failure prints a message and leaves the
     attributes untouched.
     """
     cph = CoxPHFitter()
     try:
         cph.fit(self.survival_label,
                 self.duration_column,
                 event_col=self.observed_column)
         # NOTE(review): print_summary() prints its report; the value stored
         # here is likely None - confirm whether cph.summary was intended.
         self.cox_report_for_HR = cph.print_summary()
         # .iloc: positional access on a label-indexed Series; plain [0]
         # relies on a deprecated pandas fallback.
         self.HR = cph.hazard_ratios_.iloc[0]
         self.CI = cph.confidence_intervals_.values[0]
     except Exception as exc:
         # Was a bare `except:` that also swallowed KeyboardInterrupt and
         # SystemExit; stay best-effort but show what actually failed.
         print("The truncation has problem. ")
         print(repr(exc))
def main():
    """Merge DHS mother records with rainfall data and fit a Cox model.

    The two CSV paths come from the command line. Rainfall rows are
    left-joined on ``DHSID`` and ``Year``, helper columns are dropped, the
    boolean percentile flags become 0/1, and the model summary is printed.
    """
    cmd_args = commandLineParser()

    rainfall_df = pd.read_csv(cmd_args.rainfall_data)
    mother_df = pd.read_csv(cmd_args.DHS_data)

    merged = mother_df.merge(rainfall_df, on=['DHSID', 'Year'], how='left')
    merged = merged.set_index('IDHSPID')

    # Join keys and raw rainfall figures are not covariates.
    not_needed = ['DHSID', 'Year', r'%-ile', 'Total Rainfall (mm)']
    present = [name for name in merged.columns if name in not_needed]
    merged = merged.drop(columns=present)

    # Percentile flags: True -> 1, anything else -> 0.
    for flag in (r'<5%-ile', r'<10%-ile', r'<15%-ile'):
        merged[flag] = (merged[flag] == True).astype(int)

    cph = CoxPHFitter()
    cph.fit(merged, 'Event Time', event_col='Event Occured')
    cph.print_summary()
Exemplo n.º 12
0
def fitcoxmodel(classification, T, E, pid, verbose=True):
    """Fit a one-covariate Cox model and return its coefficient statistics.

    Args:
        classification: Covariate values (stored as column ``Cov``).
        T: Durations.
        E: Event indicators.
        pid: Index labels for the rows.
        verbose: Print the lifelines summary when True.

    Returns:
        ``(coef, [lower 0.95, upper 0.95], p)`` for ``Cov``, read from
        ``cph.summary``.
    """
    frame = pd.DataFrame(data={'T': T, 'E': E, 'Cov': classification},
                         index=pid)

    cph = CoxPHFitter()
    cph.fit(frame, duration_col='T', event_col='E')
    if verbose:
        cph.print_summary()

    summary = cph.summary
    lower, upper = summary['lower 0.95']['Cov'], summary['upper 0.95']['Cov']
    return summary['coef']['Cov'], [lower, upper], summary['p']['Cov']
Exemplo n.º 13
0
def coxph_smoke():
    """Smoke test: H2O's Cox estimator against lifelines on the Rossi data.

    Checks that the H2O model gets an id, reports the expected formula,
    predicts one row per input row, and reaches a concordance within 0.001
    of the lifelines fit.
    """
    rossi = load_rossi()

    cph = CoxPHFitter()
    cph.fit(rossi, duration_col='week', event_col='arrest')
    cph.print_summary()

    rossi_frame = h2o.H2OFrame(rossi)
    predictors = ["age", "fin", "race", "wexp", "mar", "paro", "prio"]
    h2o_model = H2OCoxProportionalHazardsEstimator(stop_column="week")
    h2o_model.train(x=predictors, y="arrest", training_frame=rossi_frame)

    assert h2o_model.model_id != ""

    expected_formula = "Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio"
    actual_formula = h2o_model.formula()
    assert actual_formula == expected_formula, \
        "Expected formula to be '" + expected_formula + "' but it was " + actual_formula

    predictions = h2o_model.predict(test_data=rossi_frame)
    assert len(predictions) == len(rossi)

    h2o_metrics = h2o_model.model_performance(rossi_frame)
    lifelines_concordance = concordance_for_lifelines(cph)

    assert abs(lifelines_concordance - h2o_metrics.concordance()) < 0.001
Exemplo n.º 14
0
def cox_regression_experiment():
    """Fit a heavily penalised Cox model on the visit features and cross-validate it.

    Loads the feature and label arrays from the working directory, flattens
    them into a 94-column table (column '0' is the duration, column '93' the
    event), dumps the table to ``allPatient_now.csv``, fits the model and
    prints 5-fold cross-validation scores with their mean and standard
    deviation.
    """
    raw_features = np.load('pick_5_visit_features_merge_1.npy')
    dynamic_features = raw_features[0:2100, :, :-2]
    # NOTE(review): the converted array is discarded, so this line has no
    # effect on `dynamic_features`; kept to preserve behaviour.
    dynamic_features.astype(np.int32)

    raw_labels = np.load('pick_5_visit_labels_merge_1.npy')
    labels = raw_labels[:, :, -4].reshape(-1, dynamic_features.shape[1], 1)

    stacked = np.concatenate((dynamic_features, labels), axis=2)
    data_set = pd.DataFrame(stacked.reshape(-1, 94))
    # lifelines is addressed by column name below, so use string labels.
    data_set.columns = [str(label) for label in data_set.columns.values]

    np.savetxt('allPatient_now.csv', data_set, delimiter=',')
    print(list(data_set.columns.values))

    cph = CoxPHFitter(penalizer=100)
    cph.fit(data_set, duration_col='0', event_col='93', show_progress=True)
    cph.print_summary()

    scores = k_fold_cross_validation(cph, data_set, '0', event_col='93', k=5)
    for report in (scores, np.mean(scores), np.std(scores)):
        print(report)
Exemplo n.º 15
0
def _rows_by_index(frame, labels, keep=True):
    """Return the rows of *frame* whose index is in *labels* (not in, when keep=False)."""
    mask = frame.index.isin(labels)
    return frame.loc[mask if keep else ~mask]


def main(filter_gap_days, rp_vs_radiation):
    """Assemble the Cox data set for the death outcome, fit it and export it.

    Changes from the previous version:
      * the leftover ``breakpoint()`` calls are gone, so the pipeline runs
        unattended;
      * rows are selected with boolean masks instead of ``.loc[set(...)]``,
        which pandas >= 2.0 rejects (row order now follows the source frame);
      * ``pr_date_minus_biopsy_date`` is created up front so the filter works
        even when no radiation patient has a biopsy date;
      * the unused ``empi_to_rp_date_dic`` pickle is no longer loaded.

    Args:
        filter_gap_days: Maximum number of days between biopsy and treatment
            (RP or first radiation) for a patient to count as treated.
        rp_vs_radiation: True compares RP patients with radiation-only
            patients; False compares treated (RP or radiation) patients with
            the rest.

    Side effects:
        Prints column sums and the model summary, and writes
        ``../data/df_cox_data_death_causal_inference[_rp_vs_radiation].csv``.
    """
    # Biopsy date per EMPI.
    # NOTE(security): pickle.load can run arbitrary code - only use trusted files.
    with open('../data/empi_to_date_oi_dic.pkl', 'rb') as handle:
        empi_to_date_oi_dic = pickle.load(handle)

    # --- radiation-only patients: days from biopsy to first radiation ---
    df_first_date_rads = pd.read_csv(
        '../data/processed_data/first_date_rads.csv')
    df_first_date_rads.set_index('EMPI', inplace=True)
    if 'pr_date_minus_biopsy_date' not in df_first_date_rads.columns:
        df_first_date_rads['pr_date_minus_biopsy_date'] = np.nan
    for empi, pr_date in zip(df_first_date_rads.index,
                             df_first_date_rads.prdate_parsed.values):
        if empi in empi_to_date_oi_dic:
            df_first_date_rads.at[empi, 'pr_date_minus_biopsy_date'] = (
                pd.to_datetime(pr_date) - empi_to_date_oi_dic[empi]).days
    gap_days = df_first_date_rads.pr_date_minus_biopsy_date
    radiation_empis = set(
        df_first_date_rads.index[(gap_days > 0)
                                 & (gap_days <= filter_gap_days)])

    df_multirp = pd.read_csv('../data/processed_data/multirp.csv')
    df_multirp.set_index('EMPI', inplace=True)
    multirp_empis = set(df_multirp.index)

    # --- feature and outcome tables ---
    df_merged_comorb = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_C.csv')
    df_merged_comorb.set_index(df_merged_comorb.columns[0], inplace=True)
    df_merged_comorb.index.name = 'EMPI'

    df_merged_psa_prior = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_E.csv')
    df_merged_psa_prior.set_index(df_merged_psa_prior.columns[0], inplace=True)
    df_merged_psa_prior.index.name = 'EMPI'

    df_outcome_oi = pd.read_csv(
        '../data/df_outcome_final_biopsy_based_clean.csv')
    df_outcome_oi.set_index('EMPI', inplace=True)

    # --- cohort definition ---
    df_outcome_oi_filtered_rp_postive = df_outcome_oi.loc[
        df_outcome_oi.rp_date_minus_biopsy_date_in_days <= filter_gap_days]
    df_outcome_oi_rp_negative = df_outcome_oi.loc[
        df_outcome_oi.rp_date.isnull()]
    if rp_vs_radiation:
        # RP arm: exclude patients who also received radiation.
        df_outcome_oi_filtered_rp_postive = _rows_by_index(
            df_outcome_oi_filtered_rp_postive, radiation_empis, keep=False)
        # Radiation arm: radiation patients without multiple RPs.
        df_outcome_oi_rp_negative = _rows_by_index(
            df_outcome_oi_rp_negative, radiation_empis - multirp_empis)
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_filtered_rp_postive, df_outcome_oi_rp_negative])
    else:  # treated (radiation and rp) vs AS
        df_outcome_oi_treated_postive = pd.concat([
            df_outcome_oi_filtered_rp_postive,
            _rows_by_index(df_outcome_oi_rp_negative, radiation_empis)
        ])
        df_outcome_oi_treated_negative = _rows_by_index(
            df_outcome_oi_rp_negative, radiation_empis, keep=False)
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_treated_postive, df_outcome_oi_treated_negative])

    # Keep strictly positive follow-up times only.
    df_outcome_oi_filtered_final = df_outcome_oi_filtered_total.loc[
        df_outcome_oi_filtered_total.time_to_death_in_month > 0]

    # --- merge comorbidity data and PSA prior data ---
    empi_common = set(df_merged_comorb.index) & set(df_merged_psa_prior.index)
    df_merged_psa_comorb = pd.concat([
        _rows_by_index(df_merged_comorb, empi_common),
        _rows_by_index(df_merged_psa_prior, empi_common).psa_prior_to_rp
    ],
                                     axis=1)
    df_merged_psa_comorb.drop(columns=['wscore_agg'], inplace=True)

    # --- merge outcome and feature frames ---
    empi_common_outcome = set(df_outcome_oi_filtered_final.index) & set(
        df_merged_psa_comorb.index)
    outcome_cols_oi = ['death_ind', 'time_to_death_in_month']
    df_cox = pd.concat([
        _rows_by_index(df_merged_psa_comorb, empi_common_outcome),
        _rows_by_index(df_outcome_oi_filtered_final,
                       empi_common_outcome)[outcome_cols_oi]
    ],
                       axis=1)

    if not rp_vs_radiation:
        # Radiation patients count as treated as well.
        df_cox.loc[df_cox.index.isin(df_outcome_oi_treated_postive.index),
                   'rp_indicator'] = 1
        df_cox.rename(columns={'rp_indicator': 'treated'}, inplace=True)

    # drop uninformative and extreme minority features
    drop_cols = ['benign', 'Unknown/other']  # pre-filtering
    # remove metastatic cancer patients
    df_cox = df_cox.loc[df_cox.metacanc_agg == 0]
    if rp_vs_radiation:
        drop_cols_non_informative = [
            'metacanc_agg', 'rheumd_agg', 'copd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'rend_agg', 'aids_agg'
        ]
    else:
        drop_cols_non_informative = [
            'metacanc_agg', 'cevd_agg', 'rheumd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'aids_agg'
        ]
    df_cox_final = df_cox.drop(columns=drop_cols + drop_cols_non_informative)

    # Standardise the continuous covariates (population standard deviation).
    standardize_cols = ['Age at RP', 'auxiiliary_mci_score', 'psa_prior_to_rp']
    for col in standardize_cols:
        df_cox_final[col] = (df_cox_final[col].values - np.mean(
            df_cox_final[col])) / np.std(df_cox_final[col].values)

    print('Final cox df stats : ')
    print(df_cox_final.sum())

    cph = CoxPHFitter(penalizer=0.00, l1_ratio=0)
    cph.fit(df_cox_final,
            'time_to_death_in_month',
            'death_ind',
            show_progress=False,
            step_size=0.1)
    cph.print_summary()

    # rename columns for downstream task compatibility
    df_cox_final.rename(columns={
        'death_ind': 'death',
        'rp_indicator': 'rp',
        'time_to_death_in_month': 'survtime'
    },
                        inplace=True)
    if rp_vs_radiation:
        df_cox_final.to_csv(
            '../data/df_cox_data_death_causal_inference_rp_vs_radiation.csv')
    else:
        df_cox_final.to_csv('../data/df_cox_data_death_causal_inference.csv')
    return
Exemplo n.º 16
0
Created on Sat Feb  6 20:47:27 2016

@author: Rahul Ahuja
"""
from lifelines import CoxPHFitter
cf = CoxPHFitter()
import pandas as pd
import numpy as np
import grade

# Load the pre-processed data from a hard-coded local path.
# NOTE(review): the path is machine-specific; index_col=False presumably keeps
# the first CSV column as data rather than as the index - confirm.
ProcessedData = pd.read_csv('C:/Users/abmm832/Downloads/ProcessedData.csv',
                            index_col=False)

# Fit the Cox model: 'Time' is the duration column, 'loan_status' the event column.
cf.fit(ProcessedData, 'Time', event_col='loan_status')

# Print the fitted model's summary.
cf.print_summary()


# This function should have been moved into a module as well; so far that has only been done for the loan grade module.
def get_non_negative_int(prompt):
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("Invalid entry")
            continue

        if value < 0:
            print("Invalid entry")
            continue
        else:
import pandas as pd
from lifelines import WeibullAFTFitter, CoxPHFitter

# This is an implementation of https://uwspace.uwaterloo.ca/bitstream/handle/10012/10265/Cook_Richard-10265.pdf

# Simulation settings: sample size, probability used for Z, and the log-scale
# effects of X and Z.
# NOTE(review): `np` and `weibull_min` are not imported in the visible part of
# this script - confirm they are imported elsewhere.
N = 50000
p = 0.5
bX = np.log(0.5)
bZ = np.log(4)

# Z and X are binary draws; X_ is a normal draw centred at 20000 with scale 10.
Z = np.random.binomial(1, p, size=N)
X = np.random.binomial(1, 0.5, size=N)
X_ = 20000 + 10 * np.random.randn(N)

# Weibull noise with shape 1 and scale 1.
W = weibull_min.rvs(1, scale=1, loc=0, size=N)

# Log-duration is linear in X and Z plus log-noise; T is the duration itself.
Y = bX * X + bZ * Z + np.log(W)
T = np.exp(Y)

#######################################

# Z enters Y but is left out of the model frame; x_ is in the frame but does
# not enter Y.
df = pd.DataFrame({"T": T, "x": X, "x_": X_})


# Weibull AFT fit on the same frame; summary printed with 4 decimals.
wf = WeibullAFTFitter().fit(df, "T")
wf.print_summary(4)


# Cox fit on the same frame with a full Newton step (step_size=1.0).
cph = CoxPHFitter().fit(df, "T", show_progress=True, step_size=1.0)
cph.print_summary(4)
def _assert_baseline_frames_match(py_frame, h2o_frame):
    """Print a lifelines baseline frame next to its H2O counterpart and assert they are equal."""
    # H2O column names are strings; make the lifelines labels comparable.
    py_frame.rename(columns=str, inplace=True)

    py_reordered = py_frame.reset_index(drop=True).sort_index(axis=1)
    h2o_reordered = h2o_frame.drop(
        't', axis="columns").reset_index(drop=True).sort_index(axis=1)
    py_reordered = fix_py_result_for_older_lifelines(py_reordered)

    print("h2o:")
    print(h2o_frame.reset_index(drop=True))

    print("lifelines:")
    print(py_reordered.reset_index(drop=True))

    assert_frame_equal(py_reordered,
                       h2o_reordered,
                       check_dtype=False,
                       check_index_type=False,
                       check_column_type=False)


def check_cox(rossi, x, stratify_by, formula):
    """Compare H2O's Cox estimator with lifelines on the Rossi data.

    Args:
        rossi: pandas DataFrame with ``week`` and ``arrest`` columns; the
            stratification columns are converted to categoricals in place.
        x: Predictor column names passed to H2O.
        stratify_by: Stratification columns; an empty list or None means no
            stratification.
        formula: Formula string the H2O model is expected to report.

    Raises:
        AssertionError: if the model id is empty, the formula differs, the
            prediction length differs, concordance differs by 0.001 or more,
            or the baseline hazard / survival frames do not match.
    """
    # Tolerate None as well as an empty list for "no strata".
    strata_cols = list(stratify_by) if stratify_by else []

    if strata_cols:
        cph_py = CoxPHFitter(strata=stratify_by)
    else:
        cph_py = CoxPHFitter()

    for col in strata_cols:
        rossi[col] = rossi[col].astype('category')

    cph_py.fit(rossi, duration_col='week', event_col='arrest')
    cph_py.print_summary()
    rossi_h2o = h2o.H2OFrame(rossi)

    for col in strata_cols:
        rossi_h2o[col] = rossi_h2o[col].asfactor()

    cph_h2o = H2OCoxProportionalHazardsEstimator(stop_column="week",
                                                 stratify_by=stratify_by)
    cph_h2o.train(x=x, y="arrest", training_frame=rossi_h2o)

    assert cph_h2o.model_id != ""
    assert cph_h2o.formula() == formula, "Expected formula to be '" + formula + "' but it was " + cph_h2o.formula()

    predH2O = cph_h2o.predict(test_data=rossi_h2o)
    assert len(predH2O) == len(rossi)
    metrics_h2o = cph_h2o.model_performance(rossi_h2o)
    concordance_py = concordance_for_lifelines(cph_py)
    assert abs(concordance_py - metrics_h2o.concordance()) < 0.001

    _assert_baseline_frames_match(
        cph_py.baseline_hazard_,
        cph_h2o.baseline_hazard_frame.as_data_frame(use_pandas=True))

    _assert_baseline_frames_match(
        cph_py.baseline_survival_,
        cph_h2o.baseline_survival_frame.as_data_frame(use_pandas=True))
Exemplo n.º 19
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    # Timing script: fit CoxPHFitter in batch mode on 20 stacked copies of
    # the Rossi data set and report the wall-clock time of the fit.
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    df = pd.concat([load_rossi()] * 20)
    # Optional variant: replace the durations with random exponential draws.
    # df['week'] = np.random.exponential(1, size=df.shape[0])

    cp = CoxPHFitter()
    start_time = time.time()
    cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
    cp.print_summary(4)
# Mean test score over the cross-validation results in `cv`.
print(cv['test_score'].mean())

# Feature importance: one table per cross-validation estimator, sorted from
# most to least important.
for idx,estimator in enumerate(cv['estimator']):
    print("Features sorted by their score for estimator {}:".format(idx))
    feature_importances = pd.DataFrame(estimator.feature_importances_,
                                       index = X.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
    print(feature_importances)

# Fit an ordinary least squares (OLS) regression of y on X.
# NOTE(review): this was labelled "Logistic Regression", but the code fits
# sm.OLS - confirm which model is intended.
model = sm.OLS(y, X)
result = model.fit()
print("regression results:\n", result.summary())


# Columns used by the Cox proportional hazards model: duration ('Month'),
# event ('Status_F') and three covariates.
df=dat[['Month', 'Status_F', 'Age(year)', 'Max.rotor.speed,RPM-Avg', 'active power-Avg']]
print("df=",df )

# Survival analysis: Cox proportional hazards model.
cph = CoxPHFitter()   # instantiate the fitter
cph.fit(df , 'Month', 'Status_F')   # 'Month' is the duration, 'Status_F' the event
cph.print_summary()    # inspect the significance of the features
cph.plot()

# Survival curves for rows 19-26, using only the covariate columns
# (position 2 onwards).
tr_rows = df.iloc[19:27, 2:]
cph.predict_survival_function(tr_rows).plot()
plt.show()
# Kaplan-Meier curves, log-rank test and an age-only Cox model, split by
# neoadjuvant treatment status. Runs on Python 2 and 3: print() is called
# with a single argument, and the removed `.ix` indexer is replaced by `.loc`
# (boolean row mask + column label).
tx = df['history_of_neoadjuvant_treatment'] == 'Yes'
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col],
         event_observed=df.loc[tx, censor_col],
         label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True, ci_show=False)


kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col],
         event_observed=df.loc[~tx, censor_col],
         label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True, ci_show=False)

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title ('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the treated and untreated groups.
results = logrank_test(df.loc[tx, survival_col],
                       df.loc[~tx, survival_col],
                       df.loc[tx, censor_col],
                       df.loc[~tx, censor_col],
                       alpha=.99)
results.print_summary()

# Cox model with age at diagnosis as the only covariate; rows with a missing
# age are dropped first.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()

scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
print(scores)
print('Mean score {}'.format(np.mean(scores)))
print('Std {}'.format(np.std(scores)))
 
Exemplo n.º 22
0
def _read_empi_indexed_csv(path):
    """Read a CSV whose unnamed first column holds the EMPI and index by it."""
    frame = pd.read_csv(path)
    frame.set_index('Unnamed: 0', inplace=True)
    frame.index.name = 'EMPI'
    return frame


def main(outcome_oi, run_cox=True):
    """Build the Cox regression table for the RP-positive cohort and fit it.

    Merges the feature table with selected comorbidity columns, one-hot
    encodes the pT stage, bins the RP procedure year, joins the requested
    outcome columns, writes the combined table to ``../data`` and, unless
    disabled, fits an L1-penalised ``CoxPHFitter`` and writes its summary.

    Parameters
    ----------
    outcome_oi : str
        ``'bcr'`` selects the ``bcr_ind`` / ``time_to_bcr_in_month`` columns;
        any other value selects the ``death_ind`` /
        ``time_to_death_in_month`` columns.
    run_cox : bool, optional
        When False, only the data table is exported and no model is fitted.
        Defaults to True.

    Raises
    ------
    ValueError
        If an RP procedure year falls outside every bin in ``rp_date_range``.
    """
    # get RP procedure date
    with open('../data/empi_to_rp_date_dic.pkl', 'rb') as handle:
        empi_to_rp_date_dic = pickle.load(handle)
    # rp date min : 1992-8-5, rp date max : 2020-7-29
    rp_date_range = [
        np.arange(1992, 1998),
        np.arange(1998, 2004),
        np.arange(2004, 2009),
        np.arange(2009, 2015),
        np.arange(2015, 2021)
    ]

    # get relevant data
    data_merged = _read_empi_indexed_csv(
        '../data/merged/df_merged_rp_positive_based_E.csv')
    data_merged_comorb = _read_empi_indexed_csv(
        '../data/merged/df_merged_rp_positive_based_C.csv')

    # Keep only EMPIs present in both tables.  An Index (not a set) is used
    # because pandas >= 2.0 rejects sets as .loc indexers.
    empis_oi = data_merged.index.intersection(data_merged_comorb.index)
    comorbs_oi = [
        'ami_agg', 'chf_agg', 'pvd_agg', 'cevd_agg', 'copd_agg', 'rheumd_agg',
        'pud_agg', 'mld_agg', 'diab_agg', 'diabwc_agg', 'hp_agg', 'rend_agg',
        'metacanc_agg', 'aids_agg'
    ]
    data_merged = pd.concat(
        [data_merged.loc[empis_oi],
         data_merged_comorb.loc[empis_oi][comorbs_oi]],
        axis=1)

    # process pt stage: start every indicator column at 0 ...
    pt_stage_cols = [
        'pt1', 'pt1a', 'pt1b', 'pt1c', 'pt2', 'pt2a', 'pt2b', 'pt2c', 'pt3',
        'pt3a', 'pt3b', 'pt3c', 'pt4'
    ]
    for stage_col in pt_stage_cols:
        data_merged[stage_col] = 0

    # ... then flag each row's own stage.  A stage value without a column in
    # pt_stage_cols ends up in a column named after that value.
    for empi, pt_stage in zip(data_merged.index,
                              data_merged.pT_stage_combined.values):
        data_merged.at[empi, pt_stage] = 1

    # get rp procedure date, encoded as the index of its year bin
    rp_date_list = []
    for empi in data_merged.index:
        rp_year = empi_to_rp_date_dic[empi].year
        for yr_idx, yr_range in enumerate(rp_date_range):
            if rp_year in yr_range:
                rp_date_list.append(yr_idx)
                break
        else:
            # Without this the list would silently come up short and the
            # column assignment below would fail with an opaque length error.
            raise ValueError(
                'RP year %s for EMPI %s is outside every rp_date_range bin'
                % (rp_year, empi))
    data_merged['rp_date'] = rp_date_list

    # load outcome :
    df_outcome = pd.read_csv('../data/df_outcome_rp_positive_final.csv')
    df_outcome.set_index('EMPI', inplace=True)

    empis_oi = df_outcome.index.intersection(data_merged.index)
    df_outcome_oi = df_outcome.loc[empis_oi]
    df_merged_data_oi = data_merged.loc[empis_oi]

    print('Feature stats in the cohort of interest : ')
    print(df_merged_data_oi.sum())

    drop_pt_stage = ['pt1', 'pt1a', 'pt1b', 'pt1c', 'pt3c', 'pt4', 'pt3']
    drop_other = ['Unknown/other', 'Asian']
    drop_cols = drop_pt_stage + drop_other
    # Rebind instead of dropping in place on a .loc selection.
    df_merged_data_oi = df_merged_data_oi.drop(columns=drop_cols)

    print('New featur stats in the cohort of interest : ')
    print(df_merged_data_oi.sum())

    # Decide the outcome once so that column selection, export and model
    # fitting always agree ('bcr' -> bcr, anything else -> death).
    if outcome_oi == 'bcr':
        event_col, duration_col = 'bcr_ind', 'time_to_bcr_in_month'
        export_path = '../data/df_cox_data_bcr.csv'
        result_path = 'cox_result_bcr.csv'
    else:
        event_col, duration_col = 'death_ind', 'time_to_death_in_month'
        export_path = '../data/df_cox_data_death.csv'
        result_path = 'cox_result_death.csv'

    df_outcome_oi = df_outcome_oi[[event_col, duration_col]]
    df_cox_data = pd.concat([df_merged_data_oi, df_outcome_oi], axis=1)
    print('\n')
    print('Exporting data...')
    df_cox_data.to_csv(export_path)
    print('\n')

    if run_cox:
        cph = CoxPHFitter(l1_ratio=1, penalizer=0.01)
        # The raw stage label is dropped here; its indicator columns remain.
        cph.fit(df_cox_data.drop(columns=['pT_stage_combined']),
                duration_col, event_col)
        cph.print_summary()
        print(cph.summary)
        cph.summary.round(3).to_csv(result_path)
Exemplo n.º 23
0
# Plot the fitted Kaplan-Meier survival function on the existing axes.
ax = kmf.survival_function_.plot(ax=ax)
ax.set_title('Survival function')
plt.show()

# Same estimate again, this time through kmf.plot on a fresh figure.
fig, ax = plt.subplots()
ax = kmf.plot(ax=ax)
ax.set_title('Survival with confidence intervals')
plt.show()

from lifelines import CoxPHFitter
cph = CoxPHFitter()
# Columns handed to the model: the covariates plus the duration ('stop')
# and event ('event') columns.
cph_bladder_df = bladder[['rx', 'number', 'size', 'enum', 'stop', 'event']]
cph.fit(cph_bladder_df, duration_col='stop', event_col='event')
# Output the coefficients.  print_summary() prints the table itself, so it is
# not wrapped in print(), which would add a stray "None" line.
cph.print_summary()

# Fit one Kaplan-Meier estimator per treatment group (rx == 1 and rx == 2).
rx1 = bladder.loc[bladder['rx'] == 1]
rx2 = bladder.loc[bladder['rx'] == 2]
kmf1 = KaplanMeierFitter()
kmf1.fit(rx1['stop'], event_observed=rx1['event'])

kmf2 = KaplanMeierFitter()
kmf2.fit(rx2['stop'], event_observed=rx2['event'])

# Draw both groups' log-log plots on shared axes for comparison.
fig, axes = plt.subplots()
kmf1.plot_loglogs(ax=axes)
kmf2.plot_loglogs(ax=axes)

axes.legend(['rx1', 'rx2'])
plt.show()
Exemplo n.º 24
0
    # position in the sorted array of times while setting other
    # positions to 0 so that the cumsum operation will result
    # in each of the positions having the same sum of risks
    for i in range(time.shape[0] - 1, 0, -1):
        # Going from smallest survival times to largest
        if time[i] == time[i - 1]:
            # Push risk to the later time (earlier in array position)
            risk[i - 1] = risk[i - 1] + risk[i]
            risk[i] = 0
    event = K.gather(y_true[:, 1], indices=sorting.indices)
    denom = K.cumsum(risk)
    terms = xbeta - K.log(denom)
    loglik = K.cast(event, dtype=terms.dtype) * terms
    return -K.sum(loglik)


# Compile model, using the custom neg_log_pl function as the training loss
model.compile(optimizer="adam", loss=neg_log_pl)

# Fit model with the whole dataset as a batch, since the
# partial likelihood depends on all observations
# (batch_size=n -- presumably n is the number of rows in X; TODO confirm)
model.fit(X, y, batch_size=n, epochs=3000)

# Compare to Cox model
cph = CoxPHFitter()
# CoxPHFitter uses Efron's method for handling tied survival times,
# whereas neg_log_pl uses Breslow's method, so the likelihood
# functions being optimized are not exactly the same
cph.fit(kidtx, duration_col="time", event_col="death")
cph.print_summary(decimals=8)
# The returned weights are neither stored nor printed here; the value is only
# displayed when this line runs in an interactive session (e.g. a notebook).
model.get_weights()
Exemplo n.º 25
0
# Reduce the feature table with RandDropCorr (called with 0.8 as its second
# argument -- presumably a correlation threshold; TODO confirm against the
# helper) and relabel the result with the column names the helper returned.
all_features_drop_corr, de_corr_features = RandDropCorr(
    all_features_drop_low_var, 0.8)
all_features_drop_corr.columns = de_corr_features

# Join the reduced features with the survival columns side by side and drop
# the 'case_submitter_id' column so it is not used as a covariate.
all_features_reduced = pd.concat(
    [all_features_drop_corr, survival_df_filtered],
    axis=1).drop('case_submitter_id', axis=1)

# Penalised Cox model (penalizer=0.005, l1_ratio=0.9).
my_cph = CoxPHFitter(penalizer=0.005, l1_ratio=0.9)
# haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1).to_csv('truth_reg_vars.csv')
# my_cph.fit(haha.drop(['original_glszm_SizeZoneNonUniformity_1'],axis=1), duration_col = 'days_to_death', event_col='vital_status')
my_cph.fit(all_features_reduced,
           duration_col='days_to_death',
           event_col='vital_status')
my_cph.print_summary()

# Second fit on a copy with two named radiomics features removed.
haha = all_features_reduced.drop(
    ['original_glrlm_GrayLevelNonUniformityNormalized_1'], axis=1)
haha = haha.drop(['original_glcm_SumEntropy_1'], axis=1)

my_cph.fit(haha, duration_col='days_to_death', event_col='vital_status')
my_cph.print_summary()

# 10-fold cross-validation scored by concordance index.
# NOTE(review): this runs on all_features_reduced (the full reduced table),
# not on `haha` -- confirm that is the intended input.
scores = k_fold_cross_validation(my_cph,
                                 all_features_reduced,
                                 duration_col='days_to_death',
                                 event_col='vital_status',
                                 k=10,
                                 scoring_method="concordance_index")
# The mean is computed but not stored or printed; it is only displayed when
# run interactively.
np.mean(scores)
    cph_data['Husband_Race'] == 'Other Ethnic Groups', 0, 1)
# Encode the remaining categorical columns as integers for the Cox model.
cph_data['Couple_Race'] = np.where(cph_data['Couple_Race'] == 'Same-Race', 0,
                                   1)
cph_data.drop(['Abbreviation', 'State'], axis=1, inplace=True)
cph_data['Has_Children'] = np.where(
    cph_data['Has_Children'] == 'Have Children', 1, 0)
# Income bracket codes: 0 and 1 for the two named ranges, 2 for anything else.
cph_data['Household_Income_Range'] = np.where(
    cph_data['Household_Income_Range'] == '42,830$ - 44,765$', 0,
    np.where(cph_data['Household_Income_Range'] == '66,532$ - 70,303$', 1, 2))
print('---------CPH fitting starts her---------------- ')
# Fitting the model and plotting the corresponding prediction
cph = CoxPHFitter()
cph.fit(cph_data,
        duration_col='Duration',
        event_col='Divorce',
        show_progress=True)
cph.print_summary()
sns.set()
cph.plot()
plt.savefig(
    '/home/raed/Dropbox/INSE - 6320/Final Project/CPH_Coefficients_plot.pdf')
plt.show()

# Calculating the correlation between covariates in order to understand the
# relationship in the data
# calculate the correlation matrix
cph_correlation = cph_data.corr()
# Print the matrix that was just computed; the name `corr` printed previously
# is not defined anywhere in this snippet.
print(cph_correlation)
#sns.heatmap(pd.crosstab(data.Duration, data.Poverty_Percentage))
sns.pairplot(cph_correlation)
plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/CPH_Correlation.pdf')
plt.show()
Exemplo n.º 27
0
"""

# print cancer['T'].unique()
# print cancer['E'].unique()
# cancer = cancer.dropna()


# the '-1' term
# refers to not adding an intercept column (a column of all 1s).
# It can be added to the Fitter class.

# Covariance matrix of the columns, kept for inspection only.  Its rows are
# variables, not observations, so it is not valid input for a survival model.
covMatrix = cancer.cov()

cf = CoxPHFitter()
# Fit on the observation table itself: "T" is the duration column and "E" the
# event column.  (The model was previously fitted on covMatrix.)
# Extra parameter for categorical covariates: strata=catVar
cf.fit(cancer, "T", event_col="E")
cf.print_summary()

curve = cf.predict_survival_function(cancer)
curve.plot()
plt.show()
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("hazard coeff %s" % (cf.hazards_,))
print("baseline  %s" % (cf.baseline_hazard_,))

"""
scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3)
print scores
print np.mean(scores)
print np.std(scores)

"""
Exemplo n.º 28
0
def multivariate(df):
    """Fit a Cox proportional hazards model on ``df`` and print its summary.

    ``df`` is passed to ``CoxPHFitter.fit`` with ``'time'`` as the duration
    column and ``'status'`` as the event column; fitting progress is shown.
    Nothing is returned.
    """
    from lifelines import CoxPHFitter

    fit_options = {
        'duration_col': 'time',
        'event_col': 'status',
        'show_progress': True,
    }
    model = CoxPHFitter()
    model.fit(df, **fit_options)
    # The same results are also available on the fitter as `summary`.
    model.print_summary()
Exemplo n.º 29
0
def _cox_ci_and_roc(cph, data, x_cols, duration_col, event_col, pt):
    """Return (concordance index, surv_roc result) of ``cph`` on ``data``."""
    pred_X = cph.predict_partial_hazard(data[x_cols])
    pred_X.rename(columns={0: 'X'}, inplace=True)
    # A higher partial hazard means shorter expected survival, hence the
    # negated score passed to concordance_index.
    ci = concordance_index(data[duration_col], -pred_X, data[event_col])
    df = pd.concat([data[[event_col, duration_col]], pred_X], axis=1)
    roc = surv_roc(df, 'X', duration_col, event_col, pt=pt)
    return ci, roc


def surv_coxph(data_train,
               x_cols,
               duration_col,
               event_col,
               data_test=None,
               pt=None,
               show_extra=True):
    """Fit a Cox regression model and report its CI and AUC metrics.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Full survival data for train.
    x_cols : list of str
        Name of column indicating variables.
    duration_col : str
        Name of column indicating time.
    event_col : str
        Name of column indicating event.
    data_test : pandas.DataFrame
        Full survival data for test, default None.
    pt : float
        Predicted time for AUC.
    show_extra : bool
        When True (default), also print the coefficient, p-value and
        95% confidence interval of every covariate.

    Returns
    -------
    object
        Object of cox model in `lifelines.CoxPHFitter`.

    Examples
    --------
    >>> surv_coxph(train_data, ['x1', 'x2'], 'T', 'E', test_data, pt=5*12)
    """
    y_cols = [event_col, duration_col]
    cph = CoxPHFitter()
    cph.fit(data_train[x_cols + y_cols],
            duration_col=duration_col,
            event_col=event_col,
            show_progress=True)
    # CI and AUC (at pt) on the training data, and on the test data if given
    ci_train, roc_train = _cox_ci_and_roc(cph, data_train, x_cols,
                                          duration_col, event_col, pt)
    if data_test is not None:
        ci_test, roc_test = _cox_ci_and_roc(cph, data_test, x_cols,
                                            duration_col, event_col, pt)
    # Print Summary of CPH
    # (single-argument print(...) calls behave the same on Python 2 and 3)
    cph.print_summary()
    print("__________Metrics CI__________")
    print("CI of train: %.4f" % ci_train)
    if data_test is not None:
        print("CI of test : %.4f" % ci_test)
    print("__________Metrics AUC__________")
    print("AUC of train: %.4f" % roc_train['AUC'])
    if data_test is not None:
        print("AUC of test : %.4f" % roc_test['AUC'])

    if not show_extra:
        return cph
    # Print Coefficients
    print("__________Summary of Coefficients in CPH__________")
    cols = ['coef', 'p', 'lower 0.95', 'upper 0.95']
    print("%s :" % cols[0])
    for i in cph.summary.index:
        print("%.4f" % (cph.summary.loc[i, cols[0]]))
    print("__________")
    print("%s :" % cols[1])
    for i in cph.summary.index:
        print("%.4f" % (cph.summary.loc[i, cols[1]]))
    print("__________")
    print("95% CI :")
    for i in cph.summary.index:
        print("[%.4f, %.4f]" % (cph.summary.loc[i, cols[2]],
                                cph.summary.loc[i, cols[3]]))
    return cph
# Two hidden ReLU layers of 32 units followed by a single linear output.
model.add(Dense(32, input_shape=(7,), init='glorot_uniform')) # shape= length, dimension
model.add(Activation('relu'))
model.add(Dense(32, init='glorot_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation="linear", init='glorot_uniform', W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
#

# Only `sgd` is passed to compile(); `rmsprop` is defined but not used below.
sgd = SGD(lr=1e-5, decay=0.01, momentum=0.9, nesterov=True)
rmsprop=RMSprop(lr=1e-5, rho=0.9, epsilon=1e-8)
model.compile(loss=negative_log_likelihood(E_train), optimizer=sgd)

print('Training...')
model.fit(X_train, Y_train, batch_size=324, nb_epoch=1000, shuffle=False)  # Shuffle False --> Important!!

# The network output is exponentiated and negated before being scored with
# concordance_index against the observed times and event indicators.
hr_pred=model.predict(X_train)
hr_pred=np.exp(hr_pred)
ci=concordance_index(Y_train,-hr_pred,E_train)

hr_pred2=model.predict(X_val)
hr_pred2=np.exp(hr_pred2)
ci2=concordance_index(Y_val,-hr_pred2,E_val)
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print('Concordance Index for training dataset: %s' % (ci,))
print('Concordance Index for test dataset: %s' % (ci2,))

#Cox Fitting
cf = CoxPHFitter()
cf.fit(rossi_dataset, 'week', event_col='arrest')

cf.print_summary()  # access the results using cf.summary
Exemplo n.º 31
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    import time

    import numpy as np
    import pandas as pd
    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    # Time a batch-mode Cox fit on the Rossi dataset stacked 16 times.
    df = pd.concat([load_rossi()] * 16)
    # df = df.reset_index()
    # df['week'] = np.random.exponential(1, size=df.shape[0])
    cp = CoxPHFitter()
    start_time = time.time()
    cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
    cp.print_summary()
Exemplo n.º 32
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    import time

    import numpy as np
    import pandas as pd
    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi, load_regression_dataset

    # Time an L1-penalised Cox fit with a spline baseline on the Rossi
    # dataset, then print its summary and two scores on the training data.
    reps = 1
    df = pd.concat([load_rossi()] * reps)
    fitter_options = {
        "penalizer": 0.1,
        "l1_ratio": 1.0,
        "baseline_estimation_method": "spline",
    }
    cp_breslow = CoxPHFitter(**fitter_options)
    start_time = time.time()
    cp_breslow.fit(
        df, duration_col="week", event_col="arrest", show_progress=True
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    cp_breslow.print_summary(2)
    # First with the fitter's default scoring method, then concordance index.
    for score_kwargs in ({}, {"scoring_method": "concordance_index"}):
        print(cp_breslow.score(df, **score_kwargs))
# Columns handed to to_one_hot for each data split.
# NOTE(review): to_one_hot is defined elsewhere; presumably it replaces each
# listed column with indicator columns -- confirm against the helper.
to_encode = ['edema', 'stage']

one_hot_train = to_one_hot(df_train, to_encode)
one_hot_val = to_one_hot(df_val, to_encode)
one_hot_test = to_one_hot(df_test, to_encode)

# Show the resulting column names and how many there are.
print(one_hot_val.columns.tolist())
print(f"There are {len(one_hot_val.columns)} columns")

print(one_hot_train.shape)
# The head() result is not stored or printed; it is only displayed when run
# interactively.
one_hot_train.head()

# Fit a Cox model on the encoded training data with 'time' as the duration
# column and 'status' as the event column.
cph = CoxPHFitter()
cph.fit(one_hot_train, duration_col='time', event_col='status', step_size=0.1)

cph.print_summary()

# Plot the fitted model's curves with the 'edema_1.0' covariate set to 0 and 1.
cph.plot_covariate_groups('edema_1.0', values=[0, 1])


def hazard_ratio(case_1, case_2, cox_params):
    """Return exp(cox_params . (case_1 - case_2)).

    This is the hazard of ``case_1`` relative to ``case_2`` given the
    coefficient vector ``cox_params``.
    """
    covariate_diff = case_1 - case_2
    log_hazard_ratio = np.dot(cox_params, covariate_diff)
    return np.exp(log_hazard_ratio)


# Take the training row at position i as the first case, with the 'time' and
# 'status' entries removed so that only the remaining columns are kept.
i = 1
case_1 = one_hot_train.iloc[i, :].drop(['time', 'status'])

# Position selected for the second case (its use is not shown in this snippet).
j = 5
Exemplo n.º 34
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi, load_regression_dataset

    # Fit a Cox model on the Rossi dataset (repeated `reps` times), time the
    # fit, and report the overall and follow-up hazard ratios.
    reps = 1
    df = load_rossi()
    df = pd.concat([df] * reps)
    cph = CoxPHFitter()
    start_time = time.time()
    cph.fit(df, duration_col="week", event_col="arrest", show_progress=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    cph.print_summary(2)

    # Compute the follow-up hazard ratios once and reuse the result for both
    # the printout and the plot, rather than running the computation twice.
    followup_times = [15, 20, 30, 40, 50, 52]
    followup_hazard_ratios = cph.compute_followup_hazard_ratios(
        df, followup_times)
    print(followup_hazard_ratios)
    print(cph.hazard_ratios_)
    followup_hazard_ratios.plot()