Exemplo n.º 1
0
def comp_cph(endpoint, sex, df_events, df_info):
    """Build the case/control table for one endpoint and fit a Cox PH model.

    Individuals whose ``SEX`` equals ``sex`` are taken from ``df_info``. Those
    with a matching ``df_events`` row for ``endpoint`` are flagged as cases.
    The duration is ``ENDPOINT_AGE`` for cases and ``FU_END_AGE`` otherwise.

    Returns:
        ``(dfcox, cph)``: the two-column frame (``outcome``, ``duration``)
        given to the model, and the fitted ``CoxPHFitter``.

    Raises:
        NotEnoughCases: when fewer than ``MIN_CASES`` event rows exist for
            ``endpoint``. The count is taken before the sex filter.
    """
    logger.info(f"{endpoint} - {sex} - Computing cumulative incidence")
    logger.debug(f"{endpoint} - {sex} - Assigning cases and controls")

    # Event rows for this endpoint.
    is_endpoint = df_events.ENDPOINT == endpoint
    cases = df_events.loc[is_endpoint, ["FINNGENID", "ENDPOINT_AGE"]]
    if cases.shape[0] < MIN_CASES:
        raise NotEnoughCases(f"Not enough cases (< {MIN_CASES}).")

    # Everyone of the requested sex; this also covers sex-specific endpoints.
    is_sex = df_info.SEX == sex
    population = df_info.loc[is_sex, ["FINNGENID", "FU_END_AGE"]]
    population = population.merge(cases, how="left", on="FINNGENID")

    # After the left merge, ENDPOINT_AGE is NaN exactly for the controls.
    has_event = ~population.ENDPOINT_AGE.isna()
    population["outcome"] = has_event
    population["duration"] = population.FU_END_AGE
    population.loc[has_event, "duration"] = population.loc[has_event,
                                                           "ENDPOINT_AGE"]

    # Keep only the model columns so cph.fit() sees no stray covariates.
    dfcox = population.loc[:, ["outcome", "duration"]]

    logger.debug(f"{endpoint} - Fitting Cox model")
    cph = CoxPHFitter()
    cph.fit(dfcox, duration_col="duration", event_col="outcome")

    return dfcox, cph
Exemplo n.º 2
0
def main(data_df):
    """Binarise the thresholded columns of ``data_df`` and fit a Cox PH model.

    Every column named in the module-level ``th_dict`` is replaced IN PLACE by
    a 0/1 indicator of ``value >= th_dict[column]``. Missing values are filled
    with 0 first, except in columns whose name contains ``"HU"``. Those keep
    their NaNs, which then map to 0 because ``NaN >= threshold`` is False.

    A combined ``V-HU`` score (0, 1 or 2) is the sum of the binarised
    ``HU_of_consolidation`` and ``Volume_of_total_pneumonia_infection``
    columns. The model is fitted with ``Duration`` as the duration column and
    ``Death`` as the event column, and its summary is printed.

    Args:
        data_df: pandas DataFrame holding all columns referenced below; it is
            modified in place.
    """
    for key, threshold in th_dict.items():
        # Use a membership test: ``key.find("HU") > 0`` missed names that
        # START with "HU" (find() returns 0 there), so those columns were
        # zero-filled despite the exclusion.
        if "HU" not in key:
            data_df[key] = data_df[key].fillna(0)
        data_df[key] = data_df[key].map(
            lambda value, th=threshold: 1 if value >= th else 0)

    # Combined score: 0, 1 or 2.
    v_hu = (data_df['HU_of_consolidation'] +
            data_df['Volume_of_total_pneumonia_infection']).rename("V-HU")

    # Columns entering the model. "White_blood_cell_count",
    # "Neutrophil_count" and "D-dimer" were deliberately left out upstream.
    model_columns = [
        "Duration",
        "Death",
        "Age",
        "Blood_Oxygen",
        "C-Reactive_protein",
        "Lymphocyte_count",
        "Cerebrovascular_Disease",
        "Sex",
        "Lactic_dehydrogenase",
    ]
    combinations_df = pd.concat(
        [data_df[name] for name in model_columns] + [v_hu], axis=1)

    cph = CoxPHFitter()
    cph.fit(combinations_df, "Duration", event_col="Death", step_size=0.01)

    cph.print_summary()
Exemplo n.º 3
0
def f(train,threshold,test):
    """Select genes by ``h`` score, reduce them to one PCA component and fit
    a Cox model on the test set using that component.

    Genes whose score (first column of ``h(train)``) exceeds 1 are kept. The
    scaler and the PCA are fitted on ``train`` and applied to ``test``. The
    test-set component is stored in a column ``w`` and used as the only
    covariate. ``threshold`` is accepted but not used in this function.

    Returns whatever ``cph.print_summary()`` returns.
    """
    # NOTE(review): the index length 21148 is hard-coded; presumably it is
    # the number of genes returned by h() -- confirm.
    scores = h(train)
    score_frame = pd.DataFrame(scores, index=np.array(range(1,21149)))
    passing = score_frame.iloc[:,0] > 1
    gene_ids = score_frame.index[passing].tolist()
    candidate_genes = ['V{0}'.format(gene_id) for gene_id in gene_ids]

    # Standardise with statistics learned on the training set only.
    scaler = preprocessing.StandardScaler()
    train_scaled = scaler.fit_transform(train.loc[:,candidate_genes])
    test_scaled = scaler.transform(test.loc[:,candidate_genes])

    # Single principal component, fitted on train and applied to test.
    pca = sklearnPCA(n_components=1)
    train_component = pca.fit_transform(train_scaled)  # not used further
    test_component = pca.transform(test_scaled)
    explained_variance = pca.explained_variance_  # not used further

    # assign() returns a new frame, so the caller's ``test`` stays untouched.
    test = test.assign(w=pd.Series(np.ones(len(test.patient_id))))
    test['w'] = test_component
    survival_frame = test[['event_free_survival_time_days','death','w']]

    # Cox proportional hazards regression on the PCA score.
    cph = CoxPHFitter()
    cph.fit(survival_frame,'event_free_survival_time_days',event_col='death')

    return cph.print_summary()
Exemplo n.º 4
0
def coxcalc(df, x, survivaltime, status):
    """Fit a single-covariate Cox PH model and return its summary table.

    Args:
        df: DataFrame containing the three columns named below.
        x: name of the covariate column; converted with ``pd.to_numeric``.
        survivaltime: name of the duration column.
        status: name of the event-indicator column.

    Returns:
        ``cph.summary`` of the fitted ``CoxPHFitter``.

    Rows with a missing value in any of the three columns are dropped before
    fitting. ``df`` itself is not modified.
    """
    # Work on an explicit copy: assigning into a column selection of ``df``
    # raises SettingWithCopyWarning and leaves it unclear which frame is
    # written to.
    model_df = df[[status, survivaltime, x]].copy()
    model_df[x] = pd.to_numeric(model_df[x])
    model_df = model_df.dropna()

    cph = CoxPHFitter()
    cph.fit(model_df, duration_col=survivaltime, event_col=status, show_progress=False)
    return cph.summary
Exemplo n.º 5
0
def test_proportional_hazard_test_with_weights():
    """
    Check the weighted proportional-hazards test (rank transform) against R.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    columns = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
    }
    df = pd.DataFrame(columns)
    df["E"] = True  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E", weights_col="w")

    transforms = ["km", "rank", "log", "identity"]
    results = stats.proportional_hazard_test(fitter,
                                             df,
                                             time_transform=transforms)
    results.print_summary(5)

    rank_statistic = results.summary.loc["var1", "rank"]["test_statistic"]
    npt.assert_allclose(rank_statistic, 0.108, rtol=1e-2)
Exemplo n.º 6
0
    def mpss_ph_lifelines(self):
        """
        Rank features with a penalised Cox proportional hazards fit (lifelines).

        ``self.x_train`` supplies the feature matrix and ``self.scores`` is
        used as the duration column; every row is marked as an observed event.

        :return: DataFrame with columns ``covariate``, ``coef``, ``p``,
            ``feature`` (a copy of ``covariate``) and ``coef_abs``, sorted by
            ``coef_abs`` in descending order
        """
        x_train = pd.DataFrame(self.x_train)

        # Drop feature columns that are all zeros, otherwise the regression
        # cannot run. The explicit copy keeps the column assignments below
        # from writing into (or warning about) a slice of ``x_train``.
        nonzero_columns = (x_train != 0).any(axis=0)
        lifelines_dataset = x_train.loc[:, nonzero_columns].copy()

        # Reformat for lifelines: duration and event columns.
        lifelines_dataset['scores'] = self.scores
        lifelines_dataset['event'] = 1

        # Run proportional hazards regression.
        cph = CoxPHFitter(penalizer=5, alpha=1)
        cph.fit(lifelines_dataset, duration_col='scores', event_col='event')

        # Coefficients, their absolute values, and p-values. Copy again so
        # the new columns are added to an independent frame.
        importance = cph.summary.reset_index()[['covariate', 'coef',
                                                'p']].copy()
        importance['feature'] = importance['covariate']
        importance['coef_abs'] = importance['coef'].abs()

        # Largest absolute coefficient first.
        importance = importance.sort_values(
            'coef_abs', ascending=False).reset_index(drop=True)
        return importance
Exemplo n.º 7
0
    def _compute_likelihood_ratio_test(self):
        """
        Likelihood ratio test of the fitted model against the trivial model
        that has no covariates.

        The null model is fitted with ``CoxPHFitter`` on the last row of each
        first-level index group (``event``, ``stop`` and ``__weights``).

        Returns ``(test_stat, degrees_freedom, log_p)`` where ``log_p`` is the
        natural logarithm of the p-value from ``chisq_test``.
        """
        # One row per group: its final event/stop values and its final weight.
        last_rows = self.start_stop_and_events.groupby(level=0).last()
        null_frame = last_rows[["event", "stop"]]
        last_weights = self.weights.groupby(level=0).last()[["__weights"]]
        null_frame = null_frame.join(last_weights)

        null_model = CoxPHFitter()
        null_model.fit(
            null_frame,
            "stop",
            "event",
            weights_col="__weights",
            show_progress=False,
        )

        ll_null = null_model._log_likelihood
        ll_alt = self._log_likelihood
        test_stat = 2 * ll_alt - 2 * ll_null

        degrees_freedom = self.hazards_.shape[1]
        chisq_result = chisq_test(test_stat,
                                  degrees_freedom=degrees_freedom,
                                  alpha=0.0)
        p_value = chisq_result[1]
        return test_stat, degrees_freedom, np.log(p_value)
Exemplo n.º 8
0
def c_index_multiple_from_python(matrix,
                                 isdead,
                                 nbdays,
                                 matrix_test,
                                 isdead_test,
                                 nbdays_test,
                                 isfactor=False):
    """
    Fit a Cox PH model on the training data and return its concordance index
    on the test data.

    ``matrix`` / ``matrix_test`` hold the covariates, ``isdead*`` the event
    indicators and ``nbdays*`` the durations. ``isfactor`` is accepted but
    not used here. If fitting raises, the exception is printed and ``np.nan``
    is returned; warnings emitted during the fit are suppressed.
    """

    def _as_frame(features, dead, days):
        # Covariates plus the two survival columns expected by the fitter.
        table = pd.DataFrame(features)
        table["isdead"] = dead
        table["nbdays"] = days
        return table

    train_frame = _as_frame(matrix, isdead, nbdays)
    test_frame = _as_frame(matrix_test, isdead_test, nbdays_test)

    cph = CoxPHFitter()

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cph.fit(train_frame, "nbdays", "isdead")
    except Exception as err:
        print(err)
        return np.nan

    return cph.score(test_frame, scoring_method="concordance_index")
Exemplo n.º 9
0
    def DoFeatureSelectionCPH(self, x, c, s, xnames, fold, sel_f_num,
                              dev_index):
        """Rank features by the p-value of a univariate Cox model.

        Columns of ``x`` whose standard deviation is not above 0.15 are
        dropped. For each remaining column, a Cox PH model is fitted on that
        column together with ``c`` (stored as event column ``E``) and ``s``
        (stored as duration column ``S``), and the p-value is recorded.

        ``fold``, ``sel_f_num`` and ``dev_index`` are accepted but not used.

        Returns ``(sort_idx, f_name_sort, f_score_sort)``: the ascending
        p-value ordering (indices into the filtered columns), the feature
        names in that order, and the sorted p-values.
        """
        # The name says "variance" but the comparison below uses std().
        variance_th = 0.15
        xdf = pd.DataFrame(x, columns=xnames)
        keep = xdf.std() > variance_th  # boolean mask, one entry per column
        xdf = xdf.loc[:, keep]
        xnames = xnames[keep]
        x = xdf.values

        kept_names = xnames.copy().tolist()
        p_values = []
        for col in tqdm(range(0, x.shape[1])):
            stacked = np.column_stack((x[:, col:col + 1], c, s))
            frame_cols = kept_names[col:col + 1] + ['E', 'S']
            train_df = pd.DataFrame(stacked, columns=frame_cols)

            cph = CoxPHFitter()
            cph.fit(train_df,
                    duration_col='S',
                    event_col='E',
                    step_size=0.1,
                    show_progress=False)

            column_p = pd.DataFrame(cph.summary)['p'].values
            p_values.append(column_p[0])

        gene_p_value = np.asarray(p_values)
        sort_idx = np.argsort(gene_p_value)
        f_name_sort = np.asarray(xnames)[sort_idx]
        f_score_sort = gene_p_value[sort_idx]

        return sort_idx, f_name_sort, f_score_sort  #, auc
def test_proportional_hazard_test_with_weights_and_strata():
    """
    Check the weighted, stratified proportional-hazards test (identity
    transform) against R.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    """
    columns = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
        "s": [1, 1, 0, 0, 0],
    }
    df = pd.DataFrame(columns)
    df["E"] = True  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E", weights_col="w", strata="s", robust=True)

    ph_test = stats.proportional_hazard_test(fitter, df, time_transform="identity")

    observed = ph_test.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 0.0283, rtol=1e-3)
Exemplo n.º 11
0
 def test_coxph_plot_covariate_groups_with_multiple_variables(self, block):
     """Plot covariate groups while varying ``age`` and ``prio`` together."""
     rossi = load_rossi()
     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest")

     covariates = ["age", "prio"]
     value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] pair per curve
     fitter.plot_covariate_groups(covariates, value_rows)

     self.plt.title("test_coxph_plot_covariate_groups_with_multiple_variables")
     self.plt.show(block=block)
def test_proportional_hazard_test_with_kmf_with_some_censorship():
    """
    Check the proportional-hazards test with one censored subject against R.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df)
    cox.zph(c, transform='km')
    """
    columns = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "E": [1, 1, 1, 0, 1],  # the fourth subject is censored
    }
    df = pd.DataFrame(columns)

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E")

    # No time_transform is passed, so the library default is used.
    ph_test = stats.proportional_hazard_test(fitter, df)
    observed = ph_test.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 1.013802, rtol=1e-3)
def test_proportional_hazard_test_with_kmf_with_some_censorship_and_weights():
    """
    Check the weighted proportional-hazards test with one censored subject
    against R, and that fitting warns about non-integer weights.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='km')
    """
    # NOTE(review): the R reference uses weight 2 for the third subject, the
    # Python data below uses 5 -- confirm which one produced 0.916.
    columns = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "E": [1, 1, 1, 0, 1],
        "w": [1, 0.5, 5, 1, 1],
    }
    df = pd.DataFrame(columns)

    fitter = CoxPHFitter()
    # The fit, the PH test and the assertion all run inside the warning
    # context, as in the original.
    with pytest.warns(StatisticalWarning, match="weights are not integers"):
        fitter.fit(df, "T", "E", weights_col="w")
        ph_test = stats.proportional_hazard_test(fitter, df)
        observed = ph_test.summary.loc["var1"]["test_statistic"]
        npt.assert_allclose(observed, 0.916, rtol=1e-2)
Exemplo n.º 14
0
def getHazardRatio(df_col, os, event, genename, value, binary=False, age=None, return_sign=False):
    """Fit a Cox model on one gene column and return its hazard ratio.

    ``df_col`` becomes covariate ``Gene``, ``os`` the duration and ``event``
    the event flag; ``age`` is added as a second covariate when given.

    With ``binary=True`` a hazard ratio below 1 is inverted and ``value`` is
    set to 1.

    Returns:
        * ``return_sign=True``: ``(genename, value, hazard_ratio, df_col.sum())``
          with a scalar hazard ratio.
        * ``return_sign=False``: the hazard ratio as an array.
        * If fitting raises ``ValueError``: a message is printed and
          ``(genename, value, nan, df_col.sum())`` is returned regardless of
          ``return_sign``.
    """
    # NOTE(review): the failure path returns a 4-tuple even when
    # return_sign is False -- confirm that callers expect this.
    cph = CoxPHFitter()
    columns = {'Gene': df_col,
               'Duration': os,
               'Flag': event}
    os_data = pd.DataFrame(columns)
    if age is not None:
        os_data['Age'] = age

    try:
        cph.fit(os_data, 'Duration', 'Flag', show_progress=False)
    except ValueError:
        print('Not working, returning nans')
        return genename, value, np.nan, df_col.sum()

    hazard_ratio = np.exp(cph.hazards_['Gene'].values)

    if binary and hazard_ratio < 1:
        hazard_ratio = 1/hazard_ratio
        value = 1

    if not return_sign:
        return hazard_ratio
    return genename, value, hazard_ratio[0], df_col.sum()
def survival(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = None):
    """
    Fit a Cox PH model for one feature row and return its summary line.

    row: Series of feature values; ``row.name`` is the covariate name
    phenotype_df: phenotype table, transposed here before ``row`` is joined
        to it
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
        0 for no, 1 for yes
    other_cols: other variables to consider in the regression (default: none)

    Returns the row of ``cph.summary`` for the (sanitised) feature name.
    The caller's ``row`` and ``phenotype_df`` are not modified.
    """
    def _sanitize(name):
        # ' ', '.' and '-' conflict with patsy formula syntax.
        return name.replace(' ', '_').replace('.', '_').replace('-', '_')

    other_cols = [] if other_cols is None else list(other_cols)

    phenotype_df = phenotype_df.T
    phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # Sanitise every name used in the formula. The feature name is kept in
    # a local instead of being written back to ``row.name``, which mutated
    # the caller's Series.
    duration_col = _sanitize(duration_col)
    event_col = _sanitize(event_col)
    other_cols = [_sanitize(col) for col in other_cols]
    feature = _sanitize(row.name)
    phenotype_df.columns = [_sanitize(col) for col in phenotype_df.columns]

    # Duration and event are listed as terms so they end up in the design
    # matrix that is handed to the fitter.
    formula = ' + '.join([feature, duration_col, event_col] + other_cols)
    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)

    cph = CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    return cph.summary.loc[feature]
Exemplo n.º 16
0
 def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables(self, block):
     """Plot partial effects while varying ``age`` and ``prio`` together."""
     rossi = load_rossi()
     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest")

     covariates = ["age", "prio"]
     value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] pair per curve
     fitter.plot_partial_effects_on_outcome(covariates, value_rows)

     self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_multiple_variables")
     self.plt.show(block=block)
Exemplo n.º 17
0
 def test_coxph_plot_partial_effects_on_outcome_with_cumulative_hazard(self, block):
     """Plot partial effects of ``age`` on the cumulative hazard."""
     df = load_rossi()
     cp = CoxPHFitter()
     cp.fit(df, "week", "arrest")
     cp.plot_partial_effects_on_outcome("age", [10, 50, 80], y="cumulative_hazard")
     # The title names this test, like the sibling tests; it previously
     # carried the name of the plain (non cumulative-hazard) test.
     self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_cumulative_hazard")
     self.plt.show(block=block)
Exemplo n.º 18
0
 def test_coxph_plotting_with_subset_of_columns(self, block):
     """Plot the fitted model restricted to ``var1`` and ``var2``."""
     dataset = load_regression_dataset()
     fitter = CoxPHFitter()
     fitter.fit(dataset, "T", "E")

     shown_columns = ["var1", "var2"]
     fitter.plot(columns=shown_columns)

     self.plt.title("test_coxph_plotting_with_subset_of_columns")
     self.plt.show(block=block)
Exemplo n.º 19
0
 def test_coxph_plot_covariate_groups_with_single_strata(self, block):
     """Plot covariate groups for ``age`` from a model stratified on ``paro``."""
     df = load_rossi()
     cp = CoxPHFitter()
     cp.fit(df, "week", "arrest", strata="paro")
     cp.plot_covariate_groups("age", [10, 50, 80])
     # The title names this test, like the sibling tests (it used to say
     # "..._with_strata").
     self.plt.title("test_coxph_plot_covariate_groups_with_single_strata")
     self.plt.show(block=block)
Exemplo n.º 20
0
 def test_coxph_plotting(self, block):
     """Plot a model fitted on the regression dataset with default options."""
     dataset = load_regression_dataset()
     fitter = CoxPHFitter()
     fitter.fit(dataset, "T", "E")
     fitter.plot()

     plot_title = "test_coxph_plotting"
     self.plt.title(plot_title)
     self.plt.show(block=block)
Exemplo n.º 21
0
 def test_coxph_plotting_with_hazards_ratios(self, block):
     """Plot the fitted model with ``hazard_ratios=True``."""
     df = load_regression_dataset()
     cp = CoxPHFitter()
     cp.fit(df, "T", "E")
     cp.plot(hazard_ratios=True)
     # The title names this test; it previously duplicated the title of
     # test_coxph_plotting, making the two figures indistinguishable.
     self.plt.title("test_coxph_plotting_with_hazards_ratios")
     self.plt.show(block=block)
Exemplo n.º 22
0
 def test_coxph_plot_covariate_groups(self, block):
     """Plot covariate groups for three values of ``age``."""
     rossi = load_rossi()
     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest")

     ages = [10, 50, 80]
     fitter.plot_covariate_groups("age", ages)

     self.plt.title("test_coxph_plot_covariate_groups")
     self.plt.show(block=block)
Exemplo n.º 23
0
def main():
    """Fit a Cox PH model on the preprocessed arrays plus extra code
    indicator columns, and print its summary.

    Reads the ``.npz`` arrays, the pickled column list and the feather file
    from paths built out of ``Hyperparameters``. The first 10 rows with
    ``TYPE == 1`` and the first 10 with ``TYPE == 0`` each add one column,
    named after ``DESCRIPTION``, flagging rows of ``data['codes']`` that
    contain that ``INDEX_CODE``.
    """
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    arrays_path = '../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz'
    data = np.load(arrays_path)

    print('Use all data for model fitting...')
    x = data['x']
    time = data['time']
    event = data['event']

    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################

    print('Add additional columns...')
    codes_path = '../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather'
    df_index_code = feather.read_dataframe(codes_path)
    top_type1 = df_index_code[df_index_code['TYPE']==1].head(10)
    top_type0 = df_index_code[df_index_code['TYPE']==0].head(10)
    df_index_code = pd.concat([top_type1, top_type0], sort=False)

    for _, code_row in df_index_code.iterrows():
        description = code_row['DESCRIPTION']
        print(description)
        # True for rows where any entry of ``codes`` equals this index code.
        df[description] = (data['codes'] == code_row['INDEX_CODE']).max(axis=1)
        cols_list = cols_list + [description]

    ###################################################################

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
    cph.print_summary()
    print('done')
Exemplo n.º 24
0
    def _fit_cox(self):
        """Fit a Cox model comparing the two survival groups, once.

        Does nothing if ``self._cf`` is already set. Otherwise the time and
        event columns of ``survival0`` and ``survival1`` are stacked under
        the first group's column names, with an indicator column named
        ``self.survival1.label`` (0 for ``survival0`` rows, 1 for
        ``survival1`` rows). The fitted model is stored in ``self._cf``.
        """
        if self._cf is not None:
            return

        cox_df1 = pd.DataFrame(self.survival0.df,
                               columns=[self.time_col1, self.event_col1])
        cox_df1[self.survival1.label] = 0

        cox_df2 = pd.DataFrame(self.survival1.df,
                               columns=[self.time_col2, self.event_col2])
        # Align the second group's column names with the first group's.
        if self.time_col1 != self.time_col2:
            cox_df2 = cox_df2.rename(columns={self.time_col2: self.time_col1})
        if self.event_col1 != self.event_col2:
            cox_df2 = cox_df2.rename(
                columns={self.event_col2: self.event_col1})
        cox_df2[self.survival1.label] = 1

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent and also works on older versions.
        cox_df = pd.concat([cox_df1, cox_df2], ignore_index=True)

        cox_fitted = CoxPHFitter(normalize=False)
        cox_fitted.fit(cox_df,
                       self.time_col1,
                       event_col=self.event_col1,
                       include_likelihood=False)

        self._cf = cox_fitted
Exemplo n.º 25
0
 def test_coxph_plot_partial_effects_on_outcome_with_single_strata(self, block):
     """Plot partial effects of ``age`` from a model stratified on ``paro``."""
     df = load_rossi()
     cp = CoxPHFitter()
     cp.fit(df, "week", "arrest", strata="paro")
     cp.plot_partial_effects_on_outcome("age", [10, 50, 80])
     # The title names this test, like the sibling tests (it used to say
     # "..._with_strata").
     self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_single_strata")
     self.plt.show(block=block)
Exemplo n.º 26
0
def coxreg_single_run(xtr, ytr, penalty):
    """Fit one penalised Cox PH model and return the fitted ``CoxPHFitter``.

    ``ytr`` is placed before ``xtr`` column-wise; its two columns are named
    ``status`` (event) and ``time`` (duration). The columns of ``xtr`` are
    named ``X1`` .. ``Xp``. ``penalty`` is passed as ``penalizer``.
    """
    stacked = np.concatenate((ytr, xtr), axis=1)
    df_tr = pd.DataFrame(stacked)

    n_features = xtr.shape[1]
    feature_names = ['X' + str(k) for k in range(1, n_features + 1)]
    df_tr.columns = ['status', 'time'] + feature_names

    cph = CoxPHFitter(penalizer=penalty)
    cph.fit(df_tr, duration_col='time', event_col='status')
    return cph
Exemplo n.º 27
0
 def test_spline_coxph_plot_partial_effects_on_outcome_with_strata(self, block):
     """Plot partial effects of ``age`` from a spline-baseline model stratified on ``wexp``."""
     rossi = load_rossi()
     fitter = CoxPHFitter(baseline_estimation_method="spline", n_baseline_knots=2)
     fitter.fit(rossi, "week", "arrest", strata=["wexp"])

     ages = [10, 50, 80]
     fitter.plot_partial_effects_on_outcome("age", ages)

     self.plt.title("test_spline_coxph_plot_partial_effects_on_outcome_with_strata")
     self.plt.show(block=block)
Exemplo n.º 28
0
def fit_cox(subset,
            name,
            duration_col='days_survival',
            event_col='vital_status',
            *args,
            **kwargs):
    '''
    Use lifelines to fit a CoxPHFitter model.
    Return the summary plus the Bonferroni-corrected p-values.

    subset: DataFrame
    name: name of the analysis (not used inside this function)
    duration_col: column of subset with number of days sample survived
    event_col: column of subset with 0/1 whether the sample is alive or dead
    *args: to be passed to CoxPHFitter
    **kwargs: to be passed to CoxPHFitter

    Returns (summary, cph). The summary gains the columns "corrected_p" and
    "-log2(corrected_p)". If fitting or post-processing raises, the exception
    info is printed and (None, None) is returned.
    '''
    from lifelines import CoxPHFitter
    from statsmodels.stats.multitest import multipletests
    cph = CoxPHFitter(*args, **kwargs)
    try:
        cph.fit(subset, duration_col=duration_col, event_col=event_col)
        summary = cph.summary
        p_vals = multipletests(cph.summary["p"], method="bonferroni")[1]
        summary["corrected_p"] = p_vals
        summary["-log2(corrected_p)"] = -np.log2(p_vals)
        return summary, cph
    except Exception:
        # Best-effort: report and carry on. Not a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        print(*sys.exc_info())
        return None, None
Exemplo n.º 29
0
 def test_coxph_plot_partial_effects_on_outcome_with_nonnumeric_strata(self, block):
     """Plot partial effects of ``age`` from a model stratified on a string-valued column."""
     df = load_rossi()
     # Random "A"/"B" labels give a non-numeric strata column.
     df["strata"] = np.random.choice(["A", "B"], size=df.shape[0])
     cp = CoxPHFitter()
     cp.fit(df, "week", "arrest", strata="strata")
     cp.plot_partial_effects_on_outcome("age", [10, 50, 80])
     # The title names this test; it previously carried the name of the
     # single-strata test.
     self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_nonnumeric_strata")
     self.plt.show(block=block)
Exemplo n.º 30
0
 def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata(self, block):
     """Plot partial effects of ``age`` and ``prio`` from a model stratified on a string-valued column."""
     rossi = load_rossi()
     n_rows = rossi.shape[0]
     # Random "A"/"B" labels serve as the strata column.
     rossi["strata"] = np.random.choice(["A", "B"], size=n_rows)

     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest", strata="strata")

     covariates = ["age", "prio"]
     value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] pair per curve
     fitter.plot_partial_effects_on_outcome(covariates, value_rows)

     self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata")
     self.plt.show(block=block)
    def fit(self, X, y, **fit_params):
        """Fit a ``CoxPHFitter`` on ``X`` joined with the survival columns of ``y``.

        A copy of ``X`` receives ``y[self.duration_column]`` and, when
        ``self.event_col`` is set, ``y[self.event_col]``. The fitter is built
        from ``self.get_params()`` and stored in ``self.estimator``. Extra
        ``fit_params`` are forwarded to ``CoxPHFitter.fit``.

        Returns ``self``.
        """
        # Copy so the caller's X does not gain the survival columns.
        design = X.copy()
        design[self.duration_column] = y[self.duration_column]
        if self.event_col is not None:
            design[self.event_col] = y[self.event_col]

        est = CoxPHFitter(**self.get_params())
        est.fit(
            design,
            duration_col=self.duration_column,
            event_col=self.event_col,
            initial_beta=self.initial_beta,
            include_likelihood=self.include_likelihood,
            strata=self.strata,
            **fit_params
        )

        self.estimator = est
        return self
Exemplo n.º 32
0
def cox_regression(clean_df):
    """Fit a Cox PH model on ``clean_df`` and return rounded summary statistics.

    ``time`` is the duration column and ``event`` the event column. For each
    entry of the module-level ``stats_of_interest``, the matching summary
    column is converted to a dict and rounded with ``round_dic_eng`` for
    ``'p'`` and with ``round_dic`` for everything else.

    Returns a dict keyed by statistic name.
    """
    cf = CoxPHFitter()
    cf.fit(clean_df, 'time', event_col='event')
    summary_by_stat = cf.summary.to_dict()

    return {
        stat: (round_dic_eng if stat == 'p' else round_dic)(summary_by_stat[stat])
        for stat in stats_of_interest
    }
Exemplo n.º 33
0
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=[]):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.

    Returns
    -------
    pandas.Series with index
        TE : 1 - exp(coef) of the treatment column
        UB : 1 - exp(upper confidence bound of the coefficient)
        LB : 1 - exp(lower confidence bound of the coefficient)
        pvalue : first entry of the model's p-values
        logrank_pvalue : p-value of a log-rank test between the rows with
            treatment == 0 and treatment == 1
        model : the fitted CoxPHFitter"""
    # NOTE(review): 'UB' is derived from the coefficient's upper bound, which
    # gives the smaller efficacy value -- confirm the intended labelling.
    coxphf = CoxPHFitter()

    model_columns = [treatment_col, duration_col, event_col] + covars
    coxphf.fit(df[model_columns], duration_col=duration_col, event_col=event_col)

    log_hazard_ratio = coxphf.hazards_.loc['coef', treatment_col]
    te = 1 - np.exp(log_hazard_ratio)
    coef_bounds = coxphf.confidence_intervals_[treatment_col].loc[['upper-bound', 'lower-bound']]
    ci = 1 - np.exp(coef_bounds)
    pvalue = coxphf._compute_p_values()[0]

    # Log-rank comparison of the two treatment groups.
    untreated = df[treatment_col] == 0
    treated = df[treatment_col] == 1
    results = logrank_test(
        df[duration_col].loc[untreated],
        df[duration_col].loc[treated],
        event_observed_A=df[event_col].loc[untreated],
        event_observed_B=df[event_col].loc[treated],
    )

    labels = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    values = [te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value, coxphf]
    return pd.Series(values, index=labels)
# Kaplan-Meier curves, log-rank test and Cox regression for patients with and
# without neoadjuvant treatment. ``df``, ``survival_col`` and ``censor_col``
# are defined earlier in the script.
#
# NOTE(review): several keyword arguments below (``normalize``,
# ``include_likelihood``, the ``alpha`` values) look like they target an old
# lifelines release -- confirm against the installed version.

# Boolean mask: True for treated patients.
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

# ``DataFrame.ix`` no longer exists in pandas; ``.loc`` takes the same
# boolean-mask / column-label pair.
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col], event_observed=df.loc[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True,  ci_show=False)


kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col], event_observed=df.loc[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True,  ci_show=False )

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title ('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the treated and untreated groups.
results = logrank_test(df.loc[tx, survival_col], df.loc[~tx, survival_col], df.loc[tx, censor_col], df.loc[~tx, censor_col], alpha=.99 )
results.print_summary()

# Cox regression on age at diagnosis; rows with a missing age are dropped.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()

# 10-fold cross-validation scores of the Cox model.
scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
# print() calls replace the Python 2 print statements, which are syntax
# errors under Python 3.
print(scores)
print('Mean score', np.mean(scores))
print('Std', np.std(scores))
 
Exemplo n.º 35
0
# Collect the simulated arrays (duration, event indicator, age, college) into
# one data frame. ``duration``, ``not_censor``, ``age`` and ``college`` are
# defined earlier in the script.
data = pd.DataFrame({'duration': duration, 'event': not_censor, 'age': age, 'college': college})

# Plot observations with censoring
# plot_lifetimes(duration, event_observed = not_censor)

# Kaplan-Meier fit on the simulated data; plots the estimated survival function
from lifelines import KaplanMeierFitter
kmf =  KaplanMeierFitter()
kmf.fit(duration, event_observed = not_censor)
kmf.survival_function_.plot()

# Cox PH regression: every column of ``data`` other than 'duration' and
# 'event' is used as a covariate
from lifelines import CoxPHFitter
cf = CoxPHFitter()
cf.fit(data, 'duration', event_col = 'event')
cf.print_summary()

## Get Predictions from Model ##

# 24 year old college grad
#college_24 = pd.DataFrame({'age':[24], 'college':[1]})
#cf.predict_survival_function(college_24).plot()

# 65 year old high school grad
#hs_65 = pd.DataFrame({'age':[65], 'college':[0]})
#cf.predict_survival_function(hs_65).plot()

# Covariate rows for three profiles: a 24-year-old college grad, a
# 65-year-old HS grad, and an "Average" row (age 42, college 0.4)
mixed = pd.DataFrame({'age':[24, 65,42], 'college':[1,0,.4], 'index': ['24yr old College Grad','65yr old HS Grad','Average']})
mixed = mixed.set_index(['index']) # use the profile labels as row names
Exemplo n.º 36
0
def multivariate(df):
    """Fit a Cox PH model on ``df`` and print its summary.

    ``time`` is the duration column and ``status`` the event column. Fitting
    progress is shown. Nothing is returned.
    """
    from lifelines import CoxPHFitter

    fit_options = dict(duration_col='time', event_col='status',
                       show_progress=True)
    model = CoxPHFitter()
    model.fit(df, **fit_options)
    # The same results are available programmatically as model.summary.
    model.print_summary()
Exemplo n.º 37
0
from lifelines.datasets import load_regression_dataset
regression_dataset = load_regression_dataset()

regression_dataset.head()

from lifelines import AalenAdditiveFitter, CoxPHFitter

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariate columns only. ``Index - list`` is no longer a set difference in
# pandas; ``Index.difference`` is the explicit form.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# ``.ix`` no longer exists in pandas. ``.loc[10:12]`` selects the rows
# labelled 10 through 12 inclusive; their survival functions are plotted.
aaf.predict_survival_function(x.loc[10:12]).plot()

Exemplo n.º 38
0
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()
from lifelines import AalenAdditiveFitter, CoxPHFitter

# Cox proportional hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')

# Aalen's additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')

# Covariate columns only. ``Index - list`` is no longer a set difference in
# pandas; ``Index.difference`` is the explicit form.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# ``.ix`` no longer exists in pandas. ``.loc[10:12]`` selects the rows
# labelled 10 through 12 inclusive.
aaf.predict_survival_function(x.loc[10:12]).plot()
aaf.plot()
Exemplo n.º 39
0
"""

# print cancer['T'].unique()
# print cancer['E'].unique()
# cancer = cancer.dropna()


# the '-1' term
# refers to not adding an intercept column (a column of all 1s).
# It can be added to the Fitter class.

covMatrix = cancer.cov()

cf = CoxPHFitter()
cf.fit(covMatrix, "T", event_col="E")  # extra paramater for categorical , strata=catVar
cf.print_summary()

curve = cf.predict_survival_function(cancer)
curve.plot()
plt.show()
print "hazard coeff", cf.hazards_
print "baseline ", cf.baseline_hazard_

"""
scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3)
print scores
print np.mean(scores)
print np.std(scores)

"""
Exemplo n.º 40
0
# Script entry point: train a Cox PH model on the 'train' split and report
# metrics on the 'valid' / 'test' splits when they are present.
if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    args = parse_args()
    print("Arguments:",args)

    # Load Dataset
    print("Loading datasets: " + args.dataset)
    datasets = utils.load_datasets(args.dataset)

    # Train CPH model
    print("Training CPH Model")
    train_df = utils.format_dataset_to_df(datasets['train'], DURATION_COL, EVENT_COL)
    cf = CoxPHFitter()
    # NOTE(review): `results` is not read again in the visible part of the script.
    results = cf.fit(train_df, duration_col=DURATION_COL, event_col=EVENT_COL, 
        include_likelihood=True)
    cf.print_summary()
    print("Train Likelihood: " + str(cf._log_likelihood))

    # Validation metrics (no bootstrap argument is passed here)
    if 'valid' in datasets:
        metrics = evaluate_model(cf, datasets['valid'])
        print("Valid metrics: " + str(metrics))

    # Test metrics, evaluated with bootstrap=True
    if 'test' in datasets:
        metrics = evaluate_model(cf, datasets['test'], bootstrap=True)
        print("Test metrics: " + str(metrics))

    print("Saving Visualizations")
    # Only runs when a test split exists and a treatment index was supplied
    if 'test' in datasets and args.treatment_idx is not None:
        print("Calculating treatment recommendation survival curvs")
        # We use the test dataset because these experiments don't have a viz dataset
Exemplo n.º 41
0
def _plot_kmf_single(df,
                     condition_col,
                     survival_col,
                     censor_col,
                     threshold,
                     title,
                     xlabel,
                     ylabel,
                     ax,
                     with_condition_color,
                     no_condition_color,
                     with_condition_label,
                     no_condition_label,
                     color_map,
                     label_map,
                     color_palette,
                     ci_show,
                     print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by `plot_kmf`.

    Observations are split into groups in one of three ways:

    - `threshold` is not None: two groups, `condition_col > threshold` versus
      the rest. The string "median" uses the column median as the threshold.
    - `condition_col` has object or category dtype: one group per distinct
      value.
    - `condition_col` has bool dtype: two groups (True / False).

    Parameters
    ----------
    df : DataFrame holding `condition_col`, `survival_col` and `censor_col`.
    condition_col : name of the column that defines the groups.
    survival_col : name of the column passed to the fitters as the duration.
    censor_col : name of the column that is cast to bool and passed to the
        fitters as the event-observed indicator.
    threshold : None, a value to compare `condition_col` against, or "median".
    title, xlabel, ylabel : plot texts; each is applied only when truthy.
    ax : axes to draw on; when falsy, the axes returned by the first
        `kmf.plot` call are used for the remaining groups.
    with_condition_color, no_condition_color : colors of the True / False
        groups (converted to hex when they are color-like).
    with_condition_label, no_condition_label : labels of the True / False
        groups; defaults are derived from `condition_col` when falsy.
    color_map, label_map : optional mappings from group value to color /
        label; built from the arguments above when falsy.
    color_palette : passed to `sb.color_palette` to color categorical groups.
    ci_show : forwarded to `kmf.plot`.
    print_as_title : when no `title` is given, use the per-group counts as
        the title instead of printing them.

    Returns
    -------
    The analytical result for the plotted groups: the `logrank_test` result
    for exactly two groups, a `NullSurvivalResults` for fewer than two, and
    the fitted `CoxPHFitter` otherwise. The attributes `survival_data_series`,
    `event_data_series` and `desc` are attached to the returned object.

    Raises
    ------
    ValueError
        If `threshold` is None and `condition_col` is not of object, category
        or bool dtype.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    # `binary_defaults` holds the (no-condition, with-condition) default labels
    # for the two-group cases and stays None for categorical data.
    binary_defaults = None
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        if is_median:
            # only the "with condition" label carries the median marker
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        binary_defaults = (default_label_no_condition,
                           default_label_with_condition)
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        condition = df[condition_col].astype("category")
        if not label_map:
            label_map = {condition_value: '{} = {}'.format(condition_col,
                                                           condition_value)
                         for condition_value in condition.unique()}
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        condition = df[condition_col]
        binary_defaults = ("¬ {}".format(condition_col),
                           "= {}".format(condition_col))
    else:
        raise ValueError('Don\'t know how to plot data of type {}'.format(
            df[condition_col].dtype))

    if binary_defaults is not None:
        default_label_no_condition, default_label_with_condition = binary_defaults
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = list()
    grp_survival_data = dict()
    grp_event_data = dict()
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = (grp_df[censor_col].astype(bool))
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        desc_str = "# {}: {}".format(grp_label, len(grp_survival))
        grp_desc.append(desc_str)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        plot_kwargs = dict(show_censors=True, ci_show=ci_show, color=grp_color)
        if ax:
            # reuse the axes (given by the caller or created for an earlier
            # group) so that all curves end up on the same plot
            plot_kwargs['ax'] = ax
        ax = kmf.plot(**plot_kwargs)

    # Only groups that were actually plotted take part in the analysis below.
    # `condition.unique()` can contain missing values, for which groupby
    # yields no group, so those names must not be counted or looked up.
    grp_names = [grp_name for grp_name in condition.unique()
                 if grp_name in grp_survival_data]

    ## format the plot
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    y_tick_vals = ax.get_yticks()
    # show the survival fraction as a percentage
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        results = logrank_test(grp_survival_data[grp_names[0]],
                               grp_survival_data[grp_names[1]],
                               event_observed_A=grp_event_data[grp_names[0]],
                               event_observed_B=grp_event_data[grp_names[1]])
    elif len(grp_names) <= 1:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col,
                                         censor_col]),
                               df, return_type='dataframe')
        # patsy adds an intercept column that must not enter the Cox model
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()
    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results