Exemplo n.º 1
0
def cox_analysis_old(in_df):
    """Fit a Cox proportional-hazards model on the transposed input.

    ``in_df`` is transposed before fitting, so the *transposed* frame must
    provide the ``LivingDays`` (duration) and ``Dead`` (event) columns and
    must not contain any missing values (checked with an ``assert``).

    Returns the last entry of the fitter's ``_compute_p_values()`` result.
    """
    samples = in_df.transpose()
    has_missing = samples.isnull().values.any()
    assert not has_missing

    fitter = CoxPHFitter()
    fitter.fit(samples, duration_col='LivingDays', event_col='Dead')
    return fitter._compute_p_values()[-1]
def forward_stepwise(data,
                     vars_list,
                     target_var,
                     time_var,
                     threshold_in=0.05,
                     verbose=True,
                     changed=True):
    """Forward stepwise covariate selection using Cox models.

    Starting from an empty set, each round fits one penalised Cox model per
    remaining candidate (already selected covariates + that candidate) and
    records the p-value of the last coefficient, which belongs to the
    candidate assuming the fitter reports covariates in column order. The
    candidate with the smallest p-value is added if that p-value is below
    ``threshold_in``; otherwise selection stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate, event and duration columns.
    vars_list : iterable of str
        Candidate column names; surrounding whitespace is stripped.
        ``target_var``, ``time_var`` and ``'const'`` are never candidates.
    target_var : str
        Event indicator column (passed as ``event_col``).
    time_var : str
        Duration column (passed as ``duration_col``).
    threshold_in : float
        A candidate is added only if its p-value is below this value.
    verbose : bool
        Print the remaining candidates, each addition, and fitter progress.
    changed : bool
        Initial loop flag; passing ``False`` skips selection entirely.

    Returns
    -------
    included : list of str
        Selected covariates in the order they were added.
    model : CoxPHFitter or None
        Model (``penalizer=2``) fitted on ``model_data``; ``None`` when no
        covariate was selected.
    model_data : pandas.DataFrame
        Rows of ``data`` restricted to the selected covariates plus the
        event and duration columns, with incomplete rows dropped.
    """
    outcome_cols = [target_var, time_var]
    # 'const' was removed from the selection after the fact in earlier
    # revisions, which could re-select it forever; exclude it up front.
    skipped = {target_var, time_var, 'const'}
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    candidates = [
        name for name in dict.fromkeys(var.strip() for var in vars_list)
        if name not in skipped
    ]

    included = []
    while changed:
        changed = False
        excluded = [name for name in candidates if name not in included]
        if verbose:
            print(excluded)
        if not excluded:
            break

        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            X = data[included + [new_column] + outcome_cols].dropna()
            model = CoxPHFitter(penalizer=15)
            model.fit(X,
                      duration_col=time_var,
                      event_col=target_var,
                      show_progress=verbose,
                      step_size=1)
            new_pval[new_column] = model._compute_p_values()[-1]

        best_pval = new_pval.min()
        # A NaN minimum compares False here and therefore ends the search.
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))

    model_data = data[included + outcome_cols].dropna()
    if not included:
        # Nothing selected: there is no covariate to fit a final model on.
        return included, None, model_data

    # Fit the final model on the selected covariates only (not on the last
    # candidate frame that happened to be evaluated in the loop).
    model = CoxPHFitter(penalizer=2)
    model.fit(model_data,
              duration_col=time_var,
              event_col=target_var,
              show_progress=verbose,
              step_size=1)
    return included, model, model_data
Exemplo n.º 3
0
def estCoxPHTE(df,
               treatment_col='treated',
               duration_col='dx',
               event_col='disease',
               covars=None):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list, optional
        List of other columns to include in Cox model as covariates.
        Defaults to no additional covariates.

    Returns
    -------
    pandas.Series with the entries
        TE : 1 - exp(treatment coefficient)
        UB : 1 - exp(upper bound of the treatment coefficient)
        LB : 1 - exp(lower bound of the treatment coefficient)
        pvalue : first entry of the fitter's p-values (the treatment
            column is the first covariate passed to the fit)
        logrank_pvalue : p-value of a log-rank test between the rows with
            treatment == 0 and treatment == 1
        model : the fitted CoxPHFitter
    """
    # A None default avoids sharing one list object between calls.
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()

    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars],
               duration_col=duration_col,
               event_col=event_col)

    # NOTE(review): the 'coef' / 'upper-bound' / 'lower-bound' labels look
    # like an older lifelines layout -- confirm against the installed version.
    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    # NOTE: 1 - exp(x) is decreasing in x, so the value labelled 'UB' is
    # numerically the smaller of the two limits.
    ci = 1 - np.exp(coxphf.confidence_intervals_[treatment_col].loc[[
        'upper-bound', 'lower-bound'
    ]])
    pvalue = coxphf._compute_p_values()[0]

    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1],
                           df[duration_col].loc[ind2],
                           event_observed_A=df[event_col].loc[ind1],
                           event_observed_B=df[event_col].loc[ind2])
    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series([
        te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value,
        coxphf
    ],
                     index=index)
Exemplo n.º 4
0
def cox_analysis_mult(in_df, res_position=None, return_coeff=False):
    """Fit a Cox model and report statistics for several covariates.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must contain the ``LivingDays`` (duration) and ``Dead`` (event)
        columns and no missing values (checked with an ``assert``).
    res_position : list of str, optional
        Row labels of the fitter summary to report. Defaults to
        ``['pc1', 'pc2']``.
    return_coeff : bool
        When true return the ``coef`` column of the summary, otherwise
        the ``p`` column.

    Returns
    -------
    The selected summary column restricted to ``res_position``.
    """
    # A None default avoids sharing one list object between calls.
    if res_position is None:
        res_position = ['pc1', 'pc2']

    assert (not in_df.isnull().values.any())
    cph = CoxPHFitter()
    cph.fit(in_df, duration_col='LivingDays', event_col='Dead')

    summary = cph.summary
    print(summary)

    column = 'coef' if return_coeff else 'p'
    return summary.loc[res_position, column]
Exemplo n.º 5
0
def cox_analysis(in_df, res_position='pathway', return_coeff=False):
    """Fit a Cox model and report one covariate's p-value or coefficient.

    The first fit uses the fitter's defaults. If it raises ``ValueError``
    the fit is retried with explicit starting coefficients, and retried
    once more with a different start if that fails as well; errors from
    the last attempt propagate to the caller.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must contain the ``LivingDays`` (duration) and ``Dead`` (event)
        columns and no missing values (checked with an ``assert``).
    res_position : str
        Row label of the fitter summary to report (default ``'pathway'``).
    return_coeff : bool
        When true return the ``coef`` entry, otherwise the ``p`` entry.

    Returns
    -------
    The ``coef`` or ``p`` summary value for ``res_position``.
    """
    assert (not in_df.isnull().values.any())
    cph = CoxPHFitter()

    def refit(second_beta):
        # The start vector has two rows, so these retries presumably
        # assume exactly two covariates -- TODO confirm with callers.
        cph.fit(in_df,
                duration_col='LivingDays',
                event_col='Dead',
                show_progress=True,
                initial_beta=np.array([[0], [second_beta]]))

    try:
        cph.fit(in_df, duration_col='LivingDays', event_col='Dead')
    except ValueError:
        print('Changing starting betas')
        try:
            refit(0.1)
        except ValueError:
            print('Changing starting betas again')
            refit(0.3)
    except np.linalg.LinAlgError:
        # NOTE(review): if LinAlgError subclasses ValueError in the NumPy
        # in use, the handler above catches it first -- confirm.
        print('Sigular matrix, changing betas')
        refit(0.1)

    summary = cph.summary
    column = 'coef' if return_coeff else 'p'
    # Honour res_position instead of a hard-coded row label.
    return summary.loc[res_position, column]
Exemplo n.º 6
0
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=None):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list, optional
        List of other columns to include in Cox model as covariates.
        Defaults to no additional covariates.

    Returns
    -------
    pandas.Series with the entries
        TE : 1 - exp(treatment coefficient)
        UB : 1 - exp(upper bound of the treatment coefficient)
        LB : 1 - exp(lower bound of the treatment coefficient)
        pvalue : first entry of the fitter's p-values (the treatment
            column is the first covariate passed to the fit)
        logrank_pvalue : p-value of a log-rank test between the rows with
            treatment == 0 and treatment == 1
        model : the fitted CoxPHFitter
    """
    # A None default avoids sharing one list object between calls.
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()

    model_cols = [treatment_col, duration_col, event_col] + covars
    coxphf.fit(df[model_cols], duration_col=duration_col, event_col=event_col)

    # NOTE(review): the 'coef' / 'upper-bound' / 'lower-bound' labels look
    # like an older lifelines layout -- confirm against the installed version.
    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    # NOTE: 1 - exp(x) is decreasing in x, so the value labelled 'UB' is
    # numerically the smaller of the two limits.
    ci = 1 - np.exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound', 'lower-bound']])
    pvalue = coxphf._compute_p_values()[0]

    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(
        df[duration_col].loc[ind1],
        df[duration_col].loc[ind2],
        event_observed_A=df[event_col].loc[ind1],
        event_observed_B=df[event_col].loc[ind2],
    )
    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    values = [te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value, coxphf]
    return pd.Series(values, index=index)
Exemplo n.º 7
0
    def create_data_df(self):
        """Build the output table with Cox statistics and reduced values.

        One output row is created per ``(names, index)`` entry found in
        ``list_index`` of every config in ``self.v_stack.configs``.

        For each column named in a config's ``args`` a Cox model is fitted
        on the one-hot encoding of that column (first level dropped) plus
        the ``_observed`` / ``_time_to_observation`` columns of
        ``self.df``. Coefficients and their 95% bounds are exponentiated
        into ``hr`` / ``lcl`` / ``ucl``; the dropped level is added as a
        row whose statistics are all 1.

        The ``self.header`` columns are then filled from the row names,
        and every non-"space" config of ``self.h_stack.configs``
        contributes one value per named column via its reducer (looked up
        by name on ``Reducers()``). "space" configs yield an empty-string
        column. A ``KeyError`` while reducing is reported as ``"ERROR"``
        in the affected cell.

        Returns
        -------
        pandas.DataFrame
            The populated output table.
        """
        row_names_index = [(names, index) for v_conf in self.v_stack.configs
                           for (names, index) in v_conf.list_index.items()]
        ratio_cols = ["hr", "lcl", "ucl"]
        output_df = pd.DataFrame(columns=self.header + ratio_cols +
                                 ["p_value"],
                                 data=0,
                                 index=range(len(row_names_index)))

        row_pos = -1
        for v_conf in self.v_stack.configs:
            name_df_dict = {}
            for col_name in set(v_conf.args):
                one_hot = pd.get_dummies(self.df[col_name], drop_first=True)

                # The level removed by drop_first is the single non-null
                # value that did not become a dummy column.
                non_dropped = list(one_hot.columns)
                dropped = [
                    x for x in self.df[col_name].unique()
                    if x not in non_dropped and not pd.isnull(x)
                ]
                assert len(dropped) == 1
                dropped = dropped[0]

                one_hot["_observed"] = self.df["_observed"]
                one_hot["_time_to_observation"] = self.df[
                    "_time_to_observation"]
                cph = CoxPHFitter()
                cph.fit(
                    one_hot,
                    duration_col="_time_to_observation",
                    event_col="_observed",
                    show_progress=False,
                )

                res = cph.confidence_intervals_
                res["hr"] = cph.params_
                res["lcl"] = res["95% lower-bound"]
                res["ucl"] = res["95% upper-bound"]
                res["p_value"] = cph._compute_p_values()
                res = res.drop(columns=["95% lower-bound", "95% upper-bound"])
                # Exponentiate only the coefficient-scale columns; the
                # p-value is already on its final scale.
                res[ratio_cols] = np.exp(res[ratio_cols])

                # Add the dropped level as the first row, all values 1.
                old_index = res.index
                res.loc[dropped, :] = 1
                res = res.reindex(index=[dropped] + list(old_index))
                name_df_dict[col_name] = res

            for (names, index) in v_conf.list_index.items():
                row_pos += 1
                # names[0] selects the fitted column, names[1] the level
                # within it -- presumably (column, level, ...); TODO confirm.
                res_df = name_df_dict[names[0]]
                line = list(names) + list(res_df.loc[names[1]])
                output_df.loc[row_pos, :] = line

        for header in self.header:
            output_df[header] = output_df[header].astype(str)

        for i, (row_names, row_index) in enumerate(row_names_index):
            # Fill the header columns of this row from its names.
            for header, name in zip(self.header, row_names):
                output_df.at[i, header] = name
            for h_conf in self.h_stack.configs:
                if h_conf.kind == "space":
                    name = list(h_conf.name_index.keys())[0]
                    output_df[name] = output_df[name].astype(str)
                    output_df[name] = ""
                else:
                    for col_name, col_index in h_conf.name_index.items():
                        # Assumes both are boolean masks or compatible
                        # selectors over self.df rows -- TODO confirm.
                        idx = row_index & col_index
                        new_df = self.df.iloc[idx]
                        series_reducer = h_conf.reducer
                        reducer = getattr(Reducers(), series_reducer)

                        try:
                            out_val = reducer(new_df[h_conf.name])
                            output_df.at[i, col_name] = out_val
                        except KeyError:
                            output_df[col_name] = output_df[col_name].astype(
                                object)
                            print(h_conf.name)
                            output_df.at[i, col_name] = "ERROR"

        return output_df