Example #1
import pandas as pd
from lifelines import CoxPHFitter


def do_baseline(foldnum, train, valid, exp_code, model_str):
    # Fit a Cox proportional-hazards baseline on the training fold.
    cph = CoxPHFitter()
    df = pd.DataFrame(train.x)
    print(df.shape)
    df['duration'] = train.y
    # train.c encodes censoring: 0 means the event was observed.
    df['event'] = [1 if v == 0 else 0 for v in train.c]

    # Mean-impute missing covariates before fitting.
    df = df.fillna(df.mean())
    cph.fit(df, 'duration', event_col="event")

    cph.print_summary()

    # Score the validation fold (imputed with its own column means).
    valid_df = pd.DataFrame(valid.x)
    valid_df = valid_df.fillna(valid_df.mean())
    print(cph.predict_log_partial_hazard(valid_df))
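
A quick way to exercise do_baseline end to end; the Fold container and the synthetic data below are stand-ins for whatever fold objects the surrounding project passes in, not part of the original snippet:

from collections import namedtuple
import numpy as np

# Hypothetical stand-in for the project's fold objects: x = covariates,
# y = durations, c = censoring codes (0 = event observed).
Fold = namedtuple('Fold', ['x', 'y', 'c'])

rng = np.random.default_rng(0)
train = Fold(x=rng.normal(size=(200, 5)),
             y=rng.exponential(10.0, size=200),
             c=rng.integers(0, 2, size=200))
valid = Fold(x=rng.normal(size=(50, 5)),
             y=rng.exponential(10.0, size=50),
             c=rng.integers(0, 2, size=50))

do_baseline(0, train, valid, exp_code='demo', model_str='coxph')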
Example #2
    # Bootstrap-resample the event times and draw binary event indicators
    # (probability 1 - censor_rate of observing an event):
    ts     = ts[ np.random.choice(N, set_size, replace=True) ]
    es     = np.random.binomial(1, (1-censor_rate), set_size)

    # Create a data-frame for R:
    df = pd.DataFrame({
            'time'   : ts,
            'status' : es,
            'x1'     : np.random.uniform(-1.0, 1.0, set_size)})


    # Normalize:
    df['x1'] = (df['x1'] - df['x1'].mean()) / df['x1'].std()

    # Compute likelihood with R:
    r_out  = rfunc( df )
    preds, r_lik  = np.asarray(r_out[0]), np.negative(np.round(r_out[1][0],4))
    tf_lik_r = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )

    # Compute the log-likelihood with lifelines:
    cp = CoxPHFitter()
    # step_size=0.0 keeps the fit at initial_beta, so the likelihood is
    # evaluated at the same fixed beta as the other implementations
    # (initial_beta/step_size are arguments of older lifelines releases).
    cp.fit(df, 'time', 'status', initial_beta=np.ones((1,1))*0.543, step_size=0.0)
    preds = cp.predict_log_partial_hazard(df.drop(['time', 'status'], axis=1)).values[:, 0]
    tf_lik_lifelines = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )

    print( 'TensorFlow w/ R: ', tf_lik_r )
    print( 'R-survival : ', r_lik )
    print( 'TensorFlow w/ lifelines: ', tf_lik_lifelines )
    print( 'Lifelines : ', np.negative(cp._log_likelihood), end='\n\n')

# done.
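
For context, efron_estimator_tf (defined elsewhere in the project) is being checked against R's survival package and lifelines; the quantity all three should agree on is the Efron-approximated negative Cox partial log-likelihood. A minimal NumPy reference sketch, where the function name and sign convention are assumptions rather than the project's code:

import numpy as np

def efron_negative_log_likelihood(ts, es, preds):
    # Efron-approximated negative Cox partial log-likelihood.
    # ts: event/censoring times; es: event indicators (1 = event observed);
    # preds: linear predictors (log partial hazards).
    ts, es, preds = map(np.asarray, (ts, es, preds))
    ll = 0.0
    for t in np.unique(ts[es == 1]):
        tied = (ts == t) & (es == 1)   # events tied at time t
        risk = ts >= t                 # subjects still at risk at time t
        d = int(tied.sum())
        tied_hazard = np.exp(preds[tied]).sum()
        risk_hazard = np.exp(preds[risk]).sum()
        ll += preds[tied].sum()
        for k in range(d):
            ll -= np.log(risk_hazard - (k / d) * tied_hazard)
    return -ll

Passing the same ts, es and preds through this function should reproduce the tf_lik_r and tf_lik_lifelines values printed above.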
Example #3
        np.savetxt(time_elapsed_filename, np.array(elapsed).reshape(1, -1))

        # ---------------------------------------------------------------------
        # evaluation
        #

        # Predict survival curves at the unique test times; pycox's EvalSurv
        # expects a DataFrame with times as the index and one column per
        # test subject.
        sorted_y_test = np.unique(y_test[:, 0])
        surv_df = surv_model.predict_survival_function(X_test_std,
                                                       sorted_y_test)
        surv = surv_df.values.T
        ev = EvalSurv(surv_df, y_test[:, 0], y_test[:, 1], censor_surv='km')
        cindex_td = ev.concordance_td('antolini')
        print('c-index (td):', cindex_td)

        linear_predictors = surv_model.predict_log_partial_hazard(X_test_std)
        cindex = concordance_index(y_test[:, 0], -linear_predictors,
                                   y_test[:, 1])
        print('c-index:', cindex)

        time_grid = np.linspace(sorted_y_test[0], sorted_y_test[-1], 100)
        integrated_brier = ev.integrated_brier_score(time_grid)
        print('Integrated Brier score:', integrated_brier, flush=True)

        test_set_metrics = [cindex_td, integrated_brier]

        rng = np.random.RandomState(bootstrap_random_seed)

        bootstrap_dir = os.path.join(
            output_dir, 'bootstrap', '%s_%s_exp%d_test' %
            (survival_estimator_name, dataset, experiment_idx))
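
The fragment stops right after seeding the bootstrap RNG and naming the output directory. A sketch of how the evaluation plausibly continues, resampling the test set and recomputing the same two metrics; n_bootstrap_samples and the output filename are assumptions, not the original code:

        # Hypothetical continuation: percentile-bootstrap confidence
        # intervals for the two test-set metrics computed above.
        os.makedirs(bootstrap_dir, exist_ok=True)
        n_test = len(y_test)
        boot_metrics = []
        for _ in range(n_bootstrap_samples):
            idx = rng.choice(n_test, n_test, replace=True)
            ev_b = EvalSurv(surv_df.iloc[:, idx], y_test[idx, 0],
                            y_test[idx, 1], censor_surv='km')
            boot_metrics.append([ev_b.concordance_td('antolini'),
                                 ev_b.integrated_brier_score(time_grid)])
        boot_metrics = np.array(boot_metrics)
        # 2.5th/97.5th percentiles give 95% intervals for
        # [c-index (td), integrated Brier score].
        lower, upper = np.percentile(boot_metrics, [2.5, 97.5], axis=0)
        np.savetxt(os.path.join(bootstrap_dir, 'metrics_ci.txt'),
                   np.vstack([lower, upper]))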
Example #4
    def evaluate_model(self, model, ids_train, ids_valid, ids_test,
                       output_dir):

        self._reconstruction_plots(
            model, ids_train, ids_valid, ids_test,
            output_dir=output_dir)

        cohort_dfs_encoder = self._get_encoder_features_as_df(
            model, ids_train, ids_valid, ids_test, output_dir)

        train_df = cohort_dfs_encoder["training"]

        # Now run PCA and fit standard Cox models on the reduced features
        feat_cols = [c for c in train_df.columns if c.startswith("feat_")]
        # info_cols = [c for c in train_df.columns if c not in feat_cols]

        ret = dict()
        for pca_dim in self.pca_dims:
            pca = PCA(n_components=pca_dim)
            pca.fit(train_df[feat_cols].values)

            # apply the fitted PCA transform to every cohort for further
            # dimensionality reduction
            pca_dfs = dict()
            for name, df in cohort_dfs_encoder.items():
                output_path = os.path.join(
                    output_dir, "pca_{}comp_features_{}.csv".format(
                        pca_dim, name))
                pca_dfs[name] = apply_pca_transform(
                    pca, df, feat_cols, output_path)

            # combine the per-cohort frames into one large dataframe
            # containing PCA features for training, validation and test patients
            pca_df_concat = pd.concat(
                list(pca_dfs.values()), axis=0, sort=False)

            # evaluate the concordance index of Cox models that use
            # PCA reduced features of the autoencoder
            print("\nPCA with {} components\n".format(pca_dim))
            drop_cols = [self.data_handler.id_col, "slice_idx", "cohort"]  # only survival info is left
            cox_fitter = CoxPHFitter()
            try:
                cox_fitter.fit(
                    pca_dfs["training"].drop(drop_cols, axis=1),
                    duration_col=self.data_handler.time_col,
                    event_col=self.data_handler.event_col,
                    show_progress=False)
                cox_fitter.print_summary()
            except Exception as e:
                print("[W]: Fitting cox model failed! Reason: {}".format(e))
                continue

            # now create the prediction dataframe that we can then use
            # to compute concordance indices and p-values easily
            id_col = self.data_handler.id_col
            ids = np.unique(pca_df_concat[id_col].values)
            cohort = [None] * len(ids)
            slice_idx = [None] * len(ids)
            pred_risk_per_slice = [None] * len(ids)
            for i, pat in enumerate(ids):
                # assign the patient to a cohort, then gather predictions
                # for all of that patient's slices
                if pat in ids_train:
                    cohort[i] = "training"
                elif pat in ids_valid:
                    cohort[i] = "validation"
                elif ids_test is not None and pat in ids_test:
                    cohort[i] = "test"
                else:
                    msg = "Patient {} could not be assigned to a cohort!".format(
                        pat)
                    raise ValueError(msg)

                pat_df = pca_df_concat[pca_df_concat[id_col] == pat]
                haz = cox_fitter.predict_log_partial_hazard(
                    pat_df.drop(drop_cols, axis=1))

                hazard = haz.values.flatten()
                slice_idx[i] = pat_df.slice_idx.values.tolist()
                pred_risk_per_slice[i] = hazard

            pred_df = pd.DataFrame({
                id_col: ids,
                'cohort': cohort,
                'slice_idx': slice_idx,
                'pred_per_slice': pred_risk_per_slice,
                'pred_per_pat(mean)': [
                    np.mean(slice_preds) for slice_preds in pred_risk_per_slice],
                'pred_variance': [
                    np.var(slice_preds) for slice_preds in pred_risk_per_slice]
            })

            cis = compute_cis(
                pred_df, self.data_handler.outcome_dict,
                id_col=id_col)
            pvals = compute_pvals(
                pred_df, self.data_handler.outcome_dict,
                id_col=id_col)

            performance_df = pd.DataFrame({
                'pca_dim': [pca_dim],
                'pca_explained_variance': [
                    pca.explained_variance_ratio_.tolist()],

                'train_ci_slice': [cis['train_ci_slice']],
                'p_val_train_slice': [pvals['train_p_slice']],

                'train_ci_pat': [cis['train_ci_pat']],
                'p_val_train_pat': [pvals['train_p_pat']],

                'valid_ci_slice': [cis['valid_ci_slice']],
                'p_val_valid_slice': [pvals['valid_p_slice']],

                'valid_ci_pat': [cis['valid_ci_pat']],
                'p_val_valid_pat': [pvals['valid_p_pat']],

                'test_ci_slice': [cis['test_ci_slice']],
                'p_val_test_slice': [pvals['test_p_slice']],

                'test_ci_pat': [cis['test_ci_pat']],
                'p_val_test_pat': [pvals['test_p_pat']]})

            subexp_name = "predictions_pca_"+str(pca_dim)+"_comp"
            ret[subexp_name] = (pred_df, performance_df)

            subexp_path = os.path.join(output_dir, subexp_name)
            os.makedirs(subexp_path, exist_ok=True)
            # kaplan meier and risk_vs_survival plots!
            plot_km_and_scatter(
                pred_df, self.data_handler.outcome_dict,
                output_dir=subexp_path,
                id_col=id_col)

            # save the transformation matrix V and the training mean
            # such that pca_train = (enc_train-mean(enc_train)) * V.T
            # and we can later work with those models
            dump(pca, os.path.join(
                subexp_path, "PCA_" + str(pca_dim) + "comp.joblib"))
            cox_fitter.summary.to_csv(
                os.path.join(
                    subexp_path, "cox_{}_pca-comp_summary.csv".format(
                        pca_dim)),
                index=False)
            cox_fitter.params_.to_csv(
                os.path.join(
                    subexp_path, "cox_{}_pca-comp_coefs.csv".format(
                        pca_dim)),
                index=False)

        # Return a (pred_df, performance_df) tuple for each PCA
        # dimensionality that was run.
        return ret
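
apply_pca_transform is called above but not shown in the snippet. A minimal sketch of what it plausibly does, assuming it projects the feat_ columns through the fitted PCA, keeps all other columns (IDs, survival info, slice_idx, cohort) unchanged, and writes the combined frame to output_path; the pca_feat_ column naming is an assumption:

import pandas as pd

# Hypothetical sketch of the apply_pca_transform helper used above.
def apply_pca_transform(pca, df, feat_cols, output_path):
    transformed = pca.transform(df[feat_cols].values)
    out = df.drop(columns=feat_cols).reset_index(drop=True)
    for i in range(transformed.shape[1]):
        # name the projected components pca_feat_0, pca_feat_1, ...
        out['pca_feat_{}'.format(i)] = transformed[:, i]
    out.to_csv(output_path, index=False)
    return out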