Example #1
def test_concordance_index_returns_same_after_shifting():
    T = np.array([1, 2, 3, 4, 5, 6])
    T_ = np.array([2, 1, 4, 6, 5, 3])
    assert (
        utils.concordance_index(T, T_)
        == utils.concordance_index(T - 5, T_ - 5)
        == utils.concordance_index(T, T_ - 5)
        == utils.concordance_index(T - 5, T_)
    )
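The test above relies on the fact that Harrell's c-index depends only on the ordering of times and predictions, so adding a constant changes nothing. A minimal plain-NumPy sketch of the pairwise definition for fully observed data (not the lifelines implementation) makes the invariance explicit:

import numpy as np

def naive_concordance(T, P):
    # Fraction of comparable pairs ordered the same way by T and P, no censoring;
    # ties in the prediction count as 0.5, mirroring the usual Harrell definition.
    concordant, pairs = 0.0, 0
    n = len(T)
    for i in range(n):
        for j in range(i + 1, n):
            if T[i] == T[j]:
                continue  # tied times are not comparable
            pairs += 1
            if P[i] == P[j]:
                concordant += 0.5
            elif (T[i] < T[j]) == (P[i] < P[j]):
                concordant += 1.0
    return concordant / pairs

T = np.array([1, 2, 3, 4, 5, 6])
T_ = np.array([2, 1, 4, 6, 5, 3])
assert naive_concordance(T, T_) == naive_concordance(T - 5, T_ - 5)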
Example #2
    def test_data_normalization(self, data_pred2):
        # During fit, CoxPH copies the training data and normalizes it.
        # Future calls should be normalized in the same way and
        # internal training set should not be saved in a normalized state.

        cf = CoxPHFitter(normalize=True)
        cf.fit(data_pred2, duration_col='t', event_col='E')

        # Internal training set
        ci_trn = concordance_index(cf.durations,
                                   -cf.predict_partial_hazard(cf.data).values,
                                   cf.event_observed)
        # New data should normalize in the exact same way
        ci_org = concordance_index(data_pred2['t'],
                                   -cf.predict_partial_hazard(data_pred2[['x1', 'x2']]).values,
                                   data_pred2['E'])

        assert ci_org == ci_trn
Example #3
    def test_cox_ph_prediction_monotonicity(self, data_pred2):
        # Concordance-wise, all prediction methods should be monotonic versions
        # of one another, unless numerical factors screw it up.
        t = data_pred2['t']
        e = data_pred2['E']
        X = data_pred2[['x1', 'x2']]

        for normalize in [True, False]:
            msg = ("Predict methods should get the same concordance" +
                   " when {}normalizing".format('' if normalize else 'not '))
            cf = CoxPHFitter(normalize=normalize)
            cf.fit(data_pred2, duration_col='t', event_col='E')

            # Base comparison is partial_hazards
            ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)

            ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
            assert ci_ph == ci_med, msg

            ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
            assert ci_ph == ci_exp, msg
Example #4
    def score_(self):
        """
        The concordance score (also known as the c-index) of the fit. The c-index is a generalization of the ROC AUC
        to survival data, accounting for censoring.

        For this purpose, the ``score_`` is a measure of the predictive accuracy of the fitted model
        onto the training dataset. It's analogous to the R^2 in linear models.

        """
        # pylint: disable=access-member-before-definition
        if hasattr(self, "_predicted_hazards_"):
            self._concordance_score_ = concordance_index(self.durations, -self._predicted_hazards_, self.event_observed)
            del self._predicted_hazards_
            return self._concordance_score_
        return self._concordance_score_
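The `score_` accessor above computes the c-index once from cached training-set hazards and then discards the cache. A minimal standalone sketch of the same compute-once-then-cache idea (class and attribute names are illustrative, not lifelines API):

from lifelines.utils import concordance_index

class ScoredModel:
    def __init__(self, durations, predicted_hazards, event_observed):
        self.durations = durations
        self._predicted_hazards_ = predicted_hazards  # cached until first score_ access
        self.event_observed = event_observed

    @property
    def score_(self):
        if hasattr(self, "_predicted_hazards_"):
            # Higher hazard should mean shorter survival, hence the minus sign.
            self._concordance_score_ = concordance_index(
                self.durations, -self._predicted_hazards_, self.event_observed)
            del self._predicted_hazards_
        return self._concordance_score_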
Example #5
    def get_concordance_index(self, x, t, e, **kwargs):
        """
        Taken from the lifelines.utils package. Docstring is provided below.

        Parameters:
            x: (n, d) numpy array of observations.
            t: (n) numpy array of observed event times.
            e: (n) numpy array of event indicators (1 if the event was observed, 0 if censored).

        Returns:
            concordance_index: calculated using lifelines.utils.concordance_index

        lifelines.utils.concordance_index docstring:

        Calculates the concordance index (C-index) between two series
        of event times. The first is the real survival times from
        the experimental data, and the other is the predicted survival
        times from a model of some kind.

        The concordance index is a value between 0 and 1 where,
        0.5 is the expected result from random predictions,
        1.0 is perfect concordance and,
        0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)

        Score is usually 0.6-0.7 for survival models.

        See:
        Harrell FE, Lee KL, Mark DB. Multivariable prognostic models: issues in
        developing models, evaluating assumptions and adequacy, and measuring and
        reducing errors. Statistics in Medicine 1996;15(4):361-87.
        """
        compute_hazards = theano.function(
            inputs=[self.X],
            outputs=-self.partial_hazard
        )
        partial_hazards = compute_hazards(x)

        return concordance_index(t, partial_hazards, e)
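A short demonstration of the sign convention used throughout these snippets: `concordance_index` expects scores that rank like survival times, so risk-like scores are negated, and an anti-concordant score can be flipped by multiplying by -1 (a small self-contained check, not taken from the snippet):

import numpy as np
from lifelines.utils import concordance_index

T = np.array([5., 3., 9., 1.])
risk = np.array([0.2, 0.8, 0.1, 0.9])  # higher risk should mean shorter survival
e = np.array([1, 1, 0, 1])

ci_risk = concordance_index(T, risk, e)       # anti-concordant when risk is passed directly
ci_neg_risk = concordance_index(T, -risk, e)  # negate risk to rank like survival times
assert np.isclose(ci_risk + ci_neg_risk, 1.0)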
Example #6
    def print_summary(self):
        """
        Print summary statistics describing the fit.

        """
        df = self.summary
        # Significance codes last
        df[''] = [significance_code(p) for p in df['p']]

        # Print information about data first
        print('n={}, number of events={}'.format(self.data.shape[0],
                                                 np.where(self.event_observed)[0].shape[0]),
              end='\n\n')
        print(df.to_string(float_format=lambda f: '{:.3e}'.format(f)))
        # Significance code explanation
        print('---')
        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
              end='\n\n')
        print("Concordance = {:.3f}"
              .format(concordance_index(self.durations,
                                        -self.predict_partial_hazard(self.data).values.ravel(),
                                        self.event_observed)))
        return
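The `significance_code` helper used above is not shown in the snippet. A hypothetical sketch that would be consistent with the printed legend (the actual lifelines helper may differ):

def significance_code(p):
    # Hypothetical R-style significance codes matching the legend printed above.
    if p < 0.001:
        return '***'
    if p < 0.01:
        return '**'
    if p < 0.05:
        return '*'
    if p < 0.1:
        return '.'
    return ' '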
Example #7
def test_concordance_index():
    size = 1000
    T = np.random.normal(size=size)
    P = np.random.normal(size=size)
    C = np.random.choice([0, 1], size=size)
    Z = np.zeros_like(T)

    # Zeros is exactly random
    assert utils.concordance_index(T, Z) == 0.5
    assert utils.concordance_index(T, Z, C) == 0.5

    # Itself is 1
    assert utils.concordance_index(T, T) == 1.0
    assert utils.concordance_index(T, T, C) == 1.0

    # Random is close to 0.5
    assert abs(utils.concordance_index(T, P) - 0.5) < 0.05
    assert abs(utils.concordance_index(T, P, C) - 0.5) < 0.05
Example #9
    def training_epoch_end(self, outputs):
        """Compute performance metrics on the training dataset.

        This method is called automatically by pytorch-lightning.
        """
        pred_surv = torch.cat([x["pred_surv"] for x in outputs]).cpu()
        true_binary = torch.cat([x["true_binary"]
                                 for x in outputs]).cpu().numpy()
        true_time = torch.cat([x["true_time"] for x in outputs]).cpu().numpy()
        true_event = torch.cat([x["true_event"] for x in outputs
                                ]).cpu().numpy().astype(bool)

        two_year_bin = np.digitize(24, self.time_bins)
        survival_fn = mtlr_survival_at_times(pred_surv,
                                             np.pad(self.time_bins, (1, 0)),
                                             self.eval_times)
        pred_binary = 1 - mtlr_survival(pred_surv)[:, two_year_bin]
        roc_auc = roc_auc_score(true_binary, pred_binary)

        pred_risk = mtlr_risk(pred_surv).numpy()
        ci = concordance_index(true_time,
                               -pred_risk,
                               event_observed=true_event)

        log = {
            "training/surv/roc_auc_at_2yrs": roc_auc,
            "training/surv/ci": ci,
        }

        # log loss and metrics to Tensorboard
        loss_keys = [k for k in outputs[0]["log"].keys() if "/loss" in k]
        log.update({
            k: torch.stack([x["log"][k] for x in outputs]).mean()
            for k in loss_keys
        })
        return {"loss": log["training/total/loss"], "log": log}
Example #10
    def score(self,
              df: pd.DataFrame,
              scoring_method: str = "log_likelihood") -> float:
        """
        Score the data in df on the fitted model. With default scoring method, returns
        the *average partial log-likelihood*.

        Parameters
        ----------
        df: DataFrame
            the dataframe with duration col, event col, etc.
        scoring_method: str
            one of {'log_likelihood', 'concordance_index'}
            log_likelihood: returns the average unpenalized partial log-likelihood.
            concordance_index: returns the concordance-index
        """
        if scoring_method == "log_likelihood":
            raise NotImplementedError("Only concordance_index is available")

        T = df.pop(self.duration_col).astype(float)
        E = df.pop(self.event_col).astype(bool)

        predictions = self.predict_median(df)
        return concordance_index(T, predictions, event_observed=E)
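A small usage sketch of concordance-based scoring on a fitted `CoxPHFitter`, using lifelines' bundled Rossi dataset (the dataset and column names are illustrative, not from the snippet above):

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi
from lifelines.utils import concordance_index

df = load_rossi()
cph = CoxPHFitter().fit(df, duration_col="week", event_col="arrest")

# Higher partial hazard means shorter expected survival, so negate it.
X = df.drop(columns=["week", "arrest"])
ci = concordance_index(df["week"],
                       -cph.predict_partial_hazard(X),
                       event_observed=df["arrest"])
print(round(ci, 3))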
Example #11
    def get_concordance_index(self, x, t, e, **kwargs):
        """
        Taken from the lifelines.utils package. Docstring is provided below.

        Parameters:
            x: (n, d) numpy array of observations.
            t: (n) numpy array of observed event times.
            e: (n) numpy array of event indicators (1 if the event was observed, 0 if censored).

        Returns:
            concordance_index: calculated using lifelines.utils.concordance_index

        lifelines.utils.concordance_index docstring:

        Calculates the concordance index (C-index) between two series
        of event times. The first is the real survival times from
        the experimental data, and the other is the predicted survival
        times from a model of some kind.

        The concordance index is a value between 0 and 1 where,
        0.5 is the expected result from random predictions,
        1.0 is perfect concordance and,
        0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)

        Score is usually 0.6-0.7 for survival models.

        See:
        Harrell FE, Lee KL, Mark DB. Multivariable prognostic models: issues in
        developing models, evaluating assumptions and adequacy, and measuring and
        reducing errors. Statistics in Medicine 1996;15(4):361-87.
        """
        compute_hazards = theano.function(inputs=[self.X],
                                          outputs=-self.partial_hazard)
        partial_hazards = compute_hazards(x)

        return concordance_index(t, partial_hazards, e)
Example #12
    def train_neural_network(self):
        train_print = "Training Deep Regularized AFT Model:"
        params_print = "Parameters: l2_reg:{}, learning_rate:{}," \
                       " momentum: beta1={} beta2={}, batch_size:{}, batch_norm:{}," \
                       " hidden_dim:{}, latent_dim:{}, num_of_batches:{}, keep_prob:{}" \
            .format(self.l2_reg, self.learning_rate, self.beta1, self.beta2, self.batch_size,
                    self.batch_norm, self.hidden_dim, self.latent_dim, self.num_batches, self.keep_prob)
        print(train_print)
        print(params_print)
        logging.debug(train_print)
        logging.debug(params_print)
        self.session.run(tf.global_variables_initializer())

        best_ci = 0
        best_validation_epoch = 0
        last_improvement = 0

        start_time = time.time()
        epochs = 0
        show_all_variables()
        j = 0

        for i in range(self.num_iterations):
            # Batch Training
            run_options = tf.RunOptions(timeout_in_ms=4000)
            x_batch, t_batch, e_batch = self.session.run(
                [self.x_batch, self.t_batch, self.e_batch],
                options=run_options)
            risk_batch = risk_set(data_t=t_batch)
            batch_impute_mask = get_missing_mask(x_batch,
                                                 self.imputation_values)
            batch_size = len(t_batch)
            idx_observed = e_batch == 1
            # TODO simplify batch processing
            feed_dict_train = {
                self.x: x_batch,
                self.x_lab: x_batch[idx_observed],
                self.x_unlab: x_batch[np.logical_not(idx_observed)],
                self.impute_mask: batch_impute_mask,
                self.t: t_batch,
                self.t_lab: t_batch[idx_observed],
                self.t_unlab: t_batch[np.logical_not(idx_observed)],
                self.e: e_batch,
                self.risk_set: risk_batch,
                self.batch_size_tensor: batch_size,
                self.is_training: True
            }
            summary, train_time, train_cost, train_ranking, train_rae, train_reg, train_lik, train_recon, \
            train_obs_lik, train_censo_lik, _ = self.session.run(
                [self.merged, self.predicted_time, self.cost, self.ranking_partial_lik, self.total_rae,
                 self.reg_loss, self.neg_log_lik, self.total_t_recon_loss, self.observed_neg_lik, self.censored_neg_lik,
                 self.optimizer],
                feed_dict=feed_dict_train)
            train_ci = concordance_index(
                event_times=t_batch,
                predicted_event_times=train_time.reshape(t_batch.shape),
                event_observed=e_batch)
            tf.verify_tensor_all_finite(train_cost,
                                        "Training Cost has Nan or Infinite")
            if j >= self.num_examples:
                epochs += 1
                is_epoch = True
                # idx = 0
                j = 0
            else:
                # idx = j
                j += self.batch_size
                is_epoch = False

            if i % 100 == 0:
                train_print = "it:{}, trainCI:{}, train_ranking:{}, train_RAE:{},  train_lik:{}, train_obs_lik:{}, " \
                              "train_cens_lik:{}, train_reg:{}".format(i, train_ci, train_ranking, train_rae, train_lik,
                                                                       train_obs_lik, train_censo_lik, train_reg)
                print(train_print)
                logging.debug(train_print)

            if is_epoch or (i == (self.num_iterations - 1)):
                improved_str = ''
                # Calculate the validation CI
                self.train_ci.append(train_ci)
                self.train_cost.append(train_cost)
                self.train_t_rae.append(train_rae)
                self.train_log_lik.append(train_lik)
                self.train_ranking.append(train_ranking)
                self.train_recon.append(train_recon)

                self.train_writer.add_summary(summary, i)
                valid_ci, valid_cost, valid_rae, valid_ranking, valid_lik, valid_reg, valid_log_var, valid_recon = self.predict_concordance_index(
                    x=self.valid_x, e=self.valid_e, t=self.valid_t)
                self.valid_cost.append(valid_cost)
                self.valid_ci.append(valid_ci)
                self.valid_t_rae.append(valid_rae)
                self.valid_log_lik.append(valid_lik)
                self.valid_ranking.append(valid_ranking)
                self.valid_recon.append(valid_recon)
                tf.verify_tensor_all_finite(
                    valid_cost, "Validation Cost has Nan or Infinite")

                if valid_ci > best_ci:
                    self.saver.save(sess=self.session,
                                    save_path=self.save_path)
                    best_validation_epoch = epochs
                    best_ci = valid_ci
                    print("valid_ci:{}".format(valid_ci))
                    last_improvement = i
                    improved_str = '*'
                    # Save the best-performing variables of the TensorFlow graph to file.
                # update best validation accuracy
                optimization_print = "Iteration: {} epochs:{}, Training: RAE:{}, Loss: {}," \
                                     " Ranking:{}, Reg:{}, Lik:{}, T_Recon:{}, CI:{}" \
                                     " Validation RAE:{} Loss:{}, Ranking:{}, Reg:{}, Lik:{}, T_Recon:{}, CI:{}, {}" \
                    .format(i + 1, epochs, train_rae, train_cost, train_ranking, train_reg, train_lik,
                            train_recon,
                            train_ci, valid_rae, valid_cost, valid_ranking, valid_reg, valid_lik, valid_recon,
                            valid_ci, improved_str)

                print(optimization_print)
                logging.debug(optimization_print)
                if i - last_improvement > self.require_improvement or math.isnan(
                        valid_cost) or epochs >= self.max_epochs:
                    print(
                        "No improvement found in a while, stopping optimization."
                    )
                    # Break out from the for-loop.
                    break
        # Ending time.

        end_time = time.time()
        time_dif = end_time - start_time
        time_dif_print = "Time usage: " + str(
            timedelta(seconds=int(round(time_dif))))
        print(time_dif_print)
        logging.debug(time_dif_print)
        # shutdown everything to avoid zombies
        self.session.run(self.queue.close(cancel_pending_enqueues=True))
        self.coord.request_stop()
        self.coord.join(self.threads)
        return best_validation_epoch, epochs
Example #13
def run_comb(i):

    clinical_train, _, rnaseq_train, _, mirna_train, _, target_train, target_test = train_test_split(
        clinical,
        rnaseq,
        mirna,
        target,
        test_size=0.15,
        stratify=patient_cancer_type)
    Mo = Model(clinical_input=clinical_input,
               gene_expression_input=gene_expression_input,
               mirna_input=mirna_input)

    device = "cpu"
    target_train.index = [i for i in range(len(target_train))]
    days_to_death = target_train["days_to_death"].values

    data = {
        "gene_expression": torch.tensor(rnaseq_train, device=device),
        "mirna": torch.tensor(mirna_train, device=device),
        "clinical": torch.tensor(clinical_train, device=device),
    }
    if i != "":
        del data[i]
    f = open("table2_{0}.txt".format(data.keys()), "a")
    f.write("\nFiles used: {0}".format(ld.files))

    f.write("{0}".format(data.keys()))
    now = time.time()
    Mo.train(data, target_train, n_batches=10)
    took = time.time() - now
    print("Train time:", took)
    f.write("Took {0}".format(took))

    for cancer_type in set(patient_cancer_type):

        indexes = [
            index for index, value in enumerate(patient_cancer_type)
            if value == cancer_type
        ]
        type_rnaseq = np.array(rnaseq)[indexes]
        type_rnaseq = [list(i) for i in type_rnaseq]
        type_mirna = np.array(mirna)[indexes]
        type_mirna = [list(i) for i in type_mirna]
        type_clinical = np.array(clinical)[indexes]
        type_clinical = [list(i) for i in type_clinical]
        print("\nTesting Data---for", cancer_type)
        f.write("\nTesting Data---for{0}--{1}".format(cancer_type, i))
        days_to_death = target_test["days_to_death"].values
        vital_status = target_test["vital_status"].values

        data = {
            "gene_expression": torch.tensor(type_rnaseq),
            "mirna": torch.tensor(type_mirna),
            "clinical": torch.tensor(type_clinical),
        }
        if i != "":
            del data[i]
        hazard = Mo(data)["hazard"].detach()

        try:
            c_index_1 = concordance_index(days_to_death, -hazard)
        except:
            c_index_1 = "None"
        try:
            c_index_2 = concordance_index(days_to_death, hazard)
        except:
            c_index_2 = "None"
        try:
            c_index_3 = concordance_index(days_to_death, -hazard,
                                          np.logical_not(vital_status))
        except:
            c_index_3 = "None"

        write = "\nC_index:{0} {1} {2}".format(c_index_1, c_index_2, c_index_3)
        f.write(write)
    f.close()
Example #14
def CIndex_lifeline(hazards, labels, survtime_all):
    return (concordance_index(survtime_all, -hazards, labels))
Example #15
    def _fit_static(self, dataframe, duration_col, event_col=None,
                    timeline=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe with covariates, a duration_col, and an event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
                      should be left as None.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            show_progress: include a fancy progress bar!

        Returns:
          self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)
        pass_for_numeric_dtypes_or_raise(df)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        n, d = df.shape
        columns = df.columns

        # initialize dataframe to store estimates
        non_censored_times = list(T[C].iteritems())
        n_deaths = len(non_censored_times)

        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censored_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censored_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d,))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(id)
            if C[id] == 0:
                continue

            relevant_individuals = (ids == id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")

            hazards_.loc[time, id] = v.T
            variance_.loc[time, id] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.score_ = concordance_index(self.durations,
                                        self.predict_median(dataframe).values.ravel(),
                                        self.event_observed)
        return
Example #16
def trainCox(dataroot='./data/TCGA_GBMLGG/',
             ckpt_name='./checkpoints/surv_15_cox/',
             model='cox_omic',
             penalizer=1e-4):
    ### Creates Checkpoint Directory
    if not os.path.exists(ckpt_name): os.makedirs(ckpt_name)
    if not os.path.exists(os.path.join(ckpt_name, model)):
        os.makedirs(os.path.join(ckpt_name, model))

    ### Load PNAS Splits
    pnas_splits = pd.read_csv(dataroot + 'pnas_splits.csv')
    pnas_splits.columns = ['TCGA ID'] + [str(k) for k in range(1, 16)]
    pnas_splits.index = pnas_splits['TCGA ID']
    pnas_splits = pnas_splits.drop(['TCGA ID'], axis=1)

    ### Loads Data
    ignore_missing_moltype = True if model in [
        'cox_omic', 'cox_moltype', 'cox_grade+moltype', 'all'
    ] else False
    ignore_missing_histype = True if model in [
        'cox_histype', 'cox_grade', 'cox_grade+moltype', 'all'
    ] else False
    all_dataset = getCleanAllDataset(
        dataroot=dataroot,
        ignore_missing_moltype=ignore_missing_moltype,
        ignore_missing_histype=ignore_missing_histype)[1]
    model_feats = {
        'cox_omic': [
            'TCGA ID', 'Histology', 'Grade', 'Molecular subtype',
            'Histomolecular subtype'
        ],
        'cox_moltype':
        ['Survival months', 'censored', 'codeletion', 'idh mutation'],
        'cox_histype': ['Survival months', 'censored', 'Histology'],
        'cox_grade': ['Survival months', 'censored', 'Grade'],
        'cox_grade+moltype':
        ['Survival months', 'censored', 'codeletion', 'idh mutation', 'Grade'],
        'cox_all': ['TCGA ID', 'Histomolecular subtype']
    }
    cv_results = []

    for k in pnas_splits.columns:
        pat_train = list(
            set(pnas_splits.index[pnas_splits[k] == 'Train']).intersection(
                all_dataset.index))
        pat_test = list(
            set(pnas_splits.index[pnas_splits[k] == 'Test']).intersection(
                all_dataset.index))
        feats = all_dataset.columns.drop(
            model_feats[model]
        ) if model == 'cox_omic' or model == 'cox_all' else model_feats[model]
        train = all_dataset.loc[pat_train]
        test = all_dataset.loc[pat_test]

        cph = CoxPHFitter(penalizer=penalizer)
        cph.fit(train[feats],
                duration_col='Survival months',
                event_col='censored',
                show_progress=False)
        cin = concordance_index(test['Survival months'],
                                -cph.predict_partial_hazard(test[feats]),
                                test['censored'])
        cv_results.append(cin)

        train.insert(loc=0,
                     column='Hazard',
                     value=-cph.predict_partial_hazard(train))
        test.insert(loc=0,
                    column='Hazard',
                    value=-cph.predict_partial_hazard(test))
        pickle.dump(
            train,
            open(
                os.path.join(ckpt_name, model,
                             '%s_%s_pred_train.pkl' % (model, k)), 'wb'))
        pickle.dump(
            test,
            open(
                os.path.join(ckpt_name, model,
                             '%s_%s_pred_test.pkl' % (model, k)), 'wb'))

    pickle.dump(
        cv_results,
        open(os.path.join(ckpt_name, model, '%s_results.pkl' % model), 'wb'))
    print("C-Indices across Splits", cv_results)
    print("Average C-Index: %f" % CI_pm(cv_results))
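`CI_pm` is not defined in this snippet; judging from the `%f` format string it returns a single float. A hypothetical stand-in that simply averages the per-split c-indices:

import numpy as np

def CI_pm(cv_results):
    # Hypothetical helper: mean c-index across the cross-validation splits.
    return float(np.mean(cv_results))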
Example #17
    def fit(self,
            df,
            duration_col,
            event_col=None,
            show_progress=False,
            initial_beta=None,
            strata=None,
            step_size=None,
            weights_col=None):
        """
        Fit the Cox Proportional Hazards model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' event was observed: 1 if observed, 0 else (censored).
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_

        """

        df = df.copy()

        # Sort on time
        df = df.sort_values(by=duration_col)

        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Extract time and event
        T = df[duration_col]
        del df[duration_col]
        if event_col is None:
            E = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            E = df[event_col]
            del df[event_col]

        if weights_col:
            weights = df.pop(weights_col)
            if (weights.astype(int) != weights).any():
                warnings.warn(
                    """It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
                    """, RuntimeWarning)

        else:
            weights = pd.DataFrame(np.ones((self._n_examples, 1)),
                                   index=df.index)

        self._check_values(df, T, E)
        df = df.astype(float)

        # save fitting data for later
        self.durations = T.copy()
        self.event_observed = E.copy()
        if self.strata is not None:
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        E = E.astype(bool)

        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean,
                                                   self._norm_std),
                                         T,
                                         E,
                                         weights=weights,
                                         initial_beta=initial_beta,
                                         show_progress=show_progress,
                                         step_size=step_size)

        self.hazards_ = pd.DataFrame(
            hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard(
        )
        self.baseline_survival_ = self._compute_baseline_survival()
        self.score_ = concordance_index(
            self.durations, -self.predict_partial_hazard(df).values.ravel(),
            self.event_observed)
        self._train_log_partial_hazard = self.predict_log_partial_hazard(
            self._norm_mean.to_frame().T)
        return self
Example #18
def eval(model, x, y, e):
    hr_pred = model.predict(x)
    hr_pred = np.exp(hr_pred)
    ci = concordance_index(y, -hr_pred, e)
    return ci
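Because `exp` is strictly increasing, exponentiating the predicted log hazard ratios leaves the ranking, and therefore the c-index, unchanged; the call above would give the same value without `np.exp`. A quick check (dummy numbers, for illustration only):

import numpy as np
from lifelines.utils import concordance_index

y = np.array([2., 5., 3., 8.])
e = np.array([1, 0, 1, 1])
log_hr = np.array([0.7, -1.2, 0.3, -0.5])  # predicted log hazard ratios

assert concordance_index(y, -log_hr, e) == concordance_index(y, -np.exp(log_hr), e)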
Example #19
def surv_coxph(data_train,
               x_cols,
               duration_col,
               event_col,
               data_test=None,
               pt=None,
               show_extra=True):
    """Fit a Cox regression model and evaluate it by concordance index and AUC.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Full survival data for train.
    x_cols : list of str
        Name of column indicating variables.
    duration_col : str
        Name of column indicating time.
    event_col : str
        Name of column indicating event.
    data_test : pandas.DataFrame
        Full survival data for test, default None.
    pt : float
        Predicted time for AUC.

    Returns
    -------
    object
        Object of cox model in `lifelines.CoxPHFitter`.

    Examples
    --------
    >>> surv_coxph(train_data, ['x1', 'x2'], 'T', 'E', test_data, pt=5*12)
    """
    y_cols = [event_col, duration_col]
    cph = CoxPHFitter()
    cph.fit(data_train[x_cols + y_cols],
            duration_col=duration_col,
            event_col=event_col,
            show_progress=True)
    # CI of train
    pred_X_train = cph.predict_partial_hazard(data_train[x_cols])
    pred_X_train.rename(columns={0: 'X'}, inplace=True)
    ci_train = concordance_index(data_train[duration_col], -pred_X_train,
                                 data_train[event_col])
    # AUC of train at pt
    df = pd.concat([data_train[y_cols], pred_X_train], axis=1)
    roc_train = surv_roc(df, 'X', duration_col, event_col, pt=pt)
    if data_test is not None:
        # CI of test
        pred_X_test = cph.predict_partial_hazard(data_test[x_cols])
        pred_X_test.rename(columns={0: 'X'}, inplace=True)
        ci_test = concordance_index(data_test[duration_col], -pred_X_test,
                                    data_test[event_col])
        # AUC of test at pt
        df = pd.concat([data_test[y_cols], pred_X_test], axis=1)
        roc_test = surv_roc(df, 'X', duration_col, event_col, pt=pt)
    # Print Summary of CPH
    cph.print_summary()
    print("__________Metrics CI__________")
    print("CI of train: %.4f" % ci_train)
    if data_test is not None:
        print("CI of test : %.4f" % ci_test)
    print("__________Metrics AUC__________")
    print("AUC of train: %.4f" % roc_train['AUC'])
    if data_test is not None:
        print("AUC of test : %.4f" % roc_test['AUC'])

    if not show_extra:
        return cph
    # Print Coefficients
    print("__________Summary of Coefficients in CPH__________")
    cols = ['coef', 'p', 'lower 0.95', 'upper 0.95']
    print(cols[0], ":")
    for i in cph.summary.index:
        print("%.4f" % (cph.summary.loc[i, cols[0]]))
    print("__________")
    print(cols[1], ":")
    for i in cph.summary.index:
        print("%.4f" % (cph.summary.loc[i, cols[1]]))
    print("__________")
    print("95% CI :")
    for i in cph.summary.index:
        print("[%.4f, %.4f]" % (cph.summary.loc[i, cols[2]],
                                cph.summary.loc[i, cols[3]]))
    return cph
Example #20
def test_concordance_index_returns_same_after_shifting():
    T = np.array([1, 2, 3, 4, 5, 6])
    T_ = np.array([2, 1, 4, 6, 5, 3])
    assert utils.concordance_index(T, T_) == utils.concordance_index(
        T - 5, T_ - 5) == utils.concordance_index(
            T, T_ - 5) == utils.concordance_index(T - 5, T_)
Example #21
def main():
    args = parse_args()
    if args.conf_path is None:
        conf_path = DEFAULT_CONF_PATH
    else:
        conf_path = Path(args.conf_path)
    exp_config = CoxExperimentConfig.from_conf(conf_path)
    exp_config.output_dir.mkdir(parents=True, exist_ok=True)
    hypersearch_config = HypersearchConfig.from_conf(conf_path)

    shutil.copy(str(conf_path),
                str(exp_config.output_dir.joinpath("cox.conf")))

    # import input data: i_full=list of patient IDs, y_full=censoring status and survival times for patients,
    # x_full=input data for patients (i.e. motion descriptors [11,514-element vector])

    with open(str(exp_config.data_path), 'rb') as f:
        c3 = pickle.load(f)
    x_full = c3[0]
    y_full = c3[1]
    print(x_full.shape, y_full.shape)
    del c3

    # Initialize lists to store predictions
    c_vals = []
    c_trains = []

    kf = KFold(n_splits=exp_config.n_folds)
    i = 0
    for train_indices, test_indices in kf.split(x_full):
        print(train_indices.shape, test_indices.shape)

        x_train, y_train = x_full[train_indices], y_full[train_indices]
        x_val, y_val = x_full[test_indices], y_full[test_indices]

        # STEP 1: find optimal hyperparameters using CV
        print("Step 1a")
        opars, osummary = hypersearch_cox(
            x_data=x_train,
            y_data=y_train,
            method=exp_config.search_method,
            nfolds=exp_config.n_folds,
            nevals=exp_config.n_evals,
            penalty_range=hypersearch_config.penalty_exp)
        print("Step 1b")
        # (1b) using optimal hyperparameters, train a model and test its performance on the holdout validation set.
        olog = train_cox_reg(
            xtr=x_train,
            ytr=y_train,
            penalty=10**opars['penalty'],
        )

        # (1c) Compute Harrell's Concordance index
        pred_val = olog.predict_partial_hazard(x_val)
        c_val = concordance_index(y_val[:, 1], -pred_val, y_val[:, 0])

        pred_train = olog.predict_partial_hazard(x_train)
        c_train = concordance_index(y_train[:, 1], -pred_train, y_train[:, 0])
        c_vals.append(c_val)
        c_trains.append(c_train)
        save_params(opars,
                    osummary,
                    "cv_{}".format(i),
                    exp_config.output_dir,
                    c_val=c_val,
                    c_train=c_train,
                    c_val_mean=np.mean(c_vals),
                    c_val_var=np.var(c_vals),
                    c_train_mean=np.mean(c_trains),
                    c_train_var=np.var(c_trains))
        print('Validation concordance index = {0:.4f}'.format(c_val))
        i += 1
        plot_cs(c_trains, c_vals, exp_config.output_dir)
    print('Mean Validation concordance index = {0:.4f}'.format(
        np.mean(c_vals)))
    print('Variance = {0:.4f}'.format(np.var(c_vals)))
Example #22
 def concordance_index(self):    
     return concordance_index(self.df['TIME'], -self.df['LPH'], self.df['EVENT'])
Example #23
     print(w)
 get_target = lambda df: (df['time'].values, df['dead'].values)
 time_valid, dead_valid = get_target(dataValid)
 ypred_train_NN = model_cv.predict_proba(x_train_NN)
 ypred_test_NN = model_cv.predict_proba(x_valid_NN)
 ypred_surv_train_NN = ypred_train_NN.reshape([dataTrain.shape[0], -1])
 ypred_surv_valid_NN = ypred_test_NN.reshape([dataValid.shape[0], -1])
 y_pred_valid_surv = np.cumprod((1 - ypred_surv_valid_NN), axis=1)
 y_pred_train_surv = np.cumprod((1 - ypred_surv_train_NN), axis=1)
 oneyr_surv_train = y_pred_train_surv[:, 50]
 oneyr_surv_valid = y_pred_valid_surv[:, 50]
 surv_valid = pd.DataFrame(np.transpose(y_pred_valid_surv))
 surv_valid.index = interval_l
 surv_train = pd.DataFrame(np.transpose(y_pred_train_surv))
 surv_train.index = interval_l
 dict_cv_cindex_train[key] = concordance_index(dataTrain.time,
                                               oneyr_surv_train)
 dict_cv_cindex_valid[key] = concordance_index(dataValid.time,
                                               oneyr_surv_valid)
 #scores_train += concordance_index(dataTrain.time,oneyr_surv_train)#,data_train.dead)
 #scores_test += concordance_index(dataValid.time,oneyr_surv_valid)
 #cta.append(concordance_index(dataTrain.time,oneyr_surv_train))
 #cte.append(concordance_index(dataValid.time,oneyr_surv_valid))
 ev_valid = EvalSurv(surv_valid, time_valid, dead_valid, censor_surv='km')
 scores_test += ev_valid.concordance_td()
 ev_train = EvalSurv(surv_train,
                     dataTrain['time'].values,
                     dataTrain['dead'].values,
                     censor_surv='km')
 scores_train += ev_train.concordance_td()
 cta.append(concordance_index(dataTrain.time, oneyr_surv_train))
 cte.append(concordance_index(dataValid.time, oneyr_surv_valid))
Example #24
	def test_(self, data_generator, model, repurposing_mode = False, test = False):
		y_pred = []
		y_label = []
		model.eval()
		for i, (v_d, v_p, label) in enumerate(data_generator):
			if self.drug_encoding == "MPNN" or self.drug_encoding == 'Transformer':
				v_d = v_d
			else:
				v_d = v_d.float().to(self.device)                
			if self.target_encoding == 'Transformer':
				v_p = v_p
			else:
				v_p = v_p.float().to(self.device)                
			score = self.model(v_d, v_p)
			if self.binary:
				m = torch.nn.Sigmoid()
				logits = torch.squeeze(m(score)).detach().cpu().numpy()
			else:
				logits = torch.squeeze(score).detach().cpu().numpy()
			label_ids = label.to('cpu').numpy()
			y_label = y_label + label_ids.flatten().tolist()
			y_pred = y_pred + logits.flatten().tolist()
			outputs = np.asarray([1 if i else 0 for i in (np.asarray(y_pred) >= 0.5)])
		model.train()
		if self.binary:
			if repurposing_mode:
				return y_pred
			## ROC-AUC curve
			if test:
				roc_auc_file = os.path.join(self.result_folder, "roc-auc.jpg")
				plt.figure(0)
				roc_curve(y_pred, y_label, roc_auc_file, self.drug_encoding + '_' + self.target_encoding)
				plt.figure(1)
				pr_auc_file = os.path.join(self.result_folder, "pr-auc.jpg")
				prauc_curve(y_pred, y_label, pr_auc_file, self.drug_encoding + '_' + self.target_encoding)

			return roc_auc_score(y_label, y_pred), average_precision_score(y_label, y_pred), f1_score(y_label, outputs), log_loss(y_label, outputs), y_pred
		else:
			if repurposing_mode:
				return y_pred
			return mean_squared_error(y_label, y_pred), pearsonr(y_label, y_pred)[0], pearsonr(y_label, y_pred)[1], concordance_index(y_label, y_pred), y_pred
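In this regression setting `concordance_index` is called without an event indicator, so every pair with distinct labels is treated as comparable and the score reduces to a pure ranking measure. A minimal illustration (dummy values):

import numpy as np
from lifelines.utils import concordance_index

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([0.9, 2.1, 2.9, 4.2])  # perfectly ordered predictions
assert concordance_index(y_true, y_pred) == 1.0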
Example #25
            lrfinder = model.lr_finder(x_train, y_train, batch_size, tolerance=10)
            best = lrfinder.get_best_lr()

            model.optimizer.set_lr(best)
            
            epochs = args.epochs
            callbacks = [tt.callbacks.EarlyStopping(patience=patience)]
            verbose = True
            log = model.fit(x_train, y_train_transformed, batch_size, epochs, callbacks, verbose, val_data = val_transformed, val_batch_size = batch_size)

            # Evaluation ===================================================================
            val_loss = min(log.monitors['val_'].scores['loss']['score'])
            
            # get Ctd
            ctd = concordance_index(event_times = durations_test_transformed,
                                    predicted_scores = model.predict(x_test).reshape(-1),
                                    event_observed = events_test)
            
            # set time grid for numerical integration to get IBS and IBLL
            if durations_test.min()>0:
                time_grid = np.linspace(durations_test.min(), durations_test.max(), 100)
            else:
                durations_test_copy = durations_test.copy()
                durations_test_copy.sort()
                time_grid = np.linspace(durations_test_copy[1], durations_test.max(), 100)
            # time_grid = np.linspace(durations_test.min(), durations_test.max(), 100)
            # transform time grid into DSAFT scale for fair comparison
            # pdb.set_trace()
            time_grid = np.exp(scaler_train.transform(np.log(time_grid.reshape(-1, 1)))).reshape(-1)
            # grid interval for numerical integration
            ds = np.array(time_grid - np.array([0.0] + time_grid[:-1].tolist()))
Example #26
                                                presorted_times=True,
                                                kernel_function=kernel_func)

                    # ---------------------------------------------------------
                    # compute c-index
                    #
                    if cindex_method == 'cum_haz':
                        cum_haz = \
                            surv_model.predict_cum_haz(
                                fold_X_val_standardized,
                                sorted_fold_y_val,
                                presorted_times=True,
                                kernel_function=kernel_func)
                        cum_hazard_scores = cum_haz.sum(axis=1)
                        cindex = concordance_index(fold_y_val[:, 0],
                                                   -cum_hazard_scores,
                                                   fold_y_val[:, 1])
                    elif cindex_method == 'cum_haz_from_surv':
                        surv_thresholded = np.maximum(surv,
                                                      np.finfo(float).eps)
                        cum_haz = -np.log(surv_thresholded)
                        cum_hazard_scores = cum_haz.sum(axis=1)
                        cindex = concordance_index(fold_y_val[:, 0],
                                                   -cum_hazard_scores,
                                                   fold_y_val[:, 1])
                    elif cindex_method == 'median':
                        predicted_medians = \
                            np.array([compute_median_survival_time(mesh_points,
                                                                   surv_row)
                                      for surv_row in surv])
                        cindex = concordance_index(fold_y_val[:, 0],
Example #27
        else:
            count+=meta_parameters_dictionary['test_labels_{}'.format(f)].shape[1]
    clinical_features_size=count
    model = get_model(cube_size, clinical_features_size,kernel_size = (3,3,3))
    model= load_model(model_filename,custom_objects = {'cox_regression_loss':cox_regression_loss})
#    model.load_weights(checkpoint_dir+'cyclic_{}_{}.h5'.format(test_cohort,i))                                                
    test_generator = data(meta_parameters_dictionary,batch_size,False,False)
    preds = model.predict_generator(test_generator,verbose=1)
    preds = np.squeeze(preds)
    df = pd.DataFrame()
    df = df.assign(hazard = preds[0])
    df = df.assign(pred_class = (preds[1]))
    df = df.assign(actual_months = meta_parameters_dictionary['test_labels_months'])
    df = df.assign(is_dead = meta_parameters_dictionary['test_labels_is_dead'])
    df.to_csv(test_cohort+'_preds.csv')
    print(concordance_index(df.actual_months,-df.hazard,df.is_dead))
    fpr,tpr,thresholds = roc_curve(meta_parameters_dictionary['test_labels'],np.array(df.pred_class))
    opt_threshold = thresholds[np.argmax(tpr - fpr)]
    class_pred = np.zeros_like(np.array(df.pred_class))
    class_pred[np.where(np.array(df.pred_class)>opt_threshold)]=1
    df = df.assign(class_preds=class_pred)
    T = df['actual_months']
    E = df['is_dead']
#    ix = (df.class_preds==1)
    thres = np.median(df.hazard)
    ix = df.hazard < thres
    kmf = KaplanMeierFitter()
    kmf.fit(T[~ix],E[~ix],label='high-risk')
    ax = kmf.plot()
    kmf.fit(T[ix],E[ix],label='low-risk')
    kmf.plot(ax=ax)
Example #28
def CIndex_lifeline(hazards, labels, survtime_all):
    labels = labels.data.cpu().numpy()
    hazards = hazards.cpu().numpy().reshape(-1)
    return(concordance_index(survtime_all, -hazards, labels))
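A brief usage sketch mirroring the wrapper above with dummy torch tensors (shapes and values are illustrative only):

import numpy as np
import torch
from lifelines.utils import concordance_index

hazards = torch.tensor([[0.3], [1.2], [-0.4]])   # model outputs, one risk score per sample
labels = torch.tensor([1, 0, 1])                 # event indicators
survtime_all = np.array([4.0, 7.0, 2.0])         # observed times

ci = concordance_index(survtime_all,
                       -hazards.cpu().numpy().reshape(-1),
                       labels.cpu().numpy())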
Example #29
    def train(self, trainingData, validationData=None, validation_freq=10):
        #tdata required to sort data only
        ## sort data
        xdata, edata, tdata = trainingData['x'], trainingData[
            'e'], trainingData['t']
        sort_idx = numpy.argsort(tdata)[::-1]
        xdata = xdata[sort_idx]
        edata = edata[sort_idx].astype(numpy.float32)
        tdata = tdata[sort_idx]

        if validationData:
            xdata_valid, edata_valid, tdata_valid = validationData[
                'x'], validationData['e'], validationData['t']
            sort_idx = numpy.argsort(tdata_valid)[::-1]
            xdata_valid = xdata_valid[sort_idx]
            edata_valid = edata_valid[sort_idx].astype(numpy.float32)
            tdata_valid = tdata_valid[sort_idx]

        ##TODO : cache
        if self.params.standardize:
            mean, var = xdata.mean(axis=0), xdata.std(axis=0)
            xdata = (xdata - mean) / var
            ##same mean and var as train
            xdata_valid = (xdata_valid - mean) / var

        assert self.params.modelPath
        assert xdata.shape[
            1] == self.params.n_in, "invalid number of covariates"
        assert (edata.ndim == 1) and (tdata.ndim == 1)  ##sanity check

        train_losses, train_ci, train_index = [], [], []
        validation_losses, validation_ci, validation_index = [], [], []

        best_validation_loss = numpy.inf
        best_params_idx = -1

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer()
                     )  ##init graph with given initializers
            ##start training
            for epoch in range(self.params.n_epochs):
                loss, risk, _ = sess.run(
                    [self.loss, self.risk, self.grad_step],
                    feed_dict={
                        self.x: xdata,
                        self.e: edata
                    })

                train_losses.append(loss)
                train_ci.append(
                    concordance_index(tdata, -numpy.exp(risk.ravel()), edata))
                train_index.append(epoch)

                ##frequently check metrics on validation data
                if validationData and (epoch % validation_freq == 0):
                    vloss, vrisk = sess.run([self.loss, self.risk],
                                            feed_dict={
                                                self.x: xdata_valid,
                                                self.e: edata_valid
                                            })

                    validation_losses.append(vloss)
                    validation_ci.append(
                        concordance_index(tdata_valid,
                                          -numpy.exp(vrisk.ravel()),
                                          edata_valid))
                    validation_index.append(epoch)

                    # improve patience if loss improves enough
                    if vloss < best_validation_loss * self.params.improvement_threshold:
                        self.params.patience = max(
                            self.params.patience,
                            epoch * self.params.patience_increase)

                        best_params_idx = epoch
                        best_validation_loss = vloss

                if self.params.patience <= epoch:
                    break

            print("Training done")
            print("Best epoch", best_params_idx)
            print("Best loss", best_validation_loss)

            ##save model
            saver = tf.train.Saver()
            saver.save(sess, self.params.modelPath)

            self.trainingStats["training"] = {
                "loss": train_losses,
                "ci": train_ci,
                "epochs": train_index,
                "type": "training"
            }

            if validationData:
                self.trainingStats["validation"] = {
                    "loss": validation_losses,
                    "ci": validation_ci,
                    "epochs": validation_index,
                    "type": "validation"
                }

            return self.trainingStats
Example #30
    def fit(self, df, duration_col, event_col=None,
            show_progress=False, initial_beta=None,
            strata=None, step_size=None, weights_col=None):
        """
        Fit the Cox Proportional Hazards model to a dataset. Tied survival times
        are handled using Efron's tie-method.

        Parameters:
          df: a Pandas dataframe with necessary columns `duration_col` and
             `event_col`, plus other covariates. `duration_col` refers to
             the lifetimes of the subjects. `event_col` refers to whether
             the 'death' event was observed: 1 if observed, 0 else (censored).
          duration_col: the column in dataframe that contains the subjects'
             lifetimes.
          event_col: the column in dataframe that contains the subjects' death
             observation. If left as None, assume all individuals are non-censored.
          weights_col: an optional column in the dataframe that denotes the weight per subject.
             This column is expelled and not used as a covariate, but as a weight in the
             final regression. Default weight is 1.
          show_progress: since the fitter is iterative, show convergence
             diagnostics.
          initial_beta: initialize the starting point of the iterative
             algorithm. Default is the zero vector.
          strata: specify a list of columns to use in stratification. This is useful if a
             categorical covariate does not obey the proportional hazard assumption. This
             is used similar to the `strata` expression in R.
             See http://courses.washington.edu/b515/l17.pdf.

        Returns:
            self, with additional properties: hazards_

        """
        df = df.copy()

        # Sort on time
        df = df.sort_values(by=duration_col)

        self._n_examples = df.shape[0]
        self.strata = coalesce(strata, self.strata)
        if self.strata is not None:
            original_index = df.index.copy()
            df = df.set_index(self.strata)

        # Extract time and event
        T = df[duration_col]
        del df[duration_col]
        if event_col is None:
            E = pd.Series(np.ones(df.shape[0]), index=df.index)
        else:
            E = df[event_col]
            del df[event_col]

        if weights_col:
            weights = df.pop(weights_col).values
        else:
            weights = np.ones(self._n_examples)

        self._check_values(df, E)
        df = df.astype(float)

        # save fitting data for later
        self.durations = T.copy()
        self.event_observed = E.copy()
        if self.strata is not None:
            self.durations.index = original_index
            self.event_observed.index = original_index
        self.event_observed = self.event_observed.astype(bool)

        self._norm_mean = df.mean(0)
        self._norm_std = df.std(0)

        E = E.astype(bool)

        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E,
                                         weights=weights,
                                         initial_beta=initial_beta,
                                         show_progress=show_progress,
                                         step_size=step_size)

        self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
        self.confidence_intervals_ = self._compute_confidence_intervals()

        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
        self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
        self.baseline_survival_ = self._compute_baseline_survival()
        self.score_ = concordance_index(self.durations,
                                        -self.predict_partial_hazard(df).values.ravel(),
                                        self.event_observed)
        self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
        return self
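For quick reference, a minimal, hedged usage sketch of this fit method on the Rossi recidivism dataset bundled with lifelines (durations in 'week', event flag in 'arrest'). The attribute names follow the snippet above; newer lifelines releases expose them as params_ and concordance_index_ instead.

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

df = load_rossi()                                      # durations in 'week', event flag in 'arrest'
cph = CoxPHFitter()
cph.fit(df, duration_col='week', event_col='arrest')

print(cph.hazards_)                                    # one fitted coefficient per covariate
print(cph.score_)                                      # training-set concordance index (c-index)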
Exemplo n.º 31
0
    def train(self, x, c, s, names, fold, n_feature=50):
        #learning_ratio = 1e-3
        n = x.shape[0]
        dev_index = n * 3 // 4

        x = self.preprocess(x, c, s, names, fold, n_feature, dev_index)
        x_trn, x_dev = x[:dev_index], x[dev_index:]
        c_trn, c_dev = 1 - c[:dev_index], 1 - c[dev_index:]
        s_trn, s_dev = s[:dev_index], s[dev_index:]

        sort_idx = np.argsort(s_trn)[::-1]
        x_trn = x_trn[sort_idx]
        s_trn = s_trn[sort_idx]
        c_trn = c_trn[sort_idx]

        def nll(E, NUM_E):
            def loss(y_true, y_pred):
                hazard_ratio = K.exp(y_pred)
                log_risk = K.log(K.cumsum(hazard_ratio))
                uncensored_likelihood = K.transpose(y_pred) - log_risk
                censored_likelihood = uncensored_likelihood * E
                neg_likelihood = -K.sum(censored_likelihood) / NUM_E
                return neg_likelihood

            return loss

        input_size = len(x[0])

        cindex_dev = {}
        # for dropout in [0.0, 0.5]:
        for dropout in [0.0]:
            self.model = self.get_model(input_size, dropout)
            for lr in [0.1, 0.01, 0.001, 0.0001]:
                print('############## Run at ', fold, dropout, lr)
                adam = optimizers.Adam(lr=lr)
                self.model.compile(loss=[nll(c_trn, np.sum(c_trn))],
                                   optimizer=adam)

                data = (x_trn, c_trn, s_trn, x_dev, c_dev, s_dev)
                modelpath = self.out_folder + '/%s/%s_(%d)_%0.1f_%0.5f.hdf5' % (
                    self.model_name, self.cancer, fold, dropout, lr)

                checkpoint = MyCallback(modelpath, data)

                self.model.fit(x_trn,
                               s_trn,
                               epochs=self.epochs,
                               batch_size=len(x_trn),
                               verbose=0,
                               shuffle=False,
                               callbacks=[checkpoint])
                self.model.load_weights(modelpath)
                pred_raw = self.model.predict(x_dev, batch_size=1, verbose=1)
                pred_dev = -np.exp(pred_raw)
                cindex_dev_max = concordance_index(s_dev, pred_dev, c_dev)

                cindex_dev[modelpath] = cindex_dev_max

                self.reset_weights()

        self.bestmodelpath, self.cindex_dev_max = max(
            cindex_dev.items(), key=operator.itemgetter(1))

        return self.cindex_dev_max
    def get_concordance_index(self, xdata, edata, tdata):
        risk = self.predict(xdata)
        partial_hazards = -numpy.exp(risk)
        return concordance_index(tdata, partial_hazards, edata)
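The nll closure above is the standard negative Cox partial log-likelihood used in DeepSurv-style models: because x_trn, s_trn and c_trn (the event indicator after the 1 - c flip) were sorted by descending survival time, K.cumsum(hazard_ratio) at position i sums over exactly the risk set of subject i (ignoring ties). A minimal NumPy sketch of the same quantity, useful for sanity-checking the Keras loss; the names are illustrative only.

import numpy as np

def cox_nll_numpy(pred, event):
    # pred:  (n,) model outputs (log-hazards), already sorted by DESCENDING survival time
    # event: (n,) 1 if the event was observed, 0 if censored
    hazard_ratio = np.exp(pred)
    log_risk = np.log(np.cumsum(hazard_ratio))   # log of the sum over each subject's risk set
    uncensored_ll = pred - log_risk              # per-subject partial log-likelihood terms
    return -np.sum(uncensored_ll * event) / event.sum()

# toy check
rng = np.random.default_rng(0)
pred = rng.normal(size=6)
event = np.array([1, 0, 1, 1, 0, 1])
print(cox_nll_numpy(pred, event))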
Exemplo n.º 33
0
#Initialize lists to store predictions
preds_bootfull = []
inds_inbag = []
Cb_opts  = []

#STEP 1
#(1a) find optimal hyperparameters
opars, osummary = hypersearch_DL(x_data=x_full, y_data=y_full, method='particle swarm', nfolds=6, nevals=50, lrexp_range=[-6.,-4.5], l1rexp_range=[-7,-4], dro_range=[.1,.9], units1_range=[75,250], units2_range=[5,20], alpha_range=[0.3, 0.7], batch_size=16, num_epochs=100)

#(1b) using optimal hyperparameters, train a model on full sample
olog = DL_single_run(xtr=x_full, ytr=y_full, units1=opars['units1'], units2=opars['units2'], dro=opars['dro'], lr=10**opars['lrexp'], l1r=10**opars['l1rexp'], alpha=opars['alpha'], batchsize=16, numepochs=100)

#(1c) Compute Harrell's Concordance index
predfull = olog.model.predict(x_full, batch_size=1)[1]
C_app = concordance_index(y_full[:,1], -predfull, y_full[:,0])

print('Apparent concordance index = {0:.4f}'.format(C_app))



#BOOTSTRAP SAMPLING

#define useful variables
nsmp = len(x_full)
rowids = [_ for _ in range(nsmp)]
B = 100

for b in range(B):
    print('Current bootstrap sample:', b, 'of', B-1)
    print('-------------------------------------')
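This loop appears to set up a Harrell-style optimism correction: each iteration refits the tuned model on a bootstrap resample and scores it on both the bootstrap sample and the full sample, so the apparent C_app can be corrected by the average optimism. A hedged sketch of that per-iteration bookkeeping, using the variables defined above, reusing the step-1 hyperparameters for brevity (the full procedure would re-run hypersearch_DL inside the loop) and assuming numpy is imported as np and x_full/y_full are numpy arrays; blog and optimism are illustrative names.

optimism = []
for b in range(B):
    boot_rows = np.random.choice(rowids, size=nsmp, replace=True)
    x_boot, y_boot = x_full[boot_rows], y_full[boot_rows]

    # refit with the step-1 hyperparameters (hypothetically; re-tuning per resample is the strict procedure)
    blog = DL_single_run(xtr=x_boot, ytr=y_boot, units1=opars['units1'], units2=opars['units2'],
                         dro=opars['dro'], lr=10**opars['lrexp'], l1r=10**opars['l1rexp'],
                         alpha=opars['alpha'], batchsize=16, numepochs=100)

    C_boot = concordance_index(y_boot[:, 1], -blog.model.predict(x_boot, batch_size=1)[1], y_boot[:, 0])
    C_full = concordance_index(y_full[:, 1], -blog.model.predict(x_full, batch_size=1)[1], y_full[:, 0])
    optimism.append(C_boot - C_full)

C_adj = C_app - np.mean(optimism)   # optimism-corrected concordance index
print('Optimism-corrected concordance index = {0:.4f}'.format(C_adj))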
def get_rf_comparison(basedir):
    norm = ['rare', 'rel_abun', 'log', 'clr']
    levels = ['phyla', 'classes', 'orders', 'families', 'genera', 'species']  
    order_cols, order_rows = [], []
    fig = plt.figure(figsize=(13, 10)) 
    for l in range(len(levels)):
        level = levels[l]
        dfs, dfs_tax = [], []
        for n in norm:
            df = pd.read_csv(basedir+'random_forest/'+n+'/'+level+'_overall.csv', header=0, index_col=0)
            try:
                df_tax = df.drop(['Score', 'OOB_score'], axis=0)
            except KeyError:  # some result files have no OOB_score row
                df_tax = df.drop(['Score'], axis=0)
            df_tax['Mean'] = df_tax.mean(axis=1)
            df_tax = pd.DataFrame(df_tax.loc[:, 'Mean'])
            df = pd.DataFrame(df.loc['Score', :])
            df = df.transpose()
            df.rename(index={'Score':n}, inplace=True)
            dfs.append(df)
            df_tax = df_tax.transpose()
            df_tax.rename(index={'Mean':n}, inplace=True)
            dfs_tax.append(df_tax)
        dfs = pd.concat(dfs)*100
        dfs_tax = pd.concat(dfs_tax).fillna(value=0)
        for a in range(2):
            dfs['Mean'] = dfs.mean(axis=1)
            dfs = dfs.sort_values(by='Mean', axis=0, ascending=True)
            if a == 1: 
                dfs = dfs.sort_values(by='Mean', axis=0, ascending=False)
            else:
                df_mean = pd.DataFrame(dfs.loc[:, 'Mean'])
            dfs.drop('Mean', axis=1, inplace=True)
            dfs = dfs.transpose()
        dfs.rename(index=norm_names, columns=rename_plots, inplace=True)
        df_mean.rename(index=norm_names, inplace=True)
        order_rows = ['Rarefied', 'Relative\nabundance', 'Log', 'CLR']
        if order_cols == []:
            order_cols = list(dfs.columns)
        else:
            dfs = dfs.loc[:, order_cols]
        dfs = dfs.loc[order_rows, :]
        df_mean = df_mean.loc[order_rows, :]
        ax = plt.subplot2grid((6,120), (l, 5), colspan=76)
        ax_mean = plt.subplot2grid((6, 120), (l, 0), colspan=4)
        ax_con = plt.subplot2grid((6,6), (l, 4))
        ax_con_tax = plt.subplot2grid((6,6), (l, 5))
        xtcks = False
        if l == 5: xtcks = True
        annotate_heatmap(ax, dfs, xticks=xtcks, yticks=False)
        
        annotate_heatmap(ax_mean, df_mean, xticks=xtcks)
        ax_mean.set_ylabel(level.capitalize(), fontsize=fs_main, fontweight='bold')
        concs = []
        for a in range(4):
            conc = []
            for b in range(4):
                l1 = dfs.iloc[a, :].values
                l2 = dfs.iloc[b, :].values
                conc.append(concordance_index(l1, l2))
            concs.append(conc)
        concs = pd.DataFrame(concs, index=dfs.index.values, columns=dfs.index.values)
        annotate_heatmap(ax_con, concs, cmap='Blues', rnd=2, yticks=False, xticks=xtcks, vmin=0.75)
        concs_tax = []
        for a in range(4):
            conc_tax = []
            for b in range(4):
                l1 = dfs_tax.iloc[a, :].values
                l2 = dfs_tax.iloc[b, :].values
                conc_tax.append(concordance_index(l1, l2))
            concs_tax.append(conc_tax)
        concs_tax = pd.DataFrame(concs_tax, index=dfs_tax.index.values, columns=dfs_tax.index.values)
        concs_tax.rename(columns=norm_names, inplace=True)
        annotate_heatmap(ax_con_tax, concs_tax, cmap='Purples', rnd=2, yticks=False, xticks=xtcks, vmin=0.75)
        
        if l == 0:
            ax.set_title('Classification accuracy (%)', fontsize=fs_title, fontweight='bold')
            ax_con.set_title('Concordance in\nclassification accuracy', fontsize=fs_main, fontweight='bold')
            ax_con_tax.set_title('Concordance in\nfeature importance', fontsize=fs_main, fontweight='bold')
    
    plt.savefig(basedir+'/figures/RF_compare'+ext, dpi=600, bbox_inches='tight')
    plt.close()
    return 
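Note that get_rf_comparison calls concordance_index with only two arguments, so no censoring indicator is supplied: every pair of rows is treated as comparable and the result is simply a rank-agreement score between the two normalisations' accuracies (or feature importances). A small self-contained illustration with made-up numbers:

import numpy as np
from lifelines.utils import concordance_index

acc_rare = np.array([61.0, 73.5, 80.2, 68.9, 75.0])
acc_clr  = np.array([59.5, 74.0, 82.1, 70.3, 73.8])

# no event_observed argument: all pairs are comparable, so this is pure rank agreement
print(concordance_index(acc_rare, acc_clr))   # 0.9: nine of the ten pairs are ranked the same way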
Exemplo n.º 35
0
                           optimizer=tf.keras.optimizers.Adam(
                               learning_rate=0.001, ))
         callbacks = [
             tf.keras.callbacks.EarlyStopping(monitor='val_coxph',
                                              min_delta=0.0001,
                                              patience=20,
                                              mode='min',
                                              restore_best_weights=True)
         ]
         history = mil.model.fit(ds_train,
                                 steps_per_epoch=4,
                                 validation_data=ds_valid,
                                 epochs=10000,
                                 callbacks=callbacks)
         y_pred_all = mil.model.predict(ds_all)
         if concordance_index(samples['times'], np.exp(
                 -1 * y_pred_all[:, 0]), samples['event']) > .52:
             X = True
             evaluation = mil.model.evaluate(ds_test)
             histories.append(history.history)
             evaluations.append(evaluation)
             weights.append(mil.model.get_weights())
     except:
         pass
 ##get ranks per cancer
 for index, cancer in enumerate(['NA']):
     mask = np.where(cancer_strat == index)[0]
     cancer_test_indexes[cancer] = cancer_test_indexes.get(
         cancer, []) + [mask[np.isin(mask, idx_test, assume_unique=True)]]
     temp = np.exp(-y_pred_all[mask, 0]).argsort()
     ranks = np.empty_like(temp)
     ranks[temp] = np.arange(len(mask))
Exemplo n.º 36
0
def c_index3(month, risk, status):

    c_index = concordance_index(np.reshape(month, -1), -np.reshape(risk, -1),
                                np.reshape(status, -1))

    return c_index
Exemplo n.º 37
0
    def predict_concordance_index(self, x, t, e, outcomes=None):
        input_size = x.shape[0]
        i = 0
        num_batches = int(np.ceil(input_size / self.batch_size))  # number of mini-batches actually processed
        predicted_time = np.zeros(shape=input_size, dtype=int)  # np.int was removed in recent NumPy releases
        total_ranking = 0.0
        total_rae = 0.0
        total_cost = 0.0
        total_gen_loss = 0.0
        total_disc_loss = 0.0
        total_layer_one_recon = 0.0
        total_t_reg_loss = 0.0
        total_reg = 0.0
        total_mse = 0.0
        while i < input_size:
            # The ending index for the next batch is denoted j.
            j = min(i + self.batch_size, input_size)
            feed_dict = self.batch_feed_dict(e=e,
                                             i=i,
                                             j=j,
                                             t=t,
                                             x=x,
                                             outcomes=outcomes)
            cost, ranking, gen_loss, rae, reg, disc_loss, layer_one_recon, t_reg_loss, t_mse = self.session.run(
                [
                    self.cost, self.ranking_partial_lik, self.gen_one_loss,
                    self.total_rae, self.reg_loss, self.disc_one_loss,
                    self.layer_one_recon, self.t_regularization_loss,
                    self.t_mse
                ],
                feed_dict=feed_dict)
            temp_pred_time = []
            for p in range(self.sample_size):
                gen_time = self.session.run(self.predicted_time,
                                            feed_dict=feed_dict)
                temp_pred_time.append(gen_time)

            temp_pred_time = np.array(temp_pred_time)
            # print("temp_pred_time:{}".format(temp_pred_time.shape))
            predicted_time[i:j] = np.median(temp_pred_time, axis=0)

            total_ranking += ranking
            total_cost += cost
            total_rae += rae
            total_gen_loss += gen_loss
            total_reg += reg
            total_layer_one_recon += layer_one_recon
            total_disc_loss += disc_loss
            total_t_reg_loss += t_reg_loss
            total_mse += t_mse
            i = j
        predicted_event_times = predicted_time.reshape(input_size)
        ci_index = concordance_index(
            event_times=t,
            predicted_scores=np.nan_to_num(predicted_event_times).tolist(),
            event_observed=e)

        def batch_average(total):
            return total / num_batches

        return ci_index, batch_average(total_cost), batch_average(
            total_rae), batch_average(total_ranking), batch_average(
                total_gen_loss), batch_average(total_reg), batch_average(
                    total_disc_loss), batch_average(
                        total_layer_one_recon), batch_average(
                            total_t_reg_loss), batch_average(total_mse)
Exemplo n.º 38
0
    def train_neural_network(self):
        train_print = "Training {0} Model:".format(self.model)
        params_print = "Parameters:, l2_reg:{}, learning_rate:{}," \
                       " momentum: beta1={} beta2={}, batch_size:{}, batch_norm:{}," \
                       " hidden_dim:{}, latent_dim:{}, num_of_batches:{}, keep_prob:{}, disc_update:{}" \
            .format(self.l2_reg, self.learning_rate, self.beta1, self.beta2, self.batch_size,
                    self.batch_norm, self.hidden_dim, self.latent_dim, self.num_batches, self.keep_prob,
                    self.disc_updates)
        #        print(train_print)
        #        print(params_print)
        #        logging.debug(train_print)
        #        logging.debug(params_print)
        self.session.run(tf.global_variables_initializer())

        best_ci = 0
        best_t_reg = np.inf
        best_validation_epoch = 0
        last_improvement = 0

        start_time = time.time()
        epochs = 0
        #        show_all_variables()
        j = 0

        for i in range(self.num_iterations):
            # Batch Training
            run_options = tf.RunOptions(timeout_in_ms=4000)
            x_batch, t_batch, e_batch = self.session.run(
                [self.x_batch, self.t_batch, self.e_batch],
                options=run_options)
            risk_batch = risk_set(data_t=t_batch)
            batch_impute_mask = get_missing_mask(x_batch,
                                                 self.imputation_values)
            batch_size = len(t_batch)
            idx_observed = e_batch == 1
            # TODO simplify batch processing
            feed_dict_train = {
                self.x: x_batch,
                self.impute_mask: batch_impute_mask,
                self.t: t_batch,
                self.t_lab: t_batch[idx_observed],
                self.e: e_batch,
                self.risk_set: risk_batch,
                self.batch_size_tensor: batch_size,
                self.is_training: True,
                self.noise_alpha: np.ones(shape=self.noise_dim)
            }
            for k in range(self.disc_updates):
                _ = self.session.run([self.disc_solver],
                                     feed_dict=feed_dict_train)
            for m in range(self.gen_updates):
                _ = self.session.run([self.gen_solver],
                                     feed_dict=feed_dict_train)
            summary, train_time, train_cost, train_ranking, train_rae, train_reg, train_gen, train_layer_one_recon, \
            train_t_reg, train_t_mse, train_disc = self.session.run(
                [self.merged, self.predicted_time, self.cost, self.ranking_partial_lik, self.total_rae,
                 self.reg_loss, self.gen_one_loss, self.layer_one_recon, self.t_regularization_loss, self.t_mse,
                 self.disc_one_loss],
                feed_dict=feed_dict_train)
            try:
                train_ci = concordance_index(
                    event_times=t_batch,
                    predicted_scores=np.nan_to_num(train_time).reshape(
                        t_batch.shape),
                    event_observed=e_batch)
            except IndexError:
                train_ci = 0.0
                print("C-Index IndexError")

            ##### ibs / ibll #####
            train_time_grid = np.linspace(t_batch.min(), t_batch.max(), 100)
            train_ds = np.array(train_time_grid -
                                np.array([0.0] +
                                         train_time_grid[:-1].tolist()))
            #            print(t_batch)
            #            print(e_batch)
            train_bs, train_bll = get_scores(
                y_train=t_batch,
                delta_train=e_batch,
                y_test=t_batch,
                delta_test=e_batch,
                pred_train=train_time.reshape(t_batch.shape),
                pred_test=train_time.reshape(t_batch.shape),
                time_grid=train_time_grid,
                surv_residual=False,
                cens_residual=False)

            train_ibs = sum(train_bs * train_ds) / (train_time_grid.max() -
                                                    train_time_grid.min())
            train_ibll = sum(train_bll * train_ds) / (train_time_grid.max() -
                                                      train_time_grid.min())
            ######################

            tf.verify_tensor_all_finite(train_cost,
                                        "Training Cost has Nan or Infinite")
            if j >= self.num_examples:
                epochs += 1
                is_epoch = True
                # idx = 0
                j = 0
            else:
                # idx = j
                j += self.batch_size
                is_epoch = False
            if i % 100 == 0:
                train_print = "it:{}, trainCI:{}, train_ranking:{}, train_RAE:{},  train_Gen:{}, train_Disc:{}, " \
                              "train_reg:{}, train_t_reg:{}, train_t_mse:{}, train_layer_one_recon:{}".format(
                    i, train_ci, train_ranking, train_rae, train_gen, train_disc, train_reg, train_t_reg, train_t_mse,
                    train_layer_one_recon)
#                print(train_print)
#                logging.debug(train_print)

            if is_epoch or (i == (self.num_iterations - 1)):
                improved_str = ''
                # Calculate the validation CI
                self.train_ci.append(train_ci)
                self.train_cost.append(train_cost)
                self.train_t_rae.append(train_rae)
                self.train_gen.append(train_gen)
                self.train_disc.append(train_disc)
                self.train_ranking.append(train_ranking)
                self.train_layer_one_recon.append(train_layer_one_recon)

                self.train_writer.add_summary(summary, i)
                valid_ci, valid_cost, valid_rae, valid_ranking, valid_gen, valid_reg, valid_disc, valid_layer_one_recon, valid_t_reg, valid_t_mse = self.predict_concordance_index(
                    x=self.valid_x, e=self.valid_e, t=self.valid_t)
                self.valid_cost.append(valid_cost)
                self.valid_ci.append(valid_ci)
                self.valid_t_rae.append(valid_rae)
                self.valid_gen.append(valid_gen)
                self.valid_disc.append(valid_disc)
                self.valid_ranking.append(valid_ranking)
                self.valid_layer_one_recon.append(valid_layer_one_recon)
                tf.verify_tensor_all_finite(
                    valid_cost, "Validation Cost has Nan or Infinite")

                if valid_t_reg < best_t_reg:
                    self.saver.save(sess=self.session,
                                    save_path=self.save_path)
                    best_validation_epoch = epochs
                    best_t_reg = valid_t_reg
                    last_improvement = epochs
                    improved_str = '*'
                    # Save the best-performing variables of the TensorFlow graph to file.
                # update best validation accuracy
#                optimization_print = "Iteration: {} epochs:{}, Training: RAE:{}, Loss: {}," \
#                                     " Ranking:{}, Reg:{}, Gen:{}, Disc:{}, Recon_One:{}, T_Reg:{},T_MSE:{},  CI:{}" \
#                                     " Validation RAE:{} Loss:{}, Ranking:{}, Reg:{}, Gen:{}, Disc:{}, " \
#                                     "Recon_One:{}, T_Reg:{}, T_MSE:{}, CI:{}, {}" \
#                    .format(i + 1, epochs, train_rae, train_cost, train_ranking, train_reg, train_gen,
#                            train_disc, train_layer_one_recon, train_t_reg, train_t_mse,
#                            train_ci, valid_rae, valid_cost, valid_ranking, valid_reg, valid_gen, valid_disc,
#                            valid_layer_one_recon, valid_t_reg, valid_t_mse, valid_ci, improved_str)
                optimization_print = "Iteration: {} || TRAIN loss: {}, CI: {}, IBS: {}, IBLL: {} || VAL loss: {}, CI:{}, improved: {}".format(
                    i + 1, np.round(train_cost, 4), np.round(train_ci, 4),
                    np.round(train_ibs, 4), np.round(train_ibll, 4),
                    np.round(valid_cost, 4), np.round(valid_ci, 4),
                    improved_str)
                if (i + 1) % 50 == 0:
                    print(optimization_print)
#                logging.debug(optimization_print)
                if epochs - last_improvement > self.require_improvement or math.isnan(
                        train_cost) or epochs >= self.max_epochs:
                    # if i - last_improvement > self.require_improvement:
                    print(
                        "No improvement found in a while, stopping optimization."
                    )
                    # Break out from the for-loop.
                    break
        # Ending time.
        end_time = time.time()
        time_dif = end_time - start_time
        time_dif_print = "Time usage: " + str(
            timedelta(seconds=int(round(time_dif))))
        #        print(time_dif_print)
        #        logging.debug(time_dif_print)
        # shutdown everything to avoid zombies
        self.session.run(self.queue.close(cancel_pending_enqueues=True))
        self.coord.request_stop()
        self.coord.join(self.threads)
        return best_validation_epoch, epochs
Exemplo n.º 39
0
def main():
        
    with tf.device('/gpu:0'):
        x = tf.placeholder(tf.float32, [None, clinic_num], name = 'input')
        s_time = tf.placeholder(tf.float32, [None,num_event], name = 'surv_time')
        s_event = tf.placeholder(tf.float32, [None,num_event], name = 'surv_event')
        Pat_ind = tf.placeholder(tf.int32, [None,num_event], name = 'Pat_ind')
        keep_prob = tf.placeholder(tf.float32, name = 'keep_rate')
        treatment = tf.placeholder(tf.float32, [None, dim_interact_feature], name = 'treatment')
        global_step = tf.placeholder(tf.int32, [])
        # model
        fc = _create_fc_layer(x, 3*clinic_num, 'relu', 'shared_layer', keep_prob, w_reg = reg_W)
        # fc = tf.concat([x,fc0], axis=1)

        # fc1_1 = _create_fc_layer(fc, 5*clinic_num, 'relu', 'specific_layer1_1', keep_prob, w_reg = reg_W)
        fc1_2 = _create_fc_layer(fc, 1*clinic_num, 'relu', 'specific_layer1_2', keep_prob, w_reg = reg_W)
        output1 = _create_fc_layer(fc1_2, num_event-1, 'tanh', 'output_1', use_bias= False, w_reg = reg_W_out)


        fc2_1 = tf.multiply(treatment, fc)
        # fc2_1 = _create_fc_layer(fc2_1, 5*clinic_num, 'relu', 'specific_layer2_1', keep_prob, w_reg = reg_W)
        fc2_2 = _create_fc_layer(fc2_1, 1*clinic_num, 'relu', 'specific_layer2_2', keep_prob, w_reg = reg_W)
        output2 = _create_fc_layer(fc2_2, 1, 'tanh', 'output_2', use_bias= False, w_reg = reg_W_out)
        # loss
        loss_cox_prog, loss_rank_prog = Get_loss(output1, s_time, s_event, Pat_ind)
        pred_DFS = tf.reduce_max(output1,axis=1)
        loss_cox_pred = DeepSurv_loss(s_time[:,3], s_event[:,3], Pat_ind[:,3], output2)
        loss_reg = tf.losses.get_regularization_loss()
        # + intra_loss_weight[1]*loss_cox_pred
        loss_total = intra_loss_weight[0]*loss_cox_prog + reg_factor*loss_reg
        # import pdb; pdb.set_trace()
        # x1 = tf.Variable([0.2,0.3,0.5],tf.float32)
        # x2 = tf.reduce_max(x1)
        learning_rate = exponential_decay_with_warmup(warmup_step,learning_rate_base,global_step,learning_rate_step,learning_rate_decay,staircase=True)
        optimizer = tf.train.MomentumOptimizer(learning_rate = learning_rate, momentum = momentum, use_nesterov = True)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_step = optimizer.minimize(loss_total)

        restore_var = [v for v in tf.trainable_variables()]
        print(restore_var)
        # import pdb;pdb.set_trace()
        saver = tf.train.Saver(max_to_keep = 10)
        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.9)  # set the per-process GPU memory fraction (0.7 would mean 70%)
        config = tf.ConfigProto(gpu_options = gpu_options, allow_soft_placement = True)
        with tf.Session(config = config) as sess:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            # loader = tf.train.Saver(var_list=restore_var)
            # loader.restore(sess, snapshot_dir)
            
            gsp = 0
            # Loop over number of epochs
            for epoch in range(num_epochs):
            
                # print("{} Start epoch number: {}".format(datetime.now(), epoch))
                np.random.shuffle(ind_0)
                np.random.shuffle(ind_1)
                # Initialize iterator with the training dataset
                train_risk = 0.0
                prog_risk = 0.0
                pred_risk = 0.0
                reg_risk = 0.0
                # import pdb;pdb.set_trace()
                for i in range(num_batchs):
                    gsp += 1
                    ind0 = ind_0[i*r0:(i+1)*r0]
                    ind1 = ind_1[i*r1:(i+1)*r1]
                    treat, input_x1, input_time, input_event, input_idx = GetData(ind0,ind1)
                    # import pdb;pdb.set_trace()
                    # pdfs, opt = sess.run([pred_DFS,output1], feed_dict = {global_step:gsp, treatment: treat, x: input_x1, s_time: input_time, s_event: input_event, Pat_ind: input_idx, keep_prob: 1.0})
                    # pdfs1 = sess.run(pred_DFS, feed_dict = {global_step:gsp, treatment: treat, x: input_x1, s_time: input_time, s_event: input_event, Pat_ind: input_idx, keep_prob: 1.0})
                    # print(pdfs1)
                    # print(pdfs)
                    # print(opt)
                    _, opt2, _, opt, reg_ls, prog_ls, pred_ls, total_ls, now_lr = sess.run([train_step, fc, output1, pred_DFS, loss_reg, loss_cox_prog, loss_cox_pred, loss_total, learning_rate], feed_dict = {global_step:gsp, treatment: treat, x: input_x1, s_time: input_time, s_event: input_event, Pat_ind: input_idx, keep_prob: keep_prob_rate})
                    reg_risk += reg_ls
                    train_risk += total_ls
                    prog_risk += prog_ls
                    pred_risk += pred_ls
                # import pdb;pdb.set_trace()
                reg_risk /= num_batchs
                train_risk /= num_batchs
                prog_risk /= num_batchs
                pred_risk /= num_batchs
                line = 'epoch: %d, learning rate: %.5f, total_loss: %.4f, reg_loss: %.4f, prognosis-cox loss: %.4f, predict-cox loss: %.4f' % (epoch + 1, now_lr, train_risk, reg_risk, prog_risk, pred_risk)
                print(line)
                with open(log_path, 'a') as f:
                    f.write(line + '\n')
                if  (epoch+1)%2 == 0:
                    tra_pred = []
                    for i in range(len(tra_treat)):
                        xd = tra_treat[i]
                        treat = np.array([xd]*dim_interact_feature)
                        opt1, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors[i,:].reshape(1,clinic_num), keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)}) 
                        if np.max(Pat_pred)>1.0:
                            import pdb;pdb.set_trace()
                        tra_pred.append(-Pat_pred)
                    # import pdb;pdb.set_trace()
                    tra_pred = np.array(tra_pred, np.float32)
                    tra_ci_value = concordance_index(tra_FFS_time, tra_pred, tra_FFS_event)
                    line = 'train cohort, CI: %.4f, epoch: %d' % (tra_ci_value, epoch)
                    print(line)
                        
                    val_pred = []
                    for i in range(len(val_treat)):
                        xd = val_treat[i]
                        treat = np.array([xd]*dim_interact_feature)
                        opt1, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors_val[i,:].reshape(1,clinic_num),
                                                                 keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)})
                        val_pred.append(-Pat_pred)
                    val_pred = np.array(val_pred, np.float32)
                    val_ci_value = concordance_index(val_FFS_time, val_pred, val_FFS_event)
                    line = 'validation cohort, CI: %.4f, epoch: %d' % (val_ci_value, epoch)
                    print(line)
                        
                    test_pred = []
                    for i in range(len(test_treat)):
                        xd = test_treat[i]
                        treat = np.array([xd]*dim_interact_feature)
                        # import pdb;pdb.set_trace()
                        opt, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors_test[i,:].reshape(1,clinic_num), keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)}) 
                        test_pred.append(-Pat_pred[0])
                    # import pdb;pdb.set_trace()
                    test_pred = np.array(test_pred, np.float32)
                    test_ci_value = concordance_index(test_FFS_time, test_pred, test_FFS_event)
                    line = 'test cohort, CI: %.4f, epoch: %d' % (test_ci_value, epoch)
                    print(line)
Exemplo n.º 40
0
    def cph_ci(x, t, e, **kwargs):
        return concordance_index(
            event_times=t,
            predicted_scores=-model.predict_partial_hazard(x),
            event_observed=e,
        )
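cph_ci closes over a fitted model and negates the partial hazards, since a larger hazard implies a shorter survival time. A hedged usage sketch on held-out rows of the bundled Rossi dataset, assuming a lifelines version whose predict_partial_hazard returns a one-dimensional result; the train/test split is illustrative only.

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
train, test = rossi.iloc[:300], rossi.iloc[300:]

model = CoxPHFitter()                                 # cph_ci reads this via closure
model.fit(train, duration_col='week', event_col='arrest')

X_test = test.drop(columns=['week', 'arrest'])
print(cph_ci(X_test, test['week'], test['arrest']))   # out-of-sample concordance index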