Example No. 1
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index, k_fold_cross_validation


def main():
    """Univariate Cox screening of simulated features, with k-fold
    cross-validation and held-out validation of each significant feature."""
    # simulating a feature matrix for 100 samples with 50 features
    data = np.random.random((100, 50))
    # simulating time of observations (days) min 10 days, max 2500 days
    observed_time = np.random.randint(10, 2500, (100))
    # simulating event (death): 0 did not occur, 1 occurred
    observed_event = np.random.randint(0, 2, (100))

    test_data = np.random.random((25, 50))
    test_observed_time = np.random.randint(10, 2500, (25))
    test_observed_event = np.random.randint(0, 2, (25))

    for feature_id, feature_vect in enumerate(data.T):
        dataframe = pd.DataFrame({
            'feature nb{0}'.format(feature_id): feature_vect,
            'event': observed_event,
            'time': observed_time
        })

        # building a Cox PH model to test the significance of each independent feature
        cox_model = CoxPHFitter()

        cox_model.fit(dataframe, duration_col='time', event_col='event')

        pvalue = cox_model.summary.p.iloc[0]
        print('pvalue: {0} for feature nb: {1}'.format(pvalue, feature_id))

        if pvalue > 0.05:
            print('feature nb {0} not overall significant!'.format(feature_id))
            continue

        # test robustness: a c-index near or above 0.7 is a good sign
        scores = k_fold_cross_validation(cox_model,
                                         dataframe,
                                         duration_col='time',
                                         event_col='event',
                                         k=3)

        print('mean score (c-index): {0}'.format(np.mean(scores)))

        # validate the features on the test set
        test_dataframe = pd.DataFrame({
            'feature nb{0}'.format(feature_id): test_data.T[feature_id],
            'event': test_observed_event,
            'time': test_observed_time
        })

        inferred_time = cox_model.predict_expectation(test_dataframe)

        validation_c_index = concordance_index(test_observed_time,
                                               inferred_time,
                                               test_observed_event)

        print('validation c-index: {0}'.format(validation_c_index))
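When the file is run directly, a standard entry-point guard drives the demonstration (assuming the imports at the top of the example):

if __name__ == '__main__':
    main()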
Example No. 2
class CoxChurnModel:
    def __init__(self):
        self.cf = CoxPHFitter()

    def fit(self, dataset, pred_col='deltaNextHours', event_col='observed'):
        self.cf.fit(dataset, pred_col, event_col=event_col)

    def predict(self, df):
        # expected time-to-event minus the time already elapsed (recency);
        # a user is flagged as churned if the remaining time exceeds the window
        pred = self.cf.predict_expectation(df)
        # predPeriodHours is a module-level constant assumed to be defined elsewhere
        churned = (pred - df.recency.values.reshape((-1, 1))) > predPeriodHours
        return churned.values.reshape(-1)

    def predict_proba(self, df):
        # placeholder so the wrapper exposes a classifier-style interface
        return np.zeros(len(df))
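A minimal usage sketch for this wrapper, under assumptions: the column names (deltaNextHours, observed, recency) come from the class body, predPeriodHours is the module-level constant the class expects, and the snippet targets an older lifelines API whose predict_expectation returns a single-column DataFrame:

import numpy as np
import pandas as pd

predPeriodHours = 24 * 30  # hypothetical 30-day churn window

rng = np.random.RandomState(0)
dataset = pd.DataFrame({
    'recency': rng.uniform(0, 500, 200),          # hours since last activity
    'frequency': rng.poisson(5, 200),             # an example covariate
    'deltaNextHours': rng.uniform(1, 2000, 200),  # duration column
    'observed': rng.randint(0, 2, 200),           # event indicator
})

model = CoxChurnModel()
model.fit(dataset)
churned = model.predict(dataset)  # True where predicted remaining time exceeds the window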
Example No. 3
def finalPrediction(image_features,
                    radiomics,
                    clinical_data,
                    y,
                    patient_id,
                    pca=None,
                    cox_model=None):
    """
    Apply the PCA and the cox model to the features extracted from the image and the other features
        parameters : features, y, patient id, PCA and CoxPH models if we are applying on the dev and train set
        return the submission as well as the PCA and CoxPH models
    """
    # apply the PCA to the data (pass the fitted PCA through so dev/test reuse it)
    x, pca = applyPCA(image_features, radiomics, clinical_data, pca=pca)

    # if the cox model is not given, fit it on the (x, y) pair (i.e. we are on the train set)
    if not cox_model:
        size = x.shape[1] + y.shape[1]
        final_data = pd.DataFrame(
            data=np.hstack((x, y)),
            columns=['col_' + str(i) for i in range(size)])
        cox_model = CoxPHFitter()
        cox_model.fit(final_data,
                      duration_col='col_' + str(size - 2),
                      event_col='col_' + str(size - 1),
                      step_size=0.6)

    # then predict using the model
    size = x.shape[1]
    final_data = pd.DataFrame(data=x,
                              columns=['col_' + str(i) for i in range(size)])
    prediction = cox_model.predict_expectation(final_data).values[:, 0]

    # put the prediction in a pandas DataFrame to submit or evaluate on the concordance index
    nans = np.nan * np.ones(patient_id.shape)
    submission = pd.DataFrame(np.vstack((patient_id, prediction, nans)).T)
    submission.columns = ['PatientID', 'SurvivalTime', 'Event']
    submission = submission.set_index(['PatientID'])

    # return the submission as well as both models, which may be reused on the dev or test set
    return submission, pca, cox_model
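A sketch of the intended train-then-test flow; the array and id names are hypothetical, and applyPCA is the helper the function already assumes:

# training split: no models are passed, so the PCA and the Cox model are fitted
train_sub, pca, cox_model = finalPrediction(
    train_image_features, train_radiomics, train_clinical, y_train, train_ids)

# dev/test split: reuse the fitted models returned above
test_sub, _, _ = finalPrediction(
    test_image_features, test_radiomics, test_clinical, y_test, test_ids,
    pca=pca, cox_model=cox_model)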
Example No. 4
def predict(dataframe):
    """
    Return the expected lifetime based on the input data.
    """

    ## Loading the Dataset ##
    input_path = "input/"
    df = pd.read_csv(os.path.join(input_path, "pbc.csv"))

    ## Some Pre-Processing ##
    for i in df.index:
        df.at[i, 'sex'] = 0 if df.loc[i, 'sex'] == "f" else 1

    ## Splitting the Dataset ##
    np.random.seed(0)
    df_dev, df_test = train_test_split(df, test_size=0.2)
    df_train, df_val = train_test_split(df_dev, test_size=0.25)

    ## Creating an encoding function ##
    def one_hot_encoder(dataframe, columns):
        return pd.get_dummies(dataframe,
                              columns=columns,
                              drop_first=True,
                              dtype=np.float64)

    to_encode = ["edema", "stage"]
    one_hot_train = one_hot_encoder(df_train, to_encode)
    one_hot_val = one_hot_encoder(df_val, to_encode)
    one_hot_test = one_hot_encoder(df_test, to_encode)
    one_hot_train.dropna(inplace=True)

    ## Fitting the Model ##
    cph = CoxPHFitter()
    cph.fit(one_hot_train,
            duration_col='age',
            event_col='status',
            step_size=0.1)

    return cph.predict_expectation(dataframe)[0]
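A hedged usage sketch: the query row must carry the same one-hot layout the model was fitted on, so the simplest approach is to encode the full dataset the same way and pick a row (pbc.csv and the column names are taken from the function body):

import numpy as np
import pandas as pd

df = pd.read_csv('input/pbc.csv')
df['sex'] = (df['sex'] != 'f').astype(int)  # same recoding as inside predict()
encoded = pd.get_dummies(df, columns=['edema', 'stage'],
                         drop_first=True, dtype=np.float64)
print(predict(encoded.iloc[[0]]))  # expected lifetime for the first patient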
Example No. 5
def cross_val(pena, train_size=0.75, selection=False, features=None):
    """
    Hold out method.
    ----------
    pena : float>0
        Penalization coefficient for the L2 penalization.
    train_size : 0<float<1, optional
        Set the size of the training set. The default is 0.75.
    selection : bool, optional
        Feature selection enabling. The default is False.
    features : pd.Index, optional
        The features to select. The default is None.

    Returns
    -------
    cph : COXPHFitter.
        The Cox model. lifeline object
    score : float
        Score from the metrics.

    """
    x_train, x_test, y_train, y_test = get_data_set(x, y, train_size,
                                                    selection, features)
    cph = CoxPHFitter(penalizer=pena).fit(pd.concat([x_train, y_train],
                                                    axis=1),
                                          duration_col='SurvivalTime',
                                          event_col='Event')
    y_pred = cph_pred(cph.predict_expectation(x_test))

    # for some reason the predicted lifetime is sometimes zero; drop those rows
    if not np.all(y_pred.iloc[:, 0].values):
        zero_ix = y_pred.iloc[np.where(y_pred.iloc[:, 0].values == 0)[0]].index
        y_test = y_test.drop(zero_ix, axis=0)
        y_pred = y_pred.drop(zero_ix, axis=0)

    return (cph, cindex(y_test, y_pred))
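Because cross_val takes the penalizer as its first argument, a small grid search is the natural way to drive it (a sketch; x and y are the module-level frames the function already relies on):

# pick the L2 penalizer with the best hold-out c-index
penalties = [0.01, 0.1, 1.0, 10.0]
scores = {pena: cross_val(pena)[1] for pena in penalties}
best_pena = max(scores, key=scores.get)
print(scores, 'best penalizer:', best_pena)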
Example No. 6
        data_test = {
            'T': y_test[:, 0],
            'E': y_test[:, 1],
            '%s' % name[0]: X_test[:, ID[0] - 1],
            '%s' % name[1]: X_test[:, ID[1] - 1],
            '%s' % name[2]: X_test[:, ID[2] - 1],
            '%s' % name[3]: X_test[:, ID[3] - 1],
            '%s' % name[4]: X_test[:, ID[4] - 1],
            #  '%s' % name[0]: X_test[:, ID[0]+1],
            #  '%s' % name[1]: X_test[:, ID[1]+1],
            #  '%s' % name[2]: X_test[:, ID[2]+1 ],
            #  '%s' % name[3]: X_test[:, ID[3]+1],
            #  '%s' % name[4]: X_test[:, ID[4]+1 ],
        }
        df_test = pd.DataFrame(data_test)
        predict = cph.predict_expectation(df_test)
        test_c_index.append(compute_C_index(predict, y_test))
train_c_index = np.asarray(train_c_index)
print("Train_c_index_mean:", np.mean(train_c_index))
test_c_index = np.asarray(test_c_index)
test_c_index = np.delete(test_c_index, 0)
print("test:", test_c_index)
print("Test_c_index_mean:", np.mean(test_c_index))
# the selected ...
# print(count)
rank = Counter(count).most_common(7)
print(rank, rank[0][0])

name = []
for i in rank:
    id = i[0]  # feature id of the next most frequently selected feature
Example No. 7
class ProportionalHazardRegressor_lfl(object):
    """
    Thin wrapper on Lifelines' cox proportional hazards fitter to be used with
    CVmodel.

    Attributes:
        model_kwargs (dict): keyword arguments to pass to CoxPHFitter's constructor
        model (CoxPHFitter): lifelines.CoxPHFitter object

    """
    def __init__(self, **kwargs):
        """
        Constructs a ProportionalHazardRegressor_lfl object.

        A lifelines.CoxPHFitter instance is created when
        the fit method is called.

        Args:
            kwargs (dict): kwargs to pass to the constructor of
                lifelines.CoxPHFitter.

        Returns:
            ProportionalHazardRegressor_lfl

        """
        self.model_kwargs = kwargs
        self.model = None

    def fit(self,
            X_train,
            Y_train,
            X_validate=None,
            Y_validate=None,
            cv_param=0.0,
            **fit_kwargs):
        """
        Create and fit the ProportionalHazards_lfl.
        The cv_param is the L2 penalizer term accepted by CoxPHFitter.

        Args:
            X_train (numpy.ndarray ~ (num_samples, num_units)): training data.
            Y_train (numpy.ndarray ~ (num_samples,)): training labels.
            X_validate (numpy.ndarray ~ (num_samples, num_units)):
                validation data. Unused for this model.
            Y_validate (numpy.ndarray ~ (num_samples,)):
                validation labels. Unused for this model.
            cv_param: the value of the hyperparameter optimized in CV.
                The l2 penalizer term.
            fit_kwargs (dict): kwargs to pass to the fit method.

        Returns:
            None

        """
        self.model = CoxPHFitter(penalizer=cv_param, **self.model_kwargs)
        y = pd.DataFrame(Y_train, columns=['time', 'censor'])
        df = pd.concat((pd.DataFrame(X_train), y), axis=1)
        self.model.fit(df, 'time', 'censor', **fit_kwargs)

    def predict(self, X):
        """
        Predict survival expectations for X.

        Args:
            X (numpy.ndarray ~ (num_samples, num_features)): samples to predict
                survival times from.

        Returns:
            times (numpy.ndarray ~ (num_samples, num_classes)):
                expected survival times.

        """
        assert self.model is not None, "Need to fit model first"
        return self.model.predict_expectation(X)
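A self-contained usage sketch on synthetic data; the shape conventions follow the docstrings (Y_train packs the time and censor columns side by side):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_train = rng.normal(size=(200, 5))
Y_train = np.column_stack([
    rng.exponential(scale=10.0, size=200),  # survival times
    rng.randint(0, 2, size=200),            # censor flags
])

reg = ProportionalHazardRegressor_lfl()
reg.fit(X_train, Y_train, cv_param=0.1)        # cv_param is the L2 penalizer
expected = reg.predict(pd.DataFrame(X_train))  # expected survival times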
Example No. 8
def train_cox(x_train0, ix_in, y_per_pt, y_int, metric='auc', feature_grid=None):
    if feature_grid is None:
        feature_grid = np.logspace(7, 20, 14)
    survival = {}
    # for ic_in, ix_in in enumerate(ix_inner):
    train_index, test_index = ix_in
    x_train, x_test = x_train0.iloc[train_index, :], x_train0.iloc[test_index, :]

    lamb_dict = {}
    lamb_dict['auc'] = {}
    lamb_dict['ci'] = {}
    for il, lamb in enumerate(feature_grid):
        ix_inner2 = leave_one_out_cv(x_train, x_train['outcome'], ddtype='all_data')
        ix_rand_samp = np.random.choice(np.arange(len(ix_inner2)), 10, replace=False)
        ix_inner2_samp = np.array(ix_inner2, dtype='object')[ix_rand_samp]
        # ix_inner2_rand_samp = np.random.choice(ix_inner2, 10, replace = False)
        counter = 0
        start = time.time()

        hazards = []
        event_times = []
        event_outcomes = []
        probs_in = []
        true = []

        model = CoxPHFitter(penalizer=lamb, l1_ratio=1.)
        for ic_in2, ix_in2 in enumerate(ix_inner2_samp):
            start_inner = time.time()

            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
            tmpts_in = [xx.split('-')[1] for xx in x_tr2.index.values]
            samp_weights = get_class_weights(np.array(y_int[x_tr2.index.values]), tmpts_in)
            samp_weights[samp_weights <= 0] = 1
            x_tr2.insert(x_tr2.shape[1], 'weights', samp_weights)
            try:
                model.fit(x_tr2, duration_col='week', event_col='outcome',
                          weights_col='weights', robust=True, show_progress = False)
            except Exception:
                # skip folds where the Cox fit fails (e.g. convergence errors)
                counter += 1
                continue
            pred_f = model.predict_survival_function(x_ts2.iloc[[0], :])
            probs_in.append(1 - pred_f.loc[4.0].item())
            true.append(x_ts2['outcome'].iloc[-1])
            hazard = model.predict_partial_hazard(x_ts2)
            hazards.append(hazard)
            event_times.append(x_ts2['week'])
            event_outcomes.append(x_ts2['outcome'])
            end_inner = time.time()
            # print('Inner ix ' + str(ic_in2) + ' complete in ' + str(end_inner - start_inner))

        # if metric == 'CI':
        try:
            score = concordance_index(pd.concat(event_times), pd.concat(hazards), pd.concat(event_outcomes))
            lamb_dict['ci'][lamb] = score
            end_t = time.time()
            print(str(il) + ' complete')
            print((end_t - start)/60)
        except Exception:
            print('No score available')
            continue
        # elif metric == 'auc':
        try:
            score = sklearn.metrics.roc_auc_score(true, probs_in)
            lamb_dict['auc'][lamb] = score
        except Exception:
            continue

    lambdas, aucs_in = list(zip(*lamb_dict[metric].items()))
    ix_max = np.argmax(aucs_in)
    best_lamb = lambdas[ix_max]

    model_out = CoxPHFitter(penalizer=best_lamb, l1_ratio=1.)
    tmpts_in = [xx.split('-')[1] for xx in x_train.index.values]
    samp_weights = get_class_weights(np.array(y_int[x_train.index.values]), tmpts_in)
    samp_weights[samp_weights<=0] = 1
    x_train.insert(x_train.shape[1], 'weights', samp_weights)
    x_train['weights'] = samp_weights
    try:
        model_out.fit(x_train, duration_col='week', event_col='outcome', weights_col='weights', robust=True)
    except Exception:
        return {}
    pred_f = model_out.predict_survival_function(x_test.iloc[[0], :])
    pt = x_test.index.values[0].split('-')[0]

    hazard_out = model_out.predict_partial_hazard(x_test)


    pts = [ii.split('-')[0] for ii in x.index.values]  # x: the module-level full dataset (assumed)
    tmpts = [ii.split('-')[1] for ii in x.index.values]
    # if pt not in survival.keys():
        # survival[pt] = {}
    ixs = np.where(np.array(pts) == pt)[0]
    survival['actual'] = str(np.max([float(tmpt) for tmpt in np.array(tmpts)[ixs]]))
    if y_per_pt[pt] == 'Cleared':
        survival['actual'] = survival['actual'] + '+'

    probs_sm = 1 - pred_f.loc[4.0].item()

    y_pred_exp = model_out.predict_expectation(x_test.iloc[[0], :])
    survival['predicted'] = str(np.round(y_pred_exp.item(), 3))
    surv_func = pred_f

    # probs_df = pd.Series(probs_sm)
    # y_pp = y_per_pt.replace('Cleared', 0).replace('Recur', 1)
    # final_df = pd.concat([y_pp, probs_df], axis=1).dropna()

    final_dict = {}
    # final_dict['probability_df'] = final_df
    final_dict['model'] = model_out
    final_dict['survival'] = survival
    final_dict['survival_function'] = surv_func
    final_dict['prob_true'] = (probs_sm, y_per_pt[pt])
    final_dict['times_hazards_outcomes'] = (x_test['week'], hazard_out, x_test['outcome'])
    final_dict['lambdas'] = lamb_dict
    # final_dict['auc'] = sklearn.metrics.roc_auc_score(final_df[0], final_df[1])
    return final_dict
Example No. 9
    def __linear_small(self, is_death, train_data_path, basepath):
        small_dataset_file = train_data_path
        small_dataset = pandas.read_csv(small_dataset_file,
                                        encoding='UTF-8',
                                        index_col=[0])
        del small_dataset['patient_id']
        del small_dataset['name']

        # dummy-variable (one-hot) encoding via a patsy formula
        formular = ''
        classify_attr = {
            'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol',
            'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb', 'HCV',
            'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic', 'LipidD',
            'CaPB', 'NCaPB', 'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD',
            'access', 'ESRDcause', 'hypertension', 'DM',
            'cardiovasculardisease', 'cerebrovasculardisease', 'bleeding',
            'malignancy', 'ablocker', 'bblocker'
        }
        for column in small_dataset.columns:
            if column in classify_attr:
                formular = formular + 'C(' + column + ')+'
            else:
                formular = formular + column + '+'
        formular = formular[:-1]

        small_dataset = patsy.dmatrix(formular + '-1',
                                      small_dataset,
                                      return_type='dataframe')
        if is_death:
            T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                                'survivaltime2', 'outcome2')
            attr_file, p632_file, var_file, kfold_file = (
                'lm_significant_attrs.txt', 'lm_stats632.csv',
                'lm_statvar.txt', 'lm_statskfold.csv')
            beta_file, p_file = ('lm_coef.csv', 'lm_p.csv')
        else:
            T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                                'survivaltime1', 'outcome1')
            attr_file, p632_file, var_file, kfold_file = (
                'lm_significant_attrs_e.txt', 'lm_stats632_e.csv',
                'lm_statvar_e.txt', 'lm_statskfold_e.csv')
            beta_file, p_file = ('lm_coef_e.csv', 'lm_p_e.csv')
        del small_dataset[T_false]
        del small_dataset[E_false]

        significant_attrs = list()
        for column in small_dataset.columns:
            # print('column', column)
            if column in {T_true, E_true}:
                continue
            subset = small_dataset[[column, T_true, E_true]]
            # print('subset', subset)
            try:
                cox = CoxPHFitter()
                cox.fit(subset, T_true, E_true)
                # print("cox.summary['p'][0]:", cox.summary['p'][0])
                if cox.summary['p'].iloc[0] < 0.05:
                    significant_attrs.append(column)
            except Exception:
                continue
        output = open(attr_file, mode='w')
        for attr in significant_attrs:
            output.write(attr + '\n')
        output.close()

        input = open(attr_file)
        significant_attrs = [line.strip() for line in input.readlines()]
        input.close()

        significant_attrs.append(T_true)
        significant_attrs.append(E_true)
        print('linear_small ## sign_attr : %d' % len(significant_attrs))

        small_dataset = small_dataset[significant_attrs]

        # 10000 times .632 bootstrap
        count = 0
        stats632 = list()
        statscoef = list()
        statspvalue = list()
        while count < 10000:  # linear model training
            try:
                train_set = small_dataset.take(
                    numpy.random.randint(0,
                                         len(small_dataset),
                                         size=len(small_dataset)))
                test_set = small_dataset.loc[list(
                    set(small_dataset.index).difference(set(train_set.index)))]

                train_set.index = range(len(train_set))
                test_set.index = range(len(test_set))

                cox = CoxPHFitter()
                cox.fit(train_set, T_true, E_true)
                train_cindex = concordance_index(
                    cox.durations,
                    -cox.predict_partial_hazard(cox.data).values.ravel(),
                    cox.event_observed)

                statscoef.append(cox.summary[['coef']].T)
                statspvalue.append(cox.summary[['p']].T)

                # test_set
                test_actual_T = test_set[T_true].copy()
                test_actual_E = test_set[E_true].copy()
                test_variable = test_set[test_set.columns.difference(
                    [T_true, E_true])]
                test_predictT = cox.predict_expectation(test_variable)

                # small_set
                all_actual_T = small_dataset[T_true].copy()
                all_actual_E = small_dataset[E_true].copy()
                all_variable = small_dataset[small_dataset.columns.difference(
                    [T_true, E_true])]
                all_predictT = cox.predict_expectation(all_variable)

                try:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT,
                                                    test_actual_E)
                    all_cindex = concordance_index(all_actual_T, all_predictT,
                                                   all_actual_E)
                except Exception:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT)
                    all_cindex = concordance_index(all_actual_T, all_predictT)

                stats632.append([train_cindex, test_cindex, all_cindex])
                count += 1
                print('632 -> %d' % count)
            except Exception:
                continue
        stats632_df = pandas.DataFrame(stats632,
                                       columns=['train', 'test', 'all'])
        stats632_df.to_csv(p632_file, encoding='UTF-8')

        statscoef_df = pandas.DataFrame(
            pandas.concat(statscoef, ignore_index=True))
        statscoef_df.to_csv(beta_file, encoding='UTF-8')
        statspvalue_df = pandas.DataFrame(
            pandas.concat(statspvalue, ignore_index=True))
        statspvalue_df.to_csv(p_file, encoding='UTF-8')

        # 2000 rounds of 10-fold cross-validation
        count = 0
        statskfold = list()
        while count < 2000:
            try:
                cox = CoxPHFitter()
                scores = k_fold_cross_validation(cox, small_dataset, T_true,
                                                 E_true, 10)
                statskfold.append(scores)
                count += 1
                print('k-fold -> %d' % count)
            except Exception:
                continue
        statskfold_df = pandas.DataFrame(statskfold)
        statskfold_df.to_csv(basepath + "/" + kfold_file, encoding='UTF-8')
Example No. 10
    def __linear_big(self, is_death, train_data_path, basepath):
        big_dataset_file = train_data_path
        big_dataset = pandas.read_csv(big_dataset_file,
                                      encoding='UTF-8',
                                      index_col=[0])
        del big_dataset['patient_id']
        del big_dataset['name']
        del big_dataset['tx_id']
        # del big_dataset['tx_id.1']
        del big_dataset['tx_date']

        formular = ''
        # classify_attr = {'subject', 'treat_item', 'vascular_access_type',
        #                  'dialysis_machine', 'reuse_times', 'anticoagulation_scope',
        #                  'anticoagulation', 'protamine', 'replacement_way',
        #                  'take_food', 'fluid_infusion', 'blood_pressure_pos',
        #                  'gender', 'smoking', 'highflux', 'payment', 'marital', 'alcohol', 'HBsAg', 'HBsAb',
        #                  'HBeAg', 'HBeAb', 'HBcAb', 'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'diuretic',
        #                  'LipidD', 'CaPB', 'NCaPB', 'VitD', 'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access',
        #                  'ESRDcause', 'hypertension', 'DM', 'cardiovasculardisease', 'cerebrovasculardisease',
        #                  'bleeding', 'malignancy', 'ablocker', 'bblocker'}
        classify_attr = {
            'subject', 'treat_item', 'vascular_access_type',
            'dialysis_machine', 'anticoagulation_scope', 'anticoagulation',
            'protamine', 'replacement_way', 'take_food', 'fluid_infusion',
            'blood_pressure_pos', 'gender', 'smoking', 'highflux', 'payment',
            'marital', 'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
            'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'blocker',
            'blocer', 'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
            'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
            'hypertension', 'DM', 'cardiovasculardisease',
            'cerebrovasculardisease', 'bleeding', 'malignancy'
        }
        # u'\xa6\xc2blocker'
        # print('classify_attr.dtype:', classify_attr.shape)

        for column in big_dataset.columns:
            # print("column", column)
            if column in classify_attr:
                formular = formular + 'C(' + column + ')+'
            else:
                formular = formular + column + '+'
        # print('formular:', formular)
        # strip the trailing '+'; the original .encode('utf-8') here was
        # Python 2 residue and would break str concatenation under Python 3
        formular = formular[:-1]

        # '-1' tells patsy not to add an intercept column
        big_dataset = patsy.dmatrix(formular + '-1',
                                    big_dataset,
                                    return_type='dataframe')
        # print(type(big_dataset))
        # print(big_dataset.columns)
        # print('big_dataset:', big_dataset)
        if is_death:
            T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                                'survivaltime2', 'outcome2')
            attr_file, p632_file, var_file, kfold_file = (
                'lb_significant_attrs.txt', 'lb_stats632.csv',
                'lb_statvar.txt', 'lb_statskfold.csv')
            beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
        else:
            T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                                'survivaltime1', 'outcome1')
            attr_file, p632_file, var_file, kfold_file = (
                'lb_significant_attrs_e.txt', 'lb_stats632_e.csv',
                'lb_statvar_e.txt', 'lb_statskfold_e.csv')
            beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
        del big_dataset[T_false]
        del big_dataset[E_false]

        significant_attrs = list()
        # drop columns that raised errors during fitting
        del big_dataset['k_concentration']
        del big_dataset['SDUFR_x']
        del big_dataset['SDUFR_y']
        del big_dataset['SDUFR_y_v']
        del big_dataset['protamine_c']
        del big_dataset['k_concentration_c']
        """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
        #        for column in big_dataset.columns:
        #            if column in {T_true, E_true}:
        #                continue
        #            subset = big_dataset[[column, T_true, E_true]]
        #            # print('subset', subset)
        #            try:
        #                # print('start fitting ')
        #                cox = CoxPHFitter()
        #                cox.fit(subset, T_true, E_true)
        #                help(cox)
        #                print('cox value:', cox.print_summary())
        #                print('p value:', cox.summary['p'][0])
        #                if cox.summary['p'][0] < 0.05:
        #                    # print(column, cox.summary['p'][0])
        #                    significant_attrs.append(column)
        #            except Exception:
        #                continue
        #        output = open(basepath+"/"+attr_file, mode='w')
        #        for attr in significant_attrs:
        #            output.write(attr + '\n')
        #        output.close()
        #++++++++++++++++++++++++++++++++++++++++++++++++++++
        input = open(basepath + "/" + attr_file)
        significant_attrs = [line.strip() for line in input.readlines()]
        input.close()
        significant_attrs.append(T_true)
        significant_attrs.append(E_true)
        print('linear_big ## sign_attr : %d' % len(significant_attrs))
        print(len(significant_attrs), T_true, E_true)

        big_dataset = big_dataset[significant_attrs]
        print(len(big_dataset.columns))
        #        exit()

        #        10000 times .632 bootstrap
        count = 0
        stats632 = list()
        statscoef = list()
        statspvalue = list()
        while count < 10000:
            print('count', count)
            try:
                # big_dataset = big_dataset.take(numpy.random.permutation(len(big_dataset)))
                # big_dataset.index = range(len(big_dataset))
                # percent = int(len(big_dataset) * 0.30)
                # train_set = big_dataset[:-percent]
                # test_set = big_dataset[-percent:]
                # train_set.index = range(len(train_set))
                # test_set.index = range(len(test_set))

                # two independent subsamples rather than a classic .632 bootstrap split
                train_set = big_dataset.sample(1500, replace=False)
                test_set = big_dataset.sample(1500, replace=False)

                print('try fitting......', len(big_dataset), len(train_set),
                      len(test_set))
                cox = CoxPHFitter()
                cox.fit(train_set, T_true, E_true)
                train_cindex = concordance_index(
                    cox.durations,
                    -cox.predict_partial_hazard(cox.data).values.ravel(),
                    cox.event_observed)

                statscoef.append(cox.summary[['coef']].T)
                statspvalue.append(cox.summary[['p']].T)

                print('try predicting......')
                # test_set
                test_actual_T = test_set[T_true]
                test_actual_E = test_set[E_true]
                test_variable = test_set[test_set.columns.difference(
                    [T_true, E_true])]
                test_predictT = cox.predict_expectation(test_variable)

                # small_set
                all_actual_T = big_dataset[T_true]
                all_actual_E = big_dataset[E_true]
                all_variable = big_dataset[big_dataset.columns.difference(
                    [T_true, E_true])]
                all_predictT = cox.predict_expectation(all_variable)

                print('try cindexing......')
                try:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT,
                                                    test_actual_E)
                    all_cindex = concordance_index(all_actual_T, all_predictT,
                                                   all_actual_E)
                except Exception:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT)
                    all_cindex = concordance_index(all_actual_T, all_predictT)

                print(train_cindex, test_cindex, all_cindex)
                # 0.5 0.5 0.5
                # 0.963726363744 0.965792024703 0.964552831227
                # 0.5 0.5 0.5
                # 0.5 0.5 0.5
                # 0.940458783243 0.939660104788 0.940145223899
                # 0.950570809577 0.946854258363 0.949067405671
                # 0.941352881629 0.941623634389 0.941462605414
                # 0.5 0.5 0.5
                stats632.append([train_cindex, test_cindex, all_cindex])
                count += 1
                print('632 -> %d' % count)
            except Exception as e:
                print(e)  # e.message is Python 2 only
                continue
        stats632_df = pandas.DataFrame(stats632,
                                       columns=['train', 'test', 'all'])
        stats632_df.to_csv(p632_file, encoding='UTF-8')
        statscoef_df = pandas.DataFrame(
            pandas.concat(statscoef, ignore_index=True))
        statscoef_df.to_csv(beta_file, encoding='UTF-8')
        statspvalue_df = pandas.DataFrame(
            pandas.concat(statspvalue, ignore_index=True))
        statspvalue_df.to_csv(p_file, encoding='UTF-8')
        print('10000 rounds of .632 bootstrap done.')
Example No. 11
def get_surv_curv(data, player):
    # add the percentile of the prediction as an annotation on the graph
    cph = CoxPHFitter()
    cph.fit(data, 'NBA_Experience', event_col='active')
    X = data.loc[[player]].drop(['NBA_Experience', 'active'], axis=1)
    league_surv = cph.baseline_survival_
    player_surv = cph.predict_survival_function(X)
    x = data.drop(['NBA_Experience', 'active'], axis=1)
    predictions = cph.predict_expectation(x)
    percentiles = predictions.rank(pct=True)
    player_pct = percentiles.loc[player]
    string = 'Career Length Prediction Percentile: ' + str(
        round(player_pct.values[0], 2))

    trace1 = go.Scatter(name='League Average',
                        x=league_surv.index,
                        y=league_surv['baseline survival'].values,
                        marker={'color': "#253046"})
    trace2 = go.Scatter(name=player,
                        x=player_surv.index,
                        y=player_surv[player].values,
                        marker={'color': '#B35E3B'})

    data = [trace1, trace2]
    layout = go.Layout({
        "xaxis": {
            "title": "Years in the NBA",
            'color': '#253046'
        },
        "yaxis": {
            "title": "Probability of remaining in the NBA",
            'color': '#253046'
        },
        'paper_bgcolor': '#F8F3F1',
        'plot_bgcolor': '#F8F3F1',
        'margin': {
            't': 50,
            'r': 30
        },
        'annotations': [{
            'x': 13,
            'y': 0.78,
            'text': string,
            'showarrow': False,
            'font': {
                'size': 14,
                'color': '#253046'
            }
        }],
        'legend': {
            'x': .8,
            'y': 1,
            'traceorder': 'normal'
        }
    })

    fig = go.Figure(data=data, layout=layout)

    return fig
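Usage sketch, assuming a hypothetical frame nba_df indexed by player name, with numeric covariates plus 'NBA_Experience' (seasons played) as the duration and 'active' as the event column:

fig = get_surv_curv(nba_df, 'Michael Jordan')  # nba_df and the player name are illustrative
fig.show()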
Example No. 12
    rad_X_data = rad_X_data[r_features]

    X_data = pd.concat([X_data, rad_X_data], axis=1)

    X_data = pd.get_dummies(X_data, drop_first=True)

    full_data = pd.concat([X_data, y_data], axis=1)

    full_data = full_data.dropna()
    """
    MODEL TRAIN
    """
    model = CoxPHFitter()
    model.fit(full_data, 'SurvivalTime', event_col='Event')
    model.print_summary()
    p = model.predict_expectation(X_data)
    print(p)

    p_df = pd.DataFrame(index=y_data.index)
    p_df['SurvivalTime'] = p
    p_df['Event'] = None
    p_df.SurvivalTime = p_df.SurvivalTime.fillna(p_df.SurvivalTime.mean())
    print(p_df.head())

    score = metric.cindex(y_data, p_df)
    print(f'TRAIN : CScore = {score}')
    model.print_summary()
    """
    VALIDATION
    """
    test_X_c_dir = './x_test/features/clinical_data.csv'
Example No. 13
    def __predict_individual(self, is_death, train_data_path, basepath):
        big_dataset_file = train_data_path
        big_dataset = pd.read_csv(big_dataset_file,
                                  encoding='UTF-8',
                                  index_col=[0])
        del big_dataset['patient_id']
        del big_dataset['name']
        del big_dataset['tx_id']
        # del big_dataset['tx_id.1']
        del big_dataset['tx_date']

        formular = ''
        classify_attr = {
            'subject', 'treat_item', 'vascular_access_type',
            'dialysis_machine', 'anticoagulation_scope', 'anticoagulation',
            'protamine', 'replacement_way', 'take_food', 'fluid_infusion',
            'blood_pressure_pos', 'gender', 'smoking', 'highflux', 'payment',
            'marital', 'alcohol', 'HBsAg', 'HBsAb', 'HBeAg', 'HBeAb', 'HBcAb',
            'HCV', 'anticoagulant', 'EPO', 'CCB', 'ACEI', 'ARB', 'blocker',
            'blocer', 'diuretic', 'LipidD', 'CaPB', 'NCaPB', 'VitD',
            'mucosaprotect', 'H2RA', 'PPI', 'APUD', 'access', 'ESRDcause',
            'hypertension', 'DM', 'cardiovasculardisease',
            'cerebrovasculardisease', 'bleeding', 'malignancy'
        }

        for column in big_dataset.columns:
            # print("column", column)
            if column in classify_attr:
                formular = formular + 'C(' + column + ')+'
            else:
                formular = formular + column + '+'

        formular = formular[:-1]  # strip the trailing '+' (.encode('utf-8') was Python 2 residue)

        # '-1' tells patsy not to add an intercept column
        big_dataset = patsy.dmatrix(formular + '-1',
                                    big_dataset,
                                    return_type='dataframe')
        if is_death:
            T_true, E_true, T_false, E_false = ('survivaltime1', 'outcome1',
                                                'survivaltime2', 'outcome2')
            attr_file, p632_file, var_file, kfold_file = (
                'lb_significant_attrs.txt', 'lb_stats632.csv',
                'lb_statvar.txt', 'lb_statskfold.csv')
            beta_file, p_file = ('lb_coef.csv', 'lb_p.csv')
        else:
            T_true, E_true, T_false, E_false = ('survivaltime2', 'outcome2',
                                                'survivaltime1', 'outcome1')
            attr_file, p632_file, var_file, kfold_file = (
                'lb_significant_attrs_e.txt', 'lb_stats632_e.csv',
                'lb_statvar_e.txt', 'lb_statskfold_e.csv')
            beta_file, p_file = ('lb_coef_e.csv', 'lb_p_e.csv')
        del big_dataset[T_false]
        del big_dataset[E_false]

        significant_attrs = list()
        # drop columns that raised errors during fitting
        del big_dataset['k_concentration']
        del big_dataset['SDUFR_x']
        del big_dataset['SDUFR_y']
        del big_dataset['SDUFR_y_v']
        del big_dataset['protamine_c']
        del big_dataset['k_concentration_c']
        """如果已经挑选出了具有统计意义的风险因子则不需要执行以下验证风险因子统计学意义的片段 """
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
        #        for column in big_dataset.columns:
        #            if column in {T_true, E_true}:
        #                continue
        #            subset = big_dataset[[column, T_true, E_true]]
        #            # print('subset', subset)
        #            try:
        #                # print('start fitting ')
        #                cox = CoxPHFitter()
        #                cox.fit(subset, T_true, E_true)
        #                help(cox)
        #                print('cox value:', cox.print_summary())
        #                print('p value:', cox.summary['p'][0])
        #                if cox.summary['p'][0] < 0.05:
        #                    # print(column, cox.summary['p'][0])
        #                    significant_attrs.append(column)
        #            except Exception:
        #                continue
        #        output = open(basepath+"/"+attr_file, mode='w')
        #        for attr in significant_attrs:
        #            output.write(attr + '\n')
        #        output.close()
        #++++++++++++++++++++++++++++++++++++++++++++++++++++
        input = open(basepath + "/" + attr_file)
        significant_attrs = [line.strip() for line in input.readlines()]
        input.close()
        significant_attrs.append(T_true)
        significant_attrs.append(E_true)
        print('linear_big ## sign_attr : %d' % len(significant_attrs))
        print(len(significant_attrs), T_true, E_true)

        big_dataset = big_dataset[significant_attrs]
        print(len(big_dataset.columns))
        #        10000 times .632 bootstrap
        count = 9999  # run only a single iteration of the (formerly 10000x) loop
        stats632 = list()
        statscoef = list()
        statspvalue = list()
        cox = CoxPHFitter()
        if count < 10000:
            print('count', count)
            try:
                train_set = big_dataset.sample(1500, replace=False)
                test_set = big_dataset.sample(1, replace=False)
                print('try fitting......', len(big_dataset), len(train_set),
                      len(test_set))
                #                cox = CoxPHFitter()
                cox = cox.fit(train_set, T_true, E_true)
                print(test_set)
                cox.predict_survival_function(test_set).plot()
                print(cox.predict_log_hazard_relative_to_mean(test_set))
                #                for t_index,t_item in test_set.iterrows:
                #                    print(str(t_index)+"predict_survival_function")
                #                    print(cox.predict_survival_function(t_item))
                #                    cox.predict_survival_function(t_item).plot()
                #                    print(str(t_index)+"predict_survival_function")
                #                    print(cox.predict_survival_function(t_item))
                train_cindex = concordance_index(
                    cox.durations,
                    -cox.predict_partial_hazard(cox.data).values.ravel(),
                    cox.event_observed)

                statscoef.append(cox.summary[['coef']].T)
                statspvalue.append(cox.summary[['p']].T)

                print('try predicting......')
                # test_set
                test_actual_T = test_set[T_true]
                test_actual_E = test_set[E_true]
                test_variable = test_set[test_set.columns.difference(
                    [T_true, E_true])]
                test_predictT = cox.predict_expectation(test_variable)

                # small_set
                #                all_actual_T = big_dataset[T_true]
                #                all_actual_E = big_dataset[E_true]
                #                all_variable = big_dataset[big_dataset.columns.difference([T_true, E_true])]
                #                all_predictT = cox.predict_expectation(all_variable)
                #
                #                print('try cindexing......')
                try:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT,
                                                    test_actual_E)
#                    all_cindex = concordance_index(all_actual_T, all_predictT, all_actual_E)
                except Exception:
                    test_cindex = concordance_index(test_actual_T,
                                                    test_predictT)
#                    all_cindex = concordance_index(all_actual_T, all_predictT)
#
#                stats632.append([train_cindex, test_cindex, all_cindex])
                count += 1
                print('632 -> %d' % count)

            except Exception as e:
                print(e)  # e.message is Python 2 only

            mean_patient = self.__filter_dt(test_set)
            print(cox.predict_log_hazard_relative_to_mean(test_set))
            mean_hazard = cox.predict_expectation(mean_patient)  # restored so the print below does not raise NameError
            print(mean_hazard)