import numpy as np
from sksurv.linear_model import CoxnetSurvivalAnalysis


def fit_and_score_features(X, y):
    """Score each feature by the concordance of a univariate Coxnet fit."""
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxnetSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
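
A minimal usage sketch for the helper above, assuming scikit-survival's bundled WHAS500 dataset (the dataset choice is illustrative, not from the original):

from sksurv.datasets import load_whas500
from sksurv.preprocessing import OneHotEncoder

X_df, y = load_whas500()
X_num = OneHotEncoder().fit_transform(X_df).values
feature_scores = fit_and_score_features(X_num, y)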
Example #2
def COX(X, y, best_features, oversampling, undersampling, aggregation):
    if aggregation:
        results, model = execute_survival(
            X, y, best_features,
            lambda: CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=100),
            oversampling, undersampling)
    else:
        results, model = execute_survival(
            X, y, best_features,
            lambda: CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=200),
            oversampling, undersampling)
    if model:
        features = model.coef_
    else:
        features = False
    return results, features, model
    def optimizePenalty(self):
        import warnings
        from sklearn.exceptions import ConvergenceWarning
        from sklearn.pipeline import make_pipeline
        from sklearn.model_selection import KFold
        from sklearn.model_selection import GridSearchCV

        warnings.simplefilter("ignore", ConvergenceWarning)

        # candidate penalties on a log grid
        alphas = 10. ** np.linspace(-2, 3, 50)

        cv = KFold(n_splits=5, shuffle=True)

        grid = GridSearchCV(make_pipeline(CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=1000000)),
                            param_grid={"coxnetsurvivalanalysis__alphas": [[alpha] for alpha in alphas]},
                            cv=cv,
                            error_score=0.5,
                            n_jobs=-1).fit(self.data.values, self.data.tags)

        bestAlpha = grid.best_params_["coxnetsurvivalanalysis__alphas"][0]

        print("Best alpha:", bestAlpha)
        self.model.set_params(**{"alphas": [bestAlpha]})
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sksurv.linear_model import CoxnetSurvivalAnalysis


def LASSO_COX_bootstrap(fp, num=False):
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # calculate population of statistics
    metrics = []
    for i in range(n_iterations):
        # prepare sample

        # if indicated, include number of mets (col 42)
        if num:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]],
                              n_samples=n_size)
            X = sample.iloc[:, np.r_[:20, 42]].copy()

        else:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size)
            X = sample.iloc[:, :20].copy()

        X = X.to_numpy()
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y = y.to_records(index=False)

        estimator = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=[0.001])
        estimator.fit(X, y)
        score = estimator.score(X, y)
        metrics.append(score)

    # calculate confidence interval
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(metrics, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(metrics, p))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    if num:
        name = fp.split('/')[-1].split('_')[0] + ' + NumMets'
    else:
        name = fp.split('/')[-1].split('_')[0]

    print(name, 'Lasso-Cox', '%.3f (%.3f-%.3f)' % (med, lower, upper))
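
A hypothetical call (the CSV path is illustrative only; the file is expected to hold 20 feature columns plus 'Event'/'Time' columns at the positions sliced above):

LASSO_COX_bootstrap('results/mean_aggregated.csv', num=True)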
        def functionToOptimize(**params):

            self.counter += 1
            print(f"Bayesian optimization alpha: {2 ** params['alphas']}; iteration: {self.counter}")

            model = CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=1000000)
            # the optimizer searches over log2(alpha); convert back to the penalty
            params["alphas"] = [2 ** params["alphas"]]

            model.set_params(**params)

            cvAucMeans = []
            for trainIndex, testIndex in KFold(n_splits=4).split(self.data.values):

                trainX, trainY = self.data.values[trainIndex], self.data.tags[trainIndex]
                testX, testY = self.data.values[testIndex], self.data.tags[testIndex]

                model.fit(trainX, trainY)

                times = np.percentile(testY["Time_in_days"], np.linspace(5, 81, 15))
                _, meanAuc = cumulative_dynamic_auc(testY, testY,
                                                    model.predict(testX),
                                                    times)
                cvAucMeans.append(meanAuc)

            return -np.mean(cvAucMeans)
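
The optimizer driving functionToOptimize is not shown; one plausible driver, given the keyword-argument interface and the negated AUC return value, is scikit-optimize's gp_minimize (a sketch under that assumption):

from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args

space = [Real(-10.0, 2.0, name="alphas")]  # search over log2(alpha)

@use_named_args(space)
def objective(**params):
    # functionToOptimize is a closure over self, so this must run in scope
    return functionToOptimize(**params)

result = gp_minimize(objective, space, n_calls=30)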
Example #6
def execute_survival(X, y, k, headers, survival, aggregation):
    new_X, best_features = pearson_fs(X, y, headers, k, feature_selection=True, survival=survival)
    y_for_cv = np.array([t[0] for t in y])
    cv = StratifiedKFold(n_splits=5)  # x-validation

    if aggregation:
        clf = CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=100)
    else:
        clf = CoxnetSurvivalAnalysis(l1_ratio=0.1, n_alphas=200)

    CIscore = 0

    print('  ...performing x-validation')
    for i, (train, test) in enumerate(cv.split(new_X, y_for_cv)):
        print('   ...', i + 1)

        trained_classifier = clf.fit(new_X[train], y[train])

        event_indicators = []
        event_times = []

        for target in y[test]:
            event_indicators.append(target[0])
            event_times.append(target[1])

        scores = list(trained_classifier.predict(new_X[test]))

        result = concordance_index_censored(np.array(event_indicators),
                                            np.array(event_times),
                                            np.array(scores).reshape(-1))
        CIscore += result[0]
        # TODO fix metrics

    avgCIscore = CIscore / cv.get_n_splits()
    print(avgCIscore)

    return avgCIscore
def test():
    """
    """
    #### Compare glmnet with sksurv CoxnetSurvivalAnalysis
    from sksurv.linear_model import CoxnetSurvivalAnalysis
    ######################################################

    ################ DUMMY DATA ##########################
    isdead = [0, 1, 1, 1, 0, 1, 0, 0, 1, 0]
    nbdays = [24, 10, 25, 50, 14, 10, 100, 10, 50, 10]
    matrix = np.array([[0, 1, 1, 0, 1, 2, 0, 1, 0, 0],
                       [0, 1, 1, 0, 1, 3, 0, 1, 0, 0]]).T
    ######################################################

    res = predict_with_coxph_glmnet(matrix, isdead, nbdays, matrix)

    coxph = CoxnetSurvivalAnalysis()
    Y = np.asarray([(bool(a), b) for a, b in zip(isdead, nbdays)],
                   dtype=[("event", bool), ("time", int)])

    coxph.fit(matrix, Y)
Example #9
def run_coxnet(l1_ratio, n_alphas, x_train, y_train, x_test, y_test):

    coxnet = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, n_alphas=n_alphas)
    coxnet.fit(x_train, y_train)
    outputs = coxnet.predict(x_test)
    score = coxnet.score(x_test, y_test)
    return outputs, score
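
A minimal sketch of calling run_coxnet, assuming scikit-survival's WHAS500 data and a plain random train/test split:

from sklearn.model_selection import train_test_split
from sksurv.datasets import load_whas500
from sksurv.preprocessing import OneHotEncoder

X, y = load_whas500()
X = OneHotEncoder().fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)
outputs, score = run_coxnet(l1_ratio=0.5, n_alphas=100,
                            x_train=x_train, y_train=y_train,
                            x_test=x_test, y_test=y_test)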
E = df['LapseIndicator'] == 1

df2['E'] = E
df2['T'] = T

X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')

data_x_numeric = OneHotEncoder().fit_transform(X)
#%%

estimator = CoxnetSurvivalAnalysis(verbose=True)
estimator.fit(data_x_numeric, y)
#%%

print(estimator.score(data_x_numeric, y))
print()

scores = fit_and_score_features(data_x_numeric.values, y)
print(
    pd.Series(scores,
              index=data_x_numeric.columns).sort_values(ascending=False))
#%%

from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
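
The two imports above suggest the snippet continues by wiring fit_and_score_features into a feature-selection pipeline; a plausible continuation (an assumption, not the original code):

pipe = Pipeline([('select', SelectKBest(fit_and_score_features, k=3)),
                 ('model', CoxnetSurvivalAnalysis())])
pipe.fit(data_x_numeric.values, y)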
Example #11
#df=merge_frames(df1,df2)

# Convert the 0/1 integer status column to boolean
df["Status"] = df["Status"].astype(bool)
# data holds the time and status columns; X holds the per-gene mutation indicators (present/absent)
data = df.iloc[:, 1:3]
X = df.iloc[:, 3:]

# Store status and time as a structured record array
Y = data.to_records(index=False)

X = OneHotEncoder().fit_transform(X)

# Fit the model over a path of 100 automatically generated penalty values
estimator = CoxnetSurvivalAnalysis(n_alphas=100,
                                   l1_ratio=1,
                                   alpha_min_ratio=0.01,
                                   max_iter=10000)
estimator.fit(X, Y)

# Build a dataframe with each gene's coefficient at every alpha value
coefficients_lasso = pd.DataFrame(estimator.coef_,
                                  index=X.columns,
                                  columns=np.round(estimator.alphas_, 5))
alphas = estimator.alphas_

print(coefficients_lasso)

# Plot the alpha vs. coefficient paths for all genes, highlighting the 10 most divergent genes
plot_coefficients(coefficients_lasso, n_highlight=10)
alphas = coefficients_lasso.columns
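
plot_coefficients is referenced above but not defined in this snippet; a minimal matplotlib-based sketch of what such a helper might look like (an assumption, not the original implementation):

import matplotlib.pyplot as plt

def plot_coefficients(coefs, n_highlight):
    _, ax = plt.subplots(figsize=(9, 6))
    alpha_grid = coefs.columns
    for row in coefs.itertuples():
        ax.semilogx(alpha_grid, row[1:], ".-")
    # label the genes with the largest coefficients at the smallest alpha
    top = coefs.iloc[:, -1].abs().sort_values().index[-n_highlight:]
    for name in top:
        ax.text(alpha_grid[-1], coefs.loc[name].iloc[-1], name)
    ax.set_xlabel("alpha")
    ax.set_ylabel("coefficient")
    plt.show()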
Example #12
def train_cox(x,
              outer_split=leave_two_out,
              inner_split=leave_two_out,
              num_folds=None,
              meas_key=None,
              key='metabs'):
    if num_folds is None:
        print('none')
    else:
        print(num_folds)
    np.random.seed(5)
    # if feature_grid is None:
    #     feature_grid = np.logspace(7, 20, 14)
    hazards = []
    event_times = []
    event_outcomes = []
    score_vec = []
    model_out_dict = {}
    ix_inner = outer_split(x, x['outcome'], num_folds=100)
    lambda_dict = {}
    for ic_in, ix_in in enumerate(ix_inner):
        train_index, test_index = ix_in
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        week = x_train['week']
        outcome = x_train['outcome']
        if (x_train < 0).any().any():
            x_train_, x_test_ = filter_by_train_set(
                x_train.drop(['week', 'outcome'], axis=1),
                x_test.drop(['week', 'outcome'], axis=1),
                meas_key,
                key=key,
                log_transform=False)
        else:
            x_train_, x_test_ = filter_by_train_set(
                x_train.drop(['week', 'outcome'], axis=1),
                x_test.drop(['week', 'outcome'], axis=1),
                meas_key,
                key=key,
                log_transform=True)

        temp = x_train_.copy()
        temp['week'], temp['outcome'] = x_train['week'], x_train['outcome']
        x_train = temp.copy()

        temp = x_test_.copy()
        temp['week'], temp['outcome'] = x_test['week'], x_test['outcome']
        x_test = temp.copy()

        if np.sum(x_test['outcome'].values) < 1:
            continue

        x_train_ = x_train.drop(['week', 'outcome'], axis=1)
        yy = list(zip(outcome, week))
        y_arr = np.array(yy, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])

        ix_inner2 = inner_split(x_train, x_train['outcome'], num_folds=100)
        lamb_dict = {}
        lamb_dict['auc'] = {}
        lamb_dict['ci'] = {}
        model2 = CoxnetSurvivalAnalysis(l1_ratio=1)

        model_dict = {}
        alphas = None
        hazards_dict = {}
        e_times_dict = {}
        e_outcomes_dict = {}
        score_dict = {}

        coxnet_pipe = CoxnetSurvivalAnalysis(l1_ratio=1,
                                             alpha_min_ratio=0.001,
                                             n_alphas=300)

        coxnet_pipe.fit(x_train_, y_arr)
        alphas = coxnet_pipe.alphas_

        for ic_in2, ix_in2 in enumerate(ix_inner2):
            start_inner = time.time()

            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]

            if np.sum(x_tr2['outcome'].values) < 1:
                continue

            y_test = list(zip(x_ts2['outcome'], x_ts2['week']))
            y_test_arr = np.array(y_test,
                                  dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
            if len(np.unique(y_test_arr)) < len(test_ix):
                continue

            week = x_tr2['week']
            outcome = x_tr2['outcome']
            if (outcome == 0).all():
                continue
            x_tr2_ = x_tr2.drop(['week', 'outcome'], axis=1)
            yy2 = list(zip(outcome, week))
            y_arr2 = np.array(yy2, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
            model2.set_params(alphas=alphas)
            try:
                model2.fit(x_tr2_, y_arr2)
            except Exception:
                # drop the largest alpha until the fit succeeds
                print('removed alpha ' + str(alphas[0]))
                alphas_n = np.delete(alphas, 0)
                model2.set_params(alphas=alphas_n)
                while True:
                    try:
                        model2.fit(x_tr2_, y_arr2)
                        alphas = alphas_n
                        break
                    except Exception:
                        print('removed alpha ' + str(alphas_n[0]))
                        alphas_n = np.delete(alphas_n, 0)
                        model2.set_params(alphas=alphas_n)
                    if len(alphas_n) <= 2:
                        break
                if len(alphas_n) <= 2:
                    continue
            # alphas_new = model2.alphas_
            # if ic_in2 == 0:
            #     alphas = alphas_new

            model_dict[ic_in2] = model2
            for i, alpha in enumerate(alphas):
                if i not in hazards_dict.keys():
                    hazards_dict[i] = {}
                    e_times_dict[i] = {}
                    e_outcomes_dict[i] = {}
                    score_dict[i] = {}
                risk_scores = model2.predict(x_ts2.drop(['week', 'outcome'],
                                                        axis=1),
                                             alpha=alpha)
                hazards_dict[i][ic_in2] = risk_scores
                e_times_dict[i][ic_in2] = x_ts2['week']
                e_outcomes_dict[i][ic_in2] = x_ts2['outcome']

                if len(test_ix) >= 2:
                    try:
                        ci = concordance_index_censored(
                            e_outcomes_dict[i][ic_in2].astype(bool),
                            e_times_dict[i][ic_in2],
                            hazards_dict[i][ic_in2])[0]
                    except Exception:
                        print('debug')
                        print(x_ts2['outcome'])
                        print(x_ts2['week'])
                        print('')
                        continue

                    if not np.isnan(ci):
                        score_dict[i][ic_in2] = ci

        if len(score_dict[i]) > 0:
            scores = {
                i: sum(score_dict[i].values()) / len(score_dict[i].values())
                for i in score_dict.keys()
            }
        else:
            scores = {}
            for a_ix in hazards_dict.keys():
                alpha_num = alphas[a_ix]
                scores[
                    alpha_num], concordant, discordant, tied_risk, tied_time = concordance_index_censored(
                        np.array(
                            np.concatenate(list(
                                e_outcomes_dict[a_ix].values()))).astype(bool),
                        np.array(
                            np.concatenate(list(e_times_dict[a_ix].values()))),
                        np.array(
                            np.concatenate(list(hazards_dict[a_ix].values()))))

        lambdas, aucs_in = list(zip(*scores.items()))
        ix_max = np.argmax(aucs_in)
        best_lamb = alphas[ix_max]

        lambda_dict[ic_in] = {
            'best_lambda': best_lamb,
            'scores': scores,
            'event_outcomes': event_outcomes,
            'times': event_times,
            'hazards': hazards,
            'lambdas_tested': alphas
        }
        model_out = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=alphas)

        model_out.fit(x_train_, y_arr)

        risk_scores = model_out.predict(x_test.drop(['week', 'outcome'],
                                                    axis=1),
                                        alpha=best_lamb)

        hazards.append(risk_scores)
        event_times.append(x_test['week'])
        event_outcomes.append(x_test['outcome'])

        coefs = model_out.coef_[:, ix_max]
        out_df = pd.DataFrame({'odds_ratio': np.zeros(x.shape[1])},
                              index=x.columns.values)
        out_df.loc[x_train.columns.values[:-2]] = np.expand_dims(coefs, 1)

        model_out_dict[ic_in] = out_df
        if len(test_index) > 1:
            ci = concordance_index_censored(x_test['outcome'].astype(bool),
                                            x_test['week'], risk_scores)[0]
            if not np.isnan(ci):
                score_vec.append(ci)

    if len(score_vec) > 1:
        score = sum(score_vec) / len(score_vec)
    else:
        score, concordant, discordant, tied_risk, tied_time = concordance_index_censored(
            np.array(np.concatenate(event_outcomes)).astype(bool),
            np.array(np.concatenate(event_times)),
            np.array(np.concatenate(hazards)))

    final_dict = {}
    final_dict['score'] = score
    final_dict['model'] = model_out_dict
    final_dict['hazards'] = hazards
    final_dict['event_times'] = event_times
    final_dict['event_outcomes'] = event_outcomes
    final_dict['lambdas'] = lambda_dict
    return final_dict
Example #13
def train_survival(X_train,
                   X_test,
                   y_train,
                   alphas,
                   l1_ratios,
                   seed,
                   n_folds=4,
                   max_iter=1000,
                   fit_ridge=False,
                   output_fn=False,
                   debug_info=None):
    """
    Build the logic and sklearn pipelines to predict survival info y from dataset x,
    using elastic net Cox regression

    Arguments
    ---------
    X_train: pandas DataFrame of feature matrix for training data
    X_test: pandas DataFrame of feature matrix for testing data
    y_train: pandas DataFrame of processed y matrix, containing 'status' = False
             if right-censored else True, 'time_in_days' = survival time
    alphas: list of alphas to perform cross validation over, if None use the
            alphas path generated by scikit-survival
    l1_ratios: list of l1 mixing parameters to perform cross validation over
    seed: random seed (not used directly in this function)
    n_folds: int of how many folds of cross validation to perform
    max_iter: the maximum number of iterations to test until convergence
    fit_ridge: if True, use ridge regularized model (CoxPHSurvivalAnalysis). This
               uses a slightly different optimizer than CoxnetSurvivalAnalysis which
               can be more stable, but also scales poorly to many features. If this
               is True, l1_ratios range will be ignored, and hyperparameter search
               will be over alphas range only.
    output_fn: if True, fit the baseline hazard model so survival functions can
               be output later
    debug_info: optional dict with 'prefix', 'signal', and 'fold_no' keys; when
                given, the grid search results are written to a TSV file

    Returns
    ------
    The full pipeline sklearn object and y matrix predictions for training, testing,
    and cross validation
    """

    # set up the cross-validation parameters
    # sometimes we want to use sksurv to compute the alpha path
    if alphas is None:
        cox = CoxnetSurvivalAnalysis(alpha_min_ratio=0.01, n_alphas=100)
        cox.fit(X_train, _y_df_to_struct(y_train))
        alphas = cox.alphas_

    if fit_ridge:
        surv_parameters = {
            "survival__alpha": alphas
        }
        estimator = Pipeline(
            steps=[
                (
                    "survival",
                    CoxPHSurvivalAnalysis(
                        n_iter=max_iter,
                        tol=1e-5,
                    ),
                )
            ]
        )
    else:
        surv_parameters = {
            "survival__alphas": [[a] for a in alphas],
            "survival__l1_ratio": l1_ratios,
        }
        estimator = Pipeline(
            steps=[
                (
                    "survival",
                    CoxnetSurvivalAnalysis(
                        max_iter=max_iter,
                        tol=1e-5,
                        fit_baseline_model=output_fn
                    ),
                )
            ]
        )

    cv_pipeline = GridSearchCV(
        estimator=estimator,
        param_grid=surv_parameters,
        n_jobs=-1,
        cv=n_folds,
        error_score=0.5,
        return_train_score=True,
    )

    # fit the model
    cv_pipeline.fit(X=X_train,
                    y=_y_df_to_struct(y_train))

    if debug_info is not None:
        grid_mean_df = pd.DataFrame(
            cv_pipeline.cv_results_['mean_test_score'].reshape(len(alphas), -1),
            columns=l1_ratios,
            index=alphas
        )
        grid_mean_df.to_csv('{}_{}_fold{}_grid.tsv'.format(debug_info['prefix'],
                                                           debug_info['signal'],
                                                           debug_info['fold_no']),
                            sep='\t')

    # Obtain cross validation results
    y_cv = cross_val_predict(
       cv_pipeline.best_estimator_,
       X=X_train,
       y=_y_df_to_struct(y_train),
       cv=n_folds,
       method="predict",
    )

    # get predictions
    y_predict_train = cv_pipeline.predict(X_train)
    y_predict_test = cv_pipeline.predict(X_test)

    return cv_pipeline, y_predict_train, y_predict_test, y_cv
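
_y_df_to_struct is used above but not shown; a minimal sketch, assuming y carries the boolean 'status' and numeric 'time_in_days' columns described in the docstring:

import numpy as np

def _y_df_to_struct(y_df):
    # convert a two-column survival DataFrame into the structured array
    # that scikit-survival estimators expect
    return np.array(list(zip(y_df['status'].astype(bool),
                             y_df['time_in_days'])),
                    dtype=[('status', '?'), ('time_in_days', '<f8')])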
Example #14
plt.legend()

plt.plot()

_train_l = numpy.array(list(_train_l), dtype='bool,f4')

_test_l = numpy.array(list(_test_l), dtype='bool,f4')

# create ph model
estimator = CoxPHSurvivalAnalysis()

estimator.fit(_train_d, _train_l)

# create the cox model
clf = CoxnetSurvivalAnalysis(n_alphas=5, tol=0.1)

# train model
clf.fit(_train_d, _train_l)

result = []
# evaluate concordance for every alpha on the path
for v in clf.alphas_:
    res = clf.predict(_test_d, alpha=v)
    result.append(concordance_index_censored(tft, timet, res))

# predictions at the default alpha
res = clf.predict(_test_d)

# print out some results
Example #15
def train_with_inner_folds(x, num_folds=5):
    final_res_dict = {}
    scores = []
    score_d = []
    ix_inner = leave_two_out(x, x['outcome'], num_folds=None)
    final_res_dict['grid_search_model'] = []
    final_res_dict['best_model'] = []
    final_res_dict['best_alpha'] = []
    final_res_dict['alphas'] = []
    for ic_in, ix_in in enumerate(ix_inner):
        train_index, test_index = ix_in
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]

        if np.sum(x_test['outcome'].values) < 1:
            continue
        y_test = list(zip(x_test['outcome'], x_test['week']))
        y_test_arr = np.array(y_test, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
        if len(np.unique(y_test_arr)) == 1:
            continue

        model2 = CoxnetSurvivalAnalysis(l1_ratio=1,
                                        n_alphas=300,
                                        alpha_min_ratio='auto')

        week = x_train['week']
        outcome = x_train['outcome']
        x_train_ = x_train.drop(['week', 'outcome'], axis=1)
        yy = list(zip(outcome, week))
        y_arr = np.array(yy, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
        model2.fit(x_train_, y_arr)
        num_rec = np.sum(outcome)
        if num_folds > num_rec:
            nf_inner = int(num_rec)
        else:
            nf_inner = int(num_folds)
        cv = StratifiedKFold(n_splits=int(nf_inner),
                             shuffle=True,
                             random_state=0)
        alphas = model2.alphas_
        gcv = None
        try:
            gcv = GridSearchCV(make_pipeline(
                StandardScaler(),
                CoxnetSurvivalAnalysis(l1_ratio=1,
                                       n_alphas=300,
                                       alpha_min_ratio='auto',
                                       max_iter=100)),
                               param_grid={
                                   "coxnetsurvivalanalysis__alphas":
                                   [[v] for v in alphas]
                               },
                               cv=cv,
                               error_score=0.5,
                               n_jobs=4).fit(x_train_, y_arr)
            best_model = gcv.best_estimator_.named_steps[
                "coxnetsurvivalanalysis"]
            best_alpha = best_model.alphas

            pred = model2.predict(x_test.drop(['week', 'outcome'], axis=1),
                                  alpha=best_alpha)
            score_default = model2.score(
                x_test.drop(['week', 'outcome'], axis=1), y_test_arr)
            score = concordance_index_censored(x_test['outcome'].astype(bool),
                                               x_test['week'], pred)[0]
        except Exception:
            score_default = model2.score(
                x_test.drop(['week', 'outcome'], axis=1), y_test_arr)
            score = score_default.copy()
            best_model = model2
            best_alpha = alphas[-1]

        if not np.isnan(score):
            scores.append(score)
        if not np.isnan(score_default):
            score_d.append(score_default)
        final_res_dict['grid_search_model'].append(gcv)
        final_res_dict['best_model'].append(best_model)
        final_res_dict['best_alpha'].append(best_alpha)
        final_res_dict['alphas'].append(alphas)

    conc_ix = np.mean(scores)
    conc_ix_d = np.mean(score_d)
    final_res_dict['score'] = conc_ix
    final_res_dict['score_default'] = conc_ix_d
    return final_res_dict
Example #16
def train_with_folds(x, num_folds=5):
    num_rec = np.sum(x['outcome'])

    if num_folds > num_rec:
        num_folds = int(num_rec)
    skf = StratifiedKFold(n_splits=num_folds)
    splits = skf.split(x, x['outcome'])

    score_vec = []
    final_res_dict = {}
    fold = 0
    for train_index, test_index in splits:
        #     probs[ic] = []
        #     train_index, test_index = ix
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        week = x_train['week']
        outcome = x_train['outcome']
        x_train_ = x_train.drop(['week', 'outcome'], axis=1)
        yy = list(zip(outcome, week))
        y_arr = np.array(yy, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])

        y_test = list(zip(x_test['outcome'], x_test['week']))
        y_test_arr = np.array(y_test, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
        if len(np.unique(y_test_arr)) == 1:
            continue

        model2 = CoxnetSurvivalAnalysis(l1_ratio=1,
                                        alpha_min_ratio='auto',
                                        n_alphas=300)
        warnings.simplefilter("ignore")
        model2.fit(x_train_, y_arr)

        estimated_alphas = model2.alphas_

        num_rec = np.sum(outcome)
        if num_folds > num_rec:
            nf_inner = int(num_rec)
        else:
            nf_inner = int(num_folds)
        cv = StratifiedKFold(n_splits=nf_inner, shuffle=True, random_state=0)
        try:
            gcv = GridSearchCV(make_pipeline(
                StandardScaler(), CoxnetSurvivalAnalysis(l1_ratio=1)),
                               param_grid={
                                   "coxnetsurvivalanalysis__alphas":
                                   [[v] for v in estimated_alphas]
                               },
                               cv=cv,
                               error_score=0.5,
                               n_jobs=4).fit(x_train_, y_arr)
            cv_results = pd.DataFrame(gcv.cv_results_)
            alphas = cv_results.param_coxnetsurvivalanalysis__alphas.map(
                lambda x: x[0])
            best_model = gcv.best_estimator_.named_steps[
                "coxnetsurvivalanalysis"]
            best_alpha = best_model.alphas
            best_coefs = pd.DataFrame(best_model.coef_,
                                      index=x_train_.columns,
                                      columns=["coefficient"])
        except Exception:
            score_default = model2.score(
                x_test.drop(['week', 'outcome'], axis=1), y_test_arr)
            score = score_default.copy()
            best_model = model2
            best_alpha = estimated_alphas[-1]
            best_coefs = pd.DataFrame(best_model.coef_[:, -1],
                                      index=x_train_.columns,
                                      columns=["coefficient"])

        #     model_out = CoxnetSurvivalAnalysis(l1_ratio=1, alphas = best_alpha)
        #     model_out.fit(x_train_, y_arr)

        week = x_test['week']
        outcome = x_test['outcome']
        x_test_ = x_test.drop(['week', 'outcome'], axis=1)
        yy = list(zip(outcome, week))
        y_arr = np.array(yy, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])

        score_ix = best_model.score(x_test_, y_arr)
        score_vec.append(score_ix)
        final_res_dict[fold] = {}
        final_res_dict[fold]['score'] = score_ix
        final_res_dict[fold]['best_model'] = best_model
        final_res_dict[fold]['best_coefs'] = best_coefs
        final_res_dict[fold]['train_test'] = (x_train, x_test)
        fold += 1
    return final_res_dict
Example #17
File: hpt.py  Project: Letris/HS
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    This function looks for the best set of parameters for the chosen model
    Input: 
        X: training set
        Y: labels of training set
        splits: cross validation splits, used to make sure the parameters are stable
    Output:
        clf.best_params_: dictionary with the parameters, to use: param_svm['kernel']
    """

    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()

        tuned_parameters = {
            'C': ([0.01, 1, 10]),
            'kernel': (['rbf', 'linear']),
            # 'kernel': (['linear', 'rbf', 'sigmoid']),
            # 'degree': ([1,3,5,10]),
            # 'decision_function_shape' : (['ovo', 'ovr']),
            # 'cache_size': ([500,1000,1500,2000]),
            'shrinking': ([False, True]),
            # 'probability': ([False, True])
        }

    if model == 'cart':
        clf = tree.DecisionTreeClassifier()

        tuned_parameters = {
            'criterion': (['gini', 'entropy']),
            'max_depth': ([10, 20]),
            'min_samples_split': ([2, 3, 5]),
            'min_samples_leaf': ([2, 3, 5]),
        }

    if model == 'rf':
        clf = ensemble.RandomForestClassifier()

        tuned_parameters = {
            'n_estimators': ([200, 500, 1000]),
            # 'max_features': (['auto', 'sqrt', 'log2',1,4,8]),
            'max_depth': ([10, 20]),
            # 'criterion':    (['gini', 'entropy']),
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }

    if model == 'xgboost':
        clf = XGBClassifier()

        tuned_parameters = {
            'booster': (['gbtree']),
            'max_depth': ([5, 10, 20]),
            'reg_lambda': ([0, 1]),
            'reg_alpha': ([0, 1]),
            'subsample': ([0.5, 1])
        }

    if model == 'lr':
        clf = linear_model.LogisticRegression()

        tuned_parameters = {'solver': (['liblinear', 'sag', 'saga'])}

    if model == 'cox':

        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': ([50, 100, 200]),
            'l1_ratio': ([0.1, 0.5, 1]),
        }

    if model == 'survSVM':
        clf = FastSurvivalSVM()

        tuned_parameters = {
            'alpha': ([0.5, 1]),
            'rank_ratio': ([0.5, 1]),
            'max_iter': ([20, 40, 80]),
            'optimizer': (['rbtree', 'avltree']),
        }

    if model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()

        tuned_parameters = {
            'learning_rate': ([0.1, 0.3]),
            'n_estimators': ([100, 200, 400]),
            'max_depth': ([3, 6, 12])
        }

    if survival:
        scoring = make_scorer(CI, greater_is_better=True)

        y_for_cv = np.array([t[0] for t in Y])
        cv = StratifiedKFold(n_splits=splits).split(X, y_for_cv)  # x-validation

    else:
        cv = StratifiedKFold(n_splits=splits).split(X, Y)  # x-validation
        scoring = 'roc_auc'

    print('  ...performing x-validation')

    clf = GridSearchCV(clf,
                       tuned_parameters,
                       scoring=scoring,
                       cv=cv,
                       verbose=10)

    clf.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ", end_svm - start_svm)

    return (clf.best_params_, clf)
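
A hedged usage sketch (CI is assumed to be a concordance-index metric defined elsewhere in this project, with X/Y prepared as survival data):

best_params, fitted_search = RandomGridSearchRFC_Fixed(X, Y, splits=5,
                                                       model='cox',
                                                       survival=True)
print(best_params)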
Example #18
def train_survival_model(
    x,
    y,
    *,
    outer_cv_splits,
    inner_cv_splits,
    param_grid,
):
    """Train survival model.

    The model is trained with ssGSEA normalized enrichment scores (NES) from TCGA expression data and cBioPortal
    survival data on patient survival status and survival times.

    :param pandas.core.frame.DataFrame x: dataFrame of ssGSEA NES where controls are filtered out, as are
     patients with missing enrichment scores or survival data
    :param numpy.ndarray y: Structured array where binary survival status is the first field and survival time is
     the second field.
    :param int outer_cv_splits: number of folds to split data in train/test sets in outer cross validation loop
    :param int inner_cv_splits: number of folds to split data in train/test sets in inner cross validation loop
    :param dict param_grid: parameter types and values to try in grid search
    :return: concordance scores
    """
    concordance_scores = []

    kf = KFold(n_splits=outer_cv_splits, shuffle=True)
    inner_cv = KFold(n_splits=inner_cv_splits)

    iterator = tqdm(kf.split(x, y))

    # Iterator for each CV step in the outer loop
    for i, (train_indexes, test_indexes) in enumerate(iterator):
        # Slice main data frame to get the training and test data for this CV step
        x_train = x.iloc[train_indexes]
        x_test = x.iloc[test_indexes]
        y_train = np.asarray([y[train_index] for train_index in train_indexes])
        # y_test = np.asarray([y[test_index] for test_index in test_indexes])

        # Instantiate Cox proportional hazards regression model with elastic net penalty
        coxnet = CoxnetSurvivalAnalysis()

        # Tune hyper-parameters (e.g., L1-ratio) of the estimator using grid search (Inner loop in the nested-CV)
        gcv = GridSearchCV(estimator=coxnet,
                           param_grid=param_grid,
                           cv=inner_cv,
                           return_train_score=True)

        # Run grid search on training data
        gcv.fit(x_train, y_train)

        # Extract best model from the grid
        coxnet = gcv.best_estimator_

        # predict y using the best model from the grid
        prediction = coxnet.predict(x_test)

        # Evaluate the performance of the model during grid search using Harrell's concordance index
        # Note that the main data frame is sliced to use only the test data for this CV step
        cindex, concordant, discordant, tied_risk, tied_time = concordance_index_censored(
            [y[test_index]['status']
             for test_index in test_indexes],  # The status array for test set
            [y[test_index]['days_to_death']
             for test_index in test_indexes],  # The days to death for test set
            prediction,  # Prediction scores
        )

        # print C-Index and best parameter found in the grid search
        print('best c-index: {}'.format(cindex))
        print('best parameter: {}'.format(gcv.best_params_))

        concordance_scores.append({
            "c-index": cindex,
            "number of concordant pairs": concordant,
            "number of discordant pairs": discordant,
            "tied_risk": tied_risk,
            "tied_time": tied_time,
            "l1-ratio": gcv.best_estimator_.l1_ratio,
            "split": i,
        })

    # avg_c_index = np.average([
    #     iter_result["c-index"]
    #     for iter_result in concordance_scores
    # ])

    # print('Avg C-Index {}'.format(avg_c_index))
    print(concordance_scores)
    # return avg_c_index, concordance_scores
    return concordance_scores
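
A hypothetical invocation; the param_grid values are illustrative, matching the l1-ratio hyperparameter reported in the results:

param_grid = {'l1_ratio': [0.1, 0.5, 1.0]}
scores = train_survival_model(
    x,
    y,
    outer_cv_splits=5,
    inner_cv_splits=3,
    param_grid=param_grid,
)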
class ClusterWithSurvival(object):
    """
    """
    def __init__(self,
                 isdead,
                 nbdays,
                 n_clusters=2,
                 metadata_mat=None,
                 use_gaussian_to_dichotomize=False,
                 use_sksurv=True):
        "docstring"

        self.use_sksurv = use_sksurv
        self.coxph_python = None
        self.isdead = isdead
        self.nbdays = nbdays
        self.n_clusters = n_clusters
        self.metadata_mat = metadata_mat
        self.matrix = None
        self._glm = None
        self._labels = None
        self._use_gaussian_to_dichotomize = use_gaussian_to_dichotomize

    def get_nonzero_features(self, matrix):
        """
        Get non zero features using lasso coxPH
        """
        if self.metadata_mat is not None:
            self.matrix = hstack([matrix, self.metadata_mat])
            rbs = RobustScaler()
            self.matrix = rbs.fit_transform(self.matrix)

        else:
            self.matrix = matrix

        return self._fit_with_python(self.matrix,
                                     l1_ratio=1.0,
                                     return_nonzero_features=True)

    def fit(self, matrix):
        """
        """
        self.matrix = matrix

    def predict(self, matrix_test):
        """
        """
        if self.use_sksurv:
            return self._fit_with_python(matrix_test)
        else:
            return self._fit_with_glm(matrix_test)

    def predict_proba(self, matrix_test):
        """
        """
        if self.use_sksurv:
            return self._fit_with_python(matrix_test, get_proba=True)
        else:
            return self._fit_with_glm(matrix_test, get_proba=True)

    def _fit_with_python(self,
                         matrix_test,
                         get_proba=False,
                         return_nonzero_features=False,
                         l1_ratio=0.5):
        """
        """
        from sksurv.linear_model import CoxnetSurvivalAnalysis

        Y = np.asarray([(bool(a), b)
                        for a, b in zip(self.isdead, self.nbdays)],
                       dtype=[("event", bool), ("time", int)])

        self.coxph_python = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio,
                                                   fit_baseline_model=False)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.coxph_python.fit(self.matrix, Y)

        predictions = self.coxph_python.predict(matrix_test)

        if get_proba:
            return self._get_proba_from_prediction(predictions)

        if return_nonzero_features:
            for coef in self.coxph_python.coef_.T:
                if coef.sum() != 0:
                    break
            if coef.sum() == 0:
                raise Exception("All feature coefficients are 0!")

            if self.metadata_mat is not None:
                if coef[:-self.metadata_mat.shape[1]].sum() == 0:
                    raise Exception("Only metadata features are non-zero")

                return np.nonzero(coef[:-self.metadata_mat.shape[1]])
            else:
                return np.nonzero(coef)

        return self._fit_and_dichotomise(predictions,
                                         n_clusters=self.n_clusters)

    def _fit_with_glm(self, matrix_test, get_proba=False):
        """
        """
        predictions = predict_with_coxph_glmnet(self.matrix, self.isdead,
                                                self.nbdays, matrix_test)

        if get_proba:
            return self._get_proba_from_prediction(predictions)

        return self._fit_and_dichotomise(predictions,
                                         n_clusters=self.n_clusters)

    def _fit_and_dichotomise(self, predicted_time, n_clusters=2):
        """
        """
        labels = np.zeros(predicted_time.shape)
        predicted_time[predicted_time == 0] = np.inf

        if self._use_gaussian_to_dichotomize:
            glm = GaussianMixture(n_components=n_clusters)
            self._labels = glm.fit_predict(predicted_time.reshape(1, -1).T)
            self._glm = glm

            return self._labels

        for cluster in range(n_clusters):
            percentile = 100 * (1.0 - 1.0 / (cluster + 1.0))
            value = np.percentile(predicted_time, percentile)
            labels[predicted_time >= value] = n_clusters - cluster

        return labels

    def _get_proba_from_prediction(self,
                                   predicted_time,
                                   time_of_following=None):
        """
        time_of_following is used to compute the probability of the event happening,
        using the predicted values as reference => proba = time_predicted / time_of_following.
        If None, time_of_following is computed from the max and std of the non-zero
        predicted times.
        if self._glm is not None:
            return self._glm.predict_proba(predicted_time.reshape(1, -1).T)

        predicted_time = predicted_time.astype("float32")

        if not time_of_following:
            time_of_following = np.max(predicted_time[predicted_time != 0]) + \
                np.std(predicted_time[predicted_time != 0])

        predicted_time[predicted_time == 0] = time_of_following

        return predicted_time / time_of_following
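
A small usage sketch for the class above (dummy random arrays; real inputs would be survival status, follow-up days, and feature matrices):

import numpy as np

isdead = np.array([1, 0, 1, 0, 1, 0])
nbdays = np.array([10, 80, 25, 60, 15, 90])
train_matrix = np.random.randn(6, 4)
test_matrix = np.random.randn(3, 4)

cws = ClusterWithSurvival(isdead, nbdays, n_clusters=2)
cws.fit(train_matrix)
labels = cws.predict(test_matrix)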
normed_features = (feature_matrix - feature_means) / feature_stds
normed_features = normed_features.fillna(0.0)
# In[ ]:
from sksurv.datasets import get_x_y
full_dataset = pd.read_csv('training/response.csv').set_index('lab_id').join(
    normed_features)
X, Y = get_x_y(full_dataset, ['vitalStatus', 'overallSurvival'],
               pos_label='Dead')
# In[ ]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# This package allows general elastic net tuning, but by setting
# l1_ratio=1, we restrict to LASSO.
regr = CoxnetSurvivalAnalysis(l1_ratio=1, alpha_min_ratio=0.05, max_iter=int(3e5))

n_folds = 5

alphas = np.logspace(-1.3, 0, num=100)
cv = KFold(n_splits=n_folds, shuffle=True, random_state=328)
gcv = GridSearchCV(regr, {"alphas": [[v] for v in alphas]}, cv=cv).fit(X, Y)
#In[ ]:
import matplotlib.pyplot as plt

scores = gcv.cv_results_['mean_test_score']
scores_std = gcv.cv_results_['std_test_score']
std_error = scores_std / np.sqrt(n_folds)

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)
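
std_error is computed above but not yet used; a common companion plot (an assumption about the intended continuation) shades one standard error around the mean CV score:

plt.semilogx(alphas, scores + std_error, 'b--')
plt.semilogx(alphas, scores - std_error, 'b--')
plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)
plt.xlabel('alpha')
plt.ylabel('CV concordance index')
plt.show()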
Example #21
                ls='--',
                label=('Best alpha, CI = %0.3f' % gcv.best_score_))
    plt.legend()
    plt.title('Cross Validation Concordance Index')


def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['vitalStatus'], y['overallSurvival'],
                                        prediction)
    return result[0]


# In[ ]:
# This package allows general elastic net tuning; here l1_ratio=0.8 gives an
# elastic net penalty (setting l1_ratio=1 would restrict to LASSO).
regr = CoxnetSurvivalAnalysis(l1_ratio=0.8, alpha_min_ratio=0.1, max_iter=int(3e5))

n_folds = 5

alphas = np.logspace(-1.3, 1.5, num=50)
cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
gcv = GridSearchCV(regr, {
    "alphas": [[v] for v in alphas]
}, cv=cv, n_jobs=-1).fit(X, Y)

plot_gridcv_results(gcv, alphas)
regr_best = CoxnetSurvivalAnalysis(alphas=gcv.best_params_["alphas"],
                                   l1_ratio=0.8,
                                   alpha_min_ratio=0.1,
                                   max_iter=int(3e5)).fit(X, Y)
y_regr = regr_best.predict(X_lb)
Example #22
def train_cox(x,
              outer_split=leave_two_out,
              inner_split=leave_two_out,
              num_folds=None):
    if num_folds is None:
        print('none')
    else:
        print(num_folds)
    # if feature_grid is None:
    #     feature_grid = np.logspace(7, 20, 14)
    hazards = []
    event_times = []
    event_outcomes = []
    score_vec = []
    model_out_dict = {}
    ix_inner = outer_split(x, x['outcome'], num_folds=num_folds)
    lambda_dict = {}
    for ic_in, ix_in in enumerate(ix_inner):
        train_index, test_index = ix_in
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]

        week = x_train['week']
        outcome = x_train['outcome']
        x_train_ = x_train.drop(['week', 'outcome'], axis=1)
        yy = list(zip(outcome, week))
        y_arr = np.array(yy, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])

        ix_inner2 = inner_split(x_train,
                                x_train['outcome'],
                                num_folds=num_folds)
        lamb_dict = {}
        lamb_dict['auc'] = {}
        lamb_dict['ci'] = {}
        model2 = CoxnetSurvivalAnalysis(l1_ratio=1)

        model_dict = {}
        alphas = None
        hazards_dict = {}
        e_times_dict = {}
        e_outcomes_dict = {}
        score_dict = {}

        coxnet_pipe = CoxnetSurvivalAnalysis(l1_ratio=1,
                                             alpha_min_ratio=0.001,
                                             n_alphas=300)

        coxnet_pipe.fit(x_train_, y_arr)
        alphas = coxnet_pipe.alphas_

        for ic_in2, ix_in2 in enumerate(ix_inner2):
            start_inner = time.time()

            train_ix, test_ix = ix_in2
            x_tr2, x_ts2 = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]

            y_test = list(zip(x_ts2['outcome'], x_ts2['week']))
            y_test_arr = np.array(y_test,
                                  dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
            if len(np.unique(y_test_arr)) < len(test_ix):
                continue

            week = x_tr2['week']
            outcome = x_tr2['outcome']
            if (outcome == 0).all():
                continue
            x_tr2_ = x_tr2.drop(['week', 'outcome'], axis=1)
            yy2 = list(zip(outcome, week))
            y_arr2 = np.array(yy2, dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
            model2.set_params(alphas=alphas)
            try:
                model2.fit(x_tr2_, y_arr2)
            except Exception:
                # drop the largest alpha until the fit succeeds
                print('removed alpha ' + str(alphas[0]))
                alphas_n = np.delete(alphas, 0)
                model2.set_params(alphas=alphas_n)
                while True:
                    try:
                        model2.fit(x_tr2_, y_arr2)
                        alphas = alphas_n
                        break
                    except Exception:
                        print('removed alpha ' + str(alphas_n[0]))
                        alphas_n = np.delete(alphas_n, 0)
                        model2.set_params(alphas=alphas_n)
                    if len(alphas_n) <= 2:
                        break
                if len(alphas_n) <= 2:
                    continue
            # alphas_new = model2.alphas_
            # if ic_in2 == 0:
            #     alphas = alphas_new

            model_dict[ic_in2] = model2
            for i, alpha in enumerate(alphas):
                if i not in hazards_dict.keys():
                    hazards_dict[i] = {}
                    e_times_dict[i] = {}
                    e_outcomes_dict[i] = {}
                    score_dict[i] = {}
                risk_scores = model2.predict(x_ts2.drop(['week', 'outcome'],
                                                        axis=1),
                                             alpha=alpha)
                hazards_dict[i][ic_in2] = risk_scores
                e_times_dict[i][ic_in2] = x_ts2['week']
                e_outcomes_dict[i][ic_in2] = x_ts2['outcome']

                if len(test_ix) >= 2:
                    score_dict[i][
                        ic_in2], _, _, _, _ = concordance_index_censored(
                            e_outcomes_dict[i][ic_in2].astype(bool),
                            e_times_dict[i][ic_in2], hazards_dict[i][ic_in2])

        if len(score_dict[i]) > 0:
            scores = {
                i: sum(score_dict[i].values()) / len(score_dict[i].values())
                for i in score_dict.keys()
            }
        else:
            scores = {}
            for a_ix in hazards_dict.keys():
                alpha_num = alphas[a_ix]
                scores[
                    alpha_num], concordant, discordant, tied_risk, tied_time = concordance_index_censored(
                        np.array(
                            np.concatenate(list(
                                e_outcomes_dict[a_ix].values()))).astype(bool),
                        np.array(
                            np.concatenate(list(e_times_dict[a_ix].values()))),
                        np.array(
                            np.concatenate(list(hazards_dict[a_ix].values()))))

        lambdas, aucs_in = list(zip(*scores.items()))
        ix_max = np.argmax(aucs_in)
        best_lamb = lambdas[ix_max]

        lambda_dict[ic_in] = {
            'best_lambda': best_lamb,
            'scores': scores,
            'event_outcomes': event_outcomes,
            'times': event_times,
            'hazards': hazards,
            'lambdas_tested': alphas
        }
        model_out = CoxnetSurvivalAnalysis(l1_ratio=1, alphas=alphas)

        model_out.fit(x_train_, y_arr)

        risk_scores = model_out.predict(x_test.drop(['week', 'outcome'],
                                                    axis=1),
                                        alpha=best_lamb)

        hazards.append(risk_scores)
        event_times.append(x_test['week'])
        event_outcomes.append(x_test['outcome'])

        model_out_dict[ic_in] = model_out
        if len(test_index) > 1:
            score_vec.append(
                concordance_index_censored(x_test['outcome'].astype(bool),
                                           x_test['week'], risk_scores)[0])

    if len(score_vec) > 1:
        score = sum(score_vec) / len(score_vec)
    else:
        score, concordant, discordant, tied_risk, tied_time = concordance_index_censored(
            np.array(np.concatenate(event_outcomes)).astype(bool),
            np.array(np.concatenate(event_times)),
            np.array(np.concatenate(hazards)))

    final_dict = {}
    final_dict['score'] = score
    final_dict['model'] = model_out_dict
    final_dict['hazards'] = hazards
    final_dict['event_times'] = event_times
    final_dict['event_outcomes'] = event_outcomes
    final_dict['lambdas'] = lambda_dict
    return final_dict
Example #23
    def __init__(self, data):
        super().__init__(data)
        self.model = CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=1000000)
input_train = input_train[features]
input_test = input_test[features]
input_train, input_test = preprocessing.normalizing_input(
    input_train, input_test)
structured_y = Surv.from_dataframe('Event', 'SurvivalTime', output_train)

# Coxnet
# coxnet = CoxnetSurvivalAnalysis()
# print(cross_validate(coxnet, input_train, structured_y, cv=5))

# Randomized hyperparameter search
tuned_params = {
    "l1_ratio": np.linspace(0.01, 0.02, 100),
    "n_alphas": range(140, 160, 1),
}
grid_search = RandomizedSearchCV(CoxnetSurvivalAnalysis(),
                                 tuned_params,
                                 cv=5,
                                 n_jobs=4,
                                 n_iter=1000)
grid_search.fit(input_train, structured_y)
print(grid_search.best_score_)
best_params = grid_search.best_params_
print(best_params)


# Prediction
def predict(model, X, threshold=0.9):
    prediction = model.predict_survival_function(X)
    y_pred = []
    for pred in prediction:
_test_l = numpy.array(list(_test_l), dtype='bool,f4')
'''plot some estimator stuff
_event, _time = split_for_kaplan(_train_l, _train_d, 24)

for i in range(0, len(_event)):
    x, y = kaplan_meier_estimator(_event[i], _time[i])
    plt.step(x, y, where="post", label="CT_group= "+str(i));

plt.legend();
plt.plot();
plt.show();'''

# create and train the coxnet model
clf = CoxnetSurvivalAnalysis(n_alphas=100,
                             l1_ratio=0.5,
                             alpha_min_ratio=0.01,
                             tol=0.1,
                             fit_baseline_model=True).fit(_train_d, _train_l)

ccx = []
event_indicator = [val[0] for val in _test_l]
event_time = [val[1] for val in _test_l]
for val in clf.alphas_:
    res = clf.predict(_test_d, alpha=val)
    ccx.append(
        concordance_index_censored(event_indicator, event_time, estimate=res))

# concordance index over the alpha path
plt.step(clf.alphas_, [val[0] for val in ccx], where="post")
plt.show()
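
A natural next step (an assumption, not in the original) is to pick the alpha with the highest concordance from the curve above:

best_ix = int(numpy.argmax([val[0] for val in ccx]))
best_alpha = clf.alphas_[best_ix]
print('best alpha:', best_alpha)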