Пример #1
0
    def convert_dataset(dataset):
        """Split a survival data set into encoded features and outcome frames.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain the columns "status" and "time"; every other
            column is treated as a feature.

        Returns
        -------
        tuple
            ``(features, outcome, timed_features)`` where ``features`` is the
            one-hot encoded feature frame, ``outcome`` has the columns
            ``["time", "event"]`` and ``timed_features`` is a copy of
            ``features`` with an additional "time" column.
        """
        # Everything except the two outcome columns is a feature.
        not_status = dataset.loc[:, dataset.columns != "status"]
        features = not_status.loc[:, not_status.columns != "time"]

        # String (object) columns become categorical before encoding.
        object_columns = [
            name for name in features.columns
            if str(features[name].dtype) == "object"
        ]
        for name in object_columns:
            features[name] = features[name].astype('category')
        encoded = OneHotEncoder().fit_transform(features)

        # Outcome: boolean status renamed to "event", ordered as time, event.
        outcome = dataset[["status", "time"]].reindex(
            columns=["status", "time"])
        outcome["status"] = outcome["status"].astype('bool')
        pd_y_values = (
            outcome.copy()
            .rename(index=int, columns={"status": "event"})
            .reindex(columns=["time", "event"])
        )

        # Copy of the encoded features with the time column attached.
        timed = encoded.copy()
        timed['time'] = pd_y_values["time"]

        return encoded, pd_y_values, timed
def fit_and_prepare(x_train, y_train, test_df):
    """Fit a Cox proportional hazards model on one-hot encoded training data.

    Note that the inputs are modified in place: ``y_train["specific_death"]``
    is cast to bool, and every object-typed column of ``x_train`` is cast to
    ``category`` in both ``x_train`` and ``test_df``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame
        Training outcome; must contain a ``specific_death`` column.
    test_df : pandas.DataFrame
        Test features; must contain every object-typed column of ``x_train``.

    Returns
    -------
    tuple
        ``(estimator, x_test, x_train, y_train)``: the fitted estimator, the
        encoded test features, the (category-converted, not encoded)
        training features and the outcome as a record array.
    """
    # 3.1. Prepare Y-----
    # Bracket indexing is used deliberately: attribute-style assignment
    # (``frame.col = ...``) only updates a column when it already exists and
    # otherwise silently creates a plain Python attribute.
    y_train["specific_death"] = y_train["specific_death"].astype(bool)

    # Transform it into a structured array
    y_train = y_train.to_records(index=False)

    # 3.2. Prepare X-----
    # Names of the columns stored with object dtype.
    categorical_cols = x_train.columns[x_train.dtypes == object].tolist()

    # Ensure those columns are category type in both frames.
    for col in categorical_cols:
        x_train[col] = x_train[col].astype('category')
        test_df[col] = test_df[col].astype('category')

    # 3.3. Fit model-----
    encoder = OneHotEncoder()
    estimator = CoxPHSurvivalAnalysis()
    estimator.fit(encoder.fit_transform(x_train), y_train)

    # Encode the test features with the encoder fitted on the training data.
    x_test = encoder.transform(test_df)

    return (estimator, x_test, x_train, y_train)
    def test_transform(create_data):
        """A fitted encoder transforms new data, regardless of column order."""
        training_frame, _ = create_data()
        encoder = OneHotEncoder().fit(training_frame)

        data, expected_data = create_data(165)
        tm.assert_frame_equal(encoder.transform(data), expected_data)

        # Same columns in a different order must give the same result.
        column_groups = (data.iloc[:, :2], data.iloc[:, 5:], data.iloc[:, 2:5])
        shuffled = pd.concat(column_groups, axis=1)
        tm.assert_frame_equal(encoder.transform(shuffled), expected_data)
    def test_transform_other_columns(create_data):
        """transform() raises ValueError naming the columns that are missing."""
        fit_frame, _ = create_data()
        t = OneHotEncoder().fit(fit_frame)

        data, _ = create_data(125)

        # (frame lacking fitted columns, expected error message pattern)
        cases = [
            (data.rename(columns={"binary_1": "renamed_1"}),
             r"1 features are missing from data: \['binary_1'\]"),
            (data.drop('trinary', axis=1),
             r"1 features are missing from data: \['trinary'\]"),
            (data.rename(columns={"binary_1": "renamed_1",
                                  "many": "too_many"}),
             r"2 features are missing from data: \['binary_1', 'many'\]"),
        ]

        for frame, pattern in cases:
            with pytest.raises(ValueError, match=pattern):
                t.transform(frame)
Пример #5
0
    def test_fit_unpenalized():
        """Check CoxPHSurvivalAnalysis coefficients with per-feature alphas.

        The first ``len(cols_unpen)`` features get alpha 0.0, all remaining
        features get alpha 1.0; the fitted coefficients are compared against
        the reference values below.
        """
        X, y = load_breast_cancer()
        # Drop the rows whose grade equals the literal "unkown" (spelled as in
        # the comparison below) from both the features and the outcome.
        included = X["grade"] != "unkown"
        X = X.loc[included, :]
        y = y[included.values]

        # Rebuild "grade" as a categorical restricted to the three listed
        # categories.
        X["grade"] = pandas.Series(pandas.Categorical(
            X["grade"].astype(object),
            categories=["intermediate", "poorly differentiated",
                        "well differentiated"]),
            index=X.index, name="grade")

        enc = OneHotEncoder()
        X = enc.fit_transform(X)

        # Move the columns that get alpha 0.0 to the front so that they line
        # up with the leading entries of ``alphas``.
        cols_unpen = ['age', 'size', 'grade=poorly differentiated',
                      'grade=well differentiated', 'er=positive']
        X = pandas.concat((
            X.loc[:, cols_unpen],
            X.drop(cols_unpen, axis=1)),
            axis=1)

        alphas = numpy.ones(X.shape[1])
        alphas[:len(cols_unpen)] = 0.0

        cph = CoxPHSurvivalAnalysis(alpha=alphas)
        cph.fit(X, y)

        # Reference coefficients, in the column order of X built above.
        coef = numpy.array([
            -0.0228825990482334, 0.635554486750423, -0.242079636336473,
            -1.30197563647684, -2.27790151300312,
            0.291950212930807, 0.210861165049552, -0.612456645638769, -0.453414844486013, -0.1239424190253,
            0.196855946938761, 1.08724198521351, -0.313645443818603, -0.660016141198812, 1.07104977404073,
            0.559632480471393, -0.47740746012516, -1.26199769642326, -1.40486191330444, -0.418517018253652,
            0.284936091689505, -0.215531076378674, -0.200889269720281, 0.341231176941461, 0.0307350667648337,
            -0.212527052910377, -0.3019678509188, 0.54491723178866, -0.286914381308269, 0.370374100647823,
            -0.496258248067704, 0.624528657777646, 0.287884026214139, 0.022095151910937, 0.910293732936019,
            -0.13076488639207, 0.0857209529827562, -0.0922302696963889, 0.498136631416287, 0.937133644376614,
            0.395090607856869, -1.04727952099579, -0.54974694800345, 0.442372971174454, -0.745558450753062,
            -0.0920496108021893, 0.75549238586293, 0.562496351046743, 0.259183349320614, 0.405816113039412,
            -0.0969485695700491, -0.507388915258978, -0.474246597197329, -0.209335517183595, 0.187390427612498,
            -0.0522568530719332, 0.0806559868641646, -0.0397654339013217, -0.269582356665396, 0.791793553908743,
            0.344208857844796, -0.180165785909583, -0.7927695046551, 0.0311635012097026, -0.579429950080662,
            -0.264770995160963, 0.869512689697827, 0.765479119494175, -0.173588059680979, -0.199781736503338,
            -0.58712767650975, -0.457389854855, 0.3891865514653, 0.707309743580534, -0.121997864690072,
            0.0447174402649954, 0.0319336975869795, 0.0117988435665652, -0.593691059339064, -0.838107176656365,
            -0.247955128152877
        ])

        assert_array_almost_equal(cph.coef_, coef)
Пример #6
0
    def test_alpha_too_small(self):
        """Fitting the selected rows with the given alpha raises ArithmeticError."""
        # Positional indices of the rows used for fitting.
        index = numpy.array([
            0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
            22, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 36, 39, 40, 41, 42, 43,
            44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63,
            64, 65, 66, 68, 70, 71, 72, 75, 76, 78, 79, 80, 82, 84, 85, 86, 87,
            88, 90, 91, 92, 93, 94, 95, 98, 99, 100, 102, 103, 104, 105, 107,
            108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121,
            124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 137, 138,
            139, 140, 143, 144, 145, 147, 148, 150, 151, 153, 154, 155, 156,
            157, 158, 160, 161, 164, 165, 166, 167, 168, 169, 170, 171, 172,
            174, 175, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188,
            190, 191, 192, 193, 194, 195, 196, 197
        ])

        X, y = load_breast_cancer()
        encoded = OneHotEncoder().fit_transform(X)
        Xf = encoded.iloc[index]
        yf = y[index]

        nn = CoxnetSurvivalAnalysis(alphas=[0.007295025406624247],
                                    l1_ratio=1.0)

        with self.assertRaisesRegex(
                ArithmeticError,
                "Numerical error, because weights are too large. Consider increasing alpha."):
            nn.fit(Xf, yf)
    def test_fit(create_data):
        """fit() records feature names, encoded column names and categories."""
        data, expected_data = create_data()
        categorical_names = ['binary_1', 'binary_2', 'trinary', 'many']

        t = OneHotEncoder().fit(data)

        assert t.feature_names_.tolist() == categorical_names
        assert set(t.encoded_columns_) == set(expected_data.columns)

        expected_categories = {
            k: data[k].cat.categories for k in categorical_names
        }
        assert t.categories_ == expected_categories
Пример #8
0
    def test_fit(self):
        """fit() records feature names, encoded column names and categories."""
        data = create_data()
        expected_columns = set(encoded_data(data).columns)
        categorical_names = ['binary_1', 'binary_2', 'trinary', 'many']

        t = OneHotEncoder().fit(data)

        self.assertListEqual(t.feature_names_.tolist(), categorical_names)
        self.assertSetEqual(set(t.encoded_columns_), expected_columns)

        expected_categories = {
            k: data[k].cat.categories for k in categorical_names
        }
        self.assertDictEqual(t.categories_, expected_categories)
Пример #9
0
def test_brier_coxph():
    """Check the Brier score at t=1825 of a Cox model (Efron ties) on GBSG2."""
    X, y = load_gbsg2()
    # Replace each "tgrade" label by its length, as an integer column.
    # Plain column assignment is required here: ``X.loc[:, "tgrade"] = ...``
    # writes into the existing column on pandas >= 2.0 instead of replacing
    # it, so the column would not reliably end up with an integer dtype.
    X["tgrade"] = X.loc[:, "tgrade"].map(len).astype(int)

    Xt = OneHotEncoder().fit_transform(X)

    est = CoxPHSurvivalAnalysis(ties="efron").fit(Xt, y)
    survs = est.predict_survival_function(Xt)

    # Evaluate every predicted survival function at the single time point.
    preds = [fn(1825) for fn in survs]

    # The same data serves as training and test set for the score.
    _, score = brier_score(y, y, preds, 1825)

    # Equal to the reference value up to 5 decimal places.
    assert round(abs(score[0] - 0.208817407492645), 5) == 0
Пример #10
0
def test_pipeline_predict(breast_cancer, name, func):
    """A forest inside an encoding pipeline predicts like one on pre-encoded data."""
    X_str, _ = load_breast_cancer()
    X_num, y = breast_cancer

    forest_cls = FORESTS[name]
    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Reference: estimator fitted directly on the numeric features.
    est = forest_cls(**forest_kwargs)
    est.fit(X_num[10:], y[10:])

    # Candidate: same estimator behind a OneHotEncoder, fed the raw features.
    pipe = make_pipeline(OneHotEncoder(), forest_cls(**forest_kwargs))
    pipe.fit(X_str[10:], y[10:])

    # The first ten rows were held out from fitting.
    expected = getattr(est, func)(X_num[:10], return_array=True)
    actual = getattr(pipe, func)(X_str[:10], return_array=True)

    assert_array_almost_equal(expected, actual)
Пример #11
0
    def convert_dataset(dataset):
        """Split the data set into encoded features and outcome frames.

        Object-typed columns of ``dataset`` are converted to ``category`` in
        place, i.e. the caller's frame is modified.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must contain the feature columns listed below plus "time" and
            "cens".

        Returns
        -------
        tuple
            ``(features, outcome, timed_features)`` where ``features`` is the
            one-hot encoded feature frame, ``outcome`` has the columns
            ``["time", "event"]`` and ``timed_features`` is a copy of
            ``features`` with an additional "time" column.
        """
        # Convert string columns to categorical type.
        for col in dataset.columns:
            if str(dataset[col].dtype) == "object":
                # Plain column assignment replaces the column, so its dtype
                # really becomes 'category'. ``dataset.loc[:, col] = ...``
                # writes into the existing object column on pandas >= 2.0 and
                # therefore leaves the dtype unchanged.
                dataset[col] = dataset[col].astype('category')

        feature_columns = [
            "horTh", "age", "menostat", "tsize", "tgrade", "pnodes", "progrec",
            "estrec"
        ]
        data_x_numeric = OneHotEncoder().fit_transform(dataset[feature_columns])

        # Outcome: boolean "cens" renamed to "event", ordered as time, event.
        data_y = dataset[["time", "cens"]].reindex(columns=["cens", "time"])
        data_y["cens"] = data_y["cens"].astype('bool')

        pd_y_values = data_y.copy()
        pd_y_values = pd_y_values.rename(index=int, columns={"cens": "event"})
        pd_y_values = pd_y_values.reindex(columns=["time", "event"])

        # Copy of the encoded features with the time column attached.
        test_timed_data = data_x_numeric.copy()
        test_timed_data['time'] = pd_y_values["time"]

        return data_x_numeric, pd_y_values, test_timed_data
Пример #12
0
def test_pipeline_predict(func):
    """A forest inside an encoding pipeline predicts like one on pre-encoded data."""
    X_str, y = load_breast_cancer()
    X_num = encode_categorical(X_str)
    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Reference: estimator fitted directly on the encoded features.
    est = RandomSurvivalForest(**forest_kwargs)
    est.fit(X_num[10:], y[10:])

    # Candidate: same estimator behind a OneHotEncoder, fed the raw features.
    pipe = make_pipeline(OneHotEncoder(), RandomSurvivalForest(**forest_kwargs))
    pipe.fit(X_str[10:], y[10:])

    # The first ten rows were held out from fitting.
    expected = getattr(est, func)(X_num[:10])
    actual = getattr(pipe, func)(X_str[:10])

    assert_array_almost_equal(expected, actual)
Пример #13
0
    def test_transform_other_columns(self):
        """transform() raises ValueError naming the columns that are missing."""
        data = create_data()

        t = OneHotEncoder().fit(data)
        data = create_data(125)

        # The patterns are raw strings: "\[" and "\]" are regex escapes, not
        # valid Python string escapes, and a non-raw literal triggers an
        # invalid-escape-sequence warning.
        data_renamed = data.rename(columns={"binary_1": "renamed_1"})
        self.assertRaisesRegex(ValueError,
                               r"1 features are missing from data: \['binary_1'\]",
                               t.transform, data_renamed)

        data_dropped = data.drop('trinary', axis=1)
        self.assertRaisesRegex(ValueError,
                               r"1 features are missing from data: \['trinary'\]",
                               t.transform, data_dropped)

        data_renamed = data.rename(columns={"binary_1": "renamed_1", "many": "too_many"})
        self.assertRaisesRegex(ValueError,
                               r"2 features are missing from data: \['binary_1', 'many'\]",
                               t.transform, data_renamed)
Пример #14
0
def test_pipeline_predict(func):
    """Coxnet inside an encoding pipeline predicts like one on pre-encoded data."""
    X_str, y = load_breast_cancer()
    X_num = column.encode_categorical(X_str)
    coxnet_kwargs = {
        "alpha_min_ratio": 0.0001,
        "l1_ratio": 1.0,
        "fit_baseline_model": True,
    }

    # Reference: estimator fitted directly on the encoded features.
    est = CoxnetSurvivalAnalysis(**coxnet_kwargs)
    est.fit(X_num[10:], y[10:])

    # Candidate: same estimator behind a OneHotEncoder, fed the raw features.
    pipe = make_pipeline(OneHotEncoder(),
                         CoxnetSurvivalAnalysis(**coxnet_kwargs))
    pipe.fit(X_str[10:], y[10:])

    # The first ten rows were held out from fitting.
    expected = getattr(est, func)(X_num[:10])
    actual = getattr(pipe, func)(X_str[:10])

    # Compare the step functions pairwise through their x and y arrays.
    for reference_fn, pipeline_fn in zip(expected, actual):
        assert_array_almost_equal(reference_fn.x, pipeline_fn.x)
        assert_array_almost_equal(reference_fn.y, pipeline_fn.y)
    def test_fit_transform(create_data):
        """fit_transform() returns the expected encoded frame."""
        data, expected_data = create_data()

        encoder = OneHotEncoder()
        tm.assert_frame_equal(encoder.fit_transform(data), expected_data)
Пример #16
0
plt.legend(loc="best")

# One Kaplan-Meier curve per distinct value of the "Celltype" column.
for value in data_x["Celltype"].unique():
    mask = data_x["Celltype"] == value
    time_cell, survival_prob_cell = kaplan_meier_estimator(
        data_y["Status"][mask],
        data_y["Survival_in_days"][mask])
    plt.step(time_cell, survival_prob_cell, where="post",
             label="%s (n = %d)" % (value, mask.sum()))

# Raw string: "\h" is not a valid Python escape sequence, so a non-raw
# literal triggers an invalid-escape-sequence warning.
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

from sksurv.preprocessing import OneHotEncoder

# One-hot encode the features so they can be passed to the estimator.
data_x_numeric = OneHotEncoder().fit_transform(data_x)
data_x_numeric.head()

from sksurv.linear_model import CoxPHSurvivalAnalysis

estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x_numeric, data_y)

# Coefficients labelled with the encoded column names.
pd.Series(estimator.coef_, index=data_x_numeric.columns)

# Four hand-written rows (keyed 1-4) using the encoded columns as schema.
x_new = pd.DataFrame.from_dict({
    1: [65, 0, 0, 1, 60, 1, 0, 1],
    2: [65, 0, 0, 1, 60, 1, 0, 0],
    3: [65, 0, 1, 0, 60, 1, 0, 0],
    4: [65, 0, 1, 0, 60, 1, 0, 1]},
    columns=data_x_numeric.columns, orient='index')
Пример #17
0
# Collect the columns whose censored share exceeds 60 % and drop them.
not_enough_data.extend(
    col for col in x.columns if censored_percentage[col] > 0.6)

x = x.drop(not_enough_data, axis=1)

# Impute missing values with mode
x = CustomImputer(strategy='mode').fit_transform(x)

# Removes low-variance categorical features
categorical = x.select_dtypes(['object']).columns
# ``astype`` returns a new frame, which avoids assigning into a slice of
# ``x`` (chained assignment) while converting every object column.
cat = x[categorical].astype('category')
cat = OneHotEncoder().fit_transform(cat)
# Threshold written as p * (1 - p) with p = 0.8.
selector = VarianceThreshold(.8 * (1 - .8))
# Fit and transform once; the result is reused for the frame below.
selected_values = selector.fit_transform(cat)
columns = cat.columns
labels_c = [columns[index] for index in selector.get_support(indices=True)]
selected_categorical = pd.DataFrame(selected_values,
                                    columns=labels_c)

# Feature selection for numeric features
numeric = x.select_dtypes(['float64']).columns
num = x[numeric]
selector = SelectFpr(score_func=f_regression, alpha=0.05)
selected_numeric = selector.fit_transform(num, survival)
columns = num.columns
]]

# Outcome: duration and a boolean event flag (True where LapseIndicator == 1).
T = df['Duration']

E = df['LapseIndicator'].apply(lambda flag: bool(flag == 1))

df2['E'] = E
df2['T'] = T

X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

# Every feature except 'AGE AT DOC' is treated as categorical.
category_columns = [c for c in X.columns.values if c != 'AGE AT DOC']
for c in category_columns:
    X[c] = X[c].astype('category')

data_x_numeric = OneHotEncoder().fit_transform(X)
#%%

estimator = CoxnetSurvivalAnalysis(verbose=True)
estimator.fit(data_x_numeric, y)
#%%

# Score of the fitted model on the data it was trained on.
training_score = estimator.score(data_x_numeric, y)
print(training_score)
print()

# Per-feature scores, printed in descending order.
scores = fit_and_score_features(data_x_numeric.values, y)
ranked_scores = pd.Series(scores, index=data_x_numeric.columns)
print(ranked_scores.sort_values(ascending=False))
#%%
Пример #19
0
# Intersection between two files
# df=intersection(df1, df2)

# Adding genes based on the sample, status and time
# df=merge_frames(df1,df2)

# Cast the "Status" column to bool.
df["Status"] = df["Status"].astype(bool)

# Columns at positions 1 and 2 form the outcome (presumably status and time -
# confirm against the input file); all columns from position 3 on are features.
data = df.iloc[:, 1:3]
X = df.iloc[:, 3:]

# Outcome as a record array, one record per row.
Y = data.to_records(index=False)

X = OneHotEncoder().fit_transform(X)

# Fit with 100 alpha values (n_alphas=100) and l1_ratio=1.
estimator = CoxnetSurvivalAnalysis(
    n_alphas=100,
    l1_ratio=1,
    alpha_min_ratio=0.01,
    max_iter=10000,
)
estimator.fit(X, Y)

# Coefficients per feature (rows) and per alpha rounded to 5 digits (columns).
alphas = estimator.alphas_
coefficients_lasso = pd.DataFrame(
    estimator.coef_,
    index=X.columns,
    columns=np.round(estimator.alphas_, 5),
)

print(coefficients_lasso)
Пример #20
0
    ]]

    # Outcome: duration and a boolean event flag (True where
    # LapseIndicator == 1).
    T = df['Duration']

    E = df['LapseIndicator'] == 1

    df2['E'] = E
    df2['T'] = T

    X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

    # Every feature except 'AGE AT DOC' is treated as categorical.
    for c in X.columns.values:
        if c != 'AGE AT DOC':
            X[c] = X[c].astype('category')

    data_x_numeric = OneHotEncoder().fit_transform(X)

    from sklearn.decomposition import NMF

    # Reduce the encoded features to 8 NMF components. fit_transform returns
    # a plain array, so the encoded column names are gone after this step.
    model = NMF(n_components=8)
    data_x_numeric = model.fit_transform(data_x_numeric)

    #%%

    estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000)
    estimator.fit(data_x_numeric, y)
    #%%

    # Label the coefficients by component number: the array produced by NMF
    # has no ``columns`` attribute, so ``data_x_numeric.columns`` would raise
    # AttributeError here.
    component_names = [
        "component_{}".format(i) for i in range(data_x_numeric.shape[1])
    ]
    print()
    print(pd.Series(estimator.coef_, index=component_names))
    print()
Пример #21
0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from sksurv.datasets import load_gbsg2
from sksurv.preprocessing import OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest

X, y = load_gbsg2()

# "tgrade" is encoded ordinally with the category order I, II, III.
grade_encoder = OrdinalEncoder(categories=[["I", "II", "III"]])
grade_str = X.loc[:, "tgrade"].astype(object).values[:, np.newaxis]
grade_num = grade_encoder.fit_transform(grade_str)

# All remaining columns are one-hot encoded; the ordinal grade is appended
# as the last column of the feature matrix.
X_no_grade = X.drop("tgrade", axis=1)
encoded_no_grade = OneHotEncoder().fit_transform(X_no_grade)
Xt = np.column_stack((encoded_no_grade.values, grade_num))

feature_names = X_no_grade.columns.tolist() + ["tgrade"]

random_state = 20

# Hold out 25 % of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    Xt, y, test_size=0.25, random_state=random_state)

rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
Пример #22
0
def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    """Plot the time-dependent AUC of ``risk_score`` on the current figure.

    Reads ``y_train``, ``y_test`` and ``times`` from the enclosing scope.
    The per-time AUC is drawn as a marked line and its mean as a dashed
    horizontal line in the same colour.
    """
    auc_per_time, auc_mean = cumulative_dynamic_auc(
        y_train, y_test, risk_score, times)

    plt.plot(times, auc_per_time, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    plt.axhline(auc_mean, color=color, linestyle="--")
    plt.legend()


# One AUC curve per numeric column, using the raw column values as risk score.
for i, col in enumerate(num_columns):
    column_scores = x_test[:, i]
    plot_cumulative_dynamic_auc(column_scores, col, color=f"C{i}")
    ret = concordance_index_ipcw(y_train, y_test, column_scores,
                                 tau=times[-1])

from sksurv.datasets import load_veterans_lung_cancer

va_x, va_y = load_veterans_lung_cancer()

# Cox model behind a one-hot encoder, fitted on the full data set.
cph = make_pipeline(OneHotEncoder(), CoxPHSurvivalAnalysis())
cph.fit(va_x, va_y)

# Evaluation times: every 7 days from day 7 up to (excluding) day 183.
va_times = np.arange(7, 183, 7)
# estimate performance on training data, thus use `va_y` twice.
va_risk_scores = cph.predict(va_x)
va_auc, va_mean_auc = cumulative_dynamic_auc(va_y, va_y, va_risk_scores,
                                             va_times)

plt.plot(va_times, va_auc, marker="o")
plt.axhline(va_mean_auc, linestyle="--")
plt.xlabel("days from enrollment")
plt.ylabel("time-dependent AUC")
plt.grid(True)