def _k_bins(self, original_data):
        # TODO: expose the discretizer settings (n_bins, encode, strategy)
        # via keyword arguments instead of hard-coding them below.

        df_data = original_data.copy()

        # set attributes to be discretized
        if not UserInputs.attr_2disc_names:
            attrs = list(df_data.columns.values)
            attrs2remove = [
                UserInputs.attr_survival_name, UserInputs.attr_event_name
            ]
            if UserInputs.attr_id_name is not None:
                attrs2remove = attrs2remove + [UserInputs.attr_id_name]
            if UserInputs.attr_to_ignore:
                attrs2remove = attrs2remove + UserInputs.attr_to_ignore
            attrs2disc = [attr for attr in attrs if attr not in attrs2remove]
        else:
            attrs2disc = UserInputs.attr_2disc_names

        # Discretization:
        enc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
        for attr in attrs2disc:
            to_disc = np.array(df_data[attr]).reshape(-1, 1)
            data_enc = enc.fit_transform(to_disc)
            data_disc = enc.inverse_transform(data_enc)
            if UserInputs.save_log:
                self._save_log(attr, df_data[attr], data_enc, data_disc)
            df_data[attr] = data_enc.ravel()  # replace raw values with the ordinal bin codes

        if UserInputs.save_log:
            df_data.to_csv(self._save_file, index=False)

        return df_data
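
A minimal, self-contained sketch of the same quantile round trip, using a toy
DataFrame in place of the project's UserInputs configuration (the column name
and settings below are made up):

import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

df = pd.DataFrame({"age": np.random.default_rng(0).uniform(20, 80, 100)})
enc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
codes = enc.fit_transform(df[["age"]])     # ordinal bin codes, shape (100, 1)
centres = enc.inverse_transform(codes)     # bin centres, what the log records
df["age"] = codes.ravel()                  # replace raw values with bin codes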
Example #2
class DiscretizeTransformer(object):
    """Discretize continuous columns into several bins.
    Transformation result is a int array."""
    def __init__(self, meta, n_bins):
        self.meta = meta
        self.c_index = [
            index for index, info in enumerate(meta)
            if info['type'] == CONTINUOUS
        ]
        self.kbin_discretizer = KBinsDiscretizer(n_bins=n_bins,
                                                 encode='ordinal',
                                                 strategy='uniform')

    def fit(self, data):
        if not self.c_index:
            return
        self.kbin_discretizer.fit(data[:, self.c_index])

    def transform(self, data):
        if not self.c_index:
            return data.astype('int')

        data_t = data.copy()
        data_t[:, self.c_index] = self.kbin_discretizer.transform(
            data[:, self.c_index])
        return data_t.astype('int')

    def inverse_transform(self, data):
        if not self.c_index:
            return data

        data_t = data.copy().astype('float32')
        data_t[:, self.c_index] = self.kbin_discretizer.inverse_transform(
            data[:, self.c_index])
        return data_t
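
A hypothetical usage sketch for the class above; the CONTINUOUS tag and the toy
meta list are assumptions modelled on how __init__ reads them:

import numpy as np

CONTINUOUS = 'continuous'  # assumed value of the project's column-type tag
meta = [{'type': CONTINUOUS}, {'type': 'categorical'}]
data = np.column_stack([
    np.random.default_rng(0).uniform(0, 100, 10),  # continuous column
    np.random.default_rng(1).integers(0, 3, 10),   # categorical column
])

dt = DiscretizeTransformer(meta, n_bins=4)
dt.fit(data)
binned = dt.transform(data)              # continuous column becomes a bin index
restored = dt.inverse_transform(binned)  # bin indices back to bin centres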
Example #3
class DiscretizeTransformer(Transformer):
    """Discretize continuous columns into several bins.

    Attributes:
        meta
        column_index
        discretizer(sklearn.preprocessing.KBinsDiscretizer)

    Transformation result is an int array.

    """
    def __init__(self, n_bins):
        self.n_bins = n_bins
        self.meta = None
        self.column_index = None
        self.discretizer = None

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.meta = self.get_metadata(data, categorical_columns,
                                      ordinal_columns)
        self.column_index = [
            index for index, info in enumerate(self.meta)
            if info['type'] == CONTINUOUS
        ]

        self.discretizer = KBinsDiscretizer(n_bins=self.n_bins,
                                            encode='ordinal',
                                            strategy='uniform')

        if not self.column_index:
            return

        self.discretizer.fit(data[:, self.column_index])

    def transform(self, data):
        """Transform data, discretizing continuous values.

        Args:
            data(numpy.ndarray)

        Returns:
            numpy.ndarray

        """
        if not self.column_index:
            return data.astype('int')

        data = data.copy()  # work on a copy so the caller's array is not mutated
        data[:, self.column_index] = self.discretizer.transform(
            data[:, self.column_index])
        return data.astype('int')

    def inverse_transform(self, data):
        if not self.column_index:
            return data

        data = data.astype('float32')  # astype copies, so the input array is untouched
        data[:, self.column_index] = self.discretizer.inverse_transform(
            data[:, self.column_index])
        return data
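
The same column-subset pattern, reduced to a self-contained sketch that needs
no Transformer base class:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

data = np.column_stack([np.linspace(0.0, 1.0, 8), np.arange(8) % 2])
column_index = [0]  # positions of the continuous columns
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
disc.fit(data[:, column_index])

out = data.copy()
out[:, column_index] = disc.transform(data[:, column_index])
print(out.astype('int'))  # first column now holds bin indices 0..3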
Example #4
def discretized_pca(taxi_data, num_components, num_bin_components):
    # normalize
    scaler = MinMaxScaler()
    taxi_data_rescaled = scaler.fit_transform(taxi_data)

    # pca
    print('pca processing')
    pca = PCA(n_components=num_components)
    pca.fit(taxi_data_rescaled)
    taxi_rep = pca.transform(taxi_data_rescaled)

    # test pca loss
    back_taxi_data = pca.inverse_transform(taxi_rep)
    back_taxi_data = scaler.inverse_transform(back_taxi_data)

    average_loss = 0
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - back_taxi_data[i]
        loss = np.sum(diff * diff)
        average_loss += loss
    print('pca loss: {:.6f}'.format(float(average_loss/taxi_data.size)))

    # for i in range(config['num_components']):
    #     print(np.min(taxi_data[:, i]), np.max(taxi_data[:, i]))
    #     print(np.min(taxi_rep[:, i]), np.max(taxi_rep[:, i]))

    # discretization
    est = KBinsDiscretizer(n_bins=num_bin_components, encode='ordinal', strategy='uniform')
    est.fit(taxi_rep)
    disc_taxi_rep = est.transform(taxi_rep)
    # for i in range(5):
    #     tools.print_random_pos(disc_taxi_rep)

    # test discretized pca loss
    disc_taxi_rep2 = est.inverse_transform(disc_taxi_rep)
    back_disc_taxi_data = pca.inverse_transform(disc_taxi_rep2)
    back_disc_taxi_data = scaler.inverse_transform(back_disc_taxi_data)

    average_loss = 0
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - back_disc_taxi_data[i]
        loss = np.sum(diff * diff)
        average_loss += loss
    print('discretized pca loss: {:.6f}'.format(float(average_loss/taxi_data.size)))

    # test a "ridiculous" baseline loss: predict all zeros for every row
    average_loss = 0
    test_line = np.zeros(shape=taxi_data[0].shape)  # all-zeros baseline prediction
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - test_line
        loss = np.sum(diff * diff)
        average_loss += loss
    print('ridiculous loss: {:.6f}'.format(float(average_loss/taxi_data.size)))
    print(f'num components: {disc_taxi_rep.shape[1]}')

    return disc_taxi_rep
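
A hypothetical driver for the function above on synthetic data (the real
taxi_data loading is project-specific, and the module is assumed to import
numpy, MinMaxScaler, PCA, and KBinsDiscretizer as the function requires):

import numpy as np

synthetic = np.random.default_rng(0).random((500, 20))
rep = discretized_pca(synthetic, num_components=8, num_bin_components=16)
print(rep.shape)  # (500, 8); each entry is a bin index in [0, 15]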
Example #5
def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
Example #6
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
Example #8
    class MeanBinner():
        def __init__(self):
            self.binner = KBinsDiscretizer(n_bins=10,
                                           encode='ordinal',
                                           strategy='quantile')

        def fit(self, X, y=None):
            self.binner.fit(X)
            return self  # keep the usual sklearn fit() contract

        def transform(self, X, y=None):
            # Replace every value with the centre of its quantile bin;
            # KBinsDiscretizer.inverse_transform maps bin codes to bin centres.
            binned = self.binner.transform(X)
            return self.binner.inverse_transform(binned)
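
A minimal sketch of the binner above in use (assuming the class is importable
at module scope): every value is replaced by the centre of its quantile bin,
so it smooths rather than averages, despite the name:

import numpy as np

X = np.random.default_rng(0).normal(size=(200, 1))
mb = MeanBinner()
mb.fit(X)
print(np.unique(mb.transform(X)).size)  # at most 10 distinct bin-centre values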
Example #9
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)
    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
Example #11
File: prep.py  Project: TonyLv/mlpl
def binning_values(df, col, strategy='quantile', bins=10):
    """Binning, but returns group values instead of group names. Returns pd.Series.
    
    Args:
        df (pd.DataFrame): dataframe
        col (str): column name 
        strategy (str): bin strategy for sklearn.KBinsDiscretizer()
        bins (int): bin count
        
    Returns:
        binned_col (pd.Series): result of binning, but values
    """

    col = utils.tolist(col)
    disc = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)
    # NaNs fall into the lowest bin via the -99999 sentinel
    binned = disc.fit_transform(df[col].fillna(-99999).to_numpy())
    return pd.Series(disc.inverse_transform(binned).flatten())
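
A hypothetical call, assuming utils.tolist simply wraps a bare column name in
a list:

import numpy as np
import pandas as pd

df = pd.DataFrame({'fare': np.random.default_rng(0).exponential(10.0, 1000)})
smoothed = binning_values(df, 'fare', strategy='quantile', bins=5)
print(smoothed.nunique())  # at most 5 distinct bin-centre values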
Example #12
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
Example #13
def main(use_simple_lr_pca_pipeline, kbins_strat, train_split, test_split,
         exclude_pca, hyperparameters, output_size, validation_size, n_process,
         precached_pkl, prestore_data, return_mode,
         use_simple_lin_reg_pca_pipeline, use_simple_lstm, discretize_age,
         kbins_encoding, num_epochs, num_pca_comp):
    if precached_pkl is not None:
        allData = pkl.load(open(precached_pkl, 'rb'))
        data = allData["data"]
        # clinical_txt_paths = allData["clinical_txt_paths"]
        ages = allData["ages"]
        testAges = allData["testAges"]
        testData = allData["testData"]
        # test_clinical_txt_paths = allData["test_clinical_txt_paths"]
    else:
        data, ages, clinical_txt_paths = get_data(split=train_split)
        testData, testAges, test_clinical_txt_paths = get_data(
            split=test_split)
    return_dict = Dict()

    if prestore_data:
        # NOTE: this branch assumes data came from get_data() above;
        # clinical_txt_paths is not defined on the precached_pkl path.
        toStore = Dict()
        toStore.data = data
        toStore.ages = ages
        toStore.clinical_txt_paths = clinical_txt_paths
        toStore.testData = testData
        toStore.testAges = testAges
        toStore.test_clinical_txt_paths = test_clinical_txt_paths
        if return_mode == "age":
            pkl.dump(toStore, open("agePredictionData.pkl", 'wb'))
        elif return_mode == "bpm":
            pkl.dump(toStore, open("bpmPredictionData.pkl", 'wb'))
        return return_mode

    if discretize_age:
        kbins = KBinsDiscretizer(output_size,
                                 encode=kbins_encoding,
                                 strategy=kbins_strat)
        ages = np.array(ages).reshape(-1, 1)
        ages = kbins.fit_transform(ages)
        return_dict['kbins'] = kbins.bin_edges_
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = kbins.transform(testAges)
        print("KBins used!  Edges are: {}".format(kbins.bin_edges_))

    if use_simple_lstm:
        ageScaler = StandardScaler()
        ages = np.array(ages).reshape(-1, 1)
        ages = ageScaler.fit_transform(ages)
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = ageScaler.transform(testAges)
        model = get_lstm()
        x = pad_sequences(data)
        model.fit(x,
                  ages,
                  epochs=num_epochs,
                  validation_split=validation_size,
                  callbacks=get_early_stopping())
        testX = pad_sequences(testData)
        score = model.evaluate(testX, testAges)
        y_pred = model.predict(testX)

        ages = ageScaler.inverse_transform(ages)
        testAges = ageScaler.inverse_transform(testAges)
        y_pred = ageScaler.inverse_transform(y_pred)  # bring predictions back to years too
        mse = mean_squared_error(testAges, y_pred)
        r2 = r2_score(testAges, y_pred)
        print("MSE: {}".format(mse))
        print("R2: {}".format(r2))
        fn = "model_{}_epochs{}.h5".format(return_mode, num_epochs)
        model.save(fn)
        ex.add_artifact(fn)
        return score, mse, r2

    if use_simple_lin_reg_pca_pipeline:
        ages = np.array(ages).reshape(-1, 1)
        testAges = np.array(testAges).reshape(-1, 1)
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)

        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lin_reg', LinearRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p,
                                  hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv,
                                  n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)

        y_pred = best_pipeline.predict(data)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        print("train r^2 was {}".format(r2_score(ages, y_pred)))

        y_pred = best_pipeline.predict(testData)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        test_score = mean_squared_error(testAges, y_pred)
        print("test_score: {}".format(test_score))
        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
        return_dict["test_score"] = test_score
        pkl.dump(return_dict,
                 open("predict_{}Exp.pkl".format(return_mode), 'wb'))
        ex.add_artifact("predict_{}Exp.pkl".format(return_mode))
        return test_score, r2_score(testAges, y_pred)

    if use_simple_lr_pca_pipeline:
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)

        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lr', LogisticRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p,
                                  hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv,
                                  n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)
        y_pred = best_pipeline.predict(data)
        print("train r^2 was {}".format(r2_score(ages, y_pred)))

        y_pred = best_pipeline.predict(testData)
        test_score = f1_score(testAges, y_pred, average="weighted")

        # discretize_age must be True for this branch; kbins is undefined otherwise
        y_pred_orig = kbins.inverse_transform(y_pred.reshape(-1, 1))
        test_ages_orig = kbins.inverse_transform(testAges.reshape(-1, 1))

        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
        print("test mse was {}".format(
            mean_squared_error(test_ages_orig, y_pred_orig)))

        print("test_score: f1 {}".format(test_score))
        print("test_score: accuracy {}".format(accuracy_score(
            testAges, y_pred)))

        return_dict["test_score"] = test_score
        pkl.dump(return_dict,
                 open("predict_{}Exp.pkl".format(return_mode), 'wb'))
        ex.add_artifact("predict_{}Exp.pkl".format(return_mode))
        return test_score

    raise ValueError("No valid pipeline configuration was set")
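
The discretize_age round trip used above, reduced to a self-contained sketch:
fit bins on the training ages, treat the bin labels as classification targets,
then map predicted labels back to representative ages:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

ages = np.random.default_rng(0).uniform(0, 90, 300).reshape(-1, 1)
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
age_labels = kbins.fit_transform(ages)            # targets for a classifier
pred_labels = age_labels[:20]                     # stand-in for model predictions
pred_ages = kbins.inverse_transform(pred_labels)  # back to years (bin centres)
print(kbins.bin_edges_[0])                        # the edges logged in main()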