def _k_bins(self, original_data):
    # TODO: expose n_bins, encode and strategy as keyword arguments instead of
    # hard-coding the discretizer settings below
    df_data = original_data.copy()

    # set attributes to be discretized
    if not UserInputs.attr_2disc_names:
        attrs = list(df_data.columns.values)
        attrs2remove = [
            UserInputs.attr_survival_name, UserInputs.attr_event_name
        ]
        if UserInputs.attr_id_name is not None:
            attrs2remove = attrs2remove + [UserInputs.attr_id_name]
        if UserInputs.attr_to_ignore:
            attrs2remove = attrs2remove + UserInputs.attr_to_ignore
        attrs2disc = [attr for attr in attrs if attr not in attrs2remove]
    else:
        attrs2disc = UserInputs.attr_2disc_names

    # discretization
    enc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
    for attr in attrs2disc:
        to_disc = np.array(df_data[attr]).reshape(-1, 1)
        data_enc = enc.fit_transform(to_disc)
        data_disc = enc.inverse_transform(data_enc)
        if UserInputs.save_log:
            self._save_log(attr, df_data[attr], data_enc, data_disc)
        # replace the attribute with its discretized (encoded) values;
        # ravel() so the single column receives a 1-D array
        df_data[attr] = data_enc.ravel()

    if UserInputs.save_log:
        df_data.to_csv(self._save_file, index=False)

    return df_data
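# The TODO above asks for n_bins/encode/strategy to be user-tunable. A minimal
# sketch of that refactor, assuming only sklearn's KBinsDiscretizer (the helper
# name is hypothetical, defaults mirror the hard-coded values in _k_bins):
from sklearn.preprocessing import KBinsDiscretizer

def make_discretizer(n_bins=5, encode='ordinal', strategy='quantile'):
    """Build the discretizer from user-tunable settings."""
    return KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)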
class DiscretizeTransformer(object):
    """Discretize continuous columns into several bins.

    Transformation result is an int array.
    """

    def __init__(self, meta, n_bins):
        self.meta = meta
        self.c_index = [
            id for id, info in enumerate(meta) if info['type'] == CONTINUOUS
        ]
        self.kbin_discretizer = KBinsDiscretizer(
            n_bins=n_bins, encode='ordinal', strategy='uniform')

    def fit(self, data):
        if self.c_index == []:
            return
        self.kbin_discretizer.fit(data[:, self.c_index])

    def transform(self, data):
        if self.c_index == []:
            return data.astype('int')
        data_t = data.copy()
        data_t[:, self.c_index] = self.kbin_discretizer.transform(
            data[:, self.c_index])
        return data_t.astype('int')

    def inverse_transform(self, data):
        if self.c_index == []:
            return data
        data_t = data.copy().astype('float32')
        data_t[:, self.c_index] = self.kbin_discretizer.inverse_transform(
            data[:, self.c_index])
        return data_t
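# A minimal usage sketch for DiscretizeTransformer, assuming CONTINUOUS is the
# module's marker for continuous columns and that `meta` is a list of
# per-column info dicts (toy data, for illustration only):
import numpy as np

CONTINUOUS = 'continuous'  # assumption: stand-in for the module's constant
meta = [{'type': CONTINUOUS}, {'type': 'categorical'}]
data = np.column_stack([np.random.rand(100), np.random.randint(0, 3, 100)])

dt = DiscretizeTransformer(meta, n_bins=5)
dt.fit(data)
binned = dt.transform(data)              # continuous column -> integer bin ids
restored = dt.inverse_transform(binned)  # bin ids -> bin centers (lossy)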
class DiscretizeTransformer(Transformer):
    """Discretize continuous columns into several bins.

    Attributes:
        meta
        column_index
        discretizer (sklearn.preprocessing.KBinsDiscretizer)

    Transformation result is an int array.
    """

    def __init__(self, n_bins):
        self.n_bins = n_bins
        self.meta = None
        self.column_index = None
        self.discretizer = None

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.meta = self.get_metadata(data, categorical_columns, ordinal_columns)
        self.column_index = [
            index for index, info in enumerate(self.meta)
            if info['type'] == CONTINUOUS
        ]
        self.discretizer = KBinsDiscretizer(
            n_bins=self.n_bins, encode='ordinal', strategy='uniform')
        if not self.column_index:
            return
        self.discretizer.fit(data[:, self.column_index])

    def transform(self, data):
        """Transform data, discretizing continuous values.

        Args:
            data (pandas.DataFrame)

        Returns:
            numpy.ndarray
        """
        if self.column_index == []:
            return data.astype('int')
        data[:, self.column_index] = self.discretizer.transform(
            data[:, self.column_index])
        return data.astype('int')

    def inverse_transform(self, data):
        if self.column_index == []:
            return data
        data = data.astype('float32')
        data[:, self.column_index] = self.discretizer.inverse_transform(
            data[:, self.column_index])
        return data
def discretized_pca(taxi_data, num_components, num_bin_components):
    # normalize
    scaler = MinMaxScaler()
    taxi_data_rescaled = scaler.fit_transform(taxi_data)

    # pca
    print('pca processing')
    pca = PCA(n_components=num_components)
    pca.fit(taxi_data_rescaled)
    taxi_rep = pca.transform(taxi_data_rescaled)

    # test pca loss
    back_taxi_data = pca.inverse_transform(taxi_rep)
    back_taxi_data = scaler.inverse_transform(back_taxi_data)
    average_loss = 0
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - back_taxi_data[i]
        loss = np.sum(diff * diff)
        average_loss += loss
    print('pca loss: {:.6f}'.format(float(average_loss / taxi_data.size)))
    # for i in range(config['num_components']):
    #     print(np.min(taxi_data[:, i]), np.max(taxi_data[:, i]))
    #     print(np.min(taxi_rep[:, i]), np.max(taxi_rep[:, i]))

    # discretization
    est = KBinsDiscretizer(n_bins=num_bin_components,
                           encode='ordinal',
                           strategy='uniform')
    est.fit(taxi_rep)
    disc_taxi_rep = est.transform(taxi_rep)
    # for i in range(5):
    #     tools.print_random_pos(disc_taxi_rep)

    # test discretized pca loss
    disc_taxi_rep2 = est.inverse_transform(disc_taxi_rep)
    back_disc_taxi_data = pca.inverse_transform(disc_taxi_rep2)
    back_disc_taxi_data = scaler.inverse_transform(back_disc_taxi_data)
    average_loss = 0
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - back_disc_taxi_data[i]
        loss = np.sum(diff * diff)
        average_loss += loss
    print('discretized pca loss: {:.6f}'.format(
        float(average_loss / taxi_data.size)))

    # test ridiculous loss (baseline against an all-zeros prediction)
    average_loss = 0
    test_line = np.zeros(shape=taxi_data[0].shape)  # was taxi_data[i]: stale loop index
    for i in range(len(taxi_data)):
        diff = taxi_data[i] - test_line
        loss = np.sum(diff * diff)
        average_loss += loss
    print('ridiculous loss: {:.6f}'.format(float(average_loss / taxi_data.size)))

    print(f'num components: {disc_taxi_rep.shape[1]}')
    return disc_taxi_rep
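# A hypothetical invocation on synthetic data; the array shape and component
# counts are assumptions chosen purely to exercise the function:
import numpy as np

taxi_data = np.random.rand(1000, 20)
disc_rep = discretized_pca(taxi_data, num_components=8, num_bin_components=16)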
def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
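# `strategy` is supplied by the test runner; with pytest that normally means a
# parametrize marker sits on the function. A plausible setup (an assumption --
# the decorator is not shown in the source):
import pytest

@pytest.mark.parametrize('strategy', ['uniform', 'quantile', 'kmeans'])
def test_inverse_transform_parametrized(strategy):
    test_inverse_transform(strategy)  # delegates to the test above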
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
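# Why [[0.5], [1.5], [2.5], [2.5]]: the default strategy is 'quantile', whose
# edges on [0, 1, 2, 3] with three bins are [0, 1, 2, 3], and inverse_transform
# maps each ordinal bin id to its bin midpoint (3 falls into the last bin).
# A quick check of that reasoning:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

est = KBinsDiscretizer(n_bins=3, encode="ordinal").fit(np.array([0, 1, 2, 3])[:, None])
edges = est.bin_edges_[0]            # [0., 1., 2., 3.]
print((edges[:-1] + edges[1:]) / 2)  # [0.5, 1.5, 2.5] -- the bin midpoints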
class MeanBinner():
    def __init__(self):
        self.binner = KBinsDiscretizer(n_bins=10,
                                       encode='ordinal',
                                       strategy='quantile')

    def fit(self, X, y=None):
        self.binner.fit(X)
        return self  # return self so the transformer composes in pipelines

    def transform(self, X, y=None):
        binned = self.binner.transform(X)
        return self.binner.inverse_transform(binned)
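# A minimal usage sketch (toy data): each value is snapped to the center of its
# decile bin, so transform() acts as a crude smoother. Note that despite the
# class name, KBinsDiscretizer.inverse_transform returns bin midpoints, not
# per-bin means.
import numpy as np

X = np.random.RandomState(0).exponential(size=(200, 1))
mb = MeanBinner()
mb.fit(X)
X_smoothed = mb.transform(X)  # same shape as X, values limited to 10 midpoints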
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)

    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
def binning_values(df, col, strategy='quantile', bins=10):
    """Binning, but returns group values instead of group names.

    Args:
        df (pd.DataFrame): dataframe
        col (str): column name
        strategy (str): bin strategy for sklearn.KBinsDiscretizer()
        bins (int): bin count

    Returns:
        binned_col (pd.Series): result of binning, but values
    """
    col = utils.tolist(col)
    disc = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)
    binned = disc.fit_transform(df[col].fillna(-99999).to_numpy())
    return pd.Series(disc.inverse_transform(binned).flatten())
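# A hypothetical call on a toy frame; `utils.tolist` is the project's helper,
# and this sketch assumes it simply wraps a scalar column name in a list. Note
# that fillna(-99999) pulls the lowest quantile edge far below the real data,
# so missing values all land in (and distort) the bottom bin.
import pandas as pd

df = pd.DataFrame({'income': [12, 35, 47, 88, 120, 150, 33, 61, 74, 95]})
binned_income = binning_values(df, 'income', strategy='quantile', bins=5)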
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
def main(use_simple_lr_pca_pipeline, kbins_strat, train_split, test_split,
         exclude_pca, hyperparameters, output_size, validation_size,
         n_process, precached_pkl, prestore_data, return_mode,
         use_simple_lin_reg_pca_pipeline, use_simple_lstm, discretize_age,
         kbins_encoding, num_epochs, num_pca_comp):
    if precached_pkl is not None:
        allData = pkl.load(open(precached_pkl, 'rb'))
        data = allData["data"]
        # clinical_txt_paths = precached_pkl["clinical_txt_paths"]
        ages = allData["ages"]
        testAges = allData["testAges"]
        testData = allData["testData"]
        # test_clinical_txt_paths = precached_pkl["test_clinical_txt_paths"]
    else:
        data, ages, clinical_txt_paths = get_data(split=train_split)
        testData, testAges, test_clinical_txt_paths = get_data(
            split=test_split)
    return_dict = Dict()

    if prestore_data:
        toStore = Dict()
        toStore.data = data
        toStore.ages = ages
        toStore.clinical_txt_paths = clinical_txt_paths
        toStore.testData = testData
        toStore.testAges = testAges
        toStore.test_clinical_txt_paths = test_clinical_txt_paths
        if return_mode == "age":
            pkl.dump(toStore, open("agePredictionData.pkl", 'wb'))
        elif return_mode == "bpm":
            pkl.dump(toStore, open("bpmPredictionData.pkl", 'wb'))
        return return_mode

    if discretize_age:
        kbins = KBinsDiscretizer(output_size,
                                 encode=kbins_encoding,
                                 strategy=kbins_strat)
        ages = np.array(ages).reshape(-1, 1)
        ages = kbins.fit_transform(ages)
        return_dict['kbins'] = kbins.bin_edges_
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = kbins.transform(testAges)
        print("KBins used! Edges are: {}".format(kbins.bin_edges_))

    if use_simple_lstm:
        ageScaler = StandardScaler()
        ages = np.array(ages).reshape(-1, 1)
        ages = ageScaler.fit_transform(ages)
        testAges = np.array(testAges).reshape(-1, 1)
        testAges = ageScaler.transform(testAges)
        model = get_lstm()
        x = pad_sequences(data)
        model.fit(x,
                  ages,
                  epochs=num_epochs,
                  validation_split=validation_size,
                  callbacks=get_early_stopping())
        testX = pad_sequences(testData)
        score = model.evaluate(testX, testAges)
        y_pred = model.predict(testX)
        ages = ageScaler.inverse_transform(ages)
        testAges = ageScaler.inverse_transform(testAges)
        # y_true goes first in the sklearn metrics (the original code passed
        # predictions first, which silently flips r2_score)
        mse = mean_squared_error(testAges, y_pred)
        r2 = r2_score(testAges, y_pred)
        print("MSE: {}".format(mse))
        print("R2: {}".format(r2))
        fn = "model_{}_epochs{}.h5".format(return_mode, num_epochs)
        model.save(fn)
        ex.add_artifact(fn)
        return score, mse, r2

    if use_simple_lin_reg_pca_pipeline:
        ages = np.array(ages).reshape(-1, 1)
        testAges = np.array(testAges).reshape(-1, 1)
        data = np.stack(data).reshape(len(data), -1)
        testData = np.stack(testData).reshape(len(testData), -1)
        steps = [
            ('pca', PCA(n_components=num_pca_comp)),
            ('scaler', StandardScaler()),
            ('lin_reg', LinearRegression()),
        ]
        if exclude_pca:
            steps = steps[1:]
        p = Pipeline(steps)
        cv = int(1 / validation_size)
        gridsearch = GridSearchCV(p,
                                  hyperparameters,
                                  scoring=make_scorer(r2_score),
                                  cv=cv,
                                  n_jobs=n_process)
        gridsearch.fit(data, ages)
        return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_
        return_dict["best_cv_score"] = gridsearch.best_score_
        print("best cv score was {}".format(gridsearch.best_score_))
        best_pipeline = gridsearch.best_estimator_
        best_pipeline.fit(data, ages)
        y_pred = best_pipeline.predict(data)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        print("train r^2 was {}".format(r2_score(ages, y_pred)))
        y_pred = best_pipeline.predict(testData)
        y_pred[y_pred < 0] = 0
        y_pred[y_pred > 90] = 90
        test_score = mean_squared_error(testAges, y_pred)
        print("test_score: {}".format(test_score))
        print("test r^2 was {}".format(r2_score(testAges, y_pred)))
return_dict["test_score"] = test_score pkl.dump(return_dict, open("predict_{}Exp.pkl".format(return_mode), 'wb')) ex.add_artifact("predict_{}Exp.pkl".format(return_mode)) return test_score, r2_score(testAges, y_pred) if use_simple_lr_pca_pipeline: data = np.stack(data).reshape(len(data), -1) testData = np.stack(testData).reshape(len(testData), -1) steps = [ ('pca', PCA(n_components=num_pca_comp)), ('scaler', StandardScaler()), ('lr', LogisticRegression()), ] if exclude_pca: steps = steps[1:] p = Pipeline(steps) cv = int(1 / validation_size) gridsearch = GridSearchCV(p, hyperparameters, scoring=make_scorer(r2_score), cv=cv, n_jobs=n_process) gridsearch.fit(data, ages) return_dict["gridsearch_best_estimator"] = gridsearch.best_estimator_ return_dict["best_cv_score"] = gridsearch.best_score_ print("best cv score was {}".format(gridsearch.best_score_)) best_pipeline = gridsearch.best_estimator_ best_pipeline.fit(data, ages) y_pred = best_pipeline.predict(data) print("train r^2 was {}".format(r2_score(ages, y_pred))) y_pred = best_pipeline.predict(testData) test_score = f1_score(testAges, y_pred, average="weighted") y_pred_orig = kbins.inverse_transform(y_pred.reshape(-1, 1)) test_ages_orig = kbins.inverse_transform(testAges.reshape(-1, 1)) print("test r^2 was {}".format(r2_score(testAges, y_pred))) print("test mse was {}".format( mean_squared_error(test_ages_orig, y_pred_orig))) print("test_score: f1 {}".format(test_score)) print("test_score: accuracy {}".format(accuracy_score( testAges, y_pred))) return_dict["test_score"] = test_score pkl.dump(return_dict, open("predict_{}Exp.pkl".format(return_mode), 'wb')) ex.add_artifact("predict_{}Exp.pkl".format(return_mode)) return test_score raise Exception("Valid config not set")