def fit(self, X, y=None):
    # assumes X is a DataFrame
    self._columns = X.columns.values

    # split the data into categorical and numeric features
    self._dtypes = X.dtypes.values
    self._kinds = np.array([dt.kind for dt in X.dtypes])
    self._column_dtypes = {}
    is_cat = self._kinds == 'O'
    self._column_dtypes['cat'] = self._columns[is_cat]
    self._column_dtypes['num'] = self._columns[~is_cat]
    self._feature_names = self._column_dtypes['num']

    # build a dictionary keyed by categorical column, holding the unique
    # values whose frequency exceeds the threshold
    self._cat_cols = {}
    for col in self._column_dtypes['cat']:
        vc = X[col].value_counts()
        if self.cat_threshold is not None:
            vc = vc[vc > self.cat_threshold]
        vals = vc.index.values
        self._cat_cols[col] = vals
        self._feature_names = np.append(self._feature_names, col + '_' + vals)

    # total number of new categorical features
    self._total_cat_cols = sum(len(v) for v in self._cat_cols.values())

    # compute the mean or the median
    self._num_fill = X[self._column_dtypes['num']].agg(self.num_strategy)

    if self.asymmetry:
        self._skew = X.skew()
        # numeric features with mild negative skew
        self._neg_skew_num_columns = self._skew[self._skew < 0].index.values
        # numeric features with high positive skew
        self._high_pos_skew_num_columns = self._skew[self._skew > 7].index.values
        # boolean mask for the remaining numeric features
        not_neg_high_pos_skew_num_columns = ~np.isin(
            self._column_dtypes['num'],
            np.r_[self._high_pos_skew_num_columns, self._neg_skew_num_columns])
        # drop the negatively and highly positively skewed columns from
        # the list of numeric features
        self._mean_skew_num_columns = self._column_dtypes['num'][not_neg_high_pos_skew_num_columns]

        # transformation pipeline for numeric features with mild negative skew
        self._num_negskew_pipe = Pipeline([
            ('square', FunctionTransformer(np.square, validate=False)),
            ('scaler', StandardScaler())
        ])
        # transformation pipeline for numeric features with mild-to-moderate
        # positive skew
        self._num_meanskewpipe = Pipeline([
            ('log', FunctionTransformer(np.log1p, validate=False)),
            ('scaler', RobustScaler())
        ])
        # transformation pipeline for numeric features with high skew
        self._num_highposskew_pipe = Pipeline([
            ('sqrt', FunctionTransformer(np.sqrt, validate=False)),
            ('kbd', KBinsDiscretizer(n_bins=5, encode='onehot-dense'))
        ])

        X_num = X[self._column_dtypes['num']]
        self._num_negskew_pipe.fit(X_num[self._neg_skew_num_columns])
        self._num_meanskewpipe.fit(X_num[self._mean_skew_num_columns])
        self._num_highposskew_pipe.fit(X_num[self._high_pos_skew_num_columns])

        self._cat_names = self._feature_names[
            ~np.isin(self._feature_names, self._column_dtypes['num'])]
        # self._feature_names = self._feature_names[~np.isin(self._feature_names, self._high_pos_skew_num_columns)]
        # relies on the private `_encoder` attribute of KBinsDiscretizer
        # (older scikit-learn versions)
        self._bins_names = self._num_highposskew_pipe.named_steps['kbd']._encoder.get_feature_names()
        self._num_names = np.r_[self._neg_skew_num_columns,
                                self._mean_skew_num_columns,
                                self._bins_names]
        self._feature_names = np.append(self._num_names, self._cat_names)
    return self
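# A self-contained sketch of the skew-based routing used in fit() above:
# split numeric columns by skewness and note which pipeline each group
# would go through. Column names and data below are made up for illustration.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame({
    'heavy_tail': rng.lognormal(0.0, 2.0, 500),   # very strong positive skew
    'neg_skew': -rng.gamma(5.0, 2.0, 500),        # negative skew
    'roughly_sym': rng.normal(size=500),          # near-zero skew
})

skew = X_demo.skew()
neg_cols = skew[skew < 0].index.values          # -> square + StandardScaler
high_pos_cols = skew[skew > 7].index.values     # -> sqrt + KBinsDiscretizer
mean_cols = X_demo.columns[~np.isin(X_demo.columns,
                                    np.r_[neg_cols, high_pos_cols])]
print(neg_cols, high_pos_cols, list(mean_cols))  # -> log1p + RobustScaler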
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False),
    StandardScaler()
)

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        ("onehot_categorical", OneHotEncoder(),
         ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
    ],
    remainder="drop",
)

# %%
# A constant prediction baseline
# ------------------------------
#
# It is worth noting that more than 93% of policyholders have zero claims. If
# we were to convert this problem into a binary classification task, it would
# be significantly imbalanced.
######################################################################
### Binary discretization into {0, 1}
# Values greater than `threshold` are mapped to 1; values less than or
# equal to `threshold` are mapped to 0.
from sklearn.preprocessing import Binarizer

cols = ['年龄', '收入']  # column names: 'age', 'income'
est = Binarizer(threshold=50)
X_ = est.fit_transform(df[cols])
print(X_)

### Multi-value discretization into n_bins bins
from sklearn.preprocessing import KBinsDiscretizer

est = KBinsDiscretizer(n_bins=5, encode='ordinal')  # values 0..n_bins-1
X_ = est.fit_transform(df[cols])
print(X_)

# KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
# n_bins : int or array-like, shape (n_features,), default=5
#     Number of bins; must be at least 2.
# encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
#     How to encode the binned result:
#     onehot: one-hot encoding, returned as a sparse array.
#     onehot-dense: one-hot encoding, returned as a dense array.
#     ordinal: returns the bin identifier (an integer).
# strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
#     Strategy used to define the widths of the bins.
#     uniform: equal width; all bins have the same width.
#     quantile: equal frequency; all bins hold roughly the same number of points.
#     kmeans: values in each bin share the nearest center of a 1-D k-means clustering.
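### As a quick illustration of the three strategies (toy data, chosen only
### to make the difference visible: one outlier at 100):
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X_toy = np.array([[1], [2], [3], [4], [100]])
for strategy in ('uniform', 'quantile', 'kmeans'):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    Xt = est.fit_transform(X_toy)
    # uniform: equal-width bins are dominated by the outlier;
    # quantile: roughly equal counts per bin; kmeans: 1-D cluster boundaries
    print(strategy, Xt.ravel(), est.bin_edges_[0])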
def _generate_features(self, X, y=None, numeric_extra=None, categorical_extra=None):
    # build and fit the feature pipeline on first call only; a `return`
    # inside `finally` would silently swallow exceptions, so the transform
    # happens after the pipeline is guaranteed to exist
    if not hasattr(self, 'feature_pipeline_'):
        n_days = X['dayofweek'].nunique()
        n_hours = X['hour'].nunique()
        self.feature_pipeline_ = Pipeline([(
            'features',
            FeatureUnion([
                # time-of-week part of TOWT: a day/hour interaction when both
                # vary, otherwise a one-hot of whichever one varies
                ('weeks', Pipeline([
                    ('split', FeatureUnion([
                        ('days', Pipeline([
                            ('select', ColumnSelector('dayofweek')),
                            ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent'))
                        ])),
                        ('hours', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent'))
                        ]))
                    ])),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['dayofweek', 'hour']))),
                    ('term', PatsyTransformer('-1 + C(dayofweek):C(hour)'))
                ])) if (n_days > 1) and (n_hours > 1) else
                ('days', Pipeline([
                    ('select', ColumnSelector('dayofweek')),
                    ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['dayofweek']))),
                    ('one_hot', OneHotEncoder(cols=['dayofweek'], return_df=False))
                ])) if n_days > 1 else
                ('hours', Pipeline([
                    ('select', ColumnSelector('hour')),
                    ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['hour']))),
                    ('one_hot', OneHotEncoder(cols=['hour'], return_df=False))
                ])),

                # temperature part of TOWT
                ('temperature', ColumnTransformer([
                    ('encode_temperature', IntervalEncoder(
                        n_chunks=10,
                        span=0.1 * X[self.temperature_col].std(),
                        method='normal'), [self.temperature_col])
                ])),
                ('temperature_interact', 'drop' if n_hours == 1 else Pipeline([
                    ('split', FeatureUnion([
                        ('temperature_part', Pipeline([
                            ('select', ColumnSelector(self.temperature_col)),
                            ('create_bins', KBinsDiscretizer(
                                n_bins=self.n_bins_temperature,
                                strategy='quantile',
                                encode='ordinal'))
                        ])),
                        ('hour_part', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent'))
                        ]))
                    ])),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=[self.temperature_col, 'hour']))),
                    ('term', PatsyTransformer(f'-1 + C({self.temperature_col}):C(hour)'))
                ])),

                # deal with extra numeric regressors
                ('numerical_regressors', 'drop' if not numeric_extra else ColumnTransformer([
                    (f'encode_{col}', IntervalEncoder(
                        n_chunks=4, span=0.1 * X[col].std(), method='normal'), [col])
                    for col in numeric_extra
                ])),
                # deal with extra categorical regressors
                ('categorical_regressors', 'drop' if not categorical_extra else TargetEncoder(
                    cols=categorical_extra, return_df=False,
                    handle_missing='value', handle_unknown='value'))
            ])
        )])
        # Fit the pipeline
        self.feature_pipeline_.fit(X, y)
    return self.feature_pipeline_.transform(X)
# more efficient to perform explicit feature expansion using
# `PolynomialFeatures` or other non-linear transformers from scikit-learn such
# as
# [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)
# or
# [Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html).
#
# Here again we refer the interested reader to the documentation for a proper
# definition of those methods. The following just gives an intuitive overview
# of the predictions we would get using them on our toy dataset:

# %%
from sklearn.preprocessing import KBinsDiscretizer

binned_regression = make_pipeline(
    KBinsDiscretizer(n_bins=8),
    LinearRegression(),
)
binned_regression.fit(data, target)
target_predicted = binned_regression.predict(data)
mse = mean_squared_error(target, target_predicted)

ax = sns.scatterplot(data=full_data, x="input_feature", y="target",
                     color="black", alpha=0.5)
ax.plot(data, target_predicted)
_ = ax.set_title(f"Mean squared error = {mse:.2f}")

# %%
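# For comparison, a sketch of the `Nystroem` expansion mentioned above,
# reusing `data` and `target` from the previous cell; the kernel and
# `n_components` values are illustrative assumptions.
from sklearn.kernel_approximation import Nystroem

nystroem_regression = make_pipeline(
    Nystroem(kernel="rbf", gamma=1.0, n_components=5),
    LinearRegression(),
)
nystroem_regression.fit(data, target)
target_predicted = nystroem_regression.predict(data)
print(f"Mean squared error = {mean_squared_error(target, target_predicted):.2f}")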
def q2():
    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    X = df['Pop_density'].values.reshape(-1, 1)
    pop_density_bins = est.fit_transform(X).ravel()
    # With 10 quantile bins labelled 0 to 9, the top bin (bin == 9) holds
    # the countries above the 90th percentile.
    return len(df[pop_density_bins == 9]['Country'].unique())
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
# providing categorical mask
number_columns_mask = data.select_dtypes('number').columns
print(number_columns_mask)
data = pd.get_dummies(data)
dummies_mask = pickle.load(open('dummies.sav', 'rb'))
data = data.loc[:, [*number_columns_mask, *dummies_mask]]

# drop both the target and the index-like column from the features
# (two separate drop calls would overwrite each other)
X = data.drop(['price', 'symboling'], axis=1)
y = data.loc[:, 'price']

est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
Xt = est.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25)
rf = RandomForestRegressor(bootstrap=False, max_depth=93, max_features='sqrt',
                           min_samples_leaf=1, min_samples_split=3,
                           n_estimators=667)
rf.fit(X_train, y_train)
score = r2_score(y_test, rf.predict(X_test))
plt.plot(y_test, rf.predict(X_test))
plt.show()

"""
parameters = {'bootstrap': [False],
              'max_depth': [93],
              'max_features': ['sqrt'],
def fit(self, X, y=None):
    for index in range(len(self.feature_is_included_list)):
        if self.feature_is_included_list[index]:
            self.k_bins_discretizers[index] = KBinsDiscretizer(
                encode='ordinal', strategy='quantile')
            self.k_bins_discretizers[index].fit(X[:, index:index + 1])
    return self
def discretize(X, max_bins):
    enc = KBinsDiscretizer(n_bins=max_bins, encode='onehot-dense')
    enc.fit(X)
    # collapse duplicate edges that quantile binning can produce on
    # features with many repeated values
    enc.bin_edges_ = [np.unique(edges) for edges in enc.bin_edges_]
    return enc.transform(X)
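# Hypothetical call (values made up): a column dominated by one value yields
# duplicate quantile edges, which the np.unique step above collapses. Recent
# scikit-learn versions already merge such tiny bins at fit time with a
# warning, so the step mainly matters on older versions.
import numpy as np

X_demo = np.array([[0.0], [0.0], [0.0], [1.0], [2.0], [3.0]])
print(discretize(X_demo, max_bins=4).shape)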
class_1 = 50000  # 50,000 samples in the majority class
class_2 = 500    # 500 samples in the minority class
centers = [[0.0, 0.0], [5.0, 5.0]]  # centers of the two classes
clusters_std = [3, 1]  # standard deviations of the two classes
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)

names = ["Multinomial", "Gaussian", "Bernoulli"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB()]
for name, clf in zip(names, models):
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
    if name != "Gaussian":
        # Multinomial and Bernoulli NB need non-negative features,
        # hence the one-hot binning
        kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
        Xtrain = kbs.transform(Xtrain)
        Xtest = kbs.transform(Xtest)
    clf.fit(Xtrain, Ytrain)
    y_pred = clf.predict(Xtest)
    proba = clf.predict_proba(Xtest)[:, 1]
    score = clf.score(Xtest, Ytest)
    print(name)
    print("\tBrier:{:.3f}".format(BS(Ytest, proba, pos_label=1)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\tRecall:{:.3f}".format(recall_score(Ytest, y_pred)))
    print("\tAUC:{:.3f}".format(AUC(Ytest, proba)))
print("*" * 100)

names = ["Multinomial", "Gaussian", "Bernoulli", "Complement"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB(), ComplementNB()]
def discretize(_, __, angle, pole_velocity):
    est = KBinsDiscretizer(n_bins=num_discrete_bins,
                           encode='ordinal', strategy='uniform')
    # fitting on the two extreme points spreads uniform bins across the
    # full [lower, upper] range of each state variable
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))
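# Hypothetical setup for the helper above (bounds and bin count are
# assumptions, e.g. pole-angle and angular-velocity limits):
num_discrete_bins = 6
lower_bounds = [-0.418, -4.0]
upper_bounds = [0.418, 4.0]

state = discretize(None, None, 0.05, -1.2)
print(state)  # (3, 2): bin indices for angle and pole velocity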
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd

dataSet = pd.read_csv("./dataset.csv")
age = dataSet['Age'].values

est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
est.fit(age.reshape(-1, 1))
Xt = est.transform(age.reshape(-1, 1))
print(Xt)
print('Data after cleaning: {}, {} positive examples, {} negative examples'.format(
    df.shape, positive_examples, negative_examples))

split_ratio = args.train_test_split_ratio
print('Splitting data into train and test sets with ratio {}'.format(split_ratio))
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('income', axis=1), df['income'],
    test_size=split_ratio, random_state=0)

preprocess = make_column_transformer(
    (['age', 'num persons worked for employer'],
     KBinsDiscretizer(encode='onehot-dense', n_bins=10)),
    (['capital gains', 'capital losses', 'dividends from stocks'],
     StandardScaler()),
    (['education', 'major industry code', 'class of worker'],
     OneHotEncoder(sparse=False)))

print('Running preprocessing and feature engineering transformations')
train_features = preprocess.fit_transform(X_train)
test_features = preprocess.transform(X_test)

print('Train data shape after preprocessing: {}'.format(train_features.shape))
print('Test data shape after preprocessing: {}'.format(test_features.shape))

train_features_output_path = os.path.join('/opt/ml/processing/train',
                                          'train_features.csv')
y = pd.read_csv("spambase.data", names=[57])

# drop null values from the dataset (reassign: dropna is not in-place)
x = x.dropna()
y = y.dropna()

# convert to numpy arrays for further operations
X = x.values.astype(float)
Y = y.values

# set the number of bins using Sturges' rule
s = 1 + (math.log(len(X), 2))
print("\nNumber of bins:", round(s))

# discretizer object creation
d = KBinsDiscretizer(n_bins=round(s), encode='ordinal', strategy='uniform')

# hypothesis space creation
H = []
for i in range(57):
    H.append([])

'''
Implementation of Algorithm 4.1: initialize the hypothesis space with the
first instance from the dataset, then apply Algorithm 4.3 inside LGG to find
the conjunctions of internal disjunctions of the literals, returning the
hypothesis space with the conjuncts.
'''

def LGG(H, X):
    # initialize LGG with the first instance from the dataset
    for j in range(len(X[0])):
        H[j].append(X[0][j])
    while (j < len(X)):
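# Worked example of Sturges' rule: assuming the full spambase dataset of
# 4601 rows, s = 1 + log2(4601) ≈ 13.2, so round(s) gives 13 bins.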
def preprocessing_discrete_whole_image(data_path, img, feat_list_all, batch):
    img_path = data_path / 'images' / img
    stack_path = img_path / 'stack' / 'stack.tif'
    bins = 7

    with rasterio.open(str(stack_path), 'r') as ds:
        data = ds.read()
        data = data.transpose((1, -1, 0))
        data[data == -999999] = np.nan
        data[np.isneginf(data)] = np.nan
        data_vector = data.reshape(
            [data.shape[0] * data.shape[1], data.shape[2]])
        data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    # Get indices of non-nan values
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))
    rows, cols = data_ind

    # Discretize continuous features
    cts_feats = ['GSW_distSeasonal', 'aspect', 'curve', 'elevation',
                 'hand', 'slope', 'spi', 'twi', 'sti']
    non_cts_feats = ['developed', 'forest', 'planted', 'wetlands', 'openspace',
                     'carbonate', 'noncarbonate', 'akl_intrusive',
                     'silicic_resid', 'extrusive_volcanic', 'colluvial_sed',
                     'glacial_till_clay', 'glacial_till_loam',
                     'glacial_till_coarse', 'glacial_lake_sed_fine',
                     'glacial_outwash_coarse', 'hydric', 'eolian_sed_coarse',
                     'eolian_sed_fine', 'saline_lake_sed',
                     'alluv_coastal_sed_fine', 'coastal_sed_coarse',
                     'GSW_perm', 'flooded']

    feats_disc = []
    all_edges = pd.DataFrame([])

    # Quantile-binned features; the second tuple element is the name prefix
    # used for the new one-hot columns
    quantile_feats = [('GSW_distSeasonal', 'GSW_distSeasonal_'),
                      ('elevation', 'elevation'),
                      ('slope', 'slope'),
                      ('twi', 'twi'),
                      ('spi', 'spi'),
                      ('sti', 'sti')]
    disc_layers = []
    for feat, prefix in quantile_feats:
        discretizer = KBinsDiscretizer(n_bins=bins, encode='onehot-dense',
                                       strategy='quantile')
        disc = discretizer.fit_transform(
            data_vector[:, feat_list_all.index(feat)].reshape(-1, 1))
        for i in range(bins):
            feats_disc.append(prefix + str(i + 1))
        # Scatter the one-hot columns back into image shape, nan elsewhere
        disc_nan = np.full(data[:, :, 0:bins].shape, np.nan)
        for b in range(bins):
            disc_nan[rows, cols, b] = disc[:, b]
        disc_layers.append(disc_nan)
        edges = [edge for arr in discretizer.bin_edges_ for edge in arr[:-1]]
        all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Curve (flat, convex, concave)
    convex = np.zeros((data_vector.shape[0],))
    concave = np.zeros((data_vector.shape[0],))
    flat = np.zeros((data_vector.shape[0],))
    curve_vals = data_vector[:, feat_list_all.index('curve')]
    convex[np.where(curve_vals < 0)] = 1
    concave[np.where(curve_vals > 0)] = 1
    flat[np.where(curve_vals == 0)] = 1
    names = ['convex', 'concave', 'flat']
    bins = len(names)
    feats_disc.extend(names)
    curve = np.column_stack([convex, concave, flat])
    disc_nan = np.full(data[:, :, 0:curve.shape[1]].shape, np.nan)
    for b in range(bins):
        disc_nan[rows, cols, b] = curve[:, b]
    curve = disc_nan
    del disc_nan, convex, concave, flat
    all_edges = pd.concat([all_edges, pd.DataFrame([0, 0, 0])], axis=0)

    # Aspect (eight compass directions)
    aspect_vals = data_vector[:, feat_list_all.index('aspect')]
    names = ['north', 'northeast', 'east', 'southeast',
             'south', 'southwest', 'west', 'northwest']
    bins = len(names)
    feats_disc.extend(names)
    # Sector boundaries in degrees; north wraps around 0/360
    edges = [22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5]
    aspect_cols = [np.logical_or(aspect_vals >= 337.5,
                                 aspect_vals < 22.5).astype(float)]
    for lo, hi in zip(edges[:-1], edges[1:]):
        aspect_cols.append(np.logical_and(aspect_vals >= lo,
                                          aspect_vals < hi).astype(float))
    aspect = np.column_stack(aspect_cols)
    disc_nan = np.full(data[:, :, 0:aspect.shape[1]].shape, np.nan)
    for b in range(bins):
        disc_nan[rows, cols, b] = aspect[:, b]
    aspect = disc_nan
    del disc_nan
    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Get original discrete features
    orig_disc_inds = [feat_list_all.index(feat) for feat in non_cts_feats]
    orig_disc_data = data[:, :, orig_disc_inds]

    # Combine with new discrete features
    new_disc_data = np.dstack(disc_layers + [curve, aspect])
    data = np.dstack([new_disc_data, orig_disc_data])
    del orig_disc_data, new_disc_data

    # Combine all edges and features
    all_edges = all_edges.reset_index(drop=True)
    feature_edges = pd.concat([all_edges, pd.DataFrame(data=feats_disc)], axis=1)
    feature_edges.columns = ['edge', 'feature']

    # Remove any feature that is constant (all zeros or all ones)
    data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
    std = data_vector[:, 0:data_vector.shape[1] - 2].std(0)
    remove_inds = []
    if 0 in std.tolist():
        zero_inds = np.where(std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)
    remove_inds = np.unique(remove_inds).tolist()
    feat_list_stack = feats_disc + non_cts_feats
    remove_feats = [feat_list_stack[ind] for ind in remove_inds]
    data_vector_keep = np.delete(data_vector, remove_inds, axis=1)
    feat_keep = [x for x in feat_list_stack if x not in remove_feats]
    feature_edges_keep = feature_edges[~feature_edges.feature.isin(remove_feats)]

    # Save feature class bin edges
    filedir = data_path / batch / 'class_bins'
    filedir.mkdir(parents=True, exist_ok=True)
    filename = filedir / 'feature_edges.csv'
    feature_edges_keep.to_csv(filename, index=False)

    return data, data_vector_keep, data_ind, feat_keep, feature_edges_keep
plt.xlabel('Gamma value')
plt.show()

figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
plt1.plot(C_data, C_data_train_score, color="red")
plt1.plot(C_data, C_data_score, color="blue")
plt1.ylabel('Accuracy Score')
plt1.xlabel('C value')
plt1.show()

# Load the three datasets and shuffle them
train_set = pd.read_csv("2018.csv")
train_set = train_set.fillna(0)
train_set = train_set.sample(frac=1)
enc_score = KBinsDiscretizer(n_bins=3, encode='ordinal')
X_binned = enc_score.fit_transform(train_set["Score"].values.reshape(-1, 1))
train_set['Score'] = X_binned

train_set1 = pd.read_csv("diamonds.csv")
train_set1 = train_set1.fillna(0)
train_set1 = train_set1.sample(frac=1)

train_set2 = pd.read_csv("winequality-red.csv")
train_set2 = train_set2.fillna(0)
train_set2 = train_set2.sample(frac=1)

# Parameter a is how much data to use for training and testing the model.
# Choose the target and separate it from the input.
a = 500
enc_y = OneHotEncoder(sparse=True)
def add_features(self, X, transform=False, index=None):
    '''Adds features to the learned ones at the given index; if index is
    not provided they are appended at the end.'''
    try:
        check_is_fitted(self)
    except NotFittedError:
        # not fitted yet: fit from scratch and return
        self.fit(X)
        if transform:
            return self.transform(X)
        return self

    X = X.copy()
    if isinstance(X, pd.DataFrame):
        numerical_features = X.select_dtypes("float")
        if len(numerical_features.columns):
            if self.discretize:
                temp_discretizer = KBinsDiscretizer(
                    n_bins=self.n_intervals, encode="ordinal",
                    strategy="quantile")
                X.loc[:, numerical_features.columns] = \
                    temp_discretizer.fit_transform(numerical_features)
                X.loc[:, numerical_features.columns] = \
                    X.loc[:, numerical_features.columns].astype(int)
                new_index = X.columns.get_indexer(numerical_features.columns)
                if index:
                    index_with_column = list(enumerate(index))
                    numerical_index = np.array(index)[new_index]
                    sort_index = np.argsort(numerical_index)
                    numerical_index_with_column = [
                        index_with_column[i] for i in numerical_index
                    ]
                    last = 0
                    for i in sort_index:
                        feature, list_insert_index = numerical_index_with_column[i]
                        while (last <= len(self.numerical_feature_index_)
                               and list_insert_index < self.numerical_feature_index_[last]):
                            last += 1
                        self.numerical_feature_index_.insert(last, list_insert_index)
                        # n_bins_ and bin_edges_ are 1-D arrays of shape
                        # (n_features,), so insert along axis 0
                        self.discretizer.n_bins_ = np.insert(
                            self.discretizer.n_bins_, last,
                            temp_discretizer.n_bins_[i], axis=0)
                        self.discretizer.bin_edges_ = np.insert(
                            self.discretizer.bin_edges_, last,
                            temp_discretizer.bin_edges_[i], axis=0)
                        for n in range(last + 1, len(self.numerical_feature_index_)):
                            self.numerical_feature_index_[n] += 1
                else:
                    new_index = (X.columns.get_indexer(numerical_features.columns)
                                 + self.n_features)
                    self.numerical_feature_index_.extend(new_index)
                    self.discretizer.n_bins_ = np.concatenate(
                        [self.discretizer.n_bins_, temp_discretizer.n_bins_], axis=0)
                    self.discretizer.bin_edges_ = np.concatenate(
                        [self.discretizer.bin_edges_, temp_discretizer.bin_edges_], axis=0)
        X = X.to_numpy()

    if X.dtype == "O":
        X = X.astype(str)
    self.n_features += X.shape[1]
    new_categories = [np.unique(X[:, j]) for j in range(X.shape[1])]
    if index is not None:
        sort_index = np.argsort(index)
        index_with_column = list(enumerate(index))
        for i in sort_index:
            column, list_insert_index = index_with_column[i]
            self.categories_.insert(list_insert_index, new_categories[column])
    else:
        self.categories_.extend(new_categories)

    self.sort_index_ = [cat.argsort() for cat in self.categories_]
    self.sorted_categories_ = [
        self.categories_[j][self.sort_index_[j]] for j in range(self.n_features)
    ]
    self.sorted_encoded_ = [
        np.arange(self.categories_[j].shape[0])[self.sort_index_[j]]
        for j in range(self.n_features)
    ]
    self.unknown_values_ = [cat.shape[0] for cat in self.categories_]

    if transform:
        if index is None:
            index = list(range(len(self.categories_) - len(new_categories),
                               len(self.categories_)))
        return self.transform_columns(X, categories=index)
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
countries.Region = countries.Region.str.strip()

# In[6]:

# Question 1
q1 = list(countries.Region.unique())
q1.sort()

# In[7]:

# Question 2
disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

# In[15]:

disc.fit(countries.Pop_density.values.reshape(-1, 1))
Pop_disc = disc.transform(countries.Pop_density.values.reshape(-1, 1))

# In[17]:

quart = np.quantile(Pop_disc, 0.9)
def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
def q2():
    disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    disc.fit(countries.Pop_density.values.reshape(-1, 1))
    Pop_disc = disc.transform(countries.Pop_density.values.reshape(-1, 1))
    quart = np.quantile(Pop_disc, 0.9)
    return int(Pop_disc[Pop_disc > quart].size)
def discretization(_database, json_obj, _db_name='', _save_path=None):
    if isinstance(_database, str):
        _database = pd.read_csv(_database, header=0, sep=',')
    db = _database.copy()

    # remove old VAR_TIME_NAME from the features to discretize
    if json_obj['_survivalAttr']['survivalTime_name'] in json_obj['_cols2disc']:
        json_obj['_cols2disc'].remove(json_obj['_survivalAttr']['survivalTime_name'])

    # return if there is no column to discretize
    if not json_obj['_cols2disc']:
        # save db as db_disc
        _save_db(db, _db_name + '_disc', _save_path)
        # save discretization log
        with open(_save_path + '{}_log_discretization.json'.format(_db_name), 'w') as f:
            json.dump('! This data set has no features for discretization', f)
        return

    # discretization process: iterate over all columns
    log_discretization = {}
    db_disc = db.copy()
    for col_name in json_obj['_cols2disc']:
        col_log = {}
        # reshape the data for the discretizer
        data = np.array(db[col_name]).reshape(-1, 1)
        try:
            discretizer = KBinsDiscretizer(n_bins=N_BINS, encode=ENCODE,
                                           strategy=STRATEGY)
            discretizer.fit(data)
            # discretized/encoded data
            data_encoded = pd.Series(
                discretizer.transform(data).reshape(1, -1)[0]).astype('int64')
        except Exception:
            # fall back to quantile binning if the configured strategy fails
            discretizer = KBinsDiscretizer(n_bins=N_BINS, encode=ENCODE,
                                           strategy='quantile')
            discretizer.fit(data)
            data_encoded = pd.Series(
                discretizer.transform(data).reshape(1, -1)[0]).astype('int64')

        # category representation: the last bin is closed on the right,
        # the others are half-open
        categories = sorted(data_encoded.unique().tolist())
        bin_edges = list(discretizer.bin_edges_[0])
        map_names = {}
        for idx, ctg in enumerate(categories):
            if idx + 1 == len(categories):
                string = '[{:0.2f},{:0.2f}]'.format(bin_edges[idx], bin_edges[idx + 1])
            else:
                string = '[{:0.2f},{:0.2f})'.format(bin_edges[idx], bin_edges[idx + 1])
            map_names[ctg] = string

        # data decoding
        data_ctg = data_encoded.map(map_names)
        db_disc[col_name] = data_ctg.astype('category')

        # record the log for the discretized column
        col_log['discretizer_params'] = discretizer.get_params().copy()
        col_log['bin_edges'] = bin_edges
        col_log['dict_categories_names'] = map_names
        col_log['data_orig'] = data.tolist()
        col_log['data_encoded'] = data_encoded.tolist()
        col_log['data_categories'] = data_ctg.tolist()
        log_discretization[col_name] = col_log.copy()

    # save the discretized database
    _save_db(db_disc, _db_name + '_disc', _save_path)
    # save the discretization log
    with open(_save_path + '{}_log_discretization.json'.format(_db_name), 'w') as f:
        json.dump(log_discretization, f)
    return
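# A minimal standalone sketch of the bin-labelling idea used above: map each
# ordinal bin code to an interval string built from bin_edges_ (the data and
# bin count are made up for illustration).
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

values = np.random.default_rng(0).normal(size=(100, 1))
disc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
codes = pd.Series(disc.fit_transform(values).ravel()).astype('int64')
edges = disc.bin_edges_[0]
labels = {i: '[{:0.2f},{:0.2f}{}'.format(edges[i], edges[i + 1],
                                         ']' if i == len(edges) - 2 else ')')
          for i in range(len(edges) - 1)}
print(codes.map(labels).value_counts())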
def q2():
    # Return the result of question 2 here.
    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    bins_score = est.fit_transform(countries['Pop_density'].values.reshape(-1, 1))
    return int((bins_score >= 9).sum())
    KBinsDiscretizer,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    PowerTransformer,
    StandardScaler,
)
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

_transformers = [
    Binarizer(threshold=2),
    KBinsDiscretizer(n_bins=3, encode="ordinal"),
    StandardScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    FunctionTransformer(np.log, validate=True),
    OrdinalEncoder(),
]

_selectors = [
    SelectFromModel(Lasso(random_state=1)),
    SelectKBest(f_regression, k=2),
    VarianceThreshold(),
    RFE(Lasso(random_state=1)),
]
def fit(self, X, y=None):
    self.kbd = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode,
                                strategy=self.strategy)
    self.kbd.fit(X[self.cols])
    return self
#
# In order to fit linear models with those predictors it is therefore
# necessary to perform standard feature transformations as follows:
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False),
    StandardScaler())

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        (
            "onehot_categorical",
            OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
        ),
    ],
    remainder="drop",
)

# %%
# A constant prediction baseline
# ------------------------------
#
# It is worth noting that more than 93% of policyholders have zero claims. If
# we were to convert this problem into a binary classification task, it would
# be significantly imbalanced.
ax.set_title('Input Data', size=14)
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
    np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
grid = np.c_[xx.ravel(), yy.ravel()]
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1

for strategy in strategies:
    enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
    enc.fit(X)
    grid_enc = enc.transform(grid)

    ax = plt.subplot(len(X_list), len(strategies) + 1, i)

    # horizontal stripes
    horizontal = grid_enc[:, 0].reshape(xx.shape)
    ax.contourf(xx, yy, horizontal, alpha=0.5)
    # vertical stripes
    vertical = grid_enc[:, 1].reshape(xx.shape)
    ax.contourf(xx, yy, vertical, alpha=0.5)

    ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LogisticRegression(random_state=0)), {
        'kbinsdiscretizer__n_bins': np.arange(2, 10),
        'logisticregression__C': np.logspace(-2, 7, 10),
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LinearSVC(random_state=0)), {
        'kbinsdiscretizer__n_bins': np.arange(2, 10),
        'linearsvc__C': np.logspace(-2, 7, 10),
    }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    assert_raise_message(ValueError,
                         "Valid options for 'strategy' are "
                         "('uniform', 'quantile', 'kmeans'). "
                         "Got strategy='invalid-strategy' instead.",
                         est.fit, X)