Example #1
    def fit(self, X, y=None):
        # assumes X is a DataFrame object
        self._columns = X.columns.values

        # split the data into categorical and numeric features
        self._dtypes = X.dtypes.values
        self._kinds = np.array([dt.kind for dt in X.dtypes])
        self._column_dtypes = {}
        is_cat = self._kinds == 'O'
        self._column_dtypes['cat'] = self._columns[is_cat]
        self._column_dtypes['num'] = self._columns[~is_cat]
        self._feature_names = self._column_dtypes['num']

        # build a dictionary from each categorical feature,
        # keeping the unique values whose counts exceed the threshold
        self._cat_cols = {}
        for col in self._column_dtypes['cat']:
            vc = X[col].value_counts()
            if self.cat_threshold is not None:
                vc = vc[vc > self.cat_threshold]
            vals = vc.index.values
            self._cat_cols[col] = vals
            self._feature_names = np.append(self._feature_names, col + '_' + vals)

        # compute the total number of new categorical features
        self._total_cat_cols = sum([len(v) for col, v in self._cat_cols.items()])

        # compute the mean or the median (depending on num_strategy)
        self._num_fill = X[self._column_dtypes['num']].agg(self.num_strategy)

        if self.asymmetry:
            self._skew = X.skew()

            # select the list of features with slight negative skew
            self._neg_skew_num_columns = self._skew[self._skew < 0].index.values

            # select the list of features with high positive skew
            self._high_pos_skew_num_columns = self._skew[self._skew > 7].index.values

            # build a boolean mask
            not_neg_high_pos_skew_num_columns = ~np.isin(
                self._column_dtypes['num'], np.r_[self._high_pos_skew_num_columns, self._neg_skew_num_columns])

            # drop the numeric features with slight negative and high positive skew
            # from the list of numeric features
            self._mean_skew_num_columns = self._column_dtypes['num'][not_neg_high_pos_skew_num_columns]

            # transformation pipeline for numeric features
            # with slight negative skew
            self._num_negskew_pipe = Pipeline([
                ('square', FunctionTransformer(np.square, validate=False)),
                ('scaler', StandardScaler())
            ])

            # transformation pipeline for numeric features
            # with slight to moderate positive skew
            self._num_meanskewpipe = Pipeline([
                ('log', FunctionTransformer(np.log1p, validate=False)),
                ('scaler', RobustScaler())
            ])

            # transformation pipeline for numeric
            # features with high skew
            self._num_highposskew_pipe = Pipeline([
                ('sqrt', FunctionTransformer(np.sqrt, validate=False)),
                ('kbd', KBinsDiscretizer(n_bins=5, encode='onehot-dense'))
            ])

            X_num = X[self._column_dtypes['num']]
            self._num_negskew_pipe.fit(X_num[self._neg_skew_num_columns])
            self._num_meanskewpipe.fit(X_num[self._mean_skew_num_columns])
            self._num_highposskew_pipe.fit(X_num[self._high_pos_skew_num_columns])

            self._cat_names = self._feature_names[~np.isin(self._feature_names, self._column_dtypes['num'])]
            # self._feature_names = self._feature_names[~np.isin(self._feature_names, self._high_pos_skew_num_columns)]

            self._bins_names = self._num_highposskew_pipe.named_steps['kbd']._encoder.get_feature_names()
            self._num_names = np.r_[self._neg_skew_num_columns,
                              self._mean_skew_num_columns,
                              self._bins_names,
            ]
            self._feature_names = np.append(self._num_names, self._cat_names)

        return self
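# A minimal, self-contained sketch (not part of the transformer above) of the
# same skew-based routing idea: compute the skew of each numeric column and
# send it through a square or log1p branch depending on its sign and magnitude.
# The toy frame and the threshold of 7 are assumptions that mirror the code above.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler

rng = np.random.default_rng(0)
df_toy = pd.DataFrame({
    'a': rng.exponential(size=100),        # mild positive skew
    'b': -rng.exponential(size=100),       # negative skew
    'c': rng.exponential(size=100) ** 3,   # strong positive skew
})
skew = df_toy.skew()
neg_cols = skew[skew < 0].index
high_cols = skew[skew > 7].index
mid_cols = skew.index.difference(neg_cols.union(high_cols))

neg_pipe = Pipeline([('square', FunctionTransformer(np.square)), ('scale', StandardScaler())])
mid_pipe = Pipeline([('log', FunctionTransformer(np.log1p)), ('scale', RobustScaler())])
print(neg_pipe.fit_transform(df_toy[neg_cols]).shape)
print(mid_pipe.fit_transform(df_toy[mid_cols]).shape)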
Example #2
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer


log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False),
    StandardScaler()
)

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough",
            ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer,
            ["Density"]),
        ("onehot_categorical", OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
    ],
    remainder="drop",
)

# %%
# A constant prediction baseline
# ------------------------------
#
# It is worth noting that more than 93% of policyholders have zero claims. If
# we were to convert this problem into a binary classification task, it would
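# %%
# A hedged, self-contained sketch of such a constant baseline (not code from
# the original tutorial): DummyRegressor with strategy="mean" always predicts
# the training mean. The tiny toy arrays below are placeholders, not the
# insurance data referenced above.
import numpy as np
from sklearn.dummy import DummyRegressor

X_toy = np.arange(10).reshape(-1, 1)
y_toy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 2])  # mostly zero "claims"
baseline = DummyRegressor(strategy="mean").fit(X_toy, y_toy)
print(baseline.predict(X_toy[:3]))  # [0.3 0.3 0.3], the training mean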
Example #3
######################################################################

### Binary discretization into {0, 1}
# Values greater than threshold are coded 1; values less than or equal to threshold are coded 0.
from sklearn.preprocessing import Binarizer

cols = ['年龄', '收入']

est = Binarizer(threshold=50)
X_ = est.fit_transform(df[cols])
print(X_)

### Multi-value discretization; n_bins sets the number of bins
from sklearn.preprocessing import KBinsDiscretizer

est = KBinsDiscretizer(n_bins=5, encode='ordinal')  #0~n_bins-1
X_ = est.fit_transform(df[cols])
print(X_)

# KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
# n_bins : int or array-like, shape (n_features,) (default=5)
#     Number of bins; must not be less than 2.
# encode : {'onehot', 'onehot-dense', 'ordinal'} (default='onehot')
#     Method used to encode the binned result:
#     onehot: one-hot encoding, returns a sparse array.
#     onehot-dense: one-hot encoding, returns a dense array.
#     ordinal: returns the bin identifier as an integer.
# strategy : {'uniform', 'quantile', 'kmeans'} (default='quantile')
#     Strategy used to define the widths of the bins:
#     uniform: equal width, all bins have the same width
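# A short sketch (not part of the original snippet) comparing the three
# strategy options on an assumed toy column with one outlier.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

toy = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
for strategy in ('uniform', 'quantile', 'kmeans'):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    print(strategy, est.fit_transform(toy).ravel())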
Example #4
    def _generate_features(self,
                           X,
                           y=None,
                           numeric_extra=None,
                           categorical_extra=None):
        try:
            self.feature_pipeline_

        except AttributeError:
            n_days = X['dayofweek'].nunique()
            n_hours = X['hour'].nunique()

            self.feature_pipeline_ = Pipeline([(
                'features',
                FeatureUnion([
                    # time of week part of TOWT
                    ('weeks',
                     Pipeline([
                         ('split',
                          FeatureUnion([
                              ('days',
                               Pipeline([
                                   ('select', ColumnSelector('dayofweek')),
                                   ('ordinal',
                                    OrdinalEncoder(cols=['dayofweek'],
                                                   return_df=False)),
                                   ('unknown',
                                    SimpleImputer(missing_values=-1,
                                                  strategy='most_frequent'))
                               ])),
                              ('hours',
                               Pipeline([('select', ColumnSelector('hour')),
                                         ('ordinal',
                                          OrdinalEncoder(cols=['hour'],
                                                         return_df=False)),
                                         ('unknown',
                                          SimpleImputer(
                                              missing_values=-1,
                                              strategy='most_frequent'))]))
                          ])),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek', 'hour']))),
                         ('term',
                          PatsyTransformer('-1 + C(dayofweek):C(hour)'))
                     ])) if (n_days > 1) and (n_hours > 1) else
                    ('days',
                     Pipeline([
                         ('select', ColumnSelector('dayofweek')),
                         ('ordinal',
                          OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                         ('unknown',
                          SimpleImputer(missing_values=-1,
                                        strategy='most_frequent')),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek']))),
                         ('one_hot',
                          OneHotEncoder(cols=['dayofweek'], return_df=False))
                     ])) if n_days > 1 else
                    ('hours',
                     Pipeline(
                         [('select', ColumnSelector('hour')),
                          ('ordinal',
                           OrdinalEncoder(cols=['hour'], return_df=False)),
                          ('unknown',
                           SimpleImputer(missing_values=-1,
                                         strategy='most_frequent')),
                          ('to_pandas',
                           FunctionTransformer(
                               lambda x: pd.DataFrame(x, columns=['hour']))),
                          ('one_hot',
                           OneHotEncoder(cols=['hour'], return_df=False))])),

                    # temperature part of TOWT
                    ('temperature',
                     ColumnTransformer([
                         ('encode_temperature',
                          IntervalEncoder(
                              n_chunks=10,
                              span=0.1 * X[self.temperature_col].std(),
                              method='normal'), [self.temperature_col])
                     ])),
                    ('temperature_interact',
                     'drop' if n_hours == 1 else Pipeline(
                         [('split',
                           FeatureUnion([
                               ('temperature_part',
                                Pipeline([
                                    ('select',
                                     ColumnSelector(self.temperature_col)),
                                    (
                                        'create_bins',
                                        KBinsDiscretizer(
                                            n_bins=self.n_bins_temperature,
                                            strategy='quantile',
                                            encode='ordinal'),
                                    )
                                ])),
                               ('hour_part',
                                Pipeline([('select', ColumnSelector('hour')),
                                          ('ordinal',
                                           OrdinalEncoder(cols=['hour'],
                                                          return_df=False)),
                                          ('unknown',
                                           SimpleImputer(
                                               missing_values=-1,
                                               strategy='most_frequent'))]))
                           ])),
                          ('to_pandas',
                           FunctionTransformer(lambda x: pd.DataFrame(
                               x, columns=[self.temperature_col, 'hour']))),
                          ('term',
                           PatsyTransformer(
                               f'-1 + C({self.temperature_col}):C(hour)'))])),

                    # deal with extra numerical regressors
                    ('numerical_regressors',
                     'drop' if not numeric_extra else ColumnTransformer(
                         [(f'encode_{col}',
                           IntervalEncoder(n_chunks=4,
                                           span=0.1 * X[col].std(),
                                           method='normal'), [col])
                          for col in numeric_extra])),

                    # deal with extra categorical regressors
                    ('categorical_regressors', 'drop' if not categorical_extra
                     else TargetEncoder(cols=categorical_extra,
                                        return_df=False,
                                        handle_missing='value',
                                        handle_unknown='value'))
                ]))])
            # Fit the pipeline
            self.feature_pipeline_.fit(X, y)

        finally:
            return self.feature_pipeline_.transform(X)
Example #5

# more efficient to perform explicit feature expansion using
# `PolynomialFeatures` or other non-linear transformers from scikit-learn such
# as
# [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)
# or
# [Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html).
#
# Here again we refer the interested reader to the documentation to get a
# proper definition of those methods. The following just gives an intuitive
# overview of the predictions we would get using those on our toy dataset:

# %%
from sklearn.preprocessing import KBinsDiscretizer

binned_regression = make_pipeline(
    KBinsDiscretizer(n_bins=8),
    LinearRegression(),
)
binned_regression.fit(data, target)
target_predicted = binned_regression.predict(data)
mse = mean_squared_error(target, target_predicted)

ax = sns.scatterplot(data=full_data,
                     x="input_feature",
                     y="target",
                     color="black",
                     alpha=0.5)
ax.plot(data, target_predicted)
_ = ax.set_title(f"Mean squared error = {mse:.2f}")

# %%
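# A hedged sketch (not from the original notebook) of the Nystroem variant
# mentioned above, reusing the `data`, `target` and helper imports assumed to
# be defined earlier in this example; n_components=5 is an arbitrary choice.
from sklearn.kernel_approximation import Nystroem

nystroem_regression = make_pipeline(
    Nystroem(n_components=5),
    LinearRegression(),
)
nystroem_regression.fit(data, target)
target_predicted = nystroem_regression.predict(data)
print(f"Mean squared error = {mean_squared_error(target, target_predicted):.2f}")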
Example #6

def q2():
    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    X = df['Pop_density'].values.reshape(-1, 1)
    pop_density_bins = est.fit_transform(X)
    # Since we have 10 bins from 0 to 9, bin 9 (i.e. bins > 8) represents countries above the 90th percentile
    return len((df[pop_density_bins == 9]['Country']).unique())
Example #7
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
Example #8
#providing categorical mask
number_columns_mask = data.select_dtypes('number').columns
print(number_columns_mask)
data = pd.get_dummies(data)
dummies_mask = pickle.load(open('dummies.sav','rb'))

data = data.loc[:,[*number_columns_mask ,*dummies_mask]] 



# drop both columns at once; the original pair of drops overwrote each other,
# leaving 'price' in X
X = data.drop(['price', 'symboling'], axis=1)
y = data.loc[:,'price']

est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
Xt = est.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25)


rf = RandomForestRegressor(bootstrap=False,max_depth=93,max_features='sqrt',min_samples_leaf=1,min_samples_split=3, n_estimators=667 )
rf.fit(X_train,y_train)
score = r2_score(y_test,rf.predict(X_test))

plt.plot(y_test,rf.predict(X_test))
plt.show()

"""
parameters = {'bootstrap': [False],
 'max_depth': [ 93],
 'max_features': ['sqrt'],
Example #9
 def fit(self, X, y=None):
     for index in range(len(self.feature_is_included_list)):
         if self.feature_is_included_list[index]:
             self.k_bins_discretizers[index] = KBinsDiscretizer(encode='ordinal', strategy='quantile')
             self.k_bins_discretizers[index].fit(X[:, index:index + 1])
     return self
Example #10
def discretize(X, max_bins):
    enc = KBinsDiscretizer(n_bins=max_bins, encode='onehot-dense')
    enc.fit(X)
    enc.bin_edges_ = [np.unique(edges) for edges in enc.bin_edges_]
    return enc.transform(X)
Example #11
class_1 = 50000  # majority class: 50,000 samples
class_2 = 500    # minority class: 500 samples
centers = [[0.0, 0.0], [5.0, 5.0]]  # centers of the two classes
clusters_std = [3, 1]  # spread (std) of the two classes
X, y = make_blobs(n_samples=[class_1, class_2], centers=centers, cluster_std=clusters_std, random_state=0,
                  shuffle=False)
name = ["Multinomial", "Gaussian", "Bernoulli"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB()]

for name, clf in zip(name, models):
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3,
                                                    random_state=420)
    if name != "Gaussian":
        kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
        Xtrain = kbs.transform(Xtrain)
        Xtest = kbs.transform(Xtest)
    clf.fit(Xtrain, Ytrain)
    y_pred = clf.predict(Xtest)
    proba = clf.predict_proba(Xtest)[:, 1]
    score = clf.score(Xtest, Ytest)
    print(name)
    print("\tBrier:{:.3f}".format(BS(Ytest, proba, pos_label=1)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\tRecall:{:.3f}".format(recall_score(Ytest, y_pred)))
    print("\tAUC:{:.3f}".format(AUC(Ytest, proba)))
print("*" * 100)
name = ["Multinomial", "Gaussian", "Bernoulli", "Complement"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB(), ComplementNB()]
Example #12
def discretize(_, __, angle, pole_velocity):
    est = KBinsDiscretizer(n_bins=num_discrete_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))
Example #13
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd

dataSet = pd.read_csv("./dataset.csv")

age = dataSet['Age'].values

est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
est.fit(age.reshape(-1, 1))
Xt = est.transform(age.reshape(-1, 1))
print(Xt)
Example #14

    print(
        'Data after cleaning: {}, {} positive examples, {} negative examples'.
        format(df.shape, positive_examples, negative_examples))

    split_ratio = args.train_test_split_ratio
    print('Splitting data into train and test sets with ratio {}'.format(
        split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(df.drop('income',
                                                                axis=1),
                                                        df['income'],
                                                        test_size=split_ratio,
                                                        random_state=0)

    preprocess = make_column_transformer(
        (['age', 'num persons worked for employer'
          ], KBinsDiscretizer(encode='onehot-dense', n_bins=10)),
        (['capital gains', 'capital losses', 'dividends from stocks'
          ], StandardScaler()),
        (['education', 'major industry code', 'class of worker'
          ], OneHotEncoder(sparse=False)))
    print('Running preprocessing and feature engineering transformations')
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)

    print('Train data shape after preprocessing: {}'.format(
        train_features.shape))
    print('Test data shape after preprocessing: {}'.format(
        test_features.shape))

    train_features_output_path = os.path.join('/opt/ml/processing/train',
                                              'train_features.csv')
Example #15
y = pd.read_csv("spambase.data", names=[57])

# dropping null values from the dataset (reassign, since dropna is not in-place)
x = x.dropna()
y = y.dropna()

# converting to numpy arrays for further operations
X = x.values.astype(float)
Y = y.values

# Setting the number of bins using Sturges' rule
s = 1 + (math.log(len(X), 2))
print("\nNumber of bins:", round(s))
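# For reference, and assuming the standard 4,601-row spambase table as input,
# this gives s = 1 + log2(4601) ≈ 13.2, so round(s) selects 13 bins.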

# Discretizer object creation
d = KBinsDiscretizer(n_bins=round(s), encode='ordinal', strategy='uniform')

# Hypothesis space creation
H = []
for i in range(57):
    H.append([])
''' Algorithm 4.1 implementation: initialize the hypothesis space with the first instance from the dataset,
    then apply algorithm 4.3 inside LGG to find the conjunctions of internal disjunctions of the literals, returning
    the hypothesis space with those conjunctions '''


def LGG(H, X):
    # Initialization of LGG with the first instance from the dataset
    for j in range(len(X[0])):
        H[j].append(X[0][j])
    while (j < len(X)):
Example #16
def preprocessing_discrete_whole_image(data_path, img, feat_list_all, batch):
    img_path = data_path / 'images' / img
    stack_path = img_path / 'stack' / 'stack.tif'
    bins = 7

    with rasterio.open(str(stack_path), 'r') as ds:
        data = ds.read()
        data = data.transpose((1, -1, 0))
        data[data == -999999] = np.nan
        data[np.isneginf(data)] = np.nan
        data_vector = data.reshape(
            [data.shape[0] * data.shape[1], data.shape[2]])
        data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

    # Get indices of non-nan values
    nans = np.sum(data, axis=2)
    data_ind = np.where(~np.isnan(nans))
    rows, cols = zip(data_ind)

    # Discretize continuous features
    cts_feats = [
        'GSW_distSeasonal', 'aspect', 'curve', 'elevation', 'hand', 'slope',
        'spi', 'twi', 'sti'
    ]
    non_cts_feats = [
        'developed', 'forest', 'planted', 'wetlands', 'openspace', 'carbonate',
        'noncarbonate', 'akl_intrusive', 'silicic_resid', 'silicic_resid',
        'extrusive_volcanic', 'colluvial_sed', 'glacial_till_clay',
        'glacial_till_loam', 'glacial_till_coarse', 'glacial_lake_sed_fine',
        'glacial_outwash_coarse', 'hydric', 'eolian_sed_coarse',
        'eolian_sed_fine', 'saline_lake_sed', 'alluv_coastal_sed_fine',
        'coastal_sed_coarse', 'GSW_perm', 'flooded'
    ]

    feats_disc = []
    all_edges = pd.DataFrame([])

    # GSW_distSeasonal
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    GSW_distSeasonal_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('GSW_distSeasonal')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('GSW_distSeasonal_' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = GSW_distSeasonal_disc[:, bin]

    GSW_distSeasonal_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Elevation
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    elevation_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('elevation')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('elevation' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = elevation_disc[:, bin]

    elevation_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Slope
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    slope_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('slope')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('slope' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = slope_disc[:, bin]

    slope_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # TWI
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    twi_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('twi')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('twi' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = twi_disc[:, bin]

    twi_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # SPI
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    spi_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('spi')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('spi' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = spi_disc[:, bin]

    spi_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # STI
    discretizer = KBinsDiscretizer(n_bins=bins,
                                   encode='onehot-dense',
                                   strategy='quantile')
    sti_disc = discretizer.fit_transform(
        data_vector[:, feat_list_all.index('sti')].reshape(-1, 1))
    for i in range(bins):
        feats_disc.append('sti' + str(i + 1))

    disc_nan = np.zeros(data[:, :, 0:bins].shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = sti_disc[:, bin]

    sti_disc = disc_nan
    del disc_nan

    edges = []
    for arr in discretizer.bin_edges_:
        for edge in arr[:-1]:
            edges.append(edge)

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Curve (flat, convex, concave)
    convex = np.zeros((data_vector.shape[0], ))
    concave = np.zeros((data_vector.shape[0], ))
    flat = np.zeros((data_vector.shape[0], ))
    convex[np.where(data_vector[:, feat_list_all.index('curve')] < 0)] = 1
    concave[np.where(data_vector[:, feat_list_all.index('curve')] > 0)] = 1
    flat[np.where(data_vector[:, feat_list_all.index('curve')] == 0)] = 1
    names = ['convex', 'concave', 'flat']
    bins = len(names)
    for name in names:
        feats_disc.append(name)

    curve = np.column_stack([convex, concave, flat])

    shape = data[:, :, 0:curve.shape[1]].shape
    disc_nan = np.zeros(shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = curve[:, bin]

    curve = disc_nan

    del disc_nan, convex, concave, flat

    edges = [0, 0, 0]
    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Aspect (north, northeast, northwest, south, southeast, southwest, east, west)
    north = np.zeros((data_vector.shape[0], ))
    northeast = np.zeros((data_vector.shape[0], ))
    east = np.zeros((data_vector.shape[0], ))
    southeast = np.zeros((data_vector.shape[0], ))
    south = np.zeros((data_vector.shape[0], ))
    southwest = np.zeros((data_vector.shape[0], ))
    west = np.zeros((data_vector.shape[0], ))
    northwest = np.zeros((data_vector.shape[0], ))

    north[np.where(
        np.logical_or.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 337.5,
             data_vector[:, feat_list_all.index('aspect')] < 22.5)))] = 1
    northeast[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 22.5,
             data_vector[:, feat_list_all.index('aspect')] < 67.5)))] = 1
    east[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 67.5,
             data_vector[:, feat_list_all.index('aspect')] < 112.5)))] = 1
    southeast[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 112.5,
             data_vector[:, feat_list_all.index('aspect')] < 157.5)))] = 1
    south[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 157.5,
             data_vector[:, feat_list_all.index('aspect')] < 202.5)))] = 1
    southwest[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 202.5,
             data_vector[:, feat_list_all.index('aspect')] < 247.5)))] = 1
    west[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 247.5,
             data_vector[:, feat_list_all.index('aspect')] < 292.5)))] = 1
    northwest[np.where(
        np.logical_and.reduce(
            (data_vector[:, feat_list_all.index('aspect')] >= 292.5,
             data_vector[:, feat_list_all.index('aspect')] < 337.5)))] = 1
    names = [
        'north', 'northeast', 'east', 'southeast', 'south', 'southwest',
        'west', 'northwest'
    ]
    bins = len(names)
    for name in names:
        feats_disc.append(name)

    aspect = np.column_stack(
        [north, northeast, east, southeast, south, southwest, west, northwest])

    shape = data[:, :, 0:aspect.shape[1]].shape
    disc_nan = np.zeros(shape)
    disc_nan[~np.isnan(disc_nan)] = np.nan
    for bin in range(bins):
        disc_nan[rows, cols, bin] = aspect[:, bin]

    aspect = disc_nan

    del disc_nan, north, northeast, east, southeast, south, southwest, west, northwest

    edges = [22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5]

    all_edges = pd.concat([all_edges, pd.DataFrame(edges)], axis=0)

    # Get original discrete features
    orig_disc_inds = []
    for feat in non_cts_feats:
        orig_disc_inds.append(feat_list_all.index(feat))
    orig_disc_data = data[:, :, orig_disc_inds]

    # Combine with new discrete features
    new_disc_data = np.dstack([
        GSW_distSeasonal_disc, elevation_disc, slope_disc, twi_disc, spi_disc,
        sti_disc, curve, aspect
    ])
    data = np.dstack([new_disc_data, orig_disc_data])

    del orig_disc_data, new_disc_data

    # Combine all edges and features
    all_edges = all_edges.reset_index(drop=True)
    feature_edges = pd.concat(
        [all_edges, pd.DataFrame(data=feats_disc)], axis=1)
    feature_edges.columns = ['edge', 'feature']

    # If a feat has only zeros or 1s, it is removed
    data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
    std = data_vector[:, 0:data_vector.shape[1] - 2].std(0)

    remove_inds = []
    if 0 in std.tolist():
        zero_inds = np.where(std == 0)[0].tolist()
        for ind in zero_inds:
            remove_inds.append(ind)

    remove_inds = np.unique(remove_inds).tolist()

    feat_list_stack = feats_disc + non_cts_feats
    remove_feats = [feat_list_stack[ind] for ind in remove_inds]
    data_vector_keep = np.delete(data_vector, remove_inds, axis=1)
    feat_keep = [x for x in feat_list_stack if x not in remove_feats]

    feature_edges_keep = feature_edges[~feature_edges.feature.isin(remove_feats
                                                                   )]

    # Save feature class bin edges
    filedir = data_path / batch / 'class_bins'
    try:
        filedir.mkdir(parents=True)
    except FileExistsError:
        pass
    filename = filedir / '{}'.format('feature_edges.csv')
    feature_edges_keep.to_csv(filename, index=False)

    return data, data_vector_keep, data_ind, feat_keep, feature_edges_keep
Example #17
    plt.xlabel('Gamma value')
    plt.show()

    figure(num=None, figsize=(10, 8), dpi=100, facecolor='w', edgecolor='k')
    plt1.plot(C_data, C_data_train_score, color="red")
    plt1.plot(C_data, C_data_score, color="blue")
    plt1.ylabel('Accuracy Score')
    plt1.xlabel('C value')
    plt1.show()


#Load the three datasets and shuffle them
train_set = pd.read_csv("2018.csv")
train_set = train_set.fillna(0)
train_set = train_set.sample(frac=1)
enc_score = KBinsDiscretizer(n_bins=3, encode='ordinal')
X_binned = enc_score.fit_transform(train_set["Score"].values.reshape(-1, 1))
train_set['Score'] = X_binned

train_set1 = pd.read_csv("diamonds.csv")
train_set1 = train_set1.fillna(0)
train_set1 = train_set1.sample(frac=1)

train_set2 = pd.read_csv("winequality-red.csv")
train_set2 = train_set2.fillna(0)
train_set2 = train_set2.sample(frac=1)

# Parameter a is how much data I want to use in order to train and test my model. I choose the target and separate it from the input

a = 500
enc_y = OneHotEncoder(sparse=True)
Example #18
    def add_features(self, X, transform=False, index=None):
        '''Adds features to the already-learned ones at the given index; if index is not provided, they are appended at the end'''
        try:
            check_is_fitted(self)
        except NotFittedError as e:
            self.fit(X)
            if transform:
                return self.transform(X)
            return self
        X = X.copy()
        if isinstance(X, pd.DataFrame):
            numerical_features = X.select_dtypes("float")
            if len(numerical_features.columns):

                if self.discretize:
                    temp_discretizer = KBinsDiscretizer(
                        n_bins=self.n_intervals,
                        encode="ordinal",
                        strategy="quantile")
                    X.loc[:, numerical_features.
                          columns] = temp_discretizer.fit_transform(
                              numerical_features)
                    X.loc[:, numerical_features.
                          columns] = X.loc[:,
                                           numerical_features.columns].astype(
                                               int)
                    new_index = X.columns.get_indexer(
                        numerical_features.columns)
                    if index:
                        index_with_column = list(enumerate(index))
                        numerical_index = np.array(index)[new_index]
                        sort_index = np.argsort(numerical_index)
                        numerical_index_with_column = [
                            index_with_column[i] for i in numerical_index
                        ]
                        last = 0
                        for i in sort_index:
                            feature, list_insert_index = numerical_index_with_column[
                                i]
                            while last <= len(
                                    self.numerical_feature_index_
                            ) and list_insert_index < self.numerical_feature_index_[
                                    last]:
                                last += 1
                            self.numerical_feature_index_.insert(
                                last, list_insert_index)
                            self.discretizer.n_bins_ = np.insert(
                                self.discretizer.n_bins_,
                                last,
                                temp_discretizer.n_bins_[i],
                                axis=1)
                            self.discretizer.bin_edges_ = np.insert(
                                self.discretizer.n_bins_,
                                last,
                                temp_discretizer.bin_edges_[i],
                                axis=1)
                            for n in range(last + 1,
                                           len(self.numerical_feature_index_)):
                                self.numerical_feature_index_[n] += 1

                    else:
                        new_index = X.columns.get_indexer(
                            numerical_features.columns) + self.n_features
                        self.numerical_feature_index_.extend(new_index)
                        self.discretizer.n_bins_ = np.concatenate([
                            self.discretizer.n_bins_, temp_discretizer.n_bins_
                        ],
                                                                  axis=1)
                        self.discretizer.bin_edges_ = np.concatenate([
                            self.discretizer.bin_edges_,
                            temp_discretizer.bin_edges_
                        ],
                                                                     axis=1)
            X = X.to_numpy()

        if X.dtype == "O":
            X = X.astype(str)
        self.n_features += X.shape[1]
        new_categories = [np.unique(X[:, j]) for j in range(X.shape[1])]
        if index is not None:
            sort_index = np.argsort(index)
            index_with_column = list(enumerate(index))
            for i in sort_index:
                column, list_insert_index = index_with_column[i]
                self.categories_.insert(list_insert_index,
                                        new_categories[column])
        else:
            self.categories_.extend(new_categories)
        self.sort_index_ = [cat.argsort() for cat in self.categories_]
        self.sorted_categories_ = [
            self.categories_[j][self.sort_index_[j]]
            for j in range(self.n_features)
        ]
        self.sorted_encoded_ = [
            np.arange(self.categories_[j].shape[0])[self.sort_index_[j]]
            for j in range(self.n_features)
        ]
        self.unknown_values_ = [cat.shape[0] for cat in self.categories_]
        if transform:
            if index is None:
                index = list(
                    range(
                        len(self.categories_) - len(new_categories),
                        len(self.categories_)))
            return self.transform_columns(X, categories=index)
Example #19
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
Example #20
countries.Region = countries.Region.str.strip()


# In[6]:


# Question 1
q1 = list(countries.Region.unique())
q1.sort()


# In[7]:


# Question 2
disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')


# In[15]:


disc.fit(countries.Pop_density.values.reshape(-1,1))
Pop_disc = disc.transform(countries.Pop_density.values.reshape(-1,1))


# In[17]:


quart =  np.quantile(Pop_disc, 0.9)

Example #21
def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
Example #22
def q2():
    disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    disc.fit(countries.Pop_density.values.reshape(-1,1))
    Pop_disc = disc.transform(countries.Pop_density.values.reshape(-1,1))
    quart =  np.quantile(Pop_disc, 0.9)
    return int(Pop_disc[Pop_disc > quart].size)
Example #23

def discretization(_database, json_obj, _db_name='', _save_path=None):
    
    if isinstance(_database,str):
        _database = pd.read_csv(_database, header=0, sep=',')    
    
    db = _database.copy()
    
    # remove old VAR_TIME_NAME from features to discretize
    if json_obj['_survivalAttr']['survivalTime_name'] in json_obj['_cols2disc']:
        json_obj['_cols2disc'].remove(json_obj['_survivalAttr']['survivalTime_name'])
        
    # return if there is no col to discretize
    if not json_obj['_cols2disc']:
        # save db as db_disc
        _save_db(db, _db_name+'_disc', _save_path)
        # save log discretization
        with open(_save_path+'{}_log_discretization.json'.format(_db_name), 'w') as f:
            json.dump('! This data set has no features for discretization',f)
        return
    
    # discretization process: iterate over all columns
    log_discretization = {}
    db_disc = db.copy()
    for col_name in json_obj['_cols2disc']:
        col_log = {}
        
        # data shape to discretize
        data = np.array(db[col_name]).reshape(-1, 1)
        
        try:
            discretizer = KBinsDiscretizer(n_bins=N_BINS, encode=ENCODE, strategy=STRATEGY)
            discretizer.fit(data)
            # data discretized/encoded
            data_encoded = pd.Series(discretizer.transform(data).reshape(1,-1)[0]).astype('int64')
        except Exception:
            discretizer = KBinsDiscretizer(n_bins=N_BINS, encode=ENCODE, strategy='quantile')
            discretizer.fit(data)
            # data discretized/encoded
            data_encoded = pd.Series(discretizer.transform(data).reshape(1,-1)[0]).astype('int64')      
        
        
        # categories representation
        categories = sorted(data_encoded.unique().tolist())
        bin_edges = list(discretizer.bin_edges_[0])
        map_names = {}
        for idx,ctg in enumerate(categories):
            if idx+1 == len(categories):
                string = '[{:0.2f},{:0.2f}]'.format(bin_edges[idx],bin_edges[idx+1])
            else:
                string = '[{:0.2f},{:0.2f})'.format(bin_edges[idx],bin_edges[idx+1])
            map_names[ctg] = string
        
        # data decodification
        data_ctg = data_encoded.map(map_names)
        db_disc[col_name] = data_ctg.astype('category')

        # register log for discretized column
        col_log['discretizer_params'] = discretizer.get_params().copy()
        col_log['bin_edges'] = bin_edges
        col_log['dict_categories_names'] = map_names
        col_log['data_orig'] = data.tolist()
        col_log['data_encoded'] = data_encoded.tolist()
        col_log['data_categories'] = data_ctg.tolist()
        log_discretization[col_name] = col_log.copy()
    
    # save discretized database
    _save_db(db_disc, _db_name+'_disc', _save_path)
    
    # save log discretization
    with open(_save_path+'{}_log_discretization.json'.format(_db_name), 'w') as f:
        json.dump(log_discretization,f)
        
    return
Example #24
def q2():
    # Return the result of question 2 here.
    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    bins_score =  est.fit_transform(countries['Pop_density'].values.reshape(-1, 1))

    return int((bins_score >= 9).sum())
Example #25

from sklearn.preprocessing import (
    Binarizer,
    FunctionTransformer,
    KBinsDiscretizer,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    PowerTransformer,
    StandardScaler,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

_transformers = [
    Binarizer(threshold=2),
    KBinsDiscretizer(n_bins=3, encode="ordinal"),
    StandardScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    FunctionTransformer(np.log, validate=True),
    OrdinalEncoder(),
]

_selectors = [
    SelectFromModel(Lasso(random_state=1)),
    SelectKBest(f_regression, k=2),
    VarianceThreshold(),
    RFE(Lasso(random_state=1)),
]
Example #26
 def fit(self, X, y=None):
     self.kbd = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode,strategy=self.strategy)
     self.kbd.fit(X[self.cols])
     return self
Example #27

#
# In order to fit linear models with those predictors it is therefore
# necessary to perform standard feature transformations as follows:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False), StandardScaler())

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        (
            "onehot_categorical",
            OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
        ),
    ],
    remainder="drop",
)

# %%
# A constant prediction baseline
# ------------------------------
#
# It is worth noting that more than 93% of policyholders have zero claims. If
Example #28
        ax.set_title('Input Data', size=14)

    xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
                         np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
    grid = np.c_[xx.ravel(), yy.ravel()]

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())

    ax.set_xticks(())
    ax.set_yticks(())

    i += 1

    for strategy in strategies:
        enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
        enc.fit(X)
        grid_enc = enc.transform(grid)

        ax = plt.subplot(len(X_list), len(strategies) + 1, i)

        # horizontal stripes
        horizontal = grid_enc[:, 0].reshape(xx.shape)
        ax.contourf(xx, yy, horizontal, alpha=0.5)

        # vertical stripes
        vertical = grid_enc[:, 1].reshape(xx.shape)
        ax.contourf(xx, yy, vertical, alpha=0.5)

        ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
Example #29
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LogisticRegression(random_state=0)), {
                       'kbinsdiscretizer__n_bins': np.arange(2, 10),
                       'logisticregression__C': np.logspace(-2, 7, 10),
                   }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LinearSVC(random_state=0)), {
                       'kbinsdiscretizer__n_bins': np.arange(2, 10),
                       'linearsvc__C': np.logspace(-2, 7, 10),
                   }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
Example #30

def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    assert_raise_message(ValueError, "Valid options for 'strategy' are "
                         "('uniform', 'quantile', 'kmeans'). "
                         "Got strategy='invalid-strategy' instead.",
                         est.fit, X)