Example No. 1
def test_transform_1d_behavior():
    X = np.arange(4)
    est = KBinsDiscretizer(n_bins=2)
    assert_raises(ValueError, est.fit, X)

    est = KBinsDiscretizer(n_bins=2)
    est.fit(X.reshape(-1, 1))
    assert_raises(ValueError, est.transform, X)
Example No. 2
def test_transform_outside_fit_range(strategy):
    X = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X)

    X2 = np.array([-2, 5])[:, None]
    X2t = kbd.transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.min(axis=0), [0])
Example No. 3
def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
Example No. 4
def test_fit_transform_n_bins_array(strategy, expected):
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
                           strategy=strategy).fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features, )
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1, )
def test_percentile_numeric_stability():
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)
    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
                           strategy='quantile')
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)
Example No. 6
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
Example No. 7
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
    X = np.array([0, 1, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())
Example No. 8
def test_same_min_max(strategy):
    warnings.simplefilter("always")
    X = np.array([[1, -2],
                  [1, -1],
                  [1, 0],
                  [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
    assert_warns_message(UserWarning,
                         "Feature 0 is constant and will be replaced "
                         "with 0.", est.fit, X)
    assert est.n_bins_[0] == 1
    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
Example No. 9
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)
    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
Example No. 10
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot-dense' "
                         "instead.", est.inverse_transform, Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot' "
                         "instead.", est.inverse_transform, Xt_2)
Example No. 11
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
Example No. 12
train = train[train['c_charge_degree'] != "O"]
# We filtered the underlying data from Broward county to include only those rows representing people who had either
# recidivated in two years, or had at least two years outside of a correctional facility.
train = train[train['score_text'] != 'N/A']

train = train.replace('Medium', "Low")
test = test.replace('Medium', "Low")

train_labels = label_binarize(train['score_text'], classes=['High', 'Low'])
test_labels = label_binarize(test['score_text'], classes=['High', 'Low'])

impute_and_onehot = Pipeline([
    ('imputer1', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
impute_and_bin = Pipeline([('imputer2', SimpleImputer(strategy='mean')),
                           ('discretizer',
                            KBinsDiscretizer(n_bins=4,
                                             encode='ordinal',
                                             strategy='uniform'))])

compas_featurizer = ColumnTransformer(
    transformers=[('impute1_and_onehot', impute_and_onehot,
                   ['is_recid']), ('impute2_and_bin', impute_and_bin,
                                   ['age'])])
compas_pipeline = Pipeline([('features', compas_featurizer),
                            ('classifier', LogisticRegression())])

compas_pipeline.fit(train, train_labels.ravel())
print(compas_pipeline.score(test, test_labels.ravel()))
        ax.set_title("Input data", size=14)

    xx, yy = np.meshgrid(
        np.linspace(X[:, 0].min(), X[:, 0].max(), 300),
        np.linspace(X[:, 1].min(), X[:, 1].max(), 300))
    grid = np.c_[xx.ravel(), yy.ravel()]

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())

    i += 1
    # transform the dataset with KBinsDiscretizer
    for strategy in strategies:
        enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy)
        enc.fit(X)
        grid_encoded = enc.transform(grid)

        ax = plt.subplot(len(X_list), len(strategies) + 1, i)

        # horizontal stripes
        horizontal = grid_encoded[:, 0].reshape(xx.shape)
        ax.contourf(xx, yy, horizontal, alpha=.5)
        # vertical stripes
        vertical = grid_encoded[:, 1].reshape(xx.shape)
        ax.contourf(xx, yy, vertical, alpha=.5)

        ax.scatter(X[:, 0], X[:, 1], edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
Example No. 14
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
def bucketize_y(y, num_buckets):
    binner = KBinsDiscretizer(n_bins=num_buckets, encode='ordinal')
    cols = y.columns
    y = binner.fit_transform(y)
    y = pd.DataFrame(data=y, columns=cols)
    return y
Example No. 16
for cat in categorical_features:
    numerical_columns.remove(cat)
for tcat in time_columns:
    numerical_columns.remove(tcat)
for bcat in bin_features:
    numerical_columns.remove(bcat)

transformed_cols = [f"{col}_transformed" for col in time_columns]

timefeat = FunctionTransformer(func=make_time_features,
                               kw_args=dict(columns=time_columns),
                               validate=False)
# droptimefeat=FunctionTransformer(func=drop_time_columns,kw_args=dict(columns=time_columns),validate=False)

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

bin_transformer = Pipeline(steps=[("bin_transformer", KBinsDiscretizer())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
print(time_columns)
print(time_columns + transformed_cols)
preprocessor = ColumnTransformer(transformers=[
    ('timefeat', timefeat, time_columns),
    ('num', numeric_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_features),
    ('bin', bin_transformer, bin_features),
])

train1 = preprocessor.fit_transform(train.iloc[:, :-1])
test1 = preprocessor.transform(test)
Example No. 17
def create_criteo_dataset(file,
                          embed_dim=8,
                          read_part=True,
                          sample_num=100000,
                          test_size=0.2):
    """
    a example about creating criteo dataset
    :param file: dataset's path
    :param embed_dim: the embedding dimension of sparse features
    :param read_part: whether to read part of it
    :param sample_num: the number of instances if read_part is True
    :param test_size: ratio of test dataset
    :return: feature columns, train, test
    """
    names = [
        'label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10',
        'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
        'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18',
        'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'
    ]

    if read_part:
        data_df = pd.read_csv(file,
                              sep='\t',
                              iterator=True,
                              header=None,
                              names=names)
        data_df = data_df.get_chunk(sample_num)

    else:
        data_df = pd.read_csv(file, sep='\t', header=None, names=names)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    features = sparse_features + dense_features

    data_df[sparse_features] = data_df[sparse_features].fillna('-1')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Bin continuous data into intervals.
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    for feat in sparse_features:
        le = LabelEncoder()
        data_df[feat] = le.fit_transform(data_df[feat])

    # ==============Feature Engineering===================

    # ====================================================
    feature_columns = [
        sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)
        for feat in features
    ]
    train, test = train_test_split(data_df, test_size=test_size)

    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)
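

# A minimal usage sketch (not part of the original snippet): it assumes a
# Criteo sample at 'data/criteo_sample.txt' and the sparseFeature helper
# imported as in the function above.
if __name__ == '__main__':
    feature_columns, (train_X, train_y), (test_X, test_y) = create_criteo_dataset(
        'data/criteo_sample.txt', embed_dim=8, read_part=True, sample_num=10000)
    print(len(feature_columns), train_X.shape, test_X.shape)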
Example No. 18
def __init__(self, n_bins=15):
    self.n_bins = n_bins
    self.binarizer = KBinsDiscretizer(n_bins=self.n_bins,
                                      encode='onehot-dense')
Example No. 19
def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    assert_raise_message(
        ValueError, "Valid options for 'strategy' are "
        "('uniform', 'quantile', 'kmeans'). "
        "Got strategy='invalid-strategy' instead.", est.fit, X)
Example No. 20
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
def get_new_base_enc():
    return [
        KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
        for _ in range(LatLongScalarEnc.cont_dim)
    ]
Example No. 22
def test_invalid_encode_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
    assert_raise_message(
        ValueError, "Valid options for 'encode' are "
        "('onehot', 'onehot-dense', 'ordinal'). "
        "Got encode='invalid-encode' instead.", est.fit, X)
def get_new_base_enc():
    return KBinsDiscretizer(n_bins=8,
                            encode='ordinal',
                            strategy='quantile')
Example No. 24
# Selecting the unique Regions in alphabetical order
regioes = countries['Region'].sort_values().unique()
regioes = list(regioes)
regioes

# # Data Analysis: Question 2

# In[ ]:

from sklearn.preprocessing import KBinsDiscretizer

# In[31]:

# Applying KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=10,
                               encode="ordinal",
                               strategy="quantile")
discretizer.fit(countries[['Pop_density']])
# Getting an array with the transformed data
score_bins = discretizer.transform(countries[["Pop_density"]])
score_bins

# In[32]:

# Finding the 90th percentile
q_90 = np.quantile(score_bins, 0.9)
q_90

# In[33]:

# Counting how many values are above the 90th percentile
Example No. 25
def test_invalid_n_bins_array():
    # Bad shape
    n_bins = np.full((2, 4), 2.)
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 3. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 2. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
Example No. 26
print(bin_cols)

# Now we move on to preprocessing the dataset

si_cat_step = ('si1', SimpleImputer(strategy='constant', fill_value='MISSING'))
ohe_cat_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
cat_steps = [si_cat_step, ohe_cat_step]
cat_pipe = Pipeline(cat_steps)

si_num_step = ('si2', SimpleImputer(strategy='mean'))
ss_num_step = ('ss', StandardScaler())
num_steps = [si_num_step, ss_num_step]
num_pipe = Pipeline(num_steps)

si_bin_step = ('si3', SimpleImputer(strategy='median'))
kb_bin_step = ('kb', KBinsDiscretizer(encode='onehot-dense'))
bin_steps = [si_bin_step, kb_bin_step]
bin_pipe = Pipeline(bin_steps)

transformers = [('cat', cat_pipe, cat_cols), ('num', num_pipe, num_cols),
                ('bin', bin_pipe, bin_cols)]
ct = ColumnTransformer(transformers=transformers)
Z = ct.fit_transform(X)
print(Z.shape)
"""
Ahora que tenemos nuestro dataset pre-procesado realizaremos 2 algoritmos de ML para nuestra
regresión: 1) Red Neuronal 2) Random Forest. En ambos casos el tuneo de hiperparámetros se
realizará a través de las herramientas de Grid Search y Cross Validation. Sobre los modelos
con mejores parámetros se tomará la métrica de R2 para determinar que modelo predice mejor
el salario de los empleados.
"""
Example No. 27
def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(np.int)
Example No. 28
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line,
         reg.predict(line),
         linewidth=2,
         color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line,
         reg.predict(line),
         linewidth=2,
Example No. 29
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
def eval_cv(ims_save_name, cnn_save_name, features, load_ECG, ims_loaded_vars,
            cnn_loaded_vars, ims_save_dir, cnn_save_dir, device, conf_thresh,
            k, use_svm, norm, prune):
    params = ims_loaded_vars["params"]
    seed = params.seed
    np.random.seed(seed)
    n_splits = params.cv_splits
    n_repeats = params.cv_repeats

    cnn_params = cnn_loaded_vars["params"]
    use_norm = True if hasattr(cnn_params,
                               "use_norm") and cnn_params.use_norm else False
    batch_size = cnn_params.batch_size

    print("{:>40}  {:d}".format("Cross validation splits:", n_splits))
    print("{:>40}  {:d}".format("Cross validation repeats:", n_repeats))

    ims_x = features[:, :13]
    ims_y = features[:, 13:15]

    raw_x = load_ECG['raw_x']
    target = torch.tensor(load_ECG['target'])

    fft_x0 = scipy.fftpack.fft(raw_x[:, 0].numpy())
    fft_x0 = np.abs(fft_x0[:, :raw_x.shape[2] // 2])
    fft_x1 = scipy.fftpack.fft(raw_x[:, 1].numpy())
    fft_x1 = np.abs(fft_x1[:, :raw_x.shape[2] // 2])

    nf1 = np.mean(fft_x0, axis=-1)
    nf2 = np.mean(fft_x1, axis=-1)
    nf3 = np.max(raw_x[:, 0].numpy(), axis=-1)  # / 11
    nf4 = np.max(raw_x[:, 1].numpy(), axis=-1)  # / 11
    nf5 = np.min(raw_x[:, 0].numpy(), axis=-1)  # / 11
    nf6 = np.min(raw_x[:, 1].numpy(), axis=-1)  # / 15

    ims_x = np.append(ims_x,
                      np.transpose([nf1, nf2, nf3, nf4, nf5, nf6]),
                      axis=1)
    # ims_x = ims_x[:,[0, 2, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18]] # stable
    ims_x = ims_x[:, [0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16,
                      17]]  # mid, last - 512

    # plt.plot(fft_x[802])

    # plt.scatter(np.mean(fft_x0, axis=-1), np.mean(fft_x1, axis=-1), c=target)
    # plt.show()
    # exit()

    assert (ims_y[:, 1] == target.numpy()).all()

    data_tag = load_ECG['data_tag']
    raw_feat = raw_x.shape[1]
    raw_size = raw_x.shape[2]
    num_classes = len(np.unique(target))

    # rel_y = predict_reliability(ims_x, ims_y[:,1], k)

    rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                   n_repeats=n_repeats,
                                   random_state=seed)
    ims_tp = np.zeros(n_splits * n_repeats)
    ims_fp = np.zeros(n_splits * n_repeats)
    ims_acc = np.zeros(n_splits * n_repeats)
    cnn_tp = np.zeros_like(ims_tp)
    cnn_fp = np.zeros_like(ims_fp)
    cnn_acc = np.zeros_like(ims_acc)
    nums_total = np.zeros_like(ims_tp)
    nums_pos = np.zeros_like(nums_total)
    nums_neg = np.zeros_like(nums_total)
    nums_cnn = np.zeros_like(nums_total)
    nums_pos_cnn = np.zeros_like(nums_total)
    nums_neg_cnn = np.zeros_like(nums_total)
    conf = np.ones_like(nums_cnn)
    three_class = True
    use_pca = False
    use_tree = True
    rf_size = 10
    rf_seed = 1
    rf_depth = np.empty(0)
    rf_params = np.empty(0)
    # ims_x = ims_x[:,12]
    # ims_x = ims_x.reshape(-1,1)

    # ims_x = np.log(ims_x + 1)
    # pca = PCA()
    # ims_x = pca.fit_transform(ims_x)
    # df = pd.DataFrame(ims_x)
    # scatter_matrix(df, diagonal="kde", alpha=0.2, c=ims_y[:,1])
    # plt.show()
    # # for feat in range(ims_x.shape[1]):
    # #     plt.subplot(7, 2, feat + 1)
    # #     plt.plot(ims_x[:,feat])
    # # plt.show()
    # exit()

    # selector = SelectFromModel(RandomForestClassifier(random_state=rf_seed, n_estimators=rf_size, ccp_alpha=1.1e-4), threshold=-np.inf, max_features=13)
    # feat_ctr = Counter()
    maxint = 12800  #np.iinfo(np.uint32).max
    quant = KBinsDiscretizer(n_bins=maxint,
                             encode="ordinal",
                             strategy="kmeans")

    for cv_idx, (trn_idx, tst_idx) in enumerate(rskf.split(ims_x, ims_y[:,
                                                                        1])):
        # trn_idx, val_idx = train_test_split(trn_idx, test_size=len(tst_idx), stratify=target[trn_idx], random_state=seed)
        val_idx = None

        cv_save = "{}{}".format(ims_save_name[:-1], cv_idx)
        x_trn = ims_x[trn_idx]
        y_trn = ims_y[trn_idx]
        # rel_trn = predict_reliability(x_trn, y_trn[:,1], k)
        x_tst = ims_x[tst_idx]
        y_tst = ims_y[tst_idx]

        if use_pca:
            pca = PCA()
            pca.fit(x_trn)
            x_trn = pca.transform(x_trn)
            x_tst = pca.transform(x_tst)

        if norm:
            m_trn = x_trn.mean(axis=0)
            v_trn = x_trn.std(axis=0)

            x_trn = (x_trn - m_trn) / v_trn
            x_tst = (x_tst - m_trn) / v_trn

        # rel_trn = predict_reliability_simplified(x_trn, y_trn[:,1], x_tst, k)
        if three_class:
            if use_tree:
                rel_trn = predict_3_class(x_trn, y_trn[:, 1], k)
            else:
                rel_trn = predict_3_class_simplified(x_trn, y_trn[:, 1], x_tst,
                                                     k)
        else:
            rel_trn = y_trn[:, 1]

        nums_total[cv_idx] = len(tst_idx)
        nums_pos[cv_idx] = (y_tst[:, 1] == 1).sum()
        nums_neg[cv_idx] = (y_tst[:, 1] == 0).sum()

        if len(rel_trn) > 0:
            if use_svm:
                svm = train_svm(x_trn, rel_trn)
                rel_mask = svm.predict(x_tst).astype(bool)
            elif use_tree:
                # dt = tree.DecisionTreeClassifier(random_state=rf_seed, max_depth=7)
                selected_trn = x_trn
                selected_tst = x_tst
                # selected_trn = selector.fit_transform(x_trn, rel_trn)
                # selected_tst = selector.transform(x_tst)
                # feat_ctr.update(np.where(selector.get_support())[0])
                # selected_trn = quant.fit_transform(selected_trn)
                # selected_tst = quant.transform(selected_tst)
                # selected_trn = (selected_trn * 100000000).astype(np.int32)
                # selected_tst = (selected_tst * 100000000).astype(np.int32)
                # print(selected_tst)
                # exit()
                # print(np.where(selector.get_support()))
                # print("before: {}, after: {}".format(x_tst.shape, selected_tst.shape))
                dt = RandomForestClassifier(random_state=rf_seed,
                                            n_estimators=rf_size,
                                            ccp_alpha=4.0e-4,
                                            max_depth=30)
                dt.fit(selected_trn, rel_trn)
                # temp = dt.estimators_[0].tree_.threshold.astype(np.int32)
                # dt.estimators_[0].tree_.threshold[:] = temp
                internal = [[
                    estimator.tree_.feature, estimator.tree_.threshold,
                    estimator.tree_.children_left,
                    estimator.tree_.children_right,
                    np.argmax(estimator.tree_.value[estimator.tree_.feature ==
                                                    -2][:, 0],
                              axis=-1)
                ] for estimator in dt.estimators_]
                # tree_summary(dt.estimators_[0])
                # print(internal)
                # print(dt.estimators_[0].tree_.children_right)
                # print(np.argmax(dt.estimators_[0].tree_.value[dt.estimators_[0].tree_.feature == -2][:,0], axis=-1))
                # print(dt.estimators_[0].tree_.node_count)
                # print(help(sklearn.tree._tree.Tree))
                # exit()
                # dump(internal, open("000_rf/3c_rf{}_nf_norm_k2_cv{}.p".format(rf_size, cv_idx), "wb"))
                rf_params = np.append(
                    rf_params,
                    np.sum([
                        estimator.tree_.node_count
                        for estimator in dt.estimators_
                    ]))
                rf_depth = np.append(rf_depth, [
                    estimator.tree_.max_depth for estimator in dt.estimators_
                ])
                # rf_depth = dt.tree_.max_depth
                order = get_rf_order(dt, selected_trn, rel_trn, "pred")
                pred = predict_rf_sorted(dt, selected_tst, order)
                # pred = dt.predict(selected_tst)
                rel_mask = pred != 2
                rel_trn = pred
            else:
                # rel_mask = predict_reliability(x_trn, rel_trn, k-1, x_tst=x_tst)
                if three_class:
                    rel_mask = rel_trn != 2
                else:
                    knn = KNeighborsClassifier(n_neighbors=k)
                    knn.fit(x_trn, rel_trn)
                    rel_trn = knn.predict(x_tst)
                    rel_mask = np.ones_like(rel_trn).astype(bool)

            x_rel = x_tst[rel_mask]
            y_rel = y_tst[rel_mask]

            if len(x_rel) > 0:
                # ims_tp[cv_idx], ims_fp[cv_idx], ims_acc[cv_idx], below_thresh = fraunhofer_test.evaluate(x_rel, y_rel, cv_save, ims_save_dir, conf_thresh=conf_thresh, print_results=False)
                ims_tp[cv_idx] = ((rel_trn == y_tst[:, 1]) &
                                  (rel_trn == 1)).sum().item()
                ims_fp[cv_idx] = ((rel_trn != y_tst[:, 1]) &
                                  (rel_trn == 1)).sum().item()
                ims_acc[cv_idx] = (
                    rel_trn == y_tst[:, 1]).astype(int).sum().item()

            tst_idx = tst_idx[np.invert(rel_mask)]

        nums_cnn[cv_idx] = len(tst_idx)
        nums_pos_cnn[cv_idx] = (ims_y[tst_idx, 1] == 1).sum()
        nums_neg_cnn[cv_idx] = (ims_y[tst_idx, 1] == 0).sum()

        if len(tst_idx) == 0:
            continue

        jobs = 0
        orig_device = None
        if device.type == "cuda":
            gpu_mem = torch.cuda.get_device_properties(device).total_memory
            data_size = sys.getsizeof(raw_x.storage()) + sys.getsizeof(
                target.storage())
            if data_size >= gpu_mem * 0.85:  # 85% of total memory, just a guess
                jobs = os.cpu_count()
                orig_device = device
                device = torch.device("cpu")

        ecg_datasets = create_datasets_cv(raw_x, target, trn_idx, val_idx,
                                          tst_idx, use_norm, device)

        trn_dl, val_dl, tst_dl = create_loaders(ecg_datasets,
                                                bs=batch_size,
                                                jobs=jobs)

        if orig_device:
            device = orig_device

        cv_save = "{}{}".format(cnn_save_name[:-1], cv_idx)

        model = torch.load(os.path.join(cnn_save_dir,
                                        "train_" + cv_save + '_best.pth'),
                           map_location=device)

        if prune > 0:
            model = pruning.prune_fc(model, prune)

        (cnn_tp[cv_idx], _), (cnn_fp[cv_idx],
                              _), cnn_acc[cv_idx], _, _ = evaluation.evaluate(
                                  model,
                                  tst_dl,
                                  tst_idx,
                                  data_tag,
                                  device=device,
                                  slide=False,
                                  print_results=False)
    # cnn_tp = nums_pos_cnn
    # cnn_fp = nums_neg_cnn
    # cnn_acc = nums_pos_cnn

    # flops, params = get_model_complexity_info(model, (raw_feat, raw_size), as_strings=False, print_per_layer_stat=False)

    # print("{:>40}  {:.2f} seconds".format("Mean elapsed test time:", elapsed.mean()))
    nums_ims = nums_total - nums_cnn
    nums_pos_ims = nums_pos - nums_pos_cnn
    nums_neg_ims = nums_neg - nums_neg_cnn

    # # IMS-only
    # acc = ims_acc / nums_ims
    # tp = ims_tp / nums_pos_ims
    # fp = ims_fp / nums_neg_ims

    # # CNN-only
    # acc = cnn_acc / nums_cnn
    # tp = cnn_tp / nums_pos_cnn
    # fp = cnn_fp / nums_neg_cnn

    # Full
    acc = (ims_acc + cnn_acc) / nums_total
    tp = (ims_tp + cnn_tp) / nums_pos
    fp = (ims_fp + cnn_fp) / nums_neg

    conf = conf - (nums_cnn / nums_total)

    # print("{:>40}  {:.2%}".format("Total data labeled as reliable:", rel_y.sum() / ims_y.shape[0]))
    # print("{:>40}  {}".format("Best Features:", sorted([x[0] for x in feat_ctr.most_common(13)])))

    if (nums_ims != nums_total).any():
        print("{:>40}  {:.2%}".format("Min IMS-net data:", conf.min()))
        print("{:>40}  {:.2%}".format("Max IMS-net data:", conf.max()))
        print("{:>40}  {:.2%}".format("Mean IMS-net data:", conf.mean()))
        print("{:>40}  {:.2%}".format("IMS-net data standard deviation:",
                                      conf.std()))

    print("{:>40}  {:.2%}".format("Min test accuracy:", acc.min()))
    print("{:>40}  {:.2%}".format("Max test accuracy:", acc.max()))
    print("{:>40}  {:.2%}".format("Mean test accuracy:", acc.mean()))
    print("{:>40}  {:.2%}".format("Test accuracy standard deviation:",
                                  acc.std()))

    print("{:>40}  {:.2%}".format("Min TP rate:", np.nanmin(tp)))
    print("{:>40}  {:.2%}".format("Max TP rate:", np.nanmax(tp)))
    print("{:>40}  {:.2%}".format("Mean TP rate:", np.nanmean(tp)))
    print("{:>40}  {:.2%}".format("TP rate standard deviation:",
                                  np.nanstd(tp)))

    print("{:>40}  {:.2%}".format("Min FP rate:", fp.min()))
    print("{:>40}  {:.2%}".format("Max FP rate:", fp.max()))
    print("{:>40}  {:.2%}".format("Mean FP rate:", fp.mean()))
    print("{:>40}  {:.2%}".format("FP rate standard deviation:", fp.std()))

    if use_tree:
        print("{:>40}  {:.0f}".format("Min RF params:", rf_params.min()))
        print("{:>40}  {:.0f}".format("Max RF params:", rf_params.max()))
        print("{:>40}  {:.2f}".format("Mean RF params:", rf_params.mean()))

        print("{:>40}  {:.0f}".format("Min RF max_depth:", rf_depth.min()))
        print("{:>40}  {:.0f}".format("Max RF max_depth:", rf_depth.max()))
        print("{:>40}  {:.2f}".format("Mean RF max_depth:", rf_depth.mean()))

    print("{:>40}  {}".format("Min TP > 90+std:", tp.min() > 0.9 + tp.std()))
    print("{:>40}  {}".format("Mean TP > 90+4*std:",
                              tp.mean() > 0.9 + (4 * tp.std())))
    print("{:>40}  {}".format("Max FP < 20-std:", fp.max() < 0.2 - tp.std()))
    print("{:>40}  {}".format("Mean FP < 20-4*std:",
                              fp.mean() < 0.2 - (4 * fp.std())))

    # print('{:>40}  {:d}'.format('Number of parameters:', params))
    # print('{:>40}  {:.0f}'.format('Computational complexity:', flops))

    df = pd.DataFrame({
        "Total-Acc": acc * 100,
        "Total-TP": tp * 100,
        "Total-FP": fp * 100,
        "IMS-Acc": ims_acc / nums_ims * 100,
        "IMS-TP": ims_tp / nums_pos_ims * 100,
        "IMS-FP": ims_fp / nums_neg_ims * 100,
        "CNN-Acc": cnn_acc / nums_cnn * 100,
        "CNN-TP": cnn_tp / nums_pos_cnn * 100,
        "CNN-FP": cnn_fp / nums_neg_cnn * 100
    })
Example No. 31
# %% [markdown]
# We recall that a way of accelerating the gradient boosting is to reduce the
# number of splits considered while building the trees. One way is to bin the
# data before feeding them to the gradient boosting. A transformer called
# `KBinsDiscretizer` performs such a transformation. Thus, we can pipeline
# this preprocessing with the gradient boosting.
#
# We can first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256,
                               encode="ordinal",
                               strategy="quantile")
data_trans = discretizer.fit_transform(data)
data_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins with regard to the dispersion of
# those features. The smallest bins will be removed.
# ```
# We see that the discretizer transforms the original data into integers. Each
# integer represents the bin index obtained from the quantile binning. We can
# check the number of bins per feature, as sketched below.

# %%
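# Not part of the original notebook: a minimal way to check the number of bins
# kept for each feature is to inspect the fitted bin edges (the n_bins_
# attribute reports the same information).
[len(edges) - 1 for edges in discretizer.bin_edges_]

# %% [markdown]
# Below is a hedged sketch of the pipeline mentioned above, assuming a
# `HistGradientBoostingRegressor` as the boosting model and a `target` vector
# aligned with `data` (neither is shown in this excerpt).

# %%
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

model = make_pipeline(
    KBinsDiscretizer(n_bins=256, encode="ordinal", strategy="quantile"),
    HistGradientBoostingRegressor())
model.fit(data, target)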
Example No. 32
def q2():
    kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    pop_density_discrete = kbins.fit_transform(
        countries['Pop_density'].values.reshape(-1, 1))
    return int((pop_density_discrete == 9).sum())
Example No. 33
def __init__(self, **hyperparams):
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**self._hyperparams)
Example No. 34
    df.drop_duplicates(inplace=True)
    df.replace(class_labels, [0, 1], inplace=True)

    negative_examples, positive_examples = np.bincount(df["income"])

    split_ratio = args.train_test_split_ratio
    X_train, X_test, y_train, y_test = train_test_split(df.drop("income",
                                                                axis=1),
                                                        df["income"],
                                                        test_size=split_ratio,
                                                        random_state=0)

    preprocess = make_column_transformer(
        (
            ["age", "num persons worked for employer"],
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
        ),
        (["capital gains", "capital losses", "dividends from stocks"
          ], StandardScaler()),
        (["education", "major industry code", "class of worker"
          ], OneHotEncoder(sparse=False)),
    )
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)

    train_features_output_path = os.path.join("/opt/ml/processing/train",
                                              "train_features.csv")
    train_labels_output_path = os.path.join("/opt/ml/processing/train",
                                            "train_labels.csv")

    test_features_output_path = os.path.join("/opt/ml/processing/test",
Example No. 35

# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with `KBinsDiscretizer`, using `ordinal` encoding and the `quantile` strategy, how many countries are above the 90th percentile? Answer with a single integer scalar.

# In[93]:

from sklearn.preprocessing import (OneHotEncoder, Binarizer, KBinsDiscretizer,
                                   MinMaxScaler, StandardScaler,
                                   PolynomialFeatures)

# In[94]:

discretizer = KBinsDiscretizer(n_bins=10,
                               encode='ordinal',
                               strategy='quantile')
discretizer.fit(countries[["Pop_density"]])

# In[95]:

discretizer.bin_edges_[0]

# In[102]:


def q2():
    return countries["Pop_density"][
        countries.Pop_density >= discretizer.bin_edges_[0][9]].count()

q1()

# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with `KBinsDiscretizer`, using `ordinal` encoding and the `quantile` strategy, how many countries are above the 90th percentile? Answer with a single integer scalar.

# In[16]:

countries.head()

# In[17]:

from sklearn.preprocessing import KBinsDiscretizer

discretizador = KBinsDiscretizer(n_bins=10,
                                 encode='ordinal',
                                 strategy='quantile')

discretizador.fit(countries[['Pop_density']])

popDensity_disc = discretizador.transform(countries[['Pop_density']])

popDensity_disc

# In[18]:

np.unique(popDensity_disc.flatten())

# In[19]:

pd.Series(popDensity_disc.flatten()).value_counts()[9.0]
Example No. 37
def preprocess_dataset(df,
                       pipeline_path,
                       load=False) -> (np.ndarray, np.ndarray):

    if load:
        with open(pipeline_path, "rb") as file:
            components = pickle.load(file)

    # 1)
    reply = df["reply_timestamp"].notnull().astype(int).to_numpy()
    retweet = df["retweet_timestamp"].notnull().astype(int).to_numpy()
    retweet_with_comment = df["retweet_with_comment_timestamp"].notnull(
    ).astype(int).to_numpy()
    like = df["like_timestamp"].notnull().astype(int).to_numpy()

    response = np.column_stack((reply, retweet, retweet_with_comment, like))

    # 2)
    if load:
        language = components['language_encoder'].transform(
            df["language"].to_numpy().reshape(-1, 1))
        tweet_type = components["tweet_type_encoder"].transform(
            df["tweet_type"].to_numpy().reshape(-1, 1))
        present_media = components["present_media_encoder"].transform(
            df["present_media"])
    else:
        language_encoder = OneHotEncoder()
        language = language_encoder.fit_transform(
            df["language"].to_numpy().reshape(-1, 1))
        tweet_type_encoder = OneHotEncoder()
        tweet_type = tweet_type_encoder.fit_transform(
            df["tweet_type"].to_numpy().reshape(-1, 1))
        present_media_encoder = MultiLabelBinarizer(sparse_output=False)
        present_media = present_media_encoder.fit_transform(
            df["present_media"])

    tweet_features = sp.hstack([language, tweet_type, present_media])

    #3)
    if load:
        text_tokens = components["text_tfidf"].transform(df['text_tokens'])
    else:
        text_tfidf = TfidfVectorizer()
        text_tokens = text_tfidf.fit_transform(df['text_tokens'])

    #4)
    if load:
        hashtags = components["hashtags_tfidf"].transform(df['hashtags'])
    else:
        hashtags_tfidf = TfidfVectorizer()
        hashtags = hashtags_tfidf.fit_transform(df['hashtags'])

    tweet_features = sp.hstack((text_tokens, hashtags))  # NOT np.vstack
    # 5)
    if load:
        df['tweet_id'] = df["tweet_id"].map(hash)
        df['engaged_with_user_id'] = df["engaged_with_user_id"].map(hash)
        df['engaging_user_id'] = df["engaging_user_id"].map(hash)

        tweet_id = components["tweet_discretizer"].transform(
            df['tweet_id'].to_numpy().reshape(-1, 1))
        engaged_with_user_id = components[
            "engaged_with_user_discretizer"].transform(
                df['engaged_with_user_id'].to_numpy().reshape(-1, 1))
        engaging_user_id = components["engaging_user_discretizer"].transform(
            df['engaging_user_id'].to_numpy().reshape(-1, 1))
    else:
        df['tweet_id'] = df["tweet_id"].map(hash)
        df['engaged_with_user_id'] = df["engaged_with_user_id"].map(hash)
        df['engaging_user_id'] = df["engaging_user_id"].map(hash)

        tweet_discretizer = KBinsDiscretizer(n_bins=50)
        tweet_id = tweet_discretizer.fit_transform(
            df['tweet_id'].to_numpy().reshape(-1, 1))
        # fit a separate discretizer for each id column
        engaged_with_user_discretizer = KBinsDiscretizer(n_bins=50)
        engaged_with_user_id = engaged_with_user_discretizer.fit_transform(
            df['engaged_with_user_id'].to_numpy().reshape(-1, 1))
        engaging_user_discretizer = KBinsDiscretizer(n_bins=50)
        engaging_user_id = engaging_user_discretizer.fit_transform(
            df['engaging_user_id'].to_numpy().reshape(-1, 1))

    id_features = sp.hstack([tweet_id, engaged_with_user_id, engaging_user_id])

    # 6)
    engaged_with_user_is_verified = df["engaged_with_user_is_verified"].astype(
        int).to_numpy()
    engaging_user_is_verified = df["engaging_user_is_verified"].astype(
        int).to_numpy()
    engaged_follows_engaging = df["engaged_follows_engaging"].astype(
        int).to_numpy()
    boolean_features = np.column_stack([
        engaged_with_user_is_verified, engaging_user_is_verified,
        engaged_follows_engaging
    ])

    # 7)
    present_links = df["present_links"].notnull().astype(int).to_numpy()
    present_domains = df["present_domains"].notnull().astype(int).to_numpy()

    present_features = np.column_stack([present_links, present_domains])

    X_train = sp.hstack(
        [tweet_features, id_features, boolean_features, present_features])

    if not load:
        components = {
            "language_encoder": language_encoder,
            "tweet_type_encoder": tweet_type_encoder,
            "present_media_encoder": present_media_encoder,
            "text_tfidf": text_tfidf,
            "hashtags_tfidf": hashtags_tfidf,
            "tweet_discretizer": tweet_id,
            "engaged_with_user_discretizer": engaged_with_user_discretizer,
            "engaging_user_discretizer": engaging_user_discretizer
        }
        with open(pipeline_path, "wb") as file:
            pickle.dump(components, file)

    return X_train, response
Example No. 38
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
Example No. 39
# Preprocess data
#
##########################################

if norm_target == 1:
    #Target normalization for continuous values
    target_np = scale(target_np)

if norm_features == 1:
    #Feature normalization for continuous values
    data_np = scale(data_np)

if binning == 1:
    #Discretize Target variable with KBinsDiscretizer
    enc = KBinsDiscretizer(
        n_bins=[bin_cnt], encode='ordinal', strategy='quantile'
    )  # The strategy matters here: 'quantile' creates equal-frequency bins, while 'kmeans' would arguably give more meaningful "clusters"
    target_np_bin = enc.fit_transform(target_np.reshape(-1, 1))

    #Get Bin min/max
    temp = [[] for x in range(bin_cnt + 1)]
    for i in range(len(target_np)):
        for j in range(bin_cnt):
            if target_np_bin[i] == j:
                temp[j].append(target_np[i])

    for j in range(bin_cnt):
        print('Bin', j, ':', min(temp[j]), max(temp[j]), len(temp[j]))
    print('\n')

    #Convert Target array back to correct shape
Example No. 40
def transform_amounts(amounts: List[float], discretizer: KBinsDiscretizer) -> List[str]:
    amounts = discretizer.transform([[x] for x in amounts])
    # unpack and convert float -> int -> str
    amounts = list(map(str, (map(int, chain(*amounts)))))
    return amounts
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LogisticRegression(random_state=0)), {
                       'kbinsdiscretizer__n_bins': np.arange(2, 10),
                       'logisticregression__C': np.logspace(-2, 7, 10),
                   }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LinearSVC(random_state=0)), {
                       'kbinsdiscretizer__n_bins': np.arange(2, 10),
                       'linearsvc__C': np.logspace(-2, 7, 10),
                   }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
Example No. 42
def test_invalid_n_features():
    est = KBinsDiscretizer(n_bins=3).fit(X)
    bad_X = np.arange(25).reshape(5, -1)
    assert_raise_message(
        ValueError, "Incorrect number of features. Expecting 4, "
        "received 5", est.transform, bad_X)
Example No. 43
data_2 = data.copy()
 
from sklearn.preprocessing import Binarizer
X = data_2.iloc[:,0].values.reshape(-1,1)  # this class is for features only, so a one-dimensional array cannot be used
transformer = Binarizer(threshold=30).fit_transform(X)
 
data_2.iloc[:,0] = transformer
data_2.head()

# In[]:
# Binning and encoding done in one step:
from sklearn.preprocessing import KBinsDiscretizer
 
X = data.iloc[:,0].values.reshape(-1,1) 
# Ordinal encoding, equal-width bins
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')  # equal-width binning
res = est.fit_transform(X)
# Inspect the resulting bins: the column is now split into three bins
print(set(res.ravel()))
unique_label, counts_label = np.unique(res, return_counts=True)
print(counts_label/ len(res)) 

# Ordinal encoding, equal-frequency (equal-depth) bins
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')  # equal-frequency binning
res = est.fit_transform(X)
# Inspect the resulting bins: the column is now split into three bins
print(set(res.ravel()))
unique_label, counts_label = np.unique(res, return_counts=True)
print(counts_label/ len(res)) 

# one-hot encoding (the default)
Example No. 44
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

data = pd.read_csv(
    r"E:\python_workspace\python_scripts\data\feature\Narrativedata.csv",
    index_col=0)
data.loc[:, "Age"] = data.loc[:, "Age"].fillna(data.loc[:, "Age"].median())
data2 = data.copy()
data2.iloc[:, 0].fillna(0)
print(data2.iloc[:, 0])

X = data2.iloc[:, 0].values.reshape(-1, 1)

est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
res = est.fit_transform(X)
print(res)

est2 = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
print(est2.fit_transform(X).toarray())
Example No. 45
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")