def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def test_column_transformer_special_strings():

    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)
def test_column_transformer_callable_specifier():
    # assert that function gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
def test_column_transformer_negative_column_indexes():
    X = np.random.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder(categories='auto')

    tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
    tf_2 = ColumnTransformer([('ohe', ohe,  [2])], remainder='passthrough')
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))
def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df))
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)

    # specify to drop remaining columns
    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\' or \'passthrough\'",
        ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\' or \'passthrough\'",
        ct.fit_transform, X_array)
def test_column_transformer_empty_columns(pandas, column):
    # test case that ensures that the column transformer does also work when
    # a given transformer doesn't have any columns to work on
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    if pandas:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X_array, columns=['first', 'second'])
    else:
        X = X_array

    ct = ColumnTransformer([('trans1', Trans(), [0, 1]),
                            ('trans2', Trans(), column)])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[1][1], Trans)

    ct = ColumnTransformer([('trans1', Trans(), column),
                            ('trans2', Trans(), [0, 1])])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[0][1], Trans)

    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)

    fixture = np.array([[], [], []])
    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X), fixture)
    assert_array_equal(ct.fit(X).transform(X), fixture)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)
def test_column_transformer_sparse_threshold():
    X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    # apply threshold even if all sparse
    col_trans = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                   ('trans2', OneHotEncoder(), [1])],
                                  sparse_threshold=0.2)
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=False), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
def test_column_transformer_no_estimators():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())

    params = ct.get_params()
    assert params['remainder__with_mean']

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][2] == [0, 1, 2]
def test_column_transformer_drop_all_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    #  SparseMatrixTrans creates 3 features for each column, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
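# Note: the tests above use small helper transformers -- Trans, DoubleTrans and
# SparseMatrixTrans -- that are defined elsewhere in scikit-learn's test module and
# are not part of this excerpt. A minimal sketch of what they plausibly look like,
# inferred only from how the tests use them (Trans returns its input as a 2-D array,
# DoubleTrans doubles the values, SparseMatrixTrans returns a sparse identity matrix
# with one row per sample):
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator


class Trans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # pandas Series -> one-column DataFrame; 1-D array -> one-column 2-D array
        if hasattr(X, 'to_frame'):
            return X.to_frame()
        if getattr(X, 'ndim', 2) == 1:
            return np.atleast_2d(X).T
        return X


class DoubleTrans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return 2 * X


class SparseMatrixTrans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        n_samples = len(X)
        return sparse.eye(n_samples, n_samples).tocsr()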
Example #23
# %%
# imports needed by this snippet; `band_gap` (a DataFrame) and `band_gap_label`
# are assumed to be defined earlier in the original script
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

band_gap_num = band_gap.drop('Compound', axis=1)

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])
band_gap_tr = pipe.fit_transform(band_gap_num)
# %%
num_attribs = list(band_gap_num)
cat_attribs = ['Compound']

full_pipe = ColumnTransformer([('num', pipe, num_attribs),
                               ('cat', OrdinalEncoder(), cat_attribs)])
band_gap_prepared = full_pipe.fit_transform(band_gap)
# %%
# OrdinalEncoder().categories
# band_gap_prepared_df = pd.DataFrame(band_gap_prepared)
# band_gap_prepared_df.head(10)
# %%
lin_reg = LinearRegression()
lin_reg.fit(band_gap_prepared, band_gap_label)
# %%
band_gap_prediction = lin_reg.predict(band_gap_prepared)
zip_sample = zip(band_gap_prediction, band_gap_label)
for i, j in zip_sample:
    print(i, j)

bg_mse = mean_squared_error(band_gap_prediction, band_gap_label)
bg_rmse = np.sqrt(bg_mse)
Example #24
"""

import pandas as pd

base = pd.read_csv('census.csv')

previsores = base.iloc[:, 0:14].values
classe = base.iloc[:, 14].values

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

onehotencorder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(),
                                                  [1, 3, 5, 6, 7, 8, 9, 13])],
                                   remainder='passthrough')
previsores = onehotencorder.fit_transform(previsores).toarray()

labelencorder_classe = LabelEncoder()
classe = labelencorder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)

# import the library
from sklearn.neighbors import KNeighborsClassifier
classificador = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# Importing the libraries
import numpy as np
import pandas as pd

# Importing the dataset 
dataset = pd.read_csv('50_Startups.csv') 
X = dataset.iloc[ : , :-1].values
y = dataset.iloc[ : , -1].values

# Encoding Categorical Data 
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))

#avoiding dummy variable trap 
X = X[ : ,1:]
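# Note: the manual slice above (X = X[:, 1:]) drops one dummy column to avoid the
# dummy variable trap. A hedged alternative sketch, assuming scikit-learn >= 0.21
# where OneHotEncoder accepts drop='first', lets the encoder do this directly
# (the toy data below is illustrative only):
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X_toy = np.array([[10.0, 20.0, 30.0, 'New York'],
                  [11.0, 21.0, 31.0, 'California'],
                  [12.0, 22.0, 32.0, 'Florida']], dtype=object)

ct_drop_first = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [3])],
    remainder='passthrough')
X_toy_encoded = ct_drop_first.fit_transform(X_toy)  # no manual column slice needed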

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
Example #26
# convert texts to numbers
housing_cat = housing[['ocean_proximity']]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
ordinal_encoder.categories_

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    # ('attribs_adder', Combined)
    ('std_scaler', StandardScaler())
])

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)


from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
#labelEncoder_x = LabelEncoder()
#x[:, 0] = labelEncoder_x.fit_transform(x[:, 0])

# Create dummy variables for the countries
columnTransformer = ColumnTransformer([('lel', OneHotEncoder(), [0])],
                                      remainder='passthrough')
x = columnTransformer.fit_transform(x).astype(float)

# Encoding dependent variable
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
Example #28
# imports assumed for this snippet (not shown in the original excerpt)
import os
import pickle
import subprocess
import datetime
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


class train_lnphi:
    def __init__(self):
        # Force CPU
        #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        return

    # Read using pandas
    def load_lnphi_data(self, lnphi_path, datafile_name):
        csv_path = os.path.join(
            lnphi_path, datafile_name
        )  # previously data = "data_const_T_20200716-230921.csv"
        self.lnphi = pd.read_csv(
            csv_path,
            delimiter=',',
            names=['a_mix', 'b_mix', 'b_i', 'sum', 'lnphi'])
        print('Loading done. Shape: {}'.format(str(self.lnphi.shape)))

    # Drop out of range lnphi instances
    def lnphi_range(self, min, max):
        self.lnphi.drop(
            self.lnphi.loc[(self.lnphi.loc[:, 'lnphi'] < min) |
                           (self.lnphi.loc[:, 'lnphi'] > max)].index,
            inplace=True)
        print('Drop lnphi out of range done. Shape: {}'.format(
            str(self.lnphi.shape)))

    def split_data(self):
        self.X = self.lnphi.loc[:, 'a_mix':'sum']
        self.y = self.lnphi.loc[:, 'lnphi']

        # Split data -> (train_full, test)
        self.X_train_full, self.X_test, self.y_train_full, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42)

        # Split train_full -> (train, valid)
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            self.X_train_full,
            self.y_train_full,
            test_size=0.2,
            random_state=42)

        print('Splitting done.')

    def feature_eng(self):
        # Label Transform pipeline
        self.label_scaler = MinMaxScaler()
        self.label_num_pipeline = Pipeline([('label minmax scaler',
                                             self.label_scaler)])
        self.y_train_prepared = self.label_num_pipeline.fit_transform(
            self.y_train.values.reshape(-1, 1))
        self.y_valid_prepared = self.label_num_pipeline.transform(
            self.y_valid.values.reshape(-1, 1))
        self.y_test_prepared = self.label_num_pipeline.transform(
            self.y_test.values.reshape(-1, 1))

        # Attribute Transform pipeline

        self.attr_scaler = MinMaxScaler()
        num_pipeline = Pipeline([
            #('std scaler', self.attr_std_scaler)
            ('min_max_scaler', self.attr_scaler)
        ])
        num_attribs = list(self.X_train)
        self.full_pipeline = ColumnTransformer([('num', num_pipeline,
                                                 num_attribs)])

        self.X_train_prepared = self.full_pipeline.fit_transform(self.X_train)
        self.X_valid_prepared = self.full_pipeline.transform(self.X_valid)
        self.X_test = self.full_pipeline.transform(self.X_test)

        print('Feature Eng done.')

    def model_construct(self, n_layers, n_nodes):
        n_inputs = self.X_train_prepared.shape[1]

        self.model = tf.keras.Sequential()
        self.model.add(
            tf.keras.layers.Dense(
                n_nodes,
                activation=tf.keras.layers.LeakyReLU(alpha=0.1),
                input_shape=[n_inputs]))
        for _ in range(n_layers - 1):
            self.model.add(
                tf.keras.layers.Dense(
                    n_nodes, activation=tf.keras.layers.LeakyReLU(alpha=0.1)))
        self.model.add(tf.keras.layers.Dense(1))

        # Remove lr if scheduler in use?
        self.model.compile(loss='mse',
                           optimizer=keras.optimizers.Adam(),
                           metrics=[
                               'mse', 'mae',
                               tf.keras.metrics.MeanAbsolutePercentageError()
                           ])

    def train_model(self, batch_size, n_layers, n_nodes, epochs, initial_epoch,
                    log_save_dir, name_prefix):
        # Logs callback
        model_name = name_prefix + '_' + str(batch_size) + '_' + str(
            n_layers) + '_' + str(n_nodes) + '_' + str(epochs) + '_'
        try:
            logdir = self.logdir
        except AttributeError:
            print('New logdir created.')
            self.logdir = log_save_dir + ".\\logs\\scalars\\" + model_name + str(
                datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
            logdir = self.logdir
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        tensorboard_callback = keras.callbacks.TensorBoard(
            log_dir=logdir,
            histogram_freq=0,  # How often to log histogram visualizations
            write_graph=True,
            update_freq='epoch',
            profile_batch=
            0,  # set to 0. Else bug Tensorboard not show train loss.
            embeddings_freq=0,  # How often to log embedding visualizations
        )

        # Learning rate schedule as callback
        def scheduler(epoch):
            if epoch < 10:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.5 * (10 - epoch))
            '''if 0.001 * tf.math.exp(0.1 * (10 - epoch)) < 1E-5:
                return 1E-5
            else:
                return 0.001 * tf.math.exp(0.1 * (10 - epoch))'''

        #lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

        #reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=5, min_lr=0.0001)

        # Early stop
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='mse',
                                                      min_delta=0.001,
                                                      patience=3)
        #todo maybe make proportional early stopping

        # Callback save
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=logdir,  # +'.\\{epoch:.02d}-{mse:.2f}',
            verbose=1,
            save_weights_only=False,
            monitor='mse',  # Not sure
            mode='auto',
            save_best_only=True)

        # Store version info as file in directory
        def get_git_revision_hash():
            return subprocess.check_output(['git', 'rev-parse', 'HEAD'])

        with open(logdir + '.\\version_info.txt', 'a', newline='') as file:
            file.write('model_name' + ' ' + str(get_git_revision_hash()) +
                       '\n')

        # Store attributes from data transformation
        # Delete previous file if exists
        try:
            os.remove(logdir + '.\\full_pipeline_' + model_name + '.pkl')
        except OSError:
            pass
        with open(logdir + '.\\full_pipeline_' + model_name + '.pkl',
                  'wb') as f:
            pickle.dump(self.full_pipeline, f)
            pickle.dump(self.label_num_pipeline, f)

        # "history" object holds a record of the loss values and metric values during training
        history = self.model.fit(
            self.X_train_prepared,
            self.y_train_prepared,
            initial_epoch=initial_epoch,
            epochs=epochs,
            callbacks=[tensorboard_callback, model_checkpoint_callback],
            validation_data=(self.X_valid_prepared, self.y_valid_prepared),
            shuffle=True,
            batch_size=batch_size,
            verbose=2)

        # Save entire model with training config
        self.model.save(logdir + '.\\' + model_name + '{}'.format(
            str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))))

        endTime = datetime.datetime.now()
        print('Ended at ' + str(endTime))
        print('end')
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based

        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),

        # int keys: positional

        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),

        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through

    class TransAssert(BaseEstimator):

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #30
imputer.fit(x[:, 1:3])  # compute the missing values for all rows of the 1st and 2nd columns
x[:, 1:3] = imputer.transform(x[:, 1:3])  # replace those columns with the imputer's version
# print(x)
# print("".join(['-' for i in range(40)]))

# Transform and Encode Categorical Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# use OneHotEncoder to transform the 0th column, keep the others unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))  # transform the x and convert it to np array
# print(x)
# print("".join(['-' for i in range(40)]))

# Transform and Encode The Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)
# print("".join(['-' for i in range(40)]))

# Split dataset into Training and Testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)  # split 80/20 and seed random with 1
print(X_train)
Example #31

# Encoding categorical data

# Categorical variable for country
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

# Categorical variable for gender
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])


# Since country is not an ordinal variable, we need to create three dummy variables
ct = ColumnTransformer([("Geography", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)

# We need to remove one dummy variable to avoid the dummy variable trap
X = X[:,1:]


# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)



"""

Applying XGBoost

"""
###############################################################################

###############################################################################
# We will perform a 10-fold cross-validation and train the neural network with
# the two different strategies previously presented.
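# fit_predict_imbalanced_model / fit_predict_balanced_model are used in the loop
# below but are not defined in this excerpt. A purely illustrative, hypothetical
# sketch of the interface the loop expects (each helper fits a model, scores it,
# and returns the elapsed time together with the ROC AUC); the stand-in
# LogisticRegression is an assumption, not the neural network of the original example:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    start = time.time()
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    y_score = clf.predict_proba(X_test)[:, 1]
    return time.time() - start, roc_auc_score(y_test, y_score)


def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    start = time.time()
    clf = LogisticRegression(max_iter=1000,
                             class_weight='balanced').fit(X_train, y_train)
    y_score = clf.predict_proba(X_test)[:, 1]
    return time.time() - start, roc_auc_score(y_test, y_score)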

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    elapsed_time, roc_auc = fit_predict_imbalanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_imbalanced.append(elapsed_time)
    cv_results_imbalanced.append(roc_auc)

    elapsed_time, roc_auc = fit_predict_balanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_balanced.append(elapsed_time)
    cv_results_balanced.append(roc_auc)

###############################################################################
Example #33
## Encoding categorical data
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# LabelEncoder is not required anymore; ColumnTransformer can be used instead.
# It can still be used to identify the categories.
#labelEncoder_Country = LabelEncoder()
#x[:, 0] = labelEncoder_Country.fit_transform(x[:, 0])

# there are multiple categories, so we need to use ColumnTransformer

columnTransformer = ColumnTransformer(transformers=[
    ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
],
                                      remainder='passthrough')
x = columnTransformer.fit_transform(x)

labelEncoder_Purchased = LabelEncoder()
y = labelEncoder_Purchased.fit_transform(y)

## Split dataset to train and test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

## Feature scaling
# There are two ways Standardisation and Normalisation
from sklearn.preprocessing import StandardScaler
Example #34
df


df=df.iloc[:,:].values
df

m_status=LabelEncoder()
df[:,2]=m_status.fit_transform(df[:,2])
df

df=pd.DataFrame(df)
df

ct= ColumnTransformer(transformers=[('encode',OneHotEncoder(),[2])],remainder='passthrough')

df=ct.fit_transform(df)
df

df=pd.DataFrame(df)
df
#pd.get_dummies(df)


#B Rename all columns as df

df = df.rename(columns={0: 'df1', 1: 'df2', 2: 'df3', 3: 'df4', 4: 'df5', 5: 'df6',
                        6: 'df7', 7: 'df8', 8: 'df9', 9: 'df10', 10: 'df11', 11: 'df12',
                        12: 'df13', 13: 'df14', 14: 'df15', 15: 'df16', 16: 'df17',
                        17: 'df18', 18: 'df19'})

df
Example #35
"""
Created on Sat Aug 22 20:52:32 2020

@author: renan
"""
import pandas as pd

base = pd.read_csv('census.csv')

previsores = base.iloc[:, 0:14].values
classe = base.iloc[:, 14].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

column_tranformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = column_tranformer.fit_transform(previsores).toarray()

labelencoder_classe = LabelEncoder()
classe = labelencoder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)
Example #36
dataset_away = pd.read_csv('result_data_A.csv',
                           header=None,
                           encoding="shift-jis")
# model that predicts how many goals the away club will score against each opposing club
X_away = dataset_away.iloc[:, [0, 5, 6]].values
Y_away = dataset_away.iloc[:, 7:8].values

print("data import clear")

# class for transforming column data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

cd = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [1, 2])],
                       remainder="passthrough")
X_home = (cd.fit_transform(X_home)).toarray()
X_away = (cd.fit_transform(X_away)).toarray()

print("encoding clear")

# random forest is imported from ensemble
from sklearn.ensemble import RandomForestRegressor
# n_estimators specifies how many trees (models) to use
regressor_home = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_away = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_home.fit(X_home, Y_home)
regressor_away.fit(X_away, Y_away)

print("learning clear")

# the match-up we want to predict
Example #37
data.drop('bool_of_active', axis=1, inplace=True)
data

# In[9]:

data.drop('step_count', axis=1, inplace=True)
data

# In[10]:

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [("mood", OneHotEncoder(), [0])], remainder="passthrough"
)  # The last arg ([0]) is the list of columns you want to transform in this step
x = ct.fit_transform(data)
x

# In[11]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

# In[12]:

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
model = nb.fit(X_train, y_train)
Example #38
# convert categorical columns into numerical columns:
features = pd.get_dummies(features)

# Split your data into training set and test sets:
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.20, random_state=23)

# standardize/normalize  numerical features:
numerical_features = features.select_dtypes(include=['float64', 'int64'])
numerical_columns = numerical_features.columns
ct = ColumnTransformer([("only numeric", StandardScaler(), numerical_columns)],
                       remainder='passthrough')

# Fit your instance ct of ColumnTransformer to the training data and at the same time transform it by using the ColumnTransformer.fit_transform() method. Assign the result to a variable called features_train_scaled:
features_train_scaled = ct.fit_transform(features_train)

# Transform your test data instance features_test using the trained ColumnTransformer instance ct. Assign the result to a variable called features_test_scaled:
features_test_scaled = ct.transform(features_test)

# Create an instance of my_model:
my_model = Sequential()

# Create the input layer
input = InputLayer(input_shape=(features.shape[1], ))

# Add the input layer:
my_model.add(input)

# Add one hidden layer:
my_model.add(Dense(64, activation="relu"))
Example #39
admissionData = admissionData.drop(["Serial No."], axis=1)
labels = admissionData.iloc[:, -1]

# remove uni rating and TOEFL score - unethical?
# remove serial no. and research - irrelevant info
features = admissionData.iloc[:, [0, 3, 4, 5, 6]]

# split dataset into train and test
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# scale/normalise dataset features
ct = ColumnTransformer([("normalize", Normalizer(), [0, 1, 2, 3])],
                       remainder='passthrough')
features_train = ct.fit_transform(features_train)
features_test = ct.transform(features_test)

learning_rate = 0.001
num_epochs = 20

# create neural network

#  admissionsModel = build_model(features_train, learning_rate)  # rewrite this function
#  admissionsModel.fit(features_train, labels_train, epochs=20, batch_size=1, verbose=1)
history1 = fit_model(build_model(features_train, learning_rate),
                     features_train, labels_train, learning_rate, num_epochs)

#  need to return the fitted model into a graph somehow here

plt.savefig('perf_graph.png')
Example #40
labelEncoder_previsores = LabelEncoder()
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# One Hot Encoder
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

# Y
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

# Data scaling
##### Partial scaling #####
# scalerCols = previsores[:, 102:]
# scaler = StandardScaler()
# previsores[:, 102:] = scaler.fit_transform(scalerCols)
##### Full scaling #####
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Data split
Example #41
# Handling missing data

from sklearn.impute import SimpleImputer
imputer =  SimpleImputer(missing_values = np.nan,strategy = 'mean' )
imputer = imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])

# category encoder

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

labelencoder_x = LabelEncoder()
x[:,0] = labelencoder_x.fit_transform(x[:,0])
transform = ColumnTransformer([("Country", OneHotEncoder(), [0])], remainder="passthrough")
x = transform.fit_transform(x)

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# split the dataset into train and test dataset
# random_state makes the result reproducible (the same as others') if set to the same value
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

# Feature Scaling
# here we apply feature scaling to the whole training and test sets, including the non-numeric features
# for x_test we don't need to fit again because the scaler is already fitted
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
)

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("passthrough_numeric", "passthrough",
            ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer,
            ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurance companies are interested in modeling the Pure Premium, that is
# the expected total claim amount per unit of exposure for each policyholder
# in their portfolio:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step modeling: the product of the
# Frequency times the average claim amount per claim:
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

with pd.option_context("display.max_columns", 15):
    print(df[df.ClaimAmount > 0].head())

# %%
        if cat_attr[i] == True:
            print(feature_nam[i])
            val = feature_nam[i]
            features_1[val].fillna(features_1[val].value_counts().index[0], inplace=True)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

preprocess = ColumnTransformer([
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(), cat_attr),
])

features_prepared = preprocess.fit_transform(features_1)
features_prepared_2 = preprocess.fit_transform(features_2)

# Set up train and test arrays
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_prepared, target_1, random_state=0)


#=====================================================================================================#
# Prediction models
print(); print('=============== Prediction Models ===============')
nam_model = []
type_model = []

#========== SVC ==========#
from sklearn.svm import SVC
Example #44
def noprep(dataset,
           dirt,
           numeric_features,
           categorical_features,
           delim=',',
           indexdrop=False):
    index_features = ['_dmIndex_', '_PartInd_']
    data = pd.read_csv(dirt + dataset + '.csv',
                       delimiter=delim)  # panda.DataFrame
    print(data.columns)
    data = data.astype({'_dmIndex_': 'int', '_PartInd_': 'int'})

    numeric_features = list(
        set(data.select_dtypes(include=["number"])) - set(index_features) -
        set(['income_flag']))
    categorical_features = list(
        set(data.select_dtypes(exclude=["number"])) - set(['income_flag']))
    index_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-1))])
    #y_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant',fill_value=-1)),\
    #                                ('orden', OrdinalEncoder())])
    numeric_transformer = Pipeline(steps=[('imputer',
                                           SimpleImputer(strategy='median'))])

    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\
        ('onehot', OneHotEncoder(sparse=False))])

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),\
         ('cat', categorical_transformer, categorical_features),('index',index_transformer, index_features)])

    data["income_flag"] = data["income_flag"].astype('category')
    data["income_flag"] = data["income_flag"].cat.codes

    #    data["income_flag"]=data.where(data["income_flag"]==,0)
    #    data["income_flag"]=data.where(data["income_flag"]==4,1)

    data = preprocessor.fit_transform(data)
    data = pd.DataFrame(data)
    col = data.columns.values
    print(col)
    X = data.drop(col[-3:], axis=1)
    X_train = data[data[col[-1]] > 0].drop(
        col[-3:], axis=1)  #pd.DataFrame(X).to_csv('X_vanilla.csv')
    X_test = data[data[col[-1]] == 0].drop(
        col[-3:], axis=1)  #pd.DataFrame(X).to_csv('X_vanilla.csv')
    print(data.shape)

    ####################################################################
    #y= data["y"]
    #lb = preprocessing.LabelBinarizer()
    #y= lb.fit_transform(y)
    y = data[col[-3]]
    y_train = data[data[col[-1]] > 0][col[-3]]
    y_test = data[data[col[-1]] == 0][col[-3]]
    ##########################################################
    ##################################################################
    feat_type = []  #dict()
    xcol = X.columns.values
    for cl in xcol:
        if cl in categorical_features:
            feat_type.append(1)
        else:
            feat_type.append(0)

#    X_train_auto, X_test_auto, y_train_auto, y_test_auto = \
#      sklearn.model_selection.train_test_split(X, y,test_size=0.2, random_state=1)
    return data, X, y, X_train, y_train, X_test, y_test, feat_type
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy ='mean')

#imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis =0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#Encoding categorical values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])

columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])],     
                                      remainder='passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype=str)  # np.str was removed from NumPy; use the built-in str

LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)
# columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],     
#                                       remainder='passthrough')
# y=np.array(columnTransformer.fit_transform(y),dtype=np.str)

# Splitting the dataset into training and test sets (train size + test size = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
#convert gender and country to number data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

label_encoder_x_1 = LabelEncoder()
X[:, 2] = label_encoder_x_1.fit_transform(X[:,2])
transformer = ColumnTransformer(
    transformers=[
        ("OneHot",        # Just a name
         OneHotEncoder(), # The transformer class
         [1]              # The column(s) to be applied on.
         )
    ],
    remainder='passthrough' # donot apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())

X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
Sscale = StandardScaler()
X_train = Sscale.fit_transform(X_train)
X_test = Sscale.transform(X_test)

#importing keras
from keras.models import Sequential
Example #47
    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'
]]
# One-hot encoding to convert categorical features into vectors
x = pd.get_dummies(x)

X_train, X_test, Y_train, Y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

ct = ColumnTransformer([('numeric', StandardScaler(), [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time'
])],
                       remainder='passthrough')  # non-numeric (already one-hot) columns are passed through unscaled
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Label encoding for categorical outcome
le = LabelEncoder()
Y_train = le.fit_transform(Y_train.astype(str))
Y_test = le.transform(Y_test.astype(str))
# Convert labels into categorical type
Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)

# Model
model = Sequential()
model.add(InputLayer(input_shape=(X_train.shape[1], )))
model.add(Dense(12, activation='relu'))
model.add(
Example #48
# ### Encoding categorical variables

# In[ ]:


# Encoding independent variables
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

import numpy as np                                   # needed for np.array below
from sklearn.compose import ColumnTransformer       # import class
from sklearn.preprocessing import OneHotEncoder     # import class

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop = 'first'), [0])], remainder='passthrough')    
#create object

X = np.array(ct.fit_transform(X))                   # fit object to data and transform data

print(X)


# In[ ]:


# Encoding dependent variable because this is a classification problem. The dependent variable is categorical.
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

from sklearn.preprocessing import LabelEncoder   #import class
le = LabelEncoder()                              #create object
y = le.fit_transform(y)                          #fit and transform

print(y)
    def __init__(self, explanation, model, dataset, true_y, classes, features,
                 locale, categorical_features, true_y_dataset):
        """Initialize the Error Analysis Dashboard Input.

        :param explanation: An object that represents an explanation.
        :type explanation: ExplanationMixin
        :param model: An object that represents a model.
        It is assumed that for the classification case
            it has a method of predict_proba() returning
            the prediction probabilities for each
            class and for the regression case a method of predict()
            returning the prediction value.
        :type model: object
        :param dataset: A matrix of feature vector examples
        (# examples x # features), the same samples
            used to build the explanation.
            Will overwrite any set on explanation object already.
            Must have fewer than
            10000 rows and fewer than 1000 columns.
        :type dataset: numpy.array or list[][] or pandas.DataFrame
        :param true_y: The true labels for the provided explanation.
            Will overwrite any set on explanation object already.
        :type true_y: numpy.array or list[]
        :param classes: The class names.
        :type classes: numpy.array or list[]
        :param features: Feature names.
        :type features: numpy.array or list[]
            :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param true_y_dataset: The true labels for the provided dataset.
        Only needed if the explanation has a sample of instances from the
        original dataset.  Otherwise specify true_y parameter only.
        :type true_y_dataset: numpy.array or list[]
        """
        self._model = model
        original_dataset = dataset
        if isinstance(dataset, pd.DataFrame):
            self._dataset = dataset.to_json()
        else:
            self._dataset = dataset
        if true_y_dataset is None:
            self._true_y = true_y
        else:
            self._true_y = true_y_dataset
        self._categorical_features = categorical_features
        self._categories = []
        self._categorical_indexes = []
        self._is_classifier = model is not None\
            and hasattr(model, SKLearn.PREDICT_PROBA) and \
            model.predict_proba is not None
        self._dataframeColumns = None
        self.dashboard_input = {}
        # List of explanations, key of explanation type is "explanation_type"
        self._mli_explanations = explanation.data(-1)["mli"]
        local_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_LOCAL_EXPLANATION_KEY)
        global_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_GLOBAL_EXPLANATION_KEY)
        ebm_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EBM_GLOBAL_EXPLANATION_KEY)
        dataset_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EXPLANATION_DATASET_KEY)

        if hasattr(explanation, 'method'):
            self.dashboard_input[ExplanationDashboardInterface.
                                 EXPLANATION_METHOD] = explanation.method

        predicted_y = None
        feature_length = None
        if dataset_explanation is not None:
            if dataset is None or len(dataset) != len(true_y):
                dataset = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_X_KEY]
            if true_y is None:
                true_y = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_Y_KEY]
        elif len(dataset) != len(true_y):
            dataset = explanation._eval_data

        if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
            self._dataframeColumns = dataset.columns
        try:
            list_dataset = self._convert_to_list(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported dataset type, inner error: {}".format(ex_str))
        if dataset is not None and model is not None:
            try:
                predicted_y = model.predict(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                msg = "Model does not support predict method for given"
                "dataset type, inner error: {}".format(ex_str)
                raise ValueError(msg)
            try:
                predicted_y = self._convert_to_list(predicted_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model prediction output of unsupported type,"
                                 "inner error: {}".format(ex_str))
        if predicted_y is not None:
            self.dashboard_input[
                ExplanationDashboardInterface.PREDICTED_Y] = predicted_y
        row_length = 0
        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 "for visualization (100000)")
            if feature_length > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate the"
                                 " explanation using fewer features or"
                                 " initialize the dashboard without passing a"
                                 " dataset.")
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRAINING_DATA] = _serialize_json_safe(
                                     list_dataset)
            self.dashboard_input[ExplanationDashboardInterface.
                                 IS_CLASSIFIER] = self._is_classifier

        local_dim = None

        if true_y is not None and len(true_y) == row_length:
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRUE_Y] = self._convert_to_list(true_y)

        if local_explanation is not None:
            try:
                local_explanation["scores"] = self._convert_to_list(
                    local_explanation["scores"])
                if np.shape(local_explanation["scores"])[-1] > 1000:
                    raise ValueError("Exceeds maximum number of features for "
                                     "visualization (1000). Please regenerate"
                                     " the explanation using fewer features.")
                local_explanation["intercept"] = self._convert_to_list(
                    local_explanation["intercept"])
                # We can ignore perf explanation data.
                # Note if it is added back at any point,
                # the numpy values will need to be converted to python,
                # otherwise serialization fails.
                local_explanation["perf"] = None
                self.dashboard_input[ExplanationDashboardInterface.
                                     LOCAL_EXPLANATIONS] = local_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported local explanation type,"
                                 "inner error: {}".format(ex_str))
            if list_dataset is not None:
                local_dim = np.shape(local_explanation["scores"])
                if len(local_dim) != 2 and len(local_dim) != 3:
                    raise ValueError(
                        "Local explanation expected to be a 2D or 3D list")
                if len(local_dim) == 2 and (local_dim[1] != feature_length
                                            or local_dim[0] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     "length differs from dataset")
                if len(local_dim) == 3 and (local_dim[2] != feature_length
                                            or local_dim[1] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     " length differs from dataset")
        if local_explanation is None and global_explanation is not None:
            try:
                global_explanation["scores"] = self._convert_to_list(
                    global_explanation["scores"])
                if 'intercept' in global_explanation:
                    global_explanation["intercept"] = self._convert_to_list(
                        global_explanation["intercept"])
                self.dashboard_input[ExplanationDashboardInterface.
                                     GLOBAL_EXPLANATION] = global_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported global explanation type,"
                                 "inner error: {}".format(ex_str))
        if ebm_explanation is not None:
            try:
                self.dashboard_input[ExplanationDashboardInterface.
                                     EBM_EXPLANATION] = ebm_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Unsupported ebm explanation type: {}".format(ex_str))

        if features is None and hasattr(explanation, 'features')\
                and explanation.features is not None:
            features = explanation.features
        if features is not None:
            features = self._convert_to_list(features)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            self.dashboard_input[FEATURE_NAMES] = features
        if classes is None and hasattr(explanation, 'classes')\
                and explanation.classes is not None:
            classes = explanation.classes
        if classes is not None:
            classes = self._convert_to_list(classes)
            if local_dim is not None and len(classes) != local_dim[0]:
                raise ValueError("Class vector length mismatch:"
                                 "class names length differs from"
                                 "local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.CLASS_NAMES] = classes
        if model is not None and hasattr(model, SKLearn.PREDICT_PROBA) \
                and model.predict_proba is not None and dataset is not None:
            try:
                probability_y = model.predict_proba(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,"
                                 " inner error: {}".format(ex_str))
            try:
                probability_y = self._convert_to_list(probability_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Model predict_proba output of unsupported type,"
                    "inner error: {}".format(ex_str))
            self.dashboard_input[
                ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
        if locale is not None:
            self.dashboard_input[ExplanationDashboardInterface.LOCALE] = locale
        if self._categorical_features:
            category_dictionary = {}
            features = self.dashboard_input[FEATURE_NAMES]
            self._categorical_indexes = [
                features.index(feature)
                for feature in self._categorical_features
            ]
            from sklearn.compose import ColumnTransformer
            from sklearn.preprocessing import OrdinalEncoder
            ordinal_enc = OrdinalEncoder()
            ct = ColumnTransformer(
                [('ord', ordinal_enc, self._categorical_indexes)],
                remainder='drop')
            self.string_ind_data = ct.fit_transform(original_dataset)
            transformer_categories = ct.transformers_[0][1].categories_
            for category_arr, category_index in zip(transformer_categories,
                                                    self._categorical_indexes):
                category_values = category_arr.tolist()
                self._categories.append(category_values)
                category_dictionary[category_index] = category_values
            self.dashboard_input[ExplanationDashboardInterface.
                                 CATEGORICAL_MAP] = category_dictionary
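
        # Illustrative sketch (not part of this class): the fitted encoder's
        # categories above are read back through ColumnTransformer, e.g.
        #   enc = OrdinalEncoder()
        #   ct = ColumnTransformer([('ord', enc, [0])], remainder='drop')
        #   ct.fit_transform([['red'], ['blue'], ['red']])
        #   ct.transformers_[0][1].categories_  # -> [array(['blue', 'red'], dtype=object)]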
Example #50
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

data = pd.read_csv('Dataset/50_Startups.csv')  #set the path accordingly

x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_pred), 1)),
        axis=1))
Example #51
def prepare_data(train_df_raw, test_df_raw, data_prep_dict):
    '''
    
        Function to process raw data into required modelling data
        
        Inputs:
            1. train_df_raw - Dataframe
            2. test_df_raw  - Dataframe
            3. data_prep_dict - Dictionary
        
        Outputs:
            1. train_df_processed - Dataframe
            2. test_df_processed - Dataframe
    '''

    #quick check to apply data processing on both train and test combined
    #train_df_raw = pd.concat([train_df_raw,test_df_raw],axis = 0)

    #work around SimpleImputer errors by manually filling missing values
    train_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    test_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    train_df_raw.fillna('missing', inplace=True)
    test_df_raw.fillna('missing', inplace=True)

    #modify data values to convert categorical raw attributes into potential numeric features

    train_df_raw.replace({'14+': '14'}, inplace=True)
    train_df_raw['Holding_Policy_Duration'] = train_df_raw[
        'Holding_Policy_Duration'].astype(float)
    test_df_raw.replace({'14+': '14'}, inplace=True)
    test_df_raw['Holding_Policy_Duration'] = test_df_raw[
        'Holding_Policy_Duration'].astype(float)

    #freeze data types
    train_df_raw[data_prep_dict['one_hot_encode']] = train_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)
    test_df_raw[data_prep_dict['one_hot_encode']] = test_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)

    #target encode required attributes
    for target_encode_col in data_prep_dict['target_encode']:
        encoding_dict = train_df_raw.groupby(
            target_encode_col)[TARGET].mean().to_dict()
        train_df_raw[target_encode_col] = train_df_raw[target_encode_col].map(
            encoding_dict)
        test_df_raw[target_encode_col] = test_df_raw[target_encode_col].map(
            encoding_dict)

    #fill missing Region Codes
    #city_code_means = train_df_raw.groupby(['City_Code'])[TARGET].mean().reset_index()
    #test_df_raw['Region_Code'] = test_df_raw.apply(
    #lambda row: city_code_means[TARGET][city_code_means.City_Code ==
    #                                    row['City_Code']].values[0]
    #                                if row['Region_Code'] not in train_df_raw['Region_Code'].unique() else row['Region_Code'],
    #                            axis=1
    #                        )

    #define set of transformation steps per raw attribute present in the data

    column_transformer_1 = ColumnTransformer(
        [('one_hot_encode', OneHotEncoder(sparse=False, drop='if_binary'),
          data_prep_dict['one_hot_encode'])],
        remainder='passthrough',
        verbose=True)

    #build and fit the column transformer on train data
    train_df_processed = column_transformer_1.fit_transform(train_df_raw)
    #apply the column transformer on test data
    test_df_processed = column_transformer_1.transform(test_df_raw)

    #convert numpy arrays into pandas dataframe for further analysis
    train_df_processed_1 = pd.DataFrame(
        train_df_processed, columns=column_transformer_1.get_feature_names())
    test_df_processed_1 = pd.DataFrame(
        test_df_processed, columns=column_transformer_1.get_feature_names())

    column_transformer_2 = ColumnTransformer([('passthrough', 'passthrough', [
        col for col in train_df_processed_1.columns
        if col not in data_prep_dict['standard_scale']
    ]), ('standard_scale', StandardScaler(), data_prep_dict['standard_scale'])
                                              ],
                                             remainder='passthrough',
                                             verbose=True)

    #build and fit the column transformer on train data
    train_df_processed_2 = column_transformer_2.fit_transform(
        train_df_processed_1)
    #apply the column transformer on test data
    test_df_processed_2 = column_transformer_2.transform(test_df_processed_1)

    #recreate column names in the correct order, to understand feature importances
    train_df_processed_out = pd.DataFrame(
        train_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])
    test_df_processed_out = pd.DataFrame(
        test_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])

    #progress logger
    print('Data preparation completed, returning processed data')

    return train_df_processed_out, test_df_processed_out
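
# Usage sketch for prepare_data() above (illustrative only). TARGET is assumed
# to be a module-level constant naming the label column, and the column names
# below are hypothetical except 'Holding_Policy_Duration', which the function
# itself references.
# data_prep_dict = {
#     'one_hot_encode': ['Accomodation_Type', 'Health_Indicator'],  # hypothetical
#     'target_encode': ['City_Code'],                               # hypothetical
#     'standard_scale': ['Holding_Policy_Duration'],
# }
# train_out, test_out = prepare_data(train_df_raw, test_df_raw, data_prep_dict)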
Example #52
class DogeDataLoader:
    def __init__(self,
                 filename,
                 categorical_cols,
                 target_col,
                 seq_length,
                 batch_size,
                 preprocessor=True,
                 prediction_window=1):
        '''
        :param filename: path to the csv dataset
        :param categorical_cols: name of the categorical columns, if None pass empty list
        :param target_col: name of the targeted column
        :param seq_length: window length to use
        :param prediction_window: window length to predict
        :param preprocessor: if normalize data or not
        :param batch_size: batch size
        '''
        self.data = self.read_and_preprocess(filename)
        self.categorical_cols = categorical_cols
        self.numerical_cols = list(
            set(self.data.columns) - set(categorical_cols) - {target_col})  # target_col is a single column name
        self.target_col = target_col
        self.seq_length = seq_length
        self.prediction_window = prediction_window
        self.batch_size = batch_size
        self.preprocessor = preprocessor
        self.preprocess = ColumnTransformer(
            [
                ("scaler", StandardScaler(), self.numerical_cols),
                #("encoder", OneHotEncoder(), self.categorical_cols)
            ],
            remainder="passthrough")

    def read_and_preprocess(self, filename):
        # Reading
        df = pd.read_csv(filename)
        # Reorder and resetting index
        df = df[::-1].reset_index(drop=True)
        # Preprocessing 'Change' column
        df['Change %'] = df['Change %'].str.replace("%", "")
        df['Change %'] = pd.to_numeric(df['Change %'].str.replace(",", ""))
        # Preprocessing 'Vol.' column
        vols = [el for el in df['Vol.']]
        for num, el in enumerate(vols):
            # Check if is billion
            isB = el[-1] == 'B'
            try:
                el = float(el[:-1])
            except ValueError:
                print("Value Error at row ", num)
                el = vols[num - 1]
            if isB:
                el = el * 1000
            vols[num] = el
        df['Vol.'] = vols
        # Dropping Date column
        df.pop('Date')
        # Done, returning dataframe
        return df

    def preprocess_data(self):
        '''
        Preprocessing function
        '''
        X = self.data.drop(self.target_col, axis=1)
        y = self.data[self.target_col]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.8,
                                                            shuffle=False)
        if self.preprocessor:
            X_train = self.preprocess.fit_transform(X_train)
            # fit only on the training split; use transform() on the test
            # split so test statistics do not leak into the scaler
            X_test = self.preprocess.transform(X_test)

        if self.target_col:
            return X_train, X_test, y_train.values, y_test.values
        return X_train, X_test

    def frame_series(self, X, y=None):
        '''
        Function used to prepare the data for time series prediction
        :param X: set of features
        :param y: targeted value to predict
        :return: TensorDataset
        '''
        nb_obs, nb_features = X.shape
        features, target, y_hist = [], [], []

        for i in range(1, nb_obs - self.seq_length - self.prediction_window):
            features.append(
                torch.FloatTensor(X[i:i + self.seq_length, :]).unsqueeze(0))

        features_var = torch.cat(features)

        if y is not None:
            for i in range(1,
                           nb_obs - self.seq_length - self.prediction_window):
                target.append(
                    torch.tensor(y[i + self.seq_length:i + self.seq_length +
                                   self.prediction_window]))
            target_var = torch.cat(target)
            return TensorDataset(features_var, target_var)
        return TensorDataset(features_var)

    def get_loaders(self, ):
        '''
        Preprocess and frame the dataset
        :return: DataLoaders associated to training and testing data
        '''

        X_train, X_test, y_train, y_test = self.preprocess_data()

        train_dataset = self.frame_series(X_train, y_train)
        test_dataset = self.frame_series(X_test, y_test)

        train_iter = DataLoader(train_dataset,
                                batch_size=self.batch_size,
                                shuffle=False,
                                drop_last=True)
        test_iter = DataLoader(test_dataset,
                               batch_size=self.batch_size,
                               shuffle=False,
                               drop_last=True)
        return train_iter, test_iter
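
# Usage sketch for DogeDataLoader above (illustrative only). The CSV path and
# the 'Price' target column name are hypothetical; the file is expected to
# contain 'Date', 'Vol.' and 'Change %' columns as assumed by
# read_and_preprocess().
# loader = DogeDataLoader(filename='doge_usd_history.csv',
#                         categorical_cols=[],
#                         target_col='Price',
#                         seq_length=30,
#                         batch_size=64,
#                         preprocessor=True,
#                         prediction_window=1)
# train_iter, test_iter = loader.get_loaders()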
Example #53
import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

#encoding the categorical column
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('state', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype='float')

# avoiding dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
Example #54
#Part 1 Data Preprocessing
import numpy as np
import pandas as pd

# importing the dataset
dataset = pd.read_csv("Churn_Modelling.csv")
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

#Encoding independent variables
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
country=LabelEncoder()
gender=LabelEncoder()
X[:,1]=country.fit_transform(X[:,1])
X[:,2]=gender.fit_transform(X[:,2])

from sklearn.compose import ColumnTransformer
transformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(transformer.fit_transform(X), dtype=float)  # np.float was removed from NumPy; use the built-in float
X = X[:, 1:]
#splitting the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

#Part 2 Building ANN
import keras
from keras.layers import Dense
from keras.models import Sequential