def test_column_transformer_callable_specifier():
    # assert that function gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)
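
# Aside (not part of the test): the same callable specifier works in user code.
# A minimal sketch that selects the float columns of whatever frame is passed
# in (hypothetical data; StandardScaler as a stand-in transformer):
def select_float_columns(X):
    return [col for col in X.columns if X[col].dtype == float]

ct_sketch = ColumnTransformer([('scale', StandardScaler(), select_float_columns)],
                              remainder='drop')
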
def test_column_transformer_special_strings():

    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)])
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert_true(sparse.issparse(X_trans))
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
def test_2D_transformer_output():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))
def test_2D_transformer_output_pandas():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['col1', 'col2'])

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')])
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.fit_transform, X_df)
    ct.fit(X_df)
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.transform, X_df)
def test_column_transformer_named_estimators():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(with_std=False), [1])])
    assert_false(hasattr(ct, 'transformers_'))
    ct.fit(X_array)
    assert_true(hasattr(ct, 'transformers_'))
    assert_true(isinstance(ct.named_transformers_['trans1'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans1, StandardScaler))
    assert_true(isinstance(ct.named_transformers_['trans2'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans2, StandardScaler))
    assert_false(ct.named_transformers_.trans2.with_std)
    # check that they are fitted transformers
    assert_equal(ct.named_transformers_.trans1.mean_, 1.)
def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
def test_2D_transformer_output():

    class TransNo2D(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
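
# What flips the two outcomes above is sparse_threshold: after stacking, the
# result is kept sparse only while its overall density (nonzeros / total
# entries) stays below the threshold. A minimal sketch of the rule, reusing the
# SparseMatrixTrans helper from the test above:
X_demo = np.array([[0, 1, 2], [2, 4, 6]]).T
ct_sparse = ColumnTransformer([('trans', SparseMatrixTrans(), 1)],
                              sparse_threshold=0.5)
out = ct_sparse.fit_transform(X_demo)  # 3x3 identity, density 1/3 < 0.5
assert sparse.issparse(out)
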
def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)

    # specify to drop remaining columns
    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit_transform, X_array)
def test_column_transformer_empty_columns(pandas, column):
    # test case that ensures that the column transformer does also work when
    # a given transformer doesn't have any columns to work on
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    if pandas:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X_array, columns=['first', 'second'])
    else:
        X = X_array

    ct = ColumnTransformer([('trans1', Trans(), [0, 1]),
                            ('trans2', Trans(), column)])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[1][1], Trans)

    ct = ColumnTransformer([('trans1', Trans(), column),
                            ('trans2', Trans(), [0, 1])])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[0][1], Trans)

    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)

    fixture = np.array([[], [], []])
    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X), fixture)
    assert_array_equal(ct.fit(X).transform(X), fixture)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)
def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(AttributeError,
                         "Transformer trans (type Trans) does not provide "
                         "get_feature_names", ct.get_feature_names)

    # working example
    X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                  [{'c': 5}, {'c': 6}]], dtype=object).T
    ct = ColumnTransformer(
        [('col' + str(i), DictVectorizer(), i) for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer(
        [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
Example #24
numeric_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = [
    'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit'
]
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numeric_features), ('cat', categorical_transformer, categorical_features)])

preprocessor.fit(hour_x_train)
x_train = preprocessor.transform(hour_x_train).todense()
x_val = preprocessor.transform(hour_x_val).todense()
with open('encoder.p', 'wb') as f:  # Save encoder
    pickle.dump(preprocessor, f)
print('Predictors prepared')

# Prepare targets
y_train = hour_y_train.values.astype(float)
y_val = hour_y_val.values.astype(float)
print('all data prepared')

# Test different weight decays
weight_decay_list = 10**np.linspace(-5, 3, 10)

val_loss_list = []
result_path_list = []
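
# A matching reload step for the encoder pickled above (a sketch): the saved
# preprocessor can be restored later and applied to new data unchanged.
with open('encoder.p', 'rb') as f:
    loaded_preprocessor = pickle.load(f)
x_val_again = loaded_preprocessor.transform(hour_x_val).todense()
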
Example #25
train_df = pd.read_csv('training.csv')

X = train_df.drop('Instance', axis=1)
X = X.drop('Income in EUR', axis=1)
y = train_df['Income in EUR']

X_pred = pd.read_csv('test.csv')
X_pred = X_pred.drop('Income', axis=1)
X_pred = X_pred.drop('Instance', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1)

ct = ColumnTransformer(transformers=[('num_imp', SimpleImputer(strategy='median'), [0, 2, 4, 9]), ('cat_imp', SimpleImputer(strategy='most_frequent'), [1, 3, 5, 6, 7, 8])], remainder='passthrough')

ct.fit(X_train, y_train)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

jobs = X_train[:, 6]
senior_job_terms = ['senior', 'manager', 'doctor', 'lawyer', 'analyst', 'programmer', 'specialist', 'supervisor', 'chief']
# flag a job title as senior if it contains any of the senior terms
senior_job = ['yes' if any(s in j for s in senior_job_terms) else 'no' for j in jobs]
Xtest_new1 = d_tr.drop(['ID'], axis=1)
featuresObject = ['season', 'year', 'month', 'hours', 'is_business_day', 'is_holiday']
for var in featuresObject:
    Xtest_new1[var] = Xtest_new1[var].astype('category')
Xtest_new1.info()
Xtest_new = Xtest_new1.copy()

col_name_test = [f for f in Xtest_new.columns if Xtest_new[f].dtype == float]
type(col_name_test)
numeric_features_test = Xtest_new[col_name_test]
type(numeric_features_test)
Xtest_new[col_name_test].dtypes

# Numeric features

X_te = ct_num.fit_transform(numeric_features_test)  # fit() alone returns the transformer, not the data
Xtest_new[col_name_test] = pd.DataFrame(X_te, columns=numeric_features_test.columns, index=list(X_test.index.values))
Xtest_new.info()
type(X_te)
print(type(X_te))
print(X_te[0])
numeric_features_test = scaler.transform(numeric_features_test.values)
# Handle the categorical variables

d_1he_test = ct.fit_transform(Xtest_new)  # need the transformed array, not the fitted transformer

Xtest_new.info()
Xtrain_new.info()
d_encoded_data = pd.DataFrame(d_1he_test, columns=ct.get_feature_names(), index = list(X_test.index.values))
d_encoded_data.drop(['oh_enc__x0_2016', 'oh_enc__x1_1','oh_enc__x2_0', 'oh_enc__x3_0','oh_enc__x4_0', 'oh_enc__x5_fall'], inplace=True, axis=1)
df_concat = pd.concat([Xtest_new.reset_index(drop=True), d_encoded_data.reset_index(drop=True)], axis=1)
    # Assumed head for this truncated snippet (the original begins mid-expression);
    # it mirrors the imputer + one-hot categorical pipelines used in the other examples.
    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    turbine_type_feature = ['turbine_type']
    turbine_type_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='HAWT')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ttype', turbine_type_transformer, turbine_type_feature)
    ],
                                     remainder="drop")

    preprocessor.fit(raw_data)

    joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))


def input_fn(input_data, content_type):
    """Parse input data payload

    We currently only take csv input. Since we need to process both labelled
    and unlabelled data we first determine whether the label column is present
    by looking at how many columns were provided.
    """
    if content_type == 'text/csv':
        # Read the raw input data as CSV.
        df = pd.read_csv(StringIO(input_data), header=None)
        # (the label-column detection described in the docstring is elided here)
        return df
    raise ValueError("{} content type is not supported by this script".format(content_type))
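

# A quick sanity check for input_fn (a sketch; the CSV payload below is
# hypothetical and only illustrates the expected content type):
if __name__ == '__main__':
    sample = "0.5,0.4,0.1,0.6,0.2,0.1,0.2,M"
    print(input_fn(sample, 'text/csv'))
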
class TransformerClass(BaseEstimator, TransformerMixin):
    """
        TransformerClass
    """
    def __init__(self):
        pass

    def create_pipeline_for_categorical_params(self) -> Pipeline:
        """
        processing of categorical params
        """
        return Pipeline([("OH", OneHotEncoder())])

    def create_pipeline_for_numerical_params(self) -> Pipeline:
        """
        processing of numerical params
        """
        return Pipeline([("impute", SimpleImputer(missing_values=np.nan,
                                                  strategy="mean"))])

    def create(self, params: FeatureParams) -> ColumnTransformer:
        """
                create transformer pipeline
        """
        self.transformer = ColumnTransformer([
            (
                "pipeline_for_categorical_params",
                self.create_pipeline_for_categorical_params(),
                params.categorical,
            ),
            (
                "pipeline_for_numerical_params",
                self.create_pipeline_for_numerical_params(),
                params.numerical,
            ),
        ])
        return self.transformer

    def save(self, path_to_save: str) -> str:
        """
        save transformer to disk
        """
        with open(path_to_save, "wb") as file:
            pickle.dump(self.transformer, file)
        return path_to_save

    def load(self, path_to_save: str) -> ColumnTransformer:
        """
        load transformer from disk
        """
        with open(path_to_save, "rb") as file:
            self.transformer = pickle.load(file)
        return self.transformer

    def fit(self, df: pd.DataFrame, params: FeatureParams) -> "TransformerClass":
        """
        fit the transformer to input data
        """
        self.create(params)
        self.transformer.fit(df)
        return self

    def fit_transform(self, df: pd.DataFrame,
                      params: FeatureParams) -> pd.DataFrame:
        """
        fit the transformer to input data and transform the data
        """
        self.create(params)
        return pd.DataFrame(self.transformer.fit_transform(df))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        transform the data with an already fitted transformer
        """
        return pd.DataFrame(self.transformer.transform(df))
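
# A minimal usage sketch for TransformerClass; _DemoFeatureParams below is a
# hypothetical stand-in for the project's real FeatureParams container.
from dataclasses import dataclass
from typing import List


@dataclass
class _DemoFeatureParams:
    categorical: List[str]
    numerical: List[str]


demo_df = pd.DataFrame({"color": ["red", "blue", "red"],
                        "size": [1.0, np.nan, 3.0]})
tc = TransformerClass()
encoded = tc.fit_transform(demo_df,
                           _DemoFeatureParams(categorical=["color"],
                                              numerical=["size"]))
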
# Calculate important parameters.
n_patients_train = X_train.shape[0]
n_features = X_train.shape[1]

# %% [markdown]
# ### Pre-process data

# %%
# Standardization
cols_standardize = [
    'grade', 'age', 'n_positive_nodes', 'progesterone', 'estrogen'
]
X_ct = ColumnTransformer([('standardizer', StandardScaler(), cols_standardize)
                          ])
X_ct.fit(X_train[cols_standardize])

X_train[cols_standardize] = X_ct.transform(X_train[cols_standardize])
X_test[cols_standardize] = X_ct.transform(X_test[cols_standardize])

Y_scaler = StandardScaler().fit(Y_train)
Y_train['T'] = Y_scaler.transform(Y_train)
Y_test['T'] = Y_scaler.transform(Y_test)

# %%
# Sorting
sort_idx = np.argsort(Y_train.to_numpy(), axis=None)[::-1]
X_train = X_train.loc[sort_idx, :]
Y_train = Y_train.loc[sort_idx, :]
E_train = E_train.loc[sort_idx, :]
Example #30
class EasyPreprocessor(BaseEstimator, TransformerMixin):
    """A simple preprocessor

    Detects variable types, encodes everything as floats
    for use with sklearn.

    Applies one-hot encoding, missing value imputation and scaling.

    Attributes
    ----------
    ct_ : ColumnTransformer
        Main container for all transformations.

    columns_ : pandas columns
        Columns of training data

    dtypes_ : Series of dtypes
        Dtypes of training data columns.

    types_ : DataFrame
        Inferred input types (one boolean indicator column per type).


    Parameters
    ----------
    scale : boolean, default=True
        Whether to scale continuous data.

    verbose : int, default=0
        Control output verbosity.

    """
    def __init__(self, scale=True, verbose=0, types=None):
        self.verbose = verbose
        self.scale = scale
        self.types = types

    def fit(self, X, y=None):
        """A reference implementation of a fitting function for a transformer.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.
        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        self : object
            Returns self.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        self.columns_ = X.columns
        self.dtypes_ = X.dtypes
        if self.types is None:
            # FIXME some sanity check?
            types = detect_types(X, verbose=self.verbose)
        else:
            types = self.types

        types = types.copy()
        # low card int encoded as categorical and continuous for now:
        types.loc[types.low_card_int, 'continuous'] = True
        types.loc[types.low_card_int, 'categorical'] = True

        # go over variable blocks
        # check for missing values
        # scale etc
        steps_categorical = []
        if X.loc[:, types.categorical].isna().any(axis=None):
            steps_categorical.append(
                SimpleImputer(strategy='constant', fill_value='dabl_missing'))
        steps_categorical.append(
            OneHotEncoder(categories='auto',
                          handle_unknown='ignore',
                          sparse=False))
        pipe_categorical = make_pipeline(*steps_categorical)

        steps_continuous = []
        if (X.loc[:, types.continuous].isna().any(axis=None)
                or types['dirty_float'].any()):
            # we could skip the imputer here, but if there's dirty
            # floats, they'll have NaN, and we reuse the cont pipeline
            steps_continuous.append(SimpleImputer(strategy='median'))
        if self.scale:
            steps_continuous.append(StandardScaler())
        # if X.loc[:, types['continuous']].isnull().values.any():
        # FIXME doesn't work if missing values only in dirty column
        pipe_continuous = make_pipeline(*steps_continuous)
        # FIXME only have one imputer/standard scaler in all
        # (right now copied in dirty floats and floats)

        pipe_dirty_float = make_pipeline(
            DirtyFloatCleaner(),
            make_column_transformer((pipe_continuous, select_cont),
                                    remainder="passthrough"))
        # construct column transformer
        transformer_cols = []
        if types['continuous'].any():
            transformer_cols.append(
                ('continuous', pipe_continuous, types['continuous']))
        if types['categorical'].any():
            transformer_cols.append(
                ('categorical', pipe_categorical, types['categorical']))
        if types['dirty_float'].any():
            # FIXME we're not really handling this here any more? (yes we are)
            transformer_cols.append(
                ('dirty_float', pipe_dirty_float, types['dirty_float']))

        if not len(transformer_cols):
            raise ValueError("No feature columns found")
        self.ct_ = ColumnTransformer(transformer_cols, sparse_threshold=.1)

        self.ct_.fit(X)

        self.input_shape_ = X.shape
        self.types_ = types
        # Return the transformer
        return self

    def get_feature_names(self):
        # this can go soon hopefully
        feature_names = []
        for name, trans, cols in self.ct_.transformers_:
            if name == "continuous":
                # there should be no all-NaN columns in the imputer
                if (trans.steps[0][0] == "simpleimputer"
                        and np.isnan(trans.steps[0][1].statistics_).any()):
                    raise ValueError("So unexpected! Looks like the imputer"
                                     " dropped some all-NaN columns."
                                     "Try calling 'clean' on your data first.")
                feature_names.extend(cols.index[cols])
            elif name == 'categorical':
                # this is the categorical pipe, extract one hot encoder
                ohe = trans.steps[-1][1]
                # FIXME that is really strange?!
                ohe_cols = self.columns_[self.columns_.map(cols)]
                feature_names.extend(ohe.get_feature_names(ohe_cols))
            elif name == "remainder":
                assert trans == "drop"
            elif name == "dirty_float":
                raise ValueError(
                    "Can't compute feature names when handling dirty floats. "
                    "Call 'clean' as a workaround")
            else:
                raise ValueError(
                    "Can't compute feature names for {}".format(name))
        return feature_names

    def transform(self, X):
        """ A reference implementation of a transform function.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_transformed : array of int of shape = [n_samples, n_features]
            The array containing the element-wise square roots of the values
            in `X`
        """
        # Check is fit had been called
        check_is_fitted(self, ['ct_'])
        return self.ct_.transform(X)
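
# A minimal usage sketch for EasyPreprocessor (assumes dabl-style helpers such
# as detect_types and DirtyFloatCleaner are importable, as in the class above):
demo = pd.DataFrame({"num": [1.0, 2.0, np.nan, 4.0],
                     "cat": ["a", "b", "a", "b"]})
prep = EasyPreprocessor(scale=True)
X_enc = prep.fit(demo).transform(demo)  # all-float output: imputed/scaled + one-hot
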
Example #31
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based

        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),

        # int keys: positional

        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),

        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([
        transformer_weights['trans1'] * X_df['first'],
        transformer_weights['trans2'] * X_df['second']
    ]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert both.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through

    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #32
    def test_check_preprocessing_1(self):
        """
        Test check_preprocessing on multiple kinds of preprocessing
        """
        train = pd.DataFrame({
            'Onehot1': ['A', 'B', 'A', 'B'],
            'Onehot2': ['C', 'D', 'C', 'D'],
            'Binary1': ['E', 'F', 'E', 'F'],
            'Binary2': ['G', 'H', 'G', 'H'],
            'Ordinal1': ['I', 'J', 'I', 'J'],
            'Ordinal2': ['K', 'L', 'K', 'L'],
            'BaseN1': ['M', 'N', 'M', 'N'],
            'BaseN2': ['O', 'P', 'O', 'P'],
            'Target1': ['Q', 'R', 'Q', 'R'],
            'Target2': ['S', 'T', 'S', 'T'],
            'other': ['other', np.nan, 'other', 'other']
        })

        y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

        enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
        train_onehot = enc_onehot.transform(train)
        enc_binary = ce.BinaryEncoder(
            cols=['Binary1', 'Binary2']).fit(train_onehot)
        train_binary = enc_binary.transform(train_onehot)
        enc_ordinal = ce.OrdinalEncoder(
            cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
        train_ordinal = enc_ordinal.transform(train_binary)
        enc_basen = ce.BaseNEncoder(
            cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
        train_basen = enc_basen.transform(train_ordinal)
        enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(
            train_basen, y)

        input_dict1 = dict()
        input_dict1['col'] = 'Onehot2'
        input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                           index=['C', 'D', 'missing'])
        input_dict1['data_type'] = 'object'

        input_dict2 = dict()
        input_dict2['col'] = 'Binary2'
        input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                           index=['G', 'H', 'missing'])
        input_dict2['data_type'] = 'object'

        input_dict = dict()
        input_dict['col'] = 'state'
        input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'],
                                          index=['US', 'FR', 'FR'])
        input_dict['data_type'] = 'object'

        input_dict3 = dict()
        input_dict3['col'] = 'Ordinal2'
        input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                           index=['K', 'L', 'missing'])
        input_dict3['data_type'] = 'object'
        list_dict = [input_dict2, input_dict3]

        y = pd.DataFrame(data=[0, 1], columns=['y'])

        train = pd.DataFrame({
            'city': ['chicago', 'paris'],
            'state': ['US', 'FR'],
            'other': ['A', 'B']
        })
        enc = ColumnTransformer(transformers=[('onehot', skp.OneHotEncoder(),
                                               ['city', 'state'])],
                                remainder='drop')
        enc.fit(train, y)

        wrong_prepro = skp.OneHotEncoder().fit(train, y)

        check_preprocessing([
            enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
            input_dict1, list_dict
        ])
        for preprocessing in [
                enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target
        ]:
            check_preprocessing(preprocessing)

        check_preprocessing(input_dict2)
        check_preprocessing(enc)
        check_preprocessing(None)

        with self.assertRaises(Exception):
            check_preprocessing(wrong_prepro)
    def test_multiple_encoding_columntransformers(self):
        """
        Test multiple encodings combined in ColumnTransformers
        """
        train = pd.DataFrame({
            'Onehot1': ['A', 'B'],
            'Onehot2': ['C', 'D'],
            'Binary1': ['E', 'F'],
            'Binary2': ['G', 'H'],
            'Ordinal1': ['I', 'J'],
            'Ordinal2': ['K', 'L'],
            'BaseN1': ['M', 'N'],
            'BaseN2': ['O', 'P'],
            'Target1': ['Q', 'R'],
            'Target2': ['S', 'T'],
            'other': ['other', np.nan]
        })

        contributions = pd.DataFrame(
            [[
                1, 0, 1, 1, 1, 0, 1, 1, 3, 0, -3.5, 0, 4, 4, 5, 5, 0, 6, 7, 0,
                8, 9, 10
            ],
             [
                 .5, .5, 2, 0, .5, .5, 2, 0, 1.5, 1.5, 5.5, -2, -4, -4, -5, -5,
                 8.5, -2.5, -7, 14, -8, -9, -10
             ]],
            index=['index1', 'index2'])

        expected_contrib = pd.DataFrame(
            {
                'onehot_skp_Onehot1': [1., 1.],
                'onehot_skp_Onehot2': [2, 2],
                'onehot_ce_Onehot1': [1., 1.],
                'onehot_ce_Onehot2': [2, 2],
                'binary_ce_Binary1': [3., 3.],
                'binary_ce_Binary2': [-3.5, 3.5],
                'ordinal_ce_Ordinal1': [4, -4],
                'ordinal_ce_Ordinal2': [4, -4],
                'ordinal_skp_Ordinal1': [5, -5],
                'ordinal_skp_Ordinal2': [5, -5],
                'basen_ce_BaseN1': [6., 6.],
                'basen_ce_BaseN2': [7, 7],
                'target_ce_Target1': [8, -8],
                'target_ce_Target2': [9, -9],
                22: [10, -10]
            },
            index=['index1', 'index2'])

        y = pd.DataFrame(data=[0, 1], columns=['y'])

        enc = ColumnTransformer(transformers=[
            ('onehot_skp', skp.OneHotEncoder(), ['Onehot1', 'Onehot2']),
            ('onehot_ce', ce.OneHotEncoder(), ['Onehot1', 'Onehot2']),
            ('binary_ce', ce.BinaryEncoder(), ['Binary1', 'Binary2']),
            ('ordinal_ce', ce.OrdinalEncoder(), ['Ordinal1', 'Ordinal2']),
            ('ordinal_skp', skp.OrdinalEncoder(), ['Ordinal1', 'Ordinal2']),
            ('basen_ce', ce.BaseNEncoder(), ['BaseN1', 'BaseN2']),
            ('target_ce', ce.TargetEncoder(), ['Target1', 'Target2'])
        ],
                                remainder='passthrough')
        enc.fit(train, y)

        input_dict1 = dict()
        input_dict1['col'] = 'Onehot2'
        input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                           index=['C', 'D', 'missing'])
        input_dict1['data_type'] = 'object'

        input_dict2 = dict()
        input_dict2['col'] = 'Binary2'
        input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                           index=['G', 'H', 'missing'])
        input_dict2['data_type'] = 'object'

        input_dict3 = dict()
        input_dict3['col'] = 'Ordinal2'
        input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                           index=['K', 'L', 'missing'])
        input_dict3['data_type'] = 'object'
        list_dict = [input_dict2, input_dict3]

        original = inverse_transform_contributions(
            contributions, [enc, input_dict1, list_dict])

        pd.testing.assert_frame_equal(expected_contrib, original)
Example #34
    # - length: Longest shell measurement
    # - diameter: Diameter perpendicular to length
    # - height: Height with meat in shell
    # - whole_weight: Weight of whole abalone
    # - shucked_weight: Weight of meat
    # - viscera_weight: Gut weight (after bleeding)
    # - shell_weight: Weight after being dried
    # Categorical Features:
    # - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant
    numeric_features = list(feature_columns_names)
    numeric_features.remove('sex')
    numeric_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='median')), ('scaler',
                                                    StandardScaler())])

    categorical_features = ['sex']
    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
                                     remainder="drop")

    preprocessor.fit(concat_data)

    joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))
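
# A matching load step for serving (a sketch that mirrors the joblib.dump
# above and follows the SageMaker scikit-learn container convention):
def model_fn(model_dir):
    """Deserialize the fitted preprocessor."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))
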
Example #35
# Assumed definition (the original snippet is truncated here); the 'encoder'
# step name matches the named_transformers_ lookup below.
categorical_pipe = Pipeline(
    [
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

numerical_pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ]
)

preprocessor = ColumnTransformer(transformers = [
    ('cat', categorical_pipe, categorical),
    ('num', numerical_pipe, numerical)]
)

# Fit and transform training data
preprocessor.fit(X_train)

cat = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(categorical)
columns = np.append(cat, numerical)

X_train_transformed = pd.DataFrame(preprocessor.transform(X_train), columns=columns)
X_train_transformed.head()

'''
 Step 4 : Do some serious ML
'''

def create_baseline_classifiers(seed=seed):
    """Create a list of baseline classifiers.
    
    Parameters
Example #36
class RepeatingBasisFunction(TransformerMixin, BaseEstimator):
    """
    This is a transformer for features with some form of circularity.
    E.g. for days of the week you might face the problem that, conceptually, day 7 is as
    close to day 6 as it is to day 1, while numerically their distance is different.
    This transformer remedies that problem.
    The transformer selects a column and transforms it with a given number of repeating
    (radial) basis functions. Radial basis functions are bell-curve shaped functions
    which take the original data as input. The basis functions are equally spaced over
    the input range. The key feature of repeating basis functions is that they are
    continuous when moving from the max to the min of the input range. As a result these
    repeating basis functions can capture how close each datapoint is to the center of
    each repeating basis function, even when the input data has a circular nature.

    :type column: int or list, default=0
    :param column: Indexes the data on its second axis. Integers are interpreted as
        positional columns, while strings can reference DataFrame columns by name.

    :type remainder: {'drop', 'passthrough'}, default="drop"
    :param remainder: By default, only the specified column is transformed, and the
        non-specified columns are dropped. (default of ``'drop'``). By specifying
        ``remainder='passthrough'``, all remaining columns will be automatically passed
        through. This subset of columns is concatenated with the output of the transformer.

    :type n_periods: int, default=12
    :param n_periods: number of basis functions to create, i.e., the number of columns that
        will exit the transformer.

    :type input_range: tuple or None, default=None
    :param input_range: the values at which the data repeats itself. For example, for days of
        the week this is (1,7). If input_range=None it is inferred from the training data.
    """

    def __init__(
        self, column=0, remainder="drop", n_periods=12, input_range=None
    ):
        self.column = column
        self.remainder = remainder
        self.n_periods = n_periods
        self.input_range = input_range

    def fit(self, X, y=None):
        self.pipeline_ = ColumnTransformer(
            [
                (
                    "repeatingbasis",
                    _RepeatingBasisFunction(
                        n_periods=self.n_periods, input_range=self.input_range
                    ),
                    [self.column],
                )
            ],
            remainder=self.remainder,
        )

        self.pipeline_.fit(X, y)

        return self

    def transform(self, X):
        check_is_fitted(self, ["pipeline_"])
        return self.pipeline_.transform(X)
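
# A minimal usage sketch for RepeatingBasisFunction: encode a day-of-week
# column that wraps around at 7 (the data below is hypothetical):
df_days = pd.DataFrame({"day": np.arange(1, 15) % 7 + 1,
                        "value": np.random.randn(14)})
rbf = RepeatingBasisFunction(column="day", n_periods=4,
                             input_range=(1, 7), remainder="passthrough")
out = rbf.fit_transform(df_days)  # 4 basis columns + passed-through 'value'
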
def build_model(dataframe=None,
                target_column=None,
                numerical_transformer=None,
                categorical_transformer=None,
                pca=False,
                algorithm=None,
                balance_data=False,
                grid_search=False,
                params=None,
                hashing=False,
                hash_size=500,
                project_path=None,
                **kwargs):

    #     algorithm = algorithm.copy()
    data_path = os.path.join(project_path, 'data/')
    os.makedirs(data_path, exist_ok=True)

    identify_columns(dataframe, target_column, output_path=data_path, **kwargs)

    model_preprocessor_pipeline = os.path.join(project_path, 'model/')
    os.makedirs(model_preprocessor_pipeline, exist_ok=True)

    if os.path.exists(f"{project_path}/data/metadata/store_file.yaml"):
        with open(f"{project_path}/data/metadata/store_file.yaml") as f:
            config = yaml.safe_load(f)
        numerical_attribute = config['num_feat']
        categorical_attribute = config['cat_feat']
        lower_categorical_attribute = config['lower_cat']
        hash_features = config['hash_feat']
        input_columns = config['input_columns']
    else:
        raise ValueError(
            f'path: No file found in {project_path}/data/metadata/')

    if hashing:
        hash_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant',
                                      fill_value='Missing')),
            ('hasher', FeatureHasher(n_features=hash_size,
                                     input_type='string'))
        ])
        categorical_attribute = lower_categorical_attribute
        data_transformer = ColumnTransformer(transformers=[(
            'numerical', numerical_transformer, numerical_attribute
        ), ('categorical', categorical_transformer,
            categorical_attribute), ('hasher', hash_transformer,
                                     hash_features)])
        use_cols = numerical_attribute + categorical_attribute + hash_features

    else:
        data_transformer = ColumnTransformer(transformers=[(
            'numerical', numerical_transformer, numerical_attribute
        ), ('categorical', categorical_transformer, categorical_attribute)])
        use_cols = numerical_attribute + categorical_attribute

    train_df = manage_columns(dataframe,
                              columns=input_columns,
                              select_columns=True)
    y = dataframe[target_column]

    if balance_data:
        oversample = SMOTE()
        data_transformer.fit(train_df)
        encoder = data_transformer.transform(train_df)
        X, y = oversample.fit_resample(encoder, y)
        train_df = X
        X_train_copy = encoder
        X_train, X_test, y_train, y_test = train_test_split(train_df, y,\
                                                            stratify=y, test_size=0.20, random_state=0)

    else:

        X_train, X_test, y_train, y_test = train_test_split(train_df, y,\
                                                            stratify=y, test_size=0.20, random_state=0)

        X_train_copy = X_train.copy()
        data_transformer.fit(X_train_copy)
        X_train_copy = data_transformer.transform(X_train_copy)

    if pca:
        print_devider('Applying PCA to the data')
        # PCA requires a dense array; the ColumnTransformer may emit sparse output.
        if scipy.sparse.issparse(X_train_copy):
            X_train_array = X_train_copy.toarray()
        else:
            X_train_array = X_train_copy
        pca_ = PCA().fit(X_train_array)
        pca_evr = pca_.explained_variance_ratio_
        cumsum_ = np.cumsum(pca_evr)
        # smallest number of components that retains 95% of the variance
        dim_95 = np.argmax(cumsum_ >= 0.95) + 1
        instances_, dims_ = X_train_copy.shape
        dim_reduction = PCA(dim_95)
        print(
            f"\nDimension reduced from {dims_} to {dim_95} while retaining 95% of variance."
        )
        if hashing:
            preprocessor = Pipeline(steps=[('data_transformer', data_transformer),
                                           ('to_dense', DenseTransformer()),
                                           ('reduce_dim', dim_reduction)])
        else:
            preprocessor = Pipeline(steps=[('data_transformer', data_transformer),
                                           ('reduce_dim', dim_reduction)])
    else:
        if hashing:
            preprocessor = Pipeline(steps=[('data_transformer', data_transformer),
                                           ('to_dense', DenseTransformer())])
        else:
            preprocessor = Pipeline(steps=[('data_transformer', data_transformer)])

    classifier = Pipeline(steps=[('preprocessor',
                                  preprocessor), ('model', algorithm)])

    exclusive_keyword = ['model', 'fit', 'hyperparameters']

    if params:
        model_params = {}
        fit_params = {}
        hyperparameters_params = {}
        for first_key in params.keys():
            for key, value in params[first_key].items():
                key = f'model__{key}'
                if first_key == 'model':
                    model_params[key] = value
                elif first_key == 'fit':
                    fit_params[key] = value
                elif first_key == 'hyperparameters':
                    hyperparameters_params[key] = value
                else:
                    raise ValueError(
                        'params: only one of parameters {} should be set'.format(
                            exclusive_keyword))

    if grid_search:
        # We can use a parameter grid to search for the best hyperparameters
        # or transformers. The syntax is <pipeline_step_name>__<parameter>;
        # the keys were already prefixed with 'model__' when params was parsed above.
        parameters_grid = hyperparameters_params if params else {}
        # Run the grid search and fit it on our dataset
        grid_search = GridSearchCV(classifier, param_grid=parameters_grid)
        grid_search.fit(X_train, y_train)
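        # For nested pipelines the parameter name chains through every step;
        # e.g. a (hypothetical) key like
        # 'preprocessor__data_transformer__numerical__imputer__strategy'
        # would reach the imputer inside the 'numerical' branch of the
        # ColumnTransformer built above.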

        model_path = os.path.join(model_preprocessor_pipeline,
                                  f'{algorithm.__class__.__name__}.pkl')
        store_pipeline(grid_search.best_estimator_, model_path)

        # set config to diagram for visualizing the pipelines/composite estimator
        set_config(display='diagram')
        # visualize the best estimator from the grid search
        output = grid_search.best_estimator_
        # save the pipeline in HTML format
        with open(
                f'{model_preprocessor_pipeline}/titanic_data_pipeline_estimator.html',
                'w') as f:
            f.write(estimator_html_repr(grid_search.best_estimator_))

    else:
        if params:
            kwargsList = inspect.getfullargspec(algorithm.fit)[0]
            if len(fit_params) > 0:
                try:
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train, **fit_params)
                except Exception:
                    # Some estimators (e.g. CatBoost) need the categorical
                    # feature indices passed at fit time.
                    if 'cat_features' in kwargsList:
                        cate_features_index = [
                            X_train.columns.get_loc(col)
                            for col in X_train.columns
                        ][len(numerical_attribute):]
                        fit_params['model__cat_features'] = cate_features_index
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train, **fit_params)
            else:
                try:
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train)
                except Exception:
                    if 'cat_features' in kwargsList:
                        cate_features_index = [
                            X_train.columns.get_loc(col)
                            for col in X_train.columns
                        ][len(numerical_attribute):]
                        fit_params['model__cat_features'] = cate_features_index
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train, **fit_params)
        else:
            classifier.fit(X_train, y_train)

        model_path = os.path.join(model_preprocessor_pipeline,
                                  f'{algorithm.__class__.__name__}.pkl')
        store_pipeline(classifier, model_path)

        # set config to diagram for visualizing the pipelines/composite estimators
        set_config(display='diagram')
        output = classifier

        with open(
                f'{model_preprocessor_pipeline}/titanic_data_pipeline_estimator.html',
                'w') as f:
            f.write(estimator_html_repr(classifier))


#     X_test = data_transformer.transform(X_test)
    y_pred = output.predict(X_test)

    print_devider('Metric Performance')

    met_perf = get_scores(y_test, y_pred)

    print(f'\nMetric performance on test data\n{met_perf}\n\n')
    print('\nconfusion matrix')

    print(confusion_matrix(y_test, y_pred))

    return output
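# A hedged usage sketch of build_model (the transformers, algorithm and paths
# below are illustrative assumptions, not from the original):
# model = build_model(dataframe=df, target_column='churn',
#                     numerical_transformer=Pipeline([('imputer', SimpleImputer()),
#                                                     ('scaler', StandardScaler())]),
#                     categorical_transformer=Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))]),
#                     algorithm=RandomForestClassifier(),
#                     project_path='./project')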
Example #38
train.drop('m_id', axis=1, inplace=True)
train.drop(['user', 'item'], axis=1, inplace=True)
# %%
test = pd.merge(test, u_user, how='left', left_on='user', right_on='u_id')
test.drop('u_id', axis=1, inplace=True)
test = pd.merge(test, u_item, how='left', left_on='item', right_on='m_id')
test.drop('m_id', axis=1, inplace=True)
test.drop(['user', 'item'], axis=1, inplace=True)

# %%
ct = ColumnTransformer([
                        # ('u_i_onehot',OneHotEncoder(categories=[range(1, n_user + 1), range(1, n_item + 1)], sparse=False,dtype=np.int), ['user', 'item']),
                        ('gender_onehot', OneHotEncoder(dtype=int, sparse=False), ['gender', 'occupation', 'zip_code'])
                        ],
                       remainder='passthrough')
ct.fit(train)
X_train = ct.transform(train)
X_test = ct.transform(test)

# %%
# Feature dimension and the dimension of V
n_feature = X_train.shape[1]
k = 10
# %%
# Define the weights
w0 = tf.Variable(initial_value=tf.truncated_normal(shape=[1]), name='w0')
w = tf.Variable(initial_value=tf.truncated_normal(shape=[n_feature]), name='w')
V = tf.Variable(initial_value=tf.truncated_normal(shape=[k, n_feature]), name='V')
# %%
X = tf.placeholder(dtype='float', shape=[None, n_feature], name="X")
y = tf.placeholder(dtype='float', shape=[None, 1], name='y')
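# %%
# (The snippet is cut off here; a minimal sketch of the standard factorization
# machine prediction these weights imply — an assumption, not the original:)
# y_hat = w0 + X·w + 0.5 * Σ_f [ (X·V_f)² − (X²)·(V_f²) ]
linear_terms = w0 + tf.reduce_sum(tf.multiply(w, X), axis=1, keepdims=True)
pair_interactions = 0.5 * tf.reduce_sum(
    tf.square(tf.matmul(X, tf.transpose(V))) -
    tf.matmul(tf.square(X), tf.transpose(tf.square(V))),
    axis=1, keepdims=True)
y_hat = linear_terms + pair_interactions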
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['sex']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder="drop")
    
    preprocessor.fit(concat_data)

    joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))

    print("saved model!")
    
    
def input_fn(input_data, content_type):
    """Parse input data payload
    
    We currently only take csv input. Since we need to process both labelled
    and unlabelled data we first determine whether the label column is present
    by looking at how many columns were provided.
    """
    if content_type == 'text/csv':
        # Read the raw input data as CSV.
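        # (The snippet is cut off here; a minimal sketch of the usual pattern,
        # assuming pandas and `from io import StringIO` are available:)
        df = pd.read_csv(StringIO(input_data), header=None)
        return df
    else:
        raise ValueError("{} not supported by this script!".format(content_type))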
cat_attrb_selected = [
    "Suburb", "Type", "Method", "SellerG", "Date", "CouncilArea", "Regionname"
]

cat_pipeline = Pipeline([("select_cat", DataFrameSelector(cat_attrb_selected)),
                         ("imputer", MostFrequentImputer()),
                         ('cat_encoder',
                          OneHotEncoder(handle_unknown='ignore',
                                        sparse=False))])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrb_selected),
    ("cat", cat_pipeline, cat_attrb_selected),
])

full_pipeline.fit(data[data.Price.notnull()])

#SVM = pickle.load(open('model/Pickle/SVM.pkl', 'rb'))
#KNN = pickle.load(open('model/Pickle/KNN.pkl', 'rb'))
#RF = pickle.load(open('model/Pickle/RF.pkl', 'rb'))
#SVM_Grid = pickle.load(open('model/Pickle/svm_grid.pkl','rb'))
#KNN_Grid = pickle.load(open('model/Pickle/knn_grid.pkl','rb'))
#RF_Random = pickle.load(open('model/Pickle/rf_random.pkl','rb'))
# print(SVM.predict(full_pipeline.transform(data.head(1))))

test = [
    2, 2.5, 2, 1, 1, 202, -37.7996, 144.9984, "Abbotsford", "h", "S", "Biggin",
    "3/12/2016", "Yarra City Council", "Northern Metropolitan"
]

features = num_attrb_selected + cat_attrb_selected
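# A usage sketch (assumption: one of the pickled models above has been loaded;
# `test` follows the numeric-then-categorical order of `features`):
# sample = pd.DataFrame([test], columns=features)
# print(RF.predict(full_pipeline.transform(sample)))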
Example #41
# In[14]:

test_data = pd.DataFrame([test_country], columns=countries.columns)

# In[15]:

data_features = countries.select_dtypes('number').columns

data_pipeline = Pipeline(steps=[('imputer', SimpleImputer(
    strategy='median')), ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', data_pipeline,
                                                data_features)],
                                 remainder='drop')

preprocessor.fit(countries)

# In[16]:


def q4():
    arable = preprocessor.transform(test_data)[0][data_features.get_loc(
        'Arable')]
    return float(round(arable, 3))


# ## Question 5
#
# Find the number of _outliers_ in the `Net_migration` variable according to the _boxplot_ method, i.e., using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}] \Rightarrow x \text{ is an outlier}$$
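# A sketch for Question 5 using the boxplot rule above (an assumption about
# how the notebook answers it; 'q5' is a hypothetical name):
def q5():
    q1, q3 = countries['Net_migration'].quantile([0.25, 0.75])
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    net = countries['Net_migration'].dropna()
    return int(((net < low) | (net > high)).sum())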
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based

        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),

        # int keys: positional

        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),

        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through

    class TransAssert(BaseEstimator):

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, (pd.DataFrame, pd.Series))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #43
    def test_check_consistency_model_features_5(self):
        """
        Unit test check_consistency_model_features 5
        """
        train = pd.DataFrame(
            {
                'city': ['chicago', 'paris'],
                'state': ['US', 'FR'],
                'other': [5, 10]
            },
            index=['index1', 'index2'])

        features_dict = None
        columns_dict = {
            i: features
            for i, features in enumerate(train.columns)
        }
        features_types = {
            features: str(train[features].dtypes)
            for features in train.columns
        }
        mask_params = None

        enc = ColumnTransformer(transformers=[
            ('Ordinal_ce', ce.OrdinalEncoder(), ['city', 'state']),
            ('Ordinal_skp', skp.OrdinalEncoder(), ['city', 'state'])
        ],
                                remainder='passthrough')

        enc_2 = ColumnTransformer(transformers=[
            ('Ordinal_ce', ce.OrdinalEncoder(), ['city', 'state']),
            ('Ordinal_skp', skp.OrdinalEncoder(), ['city', 'state'])
        ],
                                  remainder='drop')

        enc.fit(train)
        train_1 = pd.DataFrame(
            enc.transform(train),
            columns=["city_ce", "state_ce", "city_skp", "state_skp", "other"])
        train_1["y"] = np.array([1, 0])

        clf_1 = cb.CatBoostClassifier(n_estimators=1) \
            .fit(train_1[["city_ce", "state_ce", "city_skp", "state_skp", "other"]],
                 train_1['y'])

        enc_2.fit(train)
        train_2 = pd.DataFrame(
            enc_2.transform(train),
            columns=["city_ce", "state_ce", "city_skp", "state_skp"])
        train_2["y"] = np.array([1, 0])

        clf_2 = cb.CatBoostClassifier(n_estimators=1) \
            .fit(train_2[["city_ce", "state_ce", "city_skp", "state_skp"]],
                 train_2['y'])

        enc_3 = ce.OneHotEncoder(cols=['city', 'state'])
        enc_3.fit(train)
        train_3 = enc_3.transform(train)
        train_3["y"] = np.array([1, 0])

        clf_3 = cb.CatBoostClassifier(n_estimators=1) \
            .fit(train_3[["city_1", "city_2", "state_1", "state_2", "other"]],
                 train_3['y'])

        dict_4 = {
            'col': 'state',
            'mapping': pd.Series(data=[1, 2], index=['US', 'FR']),
            'data_type': 'object'
        }

        dict_5 = {
            'col': 'city',
            'mapping': pd.Series(data=[1, 2], index=['chicago', 'paris']),
            'data_type': 'object'
        }

        enc_4 = [enc_3, [dict_4]]

        enc_5 = [enc_3, [dict_4, dict_5]]

        check_consistency_model_features(features_dict,
                                         clf_1,
                                         columns_dict,
                                         features_types,
                                         mask_params,
                                         enc,
                                         list_preprocessing=[enc])

        check_consistency_model_features(features_dict,
                                         clf_2,
                                         columns_dict,
                                         features_types,
                                         mask_params,
                                         enc_2,
                                         list_preprocessing=[enc_2])

        check_consistency_model_features(features_dict,
                                         clf_3,
                                         columns_dict,
                                         features_types,
                                         mask_params,
                                         enc_3,
                                         list_preprocessing=[enc_3])

        check_consistency_model_features(features_dict,
                                         clf_3,
                                         columns_dict,
                                         features_types,
                                         mask_params,
                                         enc_4,
                                         list_preprocessing=enc_4)

        check_consistency_model_features(features_dict,
                                         clf_3,
                                         columns_dict,
                                         features_types,
                                         mask_params,
                                         enc_5,
                                         list_preprocessing=enc_5)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    #       ('vect', CountVectorizer(), 'Model')
    # ('scale', StandardScaler(), all_features)
    #        ('iter', IterativeImputer(max_iter=10, random_state=0), ['New_Price'])
])
""" clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LinearRegression())])
 """
X = df.drop(['Price', 'Location_Type', 'Name', 'Mileage', 'New_Price'], axis=1)
y = df['Price']

X_test = tf.drop(['Location_Type', 'Name'], axis=1)

all_data = X.append(X_test)
preprocessor.fit(all_data)

X = preprocessor.transform(X)
X_test = preprocessor.transform(X_test)
X_train, X_validate, y_train, y_validate = train_test_split(X,
                                                            y,
                                                            test_size=0.1)

xgb = xgboost.XGBRegressor(n_estimators=200,
                           learning_rate=0.08,
                           gamma=0,
                           subsample=0.55,
                           colsample_bytree=0.75,
                           max_depth=20)

xgb.fit(X_train, y_train)
Example #45
def get_preprocessor_pipeline(df: pd.DataFrame) -> np.ndarray:
    """
    Build the preprocessing transformation, persist the fitted pipeline,
    and return the transformed data.
    Args:
        df: Dataframe

    Returns: Data transformed by the fitted pipeline

    """
    numeric_features = list(NUMERICAL_COLUMNS.keys())
    numeric_features += ['HouseAge', 'HouseAgeRemodel', 'GarageAge']

    ordinal_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                        'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'GarageQual']

    categorical_features = list(set(CATEGORICAL_COLUMNS.keys()) - set(ordinal_features))

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    categorical_transformers = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    def make_ordinal_transformer(levels):
        # Encode one ordinal column with an explicit level order; unseen
        # levels are mapped to -1 instead of raising an error.
        return Pipeline(steps=[
            ('encoder', OrdinalEncoder(categories=[levels],
                                       handle_unknown='use_encoded_value',
                                       unknown_value=-1))])

    ordinal_levels = {
        'ExterQual': ['Fa', 'TA', 'Gd', 'Ex'],
        'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtQual': ['Not Available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtCond': ['Not Available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'BsmtExposure': ['Not Available', 'No', 'Mn', 'Av', 'Gd'],
        'BsmtFinType1': ['Not Available', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'BsmtFinType2': ['Not Available', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
        'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
        'GarageQual': ['Not Available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    }

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformers, categorical_features),
        ] + [
            (f'ord{i}', make_ordinal_transformer(levels), [column])
            for i, (column, levels) in enumerate(ordinal_levels.items(), start=1)
        ])

    joblib.dump(preprocessor.fit(df), filename=os.path.join(MODEL_DIR, PREPROCESSING_PIPELINE_FILE_NAME))
    upload_files(filename=PREPROCESSING_PIPELINE_FILE_NAME,
                 source_file_path=os.path.join(MODEL_DIR, PREPROCESSING_PIPELINE_FILE_NAME))

    return preprocessor.fit_transform(df)
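# Usage sketch (an assumption, not in the original): the dumped pipeline can
# later be reloaded for inference on new data.
# preprocessor = joblib.load(os.path.join(MODEL_DIR, PREPROCESSING_PIPELINE_FILE_NAME))
# X_new = preprocessor.transform(new_df)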
Example #46
num_attr = ['tenure', 'MonthlyCharges', 'TotalCharges']
testing_churn = telecom[['Churn']]
testing_churn = testing_churn['Churn'].map({'Yes': 1, 'No': 0})
telecom.drop('Churn', axis=1, inplace=True)
######## Pipeline ########
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('std_scale', MinMaxScaler(feature_range=(-1, 1)))])

full_pipeline = ColumnTransformer([('num', num_pipeline, num_attr),
                                   ('ordinal', OrdinalEncoder(), ordinal_attr),
                                   ('cat',
                                    OneHotEncoder(drop='first',
                                                  sparse=False), dummy)])

# sample_telecom = telecom[0:3]
fitting = full_pipeline.fit(telecom)
cat_names = full_pipeline.named_transformers_.cat.get_feature_names(dummy)


def feature_ext(sample):
    int_cols = {'tenure': int, 'MonthlyCharges': float, 'TotalCharges': float}
    val = [
        'customerID', 'tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender',
        'SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
    ]
    sample.columns = val
    sample = sample.astype(int_cols)
    testing = fitting.transform(sample)
    return testing
Example #47
    -0.13929748680369286, 1.3163604645710438, -0.3699637766938669,
    -0.6149300604558857, -0.854369594993175, 0.263445277972641,
    0.5712416961268142
]

# In[36]:

ct = pd.DataFrame([test_country], columns=df.columns)

# In[37]:

cols2 = df.select_dtypes(['int64', 'float64']).columns
pl = Pipeline(steps=[('imp', SimpleImputer(
    strategy='median')), ('scaler', StandardScaler())])
tf = ColumnTransformer(transformers=[('number', pl, cols2)], n_jobs=-1)
tf.fit(df)

# In[38]:


def q4():
    res = tf.transform(ct)[0][cols2.get_loc('Arable')]
    return round(float(res), 3)


# ## Question 5
#
# Find the number of _outliers_ in the `Net_migration` variable according to the _boxplot_ method, i.e., using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}] \Rightarrow x \text{ is an outlier}$$
#
preproc_scale = ColumnTransformer(transformers=[('num', StandardScaler(),
                                                 feat_num_idx)])

# models that don't require scaling get these features passed through unchanged:
preproc_num_pass = ColumnTransformer(transformers=[('num', 'passthrough',
                                                    feat_num_idx)])

# Categorical:
cat_cols = ['registered_via']
cat_cols_idx = [list(df_feat1.columns).index(x) for x in cat_cols]

preproc_ohe = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(categories='auto'), cat_cols_idx)])

# fit to get feature names
preproc_ohe.fit(df_feat1)
feat_ohe = preproc_ohe.named_transformers_['cat'].get_feature_names()
feat_ohe = feat_ohe.tolist()
feat_ohe = list(map(fix_ohe_names, feat_ohe))

# Boolean: pass through
pass_cols = [
    'payment_method_most_common_mode', 'is_auto_renew_mode', 'is_cancel_mode'
]
pass_cols_idx = [list(df_feat1.columns).index(x) for x in pass_cols]

preproc_pass = ColumnTransformer(transformers=[
    ('as_is', 'passthrough', pass_cols_idx),
])

# Scaling
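# (The snippet is cut off here; a hedged sketch of how the pieces above could
# be combined into a single ColumnTransformer — an assumption, not original:)
preproc_all = ColumnTransformer(transformers=[
    ('num', StandardScaler(), feat_num_idx),
    ('cat', OneHotEncoder(categories='auto'), cat_cols_idx),
    ('as_is', 'passthrough', pass_cols_idx),
])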
Example #49
def main():
    if len(sys.argv) < 4:
        print(
            "Not enough arguments specified\n Usage: lasso.py <x features path> <y target path> <outdir>"
        )
        sys.exit(1)
    else:
        # print command line arguments
        for arg in sys.argv[0:]:
            print(arg)
        # Load X features data
        X_path = sys.argv[1]
        print('Loading the X features at {}'.format(X_path))
        X_train = pd.read_csv(X_path, index_col=0)
        X_train = X_train.sort_index(axis=0)
        Y_path = sys.argv[2]
        print('Loading Y target at {}'.format(Y_path))
        y_train = pd.read_csv(Y_path, index_col=0)
        y_train = y_train.sort_index(axis=0)
        y_target = y_train[:-1].columns

        #Load the numeric and categorical feature names
        num_feat = pd.read_csv("data/numerical_features_ffq.csv",
                               delimiter=',',
                               header=0)
        cat_feat = pd.read_csv("data/categorical_features_ffq.csv",
                               delimiter=',',
                               header=0)
        zero_feat = pd.read_csv("data/ffq_var_with_zeroes.csv",
                                delimiter=",",
                                header=0)

        #Define the numeric and categorical features
        numerical_features = [
            col for col in X_train.columns if col in num_feat.values
        ]
        numeric_nonzero = [
            col for col in numerical_features if col not in zero_feat.values
        ]
        numeric_zeroes = [
            col for col in X_train.columns if col in zero_feat.values
        ]
        categorical_features = [
            col for col in X_train.columns if col in cat_feat.values
        ]

        print('Setting up ColumnTransformer...')
        numeric_transformer = Pipeline(
            steps=[('log',
                    FunctionTransformer(np.log)), ('scaler',
                                                   StandardScaler())])

        #set up pipeline for numeric variables with zeroes
        zero_transformer = Pipeline(
            steps=[('yeo',
                    PowerTransformer(method="yeo-johnson", standardize=True))])

        #Set up the categorical pipeline
        #define the unique levels of each category
        X_cat = X_train[categorical_features]
        enc = OneHotEncoder(handle_unknown="error", sparse=False)
        enc.fit(X_cat)
        enc.transform(X_cat)
        cat_levels = enc.categories_
        #define the categorical transformer
        categorical_transformer = Pipeline(steps=[(
            'onehot',
            OneHotEncoder(
                handle_unknown='error', sparse=False, categories=cat_levels))])

        #Set up ColumnTransformer
        prep = ColumnTransformer(transformers=[(
            'num', numeric_transformer,
            numeric_nonzero), ('yeo', zero_transformer, numeric_zeroes),
                                               ('cat', categorical_transformer,
                                                categorical_features)])

        model = TransformedTargetRegressor(Lasso(random_state=0),
                                           func=np.log,
                                           inverse_func=np.exp)

    #Set up the pipeline
    print('Setting up pipeline...')
    pipeline = Pipeline(steps=[('preprocessor', prep), ('lasso', model)])
    #Set up the param grid and CV
    param_grid = {'lasso__regressor__alpha': np.logspace(-4, -1, 50)}

    #define inner and outer cv
    inner_cv = KFold(n_splits=10, shuffle=True, random_state=0)
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=0)

    refit = 'r2'
    pscore = make_scorer(pcc)
    scoring = {
        'r2': make_scorer(r2_score),
        'MAE': make_scorer(mean_absolute_error),
        'pearson': pscore
    }

    #create output sinks
    outer_loop_r2 = []
    outer_loop_pcc = []
    outer_loop_mae = []

    inner_loop_won_params = []
    inner_loop_accuracy_scores = []
    inner_loop_coefs = []
    inner_loop_best_cv_results = []

    # Looping through the outer loop, feeding each training set into a grid_search as the inner loop
    for train_index, test_index in outer_cv.split(X_train, y_train):

        grid_search = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   cv=inner_cv,
                                   scoring=scoring,
                                   refit="r2",
                                   n_jobs=-1)

        # inner loop
        grid_search.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        inner_results = pd.DataFrame(grid_search.cv_results_)
        inner_best_scores = inner_results[inner_results['rank_test_r2'] == 1]

        # The best hyper parameters from grid_search is now being tested on the unseen outer loop test data.
        pred = grid_search.predict(X_train.iloc[test_index])

        # Appending the "winning" hyper parameters and their associated accuracy score
        outer_loop_r2.append(r2_score(y_train.iloc[test_index], pred))
        outer_loop_mae.append(
            mean_absolute_error(y_train.iloc[test_index], pred))
        outer_loop_pcc.append(
            sp.stats.pearsonr(y_train.iloc[test_index], pred)[0])

        inner_loop_won_params.append(grid_search.best_params_)
        inner_loop_best_cv_results.append(inner_best_scores)
        inner_loop_coefs.append(
            grid_search.best_estimator_.named_steps['lasso'].regressor_.coef_)
        inner_loop_accuracy_scores.append(grid_search.best_score_)

    for i in zip(inner_loop_won_params, outer_loop_r2,
                 inner_loop_accuracy_scores):
        print(i)

    print('Mean of outer loop accuracy score:', np.mean(outer_loop_r2))

    #save the results

    cv_savepath = sys.argv[3]

    #save outer loop scores
    outer_results = pd.DataFrame()
    outer_results['r2'] = outer_loop_r2
    outer_results['mae'] = outer_loop_mae
    outer_results['pcc'] = outer_loop_pcc
    outer_results['pcc'] = outer_results['pcc'].str.get(0)
    outer_name = 'outer_loop_results_for_{}'.format(y_target[0]) + '.csv'
    outer_path = cv_savepath + outer_name
    outer_results.to_csv(outer_path, index=True)

    #save the inner loop results
    inner_results = pd.concat(inner_loop_best_cv_results)
    inner_name = 'inner_loop_results_for_{}'.format(y_target[0]) + '.csv'
    inner_path = cv_savepath + inner_name
    inner_results.to_csv(inner_path, index=True)

    #get the feature names
    prep.fit(X_train)
    feature_names = get_transformer_feature_names(prep)
    #save the inner loop coefs
    inner_feat_df = pd.DataFrame(inner_loop_coefs).T
    inner_feat_df['Feature'] = feature_names
    inner_feat_df = inner_feat_df.set_index(['Feature'])
    inner_coef_name = 'inner_loop_coefs_for_{}'.format(y_target[0]) + '.csv'
    inner_coef_path = cv_savepath + inner_coef_name
    inner_feat_df.to_csv(inner_coef_path, index=True)

    #save the model
    mod_name = 'lasso_{}'.format(y_target[0]) + '.pkl'
    filename = cv_savepath + mod_name
    dump(grid_search.best_estimator_, open(filename, 'wb'))

    print("\nResults saved to {}".format(cv_savepath))
    print("\nModel saved to {}".format(filename))
Example #50
    train_df.head()

    train_data = train_df.drop(['bruises'], axis=1)
    test_data = test_df.drop(['bruises'], axis=1)
    y_train_label = train_df['bruises']
    y_test_label = test_df['bruises']

    # ### Encoding categorical variables
    categorical_transformer = Pipeline(steps=[('woe', ce.OrdinalEncoder())])
    categorical_features = train_data.select_dtypes(include=['object']).columns
    preprocessor = ColumnTransformer(transformers=[('cat',
                                                    categorical_transformer,
                                                    categorical_features)])

    # Feature transformation
    preprocessor.fit(train_data)
    X_train = preprocessor.transform(train_data)
    X_test = preprocessor.transform(test_data)

    le = LabelEncoder()
    label_encoder = le.fit(y_train_label)
    y_train = label_encoder.transform(y_train_label)
    y_test = label_encoder.transform(y_test_label)

    print("Training Size:", X_train.shape, "Testing Size:", X_test.shape)

    comparision_df = pd.DataFrame(columns=[
        'Algorithm', 'Implementation', 'Depth', 'Bag Size', 'Accuracy'
    ])
    MAX_TREE_DEPTH = [3, 5]
    BAG_SIZE = [10, 20]
Example #51
my_trans = ColumnTransformer([
    ("bin_weather", pip_weather, ["weather"]),
    ("bin and encode atemp",
     KBinsDiscretizer(n_bins=3, encode="onehot-dense",
                      strategy="uniform"), ["atemp"]),
    ("interaction term work and hour",
     PolynomialFeatures(interaction_only=True, include_bias=False,
                        degree=2), ["workingday", "hour"]),
    ("poly 2nd degree and scale", pip_scale_poly, ["atemp", "humidity"]),
    ("scale", MinMaxScaler(), ["windspeed"]),
    ("passthrough", "passthrough", ["datetime", "peek_hours"]),
    ("one hot encode", OneHotEncoder(), ["month", "season"]),
])

my_trans.fit(X_train)
X_trans = my_trans.transform(X_train)
X_trans = pd.DataFrame(X_trans,
                       columns=[
                           "good_weath_cond", "bad_weath_cond", "low_temp",
                           "medium_temp", "high_temp", "workingday", "hour",
                           "inter_work_hour", "atemp", "humidity", "atemp^2",
                           "interaction_atemp_hum", "humidity^2", "windspeed",
                           "datetime", "peek_hours", "Jan", "Feb", "March",
                           "Apr", "May", "Jun", "july", "Aug", "Sept", "Oct",
                           "Nov", "Dec", "Spring", "Summer", "Autumn", "Winter"
                       ])

X_trans.set_index("datetime", inplace=True)
X_trans.head()
Example #52
numcols = ["total_bill"]
catcols = xtrain.select_dtypes("category").columns

ohe = OneHotEncoder()
ss = StandardScaler()

prep_pl_categorical = Pipeline([("OHE", ohe)])

prep_pl_numeric = Pipeline([("Scaling", ss)])

ct = ColumnTransformer([("1", prep_pl_categorical, catcols),
                        ("2", prep_pl_numeric, numcols)])

# %%

ct.fit(xtrain)
xtrain = ct.transform(xtrain)
xtest = ct.transform(xtest)

# %%
new_input = pd.DataFrame(
    {
        "total_bill": 16.99,
        "sex": "Female",
        "smoker": "No",
        "day": "Sun",
        "time": "Dinner",
        "size": 2
    },
    index=[0])
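# Usage sketch (not in the original): the fitted ColumnTransformer encodes the
# single new row exactly like the training data.
new_input_trans = ct.transform(new_input)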
class CoreferenceClassifier:
    def __init__(self, training_instances_iterator, classifier='SVM'):

        if classifier not in {
                'NaiveBayes', 'Perceptron', 'SVM', 'MaxEnt', 'RandomForest',
                '__existing'
        }:
            print("ERROR: {} is not a valid classifier.".format(classifier),
                  file=sys.stderr)
            print("Valid classifiers: ", file=sys.stderr)
            print(
                "\t'NaiveBayes'\n\t'Perceptron'\n\t'SVM'\n\t'MaxEnt'\n\t'RandomForest'",
                file=sys.stderr)
            print("(Default = 'DecisionTree')", file=sys.stderr)
            sys.exit(1)

        # Classifier model
        self.classifier = classifier
        # Transformer that prepares data for training the model and making predictions
        self.column_transformer = None

        # leave constructor if loading an already trained model
        if classifier == '__existing':
            pass
        else:

            # Scaler and OneHotEncoder to adapt feature vectors to model
            self.column_transformer = ColumnTransformer([
                ('NumericalData', StandardScaler(), [0]),
                ('CategoricalData', OneHotEncoder(), slice(1, 11))
            ])

            self.column_transformer.fit(
                [[0, '+', '+', '+', '+', '+', '+', '+', '+', '+', '+'],
                 [0, '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
                 [
                     0, 'unknown', 'unknown', 'unknown', 'unknown', 'unknown',
                     'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
                 ]])
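            # Note: the transformer is fitted on three synthetic rows that
            # enumerate every level each categorical feature can take
            # ('+', '-', 'unknown'), fixing the OneHotEncoder vocabulary
            # before incremental training starts.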

            # incremental learning of model
            if self.classifier == 'SVM':
                self.classifier = SGDClassifier(loss='hinge')
            elif self.classifier == 'Perceptron':
                self.classifier = Perceptron()
            elif self.classifier == 'NaiveBayes':
                self.classifier = BernoulliNB()
            elif self.classifier == 'MaxEnt':
                self.classifier = SGDClassifier(loss='log')
            elif self.classifier == 'RandomForest':
                self.classifier = RandomForestClassifier(warm_start=True)

            # over sampler to cope with uneven balanced class distributions
            # (there are a lot more non-coreferent mention-pairs than coreferent mention-pairs)
            over_sampler = RandomOverSampler()
            under_sampler = RandomUnderSampler()

            for instances in training_instances_iterator:
                feature_matrix = [x[2:13] for x in instances]
                labels = [x[13] for x in instances]

                if len(set(labels)) > 1:
                    feature_matrix, labels = over_sampler.fit_resample(
                        feature_matrix, labels)

                # update Scaler
                num_data = [[x[0]] for x in feature_matrix]
                self.column_transformer.named_transformers_[
                    'NumericalData'].partial_fit(num_data)
                del num_data

                # transform feature vectors
                feature_matrix = self.column_transformer.transform(
                    feature_matrix)

                # update the model
                if classifier == 'RandomForest':
                    self.classifier.fit(feature_matrix, labels)
                else:
                    self.classifier.partial_fit(feature_matrix,
                                                labels,
                                                classes=['+', '-'])

    # predict returns a vector containing the predicted classes for an input vector or matrix
    def predict(self, data):
        # transform data so it fits the model
        try:
            data = self.column_transformer.transform(data)
        except ValueError:
            data = [data]
            data = self.column_transformer.transform(data)

        # make predictions
        return self.classifier.predict(data)

    def predict_mention_pair(self, feature_vector):
        # transform feature vector so it fits the model
        feature_vector = self.column_transformer.transform([feature_vector])

        # make prediction
        pred = self.classifier.predict(feature_vector)

        return pred[0]

    def transform(self, data):
        return self.column_transformer.transform(data)

    # saves trained model in a binary file
    def save_binary(self, filename):
        binary = open(filename, 'wb')
        pickle.dump((self.classifier, self.column_transformer), binary)
        binary.close()

    # reads a trained model from a binary file
    # Usage: classifier = CoreferenceClassifier.load_binary(filename)
    @classmethod
    def load_binary(cls, filename):
        classifier = CoreferenceClassifier([])
        binary = open(filename, 'rb')
        models = pickle.load(binary)
        binary.close()
        classifier.classifier = models[0]
        classifier.column_transformer = models[1]
        return classifier
Example #54
                                                    stratify=y)

numerical_features = X.select_dtypes("int64").columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numerical_features), ('cat', categorical_transformer,
                          categorical_features)])

transf = preprocessor.fit(X)
transf

xtrain_prepared = transf.transform(X_train)
xtrain_prepared.shape

xtest_prepared = transf.transform(X_test)
xtest_prepared.shape
"""## MLP Model

Investigação feita neste modelo MLP foi para definir or melhores parametros para tax de aprendizado e o numero ideal de iterações que a rede precisa.
"""

mlp = MLPClassifier(max_iter=1000)

params_mlp = {
Example #55
enc.fit([['male', 0, 3], ['male', 1, 0], ['female', 2, 1], ['female', 0, 2]])
enc.categories_

# %%
enc.transform([['male', 0, 3], ['none', 1, 0], ['male', 0, 2]]).toarray()
enc.get_feature_names()

# %%
# ColumnTransformer
from sklearn.compose import ColumnTransformer
categorical_features = [0]
enc = OneHotEncoder(handle_unknown='ignore')
clt = ColumnTransformer([('name', enc, categorical_features)],
                        remainder='passthrough')

clt.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
clt.transform([[0, 2, 3]])

# %%
# fit_transform
enc = OneHotEncoder(sparse=False)
ans = enc.fit_transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
ans

# %%
enc = OneHotEncoder()
ans = enc.fit_transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
ans.toarray()

# %%
# StandardScaler: remove the mean and scale to unit variance
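# %%
# (The snippet ends at the comment above; a minimal sketch of the announced
# StandardScaler usage — an assumption, not part of the original:)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit_transform([[0., 0.], [0., 0.], [1., 1.], [1., 1.]])
scaled  # each column now has zero mean and unit variance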