def test_function_transformer_future_warning(validate, expected_warning):
    # FIXME: to be removed in 0.22
    X = np.random.randn(100, 10)
    transformer = FunctionTransformer(validate=validate)
    with pytest.warns(expected_warning) as results:
        transformer.fit_transform(X)
    if expected_warning is None:
        assert len(results) == 0
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    assert_array_equal(F.transform(X),
                       np.around(X, decimals=3))
def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    F = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around, inv_kw_args=dict(decimals=3))
    testing.assert_array_equal(
            F.inverse_transform(F.transform(X)),
            np.around(np.sqrt(X), decimals=3))
Example #4
def test_functiontransformer_vs_sklearn():
    # Compare msmbuilder.preprocessing.FunctionTransformer
    # with sklearn.preprocessing.FunctionTransformer

    functiontransformerr = FunctionTransformerR()
    functiontransformerr.fit(np.concatenate(trajs))

    functiontransformer = FunctionTransformer()
    functiontransformer.fit(trajs)

    y_ref1 = functiontransformerr.transform(trajs[0])
    y1 = functiontransformer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    X_list = [X_dense,
              sparse.csr_matrix(X_dense),
              sparse.csc_matrix(X_dense)]

    for X in X_list:
        if sparse.issparse(X):
            accept_sparse = True
        else:
            accept_sparse = False
        trans = FunctionTransformer(func=np.sqrt,
                                    inverse_func=np.around,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        assert_warns_message(UserWarning,
                             "The provided functions are not strictly"
                             " inverse of each other. If you are sure you"
                             " want to proceed regardless, set"
                             " 'check_inverse=False'.",
                             trans.fit, X)

        trans = FunctionTransformer(func=np.expm1,
                                    inverse_func=np.log1p,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        Xt = assert_no_warnings(trans.fit_transform, X)
        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that the inverse check is skipped when either func or inverse_func
    # is not provided.
    trans = FunctionTransformer(func=np.expm1, inverse_func=None,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)
    trans = FunctionTransformer(func=None, inverse_func=np.expm1,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')
Example #7
    #return pd.concat([X, histopatological_diagnosis_encoded], axis=1).astype(float)
    #return X.join(histopatological_diagnosis_encoded)
    return X


X_encoded = encode(X)
X_encoded

# %% [markdown]
# We will examine models built from the following pipelines:

# %%
pipelines = dict(
    pipeline_logistic_regression=Pipeline([
        ('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=300))
    ]),
    pipeline_pca_logistic_regression=Pipeline([
        ('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()),
        ('pca', PCA(0.95)), ('clf', LogisticRegression(max_iter=300))
    ]),
    pipeline_nn=Pipeline([('encoder', FunctionTransformer(encode)),
                          ('scaler', StandardScaler()),
                          ('clf',
                           MLPClassifier((10, ),
                                         learning_rate='adaptive',
                                         early_stopping=True))]),
    pipeline_pca_nn=Pipeline([('encoder', FunctionTransformer(encode)),
                              ('scaler', StandardScaler()), ('pca', PCA(0.95)),
                              ('clf',
Example #8
 def __init__(self):
     self._log_transformer = FunctionTransformer(np.log1p)
     self._drop_first_feature = FunctionTransformer(
         self._drop_first_feature)
Example #9
from timeserio.preprocessing import (PandasColumnSelector,
                                     PandasDateTimeFeaturizer,
                                     PandasValueSelector, utils)


@pytest.fixture
def input_df():
    df = mock.mock_raw_data(ids=[0, 1])
    df['group'] = np.random.randint(2, size=len(df))
    return df


col_selector = ('select', PandasColumnSelector([ini.Columns.target]))
val_selector = ('select', PandasValueSelector([ini.Columns.target]))
identity = ('identity', utils.IdentityRegressor())
lagger = ('lag', FunctionTransformer(lambda x: x.shift(1), validate=False))


@pytest.mark.parametrize(
    'pipeline, groupby, is_estimator',
    [
        (Pipeline([val_selector]), 'id', False),
        (Pipeline([val_selector]), ['id'], False),
        (Pipeline([val_selector]), ['id', 'group'], False),
        (Pipeline_sk([val_selector]), ['id'], False),
        (Pipeline_sk([val_selector]), ['id', 'group'], False),
        (Pipeline([val_selector, identity]), ['id'], True),
        (Pipeline([val_selector, identity]), ['id', 'group'], True),
        (Pipeline_sk([val_selector, identity]), ['id'], True),
        (Pipeline_sk([val_selector, identity]), ['id', 'group'], True),
    ]
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Import other preprocessing modules
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS],
                                       validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
pl = Pipeline([
    ('union',
     FeatureUnion(
         transformer_list=[('numeric_features',
                            Pipeline([('selector',
                                       get_numeric_data), ('imputer',
                                                           Imputer())])),
                           ('text_features',
Example #11
# containing the claim amount (``ClaimAmount``) for the same policy ids
# (``IDpol``).

df = load_mtpl2(n_samples=60000)

# Note: filter out claims with zero amount, as the severity model
# requires strictly positive target values.
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0

# Correct for unreasonable observations (that might be data error)
# and a few exceptionally large claim amounts
df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
df["Exposure"] = df["Exposure"].clip(upper=1)
df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)

log_scale_transformer = make_pipeline(FunctionTransformer(func=np.log),
                                      StandardScaler())

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(),
         ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurance companies are interested in modeling the Pure Premium, that is,
# the expected total claim amount per unit of exposure for each policyholder.
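# A minimal sketch (not part of the original snippet) of how such a target
# could be derived from the columns prepared above; the "PurePremium" column
# name is an assumption:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]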
def lab_gaussian(X_train, y_train):
    model = make_pipeline(FunctionTransformer(rgb_to_lab, validate=True),
                          GaussianNB())
    model.fit(X_train, y_train)
    return model
Example #13
dow_trans = DayOfWeekTransformer()
month_trans = MonthTransformer()
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)


def select_time_column(X):
    return X[:, 0]


def select_text_column(X):
    return X[:, 1]


pipe = make_pipeline(
    make_union(
        make_pipeline(FunctionTransformer(select_time_column, validate=False),
                      dow_trans),
        make_pipeline(FunctionTransformer(select_time_column, validate=False),
                      month_trans),
        make_pipeline(FunctionTransformer(select_text_column, validate=False),
                      tfidf_vec)),
    LassoCV(n_alphas=200,
            cv=5,
            max_iter=2000,
            verbose=True,
            n_jobs=-1,
            random_state=None))

pipe.fit(X, np.reshape(y, y_raw.shape))

joblib.dump(pipe, 'models/pipe.pkl')
    PolynomialFeatures,
    PowerTransformer,
    StandardScaler,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

_transformers = [
    Binarizer(threshold=2),
    KBinsDiscretizer(n_bins=3, encode="ordinal"),
    StandardScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    FunctionTransformer(np.log, validate=True),
    OrdinalEncoder(),
]

_selectors = [
    SelectFromModel(Lasso(random_state=1)),
    SelectKBest(f_regression, k=2),
    VarianceThreshold(),
    RFE(Lasso(random_state=1)),
]


@pytest.mark.parametrize(
    "transformer",
    [
        SimpleImputer(),
Example #15
def testPreProc():

    iris = load_iris()
    # Rescaling to a common scale ("dimensionless" features) converts data of different magnitudes to the same scale. Common methods are standardization and range (min-max) scaling.
    # Standardization processes the feature matrix column by column: it converts feature values to a common scale via the z-score.
    # Normalization processes the feature matrix row by row: its goal is that sample vectors share a common basis when similarity is computed
    # with dot products or other kernels, i.e. each sample is converted to a "unit vector".
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    MinMaxScaler().fit_transform(iris.data)
    StandardScaler().fit_transform(iris.data)

    # Binarization with the threshold set to 3; returns the binarized data
    from sklearn.preprocessing import Binarizer
    Binarizer(threshold=3).fit_transform(iris.data)

    # Dummy (one-hot) encoding of the IRIS target values; returns the encoded data. Note that the input must be 2D.
    # OneHotEncoder(sparse = False).fit_transform( testdata[['age']] )
    from sklearn.preprocessing import OneHotEncoder
    OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

    # For string-valued categorical variables, first use LabelEncoder to convert them to integers, then apply OneHotEncoder
    # Note that LabelEncoder works on 1D input while OneHotEncoder expects 2D input
    from sklearn.preprocessing import LabelEncoder
    LabelEncoder().fit_transform(iris.target)

    # Missing-value imputation; returns the data with missing values filled in
    # The missing_values parameter is the representation of missing values (default NaN)
    # The strategy parameter is the fill strategy (default 'mean')
    from numpy import vstack, array, nan
    from sklearn.preprocessing import Imputer
    Imputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

    # Data transformations
    # Polynomial transformation
    from sklearn.preprocessing import PolynomialFeatures  # the degree parameter defaults to 2
    PolynomialFeatures().fit_transform(iris.data)
    # Custom transformation using the log function; the first argument is a univariate function
    from numpy import log1p
    from sklearn.preprocessing import FunctionTransformer
    FunctionTransformer(log1p).fit_transform(iris.data)

    # Feature selection: filter methods
    # Variance-threshold selection; returns the data restricted to the selected features (threshold is the variance cutoff)
    from sklearn.feature_selection import VarianceThreshold
    VarianceThreshold(threshold=3).fit_transform(iris.data)
    # Select the K best features; returns the data restricted to the selected features
    # The first argument is a scoring function: it takes the feature matrix and the target vector and
    # returns a pair of arrays (scores, p-values) whose i-th entries are the score and p-value of the i-th feature.
    # The second argument k is the number of features to select. Here the score is the Pearson correlation coefficient.
    from sklearn.feature_selection import SelectKBest
    from scipy.stats import pearsonr
    # Scoring function: Pearson correlation coefficient, returned as a (scores, p-values) tuple
    SelectKBest(lambda X, Y: tuple(array([pearsonr(x, Y) for x in X.T]).T),
                k=2).fit_transform(iris.data, iris.target)
    # Scoring function: the chi-squared test
    from sklearn.feature_selection import chi2
    SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
    # Mutual-information method
    # from minepy import MINE
    # # MINE is not written in a functional style, so define a mic() wrapper that returns a pair whose second item is a fixed p-value of 0.5
    # def mic(x, y):
    #     m = MINE()
    #     m.compute_score(x, y)
    #     return (m.mic(), 0.5)
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)

    # Feature selection: wrapper methods
    # Recursive feature elimination; returns the data restricted to the selected features
    # The estimator parameter is the base model
    # The n_features_to_select parameter is the number of features to keep
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    RFE(estimator=LogisticRegression(),
        n_features_to_select=2).fit_transform(iris.data, iris.target)

    # Feature selection: embedded methods
    # A base model with a regularization penalty selects features and reduces dimensionality at the same time.
    # Use SelectFromModel from sklearn.feature_selection with an L1-penalized logistic regression to select features:
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    # L1-penalized logistic regression as the base model for feature selection
    SelectFromModel(LogisticRegression(penalty="l1", C=0.1,
                                       solver="liblinear")).fit_transform(
        iris.data, iris.target)

    # L1-based selection keeps only one of several features that are equally relevant to the target, so an unselected feature is not necessarily unimportant.
    # It can therefore be combined with an L2 penalty: for each feature with a nonzero L1 weight, gather the features whose L2 weights are close to it
    # but whose L1 weights are 0 into a group, and split the L1 weight evenly across that group, as sketched below.
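
    # A sketch of that L1/L2 combination (not in the original): fit both
    # penalized models, give features dropped by L1 whose L2 weights are close
    # to a kept feature's L2 weight an equal share of that weight, then keep
    # features with a nonzero adjusted weight. The 0.5 closeness threshold is
    # an arbitrary assumption.
    l1 = LogisticRegression(penalty="l1", C=0.1,
                            solver="liblinear").fit(iris.data, iris.target)
    l2 = LogisticRegression(penalty="l2", C=0.1).fit(iris.data, iris.target)
    adjusted = l1.coef_.copy()
    for i in range(adjusted.shape[0]):
        for j in range(adjusted.shape[1]):
            if adjusted[i, j] == 0:
                continue
            group = [k for k in range(adjusted.shape[1])
                     if adjusted[i, k] == 0
                     and abs(l2.coef_[i, k] - l2.coef_[i, j]) < 0.5]
            adjusted[i, [j] + group] = adjusted[i, j] / (1 + len(group))
    selected = abs(adjusted).sum(axis=0) > 0
    X_l1l2 = iris.data[:, selected]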

    # Feature selection with GBDT as the base model
    from sklearn.ensemble import GradientBoostingClassifier
    SelectFromModel(GradientBoostingClassifier()).fit_transform(
        iris.data, iris.target)
Example #16
 def __init__(self):
     self.transformer_ = FunctionTransformer(to_dense, validate=False)
Example #17
 def __init__(self, impute_val=None):
     self.transformer_ = FunctionTransformer(
         impute_null, kw_args={"impute_val": impute_val},
         validate=False
         )
Example #18
#
# dataframe slicing
# selectionlist gets passed the parameterlist from json object
selectionlist = []
selectionlist.extend((args.list))

# read data,
df1=pd.read_table('penalties.csv', sep=';',header=0)

# all headers
colnames = list(df1.columns.values)
# slice data
X = df1.loc[:, selectionlist]
# sqrt transform the heavily skewed data
transformer = FunctionTransformer(np.sqrt)
Xtran = transformer.transform(X)
X = pd.DataFrame(Xtran)
selectionheaders = selectionlist
oldnames = X.columns.values

# rename all columns with original columnheaders
X.rename(columns=dict(zip(oldnames, selectionheaders)), inplace=True)
# remaining column indices
colnamesrest = [x for x in colnames if x not in selectionlist]
Rest = df1.loc[:, colnamesrest]
# delete the multiplier column
del Rest['multiplier']
# plot a 3x3 scatterplot matrix
from pandas.plotting import scatter_matrix
scatter_matrix(X, alpha=0.2, figsize=(3, 3))
Example #19
def on_field(f, *vec):
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False),
                         *vec)
Example #20
 def __init__(self, cast_type=None):
     self.transformer_ = FunctionTransformer(
         feature_cast, kw_args={"cast_type": cast_type}, 
         validate=False
         )
Example #21
# %%
# The remaining columns can be used to predict the frequency of claim events.
# Those columns are very heterogeneous with a mix of categorical and numeric
# variables with different scales, possibly very unevenly distributed.
#
# In order to fit linear models with those predictors it is therefore
# necessary to perform standard feature transformations as follows:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer


log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False),
    StandardScaler()
)

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough",
            ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer,
            ["Density"]),
        ("onehot_categorical", OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
    ],
    remainder="drop",
#4.5 Transforming Features

# Load libraries
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Create feature matrix
features = np.array([[2, 3], [2, 3], [2, 3]])


# Define a simple function
def add_ten(x):
    return x + 10


# Create transformer
ten_transformer = FunctionTransformer(add_ten)
# Transform feature matrix
print("Transform features: \n", ten_transformer.transform(features))

print("\n")
print("\n")
print("\n")

# Load library
import pandas as pd
# Create DataFrame
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Apply function
print(df.apply(add_ten))

print("\n")
# %%
# The remaining columns can be used to predict the frequency of claim events.
# Those columns are very heterogeneous with a mix of categorical and numeric
# variables with different scales, possibly very unevenly distributed.
#
# In order to fit linear models with those predictors it is therefore
# necessary to perform standard feature transformations as follows:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

log_scale_transformer = make_pipeline(
    FunctionTransformer(np.log, validate=False), StandardScaler())

linear_model_preprocessor = ColumnTransformer(
    [
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        (
            "onehot_categorical",
            OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
        ),
    ],
    remainder="drop",
)
Example #24
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.816716439759918
exported_pipeline = make_pipeline(
    make_union(SelectPercentile(score_func=f_classif, percentile=46),
               FunctionTransformer(copy)),
    PCA(iterated_power=8, svd_solver="randomized"),
    PCA(iterated_power=8, svd_solver="randomized"),
    LinearSVC(C=0.001,
              dual=False,
              loss="squared_hinge",
              penalty="l2",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #25
    def train(self):
        import xgboost
        from baikal import make_step, Step, Input, Model
        from baikal.steps import Stack
        from sklearn_pandas import gen_features
        import custom_transformations as ct
        from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

        # these are the categorical columns in the dataset
        CATEGORICAL_COLUMNS = [
            'KitchenQual',
            'MSSubClass',
            'MSZoning',
            'Street',
            'Alley',
            'LotShape',
            'LandContour',
            'Utilities',
            'LotConfig',
            'LandSlope',
            'Neighborhood',
            'Condition1',
            'Condition2',
            'BldgType',
            'HouseStyle',
            'RoofStyle',
            'RoofMatl',
            'Exterior1st',
            'Exterior2nd',
            'MasVnrType',
            'ExterQual',
            'ExterCond',
            'Foundation',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'Heating',
            'HeatingQC',
            'CentralAir',
            'Functional',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PavedDrive',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'SaleType',
            'SaleCondition',
            'OverallQual',
            'OverallCond',
        ]

        # these columns will be treated as numerical columns
        NUMERICAL_COLUMNS = [
            'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
            'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
            'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

        # These columns have missing values; for each of them we will add a missing-indicator variable
        MISSING_INDICATOR = [
            'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
            'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
            'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
            'MiscFeature'
        ]

        ## Categorical Columns for which we want One Hot Encoding
        ONEHOT_COLUMNS = [
            'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
            'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
            'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
            'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
            'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
            'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
            'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
            'MiscFeature', 'SaleType', 'SaleCondition'
        ]

        ## Categorical Columns for which we want to have target encoding
        TARGET_COLUMNS = [
            'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
        ]

        ## Columns that require log transformations
        LOG_COLUMNS = [
            'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
            'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
        ]

        # Define Steps
        ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
        ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
        XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                     class_name='XGBRegressor')
        LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                         class_name='LinearRegression')

        # Define sklearn-pandas transformations. Here I am using the gen_features utility to
        # define transformations for individual columns.
        baseProcessing = (
            gen_features(columns=[[x] for x in MISSING_INDICATOR],
                         classes=[{
                             'class': MissingIndicator,
                             'features': 'all',
                             'sparse': False,
                             'error_on_new': False
                         }],
                         prefix='na_') +
            gen_features(
                columns=LOG_COLUMNS,
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(np.float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }, {
                    'class': FunctionTransformer,
                    'func': np.log1p
                }]) +
            gen_features(
                columns=list(set(NUMERICAL_COLUMNS) - set(LOG_COLUMNS)),
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(np.float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }],
            ) + [
                # constructing new features -- age of the house
                (['YrSold', 'YearBuilt'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'age'
                }),

                # constructing new feature -- remodeling age
                (['YrSold', 'YearRemodAdd'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'remodel_age'
                }),

                # new feature -- total surface area
                (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                    FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'numerical_TotalArea'
                })
            ])

        # Since the CatBoost model can handle categorical data, we don't need to encode categorical variables;
        # we will simply impute missing values and let CatBoost handle the categorical columns.
        catModelPreprocessing = gen_features(
            columns=CATEGORICAL_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(np.object).reshape(-1, 1)
            }, {
                'class': SimpleImputer,
                'strategy': 'most_frequent'
            }],
        )

        # For regression and XGBoost, we need to encode categorical variables ourselves.
        # Depending on the cardinality of the variable, I am using either one-hot encoding or target encoding.
        regressionModelProcessing = (
            gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                         classes=[{
                             'class': OneHotEncoder,
                             'handle_unknown': 'ignore',
                             'sparse': False
                         }]) + gen_features(columns=[[x]
                                                     for x in TARGET_COLUMNS],
                                            classes=[
                                                {
                                                    'class': TargetEncoder
                                                },
                                                {
                                                    'class': SimpleImputer,
                                                    'strategy': 'mean'
                                                },
                                            ]))

        # Define DAG
        x = Input(name="x")
        y = Input(name='y')

        # Define feature transformations
        d0 = DataFrameMapperStep(baseProcessing,
                                 df_out=True,
                                 name='BasePreprocess')(x, y)
        d1 = DataFrameMapperStep(regressionModelProcessing,
                                 df_out=True,
                                 name='RegressionModelPreprocess')(x, y)
        d2 = DataFrameMapperStep(catModelPreprocessing,
                                 df_out=True,
                                 name='CatModelPreprocess')(x, y)

        # Consolidate features for catboost and elasticnet
        regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
        catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

        # Generate predictions using three different algorithms.
        m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
        m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
        m3 = CatBoostRegressorStep(name='CatBoost',
                                   cat_features=CATEGORICAL_COLUMNS,
                                   iterations=10)(catFeatures, y)

        # combine predictions from the three models
        combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

        # construct an ensemble model
        ensembleModel = LinearRegressionStep()(combinedPredictions, y)
        model = Model(x, ensembleModel, y)
        model.fit(self.trainDF, self.trainDF['SalePrice'])
        self.artifact = {
            'model.pkl': cloudpickle.dumps(model),
            'environment': {
                'pip': {}
            }
        }
        self.next(self.end)
Example #26
# Creating a learning pipeline
# ----------------------------
# The encoders for both clean and dirty data are first imported:

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder,\
    GapEncoder

encoders_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'minhash': MinHashEncoder(n_components=100),
    'gap': GapEncoder(n_components=100),
    'numerical': FunctionTransformer(None)
}

# We then create a function that takes one key of our ``encoders_dict``,
# returns a pipeline object with the associated encoder,
# as well as a Scaler and a RidgeCV regressor:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
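    # Based on the description above, a rough sketch of how this helper might
    # continue; the ``dirty_column`` name, the with_mean=False scaling, the
    # plain RidgeCV, and the StandardScaler/RidgeCV imports are assumptions,
    # not part of the original:
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        ('union', ColumnTransformer(transformers=transformers,
                                    remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV()),
    ])
    return pipeline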
Example #27
def make_clf(*args, **kwargs):
    clf = make_pipeline(FunctionTransformer(crossterm), LogisticRegressionCV())
    return clf
Example #28
x, y, _, test_x, test_y, smiles, test_smiles = load_data(
    data_cfg, **repr_cfg[utils_section])
# change y in case of classification
if 'classification' == task_cfg[utils_section]['task']:
    log_scale = True if 'log' == data_cfg[csv_section]['scale'].lower().strip(
    ) else False
    y = task_cfg[utils_section]['cutoffs'](y, log_scale)
    test_y = task_cfg[utils_section]['cutoffs'](test_y, log_scale)

training_features = x
training_target = y
testing_features = test_x

# Average CV score on the training set was: 0.8708849509343505
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_depth=20,
                         max_features=0.9000000000000001,
                         max_samples=0.5,
                         min_samples_leaf=1,
                         min_samples_split=2,
                         n_estimators=500))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 666)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print('Success.')
Example #29
    x_fit = x_pipeline.fit_transform(x)

    y_pipeline = Pipeline([
        ("labeler", OrdinalEncoder()),
    ])
    y_fit = y_pipeline.fit_transform(y)

    # Fit the pipeline
    model = LogisticRegression(solver="lbfgs")
    model.fit(x_fit, y_fit)

    # Create a fully integrated pipeline
    prediction_pipeline = Pipeline([
        ("preprocessing", x_pipeline),
        ("pipeline",
         FunctionTransformer(func=call_model, kw_args={"model": model})),
        (
            "retransform labels",
            FunctionTransformer(
                # Turn the 0-1 labels back into setosa / virginica labels
                func=reverse_labels,
                validate=False,
                kw_args={"pipe": y_pipeline}),
        )
    ])

    # Try the full pipeline
    preds = prediction_pipeline.transform(x)
    print(confusion_matrix(y, preds))

    # Save models
Example #30
from sklearn.preprocessing import FunctionTransformer


def func(df, book_maker, type_of_bet):
    col = "{}_over_count".format(
        book_maker) if type_of_bet == "total" else "{}_diff_home_count".format(
            book_maker)
    cols = [col]
    df_ = df[cols]
    return df_.values.tolist()


betting_count_encoder = FunctionTransformer(func=func,
                                            kw_args={
                                                "book_maker": "oversea",
                                                "type_of_bet": "total"
                                            })

if __name__ == "__main__":
    from data import Data

    data = Data(alliance="NBA")
    df1 = data.get_train(book_maker="oversea", type_of_bet="total")
    # print(df1[["game_time", "away_team", "home_team", "away_score", "home_score",
    #                      "is_back_to_back", ]])
    print(betting_count_encoder.fit_transform(df1))
    # print(game_count_encoder.steps[1][1].categories_)
Example #31
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils

from sklearn.preprocessing import FunctionTransformer

#read data
train_tour1 = pd.read_csv('numerai_training_data.csv')
feature = pd.DataFrame(train_tour1.iloc[:, 0:21])
target = pd.DataFrame(train_tour1.target)

#log feature
transformer = FunctionTransformer(np.log1p)
feature_log = transformer.transform(feature)

#add all feature
feature_log = pd.DataFrame(feature_log)
feature_all = pd.concat([feature, feature_log], axis =1 )

#separate target and features
feature_all = np.asarray(feature_all)
target = np.asarray(target)

# convert list of labels to binary class matrix
target = np_utils.to_categorical(target) 

# pre-processing: divide by max and subtract mean
scale = np.max(feature_all)
Example #32
    def __init__(self,
                 model,
                 df_train,
                 categorical_inputs,
                 categorical_imputer,
                 numeric_inputs,
                 numeric_imputer,
                 input_preproc,
                 class_names=None,
                 **kwargs):
        """
        Args:
            categorical_imputer: The imputer that is to be used for categorical columns.
                The imputer is not allowed to add new columns or change order of the
                existing ones.

            numeric_imputer: The imputer that is to be used for numeric columns.
                The imputer is not allowed to add new columns or change order of the
                existing ones.
        """
        self.model = model
        self.categorical_inputs = categorical_inputs
        self.categorical_imputer = categorical_imputer
        self.numeric_inputs = numeric_inputs
        self.numeric_imputer = numeric_imputer
        self.input_preproc = input_preproc
        class_names = [str(c) for c in class_names]

        self.interpret_preproc = make_column_transformer(
            (
                make_pipeline(
                    # wrap in a function transformer to prevent being refitted
                    FunctionTransformer(categorical_imputer.transform,
                                        validate=False),
                    OrdinalEncoder()),
                categorical_inputs),

            # wrap in a function transformer to prevent being refitted
            (FunctionTransformer(numeric_imputer.transform,
                                 validate=False), numeric_inputs))

        xx_train = self.interpret_preproc.fit_transform(
            df_train[self.categorical_inputs + self.numeric_inputs])

        if xx_train.shape[1] != len(categorical_inputs) + len(numeric_inputs):
            raise ValueError(
                "Imputers are not allowed to add new columns or to change their order."
            )

        self.ordenc = self.interpret_preproc.transformers_[0][1][1]

        try:
            cat_name_idx = {
                k: v
                for k, v in enumerate(self.ordenc.categories_)
            }

            self.categorical_names = {
                k: v
                for k, v in zip(categorical_inputs, self.ordenc.categories_)
            }
        except AttributeError:
            cat_name_idx = {}
            self.categorical_names = {}

        self.explainer = LimeTabularExplainer(
            xx_train,
            feature_names=categorical_inputs + numeric_inputs,
            class_names=class_names,
            categorical_features=range(len(categorical_inputs)),
            categorical_names=cat_name_idx,
            mode="classification"
            if is_classifier(self.model) else "regression",
            **kwargs)

        self.full_model = make_pipeline(FunctionTransformer(self._preproc_fn),
                                        self.model)
Example #33
    def _generate_features(self,
                           X,
                           y=None,
                           numeric_extra=None,
                           categorical_extra=None):
        try:
            self.feature_pipeline_

        except AttributeError:
            n_days = X['dayofweek'].nunique()
            n_hours = X['hour'].nunique()

            self.feature_pipeline_ = Pipeline([(
                'features',
                FeatureUnion([
                    # time of week part of TOWT
                    ('weeks',
                     Pipeline([
                         ('split',
                          FeatureUnion([
                              ('days',
                               Pipeline([
                                   ('select', ColumnSelector('dayofweek')),
                                   ('ordinal',
                                    OrdinalEncoder(cols=['dayofweek'],
                                                   return_df=False)),
                                   ('unknown',
                                    SimpleImputer(missing_values=-1,
                                                  strategy='most_frequent'))
                               ])),
                              ('hours',
                               Pipeline([('select', ColumnSelector('hour')),
                                         ('ordinal',
                                          OrdinalEncoder(cols=['hour'],
                                                         return_df=False)),
                                         ('unknown',
                                          SimpleImputer(
                                              missing_values=-1,
                                              strategy='most_frequent'))]))
                          ])),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek', 'hour']))),
                         ('term',
                          PatsyTransformer('-1 + C(dayofweek):C(hour)'))
                     ])) if (n_days > 1) and (n_hours > 1) else
                    ('days',
                     Pipeline([
                         ('select', ColumnSelector('dayofweek')),
                         ('ordinal',
                          OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                         ('unknown',
                          SimpleImputer(missing_values=-1,
                                        strategy='most_frequent')),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek']))),
                         ('one_hot',
                          OneHotEncoder(cols=['dayofweek'], return_df=False))
                     ])) if n_days > 1 else
                    ('hours',
                     Pipeline(
                         [('select', ColumnSelector('hour')),
                          ('ordinal',
                           OrdinalEncoder(cols=['hour'], return_df=False)),
                          ('unknown',
                           SimpleImputer(missing_values=-1,
                                         strategy='most_frequent')),
                          ('to_pandas',
                           FunctionTransformer(
                               lambda x: pd.DataFrame(x, columns=['hour']))),
                          ('one_hot',
                           OneHotEncoder(cols=['hour'], return_df=False))])),

                    # temperature part of TOWT
                    ('temperature',
                     ColumnTransformer([
                         ('encode_temperature',
                          IntervalEncoder(
                              n_chunks=10,
                              span=0.1 * X[self.temperature_col].std(),
                              method='normal'), [self.temperature_col])
                     ])),
                    ('temperature_interact',
                     'drop' if n_hours == 1 else Pipeline(
                         [('split',
                           FeatureUnion([
                               ('temperature_part',
                                Pipeline([
                                    ('select',
                                     ColumnSelector(self.temperature_col)),
                                    (
                                        'create_bins',
                                        KBinsDiscretizer(
                                            n_bins=self.n_bins_temperature,
                                            strategy='quantile',
                                            encode='ordinal'),
                                    )
                                ])),
                               ('hour_part',
                                Pipeline([('select', ColumnSelector('hour')),
                                          ('ordinal',
                                           OrdinalEncoder(cols=['hour'],
                                                          return_df=False)),
                                          ('unknown',
                                           SimpleImputer(
                                               missing_values=-1,
                                               strategy='most_frequent'))]))
                           ])),
                          ('to_pandas',
                           FunctionTransformer(lambda x: pd.DataFrame(
                               x, columns=[self.temperature_col, 'hour']))),
                          ('term',
                           PatsyTransformer(
                               f'-1 + C({self.temperature_col}):C(hour)'))])),

                    # deal with extra numerical regressors
                    ('numerical_regressors',
                     'drop' if not numeric_extra else ColumnTransformer(
                         [(f'encode_{col}',
                           IntervalEncoder(n_chunks=4,
                                           span=0.1 * X[col].std(),
                                           method='normal'), [col])
                          for col in numeric_extra])),

                    # deal with extra categorical regressors
                    ('categorical_regressors', 'drop' if not categorical_extra
                     else TargetEncoder(cols=categorical_extra,
                                        return_df=False,
                                        handle_missing='value',
                                        handle_unknown='value'))
                ]))])
            # Fit the pipeline
            self.feature_pipeline_.fit(X, y)

        finally:
            return self.feature_pipeline_.transform(X)
est = preprocessing.KBinsDiscretizer(n_bins=[3, 2, 2], encode='ordinal').fit(X)
est.transform(X)

## encoding categorical features
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
print(enc.fit(X))
print(enc.transform([['female', 'from Europe', 'uses Firefox']]).toarray())

## generating polynomial features
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(9).reshape(3, 3)
poly0 = PolynomialFeatures(2)
poly1 = PolynomialFeatures(degree=3, interaction_only=True)
print(X)
print(poly0.fit_transform(X))
print(poly1.fit_transform(X))

## custom transformers
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p, validate=True)
X = np.array([[0, 1],[2, 3]])
print(transformer.transform(X))




def knn_lab(X_train, y_train):
    model = make_pipeline(FunctionTransformer(rgb_to_lab, validate=True),
                          neighbors.KNeighborsClassifier())
    model.fit(X_train, y_train)
    return model
from sklearn.preprocessing import FunctionTransformer
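
# The column indices used by ``add_extra_features`` below are not defined in
# this snippet; the values here are an assumption based on the usual ordering
# of the California housing features.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6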


def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    else:
        return np.c_[X, rooms_per_household, population_per_household]


attr_adder = FunctionTransformer(add_extra_features,
                                 validate=False,
                                 kw_args={"add_bedrooms_per_room": False})
housing_extra_attribs = attr_adder.fit_transform(housing.values)

#%%
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) +
    ["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

#%% [markdown]
# Now let's build a pipeline for preprocessing the numerical attributes (note that
# we could use `CombinedAttributesAdder()` instead of `FunctionTransformer(...)` if
# we preferred):
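
#%%
# A minimal sketch of such a numerical pipeline; the SimpleImputer strategy,
# the StandardScaler step and the ``housing_num`` frame (numeric columns only)
# are assumptions here:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)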
Example #37
# training data
train = load_file()
X = train.drop(['target'], axis=1)
y = train.target

# bump all values up 1, so missing is now zero
cat_columns = get_cat_features_idx(X)
X = make_missing_zero(X, cat_columns)

# make a pipeline
pipe = Pipeline([('encode',
                  OneHotEncoder(categorical_features=cat_columns,
                                handle_unknown='ignore')),
                 ('to_dense',
                  FunctionTransformer(lambda x: x.todense(),
                                      accept_sparse=True)),
                 ('model', LogisticRegression())])
param_grid = {'model': [GaussianNB(), LogisticRegression()]}

model = GridSearchCV(pipe, param_grid, scoring='roc_auc')
model.fit(X.as_matrix(), y)
logger.info("Best Params: {}".format(model.best_params_))

results = cross_val_predict(model, X, y, method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info(
    "Cross-val normalized gini score on training set is {}".format(score))

# predict
test = make_missing_zero(load_file("test"), cat_columns)
test['target'] = model.predict_proba(test.as_matrix())[:, 1]
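
# ``load_file``, ``get_cat_features_idx`` and ``make_missing_zero`` come from
# elsewhere in that project; a minimal sketch of ``make_missing_zero``
# consistent with the comment above (treating -1 as the missing marker is an
# assumption):
def make_missing_zero(X, cat_columns):
    X = X.copy()
    # shift every categorical value up by one so that missing (-1) becomes 0
    X.iloc[:, cat_columns] = X.iloc[:, cat_columns] + 1
    return X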
Example #38
def myTrans(X):

    # A custom transformation function
    X_ = np.log10(X) + 2

    # Decimal scaling standardization: rescale to the [-1, 1] range
    # Formula: y = X / 10^k, with k chosen so that max|y| < 1
    # X_ = X/10**np.ceil(np.log10(np.abs(X).max()))

    # Logistic scaling: y = 1/(1+e^(-X))
    # X_ = 1/(1+np.exp(-X))

    return X_


trans = FunctionTransformer(myTrans)
cols = ['年龄', '收入']  # the 'age' and 'income' columns
X_ = trans.fit_transform(df[cols])
print(X_)

######################################################################
########  Part 2. Standardization, normalization, etc.
######################################################################

#####=======StandardScaler============
### z-score standardization: y = (X - mean) / std
# Standardization rescales data of different magnitudes and units to a common range, reducing the influence of scale, feature and distribution differences on the model.
# Approach: transform the data to a standard normal distribution (mean = 0, std = 1) so the properties of the normal distribution can be exploited.
# It is a centering method that changes the original distribution, so it is not suitable for sparse data.

from sklearn.preprocessing import StandardScaler
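# Continuing the example above, a minimal usage sketch on the same two columns
# (df and cols come from the snippet above):
scaler = StandardScaler()
X_std = scaler.fit_transform(df[cols])  # each column now has mean 0 and std 1
print(X_std)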
Any step in the pipeline must be an object that implements the fit and transform methods. The FunctionTransformer creates an object with these methods out of any Python function that you pass to it. We'll use it to help select subsets of data in a way that plays nicely with pipelines.

You are working with numeric data that needs imputation, and text data that needs to be converted into a bag-of-words. You'll create functions that separate the text from the numeric variables and see how the .fit() and .transform() methods work.

INSTRUCTIONS
Compute the selector get_text_data by using a lambda function and FunctionTransformer() to obtain all 'text' columns.
Compute the selector get_numeric_data by using a lambda function and FunctionTransformer() to obtain all the numeric columns (including missing data). These are 'numeric' and 'with_missing'.
Fit and transform get_text_data using the .fit_transform() method with sample_df as the argument.
Fit and transform get_numeric_data using the same approach as above.
'''
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# Print head to check results
print('Text Data')
print(just_text_data.head())
print('\nNumeric Data')
print(just_numeric_data.head())
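
# A sketch of how these selectors could feed a combined pipeline, mirroring the
# FeatureUnion pattern shown earlier; the SimpleImputer and CountVectorizer
# choices are assumptions here:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

numeric_text_union = FeatureUnion([
    ('numeric_features', Pipeline([
        ('selector', get_numeric_data),
        ('imputer', SimpleImputer()),
    ])),
    ('text_features', Pipeline([
        ('selector', get_text_data),
        ('vectorizer', CountVectorizer()),
    ])),
])
combined_features = numeric_text_union.fit_transform(sample_df)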