示例#1
0
def test_make_step():
    """Check the class produced by ``make_step`` for LogisticRegression.

    The generated class must subclass both ``Step`` and the wrapped
    estimator, expose the estimator API plus the attribute injected through
    ``attr_dict``, and be named ``"LogisticRegression"``.
    """

    def some_method(self):
        pass

    wrapped_cls = sklearn.linear_model.LogisticRegression
    step_cls = make_step(wrapped_cls, attr_dict={"some_method": some_method})

    # Inheritance: the step class derives from Step and from the estimator.
    for parent in (Step, wrapped_cls):
        assert issubclass(step_cls, parent)

    # Estimator methods plus the extra attribute passed via attr_dict.
    for attr_name in ("get_params", "set_params", "fit", "predict", "some_method"):
        assert hasattr(step_cls, attr_name)

    assert step_cls.__name__ == "LogisticRegression"
示例#2
0
def test_make_step(class_name, expected, warns):
    """Check the class produced by ``make_step`` for a given ``class_name``.

    ``warns`` is used as a context manager around the ``make_step`` call and
    ``expected`` is the ``__name__`` the generated class must end up with.
    """

    def some_method(self):
        pass

    wrapped_cls = sklearn.linear_model.LogisticRegression
    extra_attrs = {"some_method": some_method}

    with warns:
        step_cls = make_step(wrapped_cls, extra_attrs, class_name)

    # Inheritance: the step class derives from Step and from the estimator.
    for parent in (Step, wrapped_cls):
        assert issubclass(step_cls, parent)

    # Estimator methods plus the extra attribute passed in the attribute dict.
    for attr_name in ("get_params", "set_params", "fit", "predict", "some_method"):
        assert hasattr(step_cls, attr_name)

    assert step_cls.__name__ == expected
示例#3
0
import numpy as np
import random

import sklearn.linear_model
from sklearn.datasets import fetch_openml
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import ColumnStack, Split, Lambda

# ------- Define steps
# Wrap scikit-learn's LogisticRegression with make_step so it can be called as a step.
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)

# ------- Load a multi-label dataset
# (from https://www.openml.org/d/40597)
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
# Element-wise comparison against the string "TRUE"; presumably this turns the
# label matrix into booleans -- verify against the dataset's label encoding.
Y = Y == "TRUE"
# Hold out 20% of the samples for testing, with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Number of label columns in Y.
n_targets = Y.shape[1]
# Seed the stdlib RNG so the shuffled target order below is reproducible.
random.seed(87)
# Target indices 0..n_targets-1, shuffled in place.
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
# Input placeholder of the model graph.
x = Input()
示例#4
0
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Stack

# 1. Define the steps
# Each scikit-learn class is wrapped with make_step so it can be called on
# the Input placeholders below.
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
SVC = make_step(sklearn.svm.SVC)
PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)

# 2. Build the model
# Two named feature inputs and one named input passed as second argument to
# the classifiers (presumably the training targets -- confirm against baikal docs).
x1 = Input(name="x1")
x2 = Input(name="x2")
y_t = Input(name="y_t")

# Branch 1: extra-trees classifier on x1.
y1 = ExtraTreesClassifier()(x1, y_t)
# Branch 2: random forest on x2.
y2 = RandomForestClassifier()(x2, y_t)
# Branch 3: x2 goes through PowerTransformer, then PCA, then logistic regression.
z = PowerTransformer()(x2)
z = PCA()(z)
y3 = LogisticRegression()(z, y_t)
示例#5
0
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model

# 1. Define a step
# Wrap scikit-learn's SVC with make_step so it can be called on Input placeholders.
SVC = make_step(sklearn.svm.SVC)

# 2. Build the model
x = Input()
y_t = Input()
# Constructor keyword arguments are given to the step class; the instance is
# then called on the two inputs.
y_p = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)

# The model is built from x, y_p and y_t, in that order.
model = Model(x, y_p, y_t)
# Presumably renders the model graph to the given file -- see baikal.plot docs.
plot_model(model, filename="readme_quick_example.png")

# 3. Train the model
dataset = load_breast_cancer()
# Default train/test split proportions, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    random_state=0)

model.fit(X_train, y_train)

# 4. Use the model
y_test_pred = model.predict(X_test)
示例#6
0
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing

from baikal import make_step

# Step classes generated from scikit-learn estimators and transformers via
# make_step. Each name mirrors the name of the class it wraps.
LinearRegression = make_step(sklearn.linear_model.LinearRegression)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
LabelEncoder = make_step(sklearn.preprocessing.LabelEncoder)
StandardScaler = make_step(sklearn.preprocessing.StandardScaler)
示例#7
0
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Concatenate

# ------- Define steps
# Wrap the scikit-learn classifiers with make_step so they can be called on Input placeholders.
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# ------- Load dataset
data = sklearn.datasets.load_breast_cancer()
# NOTE(review): the labels are bound to the name y_p here, and the same name
# is rebound to the model output at the end of this section. The labels are
# only needed for the split below, but the reuse is easy to misread.
X, y_p = data.data, data.target
# Hold out 20% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_p,
                                                    test_size=0.2,
                                                    random_state=0)

# ------- Build model
x = Input()
y_t = Input()
# First level: two classifiers created with function="predict_proba"
# (presumably this makes the steps output class probabilities -- confirm
# against the baikal version in use).
y_p1 = LogisticRegression(function="predict_proba")(x, y_t)
y_p2 = RandomForestClassifier(function="predict_proba")(x, y_t)
# Second level: the two first-level outputs are concatenated and fed to an
# extra-trees classifier.
ensemble_features = Concatenate()([y_p1, y_p2])
y_p = ExtraTreesClassifier()(ensemble_features, y_t)
示例#8
0
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.model_selection import cross_val_predict

from baikal import make_step


def _fit_predict_proba(self, X, y):
    """Fit the estimator on ``(X, y)`` and return cross-validated probabilities.

    The estimator is fitted first; the returned value is whatever
    ``cross_val_predict`` yields for ``method="predict_proba"``.
    """
    self.fit(X, y)
    cv_probabilities = cross_val_predict(self, X, y, method="predict_proba")
    return cv_probabilities


def _fit_decision_function(self, X, y):
    """Fit the estimator on ``(X, y)`` and return cross-validated decision scores.

    The estimator is fitted first; the returned value is whatever
    ``cross_val_predict`` yields for ``method="decision_function"``.
    """
    self.fit(X, y)
    cv_scores = cross_val_predict(self, X, y, method="decision_function")
    return cv_scores


# Step classes generated from scikit-learn estimators and transformers via make_step.
LinearRegression = make_step(sklearn.linear_model.LinearRegression)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
LinearSVC = make_step(sklearn.svm.LinearSVC)
# Variant of LinearSVC whose "fit_predict" attribute is the helper that fits
# and then returns cross-validated decision_function values.
LinearSVCOOF = make_step(sklearn.svm.LinearSVC,
                         attr_dict={"fit_predict": _fit_decision_function})
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
# Variant of RandomForestClassifier whose "fit_predict" attribute is the
# helper that fits and then returns cross-validated predict_proba values.
RandomForestClassifierOOF = make_step(
    sklearn.ensemble.RandomForestClassifier,
    attr_dict={"fit_predict": _fit_predict_proba},
)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
LabelEncoder = make_step(sklearn.preprocessing.LabelEncoder)
StandardScaler = make_step(sklearn.preprocessing.StandardScaler)
示例#9
0
    def train(self):
        """Build, fit and serialize the stacked regression model.

        The method assembles a baikal graph made of three preprocessing
        branches (``DataFrameMapperStep``), two concatenated feature sets,
        ElasticNet / XGBoost / CatBoost regressor steps, and a linear
        regression fitted on the stacked predictions. The model is fitted on
        ``self.trainDF`` with ``self.trainDF['SalePrice']`` as target, stored
        (cloudpickled) in ``self.artifact`` and the flow then proceeds to
        ``self.end``.

        NOTE(review): ``np``, ``sklearn``, ``cloudpickle``, ``ElasticNet``,
        ``MissingIndicator``, ``FunctionTransformer``, ``SimpleImputer``,
        ``OneHotEncoder`` and ``TargetEncoder`` are not imported in this
        method; presumably they are module-level imports -- confirm.
        """
        import xgboost
        from baikal import make_step, Input, Model
        from baikal.steps import Stack
        from sklearn_pandas import gen_features
        from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

        def _as_float_column(values):
            # Cast to float and reshape into a single (n, 1) column.
            # Builtin ``float`` replaces the ``np.float`` alias, which was
            # removed in NumPy 1.24 (it was the same type).
            return values.astype(float).reshape((-1, 1))

        def _clipped_difference(values):
            # First column minus second column, clipped to the range [0, 1000].
            return np.clip(values[:, 0] - values[:, 1], 0, 1000)

        # these are the categorical columns in the dataset
        CATEGORICAL_COLUMNS = [
            'KitchenQual',
            'MSSubClass',
            'MSZoning',
            'Street',
            'Alley',
            'LotShape',
            'LandContour',
            'Utilities',
            'LotConfig',
            'LandSlope',
            'Neighborhood',
            'Condition1',
            'Condition2',
            'BldgType',
            'HouseStyle',
            'RoofStyle',
            'RoofMatl',
            'Exterior1st',
            'Exterior2nd',
            'MasVnrType',
            'ExterQual',
            'ExterCond',
            'Foundation',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'Heating',
            'HeatingQC',
            'CentralAir',
            'Functional',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PavedDrive',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'SaleType',
            'SaleCondition',
            'OverallQual',
            'OverallCond',
        ]

        # these columns will be treated as numerical columns
        NUMERICAL_COLUMNS = [
            'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
            'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
            'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

        # These columns have missing values; a missing-indicator variable is added for each
        MISSING_INDICATOR = [
            'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
            'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
            'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
            'MiscFeature'
        ]

        # Categorical columns for which we want one-hot encoding
        ONEHOT_COLUMNS = [
            'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
            'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
            'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
            'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
            'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
            'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
            'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
            'MiscFeature', 'SaleType', 'SaleCondition'
        ]

        # Categorical columns for which we want target encoding
        TARGET_COLUMNS = [
            'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
        ]

        # Columns that require a log transformation
        LOG_COLUMNS = [
            'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
            'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
        ]

        # Define Steps
        ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
        ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
        XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                     class_name='XGBRegressor')
        LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                         class_name='LinearRegression')

        # Define sklearn-pandas transformations. Here I am using gen_features utility to
        # define transformations for individual columns.
        baseProcessing = (
            # missing-value indicator features, prefixed with 'na_'
            gen_features(columns=[[x] for x in MISSING_INDICATOR],
                         classes=[{
                             'class': MissingIndicator,
                             'features': 'all',
                             'sparse': False,
                             'error_on_new': False
                         }],
                         prefix='na_') +
            # log-transformed numerical columns: float cast -> mean imputation -> log1p
            gen_features(
                columns=LOG_COLUMNS,
                classes=[{
                    'class': FunctionTransformer,
                    'func': _as_float_column
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }, {
                    'class': FunctionTransformer,
                    'func': np.log1p
                }]) +
            # remaining numerical columns: float cast -> mean imputation
            gen_features(
                columns=list(set(NUMERICAL_COLUMNS) - set(LOG_COLUMNS)),
                classes=[{
                    'class': FunctionTransformer,
                    'func': _as_float_column
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }],
            ) + [
                # constructing new features -- age of the house
                (['YrSold', 'YearBuilt'], [
                    FunctionTransformer(func=_clipped_difference),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'age'
                }),

                # constructing new feature -- remodeling age
                (['YrSold', 'YearRemodAdd'], [
                    FunctionTransformer(func=_clipped_difference),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'remodel_age'
                }),

                # new feature -- total surface area
                (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                    FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'numerical_TotalArea'
                })
            ])

        # Since CatBoost model can handle categorical data, we don't need to encode categorical variables
        # we will simply impute missing values and let CatBoost model handle categorical data.
        catModelPreprocessing = gen_features(
            columns=CATEGORICAL_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                # builtin ``object`` replaces the removed ``np.object`` alias
                'func': lambda x: x.astype(object).reshape(-1, 1)
            }, {
                'class': SimpleImputer,
                'strategy': 'most_frequent'
            }],
        )

        # for regression and XGBoost, we will need to encode categorical variables ourselves.
        # Depending on the cardinality of the variable, I am either using one hot encoding or target encoding.
        regressionModelProcessing = (
            gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                         classes=[{
                             'class': OneHotEncoder,
                             'handle_unknown': 'ignore',
                             'sparse': False
                         }]) +
            gen_features(columns=[[x] for x in TARGET_COLUMNS],
                         classes=[
                             {
                                 'class': TargetEncoder
                             },
                             {
                                 'class': SimpleImputer,
                                 'strategy': 'mean'
                             },
                         ]))

        # Define DAG
        x = Input(name="x")
        y = Input(name='y')

        # Define feature transformations
        d0 = DataFrameMapperStep(baseProcessing,
                                 df_out=True,
                                 name='BasePreprocess')(x, y)
        d1 = DataFrameMapperStep(regressionModelProcessing,
                                 df_out=True,
                                 name='RegressionModelPreprocess')(x, y)
        d2 = DataFrameMapperStep(catModelPreprocessing,
                                 df_out=True,
                                 name='CatModelPreprocess')(x, y)

        # Consolidate features for the regression models and for catboost
        regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
        catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

        # Generate predictions using three different algorithms.
        m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
        m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
        m3 = CatBoostRegressorStep(name='CatBoost',
                                   cat_features=CATEGORICAL_COLUMNS,
                                   iterations=10)(catFeatures, y)

        # Combine the model predictions.
        # NOTE(review): only m1 (ElasticNet) and m3 (CatBoost) are stacked; m2
        # (XGBoost) is built above but never used. Behaviour is kept as-is --
        # confirm whether m2 was meant to be part of the ensemble.
        combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

        # construct an ensemble model
        ensembleModel = LinearRegressionStep()(combinedPredictions, y)
        model = Model(x, ensembleModel, y)
        model.fit(self.trainDF, self.trainDF['SalePrice'])
        self.artifact = {
            'model.pkl': cloudpickle.dumps(model),
            'environment': {
                'pip': {}
            }
        }
        self.next(self.end)
示例#10
0
class XGBRegressor(XGBStep, xgb.XGBRegressor):
    """``xgb.XGBRegressor`` combined with ``XGBStep``.

    The constructor forwards all positional and keyword arguments, including
    ``name``, to the parent initializer.
    """

    def __init__(self, *args, name=None, **kwargs):
        # Fold ``name`` back into the keyword arguments before delegating.
        init_kwargs = dict(kwargs, name=name)
        super().__init__(*args, **init_kwargs)


class XGBClassifier(XGBStep, xgb.XGBClassifier):
    """``xgb.XGBClassifier`` combined with ``XGBStep``.

    The constructor forwards all positional and keyword arguments, including
    ``name``, to the parent initializer.
    """

    def __init__(self, *args, name=None, **kwargs):
        # Fold ``name`` back into the keyword arguments before delegating.
        init_kwargs = dict(kwargs, name=name)
        super().__init__(*args, **init_kwargs)


class XGBRanker(XGBStep, xgb.XGBRanker):
    """``xgb.XGBRanker`` combined with ``XGBStep``.

    The constructor forwards all positional and keyword arguments, including
    ``name``, to the parent initializer.
    """

    def __init__(self, *args, name=None, **kwargs):
        # Fold ``name`` back into the keyword arguments before delegating.
        init_kwargs = dict(kwargs, name=name)
        super().__init__(*args, **init_kwargs)


# Step classes generated with make_step. Each one is given an explicit
# class_name equal to the name of the class it wraps.
SimpleImputer = make_step(impute.SimpleImputer, class_name="SimpleImputer")

KNeighborsRegressor = make_step(neighbors.KNeighborsRegressor,
                                class_name="KNeighborsRegressor")

KNeighborsClassifier = make_step(neighbors.KNeighborsClassifier,
                                 class_name="KNeighborsClassifier")

RandomForestRegressor = make_step(ensemble.RandomForestRegressor,
                                  class_name="RandomForestRegressor")

RandomForestClassifier = make_step(ensemble.RandomForestClassifier,
                                   class_name="RandomForestClassifier")

ExtraTreesRegressor = make_step(ensemble.ExtraTreesRegressor,
                                class_name="ExtraTreesRegressor")
示例#11
0
import sklearn.decomposition
import sklearn.ensemble
import sklearn.decomposition
import sklearn.linear_model
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.sklearn import SKLearnWrapper


# Step classes generated from scikit-learn classes via make_step; PCA and
# LogisticRegression are used by build_fn below.
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
PCA = make_step(sklearn.decomposition.PCA)


def build_fn():
    """Build and return the PCA -> logistic-regression model.

    The steps are named ``"pca"`` and ``"classifier"``. The module-level
    ``random_state`` is looked up when this function is called.
    """
    features = Input()
    targets = Input()

    reduced = PCA(random_state=random_state, name="pca")(features)
    classifier = LogisticRegression(random_state=random_state, name="classifier")
    predictions = classifier(reduced, targets)

    return Model(features, predictions, targets)


# Iris dataset: feature matrix and class labels.
iris = datasets.load_iris()
x_data = iris.data
y_data = iris.target
# Seed read by build_fn (at call time) for both the PCA and classifier steps.
random_state = 123
# Verbosity setting; its consumer is not visible in this excerpt.
verbose = 0
示例#12
0
# Adapted from the scikit-learn example in:
# https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py

import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.datasets import load_boston
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from baikal import make_step, Input, Model
from baikal.plot import plot_model
from baikal.steps import Lambda

# ------- Define steps
# Wrap the scikit-learn classes with make_step so they can be used as steps.
RidgeCV = make_step(sklearn.linear_model.RidgeCV)
QuantileTransformer = make_step(sklearn.preprocessing.QuantileTransformer)

# ------- Load dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this example needs an older scikit-learn release or another dataset.
dataset = load_boston()
# Boolean mask that selects the "DIS" feature column.
target = np.array(dataset.feature_names) == "DIS"
# Features: every column except "DIS". Target: the "DIS" column as a 1-D array.
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# ------- Build model
# Step instance configured with 300 quantiles and a normal output distribution.
transformer = QuantileTransformer(n_quantiles=300,
                                  output_distribution="normal")

x = Input()
y_t = Input()
示例#13
0
    def predict(self, X):
        """Always fail with ``KeyError("some failure")``; ``X`` is ignored."""
        failure = KeyError("some failure")
        raise failure


class _DummyEstimator(BaseEstimator):
    """Estimator stub that echoes its input and counts fitting calls.

    ``fit`` and ``fit_predict`` increment ``fit_calls`` and
    ``fit_predict_calls`` respectively; every prediction-style method returns
    ``X`` unchanged.
    """

    def __init__(self, x=123, y="abc"):
        self.x, self.y = x, y
        # Call counters, both starting at zero.
        self.fit_calls = self.fit_predict_calls = 0

    def fit(self, X, y):
        self.fit_calls = self.fit_calls + 1
        return self

    def fit_predict(self, X, y):
        self.fit_predict_calls = self.fit_predict_calls + 1
        return X

    def fit_predict_proba(self, X, y):
        # Does not touch either counter.
        return X

    def predict(self, X):
        return X

    def predict_proba(self, X):
        return X


# Step class built from the dummy estimator, with "DummyEstimator" passed as class_name.
DummyEstimator = make_step(_DummyEstimator, class_name="DummyEstimator")
示例#14
0
    def predict(self, X):
        """Always fail with ``KeyError("some failure")``; ``X`` is ignored."""
        failure = KeyError("some failure")
        raise failure


class _DummyEstimator(BaseEstimator):
    """Estimator stub that echoes its input and counts fitting calls.

    ``fit`` and ``fit_predict`` increment ``fit_calls`` and
    ``fit_predict_calls`` respectively; every prediction-style method returns
    ``X`` unchanged.
    """

    def __init__(self, x=123, y="abc"):
        self.x, self.y = x, y
        # Call counters, both starting at zero.
        self.fit_calls = self.fit_predict_calls = 0

    def fit(self, X, y):
        self.fit_calls = self.fit_calls + 1
        return self

    def fit_predict(self, X, y):
        self.fit_predict_calls = self.fit_predict_calls + 1
        return X

    def fit_predict_proba(self, X, y):
        # Does not touch either counter.
        return X

    def predict(self, X):
        return X

    def predict_proba(self, X):
        return X


# Step class built from the dummy estimator; no class_name is passed here.
DummyEstimator = make_step(_DummyEstimator)