Python selector示例，sklearn.compose.selector Python示例

示例#1

0

显示文件

def get_svr_pipeline(countries_threshold=0.97, utc_threshold=0.95, log=False):
    """Assemble an unfitted pipeline that preprocesses, transforms and fits an SVR.

    Args:
        countries_threshold: ``threshold`` given to the
            ``CategoricalThresholdTransformer`` for ``'country#cat'``.
        utc_threshold: ``threshold`` given to the
            ``CategoricalThresholdTransformer`` for ``'utc_offset#cat'``.
        log: ``log`` flag forwarded to both threshold transformers.

    Returns:
        A ``Pipeline`` with the steps ``preprocessing``, ``transformations``
        and ``model``.
    """
    # Stage 1: frame-level preprocessing steps, applied in this order.
    country_step = CategoricalThresholdTransformer(
        'country#cat', threshold=countries_threshold, log=log)
    utc_step = CategoricalThresholdTransformer(
        'utc_offset#cat', threshold=utc_threshold, log=log)
    stage_preprocessing = Pipeline(steps=[
        ('countries', country_step),
        ('utc_offset', utc_step),
        ('calculated_pop', CalculatedPopTransformer()),
    ])

    # Stage 2: per-dtype transformations; columns matched by neither
    # selector are passed through unchanged.
    numeric_branch = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('scale', MinMaxScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    stage_transformations = ColumnTransformer(
        transformers=[
            ('numeric_log', numeric_branch,
             selector(dtype_exclude=['object', 'category'])),
            ('categorical', categorical_branch,
             selector(dtype_include=['object', 'category'])),
        ],
        remainder='passthrough',
    )

    # Stage 3: the regressor.
    regressor = SVR(C=0.5, epsilon=0.01, gamma='scale', cache_size=1999)

    return Pipeline(steps=[
        ('preprocessing', stage_preprocessing),
        ('transformations', stage_transformations),
        ('model', regressor),
    ])

示例#2

0

显示文件

def preprocess():
    """Create the column preprocessor for mixed numeric/object data.

    Non-object columns are standardized, object columns are one-hot encoded
    (unknown categories are ignored at transform time), and any column matched
    by neither selector is passed through.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    scale_step = (
        'num',
        StandardScaler(with_mean=True, with_std=True),
        selector(dtype_exclude=object),
    )
    encode_step = (
        'cat',
        OneHotEncoder(handle_unknown='ignore'),
        selector(dtype_include=object),
    )
    return ColumnTransformer(transformers=[scale_step, encode_step],
                             remainder='passthrough')

示例#3

0

显示文件

def get_preprocessor():
    """Build an unfitted ``ColumnTransformer`` with numeric and categorical branches.

    The numeric branch imputes the median and standardizes; the categorical
    branch imputes the constant ``'missing'`` and one-hot encodes.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    impute_median = ('imputer', SimpleImputer(strategy='median'))
    standardize = ('scaler', StandardScaler())
    numeric_branch = Pipeline(steps=[impute_median, standardize])

    impute_constant = ('imputer',
                       SimpleImputer(strategy='constant', fill_value='missing'))
    one_hot = ('onehot', OneHotEncoder(handle_unknown='ignore'))
    categorical_branch = Pipeline(steps=[impute_constant, one_hot])

    # NOTE(review): 'object' columns are excluded from the numeric branch but
    # not included in the categorical one, so they match neither selector --
    # confirm this is intended.
    return ColumnTransformer(transformers=[
        ('num', numeric_branch,
         selector(dtype_exclude=["category", "object"])),
        ('cat', categorical_branch,
         selector(dtype_include=["category"])),
    ])

示例#4

0

显示文件

文件： star_type_predictions.py 项目： annalieNK/star-types

    def prepocess(self):
        """Create ``self.preprocessor``.

        Non-object columns are standardized, object columns are one-hot
        encoded (unknown categories ignored), and every other column is
        passed through. Nothing is fitted here.
        """
        scale_step = ('num', StandardScaler(), selector(dtype_exclude=object))
        encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'),
                       selector(dtype_include=object))

        self.preprocessor = ColumnTransformer(
            transformers=[scale_step, encode_step],
            remainder='passthrough',
        )

示例#5

0

显示文件

    def preprocessor(self):
        """Combine the numerical and categorical transformers by column dtype.

        Returns
        -------
        ColumnTransformer
            Unfitted transformer that sends ``category`` columns to
            ``self.categorical_transformer`` and all other columns to
            ``self.numerical_transformer``.
        """
        numeric_step = (
            "num",
            self.numerical_transformer,
            selector(dtype_exclude="category"),
        )
        categorical_step = (
            "cat",
            self.categorical_transformer,
            selector(dtype_include="category"),
        )
        return ColumnTransformer(transformers=[numeric_step, categorical_step])

示例#6

0

显示文件

    def pre_encoder(x):
        """Build a dtype-split preprocessor and the matching parameter grid.

        Args:
            x: Encoder placed after the imputer in the branch that handles
                the non-float columns.

        Returns:
            A two-element list ``[parameters, pre_encode]``: the list of
            parameter-grid dicts (keys prefixed with ``pre_encode__``) and
            the unfitted ``ColumnTransformer``.
        """
        categorical_branch = Pipeline(steps=[
            ('miss', SimpleImputer()),
            ('strings', x),
        ])
        numeric_branch = Pipeline(steps=[
            ('miss', SimpleImputer()),
            ('scaler', StandardScaler()),
        ])

        # Float columns go to the numeric branch, everything else to the
        # categorical one.
        pre_encode = ColumnTransformer(transformers=[
            ('categoricals', categorical_branch,
             selector(dtype_exclude=['float'])),
            ('numericals', numeric_branch,
             selector(dtype_include=['float'])),
        ])

        categorical_grid = {
            'pre_encode__categoricals': [categorical_branch],
            'pre_encode__categoricals__miss__strategy': ['most_frequent'],
        }
        numeric_grid = {
            'pre_encode__numericals': [numeric_branch],
            'pre_encode__numericals__miss__strategy':
            ['mean', 'median', 'most_frequent'],
        }
        parameters = [categorical_grid, numeric_grid]
        return [parameters, pre_encode]

示例#7

0

显示文件

文件： train_util.py 项目： franec94/Machine-Learning-Projects

def create_and_run_pipeline_GDCV(X,
                                 y,
                                 param_grid,
                                 num_cv=10,
                                 clf_obj=LogisticRegression(),
                                 random_state=42):
    """Grid-search a preprocessing + classifier pipeline and report its test score.

    The data is split 80/20; the grid search runs on the training part and
    the score of the refitted best model on the held-out part is printed.

    Args:
        X: Feature table; columns are dispatched by dtype (``category``
            columns to the categorical branch, all others to the numeric one).
        y: Target values.
        param_grid: Grid for ``GridSearchCV``; keys must address the pipeline
            steps ``preprocessor`` and ``classifier``. When empty or ``None``
            a default grid (imputer strategy and classifier ``C``) is used.
            Previously this argument was silently overwritten.
        num_cv: ``cv`` argument passed to ``GridSearchCV``.
        clf_obj: Estimator used as the ``classifier`` step.
        random_state: Seed for the train/test split.

    Returns:
        The best pipeline found by the search, refitted on the training part.
        (The original returned the template pipeline, which the search clones
        and therefore never fits.)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Fall back to the historical hard-coded grid only when the caller did
    # not supply one.
    if not param_grid:
        param_grid = {
            'preprocessor__num__imputer__strategy': ['mean', 'median'],
            'classifier__C': [0.1, 1.0, 10, 100],
        }

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, selector(dtype_exclude="category")),
        ('cat', categorical_transformer, selector(dtype_include="category")),
    ])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', clf_obj)])

    grid_search = GridSearchCV(clf, param_grid, cv=num_cv)
    grid_search.fit(X_train, y_train)

    clf_name = str(clf_obj).split('(')[0]
    print(("best %s from grid search: %.3f" %
           (clf_name, grid_search.score(X_test, y_test))))
    return grid_search.best_estimator_

示例#8

0

显示文件

def get_linear_pipeline(alpha=1,
                        countries_threshold=0.97,
                        utc_threshold=0.95,
                        log=False):
    """Assemble an unfitted pipeline that preprocesses, transforms and fits a Ridge model.

    Args:
        alpha: ``alpha`` passed to ``Ridge``.
        countries_threshold: ``threshold`` given to the
            ``CategoricalThresholdTransformer`` for ``'country#cat'``.
        utc_threshold: ``threshold`` given to the
            ``CategoricalThresholdTransformer`` for ``'utc_offset#cat'``.
        log: ``log`` flag forwarded to both threshold transformers.

    Returns:
        A ``Pipeline`` with the steps ``preprocessing``, ``transformations``
        and ``model``.
    """
    # Stage 1: frame-level preprocessing steps, applied in this order.
    country_step = CategoricalThresholdTransformer(
        'country#cat', threshold=countries_threshold, log=log)
    utc_step = CategoricalThresholdTransformer(
        'utc_offset#cat', threshold=utc_threshold, log=log)
    stage_preprocessing = Pipeline(steps=[
        ('countries', country_step),
        ('utc_offset', utc_step),
        ('calculated_pop', CalculatedPopTransformer()),
    ])

    # Stage 2: per-dtype transformations; the numeric branch adds degree-2
    # polynomial features before scaling.
    numeric_branch = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('poli', PolynomialFeatures(2)),
        ('scale', MinMaxScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    stage_transformations = ColumnTransformer(
        transformers=[
            ('numeric_log', numeric_branch,
             selector(dtype_exclude=['object', 'category'])),
            ('categorical', categorical_branch,
             selector(dtype_include=['object', 'category'])),
        ],
        remainder='passthrough',
    )

    # Stage 3: the regressor.
    return Pipeline(steps=[
        ('preprocessing', stage_preprocessing),
        ('transformations', stage_transformations),
        ('model', Ridge(alpha=alpha)),
    ])

示例#9

0

显示文件

文件： column_transformer_target_encoder.py 项目： libinruan/competitions

 def transform(self, X, y=None):
     """Ordinal-encode selected object columns on a copy of ``X``.

     An object-dtype column is encoded when the length of its *name* is
     below ``self.threshold``; all other columns are returned unchanged.
     The encoder is fitted anew on each selected column.
     """
     # NOTE(review): ``len(name)`` measures the column name, not the number
     # of distinct values in the column -- confirm this is the intent.
     encoded = X.copy()  # work on a copy so the caller's frame is untouched
     encoder = OrdinalEncoder()
     object_columns = selector(dtype_include='object')(X)
     selected = [name for name in object_columns
                 if len(name) < self.threshold]
     for name in encoded.columns:
         if name not in selected:
             continue
         column_values = encoded.loc[:, name].to_numpy().reshape(-1, 1)
         encoded.loc[:, name] = encoder.fit_transform(
             column_values).astype('int')
     return encoded

示例#10

0

显示文件

def q4():
    """Return the answer to question 4.

    A median-impute + standardize pipeline is fitted on the module-level
    ``df`` (without ``Region`` and ``Country``) and applied to ``df_test``;
    the value at row 0, position 9 of the result is returned rounded to
    three decimals.
    """
    excluded = ['Region', 'Country']
    X = df.drop(excluded, axis=1)

    impute_and_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', impute_and_scale, selector(dtype_exclude="category")),
    ])

    preprocessor.fit(X)
    transformed_rows = preprocessor.transform(
        df_test.drop(excluded, axis=1)).tolist()
    first_row = transformed_rows[0]
    return round(first_row[9], 3)

示例#11

0

显示文件

文件： column_transformer_target_encoder.py 项目： libinruan/competitions

    def transform(self, X, y=None):
        """Ordinal-encode selected object columns and reassemble the frame.

        An object-dtype column is encoded when the length of its *name* is
        below ``self.threshold``. The result holds the encoded columns first,
        then the remaining object columns, then every other column, each
        group in the order it appears in ``X``.

        Args:
            X: Input ``DataFrame``.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            A new ``DataFrame`` that shares ``X``'s index.
        """
        # NOTE(review): ``len(name)`` measures the column name, not the number
        # of distinct values in the column -- confirm this is the intent.
        encoder = OrdinalEncoder()
        object_columns = selector(dtype_include='object')(X)
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (iterating a set made the output column order vary between runs).
        to_encode = list(dict.fromkeys(
            name for name in object_columns if len(name) < self.threshold))
        untouched_objects = [
            name for name in object_columns if name not in to_encode
        ]
        # A list, not a set: recent pandas rejects set indexers in ``.loc``.
        other_columns = [
            name for name in X.columns if name not in object_columns
        ]

        # Build on X's own index; a fresh RangeIndex would make the
        # index-aligned concat below produce NaN rows whenever X is not
        # indexed 0..n-1.
        encoded = pd.DataFrame(index=X.index)
        for name in to_encode:
            column_values = X.loc[:, name].to_numpy().reshape(-1, 1)
            encoded[name] = encoder.fit_transform(
                column_values).astype('int').ravel()

        return pd.concat(
            [encoded, X.loc[:, untouched_objects], X.loc[:, other_columns]],
            axis=1)

示例#12

0

显示文件

文件： sklearn_pipeline_gridsearchCV_demo.py 项目： libinruan/competitions

def seed_everything(seed=1903):
    """Seed the ``random`` module and NumPy's global RNG, and export ``PYTHONHASHSEED``.

    Args:
        seed: Value used for all three; stored as a string in the environment.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


# Fix all seeds before any data is loaded.
seed_everything(seed=2020)

# The relative '../input/...' paths below are resolved from this directory.
os.chdir('/kaggle/working')
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/sample_submission.csv')

# Numeric column names of the full training frame. This is computed before
# 'id' is dropped and without excluding 'target'.
select_numeric_features = selector(dtype_include='number')
numeric_features = select_numeric_features(
    train
)  # Remember to scale features for linear models with regularization. Without regularization, linear models do not need scaling for prediction alone.

# Keep the ids aside, then remove them from both frames in place.
train_id = train.loc[:, 'id']
test_id = test.loc[:, 'id']
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Feature names by dtype, with the target column excluded.
cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))

# Categorical branch: one-hot encode, then scale without centering
# (with_mean=False).
cat_preprocessor = Pipeline(steps=[('oh', OneHotEncoder(
    handle_unknown='ignore')), ('ss', StandardScaler(with_mean=False))])
num_preprocessor = Pipeline(steps=[('pt', PowerTransformer(

示例#13

0

显示文件

文件： Bank_marketing.py 项目： Munther3/DS_Research_Bank_Marketing_stage3a

# Class proportions of "Attrition" in the stratified test split.
test_strat["Attrition"].value_counts(normalize=True)


# Sample data
train, test = train_test_split(ames, test_size=0.3, random_state=123)

# Extract features and response
features = train.drop(columns="Sale_Price")
label = train["Sale_Price"]



# SciKit-Learn does not automatically transform categorical features so we need to
# apply a one-hot transformer. We will discuss this more thoroughly in the next chapter.
# Only object-dtype columns are listed and no `remainder` is given.
# NOTE(review): with ColumnTransformer's default remainder the other columns
# would be dropped -- confirm against the scikit-learn documentation.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('cat', categorical_transformer, selector(dtype_include="object"))])

# Step name 'knn' is what the 'knn__n_neighbors' grid key below refers to.
knn_fit = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', KNeighborsRegressor(metric='euclidean'))])


# Specify resampling strategy
# No random_state is passed here.
cv = RepeatedKFold(n_splits=10, n_repeats=5)

# Create grid of hyperparameter values
hyper_grid = {'knn__n_neighbors': range(3, 26)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn_fit, hyper_grid, cv=cv, scoring='neg_mean_squared_error')
results = grid_search.fit(features, label)

示例#14

0

显示文件

文件： metrics.py 项目： mdiazmel/scikit-learn-mooc

# %%
X_train.info()

# %% [markdown]
# While some features are numeric, some have been tagged as `category`. These
# features need to be encoded such that our random forest can
# deal with them. The simplest solution is to use an `OrdinalEncoder`.
# Regarding, the numerical features, we don't need to do anything. Thus, we
# will create preprocessing steps to take care of the encoding.

# %%
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the `category` columns and pass every other column through.
categorical_selector = selector(dtype_include="category")
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_selector),
    remainder="passthrough",
)

# Wrap the transformed output in a DataFrame. The column labels are rebuilt
# by hand: the encoded categorical columns first, then the remaining columns
# in their original order.
X_train_preprocessed = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=(
        categorical_selector(X_train) +
        [col for col in X_train.columns
         if col not in categorical_selector(X_train)]
    )
)
X_train_preprocessed.head()

示例#15

0

显示文件

文件： train_util.py 项目： franec94/Machine-Learning-Projects

def _plot_roc_curves(figure_number, curves, title, zoomed=False):
    """Draw the chance diagonal and every ``(label, fpr, tpr)`` curve on one figure."""
    plt.figure(figure_number)
    if zoomed:
        # Restrict the view to the top-left corner of the ROC plane.
        plt.xlim(0, 0.2)
        plt.ylim(0.8, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    for label, fpr, tpr in curves:
        plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(title)
    plt.legend(loc='best')
    plt.show()


def wrapper_feature_transformer_ensembles_trees_clf_v2(
    X,
    y,
    X_test,
    y_test,
    n_estimator=10,
    transformer=None,
    clf_obj=LogisticRegression(max_iter=1000)):
    """Compare tree-ensemble feature transformations feeding a linear classifier.

    The features are standardized (and optionally passed through
    ``transformer``), then five models are evaluated on the test set and
    their ROC curves plotted twice (full view and zoomed top-left corner):
    random-trees embedding + classifier, random forest, random-forest leaves
    + classifier, gradient boosting, and gradient-boosting leaves + classifier.

    Every model is evaluated on the same scaled/transformed test matrix it
    was trained for; the original evaluated two of them on the raw ``X_test``.

    Args:
        X: Training features.
        y: Training labels.
        X_test: Test features.
        y_test: Test labels.
        n_estimator: Number of trees for every ensemble.
        transformer: Optional extra transformer fitted on the scaled training
            features and applied to both sets.
        clf_obj: Classifier template; a fresh clone is used for each variant.

    Returns:
        None. The function only displays the two figures.
    """
    scaler = StandardScaler()
    scaler.fit(X)
    x_train_scaled = scaler.transform(X)
    x_test_scaled = scaler.transform(X_test)

    clf_name = str(clf_obj).split('(')[0]

    if transformer is not None:
        transformer.fit(x_train_scaled)
        x_train_scaled = transformer.transform(x_train_scaled)
        x_test_scaled = transformer.transform(x_test_scaled)

    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        x_train_scaled, y, test_size=0.5, random_state=0)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)
    pipeline = make_pipeline(rt, sklearn.base.clone(clf_obj))
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict(x_test_scaled)
    fpr_rt_clf, tpr_rt_clf, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    rf_enc = OneHotEncoder()
    rf_clf = sklearn.base.clone(clf_obj)
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_clf.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

    y_pred_rf_clf = rf_clf.predict(rf_enc.transform(rf.apply(x_test_scaled)))
    fpr_rf_clf, tpr_rf_clf, _ = roc_curve(y_test, y_pred_rf_clf)

    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_clf = sklearn.base.clone(clf_obj)
    grd.fit(X_train, y_train)
    # apply() returns one leaf index per (sample, estimator, class); only
    # the first class slice is used.
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_clf.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    y_pred_grd_clf = grd_clf.predict(
        grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))
    fpr_grd_clf, tpr_grd_clf, _ = roc_curve(y_test, y_pred_grd_clf)

    # The gradient boosted model by itself
    y_pred_grd = grd.predict(x_test_scaled)
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

    # The random forest model by itself
    y_pred_rf = rf.predict(x_test_scaled)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    # NOTE(review): the ROC curves are built from hard class predictions,
    # not scores, so each curve has a single operating point.
    curves = [
        ('RT + LR', fpr_rt_clf, tpr_rt_clf),
        ('RF', fpr_rf, tpr_rf),
        ('RF + LR', fpr_rf_clf, tpr_rf_clf),
        ('GBT', fpr_grd, tpr_grd),
        ('GBT + LR', fpr_grd_clf, tpr_grd_clf),
    ]
    _plot_roc_curves(1, curves, f'ROC curve - {clf_name}')
    _plot_roc_curves(2, curves,
                     f'ROC curve (zoomed in at top left) - {clf_name}',
                     zoomed=True)

示例#16

0

显示文件

文件： pipeline.py 项目： micahmelling/applied_data_science

def get_pipeline(model):
    """
    Generates a scikit-learn modeling pipeline with model as the final step.

    The pipeline standardizes the incoming features, routes numeric and
    non-numeric columns through their own sub-pipelines, removes features
    via a variance threshold and finally applies the model.

    :param model: instantiated model
    :returns: scikit-learn pipeline
    """
    def stateless(func, **kw_args):
        # Wrap a plain function as a transformer step without input validation.
        return FunctionTransformer(func, validate=False, kw_args=kw_args)

    def lower_clipper(feature):
        # Named step that clips `feature` with cutoff 0 / new_amount 0 / 'lower'.
        step = stateless(clip_feature_bounds,
                         feature=feature,
                         cutoff=0,
                         new_amount=0,
                         clip_type='lower')
        return (feature + '_clipper', step)

    clipped_features = ('mouse_movement', 'propensity_score',
                        'completeness_score', 'profile_score',
                        'average_stars')
    numeric_steps = [lower_clipper(feature) for feature in clipped_features]
    numeric_steps += [
        ('ratio_creator',
         stateless(create_ratio_column,
                   col1='profile_score',
                   col2='activity_score')),
        ('log_creator', TakeLog()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('feature_selector', SelectPercentile(f_classif)),
    ]
    numeric_transformer = Pipeline(steps=numeric_steps)

    categorical_steps = [
        ('date_transformer',
         stateless(convert_column_to_datetime, feature='acquired_date')),
        ('month_extractor',
         stateless(extract_month_from_date, date_col='acquired_date')),
        ('quarter_extractor',
         stateless(convert_month_to_quarter,
                   month_col='month',
                   mapping_dict=MONTH_TO_QUARTER_DICT)),
        ('year_extractor',
         stateless(extract_year_from_date, date_col='acquired_date')),
        ('date_dropper',
         stateless(drop_features, feature_list=FEATURES_TO_DROP)),
        ('imputer',
         stateless(fill_missing_values, fill_value=CATEGORICAL_FILL_VALUE)),
        ('category_combiner', CombineCategoryLevels()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('feature_selector', SelectPercentile(chi2)),
    ]
    categorical_transformer = Pipeline(steps=categorical_steps)

    # Numeric dtypes go to the numeric branch, everything else to the
    # categorical branch.
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_transformer', numeric_transformer,
             selector(dtype_include='number')),
            ('categorical_transformer', categorical_transformer,
             selector(dtype_exclude='number')),
        ],
        remainder='passthrough',
    )

    data_mapper = stateless(ensure_features_are_standardized,
                            feature_mapping=FEATURE_DTYPE_MAPPING)
    return Pipeline(steps=[
        ('data_mapper', data_mapper),
        ('preprocessor', preprocessor),
        ('variance_thresholder', VarianceThreshold()),
        ('model', model),
    ])

示例#17

0

显示文件

文件： column_transformer_target_encoder.py 项目： libinruan/competitions

# %% Selecting categorical features, method 2
from sklearn.compose import make_column_selector as selector


class dummyTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that re-wraps its input's object columns as a DataFrame."""

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return a ``DataFrame`` built from ``X``, limited to its object-dtype columns."""
        object_columns = selector(dtype_include='object')(X)
        return pd.DataFrame(X, columns=object_columns)  # return pd.DataFrame


# Send only the object-dtype columns of `df` through dummyTransformer; no
# `remainder` is specified for the other columns.
tr4 = ColumnTransformer(transformers=[('dum', dummyTransformer(),
                                       selector(dtype_include='object'))])
tr4.fit_transform(df)  # Note: returns a ndarray

# %%
# ANCHOR Custom transformers
# ------------------------------- EXPERIMENT 3 ------------------------------- #
import random
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

示例#18

0

显示文件

文件： plot_impact_imbalanced_classes.py 项目： zaky9/imbalanced-learn

# Categorical branch: fill missing values with the constant "missing", then
# one-hot encode (categories unseen during fit are ignored at transform time).
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# %% [markdown]
# Then, we can create a preprocessor which will dispatch the categorical
# columns to the categorical pipeline and the numerical columns to the
# numerical pipeline

# %%
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

# Columns are routed by dtype: "number" -> num_pipe, "category" -> cat_pipe.
# n_jobs=2 is forwarded to the column transformer.
preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

# %% [markdown]
# Finally, we connect our preprocessor with our
# :class:`~sklearn.linear_model.LogisticRegression`. We can then evaluate our
# model.

# %%
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by a logistic regression capped at 1000 iterations.
lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))

# %%

# %%

示例#19

0

显示文件

文件： xgboost_model.py 项目： ronanb95/DublinBus

# Month abbreviations in calendar order with their two-digit numbers.
_MONTH_NUMBERS = (
    ('JAN', '01'), ('FEB', '02'), ('MAR', '03'), ('APR', '04'),
    ('MAY', '05'), ('JUN', '06'), ('JUL', '07'), ('AUG', '08'),
    ('SEP', '09'), ('OCT', '10'), ('NOV', '11'), ('DEC', '12'),
)

# Dates (YYYY-MM-DD) flagged as holidays by the HOLIDAY feature.
_HOLIDAY_DATES = (
    '2018-01-01', '2018-03-17', '2018-03-20', '2018-03-30', '2018-04-01',
    '2018-04-02', '2018-05-07', '2018-06-04', '2018-06-21', '2018-08-06',
    '2018-09-23', '2018-10-29', '2018-12-21', '2018-12-24', '2018-12-25',
    '2018-12-26', '2018-12-31',
)

# Columns that reach the model (ACTUALTIME_ARR is the target), in the order
# the original column selection followed by the column drop produced.
_MODEL_COLUMNS = [
    'LINEID', 'ROUTEID', 'DIRECTION', 'PROGRNUMBER', 'ACTUALTIME_ARR',
    'TRIPS_PLANNEDTIME_DEP', 'DAYOFWEEK', 'HOLIDAY', 'temp', 'feels_like',
    'clouds_all', 'weather_main',
]

# Model columns cast to the pandas `category` dtype (one-hot encoded later).
_CATEGORICAL_COLUMNS = [
    'LINEID', 'ROUTEID', 'DIRECTION', 'DAYOFWEEK', 'HOLIDAY', 'weather_main',
]


def _read_semicolon_csv(path):
    """Read a ';'-separated CSV, skipping spaces that follow the delimiter."""
    # The original passed both sep=',\s+' and delimiter=';'. `delimiter` is
    # an alias of `sep`; recent pandas refuses to accept both.
    return pd.read_csv(path,
                       sep=';',
                       keep_default_na=True,
                       skipinitialspace=True)


def _tidy_datetime(time_str):
    """Replace the first month abbreviation found (calendar order) by its number."""
    for abbreviation, number in _MONTH_NUMBERS:
        if abbreviation in time_str:
            return time_str.replace(abbreviation, number)
    return time_str


def xgboost_model(line_file, trip_file, weather_file):
    """Train an XGBoost arrival-time regressor for one bus line and pickle it.

    Leave-time, trip and hourly weather data are joined, a 5-fold grid search
    selects the XGBoost hyper-parameters, a final pipeline is fitted with the
    best parameters on all rows, and the fitted pipeline is written to
    ``./pickle_file_XG/XG_<LINEID>.pkl``.

    Args:
        line_file: File name of the leave-times CSV inside
            ``./leavetimes_by_line/``.
        trip_file: Path of the trips CSV.
        weather_file: Path of the weather CSV (comma separated).

    Returns:
        None. The only result is the pickle file.
    """
    df_route = _read_semicolon_csv('./leavetimes_by_line/' + line_file)
    df_route = df_route.drop(columns=[
        'DATASOURCE', 'PLANNEDTIME_DEP', 'ACTUALTIME_DEP', 'PASSENGERS',
        'PASSENGERSIN', 'PASSENGERSOUT', 'DISTANCE', 'SUPPRESSED',
        'JUSTIFICATIONID', 'LASTUPDATE', 'NOTE'
    ])

    df_trips = _read_semicolon_csv(trip_file)
    df_trips = df_trips.drop(columns=[
        'DATASOURCE', 'TENDERLOT', 'SUPPRESSED', 'JUSTIFICATIONID', 'BASIN',
        'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR', 'LASTUPDATE',
        'NOTE'
    ])
    df_trips = df_trips.rename(
        columns={'PLANNEDTIME_DEP': 'TRIPS_PLANNEDTIME_DEP'})

    # Weather: strip the "+0000 UTC" suffix to get the hourly join key and
    # keep only the columns that reach the model. (The original also called
    # DataFrame.drop here but discarded its result.)
    df_weather = pd.read_csv(weather_file)
    df_weather['date'] = pd.to_datetime(df_weather['dt_iso'].str.replace(
        '+0000 UTC', '', regex=False))
    df_weather = df_weather[[
        'date', 'temp', 'feels_like', 'clouds_all', 'weather_main'
    ]]

    df = pd.merge(df_route, df_trips, on=['DAYOFSERVICE', 'TRIPID', 'ROUTEID'])
    df = df[[
        'DAYOFSERVICE', 'LINEID', 'ROUTEID', 'DIRECTION', 'TRIPID',
        'PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR', 'ACTUALTIME_ARR',
        'VEHICLEID', 'TRIPS_PLANNEDTIME_DEP'
    ]].copy()

    # Service day plus the planned departure offset (in seconds) gives the
    # trip timestamp.
    df['DAYOFSERVICE'] = pd.to_datetime(
        df['DAYOFSERVICE'].apply(_tidy_datetime), format='%d-%m-%y %H:%M:%S')
    df['timestamp'] = df['DAYOFSERVICE'] + pd.to_timedelta(
        df['TRIPS_PLANNEDTIME_DEP'], unit='s')
    # Row order matters: the cross-validation below is given cv=5 with no
    # explicit shuffling.
    df = df.sort_values(["timestamp", "PROGRNUMBER"]).reset_index(drop=True)

    # NOTE(review): the original also derived MONTH and DAY, but neither
    # column reached the model; they are no longer computed.
    df['DAYOFWEEK'] = df['timestamp'].dt.dayofweek
    df['date'] = df['timestamp'].dt.round('H')
    df = pd.merge(df, df_weather, on=['date'])
    df['HOLIDAY'] = df['DAYOFSERVICE'].dt.strftime('%Y-%m-%d').isin(
        _HOLIDAY_DATES).astype(int)

    df_rev = df[_MODEL_COLUMNS].copy()
    for column in _CATEGORICAL_COLUMNS:
        df_rev[column] = df_rev[column].astype('category')
    df_rev['PROGRNUMBER'] = df_rev['PROGRNUMBER'].astype('int64')
    df_rev['clouds_all'] = df_rev['clouds_all'].astype('float64')

    features = df_rev.drop(columns=['ACTUALTIME_ARR'])
    target = df_rev['ACTUALTIME_ARR']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, selector(dtype_exclude="category")),
        ('cat', categorical_transformer, selector(dtype_include="category")),
    ])

    param_grid = {
        'colsample_bytree': [0.1, 0.5, 0.8, 1],
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'max_depth': [5, 10, 15],
        'n_estimators': [50, 100, 150, 200]
    }

    grid_search = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('grid_search', GridSearchCV(XGBRegressor(), param_grid, cv=5)),
    ])
    grid_search.fit(features, target)

    # Highest mean test score; replaces sorting cv_results_ by hand.
    best_params = grid_search.named_steps['grid_search'].best_params_

    clf_XG = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier',
         XGBRegressor(colsample_bytree=best_params['colsample_bytree'],
                      learning_rate=best_params['learning_rate'],
                      max_depth=best_params['max_depth'],
                      n_estimators=best_params['n_estimators'])),
    ])
    clf_XG.fit(features, target)

    # f-string so that a non-string LINEID no longer breaks the path.
    line_id = df_rev.iloc[0]['LINEID']
    joblib.dump(clf_XG, f'./pickle_file_XG/XG_{line_id}.pkl')

示例#20

0

显示文件

文件： 03_categorical_pipeline_ex_01.py 项目： mdiazmel/scikit-learn-mooc

# ```python
# categories = [data[column].unique()
#               for column in data[categorical_columns]]
# OrdinalEncoder(categories=categories)
# ```

# %%
# Load the adult census data set from the local CSV file.
import pandas as pd

df = pd.read_csv("../datasets/adult-census.csv")

# %%
# Split off the target column; "fnlwgt" is dropped from the features as well.
target_name = "class"
target = df[target_name]
data = df.drop(columns=[target_name, "fnlwgt"])

# %%
# Keep only the object-dtype columns; the variable names treat these as the
# categorical features.
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
data_categorical = data[categorical_columns]

# %%
# Imports made available for the exercise below.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

# Write your code here.

示例#21

0

显示文件

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
# Imported for its side effect: on scikit-learn releases where the estimator
# is still experimental, this import is what makes the next line work.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# NOTE(review): `pd` is not imported in this excerpt; presumably pandas is
# imported earlier in the file - confirm.
df = pd.read_csv("../datasets/adult-census.csv")

# %%
# Target as a NumPy array; features are every column except the target and
# "fnlwgt".
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=[target_name, "fnlwgt"])

# %%
from sklearn.compose import make_column_selector as selector

# Columns are split purely by dtype: "int"/"float" columns are numerical,
# everything else is categorical.
numerical_columns_selector = selector(dtype_include=["int", "float"])
categorical_columns_selector = selector(dtype_exclude=["int", "float"])
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# One array of distinct values per categorical column, in column order.
categories = [
    data[column].unique() for column in data[categorical_columns]]

# %% [markdown]
# ## Reference pipeline (no numerical scaling and integer-coded categories)
#
# First let's time the pipeline we used in the main notebook to serve as a reference:

# %%
# %%time

示例#22

0

显示文件

文件： column_transformer_target_encoder.py 项目： libinruan/competitions

 def transform(self, X):
     """Return a DataFrame built from ``X`` limited to its object-dtype columns.

     The column names are chosen by applying a dtype-based column selector
     to ``X`` itself, so ``X`` must be something the selector accepts.
     """
     object_columns = selector(dtype_include='object')(X)
     return pd.DataFrame(X, columns=object_columns)

示例#23

0

显示文件

# Features are every column except the target, "fnlwgt" and "education-num".
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "fnlwgt", "education-num"])

# Only 20% of the rows go to the training split; the split is reproducible
# thanks to the fixed random_state.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Categories not seen during fit are encoded as -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
# Object-dtype columns are ordinal-encoded, all other columns pass through
# unchanged; sparse_threshold=0 asks for a dense result.
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough',
    sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Preprocessing followed by the gradient-boosting classifier; the step names
# ("preprocessor", "classifier") are the prefixes used to address parameters.
model = Pipeline([("preprocessor", preprocessor),
                  ("classifier",
                   HistGradientBoostingClassifier(random_state=42))])

# %% [markdown]
#
# Use the previously defined model (called `model`) and using two nested `for`

示例#24

0

显示文件

    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode into a dense array, ignoring categories unseen during fit.
# NOTE(review): the `sparse` keyword was renamed `sparse_output` in newer
# scikit-learn releases - confirm against the pinned version.
categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

# Columns are routed by dtype: `category` columns go to the categorical
# branch, every other column goes to the numeric branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)

# Full model: preprocessing followed by a shallow decision tree.
complete_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "estimator",
            DecisionTreeClassifier(min_samples_leaf=10, max_depth=4),
        ),
    ]
)

complete_pipeline.fit(X_train, y_train_true)

示例#25

0

显示文件

文件： 02_numerical_pipeline_scaling.py 项目： mdiazmel/scikit-learn-mooc

import pandas as pd

df = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = df[target_name]

data = df.drop(columns=[target_name, "fnlwgt"])

# %% [markdown]
# We only keep numerical features

# %%
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)
numerical_columns

data_numeric = data[numerical_columns]

# %% [markdown]
# We do a train-test split for evaluation

# %%
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data_numeric, target, random_state=42)

# %% [markdown]

示例#26

0

显示文件

文件： CDC.py 项目： trey-capps/Pythons-Angels-32

            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

# NOTE(review): this list is not referenced by the ColumnTransformer below,
# which selects columns by dtype instead - confirm whether it is used
# elsewhere in the file.
categorical_features = [
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]

# Categorical branch: ordinal-encode first, then one-hot encode the codes.
# NOTE(review): OrdinalEncoder is used with its default settings here, so the
# `handle_unknown='ignore'` on the one-hot step may never take effect if the
# ordinal step rejects unseen categories first - confirm.
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    #('imputer', SimpleImputer(strategy='constant', fill_value='mode')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Object-dtype columns go to the categorical branch, all others to the
# numeric branch.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   selector(dtype_exclude='object')),
                  ('cat', categorical_transformer,
                   selector(dtype_include='object'))])

# Three additional columns followed by the same names as in
# `categorical_features`.
total_features = [
    'income', 'loan_amount', 'tract_minority_population_percent',
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]

# In[22]:

#Model Training and Testing

#Select Features for models
X = fin_data[[

示例#27

0

显示文件

from sklearn.preprocessing import StandardScaler

# `handle_unknown="ignore"` makes the encoder emit an all-zero row for a
# category it did not see during `fit` instead of raising. Without it, any
# cross-validation fold whose training part lacks a rare category fails when
# the held-out part is transformed.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

# Subsequently, create a `ColumnTransformer` to redirect the specific columns
# to a preprocessing pipeline.

# %%

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

# Object-dtype columns are one-hot encoded, numeric columns are standardised,
# anything else passes through; sparse_threshold=0 asks for a dense result.
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object)),
     ('num-preprocessor', numerical_preprocessor,
      selector(dtype_include='number'))],
    remainder='passthrough',
    sparse_threshold=0)

# Finally, concatenate the preprocessing pipeline with a logistic regression.

# %%

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression())

# Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning

示例#28

0

显示文件

文件： optuned_ridgeClf_baycatEncoder_premature.py 项目： libinruan/competitions

def ridgeCLF_objective(trial):
    """Optuna objective: mean cross-validated ROC-AUC of a ridge pipeline.

    Builds a ``ColumnTransformer`` (one-hot encoded categoricals, power-
    transformed and scaled numerics with optional interaction features, and
    - when the encoder toggle is on - the ``*_code`` columns) in front of a
    ``RidgeClassifier`` and scores it with ``cross_val_score(cv=3)``.

    Relies on module-level names: ``seed_everything``, ``train_encoded`` (or
    ``train`` when the toggle is off) and the scikit-learn imports.

    Args:
        trial: Optuna trial. Only used to register the ``'ph'`` switch; the
            ridge ``alpha`` is currently hard-coded.

    Returns:
        The mean of the per-fold ``roc_auc`` scores.
    """
    seed_everything(seed=2020)

    TOGGLE_BAY_CAT_ENCODER = True

    # Only the selected frame is evaluated, so the other global need not exist.
    frame = train_encoded if TOGGLE_BAY_CAT_ENCODER else train
    X = frame.drop('target', axis=1)
    y = frame['target']

    if TOGGLE_BAY_CAT_ENCODER:
        # Columns are classified by name: 'cat*' are raw categoricals,
        # '*_code' are the pre-computed encodings, the rest are numeric.
        cat_features = [c for c in X.columns
                        if c.startswith('cat') and not c.endswith('_code')]
        cat_lookup = set(cat_features)
        num_features = [c for c in X.columns
                        if c not in cat_lookup and not c.endswith('_code')]
        enc_features = [c for c in X.columns if c.endswith('_code')]
    else:
        cat_features = selector(dtype_exclude='number')(X)
        num_features = selector(dtype_include='number')(X)

    # categorical features zone
    cat_preprocessor = Pipeline(steps=[
        ('oh', OneHotEncoder(handle_unknown='ignore')),
        ('ss', StandardScaler(with_mean=False))
    ])

    # MAX_OF_CARDINALITY = trial.suggest_categorical('max_cardi', [100])
    # def get_low_cardinality_features(df):
    #     cols = df \
    #         .select_dtypes(['object', 'category']) \
    #         .apply(lambda col: col.nunique()) \
    #         .loc[lambda x: x <= MAX_OF_CARDINALITY] \
    #         .index.tolist()
    #     return df.loc[:, cols]

    # cat_low_cardi_preprocessor = Pipeline([
    #     ('cat_low', FunctionTransformer(func=get_low_cardinality_features)),
    #     ('oh', OneHotEncoder(handle_unknown='ignore')),
    #     ('ss', StandardScaler(with_mean=False))
    # ])

    # def get_high_cardinality_features(df):
    #     cols = df \
    #         .select_dtypes(['object', 'category']) \
    #         .apply(lambda col: col.nunique()) \
    #         .loc[lambda x: x > MAX_OF_CARDINALITY] \
    #         .index.tolist()
    #     return df.loc[:, cols]

    # SMOOTHING = 0.2182996635284694 # trial.suggest_float('smooth', 0.001, 1.0)
    # cat_high_cardi_preprocessor = Pipeline([
    #     ('cat_high', FunctionTransformer(func=get_high_cardinality_features)),
    #     ('te', TargetEncoder(smoothing=SMOOTHING)),
    #     ('ss', StandardScaler(with_mean=False))
    # ])

    def generate_num_polynomial(df):
        """Return a copy of ``df`` with pairwise products and squares added."""
        cols = list(df.columns)
        # Work on a copy so the frame handed in by the caller is not mutated.
        out = df.copy()
        for i, left in enumerate(cols[:-1]):
            for right in cols[i + 1:]:
                out[left + '_' + right] = out[left] * out[right]
        # NOTE(review): the last column gets no squared term. This looks like
        # an off-by-one copied from the pair loop, but it is kept as-is
        # because the hard-coded alpha below was tuned on this feature set.
        for col in cols[:-1]:
            out[col + '^2'] = out[col].pow(2)
        return out

    num_polynomial = Pipeline([
        ('interact', FunctionTransformer(func=generate_num_polynomial))
    ])

    num_polynomial_switch = trial.suggest_categorical('ph', [True])

    # numerical features zone: the interaction step is optional, the power
    # transform and scaling always run.
    num_steps = [
        ('pt', PowerTransformer(method='yeo-johnson')),
        ('ss', StandardScaler()),
    ]
    if num_polynomial_switch:
        num_steps.insert(0, ('ac', num_polynomial))
    num_preprocessor = Pipeline(steps=num_steps)

    enc_preprocessor = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson')), # I think it doesn't make sense to transform probability values.
        ('ss', StandardScaler())
    ])

    # Transformer order is cat, (enc), num - the 'enc' branch exists only
    # when the encoded frame is in use.
    transformers = [('cat', cat_preprocessor, cat_features)]
    if TOGGLE_BAY_CAT_ENCODER:
        transformers.append(('enc', enc_preprocessor, enc_features))
    # ('cat_low', cat_low_cardi_preprocessor, cat_features),
    # ('cat_high', cat_high_cardi_preprocessor, cat_features),
    transformers.append(('num', num_preprocessor, num_features))
    preprocessor = ColumnTransformer(transformers=transformers)

    # If conducting hyperparameter tuning with Optuna, uncomment the next line.
    # alpha = trial.suggest_loguniform('clf_alpha', 0.001, 10.0) # [0.001, 10] the first 200 rounds lead to best para = 9.961215980791827. [10, 1e4] the first 60 rounds lead to 9983.72346180751. [1e4, 1e8] leads to 40482.85448271827. <<--- the best alpha so far.
    model = Pipeline(steps=[
        ('prep', preprocessor),
        ('clf', RidgeClassifier(class_weight='balanced', alpha=40482.85448271827, fit_intercept=False))
    ])

    # The integer `cv=3` is what defines the folds. A previously constructed
    # but never used 2-split shuffled StratifiedKFold was removed because it
    # contradicted the value actually passed here.
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3, n_jobs=-1) # remove n_jobs=-1 to avoid "Timeout or by a memory leak."
    return scores.mean()

示例#29

0

显示文件

# can use this information to dispatch the categorical columns to the
# ``categorical_transformer`` and the remaining columns to the
# ``numerical_transformer``.

###############################################################################
# .. note:: In practice, you will have to handle yourself the column data type.
#    If you want some columns to be considered as `category`, you will have to
#    convert them into categorical columns. If you are using pandas, you can
#    refer to their documentation regarding `Categorical data
#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.

from sklearn.compose import make_column_selector as selector

# Route columns by dtype: `category` columns go to the categorical
# transformer, every other column goes to the numeric transformer.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   selector(dtype_exclude="category")),
                  ('cat', categorical_transformer,
                   selector(dtype_include="category"))])
clf = Pipeline(steps=[('preprocessor',
                       preprocessor), ('classifier', LogisticRegression())])

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# The resulting score is not exactly the same as the one from the previous
# pipeline because the dtype-based selector treats the ``pclass`` column as
# a numeric feature instead of a categorical feature as previously:

selector(dtype_exclude="category")(X_train)

示例#30

0

显示文件

文件： plot_column_transformer_mixed_types.py 项目： Scinawa/qsklearn

# `category` columns when loading the data with ``fetch_openml``. Therefore, we
# can use this information to dispatch the categorical columns to the
# ``categorical_transformer`` and the remaining columns to the
# ``numerical_transformer``.

###############################################################################
# .. note:: In practice, you will have to handle yourself the column data type.
#    If you want some columns to be considered as `category`, you will have to
#    convert them into categorical columns. If you are using pandas, you can
#    refer to their documentation regarding `Categorical data
#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.

from sklearn.compose import make_column_selector as selector

# Route columns by dtype: `category` columns go to the categorical
# transformer, every other column goes to the numeric transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))
])

# Reproduce the identical fit/score process
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE(review): `clf` is not rebuilt in this excerpt. If it was constructed
# earlier in the file it still holds the previous `preprocessor` object, not
# the one defined above - confirm this is intended.
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# Using the prediction pipeline in a grid search
###############################################################################
# Grid search can also be performed on the different preprocessing steps
# defined in the ``ColumnTransformer`` object, together with the classifier's
# hyperparameters as part of the ``Pipeline``.