def get_svr_pipeline(countries_threshold=0.97, utc_threshold=0.95, log=False):
    """Assemble the unfitted SVR pipeline.

    The pipeline has three steps: 'preprocessing' (two categorical threshold
    transformers followed by ``CalculatedPopTransformer``), 'transformations'
    (log + min-max scaling for non object/category columns, one-hot encoding
    for object/category columns) and 'model' (an ``SVR``).

    Parameters
    ----------
    countries_threshold : float
        Passed as ``threshold`` to the transformer built for 'country#cat'.
    utc_threshold : float
        Passed as ``threshold`` to the transformer built for 'utc_offset#cat'.
    log : bool
        Passed as ``log`` to both threshold transformers.

    Returns
    -------
    Pipeline
        The unfitted pipeline.
    """
    preprocessing_steps = [
        ('countries',
         CategoricalThresholdTransformer('country#cat',
                                         threshold=countries_threshold,
                                         log=log)),
        ('utc_offset',
         CategoricalThresholdTransformer('utc_offset#cat',
                                         threshold=utc_threshold,
                                         log=log)),
        ('calculated_pop', CalculatedPopTransformer()),
    ]

    # Numeric branch: log transform, then squash into [0, 1].
    numeric_branch = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('scale', MinMaxScaler()),
    ])
    # Categorical branch: one-hot, ignoring categories unseen during fit.
    categorical_branch = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_transforms = ColumnTransformer(
        transformers=[
            ('numeric_log', numeric_branch,
             selector(dtype_exclude=['object', 'category'])),
            ('categorical', categorical_branch,
             selector(dtype_include=['object', 'category'])),
        ],
        remainder='passthrough',
    )

    svr = SVR(C=0.5, epsilon=0.01, gamma='scale', cache_size=1999)

    return Pipeline(steps=[
        ('preprocessing', Pipeline(steps=preprocessing_steps)),
        ('transformations', column_transforms),
        ('model', svr),
    ])
def preprocess():
    """Return an unfitted ColumnTransformer for mixed-dtype frames.

    Non-object columns are standardised (mean and std both enabled), object
    columns are one-hot encoded (unknown categories ignored), and any column
    matched by neither selector is passed through.
    """
    transformers = [
        ('num',
         StandardScaler(with_mean=True, with_std=True),
         selector(dtype_exclude=object)),
        ('cat',
         OneHotEncoder(handle_unknown='ignore'),
         selector(dtype_include=object)),
    ]
    return ColumnTransformer(transformers=transformers,
                             remainder='passthrough')
def get_preprocessor():
    """Return an unfitted ColumnTransformer with imputation on both branches.

    * 'num': median imputation followed by standard scaling, applied to every
      column whose dtype is neither "category" nor "object".
    * 'cat': constant imputation with the value 'missing' followed by one-hot
      encoding (unknown categories ignored), applied to "category" columns.

    NOTE(review): "object" columns are excluded from 'num' and not included
    in 'cat', so they match neither selector - confirm this is intended.
    """
    median_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    fill_then_onehot = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    numeric_columns = selector(dtype_exclude=["category", "object"])
    category_columns = selector(dtype_include=["category"])

    return ColumnTransformer(transformers=[
        ('num', median_then_scale, numeric_columns),
        ('cat', fill_then_onehot, category_columns),
    ])
def prepocess(self):
    """Build the preprocessing ColumnTransformer and store it on the instance.

    Non-object columns are standardised, object columns are one-hot encoded
    (unknown categories ignored) and all remaining columns are passed
    through. The result is assigned to ``self.preprocessor``; nothing is
    returned.
    """
    transformers = [
        ('num', StandardScaler(), selector(dtype_exclude=object)),
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         selector(dtype_include=object)),
    ]
    self.preprocessor = ColumnTransformer(transformers=transformers,
                                          remainder='passthrough')
def preprocessor(self):
    """Combine the numerical and categorical transformers by column dtype.

    Returns
    -------
    ColumnTransformer
        Routes "category" columns to ``self.categorical_transformer`` and
        every other column to ``self.numerical_transformer``.
    """
    numeric_branch = (
        "num",
        self.numerical_transformer,
        selector(dtype_exclude="category"),
    )
    categorical_branch = (
        "cat",
        self.categorical_transformer,
        selector(dtype_include="category"),
    )
    return ColumnTransformer(transformers=[numeric_branch, categorical_branch])
def pre_encoder(x):
    """Build a dtype-routed preprocessor plus a matching search grid.

    Parameters
    ----------
    x : transformer
        Encoder used as the 'strings' step for the non-float columns.

    Returns
    -------
    list
        ``[parameters, pre_encode]`` where ``parameters`` is a list of two
        parameter-grid dicts (keys prefixed with ``pre_encode__``) and
        ``pre_encode`` is the unfitted ColumnTransformer.
    """

    def _impute_then(step_name, step):
        # Both branches start with a default SimpleImputer named 'miss'.
        return Pipeline(steps=[('miss', SimpleImputer()), (step_name, step)])

    str_encode = _impute_then('strings', x)
    num_encode = _impute_then('scaler', StandardScaler())

    # Float columns go to the numeric branch, everything else to the encoder.
    pre_encode = ColumnTransformer(transformers=[
        ('categoricals', str_encode, selector(dtype_exclude=['float'])),
        ('numericals', num_encode, selector(dtype_include=['float'])),
    ])

    categorical_grid = {
        'pre_encode__categoricals': [str_encode],
        'pre_encode__categoricals__miss__strategy': ['most_frequent'],
    }
    numerical_grid = {
        'pre_encode__numericals': [num_encode],
        'pre_encode__numericals__miss__strategy': ['mean', 'median',
                                                   'most_frequent'],
    }

    # Preprocessor parameters first, then the preprocessor itself.
    return [[categorical_grid, numerical_grid], pre_encode]
def create_and_run_pipeline_GDCV(X,
                                 y,
                                 param_grid,
                                 num_cv=10,
                                 clf_obj=LogisticRegression(),
                                 random_state=42):
    """Grid-search a preprocessing + classifier pipeline and report its score.

    The data is split 80/20; the grid search is fitted on the training part
    and the score of the refitted best model on the held-out part is printed.

    Parameters
    ----------
    X, y
        Features and target passed to ``train_test_split``.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``. Keys address the pipeline steps
        'preprocessor' and 'classifier'. When empty or ``None`` a default
        grid (imputer strategy and classifier ``C``) is used. Previously this
        argument was silently overwritten by that default.
    num_cv : int
        Passed as ``cv`` to ``GridSearchCV``.
    clf_obj : estimator
        Classifier used as the final pipeline step.
    random_state : int
        Seed for the train/test split.

    Returns
    -------
    Pipeline
        The best pipeline found by the search, refitted on the training
        split. Previously the untouched, unfitted pipeline was returned.
    """
    if not param_grid:
        param_grid = {
            'preprocessor__num__imputer__strategy': ['mean', 'median'],
            'classifier__C': [0.1, 1.0, 10, 100],
        }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, selector(dtype_exclude="category")),
        ('cat', categorical_transformer, selector(dtype_include="category")),
    ])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', clf_obj)])

    # GridSearchCV clones `clf`, so `clf` itself is never fitted; the fitted
    # model lives in `grid_search.best_estimator_`.
    grid_search = GridSearchCV(clf, param_grid, cv=num_cv)
    grid_search.fit(X_train, y_train)

    clf_name = str(clf_obj).split('(')[0]
    print(("best %s from grid search: %.3f" %
           (clf_name, grid_search.score(X_test, y_test))))
    return grid_search.best_estimator_
def get_linear_pipeline(alpha=1,
                        countries_threshold=0.97,
                        utc_threshold=0.95,
                        log=False):
    """Assemble the unfitted Ridge regression pipeline.

    Steps: 'preprocessing' (two categorical threshold transformers followed
    by ``CalculatedPopTransformer``), 'transformations' (log, degree-2
    polynomial features and min-max scaling for non object/category columns;
    one-hot encoding for object/category columns) and 'model' (``Ridge``).

    Parameters
    ----------
    alpha : float
        Passed as ``alpha`` to ``Ridge``.
    countries_threshold : float
        Passed as ``threshold`` to the transformer built for 'country#cat'.
    utc_threshold : float
        Passed as ``threshold`` to the transformer built for 'utc_offset#cat'.
    log : bool
        Passed as ``log`` to both threshold transformers.

    Returns
    -------
    Pipeline
        The unfitted pipeline.
    """
    preprocessing_steps = [
        ('countries',
         CategoricalThresholdTransformer('country#cat',
                                         threshold=countries_threshold,
                                         log=log)),
        ('utc_offset',
         CategoricalThresholdTransformer('utc_offset#cat',
                                         threshold=utc_threshold,
                                         log=log)),
        ('calculated_pop', CalculatedPopTransformer()),
    ]

    numeric_branch = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('poli', PolynomialFeatures(2)),
        ('scale', MinMaxScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_transforms = ColumnTransformer(
        transformers=[
            ('numeric_log', numeric_branch,
             selector(dtype_exclude=['object', 'category'])),
            ('categorical', categorical_branch,
             selector(dtype_include=['object', 'category'])),
        ],
        remainder='passthrough',
    )

    return Pipeline(steps=[
        ('preprocessing', Pipeline(steps=preprocessing_steps)),
        ('transformations', column_transforms),
        ('model', Ridge(alpha=alpha)),
    ])
def transform(self, X, y=None):
    """Ordinal-encode selected object columns on a copy of ``X``.

    A column is encoded when it has object dtype and ``len(column_name)`` is
    below ``self.threshold``.

    NOTE(review): the filter measures the length of the column *name*, not
    the number of distinct values - confirm that this is the intended rule.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    y : ignored
        Accepted for API compatibility.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which every selected column is replaced by its
        integer ordinal codes.
    """
    dX = X.copy()  # work on a copy so the caller's frame stays untouched
    enc = OrdinalEncoder()
    cats = selector(dtype_include='object')(X)
    cats_to_encode = [name for name in cats if len(name) < self.threshold]

    for col in cats_to_encode:
        codes = enc.fit_transform(dX[col].to_numpy().reshape(-1, 1))
        # Replace the column instead of writing through `.loc[:, col]`:
        # the latter sets values in place on recent pandas and leaves the
        # column with object dtype despite the cast to int.
        dX[col] = codes.ravel().astype('int')
    return dX
def q4():
    # Return the result of question 4 here.
    excluded = ['Region', 'Country']

    # Median imputation followed by standardisation for every non-category
    # column; the transformer is fitted on the module-level `df`.
    num_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    column_transformer = ColumnTransformer(transformers=[
        ('num', num_pipeline, selector(dtype_exclude="category")),
    ])
    column_transformer.fit(df.drop(excluded, axis=1))

    # Apply the fitted transformer to `df_test` and report the value at
    # row 0, column 9, rounded to three decimals.
    transformed = column_transformer.transform(
        df_test.drop(excluded, axis=1)).tolist()
    first_row = transformed[0]
    return round(first_row[9], 3)
def transform(self, X, y=None):
    """Ordinal-encode selected object columns and reassemble the frame.

    A column is encoded when it has object dtype and ``len(column_name)`` is
    below ``self.threshold``.

    NOTE(review): the filter measures the length of the column *name*, not
    the number of distinct values - confirm that this is the intended rule.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    y : ignored
        Accepted for API compatibility.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: encoded object columns, untouched object
        columns, all remaining columns - each group in the order it has in
        ``X``. The result keeps the index of ``X``.
    """
    enc = OrdinalEncoder()
    cats = selector(dtype_include='object')(X)
    cats_to_encode = [name for name in cats if len(name) < self.threshold]

    # Ordered lists rather than sets: pandas 2.x rejects sets as indexers and
    # set iteration order would make the output column order unstable.
    encoded_names = set(cats_to_encode)
    cat_names = set(cats)
    cats_to_keep = [name for name in cats if name not in encoded_names]
    nums_to_keep = [name for name in X.columns if name not in cat_names]

    # Reuse X's index so the column-wise concat below aligns row by row even
    # when X does not carry a default RangeIndex.
    encoded = pd.DataFrame(index=X.index)
    for col in cats_to_encode:
        codes = enc.fit_transform(X[col].to_numpy().reshape(-1, 1))
        encoded[col] = codes.ravel().astype('int')

    return pd.concat(
        [encoded, X.loc[:, cats_to_keep], X.loc[:, nums_to_keep]], axis=1)
def seed_everything(seed=1903):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): setting it at runtime presumably only affects interpreters
    started afterwards - confirm.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(seed=2020)
os.chdir('/kaggle/working')

# Load the competition data relative to the working directory set above.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/sample_submission.csv')

select_numeric_features = selector(dtype_include='number')
numeric_features = select_numeric_features(
    train
)  # Remember to scale features for linear models with regularization. Without regularization, linear models do not need to be scaled just for prediction.

# Keep the ids aside, then remove them from the feature frames.
train_id = train.loc[:, 'id']
test_id = test.loc[:, 'id']
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Column name lists by dtype, computed without the 'target' column.
cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))

# One-hot encode, then scale without centering (with_mean=False).
cat_preprocessor = Pipeline(steps=[('oh', OneHotEncoder(
    handle_unknown='ignore')), ('ss', StandardScaler(with_mean=False))])
# NOTE(review): the statement below is cut off in this view; left untouched.
num_preprocessor = Pipeline(steps=[('pt', PowerTransformer(
test_strat["Attrition"].value_counts(normalize=True) # Sample data train, test = train_test_split(ames, test_size=0.3, random_state=123) # Extract features and response features = train.drop(columns="Sale_Price") label = train["Sale_Price"] # SciKit-Learn does not automatically transform categorical features so we need to # apply a one-hot transformer. We will discuss this more thoroughly in the next chapter. categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer(transformers=[('cat', categorical_transformer, selector(dtype_include="object"))]) knn_fit = Pipeline(steps=[('preprocessor', preprocessor), ('knn', KNeighborsRegressor(metric='euclidean'))]) # Specify resampling strategy cv = RepeatedKFold(n_splits=10, n_repeats=5) # Create grid of hyperparameter values hyper_grid = {'knn__n_neighbors': range(3, 26)} # Tune a knn model using grid search grid_search = GridSearchCV(knn_fit, hyper_grid, cv=cv, scoring='neg_mean_squared_error') results = grid_search.fit(features, label)
# %% X_train.info() # %% [markdown] # While some features are numeric, some have been tagged as `category`. These # features need to be encoded such that our random forest can # deal with them. The simplest solution is to use an `OrdinalEncoder`. # Regarding, the numerical features, we don't need to do anything. Thus, we # will create preprocessing steps to take care of the encoding. # %% from sklearn.compose import make_column_transformer from sklearn.compose import make_column_selector as selector from sklearn.preprocessing import OrdinalEncoder categorical_selector = selector(dtype_include="category") preprocessor = make_column_transformer( (OrdinalEncoder(), categorical_selector), remainder="passthrough", ) X_train_preprocessed = pd.DataFrame( preprocessor.fit_transform(X_train), columns=( categorical_selector(X_train) + [col for col in X_train.columns if col not in categorical_selector(X_train)] ) ) X_train_preprocessed.head()
def _plot_roc_figure(figure_id, curves, title, xlim=None, ylim=None):
    """Draw one ROC figure with a chance diagonal and one line per curve.

    ``curves`` is a sequence of ``(label, fpr, tpr)`` triples, plotted in
    order. ``xlim``/``ylim`` are optional ``(low, high)`` axis limits.
    """
    plt.figure(figure_id)
    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.plot([0, 1], [0, 1], 'k--')
    for label, fpr, tpr in curves:
        plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(title)
    plt.legend(loc='best')
    plt.show()


def wrapper_feature_transformer_ensembles_trees_clf_v2(
        X,
        y,
        X_test,
        y_test,
        n_estimator=10,
        transformer=None,
        clf_obj=LogisticRegression(max_iter=1000)):
    """Compare tree-ensemble feature transformations via ROC curves.

    The features are standardised (scaler fitted on ``X``), optionally passed
    through ``transformer`` and split in half. Tree ensembles are fitted on
    the first half; for the "RF + LR" and "GBT + LR" variants a clone of
    ``clf_obj`` is trained on the second half using one-hot encoded leaf
    indices. Two figures are shown: the full ROC plot and a zoom on the
    top-left corner.

    Parameters
    ----------
    X, y
        Training features and labels.
    X_test, y_test
        Evaluation features and labels.
    n_estimator : int
        Number of trees for every ensemble.
    transformer : transformer or None
        Optional extra transformer fitted on the scaled training data.
    clf_obj : estimator
        Classifier cloned for each "+ LR" variant; never fitted itself.

    Returns
    -------
    None
    """
    scaler = StandardScaler()
    scaler.fit(X)
    x_train_scaled = scaler.transform(X)
    x_test_scaled = scaler.transform(X_test)
    clf_name = str(clf_obj).split('(')[0]

    if transformer is not None:
        transformer.fit(x_train_scaled)
        x_train_scaled = transformer.transform(x_train_scaled)
        x_test_scaled = transformer.transform(x_test_scaled)

    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        x_train_scaled, y, test_size=0.5, random_state=0)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)
    rt_clf = sklearn.base.clone(clf_obj)
    pipeline = make_pipeline(rt, rt_clf)
    pipeline.fit(X_train, y_train)
    # Predict on the scaled/transformed test data: the pipeline was trained
    # on scaled features, so the raw `X_test` must not be used here.
    y_pred_rt = pipeline.predict(x_test_scaled)
    fpr_rt_clf, tpr_rt_clf, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    rf_enc = OneHotEncoder()
    rf_clf = sklearn.base.clone(clf_obj)
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_clf.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred_rf_clf = rf_clf.predict(rf_enc.transform(rf.apply(x_test_scaled)))
    fpr_rf_clf, tpr_rf_clf, _ = roc_curve(y_test, y_pred_rf_clf)

    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_clf = sklearn.base.clone(clf_obj)
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_clf.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    y_pred_grd_clf = grd_clf.predict(
        grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))
    fpr_grd_clf, tpr_grd_clf, _ = roc_curve(y_test, y_pred_grd_clf)

    # The gradient boosted model by itself
    y_pred_grd = grd.predict(x_test_scaled)
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

    # The random forest model by itself (same scaled test data as above)
    y_pred_rf = rf.predict(x_test_scaled)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    curves = [
        ('RT + LR', fpr_rt_clf, tpr_rt_clf),
        ('RF', fpr_rf, tpr_rf),
        ('RF + LR', fpr_rf_clf, tpr_rf_clf),
        ('GBT', fpr_grd, tpr_grd),
        ('GBT + LR', fpr_grd_clf, tpr_grd_clf),
    ]
    _plot_roc_figure(1, curves, f'ROC curve - {clf_name}')
    _plot_roc_figure(2,
                     curves,
                     f'ROC curve (zoomed in at top left) - {clf_name}',
                     xlim=(0, 0.2),
                     ylim=(0.8, 1))
def get_pipeline(model):
    """
    Build the scikit-learn modeling pipeline that ends with ``model``.

    The pipeline standardises feature dtypes, preprocesses numeric and
    non-numeric columns in separate branches, removes features via
    ``VarianceThreshold`` and finally applies ``model``.

    :param model: instantiated model
    :returns: scikit-learn pipeline
    """

    def _stateless(func, **kw_args):
        # Every FunctionTransformer here skips validation and receives its
        # configuration through kw_args.
        return FunctionTransformer(func, validate=False, kw_args=kw_args)

    def _lower_clip_step(feature):
        # Step '<feature>_clipper': lower clip with cutoff 0 and new amount 0.
        return (f'{feature}_clipper',
                _stateless(clip_feature_bounds,
                           feature=feature,
                           cutoff=0,
                           new_amount=0,
                           clip_type='lower'))

    clipped_features = [
        'mouse_movement',
        'propensity_score',
        'completeness_score',
        'profile_score',
        'average_stars',
    ]
    numeric_steps = [_lower_clip_step(name) for name in clipped_features]
    numeric_steps += [
        ('ratio_creator',
         _stateless(create_ratio_column,
                    col1='profile_score',
                    col2='activity_score')),
        ('log_creator', TakeLog()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('feature_selector', SelectPercentile(f_classif)),
    ]
    numeric_transformer = Pipeline(steps=numeric_steps)

    categorical_steps = [
        ('date_transformer',
         _stateless(convert_column_to_datetime, feature='acquired_date')),
        ('month_extractor',
         _stateless(extract_month_from_date, date_col='acquired_date')),
        ('quarter_extractor',
         _stateless(convert_month_to_quarter,
                    month_col='month',
                    mapping_dict=MONTH_TO_QUARTER_DICT)),
        ('year_extractor',
         _stateless(extract_year_from_date, date_col='acquired_date')),
        ('date_dropper',
         _stateless(drop_features, feature_list=FEATURES_TO_DROP)),
        ('imputer',
         _stateless(fill_missing_values, fill_value=CATEGORICAL_FILL_VALUE)),
        ('category_combiner', CombineCategoryLevels()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('feature_selector', SelectPercentile(chi2)),
    ]
    categorical_transformer = Pipeline(steps=categorical_steps)

    # Numeric dtypes go to the numeric branch, all other dtypes to the
    # categorical branch.
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_transformer', numeric_transformer,
             selector(dtype_include='number')),
            ('categorical_transformer', categorical_transformer,
             selector(dtype_exclude='number')),
        ],
        remainder='passthrough',
    )

    data_mapper = _stateless(ensure_features_are_standardized,
                             feature_mapping=FEATURE_DTYPE_MAPPING)

    return Pipeline(steps=[
        ('data_mapper', data_mapper),
        ('preprocessor', preprocessor),
        ('variance_thresholder', VarianceThreshold()),
        ('model', model),
    ])
# %% 選出categorical features方法二 from sklearn.compose import make_column_selector as selector class dummyTransformer(BaseEstimator, TransformerMixin): def fit(self, X, y=None): return self def transform(self, X): return pd.DataFrame( X, columns=selector(dtype_include='object')(X)) # return pd.DataFrame tr4 = ColumnTransformer(transformers=[('dum', dummyTransformer(), selector(dtype_include='object'))]) tr4.fit_transform(df) # Note: returns a ndarray # %% # ANCHOR Custom transformers # ------------------------------- EXPERIMENT 3 ------------------------------- # import random import numpy as np import pandas as pd from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.base import TransformerMixin, BaseEstimator from sklearn.preprocessing import LabelEncoder, OrdinalEncoder from sklearn.pipeline import Pipeline from sklearn.compose import make_column_selector as selector
cat_pipe = make_pipeline( SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(handle_unknown="ignore"), ) # %% [markdown] # Then, we can create a preprocessor which will dispatch the categorical # columns to the categorical pipeline and the numerical columns to the # numerical pipeline # %% from sklearn.compose import make_column_transformer from sklearn.compose import make_column_selector as selector preprocessor_linear = make_column_transformer( (num_pipe, selector(dtype_include="number")), (cat_pipe, selector(dtype_include="category")), n_jobs=2, ) # %% [markdown] # Finally, we connect our preprocessor with our # :class:`~sklearn.linear_model.LogisticRegression`. We can then evaluate our # model. # %% from sklearn.linear_model import LogisticRegression lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000)) # %%
def xgboost_model(line_file, trip_file, weather_file):
    """Train an XGBoost arrival-time regressor for one line and pickle it.

    Leave-time, trip and weather data are loaded and merged, calendar and
    holiday features are derived, an ``XGBRegressor`` is tuned with a 5-fold
    grid search and the best configuration is refitted on all rows. The
    fitted pipeline (steps 'preprocessor' and 'classifier') is written to
    ``./pickle_file_XG/XG_<LINEID>.pkl``.

    Parameters
    ----------
    line_file : str
        File name inside ``./leavetimes_by_line/`` (';'-separated CSV).
    trip_file : str
        Path to the trips CSV (';'-separated).
    weather_file : str
        Path to the weather CSV; must provide ``dt_iso`` and the weather
        columns selected below.

    Returns
    -------
    None
    """
    month_numbers = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
    }
    holidays = {
        '2018-01-01', '2018-03-17', '2018-03-20', '2018-03-30', '2018-04-01',
        '2018-04-02', '2018-05-07', '2018-06-04', '2018-06-21', '2018-08-06',
        '2018-09-23', '2018-10-29', '2018-12-21', '2018-12-24', '2018-12-25',
        '2018-12-26', '2018-12-31'
    }

    def tidy_datetime(time_str):
        # Replace the first month abbreviation found (checked in calendar
        # order) by its two-digit number; other strings pass through.
        for abbrev, number in month_numbers.items():
            if abbrev in time_str:
                return time_str.replace(abbrev, number)
        return time_str

    # --- load -------------------------------------------------------------
    # Only `sep` is given: passing both `sep` and `delimiter` is rejected by
    # current pandas, and on older releases `delimiter=';'` took precedence.
    df_route = pd.read_csv('./leavetimes_by_line/' + line_file,
                           keep_default_na=True,
                           sep=';',
                           skipinitialspace=True)
    df_route = df_route.drop(columns=[
        'DATASOURCE', 'PLANNEDTIME_DEP', 'ACTUALTIME_DEP', 'PASSENGERS',
        'PASSENGERSIN', 'PASSENGERSOUT', 'DISTANCE', 'SUPPRESSED',
        'JUSTIFICATIONID', 'LASTUPDATE', 'NOTE'
    ])

    df_trips = pd.read_csv(trip_file,
                           keep_default_na=True,
                           sep=';',
                           skipinitialspace=True)
    df_trips = df_trips.drop(columns=[
        'DATASOURCE', 'TENDERLOT', 'SUPPRESSED', 'JUSTIFICATIONID', 'BASIN',
        'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR', 'LASTUPDATE',
        'NOTE'
    ])
    df_trips = df_trips.rename(
        columns={'PLANNEDTIME_DEP': 'TRIPS_PLANNEDTIME_DEP'})

    # The former `df_weather.drop([...], 1)` discarded its result and was a
    # no-op; the explicit column selection below does the actual pruning.
    df_weather = pd.read_csv(weather_file)
    df_weather['date'] = pd.to_datetime(df_weather['dt_iso'].str.replace(
        "+0000 UTC", "", regex=False))
    df_weather = df_weather[[
        'date', 'temp', 'feels_like', 'pressure', 'humidity', 'wind_speed',
        'clouds_all', 'weather_main'
    ]].copy()
    df_weather['weather_main'] = df_weather['weather_main'].astype('category')

    # --- merge route and trip data -----------------------------------------
    df = pd.merge(df_route, df_trips, on=['DAYOFSERVICE', 'TRIPID', 'ROUTEID'])
    df['TRIPID'] = df['TRIPID'].astype('object')
    df = df[[
        'DAYOFSERVICE', 'LINEID', 'ROUTEID', 'DIRECTION', 'TRIPID',
        'PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR', 'ACTUALTIME_ARR',
        'VEHICLEID', 'TRIPS_PLANNEDTIME_DEP'
    ]].copy()

    # --- calendar features ---------------------------------------------------
    df['DAYOFSERVICE'] = pd.to_datetime(
        df['DAYOFSERVICE'].apply(tidy_datetime), format='%d-%m-%y %H:%M:%S')
    # Planned departure is stored as seconds after the service day start.
    df['timestamp'] = df['DAYOFSERVICE'] + pd.to_timedelta(
        df['TRIPS_PLANNEDTIME_DEP'], unit='s')
    df = df.sort_values(["timestamp",
                         "PROGRNUMBER"]).reset_index(drop=True)
    df['DAYOFWEEK'] = df['timestamp'].dt.dayofweek
    df['MONTH'] = df['timestamp'].dt.month
    df['DAY'] = df['timestamp'].dt.day

    # Join the weather record of the nearest full hour.
    df['date'] = df['timestamp'].dt.round('H')
    df = pd.merge(df, df_weather, on=['date'])
    df = df.drop(columns=['date'])

    df['HOLIDAY'] = df['DAYOFSERVICE'].dt.strftime('%Y-%m-%d').isin(
        holidays).astype(int)

    # --- dtypes and feature selection ----------------------------------------
    df1 = df.reset_index(drop=True)
    category_columns = [
        'TRIPID', 'STOPPOINTID', 'VEHICLEID', 'LINEID', 'ROUTEID',
        'DIRECTION', 'DAYOFWEEK', 'MONTH', 'DAY', 'HOLIDAY', 'weather_main'
    ]
    for column in category_columns:
        df1[column] = df1[column].astype('category')
    df1['PROGRNUMBER'] = df1['PROGRNUMBER'].astype('int64')
    df1['clouds_all'] = df1['clouds_all'].astype('float64')

    df1 = df1[[
        'DAYOFSERVICE', 'LINEID', 'ROUTEID', 'DIRECTION', 'TRIPID',
        'PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR', 'ACTUALTIME_ARR',
        'VEHICLEID', 'TRIPS_PLANNEDTIME_DEP', 'timestamp', 'DAYOFWEEK', 'DAY',
        'HOLIDAY', 'temp', 'feels_like', 'clouds_all', 'weather_main'
    ]]
    df_rev = df1.drop(columns=[
        'DAYOFSERVICE', 'TRIPID', 'PLANNEDTIME_ARR', 'STOPPOINTID',
        'timestamp', 'DAY', 'VEHICLEID'
    ])
    features = df_rev.drop(columns=['ACTUALTIME_ARR'])
    target = df_rev['ACTUALTIME_ARR']

    # --- model ---------------------------------------------------------------
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, selector(dtype_exclude="category")),
        ('cat', categorical_transformer, selector(dtype_include="category")),
    ])

    param_grid = {
        'colsample_bytree': [0.1, 0.5, 0.8, 1],
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'max_depth': [5, 10, 15],
        'n_estimators': [50, 100, 150, 200]
    }
    grid_search = Pipeline(
        steps=[('preprocessor', preprocessor),
               ('grid_search', GridSearchCV(XGBRegressor(), param_grid, cv=5))])
    grid_search.fit(features, target)

    # Best combination by mean cross-validated score.
    best_params = grid_search['grid_search'].best_params_
    clf_XG = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', XGBRegressor(**best_params))])
    clf_XG.fit(features, target)

    # Name the pickle after the line id of the first row; the f-string also
    # copes with non-string ids.
    line_id = df_rev['LINEID'].iloc[0]
    joblib.dump(clf_XG, f'./pickle_file_XG/XG_{line_id}.pkl')
# ```python # categories = [data[column].unique() # for column in data[categorical_columns]] # OrdinalEncoder(categories=categories) # ``` # %% import pandas as pd df = pd.read_csv("../datasets/adult-census.csv") # %% target_name = "class" target = df[target_name] data = df.drop(columns=[target_name, "fnlwgt"]) # %% from sklearn.compose import make_column_selector as selector categorical_columns_selector = selector(dtype_include=object) categorical_columns = categorical_columns_selector(data) data_categorical = data[categorical_columns] # %% from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OrdinalEncoder from sklearn.linear_model import LogisticRegression # Write your code here.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
# NOTE(review): presumably needed to make the import on the next line
# available - confirm against the installed scikit-learn version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

df = pd.read_csv("../datasets/adult-census.csv")

# %%
# Separate the target (as a NumPy array) and drop "fnlwgt" from the features.
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=[target_name, "fnlwgt"])

# %%
from sklearn.compose import make_column_selector as selector

# Split the column names by dtype: int/float versus everything else.
numerical_columns_selector = selector(dtype_include=["int", "float"])
categorical_columns_selector = selector(dtype_exclude=["int", "float"])
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# Unique values of each categorical column, in column order.
categories = [
    data[column].unique() for column in data[categorical_columns]]

# %% [markdown]
# ## Reference pipeline (no numerical scaling and integer-coded categories)
#
# First let's time the pipeline we used in the main notebook to serve as a reference:

# %%
# %%time
def transform(self, X):
    """Return ``X`` as a DataFrame limited to its object-dtype columns."""
    object_columns = selector(dtype_include='object')(X)
    frame = pd.DataFrame(X, columns=object_columns)
    return frame
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "fnlwgt", "education-num"])
# NOTE(review): `train_size=0.2` keeps only 20% of the rows for training -
# confirm this is deliberate rather than a swapped `test_size`.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Integer-code the categories; unknown categories are encoded as -1.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
# Object columns are encoded, all remaining columns are passed through.
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([("preprocessor", preprocessor),
                  ("classifier",
                   HistGradientBoostingClassifier(random_state=42))])

# %% [markdown]
#
# Use the previously defined model (called `model`) and using two nested `for`
    # NOTE(review): this fragment starts inside a call that opens outside
    # this view; given the name used below it is presumably the
    # `numeric_transformer` pipeline - confirm.
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)
# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen during fit.
categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)
# "category" columns go to the categorical branch, all others to the
# numeric branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)
# Preprocessing followed by a shallow decision tree.
complete_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "estimator",
            DecisionTreeClassifier(min_samples_leaf=10, max_depth=4),
        ),
    ]
)
complete_pipeline.fit(X_train, y_train_true)
import pandas as pd

df = pd.read_csv("../datasets/adult-census.csv")
# Separate the target and drop the "fnlwgt" column from the features.
target_name = "class"
target = df[target_name]
data = df.drop(columns=[target_name, "fnlwgt"])

# %% [markdown]
# We only keep numerical features

# %%
from sklearn.compose import make_column_selector as selector

# "Numerical" here means every column whose dtype is not object.
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)
numerical_columns
data_numeric = data[numerical_columns]

# %% [markdown]
# We do a train-test split for evaluation

# %%
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data_numeric, target, random_state=42)

# %% [markdown]
    # NOTE(review): this fragment starts inside a statement that opens
    # outside this view; given the name used below it presumably finishes
    # the `numeric_transformer` pipeline - confirm.
    SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_features = [
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]
# Categorical branch: ordinal-encode first, then one-hot encode the codes
# (categories unseen during fit are ignored by the one-hot step).
categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    #('imputer', SimpleImputer(strategy='constant', fill_value='mode')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Object columns go to the categorical branch, all others to the numeric one.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   selector(dtype_exclude='object')),
                  ('cat', categorical_transformer,
                   selector(dtype_include='object'))])
total_features = [
    'income', 'loan_amount', 'tract_minority_population_percent',
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]

# In[22]:

#Model Training and Testing
#Select Features for models
# NOTE(review): the statement below is cut off in this view; left untouched.
X = fin_data[[
from sklearn.preprocessing import StandardScaler

categorical_preprocessor = OneHotEncoder()
numerical_preprocessor = StandardScaler()

# Subsequently, create a `ColumnTransformer` to redirect the specific columns
# to a preprocessing pipeline.

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

# Object columns are one-hot encoded, numeric columns are standardised and
# anything else is passed through.
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object)),
     ('num-preprocessor', numerical_preprocessor,
      selector(dtype_include='number'))],
    remainder='passthrough', sparse_threshold=0)

# Finally, concatenate the preprocessing pipeline with a logistic regression.

# %%
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression())

# Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning
def ridgeCLF_objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a ridge classifier.

    Builds a preprocessing + ``RidgeClassifier`` pipeline and scores it with
    ``cross_val_score`` (``scoring='roc_auc'``, ``cv=3``).

    The function reads the module-level frames ``train_encoded`` (used when
    ``TOGGLE_BAY_CAT_ENCODER`` is true) or ``train``; both must contain a
    ``'target'`` column.

    Args:
        trial: The Optuna trial. It is only used to register the ``'ph'``
            switch, whose sole choice is ``True``; ``alpha`` is hard-coded to
            the best value found so far (see the tuning notes below).

    Returns:
        The mean of the per-fold ROC AUC scores.
    """
    seed_everything(seed=2020)

    TOGGLE_BAY_CAT_ENCODER = True

    # Pick the training frame once and derive the column groups from it.
    if TOGGLE_BAY_CAT_ENCODER:
        X = train_encoded.drop('target', axis=1)
        y = train_encoded['target']
        # Columns ending in '_code' go to the 'enc' branch; of the rest,
        # those starting with 'cat' are categorical and the others numeric.
        enc_features = [c for c in X.columns if c.endswith('_code')]
        cat_features = [
            c for c in X.columns
            if c.startswith('cat') and not c.endswith('_code')
        ]
        num_features = [
            c for c in X.columns
            if not c.startswith('cat') and not c.endswith('_code')
        ]
    else:
        X = train.drop('target', axis=1)
        y = train['target']
        enc_features = []
        cat_features = selector(dtype_exclude='number')(X)
        num_features = selector(dtype_include='number')(X)

    # categorical features zone
    # NOTE: an earlier experiment split the categorical columns by cardinality
    # (one-hot for low cardinality, TargetEncoder with
    # smoothing=0.2182996635284694 for high cardinality). It is disabled.
    cat_preprocessor = Pipeline(steps=[
        ('oh', OneHotEncoder(handle_unknown='ignore')),
        # with_mean=False: scale without centering.
        ('ss', StandardScaler(with_mean=False)),
    ])

    def generate_num_polynomial(X):
        """Return a copy of ``X`` extended with pairwise products and squares.

        The frame handed in by ``FunctionTransformer`` is left untouched.
        """
        cols = X.columns
        out = X.copy()
        # Pairwise interaction terms, named '<left>_<right>'.
        for i, left in enumerate(cols[:-1]):
            for right in cols[i + 1:]:
                out[left + '_' + right] = out[left] * out[right]
        # NOTE(review): the last column is never squared. This looks like an
        # off-by-one, but it is kept because the hard-coded alpha below was
        # tuned against exactly this feature set - confirm before changing.
        for name in cols[:-1]:
            out[name + '^2'] = out[name].pow(2)
        return out

    num_polynomial = Pipeline([
        ('interact', FunctionTransformer(func=generate_num_polynomial)),
    ])
    num_polynomial_switch = trial.suggest_categorical('ph', [True])

    # numerical features zone
    num_steps = [
        ('pt', PowerTransformer(method='yeo-johnson')),
        ('ss', StandardScaler()),
    ]
    if num_polynomial_switch:
        num_steps.insert(0, ('ac', num_polynomial))
    num_preprocessor = Pipeline(steps=num_steps)

    enc_preprocessor = Pipeline(steps=[
        # I think it doesn't make sense to transform probability values.
        ('pt', PowerTransformer(method='yeo-johnson')),
        ('ss', StandardScaler()),
    ])

    # The 'enc' branch only exists when the encoded frame is in use.
    transformers = [('cat', cat_preprocessor, cat_features)]
    if TOGGLE_BAY_CAT_ENCODER:
        transformers.append(('enc', enc_preprocessor, enc_features))
    transformers.append(('num', num_preprocessor, num_features))
    preprocessor = ColumnTransformer(transformers=transformers)

    # To tune alpha with Optuna, uncomment the next line and pass ``alpha``
    # to RidgeClassifier.
    # alpha = trial.suggest_loguniform('clf_alpha', 0.001, 10.0)
    # Tuning notes:
    #   [0.001, 10]: the first 200 rounds gave a best value of 9.961215980791827
    #   [10, 1e4]:   the first 60 rounds gave 9983.72346180751
    #   [1e4, 1e8]:  gave 40482.85448271827  <<--- the best lambda so far
    model = Pipeline(steps=[
        ('prep', preprocessor),
        ('clf', RidgeClassifier(class_weight='balanced',
                                alpha=40482.85448271827,
                                fit_intercept=False)),
    ])

    # An integer ``cv`` on a classifier with a binary or multiclass target
    # makes scikit-learn use an unshuffled StratifiedKFold.
    # remove n_jobs=-1 to avoid "Timeout or by a memory leak."
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3, n_jobs=-1)
    return scores.mean()
# can use this information to dispatch the categorical columns to the
# ``categorical_transformer`` and the remaining columns to the
# ``numerical_transformer``.

###############################################################################
# .. note:: In practice, you will have to handle yourself the column data type.
#    If you want some columns to be considered as `category`, you will have to
#    convert them into categorical columns. If you are using pandas, you can
#    refer to their documentation regarding `Categorical data
#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.

from sklearn.compose import make_column_selector as selector

# Columns are routed by dtype: ``category`` columns go to the categorical
# transformer, every other column goes to the numeric transformer.
non_category_selector = selector(dtype_exclude="category")
category_selector = selector(dtype_include="category")

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, non_category_selector),
    ('cat', categorical_transformer, category_selector),
])
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# The resulting score is not exactly the same as the one from the previous
# pipeline because the dtype-based selector treats the ``pclass`` column as
# a numeric feature instead of a categorical feature as previously:

non_category_selector(X_train)
# `category` columns when loading the data with ``fetch_openml``. Therefore, we
# can use this information to dispatch the categorical columns to the
# ``categorical_transformer`` and the remaining columns to the
# ``numerical_transformer``.

###############################################################################
# .. note:: In practice, you will have to handle yourself the column data type.
#    If you want some columns to be considered as `category`, you will have to
#    convert them into categorical columns. If you are using pandas, you can
#    refer to their documentation regarding `Categorical data
#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.

from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))
])

# Swap the dtype-based preprocessor into the existing pipeline. Rebinding the
# ``preprocessor`` name alone does not change ``clf``, so without this step
# the fit below would still use the preprocessor ``clf`` was built with.
# NOTE(review): assumes ``clf`` is a Pipeline whose preprocessing step is
# named 'preprocessor' - confirm against its definition earlier in the file.
clf.set_params(preprocessor=preprocessor)

# Reproduce the identical fit/score process
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# Using the prediction pipeline in a grid search
###############################################################################
# Grid search can also be performed on the different preprocessing steps
# defined in the ``ColumnTransformer`` object, together with the classifier's
# hyperparameters as part of the ``Pipeline``.