def fit_model(X, y, classifier_settings=None, fit_settings=None):
    # Avoid mutable default arguments.
    classifier_settings = classifier_settings or {}
    fit_settings = fit_settings or {}

    strategy = classifier_settings.get("numerical_impute_strategy")
    if strategy is None:
        raise ValueError("Missing impute strategy for numerical features")
    numeric_transformer = Pipeline(
        steps=[("num_impute", SimpleImputer(strategy=strategy))])

    strategy = classifier_settings.get("categorical_impute_strategy")
    if strategy is None:
        raise ValueError("Missing impute strategy for categorical features")
    categorical_transformer = Pipeline(steps=[
        ("cat_impute", SimpleImputer(strategy=strategy)),
        ("ohe", OneHotEncoder(drop="if_binary", handle_unknown="error")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer,
         make_column_selector(dtype_include=np.number)),
        ("cat", categorical_transformer,
         make_column_selector(dtype_exclude=np.number)),
    ])

    clf = Pipeline(steps=[("preprocessor", preprocessor),
                          ("classifier", LogisticRegression())])
    # Pipeline fit params must use the `classifier__<param>` convention;
    # the original accepted fit_settings but never used it.
    clf.fit(X, y, **fit_settings)
    return clf
def fit_model(X, y, classifier_settings=None, fit_settings=None):
    # Avoid mutable default arguments.
    classifier_settings = classifier_settings or {}
    fit_settings = fit_settings or {}

    # IdentityTransformer is assumed to be defined elsewhere in the module.
    numeric_transformer = Pipeline(steps=[("identity", IdentityTransformer())])
    categorical_transformer = Pipeline(steps=[
        ("ohe", OneHotEncoder(drop="if_binary", handle_unknown="error")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer,
         make_column_selector(dtype_include=np.number)),
        ("cat", categorical_transformer,
         make_column_selector(dtype_exclude=np.number)),
    ])

    xgb_clf = xgb.sklearn.XGBClassifier(**classifier_settings)
    clf = Pipeline(steps=[("preprocessor", preprocessor),
                          ("classifier", xgb_clf)])
    clf.fit(X, y, **fit_settings)
    return clf
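# Usage sketch for the SimpleImputer + LogisticRegression fit_model variant
# above (the two definitions share a name, so this assumes that variant is
# the one in scope). Toy frame and settings invented for illustration.
import pandas as pd

X_demo = pd.DataFrame({
    "age": [25.0, 32.0, 47.0, 51.0],
    "city": ["NY", "SF", "NY", "LA"],
})
y_demo = [0, 1, 0, 1]

clf = fit_model(
    X_demo, y_demo,
    classifier_settings={
        "numerical_impute_strategy": "mean",
        "categorical_impute_strategy": "most_frequent",
    },
)
print(clf.predict(X_demo))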
def test_ColumnTransformer_with_selector():
    expected = pd.DataFrame({
        "name": [
            "hotel", "hotel",
            "meal", "meal", "meal",
            "lead_time", "average_daily_rate",
        ],
        "feature": [
            "x0_City_Hotel", "x0_Resort_Hotel",
            "x1_BB", "x1_HB", "x1_SC",
            "lead_time", "average_daily_rate",
        ],
    })
    preprocess = make_column_transformer(
        (OneHotEncoder(sparse=False),
         make_column_selector(dtype_include=object)),
        # make_column_transformer takes (transformer, columns) pairs; the
        # stray `numeric` element in the original 3-tuple was a bug.
        (StandardScaler(), make_column_selector(dtype_exclude=object)),
    )
    preprocess.fit(X)
    assert feat(preprocess, X.columns).equals(expected)
def build_pipeline(X_train):
    # Collect the observed category levels per non-numeric column so the
    # encoder gets an explicit category list.
    categorical_values = []
    cat_subset = X_train.select_dtypes(include=['object', 'category', 'bool'])
    for i in range(cat_subset.shape[1]):
        categorical_values.append(list(cat_subset.iloc[:, i].dropna().unique()))

    # Note: date_pipeline is built here but not wired into the
    # ColumnTransformer below.
    date_pipeline = Pipeline([('dateFeatures', process.DateTransform())])

    num_pipeline = Pipeline([('cleaner', SimpleImputer()),
                             ('scaler', StandardScaler())])

    cat_pipeline = Pipeline([
        ('cleaner', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse=False, categories=categorical_values))
    ])

    preprocessor = ColumnTransformer([
        ('numerical', num_pipeline,
         make_column_selector(dtype_exclude=['object', 'category', 'bool'])),
        ('categorical', cat_pipeline,
         make_column_selector(dtype_include=['object', 'category', 'bool']))
    ])

    return preprocessor
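# Usage sketch for build_pipeline (toy frame invented for illustration; the
# NaNs exercise both imputers). Since the encoder is fitted with the explicit
# category lists collected above, unseen categories at transform time raise.
import pandas as pd

X_train_demo = pd.DataFrame({
    "price": [10.0, 12.5, None, 9.0],
    "color": ["red", "blue", "red", None],
})
pre = build_pipeline(X_train_demo)
X_enc = pre.fit_transform(X_train_demo)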
def test_column_transformer_with_make_column_selector():
    # Functional test for column transformer + column selector
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(
        {
            # np.int / np.float were removed from NumPy; use sized aliases.
            'col_int': np.array([0, 1, 2], dtype=np.int64),
            'col_float': np.array([0.0, 1.0, 2.0], dtype=np.float64),
            'col_cat': ["one", "two", "one"],
            'col_str': ["low", "middle", "high"]
        }, columns=['col_int', 'col_float', 'col_cat', 'col_str'])
    X_df['col_str'] = X_df['col_str'].astype('category')

    cat_selector = make_column_selector(dtype_include=['category', object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer((ohe, cat_selector),
                                          (scaler, num_selector))
    ct_direct = make_column_transformer((ohe, ['col_cat', 'col_str']),
                                        (scaler, ['col_float', 'col_int']))

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)
def set_pipeline(self):
    self.pipeline = self.kwargs.get("pipeline", None)

    # Create a temp folder for caching fitted transformers
    cachedir = mkdtemp()

    # Pipeline structure
    num_transformer = MinMaxScaler()
    cat_transformer = OneHotEncoder(handle_unknown='ignore')

    feateng_blocks = [
        ("num_transformer", num_transformer,
         make_column_selector(dtype_include=['int', 'float'])),
        ("cat_transformer", cat_transformer,
         make_column_selector(dtype_include=['object', 'bool'])),
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    # Combine preprocessing and model; `memory` avoids refitting transformers
    # during cross-validation or grid searches.
    self.pipeline = Pipeline(
        steps=[('features', features_encoder),
               ('model', self.get_estimator())],
        memory=cachedir,
    )

    # NOTE: clear the cache directory only once cross-validation has
    # finished; the original called rmtree(cachedir) right here, which
    # deletes the cache before it is ever used.
    # rmtree(cachedir)
def features1():
    return [
        ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)),
        ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)),
        (
            "moving_average_3",
            MovingAverage(window_size=3),
            make_column_selector(dtype_include=np.number),
        ),
    ]
def columns_transform():
    return make_column_transformer(
        (
            StandardScaler(),
            # Numeric columns except those matching `crashYear`
            # (negative lookahead).
            make_column_selector("^(?!crashYear)", dtype_include=np.number),
        ),
        (
            OneHotEncoder(handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
    )
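# Usage sketch: the pattern "^(?!crashYear)" keeps every numeric column
# except crashYear, which is therefore dropped entirely (the default
# remainder of make_column_transformer). Toy frame invented for illustration.
import pandas as pd

df_demo = pd.DataFrame({
    "crashYear": [2019, 2020, 2021],
    "speed": [50.0, 65.0, 40.0],
    "region": ["north", "south", "north"],
})
out = columns_transform().fit_transform(df_demo)  # scaled speed + one-hot region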
def feature_selection(X: pd.DataFrame, y: pd.DataFrame):
    # chi2 requires non-negative numeric input, so the categorical columns
    # are expected to hold numeric codes.
    fe_column_transformer = ColumnTransformer(
        transformers=[
            ('numeric', SelectKBest(score_func=f_classif, k="all"),
             make_column_selector(dtype_include=np.number)),
            ('categorical', SelectKBest(score_func=chi2, k="all"),
             make_column_selector(dtype_include="category")),
        ])
    fe_column_transformer.fit(X, y)
    X = fe_column_transformer.transform(X)
    return X, y
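# Usage sketch (toy data invented for illustration). Because chi2 only
# accepts non-negative numeric input, the "category" column holds integer
# codes rather than strings.
import pandas as pd

X_demo = pd.DataFrame({
    "height": [1.6, 1.8, 1.7, 1.9],
    "group": pd.Series([0, 1, 1, 0], dtype="category"),
})
y_demo = pd.Series([0, 1, 1, 0])
X_sel, y_sel = feature_selection(X_demo, y_demo)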
def dataset_transform():
    return ColumnTransformer(
        [
            ("scaler", StandardScaler(),
             make_column_selector(dtype_include="number")),
            (
                "encoder",
                OneHotEncoder(handle_unknown="ignore"),
                make_column_selector(dtype_include=object),
            ),
        ],
        remainder="passthrough",
    )
def __init__(self, X, y):
    '''
    Creates the train/test split arrays
        self.X_train
        self.X_test
        self.y_train
        self.y_test

    Also creates a simple preprocessing object: a column transformer with
    two branches able to automatically handle numerical and categorical data.
        Default numerical SimpleImputer uses strategy 'mean'
        Default categorical SimpleImputer uses fill value 'Other'
        self.preprocessing
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test

    # Basic preprocessing pipeline, following:
    # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

    # Numerical pipeline
    numeric_transformer = Pipeline(
        steps=[('num_imputer', SimpleImputer(strategy='mean')),
               ('num_scaler', StandardScaler())])

    # Categorical pipeline
    categorical_transformer = Pipeline(
        steps=[('cat_imputer',
                SimpleImputer(strategy='constant', fill_value='Other')),
               ('cat_onehot', OneHotEncoder(handle_unknown='ignore')),
               ('cat_scaler', StandardScaler(with_mean=False))])

    # Preprocessing column transformer
    preprocessing = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       make_column_selector(dtype_include=np.number)),
                      ('cat', categorical_transformer,
                       make_column_selector(dtype_include='object'))])

    self.preprocessing = preprocessing
def __init__(self, return_df=True):
    self.return_df = return_df
    self.impute_median = SimpleImputer(strategy='median')
    self.impute_const = SimpleImputer(strategy='constant')
    self.ss = StandardScaler()
    self.ohe = OneHotEncoder(handle_unknown='ignore')
    self.num_cols = make_column_selector(dtype_include='number')
    self.cat_cols = make_column_selector(dtype_exclude='number')
    # Fixed attribute typo: was `self.prerocessor`.
    self.preprocessor = make_column_transformer(
        (make_pipeline(self.impute_median, self.ss), self.num_cols),
        (make_pipeline(self.impute_const, self.ohe), self.cat_cols),
    )
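# Usage sketch (assumption: this __init__ belongs to a small wrapper class,
# hypothetically named BasicPreprocessor here; toy frame invented for
# illustration).
import pandas as pd

prep = BasicPreprocessor()
X_demo = pd.DataFrame({
    "amount": [1.0, None, 3.0],
    "kind": ["a", "b", None],
})
X_out = prep.preprocessor.fit_transform(X_demo)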
def get_pipeline(model, impute_cat='default', impute_num='default',
                 scale='default', onehot='default', remove_outliers='default'):
    # In essence this splits the input into a categorical pipeline and a
    # numeric pipeline, merged with a ColumnTransformer. On top, a model is
    # plugged in (within OutlierExtractor if remove_outliers is set).
    # This works very nicely!
    cat_steps = []
    if impute_cat == 'default':
        cat_steps.append(('impute_cat',
                          DFSimpleImputer(strategy='constant',
                                          fill_value='None')))
    elif impute_cat:
        cat_steps.append(('impute_cat', impute_cat))
    if onehot == 'default':
        cat_steps.append(('cat_to_num', DFGetDummies()))
        # equal to: ('cat_to_num', DFOneHotEncoder(handle_unknown="ignore"))
    elif onehot:
        cat_steps.append(('cat_to_num', onehot))
    categorical_transformer = Pipeline(steps=cat_steps)

    num_steps = []
    if impute_num == 'default':
        num_steps.append(('impute_num', DFSimpleImputer(strategy='mean')))
    elif impute_num:
        num_steps.append(('impute_num', impute_num))
    if scale == 'default':
        num_steps.append(('scale_num', DFStandardScaler()))
    elif scale:
        num_steps.append(('scale_num', scale))
    numeric_transformer = Pipeline(steps=num_steps)

    col_trans = DFColumnTransformer(transformers=[
        ('numeric', numeric_transformer,
         make_column_selector(dtype_include=np.number)),
        ('category', categorical_transformer,
         make_column_selector(dtype_exclude=np.number)),
    ])

    # `memory` is assumed to be defined at module level (joblib cache).
    preprocessor = Pipeline(steps=[('col_trans', col_trans)], memory=memory)

    final_pipe = [('preprocess', preprocessor)]
    if remove_outliers == 'default':
        final_pipe.append(('model', model))
    elif remove_outliers:
        final_pipe.append(('model', remove_outliers))
        # e.g. DFOutlierExtractor(model, corruption=0.005)

    return Pipeline(steps=final_pipe)
def run_cv_model(model, X, y):
    '''
    Runs cross-validation for a model and dataset
    :param model: model to run
    :param X: features
    :param y: target
    :return: array of per-fold test scores
    '''
    one_hot_encoder = make_column_transformer(
        (OneHotEncoder(sparse=False, handle_unknown='ignore'),
         make_column_selector(dtype_include='category')),
        remainder='passthrough')

    pipeline = make_pipeline(one_hot_encoder, model)
    cv_results = cross_validate(pipeline, X, y, cv=4,
                                scoring='neg_root_mean_squared_error',
                                verbose=1, n_jobs=6)

    print("Model: " + str(model))
    print("test_score")
    print(cv_results['test_score'])
    print("average: ", np.average(cv_results['test_score']))
    return cv_results['test_score']
def __init__(self, max_features_to_select=0, n_jobs: int = -1):
    """Initialize the class.

    Parameters
    ----------
    max_features_to_select
        Maximum number of features for mRMRe selection; if 0 (default),
        no feature selection is performed.
    n_jobs
        Number of parallel processes to use for cross-validation.
        If `n_jobs == -1` (default), use all available CPUs.
    """
    self.max_features_to_select = max_features_to_select
    self.n_jobs = n_jobs

    transformer = ColumnTransformer(
        [('scale', StandardScaler(),
          make_column_selector(dtype_include=np.floating))],
        remainder="passthrough")
    logistic = LogisticRegressionCV(class_weight="balanced",
                                    scoring="roc_auc",
                                    solver="lbfgs",
                                    max_iter=1000,
                                    n_jobs=self.n_jobs)
    if self.max_features_to_select > 0:
        select = SelectMRMRe()
        pipe = make_pipeline(transformer, select, logistic)
        param_grid = {"selectmrmre__n_features":
                      np.arange(2, self.max_features_to_select + 1)}
        self.model = GridSearchCV(pipe, param_grid, n_jobs=self.n_jobs)
    else:
        self.model = make_pipeline(transformer, logistic)
def __init__(self, max_features_to_select=0, n_jobs: int = -1):
    """Initialize the class.

    Parameters
    ----------
    max_features_to_select
        Maximum number of features for mRMRe selection; if 0 (default),
        no feature selection is performed.
    n_jobs
        Number of parallel processes to use for cross-validation.
        If `n_jobs == -1` (default), use all available CPUs.
    """
    self.max_features_to_select = max_features_to_select
    self.n_jobs = n_jobs

    self.transformer = ColumnTransformer(
        [('scale', StandardScaler(),
          make_column_selector(dtype_include=np.floating))],
        remainder="passthrough")

    CoxRegression = sklearn_adapter(CoxPHFitter, event_col="death",
                                    predict_method="predict_partial_hazard")
    cox = CoxRegression(step_size=0.5)
    param_grid = {"sklearncoxphfitter__penalizer": 10.0**np.arange(-2, 3)}
    if self.max_features_to_select > 0:
        select = SelectMRMRe(target_col="death")
        # can't put CoxRegression in the pipeline since sklearn
        # transformers cannot return data frames
        pipe = make_pipeline(select, cox)
        param_grid["selectmrmre__n_features"] = np.arange(
            2, self.max_features_to_select + 1)
    else:
        pipe = make_pipeline(cox)
    # XXX lifelines sklearn adapter does not support parallelization
    # for now, need to find a better workaround
    self.model = GridSearchCV(pipe, param_grid, n_jobs=1)
def full_training(model, X, y):
    '''
    Trains a model over a full split. Prints errors and plots estimations
    for the test set.
    :param model: model to train
    :param X: features
    :param y: labels
    :return: None
    '''
    one_hot_encoder = make_column_transformer(
        (OneHotEncoder(sparse=False, handle_unknown='ignore'),
         make_column_selector(dtype_include='category')),
        remainder='passthrough')
    pipeline = make_pipeline(one_hot_encoder, model)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    model_predict = pipeline.fit(X_train, y_train)
    predictions = model_predict.predict(X_test)

    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_test, predictions))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_test, predictions))

    # Newer seaborn versions require keyword arguments here.
    sns.regplot(x=y_test, y=predictions)
    st.pyplot()
def select_dtype_data(df, dtype: Union[str, list] = 'NUMERIC',
                      return_type='pandas'):
    # `num_selector` and `cat_selector` are module-level column transformers.
    if isinstance(dtype, str):
        if dtype == 'NUMERIC':
            num_ndarr = num_selector.fit_transform(df)
            if return_type == 'pandas':
                return pd.DataFrame(num_ndarr,
                                    columns=num_selector.get_feature_names())
            return num_ndarr
        if dtype == 'CATEGORICAL':
            cat_ndarr = cat_selector.fit_transform(df)
            if return_type == 'pandas':
                return pd.DataFrame(cat_ndarr,
                                    columns=cat_selector.get_feature_names())
            return cat_ndarr
    else:
        data_selector = make_column_transformer(
            ('passthrough', make_column_selector(dtype_include=dtype)),
            remainder='drop')
        data_ndarr = data_selector.fit_transform(df)
        if return_type == 'pandas':
            return pd.DataFrame(data_ndarr,
                                columns=data_selector.get_feature_names())
        return data_ndarr
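# Assumption sketch: `num_selector` and `cat_selector` are not defined in the
# snippet above; they plausibly look like the module-level passthrough
# transformers below (this also assumes a scikit-learn version whose
# ColumnTransformer still provides get_feature_names for 'passthrough').
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector

num_selector = make_column_transformer(
    ('passthrough', make_column_selector(dtype_include=np.number)),
    remainder='drop')
cat_selector = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=np.number)),
    remainder='drop')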
def __init__(self, p: int, horizon: int):
    features = [
        (f"s{i}", Shift(i), make_column_selector(dtype_include=np.number))
        for i in range(1, p + 1)
    ]
    model = GAR(LinearRegression())
    super().__init__(features=features, horizon=horizon, model=model)
def __init__(self, data_fn=None, df=None, **kwargs) -> None:
    self._logging = watexlog().get_watex_logger(self.__class__.__name__)

    self._data_fn = data_fn
    self._df = df

    self.categorial_features = kwargs.pop('categorial_features', None)
    self.numerical_features = kwargs.pop('numerical_features', None)

    self.target = kwargs.pop('target', 'flow')
    self._drop_features = kwargs.pop('drop_features', ['lwi'])
    self.random_state = kwargs.pop('random_state', 0)
    self.default_estimator = kwargs.pop('default_estimator', 'svc')

    self._df_cache = None
    self._features = None

    self.y = None
    self.X = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None

    self._num_column_selector = make_column_selector(dtype_include=np.number)
    self._cat_colum_selector = make_column_selector(dtype_exclude=np.number)
    self._features_engineering = PolynomialFeatures(10, include_bias=False)
    self._selectors = SelectKBest(f_classif, k=4)
    self._scalers = RobustScaler()
    self._encodages = OneHotEncoder()

    self._select_estimator_ = None

    for key in kwargs.keys():
        setattr(self, key, kwargs[key])

    if self._data_fn is not None:
        self.data_fn = self._data_fn
    if self.df is not None:
        self._read_and_encode_catFeatures()
def get_col_transf():
    num_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler()),
                         # Placeholder step; can be swapped in via set_params.
                         ('poly', 'passthrough')])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder()),
    ])
    col_t = ColumnTransformer([
        ('num', num_pipe,
         make_column_selector(dtype_include=['int64', 'float64'])),
        ('cat', cat_pipe, make_column_selector(dtype_include='object'))
    ])
    return col_t
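# Usage sketch: because 'poly' is a 'passthrough' placeholder, a real
# transformer can be plugged in later (e.g. inside a grid search) using the
# step__param naming convention.
from sklearn.preprocessing import PolynomialFeatures

col_t = get_col_transf()
col_t.set_params(num__poly=PolynomialFeatures(degree=2))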
def create_pipeline(params: dict = None):
    """
    Create sklearn.pipeline.Pipeline

    Parameters
    ----------
    params : dict
        dictionary of parameters for the pipeline

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # pipeline for numeric variables
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # pipeline for categorical variables
    p_cat = Pipeline([("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # list of pipelines to combine
    transformers = [("num", p_num, make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb", XGBClassifier(min_child_weight=1,
                                        gamma=0,
                                        objective='binary:logistic',
                                        nthread=4,
                                        scale_pos_weight=1,
                                        seed=1,
                                        gpu_id=0,
                                        tree_method='gpu_hist'))])
    if params:
        p.set_params(**params)
    return p
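# Usage sketch: `params` addresses nested steps with the step__param
# convention, e.g. the XGBClassifier depth or the imputation method of the
# numeric branch (values invented for illustration).
pipe = create_pipeline({
    "xgb__max_depth": 4,
    "col_transformers__num__rmmean__imputation_method": "median",
})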
def test_pipeline_make_column_selector(self):
    X = pandas.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5]})
    X['rating'] = X['rating'].astype(numpy.float32)
    ct = make_column_transformer(
        (StandardScaler(),
         make_column_selector(dtype_include=numpy.number)),
        (OneHotEncoder(),
         make_column_selector(dtype_include=object)))
    expected = ct.fit_transform(X)
    onx = to_onnx(ct, X, target_opset=TARGET_OPSET)
    sess = InferenceSession(onx.SerializeToString())
    names = [i.name for i in sess.get_inputs()]
    got = sess.run(None, {
        names[0]: X[names[0]].values.reshape((-1, 1)),
        names[1]: X[names[1]].values.reshape((-1, 1))})
    assert_almost_equal(expected, got[0])
def create_pipeline(self):
    preprocessing_pipeline = Pipeline([
        ('drop_columns',
         FunctionTransformer(self.drop_columns,
                             kw_args={'columns_to_drop': FEATURES_TO_DROP})),
        # Need to convert to string, otherwise imputing object data raises:
        ('convert_object_columns_to_string',
         FunctionTransformer(self.convert_object_columns_to_string))
    ])

    # Pipeline of operations to perform on any object columns in the DataFrame
    object_pipeline = Pipeline([
        ('most_frequent_imputer',
         SimpleImputer(strategy='most_frequent')),  # Very slow
        ('ohe', OneHotEncoder())
    ])

    # Pipeline of operations to perform on any numeric columns in the DataFrame
    numeric_pipeline = Pipeline([
        ('mean_imputer', SimpleImputer(strategy='mean')),
        ('min_max_scaler', MinMaxScaler())
    ])

    full_pipeline = Pipeline([
        ('process_data',
         ColumnTransformer([
             ('numeric_processing', numeric_pipeline,
              make_column_selector(dtype_include=np.number)),
             ('object_processing', object_pipeline,
              make_column_selector(dtype_include=object))
         ]))
    ])

    end_to_end_pipeline = Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('processing', full_pipeline),
        ('model', lightgbm.LGBMClassifier())
    ])

    return end_to_end_pipeline
def make_column_transformer(num_missing_impute_strategy='mean'):
    """Create a data preprocessing ColumnTransformer.

    Note: this function shadows sklearn.compose.make_column_transformer.
    """
    num_imputer = Pipeline([
        ("imputer", SimpleImputer(strategy=num_missing_impute_strategy,
                                  add_indicator=False))
    ])
    cat_ohe = Pipeline([
        ("cat_imputer", SimpleImputer(strategy='constant', fill_value='NA')),
        # np.int was removed from NumPy; use a concrete integer dtype.
        ('ohe', OneHotEncoder(dtype=np.int64, handle_unknown='ignore'))
    ])
    return ColumnTransformer(
        [('imp', num_imputer, make_column_selector(dtype_include=np.number)),
         ('ohe', cat_ohe,
          make_column_selector(dtype_include=['object', 'category']))],
        remainder='passthrough')
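# Usage sketch (toy frame invented for illustration): numeric NaNs are
# mean-imputed, categorical NaNs become the constant 'NA' before one-hot
# encoding. This calls the shadowing function defined above, not sklearn's.
import pandas as pd

df_demo = pd.DataFrame({
    "x": [1.0, None, 3.0],
    "c": ["u", None, "v"],
})
ct = make_column_transformer(num_missing_impute_strategy='mean')
out = ct.fit_transform(df_demo)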
def __init__(self, horizon: int, seasonal_length: int):
    features = [
        ("s1", Shift(0), make_column_selector()),
    ]
    super().__init__(
        features=features,
        horizon=horizon,
        model=SeasonalNaiveForecaster(seasonal_length),
    )
def make_normalizer_column_transformer(
        normalizations: Dict[SensorComponent, Normalization]):
    transformers = []
    for sensor_component, normalization in normalizations.items():
        selector = make_column_selector(
            pattern=sensor_component + MATCH_REST_REGEX)
        transformers.append(
            (sensor_component, get_normalizer(normalization), selector))
    return PandasColumnTransformer(transformers)
def test_column_selector():
    X = pd.DataFrame({
        "country": ["GB", "GB", "FR", "US"],
        "city": ["London", "London", "Paris", "Sallisaw"],
        "int": [5, 3, 4, 5],
    })
    ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=np.number)),
        (OneHotEncoder(), make_column_selector("city")),
    )
    expected = ct.fit_transform(X)

    ct = make_column_transformer(
        (StandardScaler(), ColumnSelector(AllNumeric())),
        (OneHotEncoder(), ColumnSelector(StartsWith("c") & ~AnyOf("country"))),
    )
    actual = ct.fit_transform(X)
    assert_array_equal(actual, expected)
def train_and_persist():
    # Read the data into a pandas DataFrame
    df = pd.read_csv("hour.csv", parse_dates=["dteday"])

    # Assign features to independent (X) and predicted (y) variables
    X = df.drop(columns=["instant", "cnt", "casual", "registered"])
    y = df["cnt"]

    # Use ffill_missing to build an imputer with forward fill
    ffiller = FunctionTransformer(ffill_missing)

    # Weather imputer pipeline for later use
    weather_enc = make_pipeline(
        ffiller,
        OrdinalEncoder(handle_unknown="use_encoded_value",
                       unknown_value=X["weathersit"].nunique()),
    )

    # Column transformer for the imputation and encoding process
    ct = make_column_transformer(
        (ffiller, make_column_selector(dtype_include=np.number)),
        (weather_enc, ["weathersit"]),
    )

    # Preprocessing object for feature engineering
    preprocessing = FeatureUnion([
        ("is_weekend", FunctionTransformer(is_weekend)),
        ("year", FunctionTransformer(year)),
        ("column_transform", ct),
    ])

    # Pipeline separating preprocessing and modelling
    reg = Pipeline([("preprocessing", preprocessing),
                    ("model", RandomForestRegressor())])

    # Train/test split: train is before 10/2012, test is from 10/2012 on
    X_train, y_train = X.loc[X["dteday"] < "2012-10"], y.loc[
        X["dteday"] < "2012-10"]
    X_test, y_test = X.loc["2012-10" <= X["dteday"]], y.loc[
        "2012-10" <= X["dteday"]]

    # Train the model
    reg.fit(X_train, y_train)
    # Optional evaluation on the held-out split: reg.score(X_test, y_test)

    # Persist the fitted pipeline
    joblib.dump(reg, "biking.joblib")
    print("Model trained successfully")
def fit(self, X, y=None):
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OrdinalEncoder

    # Rebuild X as a DataFrame with positional column labels so that
    # dtype-based column selection works.
    df = pd.DataFrame(data=X, index=range(X.shape[0]),
                      columns=range(X.shape[1])).infer_objects()
    categorical = make_column_selector(dtype_exclude=np.number)
    self.estimator_ = ColumnTransformer(
        [('ordinal',
          OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
          categorical)],
        remainder='passthrough')
    self.estimator_.fit(df, y)
    return self