def test_column_transformer_mixed_cols_sparse():
    df = np.array([['a', 1, True],
                   ['b', 2, False]],
                  dtype='O')

    ct = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [1, 2]),
        sparse_threshold=1.0
    )

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    X_trans = ct.fit_transform(df)
    assert X_trans.getformat() == 'csr'
    assert_array_equal(X_trans.toarray(), np.array([[1, 0, 1, 1],
                                                    [0, 1, 2, 0]]))

    ct = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [0]),
        sparse_threshold=1.0
    )
    with pytest.raises(ValueError,
                       match="For a sparse output, all columns should"):
        # this fails since strings `a` and `b` cannot be
        # coerced into a numeric.
        ct.fit_transform(df)
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer((X_df.columns, norm))
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer(('first', scaler), (['second'], norm),
                                 n_jobs=3, remainder='drop')
    assert_equal(ct.transformers, make_column_transformer(
        ('first', scaler), (['second'], norm)).transformers)
    assert_equal(ct.n_jobs, 3)
    assert_equal(ct.remainder, 'drop')
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, ('first', scaler), (['second'], norm),
        transformer_weights={'pca': 10, 'Transf': 1}
    )
def test_make_column_transformer_remainder_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()
    ct = make_column_transformer(('first', scaler), (['second'], norm),
                                 remainder=remainder)
    assert ct.remainder == remainder
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
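# A minimal sketch (not part of the tests above) of the behaviour those
# assertions cover: passing an estimator as `remainder` so that the columns
# not listed in the transformers are themselves transformed rather than
# dropped or passed through. The data and the MinMaxScaler choice are
# illustrative assumptions.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.array([[0., 1., 2.], [2., 4., 6.]]).T  # three rows, two columns

# Column 0 is scaled explicitly; the remaining column (index 1) is handled
# by the remainder estimator instead of being dropped.
ct = ColumnTransformer([('scale_first', StandardScaler(), [0])],
                       remainder=MinMaxScaler())
X_trans = ct.fit_transform(X)

# transformers_ gains a trailing ('remainder', MinMaxScaler(), [1]) entry
print(ct.transformers_[-1])
print(X_trans)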
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer(('first', scaler), (['second'], norm))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))

    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer(([0], norm))
    ct2 = make_column_transformer((norm, [0]))
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    assert_almost_equal(ct1.fit_transform(X_array),
                        ct2.fit_transform(X_array))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('first', 'drop'))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('passthrough', 'passthrough'),
                                ('first', 'drop'))
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))
def fit(self, X, y):
    encode_columns = [item for item in X.columns if 'suit' in item]
    scale_columns = [item for item in X.columns if item not in encode_columns]
    self.column_transformer = make_column_transformer(
        (StandardScaler(), scale_columns),
        (OneHotEncoder(categories='auto'), encode_columns))
    self.column_transformer.fit(X)
    return self
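# The class above only defines `fit`; a matching `transform` would typically
# just delegate to the fitted column transformer. This is a sketch under that
# assumption, not code from the original class.
def transform(self, X):
    # Reuse the ColumnTransformer fitted in `fit` on new data; assumes
    # `fit` has already been called on this instance.
    return self.column_transformer.transform(X)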
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 26 15:27:47 2020

@author: dorian
"""
import os
import importlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import shap


def _merge_external_data(X):
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
#
# Since there are rare categories in this dataset we need to specifically
# encode unknown categories at prediction time in order to be able to use
# cross-validation. Otherwise some rare categories could only be present on
# the validation side of the cross-validation split and the `OrdinalEncoder`
# would raise an error when calling its `transform` method with the data
# points of the validation set.

# %%
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector

categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                     unknown_value=-1)
preprocessor = make_column_transformer(
    (categorical_encoder, make_column_selector(dtype_include=object)),
    remainder="passthrough")

# %% [markdown]
#
# We will first give a simple example where we will train a single decision
# tree classifier and check its generalization performance via
# cross-validation.

# %%
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

tree = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=0))

# %%
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn_callbacks import ProgressBar

X, y = make_classification(n_samples=500000, n_features=200, random_state=0)

pipe = make_pipeline(
    SimpleImputer(),
    make_column_transformer(
        (StandardScaler(), slice(0, 80)),
        (MinMaxScaler(), slice(80, 120)),
        (StandardScaler(with_mean=False), slice(120, 180)),
    ),
    LogisticRegression(),
)
pbar = ProgressBar()
pipe._set_callbacks(pbar)

_ = pipe.fit(X, y)
# - pclass: ordinal integers {1, 2, 3}.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Provisionally, use pd.fillna() to impute missing values for categorical
# features; SimpleImputer will eventually support strategy="constant".
data[categorical_features] = data[categorical_features].fillna(value='missing')

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense',
                                             handle_unknown='ignore')

preprocessing_pl = make_column_transformer(
    (numeric_features, numeric_transformer),
    (categorical_features, categorical_transformer),
    remainder='drop'
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

X = data.drop('survived', axis=1)
y = data.survived.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True)

clf.fit(X_train, y_train)
print("model score: %f" % clf.score(X_test, y_test))
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression

set_config(display='diagram')

num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
                                       (cat_proc, ('feat0', 'feat2')))

clf = make_pipeline(preprocessor, LogisticRegression())
clf

##############################################################################
# Scalability and stability improvements to KMeans
# ------------------------------------------------
# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
# is now significantly faster and more stable. In addition, the Elkan algorithm
# is now compatible with sparse matrices. The estimator uses OpenMP based
# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
# effect anymore. For more details on how to control the number of threads,
# please refer to our :ref:`parallelism` notes.

import scipy
import numpy as np
# [!] not implemented

# select feature set from the data frame
df['time_from_trace_start'] = pd.DataFrame(tfts_lst)
df['case_remaining_time'] = pd.DataFrame(crt_lst)
df = df[[
    'activity_type', 'seq_of_event', 'time_from_trace_start',
    'num_of_events_hour_of_day', 'num_of_events_day_of_week',
    'case_remaining_time'
]]
print(df)

# one hot encoding and feature scaling
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']),
    (StandardScaler(), [
        'seq_of_event', 'time_from_trace_start',
        'num_of_events_hour_of_day', 'num_of_events_day_of_week',
        'case_remaining_time'
    ]))

# separate train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx + 1]).toarray()
valid = preprocess.transform(df[parting_event_idx + 1:]).toarray()

# calculate the size of input vector
input_size = train.shape[1] - 1  # excludes the attribute of target values


# transformation (ndarray -> torch)
def transform_data(arr):
    x = arr[:, 0:input_size]
    x_arr = np.array(x).reshape(1, -1, input_size)
    expected_label = "LogisticRegression (AP = {:0.2f})".format(avg_prec)
    assert disp.line_.get_label() == expected_label
    assert disp.ax_.get_xlabel() == "Recall (Positive label: 1)"
    assert disp.ax_.get_ylabel() == "Precision (Positive label: 1)"

    # draw again with another label
    disp.plot(name="MySpecialEstimator")
    expected_label = "MySpecialEstimator (AP = {:0.2f})".format(avg_prec)
    assert disp.line_.get_label() == expected_label


@pytest.mark.parametrize(
    "clf",
    [
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(make_column_transformer(
            (StandardScaler(), [0, 1])), LogisticRegression()),
    ],
)
def test_precision_recall_curve_pipeline(pyplot, clf):
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
    with pytest.raises(NotFittedError):
        plot_precision_recall_curve(clf, X, y)
    clf.fit(X, y)
    disp = plot_precision_recall_curve(clf, X, y)
    assert disp.estimator_name == clf.__class__.__name__


def test_precision_recall_curve_string_labels(pyplot):
    # regression test #15738
    cancer = load_breast_cancer()
    X = cancer.data
# removing erroneous entries
data = data.drop(data[data.ap_hi == 0].index)
data = data.drop(data[data.ap_lo == 0].index)
data = data.drop(data[data.ap_hi < data.ap_lo].index)

# DATA PRE-PROCESSING

# GENDER: recode the value 2 as 0 so the column is binary (0/1)
data.iloc[:, 2] = [0 if i == 2 else i for i in data.iloc[:, 2]]

# ONE HOT ENCODING
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# make_column_transformer expects (transformer, columns) pairs
transformer = make_column_transformer(
    (MinMaxScaler(), ['age', 'height', 'weight', 'ap_hi', 'ap_lo']),
    (OneHotEncoder(), ['cholesterol', 'gluc']))
data_transformed = pd.DataFrame(transformer.fit_transform(data))
data_transformed = data_transformed.drop(columns=[7, 10]).reset_index()
data_cat = data.iloc[:, [2, 9, 10, 11, 12]].reset_index()
data_new = pd.concat([data_transformed, data_cat], axis=1, ignore_index=True)
data_new = data_new.drop(columns=[0, 10])
data_new.columns = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol_0',
    'cholesterol_1', 'gluc_0', 'gluc_1', 'gender', 'smoke', 'alco', 'active',
    'cardio'
]
data = data_new

from sklearn.model_selection import train_test_split
# simply a series of sequential steps. The output of each step is passed to
# the next step.

# Workflow 1
print()
print('Workflow 1')

# Impute using the mean
# Select features using SelectFromModel(DecisionTreeRegressor)
# Fit with LinearRegression

# Create the imputer object with
# the default hyperparameter settings
imp = SimpleImputer()

# Create the column transformer object
ct = make_column_transformer((imp, features), remainder='passthrough')

# Create objects to use for feature selection with
# the default hyperparameter settings
linreg_selection = LinearRegression()
dtr_selection = DecisionTreeRegressor()
lasso_selection = Lasso()
lassocv_selection = LassoCV()
rfr_selection = RandomForestRegressor()

# Create the feature selection object
selection = SelectFromModel(estimator=dtr_selection)

# Create an object to use for regression with
# the default hyperparameter settings
linreg = LinearRegression()
#
# - one-hot encode (i.e., generate a column by category) the categorical
#   columns;
# - as a first approach (we will see after how the normalisation of numerical
#   values will affect our discussion), keep numerical values as they are.

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

categorical_columns = [
    'RACE', 'OCCUPATION', 'SECTOR', 'MARR', 'UNION', 'SEX', 'SOUTH'
]
numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), categorical_columns),
    remainder='passthrough')

##############################################################################
# To describe the dataset as a linear model we choose to use a ridge regressor
# with a very small regularization and to model the logarithm of the WAGE.

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                               func=np.log10,
                               inverse_func=sp.special.exp10))
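# What typically follows in this kind of example is fitting the pipeline on a
# train split. A minimal sketch, assuming `X_train` / `y_train` hold the
# survey features and the raw WAGE values (names assumed, not from the
# excerpt above):
model.fit(X_train, y_train)

# Predictions come back on the original WAGE scale thanks to `inverse_func`.
print(model.predict(X_train[:5]))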
for i in range(len(G3)):
    if G3[i] <= 4:
        G3[i] = 0
    if G3[i] >= 5 and G3[i] <= 8:
        G3[i] = 1
    if G3[i] >= 9 and G3[i] <= 12:
        G3[i] = 2
    if G3[i] >= 13 and G3[i] <= 16:
        G3[i] = 3
    if G3[i] >= 17 and G3[i] <= 20:
        G3[i] = 4

df = pd.concat([df, G3], axis=1)

column_trans = make_column_transformer(
    (OneHotEncoder(), ['Mjob', 'Fjob', 'reason', 'guardian']),
    remainder='passthrough')
data = column_trans.fit_transform(df)

n1 = data.shape[0]
n2 = data.shape[1]
m = int(0.8 * n1)

# train=data[:m,:]
# test=data[m:-1,:]
# X_train=train[:,0:n2-1]
# y_train=train[:,n2-1]
# y_train = np.reshape(y_train, (len(y_train),1))
# y_train = to_categorical(y_train)
# X_test=test[:,0:n2-1]
# We still have an overfitting problem, but a much smaller one.

# Import libraries
import pandas as pd

# Import data
dataset = pd.read_csv('data/Churn_Modelling.csv')
X = dataset.iloc[:, 3:13]
y = dataset.iloc[:, 13]

# Encode categorical data and scale continuous data
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

preprocess = make_column_transformer(
    (OneHotEncoder(), ['Geography', 'Gender']),
    (StandardScaler(), [
        'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
        'HasCrCard', 'IsActiveMember', 'EstimatedSalary'
    ]))
X = preprocess.fit_transform(X)

# Split in train/test
y = y.values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Part 2 - Now let's make the ANN!

# Importing the Keras libraries and packages
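# The Keras part of the script is cut off above. A minimal sketch of the ANN
# the comments announce; the layer sizes and training settings are
# illustrative assumptions, not taken from the original script.
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()
# Take the input dimension from the preprocessed matrix rather than hard-coding it
classifier.add(Dense(units=6, activation='relu', input_dim=X_train.shape[1]))
classifier.add(Dense(units=6, activation='relu'))
classifier.add(Dense(units=1, activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy'])
classifier.fit(X_train, y_train, batch_size=32, epochs=100)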
# calculate a number of activity types
num_of_acts = len(np.unique(df['activity_type']))

# find a parting trace which is the last trace for the train/valid separation
parting_trace_idx = int(num_of_traces * 0.8)
parting_trace_id = np.unique(df['case_id'])[parting_trace_idx]

# find a parting event's index which is the last event's index of the parting
# trace, used as a separation line between train/valid sets
parting_event_idx = df.loc[df['case_id'] == parting_trace_id]\
    .index.values.astype(int)[-1]

# set up the transformer (one hot encoder, feature scaler)
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']),
    (RobustScaler(), ['seq_of_event', 'time_from_trace_start']),
    ('passthrough', ['case_remaining_time'])
)

# transform data and separate it into train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx + 1]).toarray()
valid = preprocess.transform(df[parting_event_idx + 1:]).toarray()

# scale 'execution_time' values
scaler = MinMaxScaler()

# replace one-hot-encoded values with execution time values
# for training set
event_idx = 0
for i in range(parting_trace_idx + 1):
    trace_len = traces_lens[i]
def getAnnealingData():
    global Model_num, column_transformer_pipeline

    def replaceUnknows(data):
        data['family'] = data['family'].replace(to_replace='?', value='UNK')
        data['product-type'] = data['product-type'].replace(
            to_replace='C', value=1).apply(pd.to_numeric)
        data['steel'] = data['steel'].replace(to_replace='?', value='NA')
        data['temper_rolling'] = data['temper_rolling'].replace(
            to_replace='?', value='NA')
        data['condition'] = data['condition'].replace(to_replace='?', value='NA')
        data['formability'] = data['formability'].replace(to_replace='?', value='0')
        data['non-ageing'] = data['non-ageing'].replace(to_replace='?', value='NA')
        data['surface-finish'] = data['surface-finish'].replace(
            to_replace='?', value='NA')
        data['surface-quality'] = data['surface-quality'].replace(
            to_replace='?', value='NA')
        data['enamelability'] = data['enamelability'].replace(
            to_replace='?', value='0')
        data['bc'] = data['bc'].replace(to_replace='?', value='NA')
        data['bf'] = data['bf'].replace(to_replace='?', value='NA')
        data['bt'] = data['bt'].replace(to_replace='?', value='NA')
        data['bw/me'] = data['bw/me'].replace(to_replace='?', value='NA')
        data['bl'] = data['bl'].replace(to_replace='?', value='NA')
        data['m'] = data['m'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['chrom'] = data['chrom'].replace(to_replace='?', value='NA')
        data['phos'] = data['phos'].replace(to_replace='?', value='NA')
        data['cbond'] = data['cbond'].replace(to_replace='?', value='NA')
        data['marvi'] = data['marvi'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['exptl'] = data['exptl'].replace(to_replace='?', value='NA')
        data['ferro'] = data['ferro'].replace(to_replace='?', value='NA')
        data['corr'] = data['corr'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['blue/bright/varn/clean'] = data[
            'blue/bright/varn/clean'].replace(to_replace='?', value='NA')
        data['lustre'] = data['lustre'].replace(to_replace='?', value='NA')
        data['jurofm'] = data['jurofm'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['s'] = data['s'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['p'] = data['p'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        data['oil'] = data['oil'].replace(to_replace='?', value='NA')
        data['packing'] = data['packing'].replace(to_replace='?', value=0).apply(pd.to_numeric)
        return data

    dataSource = 'DataSource/annealing.csv'
    testDataSource = 'DataSource/annealing-TEST.csv'
    data = pd.read_csv(dataSource, header=None)
    testData = pd.read_csv(testDataSource, header=None)

    col_headings = [
        'family', 'product-type', 'steel', 'carbon', 'hardness',
        'temper_rolling', 'condition', 'formability', 'strength', 'non-ageing',
        'surface-finish', 'surface-quality', 'enamelability', 'bc', 'bf', 'bt',
        'bw/me', 'bl', 'm', 'chrom', 'phos', 'cbond', 'marvi', 'exptl',
        'ferro', 'corr', 'blue/bright/varn/clean', 'lustre', 'jurofm', 's',
        'p', 'shape', 'thick', 'width', 'len', 'oil', 'bore', 'packing',
        'target'
    ]
    col_index = {
        0: 'family', 1: 'product-type', 2: 'steel', 3: 'carbon',
        4: 'hardness', 5: 'temper_rolling', 6: 'condition', 7: 'formability',
        8: 'strength', 9: 'non-ageing', 10: 'surface-finish',
        11: 'surface-quality', 12: 'enamelability', 13: 'bc', 14: 'bf',
        15: 'bt', 16: 'bw/me', 17: 'bl', 18: 'm', 19: 'chrom', 20: 'phos',
        21: 'cbond', 22: 'marvi', 23: 'exptl', 24: 'ferro', 25: 'corr',
        26: 'blue/bright/varn/clean', 27: 'lustre', 28: 'jurofm', 29: 's',
        30: 'p', 31: 'shape', 32: 'thick', 33: 'width', 34: 'len', 35: 'oil',
        36: 'bore',
        37: 'packing', 38: 'target'
    }

    col_to_drop = [
        'family', 'product-type', 'non-ageing', 'surface-finish',
        'enamelability', 'bc', 'm', 'chrom', 'phos', 'cbond', 'marvi',
        'exptl', 'ferro', 'corr', 'blue/bright/varn/clean', 'lustre',
        'jurofm', 's', 'p'
    ]
    # col_to_drop = []

    data.columns = col_headings
    testData.columns = col_headings

    data = replaceUnknows(data)
    testData = replaceUnknows(testData)

    data = data.drop(col_to_drop, axis=1)
    testData = testData.drop(col_to_drop, axis=1)

    X_train = data.drop('target', axis=1)
    y_train = data['target'].values
    lable_enc = LabelEncoder()
    y_train = lable_enc.fit_transform(y_train)

    cols_to_oneHotEncode = [
        'family', 'steel', 'temper_rolling', 'condition', 'formability',
        'non-ageing', 'surface-finish', 'surface-quality', 'enamelability',
        'bc', 'bf', 'bt', 'bw/me', 'bl', 'chrom', 'phos', 'cbond', 'exptl',
        'ferro', 'blue/bright/varn/clean', 'lustre', 'shape', 'oil', 'packing'
    ]
    cols_to_oneHotEncode = list(
        set(list(X_train.columns)).intersection(set(cols_to_oneHotEncode)))
    cols_to_scale = [
        'product-type', 'carbon', 'hardness', 'strength', 'thick', 'width',
        'len', 'bore', 'm', 'jurofm', 'p', 'marvi', 's', 'corr'
    ]
    cols_to_scale = list(
        set(list(X_train.columns)).intersection(set(cols_to_scale)))

    column_transformer_pipeline = make_column_transformer(
        (OneHotEncoder(drop='first'), cols_to_oneHotEncode),
        (StandardScaler(), cols_to_scale),
        remainder='passthrough')
    X_train = column_transformer_pipeline.fit_transform(X=X_train)

    X_test = testData.drop('target', axis=1)
    X_test = column_transformer_pipeline.transform(X=X_test)
    y_test = testData['target'].values
    y_test = lable_enc.transform(y=y_test)

    Model_num = str(
        abs(hash(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))))

    return X_train, X_test, y_train, y_test
def get_estimator():
    actors = get_actor_party_data()  # Additional data about deputies
    # Doing this is allowed

    find_group_vote_demandeur = FindGroupVoteDemandeurTransformer()
    decompose_vote_object = DecomposeVoteObjetTransformer()
    find_party_actor = FindPartyActorTransformer(actors)
    encode_category = make_pipeline(
        SimpleImputer(strategy="constant", fill_value=["unknown"]))

    idty = lambda x: x

    def encode_party_presence(x):
        y = x.iloc[:, 0].apply(pd.Series)
        return y

    vectorize_vote = make_column_transformer(
        (OneHotEncoder(), ["libelle_type_vote"]),
        (
            CountVectorizer(binary=True, preprocessor=idty, tokenizer=idty),
            "demandeur_group",
        ),
        (
            CountVectorizer(binary=True, preprocessor=idty, tokenizer=idty),
            "auteur_parti",
        ),
        (
            FunctionTransformer(func=encode_party_presence),
            ["presence_per_party"],
        ),
        # (CountVectorizer(binary=True), "libelle_desc"),
        (TfidfVectorizer(binary=True), "libelle_desc"),
    )

    def create_nn_model():
        nn = Sequential()
        nn.add(Dense(64, activation="relu"))
        nn.add(Dropout(0.2))
        nn.add(Dense(10, activation="sigmoid"))
        nn.compile(
            optimizer=Adam(learning_rate=1e-3, decay=1e-2 / 500),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return nn

    classifier = NeuralNet(create_nn_model, epochs=1000, batch_size=500,
                           verbose=0)

    model = Pipeline([
        ("find_group_vote_demandeur", find_group_vote_demandeur),
        ("decompose_vote_object", decompose_vote_object),
        ("find_party_actor", find_party_actor),
        ("vectorize_vote", vectorize_vote),
        ("densify", DenseTransformer()),
        ("normalize", Normalizer()),
        ("nn", classifier),
    ])
    return model
def make_pipe(n_splits):
    cols_log = [
        "DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis",
        "DER_pt_h", "DER_pt_ratio_lep_tau", "DER_pt_tot", "DER_sum_pt",
        "PRI_jet_all_pt", "PRI_lep_pt", "PRI_met", "PRI_met_sumet",
        "PRI_tau_pt"
    ]

    mem = Memory(location=tempfile.mkdtemp(), verbose=0)

    pipe_imputed_fast = Pipeline(
        [
            ('col', make_column_transformer(
                (Shift_log(), cols_log), remainder="passthrough")),
            ('imp', IterativeImputer(max_iter=int(1e2))),
            ('sca', StandardScaler()),
            # ('pca', PCA(15)),
            (
                'gri',
                GridSearchCV(
                    Pipeline([
                        # ('pca', None),
                        ('clf', None)
                    ]),
                    scoring='accuracy',
                    refit=True,
                    cv=n_splits,
                    iid=True,
                    return_train_score=False,
                    param_grid={})),
        ],
        memory=mem,
        verbose=0)

    param_grid = [
        {
            # 'pca': (None, PCA(15)),
            'clf': (SVC(gamma="auto", max_iter=100000), ),
            'clf__kernel': ("poly", "rbf"),
            'clf__C': np.logspace(-2, .5, num=5),
        },
        {
            'clf': (BaggingClassifier(Perceptron(max_iter=1000),
                                      max_samples=0.5,
                                      max_features=0.5), ),
            'clf__n_estimators': (500, 1000, 2000, ),
        },
        {
            'clf': (RandomForestClassifier(), ),
            'clf__n_estimators': (500, 1000, 2000, ),
            'clf__max_depth': (None, 20, 50),
        },
        {
            'clf': (AdaBoostClassifier(), ),
            'clf__n_estimators': (500, 1000, 2000, ),
        },
    ]
    return pipe_imputed_fast, param_grid
plt.title("By race")
data.groupby("race").income_bin.mean().sort_values().plot.barh()

# Exercise 3
# using pd.get_dummies
data_one_hot = pd.get_dummies(data_features)
X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income)
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# using OneHotEncoder
cont_features = data_features.dtypes == "int64"
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ~cont_features),
    (StandardScaler(), cont_features))
X_train, X_test, y_train, y_test = train_test_split(data_features, income)
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

# Exercise 4
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=0.1)
logreg.fit(X_train_scaled, y_train)
print("Training score:", logreg.score(X_train_scaled, y_train))
print("Test score:", logreg.score(X_test_scaled, y_test))
print("Fraction <= 50k", (y_train.values == " <=50K").mean())
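# An equivalent way to select the same two column groups without precomputed
# boolean masks is make_column_selector; a sketch assuming the same
# data_features DataFrame (not part of the original exercise).
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     make_column_selector(dtype_exclude="int64")),
    (StandardScaler(), make_column_selector(dtype_include="int64")))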
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

dataset = pd.read_csv('Credit.csv')

# We are going to encode two columns - Personal Status and other_parties
# Personal Status -> index 8
# other_parties -> index 9
X = dataset.iloc[:, 8:10].values  # Taking Personal Status and other_parties

labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])

onehotencoder = make_column_transformer(
    (OneHotEncoder(categories='auto', sparse=False), [1]),
    remainder="passthrough")
X = onehotencoder.fit_transform(X)
case_remaining_time = (trace_end_time - cur_event_time).total_seconds()
crt_lst.append(case_remaining_time)

# case 2: no 'REG_DATE' in <trace>
# data sets fall into case 2:
# [!] not implemented

# select feature set from the data frame
df['time_from_trace_start'] = pd.DataFrame(tfts_lst)
df['case_remaining_time'] = pd.DataFrame(crt_lst)
df = df[['activity_type', 'time_from_trace_start', 'case_remaining_time']]
print(df)

# one hot encoding and feature scaling
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']),
    (StandardScaler(), ['time_from_trace_start', 'case_remaining_time']))

# separate train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx + 1]).toarray()
valid = preprocess.transform(df[parting_event_idx + 1:]).toarray()

# calculate the size of input vector
input_size = train.shape[1] - 1  # excludes the attribute of target values


# transformation (ndarray -> torch)
def transform_data(arr):
    x = arr[:, 0:input_size]
    x_arr = np.array(x).reshape(1, -1, input_size)
    y = arr[:, input_size]
import os
import importlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import xgboost as xgb


def _merge_external_data(X):
    """
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    """
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
    # Parse date to also be of dtype datetime
    data_weather = pd.read_csv("external_data.csv",
                               parse_dates=["DateOfDeparture"])
def preprocessing(self, input_data):
    dataset_columns = [
        'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'AMT_INCOME_TOTAL',
        'AMT_CREDIT', 'AMT_ANNUITY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
        'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
        'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2',
        'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
        'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3',
        'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
        'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
        'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
        'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
        'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
        'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'TARGET'
    ]

    # JSON to pandas DataFrame
    # Set an index, orient = 'index'
    # input_data = input_data.set_index('SK_ID_CURR')
    # print("INdex Data", input_data)
    application_data = pd.DataFrame(input_data, index=[0])
    # application_data = pd.DataFrame.from_dict(input_data)
    # application_data = application_data.json()
    application_data = pd.DataFrame.from_dict(application_data)
    # print("Un proccessed application_data", application_data)
    # input_data = input_data.set_index('SK_ID_CURR')
    # application_data = pd.read_csv(path_to_artifacts + 'application_train.csv')

    label_vector = application_data['TARGET']
    np.unique(label_vector, return_counts=True)
    # print("INdex Data 2", input_data)

    categorical_features = [
        'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
        'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'EXT_SOURCE_1',
        'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
        'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
    ]
    numerical_features = [
        'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH',
        'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1',
        'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE',
        'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'
    ]

    application_data['AMT_ANNUITY'] = application_data['AMT_ANNUITY'].fillna(0)
    application_data['OCCUPATION_TYPE'] = application_data[
        'OCCUPATION_TYPE'].fillna('UNKNOWN')
    application_data['CNT_FAM_MEMBERS'] = application_data[
        'CNT_FAM_MEMBERS'].fillna(0)
    application_data['EXT_SOURCE_1'] = application_data['EXT_SOURCE_1'].fillna(0)
    application_data['EXT_SOURCE_2'] = application_data['EXT_SOURCE_2'].fillna(0)
    application_data['EXT_SOURCE_3'] = application_data['EXT_SOURCE_3'].fillna(0)
    application_data['OBS_30_CNT_SOCIAL_CIRCLE'] = application_data[
        'OBS_30_CNT_SOCIAL_CIRCLE'].fillna(0)
    application_data['DEF_30_CNT_SOCIAL_CIRCLE'] = application_data[
        'DEF_30_CNT_SOCIAL_CIRCLE'].fillna(0)
    application_data['OBS_60_CNT_SOCIAL_CIRCLE'] = application_data[
        'OBS_60_CNT_SOCIAL_CIRCLE'].fillna(0)
    application_data['DEF_60_CNT_SOCIAL_CIRCLE'] = application_data[
        'DEF_60_CNT_SOCIAL_CIRCLE'].fillna(0)
    application_data['DAYS_LAST_PHONE_CHANGE'] = application_data[
        'DAYS_LAST_PHONE_CHANGE'].fillna(3650)
    application_data['AMT_REQ_CREDIT_BUREAU_HOUR'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0)
    application_data['AMT_REQ_CREDIT_BUREAU_DAY'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0)
    application_data['AMT_REQ_CREDIT_BUREAU_WEEK'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0)
    application_data['AMT_REQ_CREDIT_BUREAU_MON'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_MON'].fillna(0)
    application_data['AMT_REQ_CREDIT_BUREAU_QRT'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0)
    application_data['AMT_REQ_CREDIT_BUREAU_YEAR'] = application_data[
        'AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0)

    treated_dataset = application_data[dataset_columns]
    # sample_class_1 = application_data[application_data['TARGET'] == 1][:20000]
    # sample_class_0 = application_data[application_data['TARGET'] == 0][:20000]
    # treated_dataset = pd.concat([sample_class_1, sample_class_0])[dataset_columns]

    # training_dataset, testing_dataset = train_test_split(
    #     treated_dataset, shuffle=True, stratify=treated_dataset['TARGET'])
    # train_mode = dict(training_dataset.mode().iloc[0])
    # train_mode

    features = list(set(dataset_columns) - set(['TARGET']))
    # train_features, Y_train = training_dataset[features], training_dataset['TARGET']
    test_features = treated_dataset[features]

    column_trans = make_column_transformer(
        (OneHotEncoder(), categorical_features),
        (StandardScaler(), numerical_features))
    transformer = column_trans.fit(treated_dataset[features])

    # X_train = transformer.transform(train_features)
    clean_data = transformer.transform(test_features)

    return clean_data
# Make a column transformer to pre-process our data, selecting the 'Sex' and
# 'Embarked' columns to encode them with OneHotEncoder. The 'remainder'
# parameter decides what to do with the remaining columns; since the default
# behavior is to drop them, we use 'passthrough' to just concatenate them
# with the processed data, resulting in:
# For each possible value of a column, it creates a column with 0 or 1 -
# false or true:
# ['sex_value_1', 'sex_value_2', 'embarked_value_1', 'embarked_value_2',
#  'embarked_value_3', 'untouched_Pclass']
# [[0. 1. 0. 0. 1. 3.]
#  [1. 0. 1. 0. 0. 1.]
#  [1. 0. 0. 0. 1. 3.]
#  ...
#  [1. 0. 0. 0. 1. 3.]
#  [0. 1. 1. 0. 0. 1.]
#  [0. 1. 0. 1. 0. 3.]]
column_trans = make_column_transformer((OneHotEncoder(), ['Sex', 'Embarked']),
                                       remainder='passthrough')
column_trans.fit_transform(X)

# Build a Pipeline with a Logistic Regression model and our pre-processor
logreg = LogisticRegression(solver='lbfgs')
pipe = make_pipeline(column_trans, logreg)

# Cross validate model with X and y, returning the average accuracy of the
# prediction
score = cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
print(score)

# Works as model.fit, but runs pre-processing as well
pipe.fit(X, y)

X_new = X.sample(5, random_state=99)
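# The snippet stops right after sampling five new rows; a short sketch of the
# prediction step that would naturally follow. The fitted pipeline re-applies
# the one-hot encoding before calling the logistic regression.
predictions = pipe.predict(X_new)
print(predictions)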
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# or
# X[:, 1:3] = imputer.fit(X[:, 1:3]).transform(X[:, 1:3])

# Encoding categorical data
# Encoding the Independent Variable
le_X = LabelEncoder()
X[:, 0] = le_X.fit_transform(X[:, 0])

colt = make_column_transformer(
    (OneHotEncoder(categories='auto'), [0]),
    remainder='passthrough')
X = colt.fit_transform(X)
# the method below is deprecated and an alternative to colt
# ohe = OneHotEncoder(categories=[[0]])
# X = ohe.fit_transform(X).toarray()

# Encoding the Dependent Variable
le_y = LabelEncoder()
y = le_y.fit_transform(y)

# Splitting the dataset into Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature Scaling
def main(input_df,
         build_feature_pipe=None,
         all_preprocess=None,
         method='fit_transform'):
    """Transforms the source data by applying preprocessing transforms.

    Args:
        input_df ([Dataframe]): Unprocessed source data
        build_feature_pipe (pipeline object, optional): the build_feature_pipe
            pipeline object. Required only when method='transform'.
            Defaults to None.
        all_preprocess (pipeline object, optional): all_preprocess pipeline
            object. Required when method='transform' or 'inverse_transform'.
            Defaults to None.
        method (str, optional): The operation performed by build_feature
            method. Valid values are ['fit', 'transform', 'fit_transform',
            'inverse_transform']. Defaults to 'fit_transform'.

    Returns:
        X, y (array): Returns the training data and target variable as arrays
    """
    module_logger.info('Starting to build features module.')

    # identify and define column sets for applying preprocessing transforms
    num_cols = read_csv_to_list('./data/processed/numeric_columns.csv',
                                header=None, squeeze=True)
    cat_cols = read_csv_to_list('./data/processed/categorical_columns.csv',
                                header=None, squeeze=True)
    drop_cols = read_csv_to_list('./data/processed/drop_columns.csv',
                                 header=None, squeeze=True)
    fe_cols = read_csv_to_list(
        './data/processed/feature_engineering_columns.csv',
        header=None, squeeze=True)
    module_logger.info('Importing columns from stored lists complete.')

    # Build preprocessing pipeline
    if method in ['fit', 'fit_transform'] and build_feature_pipe is None:
        module_logger.info('Building build_features_pipe for very first time.')
        build_feature_pipe = make_pipeline(
            transforms.DropRowsTransformer(),
            transforms.BuildFeaturesTransformer(fe_cols))

    if method in ['fit', 'fit_transform'] and all_preprocess is None:
        module_logger.info('Building all_preprocess_pipe for very first time.')
        numerical_preprocess = make_pipeline(SimpleImputer(strategy='median'),
                                             transforms.CustomStandardScaler())
        preprocess_pipe = make_column_transformer(
            (transforms.DropFeaturesTransformer(
                columns=list(drop_cols), inplace=True), list(drop_cols)),
            (transforms.RandomStandardEncoderTransformer(cat_cols), cat_cols),
            (numerical_preprocess, num_cols),
            remainder='passthrough')
        all_preprocess = make_pipeline(preprocess_pipe)

    # apply pipeline
    module_logger.info('Pipeline started')
    if method == 'fit':
        module_logger.info('Starting pipeline.fit')
        fe_df = build_feature_pipe.fit_transform(input_df)
        all_preprocess.fit(fe_df)
        X = []  # return empty array as only pipeline is fitted
        y = []  # return empty array as only pipeline is fitted
        module_logger.info('Pipeline.fit completed successfully')

    elif method == 'transform':
        module_logger.info('Starting pipeline.transform')
        # build_feature_pipe, all_preprocess cannot be None
        assert build_feature_pipe != None, module_logger.error(
            'Missing pipeline object build_feature_pipe.')
        assert all_preprocess != None, module_logger.error(
            'Missing pipeline object all_preprocess.')
        fe_df = build_feature_pipe.transform(input_df)
        module_logger.info('feature engineering + drop rows done')

        # Check if input_df has column CASE_STATUS; the dataset used during
        # prediction will not have the target variable, so the code below
        # should be skipped in that case
        if 'CASE_STATUS' in fe_df.columns.values:
            module_logger.info('Target column found.')
            assert fe_df[~fe_df.CASE_STATUS.isin(
                ['Certified', 'Denied'])].shape[0] == 0, module_logger.error(
                    'Unexpected values found in CASE_STATUS field.')
            y = fe_df.pop('CASE_STATUS')
            y.replace(['Certified', 'Denied'], [0, 1], inplace=True)
            module_logger.info('Target column separated')
        # if CASE_STATUS is not present, return empty array for y
        else:
            module_logger.info(
                'Target column not found. Returning empty array for y.')
            y = []

        X = all_preprocess.transform(fe_df)
        # Ensure that X has expected number of features
        assert X.shape[1] == 31, module_logger.exception(
            'Arrays X of shape [:,31] expected.')
        # Ensure that X and y have same number of rows
        assert X.shape[0] == y.shape[0], module_logger.exception(
            'Arrays X and y should have same number of rows.')
        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.transform completed successfully')

    elif method == 'fit_transform':
        module_logger.info('Starting pipeline.fit_transform')
        fe_df = build_feature_pipe.fit_transform(input_df)
        module_logger.info('feature engineering + drop rows done')
        assert fe_df[~fe_df.CASE_STATUS.isin(['Certified', 'Denied'])].shape[
            0] == 0, module_logger.error(
                'Unexpected values found in CASE_STATUS field.')
        y = fe_df.pop('CASE_STATUS')
        y.replace(['Certified', 'Denied'], [0, 1], inplace=True)
        module_logger.info('Target column separated')
        X = all_preprocess.fit_transform(fe_df)
        # Ensure that X has expected number of features
        assert X.shape[1] == 31, module_logger.exception(
            'Arrays X of shape [:,31] expected.')
        # Ensure that X and y have same number of rows
        assert X.shape[0] == y.shape[0], module_logger.exception(
            'Arrays X and y should have same number of rows.')
        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.fit_transform completed successfully')

    elif method == 'inverse':
        module_logger.info('Starting pipeline.inverse_transform')
        # all_preprocess cannot be None
        assert all_preprocess != None, module_logger.error(
            'Missing pipeline object all_preprocess.')
        X = all_preprocess.inverse_transform(input_df)
        y = []  # return empty array as inverse transform only applies to input features
        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.inverse_transform completed successfully')

    module_logger.info('Building features complete.')

    # save pipeline when method is fit, fit_transform
    if method in ['fit', 'fit_transform']:
        dump(build_feature_pipe, open('./models/build_feature_pipe.pkl', 'wb'))
        dump(all_preprocess, open('./models/preprocess_pipe.pkl', 'wb'))
        module_logger.info('Pipeline saved.')

    return X, y
def load_data(data_path: str,
              history_size,
              horizon_size,
              historic_columns=['load', 'is_holiday', 'tempC'],
              horizon_columns=['is_holiday', 'tempC'],
              prediction_columns=['load'],
              splits=['train', 'validate', 'test'],
              shift=None,
              validation_split=None,
              batch_size=32,
              cycle_length=10,
              shuffle_buffer_size=1000,
              seed=42):
    """
    Loads the preprocessed CER data and builds the dataset.

    :param data_path: The path to the folder containing the train.csv and
        test.csv
    :type data_path: str
    :param history_size: The number of time steps of the historic data a
        patch should contain
    :type history_size: int
    :param horizon_size: The number of time steps in the prediction horizon a
        step should contain
    :type horizon_size: int
    :param historic_columns: The column names to be used as historic data.
    :type historic_columns: Array
    :param horizon_columns: The column names to be used as horizon data.
    :type horizon_columns: Array
    :param prediction_columns: The columns to predict
    :type prediction_columns: Array
    :param splits: The data splits to be generated. At least one of 'train',
        'validate' or 'test'
    :type splits: Array
    :param shift: The amount of time steps by which the window moves on each
        iteration
    :type shift: int
    :param validation_split: The amount of data reserved from the training
        set for validation
    :type validation_split: float
    :param batch_size: The batch size
    :type batch_size: int
    :param cycle_length: The number of input elements that are processed
        concurrently
    :type cycle_length: int
    :param shuffle_buffer_size: The shuffle buffer size
    :type shuffle_buffer_size: int
    :param seed: The seed used by the pseudo random generators
    :type seed: int
    :returns: A dict containing the windowed TensorFlow datasets generated
        from the csv files in `data_path` for the given `splits`.
    :rtype: dict
    """
    # common ##################################################################
    data = {}
    scalers = {
        'load': MinMaxScaler(feature_range=(0, 1)),
        'tempC': MinMaxScaler(feature_range=(-1, 1)),
        'is_holiday': MinMaxScaler(feature_range=(0, 1))
    }
    column_transformer = make_column_transformer(
        *[(scalers[k], [k]) for k in sorted(scalers.keys())])

    make_dataset = partial(WindowedTimeSeriesDataSet,
                           column_transformer=column_transformer,
                           history_size=history_size,
                           horizon_size=horizon_size,
                           historic_columns=historic_columns,
                           horizon_columns=horizon_columns,
                           prediction_columns=prediction_columns,
                           shift=shift,
                           batch_size=batch_size,
                           cycle_length=cycle_length,
                           shuffle_buffer_size=shuffle_buffer_size,
                           seed=seed)

    # train data ##############################################################
    if 'train' in splits:
        if validation_split is not None:
            data_splitter = TimeSeriesSplit(
                1 - validation_split, TimeSeriesSplit.LEFT)
        else:
            data_splitter = None
        train_data_path = os.path.join(data_path, 'train.csv')
        data['train'] = make_dataset(file_path=train_data_path,
                                     data_splitter=data_splitter,
                                     fit_transformer=True)()

    # validation data #########################################################
    if 'validate' in splits and validation_split is not None:
        data_splitter = TimeSeriesSplit(
            validation_split, TimeSeriesSplit.RIGHT)
        data['validate'] = make_dataset(file_path=train_data_path,
                                        data_splitter=data_splitter)()

    # test data ###############################################################
    if 'test' in splits:
        test_data_path = os.path.join(data_path, 'test.csv')
        data['test'] = make_dataset(file_path=test_data_path)()

    return data
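# A usage sketch for load_data as defined above. The folder path and window
# sizes are made-up illustrations; WindowedTimeSeriesDataSet and
# TimeSeriesSplit come from the surrounding project and are not shown here.
datasets = load_data('./data/cer',
                     history_size=48,       # 48 past time steps per window
                     horizon_size=24,       # predict the next 24 steps
                     validation_split=0.2,  # reserve 20% of train.csv
                     splits=['train', 'validate'])

train_ds = datasets['train']   # windowed TensorFlow dataset
val_ds = datasets['validate']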
@author: aswin
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

datasheet = pd.read_csv("50_Startups_EDA.csv")
datasheet.fillna(datasheet.mean(), inplace=True)

x = datasheet.iloc[:, 0:-1].values
y = datasheet.iloc[:, 5].values

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
A = make_column_transformer((OneHotEncoder(categories='auto'), [4]),
                            remainder="passthrough")
x = A.fit_transform(x)

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30,
                                                random_state=0)

from sklearn.linear_model import LinearRegression
prediction = LinearRegression()
prediction.fit(xtrain, ytrain)
result = prediction.predict(xtest)
prediction.score(xtrain, ytrain)
prediction.score(xtest, ytest)

import statsmodels.api as sm
x = np.append(arr=np.ones(shape=(60, 1), dtype=int), values=x, axis=1)
xnew1 = np.array(x[:, [0, 2, 3, 4, 5, 6, 7]], dtype=int)
model = sm.OLS(y, xnew1)
results1 = model.fit()
results1.summary()
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('./data.csv')

# Data processing
data = data.iloc[:, 1:]  # removes first column with id's
X_raw = data.iloc[:, :-1]  # creates feature matrix without churn
# X_raw = X_raw.drop(['PhoneService','MultipleLines', 'OnlineBackup','DeviceProtection','StreamingTV','StreamingMovies'], axis=1)

''' Data pre-processing'''
encoder = OneHotEncoder(sparse=False)
column_trans = make_column_transformer((encoder, [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod'
]), remainder='passthrough')

# encoder.fit_transform(X_raw[['PaymentMethod']])
# encoder.categories_

# NEW feature matrix
X = column_trans.fit_transform(X_raw)

# Binary encode churn
target = data.iloc[:, -1:]
y = target.apply(LabelEncoder().fit_transform)

''' Train test split '''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

''' Random Forest Model searching '''
# '''------------------------------------------------------------------
X_val = val[filtered_columns]
y_val = val['SepsisLabel']

categorical = X_train.dtypes == object
categorical['Gender'] = True

# Defining the pipeline
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
cont_scale_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())
preprocess_trans_scale = make_column_transformer(
    (cont_scale_pipeline, ~categorical),
    (cat_pipeline, categorical))


# Custom score function
def score_model(model, metric_list, y_true, y_pred):
    metric_dict = {'precision': precision_score,
                   'recall': recall_score,
                   'confusion_matrix': confusion_matrix,
                   'avg_precision': average_precision_score}
    df = pd.DataFrame()
    df['model'] = [model]
    for metric in metric_list:
        df[metric] = [metric_dict[metric](y_true, y_pred)]
    return df


# Logistic Regression
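# A brief usage sketch for score_model defined above. The snippet breaks off
# at the "Logistic Regression" heading, so the estimator and variable names
# below are assumptions, not the original continuation.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

logreg_pipe = make_pipeline(preprocess_trans_scale,
                            LogisticRegression(max_iter=1000))
logreg_pipe.fit(X_train, y_train)

y_pred_val = logreg_pipe.predict(X_val)
results = score_model('logistic_regression',
                      ['precision', 'recall', 'avg_precision'],
                      y_val, y_pred_val)
print(results)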
numeric_features = [
    "rating", "height", "weight", "salary", "draft_year", "draft_round",
    "draft_peak"
]
categorical_features = ["team", "country"]

numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

# Build a pipeline containing the column transformer and an SVC model
# Name it pipe_unbalanced and fit it on the training data
pipe_unbalanced = make_pipeline(preprocessor, SVC())
pipe_unbalanced.fit(X_train, y_train)

# Predict your values on the validation set
# Save them in an object named predicted_y
predicted_y = pipe_unbalanced.predict(X_valid)

# Using sklearn tools, calculate precision
# Save it in an object named precision
precision = precision_score(y_valid, predicted_y, pos_label="F").round(3)
print("precision: ", precision)
@pytest.mark.parametrize(
    "estimator",
    [
        LogisticRegression(max_iter=1000, random_state=0),
        GradientBoostingClassifier(random_state=0, n_estimators=5),
    ],
    ids=["estimator-brute", "estimator-recursion"],
)
@pytest.mark.parametrize(
    "preprocessor",
    [
        None,
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]),
        ),
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            remainder="passthrough",
        ),
    ],
    ids=["None", "column-transformer", "column-transformer-passthrough"],
)
@pytest.mark.parametrize(
    "features",
    [[0, 2], [iris.feature_names[i] for i in (0, 2)]],
    ids=["features-integer", "features-string"],
)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial dependence supports dataframes and pipelines
X = df.iloc[:, :-1]
Y = np.array(df.iloc[:, -1])
Y = Y.reshape(len(Y), 1)

############ Target Encoding ############
print("\t> Encoding Target...")
Y = pd.DataFrame(Y)
Y.loc[Y[0] != 'normal.', 0] = 1
Y.loc[Y[0] == 'normal.', 0] = 0
# Y[0].Weight = Y[0].Weight.astype('int64')
Y = np.array(Y)
Y = Y.astype(float)

############ Input Encoding for columns 1,2,3 ############
print("\t> Encoding Input...")
IE = make_column_transformer((OneHotEncoder(), ['1', '2', '3']),
                             remainder='passthrough')
IE.fit(X)
X = pd.DataFrame(IE.transform(X))

############ Train test split (80%, 20% ratio) ############
print("\t> Splitting into Train and Test Data...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=1)
Y_train = Y_train.reshape(len(Y_train), 1)
Y_test = Y_test.reshape(len(Y_test), 1)

############ Scaling Input #############
print("\t> Scaling Input...")
SCALE_IN = StandardScaler()