import pandas as pd
import numpy as np  # BUG FIX: np.bincount is used below but numpy was never imported

df = pd.read_csv('train.csv')
df_sub = pd.read_csv('test.csv')

# Keep the submission ids aside, then drop them from the feature frame.
case_id = df_sub['id']
df_sub = df_sub.drop(['id'], axis=1)

X = df.iloc[:, 1:11].values
y = df.iloc[:, 11].values
X_sub = df_sub.iloc[:, :].values

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode categorical columns 0, 5 and 6; pass the rest through.
# NOTE(review): transform() below will raise on categories unseen in train —
# consider OneHotEncoder(handle_unknown='ignore') if that can happen.
ohe = OneHotEncoder()
ctX = ColumnTransformer([('X', ohe, [0, 5, 6])], remainder='passthrough')
X = ctX.fit_transform(X)
X_sub = ctX.transform(X_sub)

from sklearn.preprocessing import StandardScaler

# Scale only the continuous columns; the scaler is fit on train and
# re-used on the submission set (no leakage).
sc = StandardScaler()
X[:, [7, 11, 13]] = sc.fit_transform(X[:, [7, 11, 13]])
X_sub[:, [7, 11, 13]] = sc.transform(X_sub[:, [7, 11, 13]])

# Since the dataset is heavily skewed, the model always predicts 0.
# Therefore we assign inverse-frequency weights to the classes.
neg, pos = np.bincount(y)
total = neg + pos
w0 = (1 / neg) * (total) / 2
w1 = (1 / pos) * (total) / 2
weights = {0: w0, 1: w1}
# Label-encode each categorical feature column in place.
# NOTE(review): the same LabelEncoder instance is re-fit per column, so only
# the last fit's classes_ survive — fine here because only the transformed
# values are kept.
previsores[:, 3] = labelencoder.fit_transform(previsores[:, 3])
previsores[:, 5] = labelencoder.fit_transform(previsores[:, 5])
previsores[:, 6] = labelencoder.fit_transform(previsores[:, 6])
previsores[:, 7] = labelencoder.fit_transform(previsores[:, 7])
previsores[:, 8] = labelencoder.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder.fit_transform(previsores[:, 9])
previsores[:, 13] = labelencoder.fit_transform(previsores[:, 13])
classe = labelencoder.fit_transform(classe)

# Instantiate the OneHotEncoder (wrapped in a ColumnTransformer)
onehotencoder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough'
)
previsores = onehotencoder.fit_transform(previsores).toarray()

scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Split the data into train and test sets
# NOTE(review): this call is truncated in the visible source — its closing
# parenthesis lies outside this chunk.
previsores_train, previsores_test, classe_train, classe_test = train_test_split(
    previsores, classe, test_size=0.25, random_state=0
# Build a per-column imputation preprocessor: constant string fill for
# non-numeric columns with missing values, constant 0 for numeric ones.
# (The original wrapped each SimpleImputer in a one-step Pipeline; the
# wrapper added nothing, so the imputer is used directly here.)
transformers = []
cols_with_missing_string_data = []
for column in cols_with_missing_vals:
    # Check X_train for type - has no missing data
    if not is_numeric_dtype(X_train[column]):
        imputer = SimpleImputer(strategy='constant',
                                fill_value='missing_value')
    else:
        imputer = SimpleImputer(strategy='constant', fill_value=0)
    transformers.append((column, imputer, [column]))

preprocessor = ColumnTransformer(transformers, remainder='passthrough')

# We considered adding an empty row to the training/validation data so that
# the missing values could be encoded, e.g.:
#   new_row = pd.Series(name='NameOfNewRow')
#   X_train.append(new_row)
#   X_valid.append(new_row)
#   X_test.append(new_row)
# but this is unnecessary with the constant-fill imputers above.

preprocessor.fit(X_train)

# Simple imputation. The resulting frames follow the ColumnTransformer
# output order, so they carry positional integer column labels.
imputed_X_train = pd.DataFrame(preprocessor.transform(X_train))
imputed_X_valid = pd.DataFrame(preprocessor.transform(X_valid))
imputed_X_test = pd.DataFrame(preprocessor.transform(X_test))
# ======================
# Preprocessing Pipeline
# ======================
# The fertilizer name is the target; every remaining column is a feature.
y = data['Fertilizer Name'].copy()
X = data.drop('Fertilizer Name', axis=1).copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1)

# Dense one-hot encoding for the two nominal columns; all remaining
# columns pass straight through to the scaler.
nominal_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False))])
preprocessor = ColumnTransformer(
    transformers=[('nominal', nominal_transformer,
                   ['Soil Type', 'Crop Type'])],
    remainder='passthrough')

# End-to-end model: preprocess -> scale -> random forest.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier()),
])

# ======================
# Training
# ======================
model.fit(X_train, y_train)
joblib.dump(model, 'fertilizer_pred.pkl')
print("Test Accuracy: {:.2f}%".format(model.score(X_test, y_test) * 100))
def __init__(
    self,
    Estimator,
    numeric_features,
    categorical_features,
    response,
    kwargs=None,
    sparse=False,
):
    """Wire imputation/encoding/scaling pipelines around *Estimator*.

    Parameters
    ----------
    Estimator : estimator class (not an instance); instantiated here with
        ``**kwargs``.
    numeric_features : list of str
        Names of the numeric feature columns.
    categorical_features : list of str
        Names of the categorical feature columns.
    response : str
        Name of the response column.
    kwargs : dict, optional
        Keyword arguments forwarded to ``Estimator``.
    sparse : bool, default False
        Passed to the one-hot encoder.
    """
    # BUG FIX: the original default kwargs=None made Estimator(**kwargs)
    # raise TypeError whenever kwargs was not supplied.
    if kwargs is None:
        kwargs = {}
    self.numeric_features = numeric_features
    self.categorical_features = categorical_features
    self.features = numeric_features + categorical_features
    self.response = response

    # Categorical columns: constant-fill missing values, then ordinal codes.
    ordinal_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ordinal", OrdinalEncoder()),
    ])

    # Stage 1: impute numerics, ordinal-encode categoricals
    # (indices are positions within self.features).
    prenumeric = ColumnTransformer([
        (
            "numimp",
            SimpleImputer(),
            [self.features.index(x) for x in numeric_features],
        ),
        (
            "cat",
            ordinal_pipe,
            [self.features.index(x) for x in categorical_features],
        ),
    ])

    # Round to int to avoid issues with decision boundary sampling non integers
    # for the range.
    # One Hot for most Sci-kit Learn functions.
    one_hot_pipe = Pipeline([
        ("rint", FunctionTransformer(np.rint)),
        ("onehot", OneHotEncoder(sparse=sparse, handle_unknown="ignore")),
    ])

    # Stage 2: robust-scale numerics, one-hot the ordinal codes.
    preprocessing = ColumnTransformer([
        (
            "num",
            RobustScaler(),
            [self.features.index(x) for x in numeric_features],
        ),
        (
            "onepipe",
            one_hot_pipe,
            [self.features.index(x) for x in categorical_features],
        ),
    ])

    pipe = Pipeline([
        ("prenumeric", prenumeric),
        ("preprocess", preprocessing),
        ("estimator", Estimator(**kwargs)),
    ])

    self.Estimator = Estimator
    self.pipe = pipe
    self.transform_numeric = pipe.named_steps["prenumeric"]
    # Tail of the pipeline that consumes already-imputed/encoded input.
    self.numeric_pipe = Pipeline([
        ("preprocess", pipe.named_steps["preprocess"]),
        ("estimator", pipe.named_steps["estimator"]),
    ])
# Coerce every feature row and the target to int.
for i in range(len(x)):
    x[i] = list(map(int, x[i]))
y = list(map(int, y))

# Clean up categorical codes: fold 2 -> 0 in column 1 and clip the
# out-of-range codes of columns 2 and 3 into their last valid bucket.
for i in range(len(x[:, 2])):
    if x[:, 1][i] == 2:
        x[:, 1][i] = 0
    if x[:, 2][i] > 4 or x[:, 2][i] == 0:
        x[:, 2][i] = 4
    if x[:, 3][i] > 3 or x[:, 3][i] == 0:
        x[:, 3][i] = 3

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode columns 2 and 3, dropping the first level to avoid the
# dummy-variable trap; all other columns pass through.
cT = ColumnTransformer(
    transformers=[('cT', OneHotEncoder(categories='auto', drop='first'),
                   [2, 3])],
    remainder='passthrough')
x = cT.fit_transform(x)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
# BUG FIX: the test set must be scaled with the statistics learned on the
# training set; the original re-fit the scaler on x_test (data leakage and
# inconsistent scaling between train and test).
x_test = ss.transform(x_test)

import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix
def test_column_transformer_no_estimators_set_params():
    """set_params on an estimator-less ColumnTransformer must still work."""
    empty_ct = ColumnTransformer([])
    empty_ct.set_params(n_jobs=2)
    assert empty_ct.n_jobs == 2
from sklearn.preprocessing import OneHotEncoder

# Any column that is not numerical is treated as categorical.
cat_features = [col for col in data.columns if col not in numerical_features]

from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
categorical_columns

# Numeric branch: scale, then mean-impute. Categorical branch: impute the
# most frequent value, then one-hot encode (unknowns ignored at transform).
scaler_imputer_transformer = make_pipeline(StandardScaler(),
                                           SimpleImputer(strategy='mean'))
cat_ohe_imputer_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(transformers=[
    ("num-preprocessor", scaler_imputer_transformer, numerical_features),
    ("cat-preprocessor", cat_ohe_imputer_transformer, categorical_columns),
])

# 5-fold cross-validation of the full preprocess + logistic-regression model.
model = make_pipeline(preprocessor, LogisticRegression())
cv_result = cross_validate(model, data, target, cv=5)
cv_result

# In[25]:

model

# In[ ]:
# NOTE(review): this chunk starts mid-call in the visible source — the
# keyword arguments below belong to a plotting call whose opening line is
# not shown here.
      label="population", figsize=(10, 7),
      c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()
plt.savefig("heat_map_housing.png")

housing = load_housing_data()
# Bucket median income into 5 ordinal categories (used for stratified
# splitting by income level).
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
train, test = create_split(housing)
housing = train.copy()
housing_labels = train["median_house_value"].copy()

# Numeric pipeline: median imputation -> derived attributes -> scaling.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Numeric columns are everything except the single categorical column.
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                   ("cat", OneHotEncoder(), cat_attribs)])
housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# NOTE(review): this chunk is truncated at both ends in the visible source —
# the first line closes a call opened earlier, and the triple-quoted block
# at the bottom is never closed here.
                                                   test_size=0.25,
                                                   random_state=42)
# -

X_train.shape

# +
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Single numeric branch: standard-scale every training column.
num_pipeline = Pipeline([('std_scaler', StandardScaler())])
num_attribs = list(X_train)
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs)])
# -

# Fit on train only; re-use the fitted pipeline on both test sets.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
X_test_final_prepared = full_pipeline.transform(X_test_final)

# Restore the original column labels lost by the numpy round-trip.
X_train_prepared = pd.DataFrame(X_train_prepared, columns=num_attribs)
X_test_prepared = pd.DataFrame(X_test_prepared, columns=num_attribs)
X_test_final_prepared = pd.DataFrame(X_test_final_prepared,
                                     columns=num_attribs)
'''
#Scaling
from sklearn.preprocessing import StandardScaler
x_col = X.columns
scaler = StandardScaler()
# Hold out 20% of the data (no fixed random_state: the split differs per run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Numeric branch: median imputation followed by standardisation.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding only.
# --- SimpleImputer is not available for strings in ONNX-ML specifications.
# ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs')),
])
clf.fit(X_train, y_train)

##################################
# Define the inputs of the ONNX graph
# +++++++++++++++++++++++++++++++++++
#
# *sklearn-onnx* does not know the features used to train the model
# but it needs to know which feature has which name.
def fit(self, X_train, X_test, y_train, y_test):
    """Fit Regression algorithms to X_train and y_train, predict and score
    on X_test, y_test.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix (rows are samples, columns are features).
    X_test : array-like
        Testing feature matrix.
    y_train : array-like
        Training target values.
    y_test : array-like
        Testing target values.

    Returns
    -------
    scores : Pandas DataFrame
        Metrics of all the models.
    predictions : Pandas DataFrame
        Predictions of all the models (returned only when
        ``self.predictions`` is enabled).
    """
    R2 = []
    RMSE = []
    # WIN = []
    names = []
    TIME = []
    predictions = {}

    if self.custom_metric is not None:
        CUSTOM_METRIC = []

    # Promote plain arrays to DataFrames so the dtype-based column
    # selection below works.
    if type(X_train) is np.ndarray:
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

    numeric_features = X_train.select_dtypes(
        include=['int64', 'float64', 'int32', 'float32']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    # numeric_transformer / categorical_transformer are module-level
    # pipelines defined elsewhere in this file.
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numeric_features),
                      ('categorical', categorical_transformer,
                       categorical_features)])

    for name, model in tqdm(REGRESSORS):
        start = time.time()
        try:
            # Seed estimators that support it so runs are reproducible.
            if 'random_state' in model().get_params().keys():
                pipe = Pipeline(steps=[
                    ('preprocessor', preprocessor),
                    ('regressor', model(random_state=self.random_state))])
            else:
                pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('regressor', model())])

            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            r_squared = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            names.append(name)
            R2.append(r_squared)
            RMSE.append(rmse)
            TIME.append(time.time() - start)
            if self.custom_metric is not None:
                custom_metric = self.custom_metric(y_test, y_pred)
                CUSTOM_METRIC.append(custom_metric)
            if self.verbose > 0:
                if self.custom_metric is not None:
                    print({
                        "Model": name,
                        "R-Squared": r_squared,
                        "RMSE": rmse,
                        self.custom_metric.__name__: custom_metric,
                        "Time taken": time.time() - start
                    })
                else:
                    print({
                        "Model": name,
                        "R-Squared": r_squared,
                        "RMSE": rmse,
                        "Time taken": time.time() - start
                    })
            if self.predictions:
                predictions[name] = y_pred
        except Exception as exception:
            # A failing estimator must not abort the whole benchmark run.
            if not self.ignore_warnings:
                print(name + " model failed to execute")
                print(exception)

    if self.custom_metric is None:
        scores = pd.DataFrame({
            "Model": names,
            "R-Squared": R2,
            "RMSE": RMSE,
            "Time Taken": TIME
        })
    else:
        scores = pd.DataFrame({
            "Model": names,
            "R-Squared": R2,
            "RMSE": RMSE,
            self.custom_metric.__name__: CUSTOM_METRIC,
            "Time Taken": TIME
        })
    scores = scores.sort_values(by='R-Squared',
                                ascending=False).set_index('Model')

    # BUG FIX: the original ended with
    #     return scores, predictions_df if self.predictions == True else scores
    # which parses as `return scores, (...)` and therefore ALWAYS returned a
    # 2-tuple — (scores, scores) when predictions were disabled. Return the
    # bare scores frame in that case, as documented.
    if self.predictions:
        predictions_df = pd.DataFrame.from_dict(predictions)
        return scores, predictions_df
    return scores
def fit(self, X_train, X_test, y_train, y_test):
    """Fit Classification algorithms to X_train and y_train, predict and
    score on X_test, y_test.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix (rows are samples, columns are features).
    X_test : array-like
        Testing feature matrix.
    y_train : array-like
        Training target labels.
    y_test : array-like
        Testing target labels.

    Returns
    -------
    scores : Pandas DataFrame
        Metrics of all the models.
    predictions : Pandas DataFrame
        Predictions of all the models (returned only when
        ``self.predictions`` is enabled).
    """
    Accuracy = []
    B_Accuracy = []
    ROC_AUC = []
    F1 = []
    names = []
    TIME = []
    predictions = {}

    if self.custom_metric is not None:
        CUSTOM_METRIC = []

    # Promote plain arrays to DataFrames so the dtype-based column
    # selection below works.
    if type(X_train) is np.ndarray:
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

    numeric_features = X_train.select_dtypes(
        include=['int64', 'float64', 'int32', 'float32']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    # numeric_transformer / categorical_transformer are module-level
    # pipelines defined elsewhere in this file.
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numeric_features),
                      ('categorical', categorical_transformer,
                       categorical_features)])

    for name, model in tqdm(CLASSIFIERS):
        start = time.time()
        try:
            # Seed estimators that support it so runs are reproducible.
            if 'random_state' in model().get_params().keys():
                pipe = Pipeline(steps=[
                    ('preprocessor', preprocessor),
                    ('classifier', model(random_state=self.random_state))])
            else:
                pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', model())])

            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred, normalize=True)
            b_accuracy = balanced_accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            try:
                roc_auc = roc_auc_score(y_test, y_pred)
            except Exception as exception:
                # ROC AUC is undefined for some label layouts with this call
                # signature; record None and keep going.
                roc_auc = None
                if not self.ignore_warnings:
                    print("ROC AUC couldn't be calculated for " + name)
                    print(exception)

            names.append(name)
            Accuracy.append(accuracy)
            B_Accuracy.append(b_accuracy)
            ROC_AUC.append(roc_auc)
            F1.append(f1)
            TIME.append(time.time() - start)
            if self.custom_metric is not None:
                custom_metric = self.custom_metric(y_test, y_pred)
                CUSTOM_METRIC.append(custom_metric)
            if self.verbose > 0:
                if self.custom_metric is not None:
                    print({
                        "Model": name,
                        "Accuracy": accuracy,
                        "Balanced Accuracy": b_accuracy,
                        "ROC AUC": roc_auc,
                        "F1 Score": f1,
                        self.custom_metric.__name__: custom_metric,
                        "Time taken": time.time() - start
                    })
                else:
                    print({
                        "Model": name,
                        "Accuracy": accuracy,
                        "Balanced Accuracy": b_accuracy,
                        "ROC AUC": roc_auc,
                        "F1 Score": f1,
                        "Time taken": time.time() - start
                    })
            if self.predictions:
                predictions[name] = y_pred
        except Exception as exception:
            # A failing estimator must not abort the whole benchmark run.
            if not self.ignore_warnings:
                print(name + " model failed to execute")
                print(exception)

    if self.custom_metric is None:
        scores = pd.DataFrame({
            "Model": names,
            "Accuracy": Accuracy,
            "Balanced Accuracy": B_Accuracy,
            "ROC AUC": ROC_AUC,
            "F1 Score": F1,
            "Time Taken": TIME
        })
    else:
        scores = pd.DataFrame({
            "Model": names,
            "Accuracy": Accuracy,
            "Balanced Accuracy": B_Accuracy,
            "ROC AUC": ROC_AUC,
            "F1 Score": F1,
            self.custom_metric.__name__: CUSTOM_METRIC,
            "Time Taken": TIME
        })
    scores = scores.sort_values(by='Balanced Accuracy',
                                ascending=False).set_index('Model')

    # BUG FIX: the original ended with
    #     return scores, predictions_df if self.predictions == True else scores
    # which parses as `return scores, (...)` and therefore ALWAYS returned a
    # 2-tuple — (scores, scores) when predictions were disabled. Return the
    # bare scores frame in that case, as documented.
    if self.predictions:
        predictions_df = pd.DataFrame.from_dict(predictions)
        return scores, predictions_df
    return scores
numerical_transformer = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median', verbose=1)), ('Scaler', StandardScaler())], verbose=True) # Impute and One Hot Encode categorical features categorical_transformer = Pipeline(steps=[ ('Imputer', SimpleImputer(strategy='constant', fill_value='missing', verbose=1)), ('Onehot', OneHotEncoder(handle_unknown='ignore', sparse=True)) ], verbose=True) # Preprocessor operations preprocessor = ColumnTransformer(transformers=[ ('Numerical Data', numerical_transformer, numerical_features), ('Categorical Data', categorical_transformer, categorical_features) ], verbose=True) # Linear Regression Pipeline: Preprocess -> Ridge Regression lr = Pipeline(steps=[('Preprocessor', preprocessor), ('Ridge Regression', Ridge(alpha=0.5, fit_intercept=True, solver='sag'))], verbose=True) # Create x features and y features x_data = dataset_training.drop(['Instance', 'Income in EUR'], axis=1) y_data = dataset_training['Income in EUR'] # Split data 70/30 x_train, x_test, y_train, y_real = train_test_split(x_data,
# Separate the target and drop the redundant encoded education column.
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

# NOTE: train_size=0.2 keeps only 20% of the rows for training; the
# remaining 80% form the test set.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the object columns, mapping unseen categories to -1;
# numeric columns pass through. sparse_threshold=0 forces a dense output.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat_preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough',
    sparse_threshold=0)

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([("preprocessor", preprocessor),
                  ("classifier",
                   HistGradientBoostingClassifier(random_state=42))])

# %% [markdown]
# # Use the previously defined model (called `model`) and using two nested `for`
# loops, make a search of the best combinations of the `learning_rate` and
# `max_leaf_nodes` parameters. In this regard, you will need to train and test
# Mean-impute the numeric columns 1-2 in place.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Old Version
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# labelencoder_X = LabelEncoder()
# X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# onehotencoder = OneHotEncoder(categorical_features = [0])
# X = onehotencoder.fit_transform(X).toarray()

# New Version
# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([("Country", OneHotEncoder(), [0])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
# Parse both date columns with the (module-level) `format` string.
df['issue_date'] = [datetime.datetime.strptime(x, format)
                    for x in df['issue_date']]
df['listing_date'] = [datetime.datetime.strptime(x, format)
                      for x in df['listing_date']]

# Turnaround time between issue and listing, expressed in days.
t = pd.DataFrame()
t['TA Time'] = df['listing_date'] - df['issue_date']
t['TA Time'] = t['TA Time'] / np.timedelta64(1, 'D')

# Feature matrix: selected raw columns plus the turnaround column.
X = df.iloc[:, [3, 4, 5, 6, 7, 8]].values
t = t.iloc[:].values
X = np.append(X, t, axis=1)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Label-encode column 1 (color_type), then expand it to one-hot dummies.
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])
ct = ColumnTransformer([('color_type', OneHotEncoder(), [1])],
                       remainder='passthrough')
X = ct.fit_transform(X)
X = X.astype(float)

y = df.iloc[:, 10].values

#df["height(cm)"] = df['height(cm)'] / 100
#df.rename(columns = {"height(cm)" : "height(m)"}, inplace = True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

from sklearn.linear_model import LogisticRegression
# PREPROCESSING PIPELINES
# numerical features are in same positions in feature set 1 & 2, so only
# need to define one list & preprocessor
feat_num = [
    'short_pct_mean',
    'plan_actual_diff_abs_max',
    'trans_count',
    'time_since_registration',
    'song_pca',  # num_songs_mean in feature set 2
    'transactions_pca',
]  # 'actual_amount_paid_mode' in feature set 2
feat_num_idx = [list(df_feat1.columns).index(x) for x in feat_num]

# Scaling preprocessor for the numeric features.
preproc_scale = ColumnTransformer(
    transformers=[('num', StandardScaler(), feat_num_idx)])

# For models that don't require scaling, pass these features through as-is.
preproc_num_pass = ColumnTransformer(
    transformers=[('num', 'passthrough', feat_num_idx)])

# Categorical features: one-hot encode registration channel.
cat_cols = ['registered_via']
cat_cols_idx = [list(df_feat1.columns).index(x) for x in cat_cols]
preproc_ohe = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(categories='auto'), cat_cols_idx)])

# Fit once to recover the generated one-hot feature names.
preproc_ohe.fit(df_feat1)
feat_ohe = preproc_ohe.named_transformers_['cat'].get_feature_names()
def test_column_transformer_dataframe():
    """ColumnTransformer column selection semantics on pandas DataFrames."""
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    # Each case pairs a column specifier with the expected selection result.
    cases = [
        # String keys: label based
        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),
        # int keys: positional
        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    # Two label-based transformers: no remainder entry is appended when
    # every column is consumed.
    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # Same, but with positional column specifiers.
    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([
        transformer_weights['trans1'] * X_df['first'],
        transformer_weights['trans2'] * X_df['second']
    ]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    # NOTE(review): this assertion (and the two like it below) inspects `ct`,
    # the transformer from the previous section, not `both` — looks like a
    # copy-paste slip; confirm whether `both` was intended.
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, (pd.DataFrame, pd.Series))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)

    # The dropped remainder is recorded as an explicit final entry.
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
# Split the census frame into features (columns 0-13) and target (column 14).
previsores = census.iloc[:, 0:14].values
classe = census.iloc[:, 14].values

# Label-encode every categorical feature column in place (the same encoder
# instance is re-fit per column; only the transformed values are kept).
labelEncoderPrevisores = LabelEncoder()
# labels = labelEncoderPrevisores.fit_transform(previsores[:, 1])
for col in (1, 3, 5, 6, 7, 8, 9, 13):
    previsores[:, col] = labelEncoderPrevisores.fit_transform(
        previsores[:, col])

# Dummy variables
# etnia = census.iloc[:, 8].values
# etnia = labelEncoderPrevisores.fit_transform(etnia)

# Column Transformer: one-hot expand the encoded categorical columns and
# keep everything else unchanged.
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

labelEncoder_Classe = LabelEncoder()
classe = labelEncoder_Classe.fit_transform(classe)

# Feature scaling
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
def lab():
    """Flask view: read the form inputs, preprocess them with the same
    pipeline as the training data, and query a deployed model on Google
    AI Platform, rendering the prediction.
    """
    form = LabForm()
    if form.validate_on_submit():
        # Assemble a single-row feature matrix from the submitted form.
        X_test = np.array([[
            float(form.latitude.data),
            float(form.longitude.data),
            str(form.month.data),
            str(form.day.data),
            float(form.avg.data),
            float(form.max.data),
            float(form.wind_s.data),
            float(form.wind_avg.data)
        ]])
        print(X_test.shape)

        fires = pd.read_csv('datasets/sanbul-5.csv', sep=',')
        X_test = pd.DataFrame(X_test, columns=[
            'latitude', 'longitude', 'month', 'day', 'avg_temp',
            'max_temp', 'max_wind_speed', 'avg_wind'
        ])
        print(X_test)

        from sklearn.model_selection import train_test_split
        # NOTE(review): this random split is computed but never used — the
        # stratified split below supersedes it.
        train_set, test_set = train_test_split(fires, test_size=0.2,
                                               random_state=42)

        from sklearn.model_selection import StratifiedShuffleSplit
        # Stratify on month so both sets share the month distribution.
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                       random_state=42)
        for train_index, test_index in split.split(fires, fires["month"]):
            strat_train_set = fires.loc[train_index]
            strat_test_set = fires.loc[test_index]

        fires = strat_train_set.drop(
            ["burned_area"], axis=1)  # drop labels for training set
        fires_labels = strat_train_set["burned_area"].copy()
        fires_num = fires.drop(["month", "day"], axis=1)

        from sklearn.preprocessing import OneHotEncoder
        # NOTE(review): each encoder below is fit twice (sparse then dense)
        # and the resulting arrays are never consumed — only full_pipeline's
        # own encoder matters for the final features.
        cat_encoder = OneHotEncoder()
        fires_cat = fires[["month"]]
        fires_cat_1hot = cat_encoder.fit_transform(fires_cat)
        cat_encoder = OneHotEncoder(sparse=False)
        fires_cat_1hot = cat_encoder.fit_transform(fires_cat)

        cat_encoder2 = OneHotEncoder()
        fires_cat = fires[["day"]]
        fires_cat_1hot_2 = cat_encoder2.fit_transform(fires_cat)
        cat_encoder2 = OneHotEncoder(sparse=False)
        fires_cat_1hot_2 = cat_encoder2.fit_transform(fires_cat)

        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler
        num_pipeline = Pipeline([
            ('std_scaler', StandardScaler()),
        ])
        fires_num_tr = num_pipeline.fit_transform(fires_num)

        from sklearn.compose import ColumnTransformer
        num_attribs = list(fires_num)
        cat_attribs = ["month", "day"]
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
        # Fit on the stratified training rows, then transform the form row.
        fires_prepared = full_pipeline.fit_transform(fires)
        X_test = full_pipeline.transform(X_test)

        # Build the AI Platform model path and issue the online prediction.
        MODEL_NAME = "my_sanbul_model"
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "term-224506-9bc8286b5d7b.json"
        project_id = 'term-224506'
        model_id = MODEL_NAME
        model_path = "projects/{}/models/{}".format(project_id, model_id)
        model_path += "/versions/v0001/"
        ml_resource = googleapiclient.discovery.build("ml", "v1").projects()
        # NOTE(review): X_test.tolist() assumes a dense array here — confirm
        # the ColumnTransformer output is not sparse for this data.
        input_data_json = {
            "signature_name": "serving_default",
            "instances": X_test.tolist()
        }
        request = ml_resource.predict(name=model_path, body=input_data_json)
        response = request.execute()
        print("\nresponse:\n", response)
        if "error" in response:
            raise RuntimeError(response["error"])

        # Extract the first prediction from the service response.
        predD = np.array([pred['dense_1'] for pred in response["predictions"]])
        print(predD[0][0])
        res = predD[0][0]
        return render_template('result.html', res=res)
    return render_template('prediction.html', form=form)
# Gerer les variables categorical_transformer = Pipeline(steps=[ ('onehot', OneHotEncoder(sparse=False))]) numeric_transformer = Pipeline(steps=[ ('scaler', StandardScaler())]) bool_transformer = Pipeline(steps=[ ('select_bool', PandasDataFrameSelector(binary_features)), ('scale', StandardScaler())]) preprocessor = ColumnTransformer( remainder = 'passthrough', transformers=[ ('num', numeric_transformer, numerical_features), ('cat', categorical_transformer, categorical_features), ('binary', bool_transformer, binary_features)]) model = preprocessor.fit_transform(Xtrain_new) model.shape #?????????????????????????????????????????????????????????????// ml_pipe=Pipeline([('transform', preprocessor), ('lin_reg',LinearRegression())]) ml_pipe.fit(X_train, y_train) ml_pipe.score(X_train, y_train)
# Per-column transformation pipelines (the step names are just labels).
pipeline_AnimalType_ChangeAnimalType = Pipeline([
    ('mohammed3', TransformationWrapper(transformation=convertAnimalType))
])
pipeline_SexuponOutcome_ChangeSexUponOutcome = Pipeline([
    ('mohammed4', TransformationWrapper(transformation=convertSexUponOutcome)),
    ('encode', OneHotEncoder(categories='auto', sparse=False))
])
pipeline_changeBreed = Pipeline([
    ('mohammed5', TransformationWrapper(transformation=convertBreed))
])

# One transformer per raw column; untouched columns pass through.
full_pipeline = ColumnTransformer(
    [("bilal", pipeline_ageuponoutcome_changeToWeeks, "AgeuponOutcome"),
     ("Xiangyi", pipeline_AnimalType_ChangeAnimalType, "AnimalType"),
     ("Mohammed", pipeline_SexuponOutcome_ChangeSexUponOutcome,
      "SexuponOutcome"),
     ('breed', pipeline_changeBreed, "Breed")],
    remainder='passthrough')

# Output column names, in ColumnTransformer output order.
# NOTE(review): this hard-coded list must match the one-hot categories
# produced above — confirm it stays in sync with the data.
columns = [
    "AgeuponOutcome", "AnimalType", "Neutered Male", "Spayed Female",
    "Intact Male", "Intact Female", "Unknown", "Mix"
]
#columns = ["AgeuponOutcome"]

# Fit on train only; re-use the fitted pipeline on the test set.
X_train = pd.DataFrame(full_pipeline.fit_transform(X_train), columns=columns)
X_test = pd.DataFrame(full_pipeline.transform(X_test), columns=columns)

# Re-attach the features prepared elsewhere.
X_train_all = pd.concat([X_train, X_train1], axis=1)
X_test_all = pd.concat([X_test, X_test1], axis=1)

print("hello")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data: fill missing values with the constant
# default (0 for numeric input).
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: impute with the most frequent value,
# then one-hot encode; categories unseen at transform time become all-zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)

# from sklearn.metrics import mean_absolute_error
# Bundle preprocessing and modeling code in a pipeline.
# NOTE: this statement is truncated here — the remaining pipeline steps
# continue beyond this chunk of the file.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
# Handle missing values (NAs): replace NaNs in columns 1-2 with the column
# mean, computed over X itself.
from sklearn.impute import SimpleImputer

# The deprecated `verbose` argument (removed in scikit-learn 1.3) has been
# dropped; its previous value of 0 matched the default behaviour exactly.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encode categorical data.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Integer-encode column 0 in place, then expand it into dummy columns.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    # The column numbers to be transformed (here [0] but can be [0, 1, 3]).
    remainder='passthrough'  # Leave the rest of the columns untouched.
)
# `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `float` (numpy treats it as float64) is the drop-in replacement.
X = np.array(ct.fit_transform(X), dtype=float)

# Integer-encode the target labels.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Split the dataset into training and test sets (80/20 split, fixed seed).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature scaling
# First let's time the pipeline we used in the main notebook to serve as a # reference: # %% # %%time from sklearn.model_selection import cross_validate from sklearn.pipeline import make_pipeline from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OrdinalEncoder from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) preprocessor = ColumnTransformer( [('categorical', categorical_preprocessor, categorical_columns)], remainder="passthrough") model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) cv_results = cross_validate(model, data, target) scores = cv_results["test_score"] print("The mean cross-validation accuracy is: " f"{scores.mean():.3f} +/- {scores.std():.3f}") # %% [markdown] # ## Scaling numerical features # %% # %%time from sklearn.preprocessing import StandardScaler
# Innermost of nested strategy loops: `garage_strategy`, `lot_area_strategy`,
# `imputer_garage` and `imputer_lotarea` come from enclosing loops that are
# outside this chunk. NOTE(review): original indentation was lost; the
# nesting below is reconstructed — confirm against the full file.
for mas_vnr_area_strategy in impute_mas_vnr_area:
    # MasVnrArea imputer: column mean vs. constant 0.
    if mas_vnr_area_strategy == "mean":
        imputer_vnr = Pipeline(
            steps=[('imputer', SimpleImputer(strategy='mean'))])
    elif mas_vnr_area_strategy == "0":
        imputer_vnr = Pipeline(
            steps=[('imputer',
                    SimpleImputer(strategy="constant", fill_value=0))])
    # Record which strategy triple produced the preprocessor appended below,
    # so results can be matched back to their configuration.
    imputation_combinations.append(
        [garage_strategy, lot_area_strategy, mas_vnr_area_strategy])
    preprocessors.append(
        ColumnTransformer(transformers=[
            ('imputer_garage', imputer_garage, ["GarageYrBlt"]),
            ('imputer_lotarea', imputer_lotarea, ["LotFrontage"]),
            ('imputer_vnr', imputer_vnr, ["MasVnrArea"])
        ],
                          remainder='passthrough'))

#final_train_data = []
#final_valid_data = []
#imputation_strategies = []

# Evaluate every collected preprocessor against every imputation method.
for i in range(0, len(preprocessors)):
    preprocessor = preprocessors[i]
    for imputation_method in imputation_methods:
        # New DataFrames with possibly added columns; copies keep the
        # originals untouched across iterations.
        X_train_with_new_cols = X_train.copy()
        X_valid_with_new_cols = X_valid.copy()
        X_test_with_new_cols = X_test.copy()
        # Add new columns in method is "extended"
# Impute NaNs in columns 1-2 with the column mean. `imp` is an alias
# imported earlier in the file — presumably SimpleImputer; TODO confirm.
imputer = imp(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# avoiding Dummy variable/redundant dependancy
# NOTE(review): this drop happens *before* the one-hot encoding below;
# normally the redundant dummy column is dropped *after* encoding — verify
# the intended order.
X = X[:, 1:]

# Encoding categorical data
# Encoding the Independent Variable: one-hot encode column 0, pass the rest.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
enc = ColumnTransformer([('Position', OneHotEncoder(), [0])],
                        remainder='passthrough')
X = enc.fit_transform(X)

# Splitting dataset into train_set and test_set (80/20 split, fixed seed).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
# Scale the full matrix (cast to float first). NOTE(review): this fits on
# all of X, not just X_train — potential train/test leakage; confirm.
X = sc_X.fit_transform(X.astype(float))
#Matrix of features X = dataset.iloc[:, 3:13].values #Dependent variable vector y = dataset.iloc[:, 13].values # Label Encoding the "Gender" column from sklearn.preprocessing import LabelEncoder le = LabelEncoder() X[:, 2] = le.fit_transform(X[:, 2]) # One Hot Encoding the "Geography" column from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough') X = np.array(ct.fit_transform(X)) X = X[:, 1:] #splitting dataset training set and test set from sklearn.model_selection import train_test_split X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0) #feature scaling from sklearn.preprocessing import StandardScaler sc_x = StandardScaler() X_train = sc_x.fit_transform(X_train) X_test = sc_x.transform(X_test) #creating leyars import keras
# Multiple Linear Regression # Importing the libraries import os import pandas as pd # Importing the dataset dataset = pd.read_csv( os.path.join(os.path.abspath(''), 'MultipleLinearRegression', 'Garch.csv')) X = dataset.iloc[:, 1:8].values y = dataset.iloc[:, 8].values # Encoding categorical data from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder ct = ColumnTransformer([("Name_Of_Your_Step", OneHotEncoder(), [1])], remainder="passthrough") X = ct.fit_transform(X) # Avoiding the Dummy Variable Trap X = X[:, 1:] # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Fitting Multiple Linear Regression to the Training set from sklearn.linear_model import LinearRegression regressor = LinearRegression()