Example #1
import numpy as np
import pandas as pd

df = pd.read_csv('train.csv')
df_sub = pd.read_csv('test.csv')
case_id = df_sub['id']
df_sub = df_sub.drop(['id'], axis=1)

X = df.iloc[:, 1:11].values
y = df.iloc[:, 11].values
X_sub = df_sub.iloc[:, :].values

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ohe = OneHotEncoder(sparse=False)  # dense output so the column slicing below works
ctX = ColumnTransformer([('X', ohe, [0, 5, 6])], remainder='passthrough')
X = ctX.fit_transform(X)
X_sub = ctX.transform(X_sub)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X[:, [7, 11, 13]] = sc.fit_transform(X[:, [7, 11, 13]])
X_sub[:, [7, 11, 13]] = sc.transform(X_sub[:, [7, 11, 13]])

# Since the dataset is heavily imbalanced, the model always predicts 0, so we weight the classes
neg, pos = np.bincount(y)
total = neg + pos
w0 = (1 / neg) * (total) / 2
w1 = (1 / pos) * (total) / 2
weights = {0: w0, 1: w1}
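
# The same weights can come from scikit-learn's helper; a minimal sketch,
# assuming binary labels 0/1 in y ('balanced' computes total / (2 * count),
# which matches the manual formula above):
from sklearn.utils.class_weight import compute_class_weight

w_balanced = compute_class_weight(class_weight='balanced',
                                  classes=np.array([0, 1]), y=y)
weights = {0: w_balanced[0], 1: w_balanced[1]}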
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
previsores[:, 3] = labelencoder.fit_transform(previsores[:, 3])
previsores[:, 5] = labelencoder.fit_transform(previsores[:, 5])
previsores[:, 6] = labelencoder.fit_transform(previsores[:, 6])
previsores[:, 7] = labelencoder.fit_transform(previsores[:, 7])
previsores[:, 8] = labelencoder.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder.fit_transform(previsores[:, 9])
previsores[:, 13] = labelencoder.fit_transform(previsores[:, 13])

classe = labelencoder.fit_transform(classe)


# Instantiate the OneHotEncoder inside a ColumnTransformer
onehotencoder = ColumnTransformer(
    transformers=[(
        "OneHot",
        OneHotEncoder(),
        [1,3,5,6,7,8,9,13])],
    remainder='passthrough'
    )

previsores = onehotencoder.fit_transform(previsores).toarray()

scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split
previsores_train, previsores_test, classe_train, classe_test = train_test_split(
        previsores,
        classe,
        test_size=0.25,
        random_state=0)
Example #3
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

transformers = []
cols_with_missing_string_data = []
for column in cols_with_missing_vals:
    # Check the column's dtype in X_train (which itself has no missing data)
    if not is_numeric_dtype(X_train[column]):
        imputer = Pipeline(steps=[(
            'imputer',
            SimpleImputer(strategy='constant', fill_value='missing_value'))])
    else:
        imputer = Pipeline(
            steps=[('imputer',
                    SimpleImputer(strategy='constant', fill_value=0))])
    transformers.append((column, imputer, [column]))

preprocessor = ColumnTransformer(transformers, remainder='passthrough')
'''
# We will add an empty row to training data, validation data so that we may encode the missing values
new_row = pd.Series(name='NameOfNewRow')

X_train.append(new_row)
X_valid.append(new_row)
X_test.append(new_row)
'''
preprocessor.fit(X_train)

# Simple imputation
imputed_X_train = pd.DataFrame(preprocessor.transform(X_train))
imputed_X_valid = pd.DataFrame(preprocessor.transform(X_valid))
imputed_X_test = pd.DataFrame(preprocessor.transform(X_test))
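
# ColumnTransformer drops column labels and moves the transformed columns to
# the front; a minimal sketch to restore names, assuming X_train is a DataFrame:
remainder_cols = [c for c in X_train.columns if c not in cols_with_missing_vals]
imputed_X_train.columns = list(cols_with_missing_vals) + remainder_cols
imputed_X_valid.columns = imputed_X_train.columns
imputed_X_test.columns = imputed_X_train.columns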
# ======================
# Preprocessing Pipeline
# ======================

import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

y = data['Fertilizer Name'].copy()
X = data.drop('Fertilizer Name', axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.7,
                                                    shuffle=True,
                                                    random_state=1)

# Note: in scikit-learn >= 1.2 the OneHotEncoder argument is sparse_output=False
nominal_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False))])

preprocessor = ColumnTransformer(transformers=[('nominal', nominal_transformer,
                                                ['Soil Type', 'Crop Type'])],
                                 remainder='passthrough')

model = Pipeline(steps=[(
    'preprocessor',
    preprocessor), ('scaler',
                    StandardScaler()), ('classifier',
                                        RandomForestClassifier())])

# ======================
# Training
# ======================

model.fit(X_train, y_train)
joblib.dump(model, 'fertilizer_pred.pkl')
print("Test Accuracy: {:.2f}%".format(model.score(X_test, y_test) * 100))
Example #5
    def __init__(
        self,
        Estimator,
        numeric_features,
        categorical_features,
        response,
        kwargs=None,
        sparse=False,
    ):
        """

        """
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.features = numeric_features + categorical_features
        self.response = response

        ordinal_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="constant",
                                      fill_value="missing")),
            ("ordinal", OrdinalEncoder()),
        ])

        # Standardize numeric
        prenumeric = ColumnTransformer([
            (
                "numimp",
                SimpleImputer(),
                [self.features.index(x) for x in numeric_features],
            ),
            (
                "cat",
                ordinal_pipe,
                [self.features.index(x) for x in categorical_features],
            ),
        ])

        # Round to int to avoid issues with decision boundary sampling non integers
        # for the range.
        # One Hot for most Sci-kit Learn functions.
        one_hot_pipe = Pipeline([
            ("rint", FunctionTransformer(np.rint)),
            ("onehot", OneHotEncoder(sparse=sparse, handle_unknown="ignore")),
        ])

        preprocessing = ColumnTransformer([
            (
                "num",
                RobustScaler(),
                [self.features.index(x) for x in numeric_features],
            ),
            (
                "onepipe",
                one_hot_pipe,
                [self.features.index(x) for x in categorical_features],
            ),
        ])

        if kwargs is None:
            kwargs = {}
        pipe = Pipeline([
            ("prenumeric", prenumeric),
            ("preprocess", preprocessing),
            ("estimator", Estimator(**kwargs)),
        ])
        self.Estimator = Estimator
        self.pipe = pipe
        self.transform_numeric = pipe.named_steps["prenumeric"]
        self.numeric_pipe = Pipeline([
            ("preprocess", pipe.named_steps["preprocess"]),
            ("estimator", pipe.named_steps["estimator"]),
        ])
import numpy as np

# Cast features and labels to integers (x must be array-like of rows)
x = np.array(x).astype(int)
y = np.array(y).astype(int)

# Recode categories: 2 -> 0 in column 1; clip out-of-range values in columns 2 and 3
x[:, 1][x[:, 1] == 2] = 0
x[:, 2][(x[:, 2] > 4) | (x[:, 2] == 0)] = 4
x[:, 3][(x[:, 3] > 3) | (x[:, 3] == 0)] = 3

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cT = ColumnTransformer(transformers=[('cT',OneHotEncoder(categories='auto',drop='first'),[2,3])],remainder='passthrough')
x = cT.fit_transform(x)

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)


from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)  # reuse the scaler fitted on the training set

import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix
def test_column_transformer_no_estimators_set_params():
    ct = ColumnTransformer([]).set_params(n_jobs=2)
    assert ct.n_jobs == 2
Example #8
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

cat_features = [col for col in data.columns if col not in numerical_features]
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

categorical_columns

scaler_imputer_transformer = make_pipeline(StandardScaler(),
                                           SimpleImputer(strategy='mean'))
cat_ohe_imputer_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore'))
preprocessor = ColumnTransformer(transformers=[("num-preprocessor",
                                                scaler_imputer_transformer,
                                                numerical_features),
                                               ("cat-preprocessor",
                                                cat_ohe_imputer_transformer,
                                                categorical_columns)])
model = make_pipeline(preprocessor, LogisticRegression())
cv_result = cross_validate(model, data, target, cv=5)
cv_result
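
# cross_validate returns a dict of arrays; a quick summary (sketch):
scores = cv_result["test_score"]
print(f"CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")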

model
Example #9
              label="population",
              figsize=(10, 7),
              c="median_house_value",
              cmap=plt.get_cmap("jet"),
              colorbar=True)
    plt.legend()
    plt.savefig("heat_map_housing.png")


import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

housing = load_housing_data()
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
train, test = create_split(housing)
housing = train.copy()
housing_labels = train["median_house_value"].copy()
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                   ("cat", OneHotEncoder(), cat_attribs)])
housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
Example #10
                                                    test_size=0.25,
                                                    random_state=42)
# -

X_train.shape

# +
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([('std_scaler', StandardScaler())])

num_attribs = list(X_train)

full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs)])
# -

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
X_test_final_prepared = full_pipeline.transform(X_test_final)

X_train_prepared = pd.DataFrame(X_train_prepared, columns=num_attribs)
X_test_prepared = pd.DataFrame(X_test_prepared, columns=num_attribs)
X_test_final_prepared = pd.DataFrame(X_test_final_prepared,
                                     columns=num_attribs)
'''
# Scaling
from sklearn.preprocessing import StandardScaler
x_col = X.columns
scaler = StandardScaler()
'''
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    # --- SimpleImputer is not available for strings in ONNX-ML specifications. 
    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])


clf.fit(X_train, y_train)

##################################
# Define the inputs of the ONNX graph
# +++++++++++++++++++++++++++++++++++
#
# *sklearn-onnx* does not know the features used to train the model
# but it needs to know which feature has which name.
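
# A sketch of declaring those inputs for skl2onnx (one named tensor per
# column; the exact tensor types are assumptions that must match the
# dataframe dtypes):
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import (FloatTensorType, Int64TensorType,
                                        StringTensorType)

initial_types = [('pclass', Int64TensorType([None, 1])),
                 ('sex', StringTensorType([None, 1])),
                 ('age', FloatTensorType([None, 1])),
                 ('fare', FloatTensorType([None, 1])),
                 ('embarked', StringTensorType([None, 1]))]
onnx_model = convert_sklearn(clf, initial_types=initial_types)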
Example #12
    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows are samples and columns are features.
        X_test : array-like,
            Testing vectors, where rows are samples and columns are features.
        y_train : array-like,
            Training labels, one per training sample.
        y_test : array-like,
            Testing labels, one per testing sample.
        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        R2 = []
        RMSE = []
        # WIN = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric != None:
            CUSTOM_METRIC = []

        if type(X_train) is np.ndarray:
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(
            include=['int64', 'float64', 'int32', 'float32']).columns
        categorical_features = X_train.select_dtypes(
            include=['object']).columns

        preprocessor = ColumnTransformer(
            transformers=[('numeric', numeric_transformer, numeric_features),
                          ('categorical', categorical_transformer,
                           categorical_features)])

        for name, model in tqdm(REGRESSORS):
            start = time.time()
            try:
                if 'random_state' in model().get_params().keys():
                    pipe = Pipeline(
                        steps=[('preprocessor', preprocessor),
                               ('regressor',
                                model(random_state=self.random_state))])
                else:
                    pipe = Pipeline(
                        steps=[('preprocessor',
                                preprocessor), ('regressor', model())])
                pipe.fit(X_train, y_train)
                y_pred = pipe.predict(X_test)
                r_squared = r2_score(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                names.append(name)
                R2.append(r_squared)
                RMSE.append(rmse)
                TIME.append(time.time() - start)
                if self.custom_metric != None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)

                if self.verbose > 0:
                    if self.custom_metric != None:
                        print({
                            "Model": name,
                            "R-Squared": r_squared,
                            "RMSE": rmse,
                            self.custom_metric.__name__: custom_metric,
                            "Time taken": time.time() - start
                        })
                    else:
                        print({
                            "Model": name,
                            "R-Squared": r_squared,
                            "RMSE": rmse,
                            "Time taken": time.time() - start
                        })
                if self.predictions == True:
                    predictions[name] = y_pred
            except Exception as exception:
                if self.ignore_warnings == False:
                    print(name + " model failed to execute")
                    print(exception)

        if self.custom_metric == None:
            scores = pd.DataFrame({
                "Model": names,
                "R-Squared": R2,
                "RMSE": RMSE,
                "Time Taken": TIME
            })
        else:
            scores = pd.DataFrame({
                "Model": names,
                "R-Squared": R2,
                "RMSE": RMSE,
                self.custom_metric.__name__: CUSTOM_METRIC,
                "Time Taken": TIME
            })
        scores = scores.sort_values(by='R-Squared',
                                    ascending=False).set_index('Model')

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores
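
# A usage sketch for the method above, assuming it belongs to lazypredict's
# LazyRegressor (constructor arguments as documented by that project):
from lazypredict.Supervised import LazyRegressor

reg = LazyRegressor(verbose=0, ignore_warnings=True, predictions=True)
scores, predictions = reg.fit(X_train, X_test, y_train, y_test)
print(scores.head())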
Example #13
    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test.
        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows are samples and columns are features.
        X_test : array-like,
            Testing vectors, where rows are samples and columns are features.
        y_train : array-like,
            Training labels, one per training sample.
        y_test : array-like,
            Testing labels, one per testing sample.
        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        Accuracy = []
        B_Accuracy = []
        ROC_AUC = []
        F1 = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric != None:
            CUSTOM_METRIC = []

        if type(X_train) is np.ndarray:
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(
            include=['int64', 'float64', 'int32', 'float32']).columns
        categorical_features = X_train.select_dtypes(
            include=['object']).columns

        preprocessor = ColumnTransformer(
            transformers=[('numeric', numeric_transformer, numeric_features),
                          ('categorical', categorical_transformer,
                           categorical_features)])

        for name, model in tqdm(CLASSIFIERS):
            start = time.time()
            try:
                if 'random_state' in model().get_params().keys():
                    pipe = Pipeline(
                        steps=[('preprocessor', preprocessor),
                               ('classifier',
                                model(random_state=self.random_state))])
                else:
                    pipe = Pipeline(
                        steps=[('preprocessor',
                                preprocessor), ('classifier', model())])

                pipe.fit(X_train, y_train)
                y_pred = pipe.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred, normalize=True)
                b_accuracy = balanced_accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='weighted')
                try:
                    roc_auc = roc_auc_score(y_test, y_pred)
                except Exception as exception:
                    roc_auc = None
                    if self.ignore_warnings == False:
                        print("ROC AUC couldn't be calculated for " + name)
                        print(exception)
                names.append(name)
                Accuracy.append(accuracy)
                B_Accuracy.append(b_accuracy)
                ROC_AUC.append(roc_auc)
                F1.append(f1)
                TIME.append(time.time() - start)
                if self.custom_metric != None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)
                if self.verbose > 0:
                    if self.custom_metric != None:
                        print({
                            "Model": name,
                            "Accuracy": accuracy,
                            "Balanced Accuracy": b_accuracy,
                            "ROC AUC": roc_auc,
                            "F1 Score": f1,
                            self.custom_metric.__name__: custom_metric,
                            "Time taken": time.time() - start
                        })
                    else:
                        print({
                            "Model": name,
                            "Accuracy": accuracy,
                            "Balanced Accuracy": b_accuracy,
                            "ROC AUC": roc_auc,
                            "F1 Score": f1,
                            "Time taken": time.time() - start
                        })
                if self.predictions == True:
                    predictions[name] = y_pred
            except Exception as exception:
                if self.ignore_warnings == False:
                    print(name + " model failed to execute")
                    print(exception)
        if self.custom_metric == None:
            scores = pd.DataFrame({
                "Model": names,
                "Accuracy": Accuracy,
                "Balanced Accuracy": B_Accuracy,
                "ROC AUC": ROC_AUC,
                "F1 Score": F1,
                "Time Taken": TIME
            })
        else:
            scores = pd.DataFrame({
                "Model": names,
                "Accuracy": Accuracy,
                "Balanced Accuracy": B_Accuracy,
                "ROC AUC": ROC_AUC,
                "F1 Score": F1,
                self.custom_metric.__name__: CUSTOM_METRIC,
                "Time Taken": TIME
            })
        scores = scores.sort_values(by='Balanced Accuracy',
                                    ascending=False).set_index('Model')

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores
numerical_transformer = Pipeline(steps=[('Imputer',
                                         SimpleImputer(strategy='median',
                                                       verbose=1)),
                                        ('Scaler', StandardScaler())],
                                 verbose=True)
# Impute and One Hot Encode categorical features
categorical_transformer = Pipeline(steps=[
    ('Imputer',
     SimpleImputer(strategy='constant', fill_value='missing', verbose=1)),
    ('Onehot', OneHotEncoder(handle_unknown='ignore', sparse=True))
],
                                   verbose=True)
# Preprocessor operations
preprocessor = ColumnTransformer(transformers=[
    ('Numerical Data', numerical_transformer, numerical_features),
    ('Categorical Data', categorical_transformer, categorical_features)
],
                                 verbose=True)

# Linear Regression Pipeline: Preprocess -> Ridge Regression
lr = Pipeline(steps=[('Preprocessor', preprocessor),
                     ('Ridge Regression',
                      Ridge(alpha=0.5, fit_intercept=True, solver='sag'))],
              verbose=True)

# Create x features and y features
x_data = dataset_training.drop(['Instance', 'Income in EUR'], axis=1)
y_data = dataset_training['Income in EUR']

# Split data 70/30
x_train, x_test, y_train, y_real = train_test_split(x_data,
                                                    y_data,
                                                    test_size=0.3)
Example #15
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat_preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough',
    sparse_threshold=0)

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([("preprocessor", preprocessor),
                  ("classifier",
                   HistGradientBoostingClassifier(random_state=42))])

# %% [markdown]
#
# Use the previously defined model (called `model`) and using two nested `for`
# loops, make a search of the best combinations of the `learning_rate` and
# `max_leaf_nodes` parameters. In this regard, you will need to train and test
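
# %%
# A sketch of that nested-loop search (the parameter grids are illustrative
# assumptions; 'classifier' is the pipeline step name defined above):
best_score, best_params = -1.0, None
for learning_rate in (0.01, 0.1, 1.0):
    for max_leaf_nodes in (3, 10, 30):
        model.set_params(classifier__learning_rate=learning_rate,
                         classifier__max_leaf_nodes=max_leaf_nodes)
        model.fit(data_train, target_train)
        score = model.score(data_test, target_test)
        if score > best_score:
            best_score, best_params = score, (learning_rate, max_leaf_nodes)
print(f"Best {best_params} with accuracy {best_score:.3f}")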
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Old Version
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#labelencoder_X = LabelEncoder()
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
#onehotencoder = OneHotEncoder(categorical_features = [0])
#X = onehotencoder.fit_transform(X).toarray()

# New Version
# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Country", OneHotEncoder(), [0])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Example #17
# 'format' is assumed to hold the date format string for these columns
df['issue_date'] = [datetime.datetime.strptime(x, format) for x in df['issue_date']]
df['listing_date'] = [datetime.datetime.strptime(x, format) for x in df['listing_date']]

t = pd.DataFrame()
t['TA Time'] = df['listing_date'] - df['issue_date']
t['TA Time'] = t['TA Time']/np.timedelta64(1, 'D')

X = df.iloc[:,[3, 4, 5, 6, 7, 8]].values
t = t.iloc[:].values
X = np.append(X, t, axis = 1)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])
ct = ColumnTransformer([('color_type', OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)
X = X.astype(float)



y = df.iloc[:,10].values

#df["height(cm)"] = df['height(cm)'] / 100
#df.rename(columns = {"height(cm)" : "height(m)"}, inplace = True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)  


from sklearn.linear_model import LogisticRegression
# PREPROCESSING PIPELINES
# numerical features are in same positions in feature set 1 & 2, so only need to define one list & preprocessor
feat_num = [
    'short_pct_mean',
    'plan_actual_diff_abs_max',
    'trans_count',
    'time_since_registration',
    'song_pca',  # num_songs_mean in feature set 2
    'transactions_pca'
]  # 'actual_amount_paid_mode' in feature set 2

feat_num_idx = [list(df_feat1.columns).index(x) for x in feat_num]

# define Scaling preprocessor
preproc_scale = ColumnTransformer(transformers=[('num', StandardScaler(),
                                                 feat_num_idx)])

# for models that don't require scaling, we want to pass-through these features:
preproc_num_pass = ColumnTransformer(transformers=[('num', 'passthrough',
                                                    feat_num_idx)])

# Categorical:
cat_cols = ['registered_via']
cat_cols_idx = [list(df_feat1.columns).index(x) for x in cat_cols]

preproc_ohe = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(categories='auto'), cat_cols_idx)])

# fit to get feature names
preproc_ohe.fit(df_feat1)
# note: scikit-learn >= 1.0 renames this to get_feature_names_out()
feat_ohe = preproc_ohe.named_transformers_['cat'].get_feature_names()
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based

        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),

        # int keys: positional

        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),

        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([
        transformer_weights['trans1'] * X_df['first'],
        transformer_weights['trans2'] * X_df['second']
    ]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure the pandas object is passed through

    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert isinstance(X, (pd.DataFrame, pd.Series))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #20
previsores = census.iloc[:, 0:14].values
classe = census.iloc[:, 14].values

labelEncoderPrevisores = LabelEncoder()
# labels = labelEncoderPrevisores.fit_transform(previsores[:, 1])
previsores[:, 1] = labelEncoderPrevisores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoderPrevisores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoderPrevisores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoderPrevisores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoderPrevisores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoderPrevisores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoderPrevisores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoderPrevisores.fit_transform(previsores[:, 13])

# Dummy variables
# etnia = census.iloc[:, 8].values
# etnia = labelEncoderPrevisores.fit_transform(etnia)

# Column Transformer
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

labelEncoder_Classe = LabelEncoder()
classe = labelEncoder_Classe.fit_transform(classe)

# Feature scaling
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
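
# Note: OneHotEncoder accepts string categories directly, so the LabelEncoder
# passes above are not strictly required; a minimal equivalent (sketch):
previsores_alt = census.iloc[:, 0:14].values
previsores_alt = oneHotEncoder.fit_transform(previsores_alt).toarray()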
Example #21
def lab():
    form = LabForm()

    if form.validate_on_submit():
        # Build the single-row frame directly so numeric columns keep their
        # dtype (a mixed-type np.array would coerce everything to strings)
        X_test = pd.DataFrame([{
            'latitude': float(form.latitude.data),
            'longitude': float(form.longitude.data),
            'month': str(form.month.data),
            'day': str(form.day.data),
            'avg_temp': float(form.avg.data),
            'max_temp': float(form.max.data),
            'max_wind_speed': float(form.wind_s.data),
            'avg_wind': float(form.wind_avg.data)
        }])
        print(X_test.shape)
        fires = pd.read_csv('datasets/sanbul-5.csv', sep=',')
        print(X_test)

        from sklearn.model_selection import train_test_split
        train_set, test_set = train_test_split(fires,
                                               test_size=0.2,
                                               random_state=42)
        from sklearn.model_selection import StratifiedShuffleSplit
        split = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.2,
                                       random_state=42)
        for train_index, test_index in split.split(fires, fires["month"]):
            strat_train_set = fires.loc[train_index]
            strat_test_set = fires.loc[test_index]

        fires = strat_train_set.drop(["burned_area"],
                                     axis=1)  # drop labels for training set
        fires_labels = strat_train_set["burned_area"].copy()
        fires_num = fires.drop(["month", "day"], axis=1)

        from sklearn.preprocessing import OneHotEncoder
        cat_encoder = OneHotEncoder()
        fires_cat = fires[["month"]]
        fires_cat_1hot = cat_encoder.fit_transform(fires_cat)
        cat_encoder = OneHotEncoder(sparse=False)
        fires_cat_1hot = cat_encoder.fit_transform(fires_cat)

        cat_encoder2 = OneHotEncoder()
        fires_cat = fires[["day"]]
        fires_cat_1hot_2 = cat_encoder2.fit_transform(fires_cat)
        cat_encoder2 = OneHotEncoder(sparse=False)
        fires_cat_1hot_2 = cat_encoder2.fit_transform(fires_cat)

        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler

        num_pipeline = Pipeline([
            ('std_scaler', StandardScaler()),
        ])
        fires_num_tr = num_pipeline.fit_transform(fires_num)

        from sklearn.compose import ColumnTransformer
        num_attribs = list(fires_num)
        cat_attribs = ["month", "day"]
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
        fires_prepared = full_pipeline.fit_transform(fires)
        X_test = full_pipeline.transform(X_test)

        MODEL_NAME = "my_sanbul_model"
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "term-224506-9bc8286b5d7b.json"
        project_id = 'term-224506'
        model_id = MODEL_NAME
        model_path = "projects/{}/models/{}".format(project_id, model_id)
        model_path += "/versions/v0001/"
        ml_resource = googleapiclient.discovery.build("ml", "v1").projects()

        input_data_json = {
            "signature_name": "serving_default",
            "instances": X_test.tolist()
        }
        request = ml_resource.predict(name=model_path, body=input_data_json)
        response = request.execute()
        print("\nresponse:\n", response)

        if "error" in response:
            raise RuntimeError(response["error"])

        predD = np.array([pred['dense_1'] for pred in response["predictions"]])
        print(predD[0][0])
        res = predD[0][0]
        return render_template('result.html', res=res)

    return render_template('prediction.html', form=form)
                    
# Handle the variable types
categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(sparse=False))])


numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

bool_transformer = Pipeline(steps=[
        ('select_bool',  PandasDataFrameSelector(binary_features)),
        ('scale', StandardScaler())])
    
preprocessor = ColumnTransformer(
        remainder = 'passthrough',
        transformers=[
                ('num', numeric_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features),
                ('binary', bool_transformer, binary_features)])

# The transformed design matrix (this is data, not a fitted model)
Xtrain_transformed = preprocessor.fit_transform(Xtrain_new)
Xtrain_transformed.shape





ml_pipe=Pipeline([('transform', preprocessor),
                  ('lin_reg',LinearRegression())])
ml_pipe.fit(X_train, y_train)
ml_pipe.score(X_train, y_train)
Example #23
File: main.py  Project: bilal684/INF8215
pipeline_AnimalType_ChangeAnimalType = Pipeline([
    ('mohammed3', TransformationWrapper(transformation=convertAnimalType))
])

pipeline_SexuponOutcome_ChangeSexUponOutcome = Pipeline([
    ('mohammed4', TransformationWrapper(transformation=convertSexUponOutcome)),
    ('encode', OneHotEncoder(categories='auto', sparse=False))
])

pipeline_changeBreed = Pipeline([
    ('mohammed5', TransformationWrapper(transformation=convertBreed))
])

full_pipeline = ColumnTransformer(
    [("bilal", pipeline_ageuponoutcome_changeToWeeks, "AgeuponOutcome"),
     ("Xiangyi", pipeline_AnimalType_ChangeAnimalType, "AnimalType"),
     ("Mohammed", pipeline_SexuponOutcome_ChangeSexUponOutcome,
      "SexuponOutcome"), ('breed', pipeline_changeBreed, "Breed")],
    remainder='passthrough')

columns = [
    "AgeuponOutcome", "AnimalType", "Neutered Male", "Spayed Female",
    "Intact Male", "Intact Female", "Unknown", "Mix"
]
#columns = ["AgeuponOutcome"]
X_train = pd.DataFrame(full_pipeline.fit_transform(X_train), columns=columns)
X_test = pd.DataFrame(full_pipeline.transform(X_test), columns=columns)

X_train_all = pd.concat([X_train, X_train1], axis=1)
X_test_all = pd.concat([X_test, X_test1], axis=1)
print("hello")
Example #24
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

#

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

#

from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])
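
# Fit and evaluate with MAE (a sketch continuing the truncated cell;
# X_train/X_valid/y_train/y_valid are assumed from the original notebook):
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))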
# Handling missing values (NAs)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean', verbose=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encode categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
ct = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
    ],  # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'  # Leave the rest of the columns untouched
)

X = np.array(ct.fit_transform(X), dtype=float)  # np.float was removed in NumPy 1.24
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling
Example #26
# First let's time the pipeline we used in the main notebook to serve as a
# reference:

# %%
# %%time
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
cv_results = cross_validate(model, data, target)
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %% [markdown]
# ## Scaling numerical features

# %%
# %%time
from sklearn.preprocessing import StandardScaler
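
# A sketch of the scaled variant announced above; 'numerical_columns' is an
# assumed list of numeric column names from the original notebook:
preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns),
    ('numerical', StandardScaler(), numerical_columns)])
model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
cv_results = cross_validate(model, data, target)
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")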
Example #27
        for mas_vnr_area_strategy in impute_mas_vnr_area:
            if mas_vnr_area_strategy == "mean":
                imputer_vnr = Pipeline(
                    steps=[('imputer', SimpleImputer(strategy='mean'))])
            elif mas_vnr_area_strategy == "0":
                imputer_vnr = Pipeline(
                    steps=[('imputer',
                            SimpleImputer(strategy="constant", fill_value=0))])

            imputation_combinations.append(
                [garage_strategy, lot_area_strategy, mas_vnr_area_strategy])
            preprocessors.append(
                ColumnTransformer(transformers=[
                    ('imputer_garage', imputer_garage, ["GarageYrBlt"]),
                    ('imputer_lotarea', imputer_lotarea, ["LotFrontage"]),
                    ('imputer_vnr', imputer_vnr, ["MasVnrArea"])
                ],
                                  remainder='passthrough'))

#final_train_data = []
#final_valid_data = []
#imputation_strategies = []
for i in range(0, len(preprocessors)):
    preprocessor = preprocessors[i]
    for imputation_method in imputation_methods:
        # New DataFrames with possibly added columns
        X_train_with_new_cols = X_train.copy()
        X_valid_with_new_cols = X_valid.copy()
        X_test_with_new_cols = X_test.copy()

        # Add new columns if method is "extended"
Example #28
import numpy as np
from sklearn.impute import SimpleImputer as imp

imputer = imp(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])



# avoiding the dummy variable trap / redundant dependency
X = X[:, 1:]



# Encoding categorical data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
enc = ColumnTransformer([('Position', OneHotEncoder(),[0])], remainder='passthrough')
X = enc.fit_transform(X)



#Splitting dataset into train_set and test_set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)



#feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X.astype(float))
Example #29
#Matrix of features 
X = dataset.iloc[:, 3:13].values

#Dependent variable vector
y = dataset.iloc[:, 13].values

# Label Encoding the "Gender" column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])

# One Hot Encoding the "Geography" column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X = X[:, 1:]

#splitting dataset training set and test set 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

#feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

# creating layers
import keras
Example #30
# Multiple Linear Regression

# Importing the libraries
import os
import pandas as pd

# Importing the dataset
dataset = pd.read_csv(
    os.path.join(os.path.abspath(''), 'MultipleLinearRegression', 'Garch.csv'))
X = dataset.iloc[:, 1:8].values
y = dataset.iloc[:, 8].values

# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([("Name_Of_Your_Step", OneHotEncoder(), [1])],
                       remainder="passthrough")
X = ct.fit_transform(X)

# Avoiding the Dummy Variable Trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)