def test_column_transformer_mixed_cols_sparse():
    df = np.array([['a', 1, True],
                   ['b', 2, False]],
                  dtype='O')

    ct = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [1, 2]),
        sparse_threshold=1.0
    )

    # this shouldn't fail, since boolean can be coerced into a numeric
    # See: https://github.com/scikit-learn/scikit-learn/issues/11912
    X_trans = ct.fit_transform(df)
    assert X_trans.getformat() == 'csr'
    assert_array_equal(X_trans.toarray(), np.array([[1, 0, 1, 1],
                                                    [0, 1, 2, 0]]))

    ct = make_column_transformer(
        (OneHotEncoder(), [0]),
        ('passthrough', [0]),
        sparse_threshold=1.0
    )
    with pytest.raises(ValueError,
                       match="For a sparse output, all columns should"):
        # this fails since strings `a` and `b` cannot be
        # coerced into a numeric.
        ct.fit_transform(df)
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer((X_df.columns, norm))
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df))
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer(('first', scaler), (['second'], norm),
                                 n_jobs=3, remainder='drop')
    assert_equal(ct.transformers, make_column_transformer(
        ('first', scaler), (['second'], norm)).transformers)
    assert_equal(ct.n_jobs, 3)
    assert_equal(ct.remainder, 'drop')
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, ('first', scaler), (['second'], norm),
        transformer_weights={'pca': 10, 'Transf': 1}
    )
def test_make_column_transformer_remainder_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()
    ct = make_column_transformer(('first', scaler), (['second'], norm),
                                 remainder=remainder)
    assert ct.remainder == remainder
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer(('first', scaler), (['second'], norm))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))

    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer(([0], norm))
    ct2 = make_column_transformer((norm, [0]))
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    assert_almost_equal(ct1.fit_transform(X_array),
                        ct2.fit_transform(X_array))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('first', 'drop'))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('passthrough', 'passthrough'),
                                ('first', 'drop'))
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df))
Example #9
    def fit(self, X, y):
        encode_columns = [item for item in X.columns if 'suit' in item]
        scale_columns = [item for item in X.columns if item not in encode_columns]

        self.column_transformer = make_column_transformer(
            (StandardScaler(), scale_columns),
            (OneHotEncoder(categories='auto'), encode_columns))
        self.column_transformer.fit(X)

        return self
Example #10
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 26 15:27:47 2020
@author: dorian
"""
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import shap
def _merge_external_data(X):
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
#
# Since there are rare categories in this dataset we need to specifically
# encode unknown categories at prediction time in order to be able to use
# cross-validation. Otherwise some rare categories could only be present on the
# validation side of the cross-validation split and the `OrdinalEncoder` would
# raise an error when calling its `transform` method with the data points
# of the validation set.

# %%
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector

categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                     unknown_value=-1)
preprocessor = make_column_transformer(
    (categorical_encoder, make_column_selector(dtype_include=object)),
    remainder="passthrough")

# %% [markdown]
#
# We will first give a simple example where we will train a single decision
# tree classifier and check its generalization performance via cross-validation.

# %%
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

tree = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=0))

# %%
from sklearn.model_selection import cross_val_score
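
# A minimal sketch of the cross-validation step announced above; the feature matrix
# and target names (`data`, `target`) are assumptions here, since they are defined in
# an earlier, unshown part of the original notebook.
scores = cross_val_score(tree, data, target, cv=5)
print(f"Decision tree accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")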
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn_callbacks import ProgressBar

X, y = make_classification(n_samples=500000, n_features=200, random_state=0)

pipe = make_pipeline(
    SimpleImputer(),
    make_column_transformer(
        (StandardScaler(), slice(0, 80)),
        (MinMaxScaler(), slice(80, 120)),
        (StandardScaler(with_mean=False), slice(120, 180)),
    ),
    LogisticRegression(),
)

pbar = ProgressBar()
pipe._set_callbacks(pbar)

_ = pipe.fit(X, y)
# - pclass: ordinal integers {1, 2, 3}.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Provisionally, use pd.fillna() to impute missing values for categorical
# features; SimpleImputer will eventually support strategy="constant".
data[categorical_features] = data[categorical_features].fillna(value='missing')

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense',
                                             handle_unknown='ignore')

preprocessing_pl = make_column_transformer(
    (numeric_features, numeric_transformer),
    (categorical_features, categorical_transformer),
    remainder='drop'
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

X = data.drop('survived', axis=1)
y = data.survived.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True)

clf.fit(X_train, y_train)
print("model score: %f" % clf.score(X_test, y_test))
Example #14
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression

set_config(display='diagram')

num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
                                       (cat_proc, ('feat0', 'feat2')))

clf = make_pipeline(preprocessor, LogisticRegression())
clf

##############################################################################
# Scalability and stability improvements to KMeans
# ------------------------------------------------
# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
# is now significantly faster and more stable. In addition, the Elkan algorithm
# is now compatible with sparse matrices. The estimator uses OpenMP based
# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
# effect anymore. For more details on how to control the number of threads,
# please refer to our :ref:`parallelism` notes.
import scipy
import numpy as np
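
# A minimal sketch (not part of the original script) showing the reworked KMeans on
# sparse input with the Elkan algorithm mentioned above.
import scipy.sparse as sp
from sklearn.cluster import KMeans

X_sparse = sp.random(1000, 20, density=0.1, format='csr', random_state=0)
km = KMeans(n_clusters=5, algorithm='elkan', random_state=0).fit(X_sparse)
print(f"KMeans inertia on sparse data: {km.inertia_:.2f}")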
Example #15
# [!] not implemented

# select feature set from the data frame
df['time_from_trace_start'] = pd.DataFrame(tfts_lst)
df['case_remaining_time'] = pd.DataFrame(crt_lst)
df = df[[
    'activity_type', 'seq_of_event', 'time_from_trace_start',
    'num_of_events_hour_of_day', 'num_of_events_day_of_week',
    'case_remaining_time'
]]
print(df)

# one hot encoding and feature scaling
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']), (StandardScaler(), [
        'seq_of_event', 'time_from_trace_start', 'num_of_events_hour_of_day',
        'num_of_events_day_of_week', 'case_remaining_time'
    ]))

# separate train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx + 1]).toarray()
valid = preprocess.transform(df[parting_event_idx + 1:]).toarray()

# calculate the size of input vector
input_size = train.shape[1] - 1  # excludes the attribute of target values


# transformation (ndarray -> torch)
def transform_data(arr):
    x = arr[:, 0:input_size]
    x_arr = np.array(x).reshape(1, -1, input_size)
Example #16
    expected_label = "LogisticRegression (AP = {:0.2f})".format(avg_prec)
    assert disp.line_.get_label() == expected_label
    assert disp.ax_.get_xlabel() == "Recall (Positive label: 1)"
    assert disp.ax_.get_ylabel() == "Precision (Positive label: 1)"

    # draw again with another label
    disp.plot(name="MySpecialEstimator")
    expected_label = "MySpecialEstimator (AP = {:0.2f})".format(avg_prec)
    assert disp.line_.get_label() == expected_label


@pytest.mark.parametrize(
    "clf",
    [
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(make_column_transformer(
            (StandardScaler(), [0, 1])), LogisticRegression()),
    ],
)
def test_precision_recall_curve_pipeline(pyplot, clf):
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)
    with pytest.raises(NotFittedError):
        plot_precision_recall_curve(clf, X, y)
    clf.fit(X, y)
    disp = plot_precision_recall_curve(clf, X, y)
    assert disp.estimator_name == clf.__class__.__name__


def test_precision_recall_curve_string_labels(pyplot):
    # regression test #15738
    cancer = load_breast_cancer()
    X = cancer.data
Example #17
#removing erroneous entries
data = data.drop(data[data.ap_hi == 0].index)
data = data.drop(data[data.ap_lo == 0].index)
data = data.drop(data[data.ap_hi < data.ap_lo].index)

#DATA-PREPROCESSING

#GENDER: RECODE 2 AS 0
data.iloc[:, 2] = [0 if i == 2 else i for i in data.iloc[:, 2]]

#ONE HOT ENCODING
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

transformer = make_column_transformer(
    (['age', 'height', 'weight', 'ap_hi', 'ap_lo'], MinMaxScaler()),
    (['cholesterol', 'gluc'], OneHotEncoder()))
data_transformed = pd.DataFrame(transformer.fit_transform(data))
data_transformed = data_transformed.drop(columns=[7, 10]).reset_index()
data_cat = data.iloc[:, [2, 9, 10, 11, 12]].reset_index()
data_new = pd.concat([data_transformed, data_cat], axis=1, ignore_index=True)
data_new = data_new.drop(columns=[0, 10])
data_new.columns = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol_0',
    'cholesterol_1', 'gluc_0', 'gluc_1', 'gender', 'smoke', 'alco', 'active',
    'cardio'
]

data = data_new

from sklearn.model_selection import train_test_split
Example #18
# simply a series of sequential steps. The output of each step is passed to
# the next step.

# Workflow 1
print()
print('Workflow 1')
# Impute using the mean
# Select features using SelectFromModel(DecisionTreeRegressor)
# Fit with LinearRegression

# Create the imputer object with
# the default hyperparameter settings
imp = SimpleImputer()

# Create the column transformer object
ct = make_column_transformer((imp, features), remainder='passthrough')

# Create objects to use for feature selection with
# the default hyperparameter settings
linreg_selection = LinearRegression()
dtr_selection = DecisionTreeRegressor()
lasso_selection = Lasso()
lassocv_selection = LassoCV()
rfr_selection = RandomForestRegressor()

# Create the feature selection object
selection = SelectFromModel(estimator=dtr_selection)

# Create an object to use for regression with
# the default hyperparameter settings
linreg = LinearRegression()
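
# A minimal sketch (not part of the original snippet) assembling Workflow 1 from the
# pieces above; `make_pipeline`, `X` and `y` are assumed to be imported/defined in an
# earlier, unshown part of the script.
workflow1 = make_pipeline(ct, selection, linreg)
workflow1.fit(X, y)
print('Workflow 1 R^2:', workflow1.score(X, y))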
#
# - one-hot encode (i.e., generate a column by category) the categorical
#   columns;
# - as a first approach (we will see after how the normalisation of numerical
#   values will affect our discussion), keep numerical values as they are.

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

categorical_columns = [
    'RACE', 'OCCUPATION', 'SECTOR', 'MARR', 'UNION', 'SEX', 'SOUTH'
]
numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), categorical_columns),
    remainder='passthrough')

##############################################################################
# To describe the dataset as a linear model we choose to use a ridge regressor
# with a very small regularization and to model the logarithm of the WAGE.

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                               func=np.log10,
                               inverse_func=sp.special.exp10))
for i in range(len(G3)):
    if G3[i] <= 4:
        G3[i] = 0
    if G3[i] >= 5 and G3[i] <= 8:
        G3[i] = 1
    if G3[i] >= 9 and G3[i] <= 12:
        G3[i] = 2
    if G3[i] >= 13 and G3[i] <= 16:
        G3[i] = 3
    if G3[i] >= 17 and G3[i] <= 20:
        G3[i] = 4

df = pd.concat([df, G3], axis=1)

column_trans = make_column_transformer(
    (OneHotEncoder(), ['Mjob', 'Fjob', 'reason', 'guardian']),
    remainder='passthrough')

data = column_trans.fit_transform(df)
n1 = data.shape[0]
n2 = data.shape[1]
m = int(0.8 * n1)
# train=data[:m,:]
# test=data[m:-1,:]

# X_train=train[:,0:n2-1]
# y_train=train[:,n2-1]
# y_train = np.reshape(y_train, (len(y_train),1))
# y_train = to_categorical(y_train)

# X_test=test[:,0:n2-1]
Example #21
# We still have an overfitting problem, but a much smaller one.

# Import libraries
import pandas as pd

# Import data
dataset = pd.read_csv('data/Churn_Modelling.csv')
X = dataset.iloc[:, 3:13]
y = dataset.iloc[:, 13]

# Encode categorical data and scale continuous data
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
preprocess = make_column_transformer(
    (OneHotEncoder(), ['Geography', 'Gender']), (StandardScaler(), [
        'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
        'HasCrCard', 'IsActiveMember', 'EstimatedSalary'
    ]))
X = preprocess.fit_transform(X)

# Split in train/test
y = y.values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Part 2 - Now let's make the ANN!

# Importing the Keras libraries and packages
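# A minimal sketch (not part of the original snippet) of the ANN announced above,
# assuming the Keras API is available via tensorflow.keras.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

classifier = Sequential()
classifier.add(Dense(units=6, activation='relu', input_dim=X_train.shape[1]))
classifier.add(Dense(units=6, activation='relu'))
classifier.add(Dense(units=1, activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy'])
classifier.fit(X_train, y_train, batch_size=32, epochs=100)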
Example #22
# calculate the number of activity types
num_of_acts = len(np.unique(df['activity_type']))

# find a parting trace which is the last trace for the train/valid separation
parting_trace_idx = int(num_of_traces * 0.8)
parting_trace_id = np.unique(df['case_id'])[parting_trace_idx]

# find a parting event's index which is the last event's index of the parting trace.
# used as a separation line between train/valid sets
parting_event_idx = df.loc[df['case_id'] == parting_trace_id]\
    .index.values.astype(int)[-1]

# set up the transformer (one hot encoder, feature scaler)
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']),
    (RobustScaler(), ['seq_of_event', 'time_from_trace_start']),
    ('passthrough', ['case_remaining_time'])
)

# transform data and separate it into train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx+1]).toarray()
valid = preprocess.transform(df[parting_event_idx+1:]).toarray()

# scale 'execution_time' values
scaler = MinMaxScaler()

# replace one-hot-encoded values with execution time values
# for training set
event_idx = 0
for i in range(parting_trace_idx+1):
    trace_len = traces_lens[i]
Example #23
def getAnnealingData():
    global Model_num, column_transformer_pipeline

    def replaceUnknows(data):
        data['family'] = data['family'].replace(to_replace='?', value='UNK')
        data['product-type'] = data['product-type'].replace(to_replace='C',
                                                            value=1).apply(
                                                                pd.to_numeric)
        data['steel'] = data['steel'].replace(to_replace='?', value='NA')
        data['temper_rolling'] = data['temper_rolling'].replace(to_replace='?',
                                                                value='NA')
        data['condition'] = data['condition'].replace(to_replace='?',
                                                      value='NA')
        data['formability'] = data['formability'].replace(to_replace='?',
                                                          value='0')
        data['non-ageing'] = data['non-ageing'].replace(to_replace='?',
                                                        value='NA')
        data['surface-finish'] = data['surface-finish'].replace(to_replace='?',
                                                                value='NA')
        data['surface-quality'] = data['surface-quality'].replace(
            to_replace='?', value='NA')
        data['enamelability'] = data['enamelability'].replace(to_replace='?',
                                                              value='0')
        data['bc'] = data['bc'].replace(to_replace='?', value='NA')
        data['bf'] = data['bf'].replace(to_replace='?', value='NA')
        data['bt'] = data['bt'].replace(to_replace='?', value='NA')
        data['bw/me'] = data['bw/me'].replace(to_replace='?', value='NA')
        data['bl'] = data['bl'].replace(to_replace='?', value='NA')
        data['m'] = data['m'].replace(to_replace='?',
                                      value=0).apply(pd.to_numeric)
        data['chrom'] = data['chrom'].replace(to_replace='?', value='NA')
        data['phos'] = data['phos'].replace(to_replace='?', value='NA')
        data['cbond'] = data['cbond'].replace(to_replace='?', value='NA')
        data['marvi'] = data['marvi'].replace(to_replace='?',
                                              value=0).apply(pd.to_numeric)
        data['exptl'] = data['exptl'].replace(to_replace='?', value='NA')
        data['ferro'] = data['ferro'].replace(to_replace='?', value='NA')
        data['corr'] = data['corr'].replace(to_replace='?',
                                            value=0).apply(pd.to_numeric)
        data['exptl'] = data['exptl'].replace(to_replace='?', value='NA')
        data['blue/bright/varn/clean'] = data[
            'blue/bright/varn/clean'].replace(to_replace='?', value='NA')
        data['lustre'] = data['lustre'].replace(to_replace='?', value='NA')
        data['jurofm'] = data['jurofm'].replace(to_replace='?',
                                                value=0).apply(pd.to_numeric)
        data['s'] = data['s'].replace(to_replace='?',
                                      value=0).apply(pd.to_numeric)
        data['p'] = data['p'].replace(to_replace='?',
                                      value=0).apply(pd.to_numeric)
        data['oil'] = data['oil'].replace(to_replace='?', value='NA')
        data['packing'] = data['packing'].replace(to_replace='?',
                                                  value=0).apply(pd.to_numeric)
        return data

    dataSource = 'DataSource/annealing.csv'
    testDataSource = 'DataSource/annealing-TEST.csv'
    data = pd.read_csv(dataSource, header=None)
    testData = pd.read_csv(testDataSource, header=None)

    col_headings = [
        'family', 'product-type', 'steel', 'carbon', 'hardness',
        'temper_rolling', 'condition', 'formability', 'strength', 'non-ageing',
        'surface-finish', 'surface-quality', 'enamelability', 'bc', 'bf', 'bt',
        'bw/me', 'bl', 'm', 'chrom', 'phos', 'cbond', 'marvi', 'exptl',
        'ferro', 'corr', 'blue/bright/varn/clean', 'lustre', 'jurofm', 's',
        'p', 'shape', 'thick', 'width', 'len', 'oil', 'bore', 'packing',
        'target'
    ]
    col_index = {
        0: 'family',
        1: 'product-type',
        2: 'steel',
        3: 'carbon',
        4: 'hardness',
        5: 'temper_rolling',
        6: 'condition',
        7: 'formability',
        8: 'strength',
        9: 'non-ageing',
        10: 'surface-finish',
        11: 'surface-quality',
        12: 'enamelability',
        13: 'bc',
        14: 'bf',
        15: 'bt',
        16: 'bw/me',
        17: 'bl',
        18: 'm',
        19: 'chrom',
        20: 'phos',
        21: 'cbond',
        22: 'marvi',
        23: 'exptl',
        24: 'ferro',
        25: 'corr',
        26: 'blue/bright/varn/clean',
        27: 'lustre',
        28: 'jurofm',
        29: 's',
        30: 'p',
        31: 'shape',
        32: 'thick',
        33: 'width',
        34: 'len',
        35: 'oil',
        36: 'bore',
        37: 'packing',
        38: 'target'
    }
    col_to_drop = [
        'family', 'product-type', 'non-ageing', 'surface-finish',
        'enamelability', 'bc', 'm', 'chrom', 'phos', 'cbond', 'marvi', 'exptl',
        'ferro', 'corr', 'blue/bright/varn/clean', 'lustre', 'jurofm', 's', 'p'
    ]
    # col_to_drop = []
    data.columns = col_headings
    testData.columns = col_headings

    data = replaceUnknows(data)
    testData = replaceUnknows(testData)

    data = data.drop(col_to_drop, axis=1)
    testData = testData.drop(col_to_drop, axis=1)
    X_train = data.drop('target', axis=1)
    y_train = data['target'].values
    label_enc = LabelEncoder()
    y_train = label_enc.fit_transform(y_train)
    cols_to_oneHotEncode = [
        'family', 'steel', 'temper_rolling', 'condition', 'formability',
        'non-ageing', 'surface-finish', 'surface-quality', 'enamelability',
        'bc', 'bf', 'bt', 'bw/me', 'bl', 'chrom', 'phos', 'cbond', 'exptl',
        'ferro', 'blue/bright/varn/clean', 'lustre', 'shape', 'oil', 'packing'
    ]
    cols_to_oneHotEncode = list(
        set(list(X_train.columns)).intersection(set(cols_to_oneHotEncode)))
    cols_to_scale = [
        'product-type', 'carbon', 'hardness', 'strength', 'thick', 'width',
        'len', 'bore', 'm', 'jurofm', 'p', 'marvi', 's', 'corr'
    ]
    cols_to_scale = list(
        set(list(X_train.columns)).intersection(set(cols_to_scale)))
    column_transformer_pipeline = make_column_transformer(
        (OneHotEncoder(drop='first'), cols_to_oneHotEncode),
        (StandardScaler(), cols_to_scale),
        remainder='passthrough')

    X_train = column_transformer_pipeline.fit_transform(X=X_train)
    X_test = testData.drop('target', axis=1)
    X_test = column_transformer_pipeline.transform(X=X_test)
    y_test = testData['target'].values
    y_test = label_enc.transform(y=y_test)
    Model_num = str(
        abs(hash(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))))

    return X_train, X_test, y_train, y_test
Example #24
def get_estimator():
    actors = get_actor_party_data()  # Additional data about deputies
    # Doing this is allowed

    find_group_vote_demandeur = FindGroupVoteDemandeurTransformer()
    decompose_vote_object = DecomposeVoteObjetTransformer()
    find_party_actor = FindPartyActorTransformer(actors)

    encode_category = make_pipeline(
        SimpleImputer(strategy="constant", fill_value=["unknown"]))

    idty = lambda x: x

    def encode_party_presence(x):
        y = x.iloc[:, 0].apply(pd.Series)
        return y

    vectorize_vote = make_column_transformer(
        (OneHotEncoder(), ["libelle_type_vote"]),
        (
            CountVectorizer(binary=True, preprocessor=idty, tokenizer=idty),
            "demandeur_group",
        ),
        (
            CountVectorizer(binary=True, preprocessor=idty, tokenizer=idty),
            "auteur_parti",
        ),
        (
            FunctionTransformer(func=encode_party_presence),
            ["presence_per_party"],
        ),
        # (CountVectorizer(binary=True), "libelle_desc"),
        (TfidfVectorizer(binary=True), "libelle_desc"),
    )

    def create_nn_model():
        nn = Sequential()
        nn.add(Dense(64, activation="relu"))
        nn.add(Dropout(0.2))
        nn.add(Dense(10, activation="sigmoid"))
        nn.compile(
            optimizer=Adam(learning_rate=1e-3, decay=1e-2 / 500),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return nn

    classifier = NeuralNet(create_nn_model,
                           epochs=1000,
                           batch_size=500,
                           verbose=0)

    model = Pipeline([
        ("find_group_vote_demandeur", find_group_vote_demandeur),
        ("decompose_vote_object", decompose_vote_object),
        ("find_party_actor", find_party_actor),
        ("vectorize_vote", vectorize_vote),
        ("densify", DenseTransformer()),
        ("normalize", Normalizer()),
        ("nn", classifier),
    ])
    return model
Example #25
def make_pipe(n_splits):
    cols_log = [
        "DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis",
        "DER_pt_h", "DER_pt_ratio_lep_tau", "DER_pt_tot", "DER_sum_pt",
        "PRI_jet_all_pt", "PRI_lep_pt", "PRI_met", "PRI_met_sumet",
        "PRI_tau_pt"
    ]

    mem = Memory(location=tempfile.mkdtemp(), verbose=0)

    pipe_imputed_fast = Pipeline(
        [
            ('col',
             make_column_transformer(
                 (Shift_log(), cols_log), remainder="passthrough")),
            ('imp', IterativeImputer(max_iter=int(1e2))),
            ('sca', StandardScaler()),
            # ('pca', PCA(15)),
            (
                'gri',
                GridSearchCV(
                    Pipeline([  #('pca', None),
                        ('clf', None)
                    ]),
                    scoring='accuracy',
                    refit=True,
                    cv=n_splits,
                    iid=True,
                    return_train_score=False,
                    param_grid={})),
        ],
        memory=mem,
        verbose=0)

    param_grid = [
        {
            # 'pca': (None, PCA(15)),
            'clf': (SVC(gamma="auto", max_iter=100000), ),
            'clf__kernel': ("poly", "rbf"),
            'clf__C': np.logspace(-2, .5, num=5),
        },
        {
            'clf': (BaggingClassifier(Perceptron(max_iter=1000),
                                      max_samples=0.5,
                                      max_features=0.5), ),
            'clf__n_estimators': (
                500,
                1000,
                2000,
            ),
        },
        {
            'clf': (RandomForestClassifier(), ),
            'clf__n_estimators': (
                500,
                1000,
                2000,
            ),
            'clf__max_depth': (None, 20, 50),
        },
        {
            'clf': (AdaBoostClassifier(), ),
            'clf__n_estimators': (
                500,
                1000,
                2000,
            ),
        },
    ]

    return pipe_imputed_fast, param_grid
plt.title("By race")
data.groupby("race").income_bin.mean().sort_values().plot.barh()

# Exercise 3
# using pd.get_dummies
data_one_hot = pd.get_dummies(data_features)
X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income)

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# using OneHotEncoder
cont_features = data_features.dtypes == "int64"
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ~cont_features),
    (StandardScaler(), cont_features))
X_train, X_test, y_train, y_test = train_test_split(data_features, income)
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

# Exercise 4
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=0.1)
logreg.fit(X_train_scaled, y_train)
print("Training score:", logreg.score(X_train_scaled, y_train))

print("Test score:", logreg.score(X_test_scaled, y_test))

print("Faction <= 50k", (y_train.values == " <=50K").mean())
Example #27
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
dataset = pd.read_csv('Credit.csv')

# We are going to encode two columns - Personal Status and  other_parties
# Personal Status -> index 8
# other_parties -> index 9

X = dataset.iloc[:, 8:10].values  #Taking Personal Status and other_parties

labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])

onehotencoder = make_column_transformer(
    (OneHotEncoder(categories='auto', sparse=False), [1]),
    remainder="passthrough")
X = onehotencoder.fit_transform(X)
Example #28
    case_remaining_time = (trace_end_time - cur_event_time).total_seconds()
    crt_lst.append(case_remaining_time)

# case 2: no 'REG_DATE' in <trace>
# data sets fall into case 2:
# [!] not implemented

# select feature set from the data frame
df['time_from_trace_start'] = pd.DataFrame(tfts_lst)
df['case_remaining_time'] = pd.DataFrame(crt_lst)
df = df[['activity_type', 'time_from_trace_start', 'case_remaining_time']]
print(df)

# one hot encoding and feature scaling
preprocess = make_column_transformer(
    (OneHotEncoder(), ['activity_type']),
    (StandardScaler(), ['time_from_trace_start', 'case_remaining_time']))

# separate train/valid sets
train = preprocess.fit_transform(df[:parting_event_idx + 1]).toarray()
valid = preprocess.transform(df[parting_event_idx + 1:]).toarray()

# calculate the size of input vector
input_size = train.shape[1] - 1  # excludes the attribute of target values


# transformation (ndarray -> torch)
def transform_data(arr):
    x = arr[:, 0:input_size]
    x_arr = np.array(x).reshape(1, -1, input_size)
    y = arr[:, input_size]
Example #29
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.inspection import permutation_importance
def _merge_external_data(X):
    """
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    """
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
    # Parse date to also be of dtype datetime
    data_weather = pd.read_csv("external_data.csv", parse_dates=["DateOfDeparture"])
Example #30
    def preprocessing(self, input_data):
        dataset_columns = [
            'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
            'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
            'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
            'NAME_HOUSING_TYPE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
            'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
            'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1',
            'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE',
            'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
            'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
            'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
            'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
            'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10',
            'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
            'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
            'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
            'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
            'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
            'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
            'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'TARGET'
        ]
        # JSON to pandas DataFrame
        # Set an index, orient = 'index'
        # input_data = input_data.set_index('SK_ID_CURR')
        # print("INdex Data", input_data)
        application_data = pd.DataFrame(input_data, index=[0])
        # application_data = pd.DataFrame.from_dict(input_data)
        # application_data = application_data.json()

        application_data = pd.DataFrame.from_dict(application_data)
        # print("Un proccessed application_data", application_data)
        # input_data = input_data.set_index('SK_ID_CURR')
        # application_data = pd.read_csv(path_to_artifacts + 'application_train.csv')

        label_vector = application_data['TARGET']
        np.unique(label_vector, return_counts=True)
        # print("INdex Data 2", input_data)

        categorical_features = [
            'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
            'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
            'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
            'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'FLAG_DOCUMENT_2',
            'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
            'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
            'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
            'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
            'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
            'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
            'FLAG_DOCUMENT_21'
        ]
        numerical_features = [
            'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH',
            'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
            'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
            'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
            'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
            'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
            'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
            'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
            'AMT_REQ_CREDIT_BUREAU_YEAR'
        ]

        application_data['AMT_ANNUITY'] = application_data[
            'AMT_ANNUITY'].fillna(0)
        application_data['OCCUPATION_TYPE'] = application_data[
            'OCCUPATION_TYPE'].fillna('UNKNOWN')
        application_data['CNT_FAM_MEMBERS'] = application_data[
            'CNT_FAM_MEMBERS'].fillna(0)
        application_data['EXT_SOURCE_1'] = application_data[
            'EXT_SOURCE_1'].fillna(0)
        application_data['EXT_SOURCE_2'] = application_data[
            'EXT_SOURCE_2'].fillna(0)
        application_data['EXT_SOURCE_3'] = application_data[
            'EXT_SOURCE_3'].fillna(0)
        application_data['OBS_30_CNT_SOCIAL_CIRCLE'] = application_data[
            'OBS_30_CNT_SOCIAL_CIRCLE'].fillna(0)
        application_data['DEF_30_CNT_SOCIAL_CIRCLE'] = application_data[
            'DEF_30_CNT_SOCIAL_CIRCLE'].fillna(0)
        application_data['OBS_60_CNT_SOCIAL_CIRCLE'] = application_data[
            'OBS_60_CNT_SOCIAL_CIRCLE'].fillna(0)
        application_data['DEF_60_CNT_SOCIAL_CIRCLE'] = application_data[
            'DEF_60_CNT_SOCIAL_CIRCLE'].fillna(0)
        application_data['DAYS_LAST_PHONE_CHANGE'] = application_data[
            'DAYS_LAST_PHONE_CHANGE'].fillna(3650)
        application_data['AMT_REQ_CREDIT_BUREAU_HOUR'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0)
        application_data['AMT_REQ_CREDIT_BUREAU_DAY'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0)
        application_data['AMT_REQ_CREDIT_BUREAU_WEEK'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0)
        application_data['AMT_REQ_CREDIT_BUREAU_MON'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_MON'].fillna(0)
        application_data['AMT_REQ_CREDIT_BUREAU_QRT'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0)
        application_data['AMT_REQ_CREDIT_BUREAU_YEAR'] = application_data[
            'AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0)
        treated_dataset = application_data[dataset_columns]

        # sample_class_1 = application_data[application_data['TARGET'] == 1][:20000]
        # sample_class_0 = application_data[application_data['TARGET'] == 0][:20000]
        # treated_dataset = pd.concat([sample_class_1, sample_class_0])[dataset_columns]
        # training_dataset, testing_dataset = train_test_split(treated_dataset, shuffle=True, stratify=treated_dataset['TARGET'])
        # train_mode = dict(training_dataset.mode().iloc[0])
        # train_mode
        features = list(set(dataset_columns) - set(['TARGET']))
        # train_features, Y_train = training_dataset[features], training_dataset['TARGET']
        test_features = treated_dataset[features]

        column_trans = make_column_transformer(
            (OneHotEncoder(), categorical_features),
            (StandardScaler(), numerical_features))
        transformer = column_trans.fit(treated_dataset[features])

        # X_train = transformer.transform(train_features)
        clean_data = transformer.transform(test_features)

        return clean_data
Example #31
#  Make a column transformer to pre-process our data, selecting the 'Sex' and 'Embarked'
#  columns to encode with OneHotEncoder. The 'remainder' parameter decides what to do with
#  the remaining columns; since the default behavior is to drop them, we use 'passthrough'
#  to concatenate them with the processed data instead. For each possible value of an
#  encoded column, it creates a new column holding 0 or 1 - false or true:
#  ['sex_value_1', 'sex_value_2', 'embarked_value_1', 'embarked_value_2', 'embarked_value_3', 'untouched_Pclass']
#  [[0. 1. 0. 0. 1. 3.]
#   [1. 0. 1. 0. 0. 1.]
#   [1. 0. 0. 0. 1. 3.]
#   ...
#   [1. 0. 0. 0. 1. 3.]
#   [0. 1. 1. 0. 0. 1.]
#   [0. 1. 0. 1. 0. 3.]]

column_trans = make_column_transformer((OneHotEncoder(), ['Sex', 'Embarked']),
                                       remainder='passthrough')

column_trans.fit_transform(X)

#  Build a Pipeline with a Logistic Regression model and our pre-processor
logreg = LogisticRegression(solver='lbfgs')
pipe = make_pipeline(column_trans, logreg)

#  Cross validate model with X and y, returning the average accuracy of the prediction
score = cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
print(score)

#  Works as model.fit, but runs pre-processing as well
pipe.fit(X, y)

X_new = X.sample(5, random_state=99)
Example #32
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# or
# X[:, 1:3] = imputer.fit(X[:, 1:3]).transform(X[:, 1:3])

# Encoding categorical data
# Encoding the Independent Variable

le_X = LabelEncoder()
X[:, 0] = le_X.fit_transform(X[:, 0])

colt = make_column_transformer(
    (OneHotEncoder(categories='auto'), [0]), remainder='passthrough')
X = colt.fit_transform(X)

# the method below is deprecated; it is an alternative to colt
# ohe = OneHotEncoder(categories=[[0]])
# X = ohe.fit_transform(X).toarray()

# Encoding the Dependent Variable
le_y = LabelEncoder()
y = le_y.fit_transform(y)

# Splitting the dataset into Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature Scaling
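# A minimal sketch (not part of the original snippet) of the feature scaling step
# announced above, fitting the scaler on the training set only to avoid leakage.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)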
def main(input_df,
         build_feature_pipe=None,
         all_preprocess=None,
         method='fit_transform'):
    """Transforms the source data by applying preprocessing transforms. 

    Args:
        input_df ([Dataframe]): Unprocessed source data
        build_feature_pipe (pipeline object, optional): the build_feature_pipe pipeline object, 
        Required only when method='transform'. Defaults to None.
        all_preprocess (pipeline object, optional): all_preprocess pipeline object. 
        Required when method='transform' or 'inverse_transform'.
         Defaults to None.
        method (str, optional): The operation performed by the build_features method. Valid values are ['fit', 'transform', 'fit_transform', 'inverse_transform'].
        Defaults to 'fit_transform'.

    Returns:
        X,y (array): Returns the training data and target variable as arrays
    """

    module_logger.info('Starting to build features module.')

    #identify and define column sets for applying preprocessing transforms
    num_cols = read_csv_to_list('./data/processed/numeric_columns.csv',
                                header=None,
                                squeeze=True)
    cat_cols = read_csv_to_list('./data/processed/categorical_columns.csv',
                                header=None,
                                squeeze=True)
    drop_cols = read_csv_to_list('./data/processed/drop_columns.csv',
                                 header=None,
                                 squeeze=True)
    fe_cols = read_csv_to_list(
        './data/processed/feature_engineering_columns.csv',
        header=None,
        squeeze=True)

    module_logger.info('Importing columns from stored lists complete.')

    #Build preprocessing pipeline
    if method in ['fit', 'fit_transform'] and build_feature_pipe is None:
        module_logger.info('Building build_features_pipe for very first time.')
        build_feature_pipe = make_pipeline(
            transforms.DropRowsTransformer(),
            transforms.BuildFeaturesTransformer(fe_cols))

    if method in ['fit', 'fit_transform'] and all_preprocess is None:
        module_logger.info('Building all_preprocess_pipe for very first time.')

        numerical_preprocess = make_pipeline(SimpleImputer(strategy='median'),
                                             transforms.CustomStandardScaler())

        preprocess_pipe = make_column_transformer(
            (transforms.DropFeaturesTransformer(
                columns=list(drop_cols), inplace=True), list(drop_cols)),
            (transforms.RandomStandardEncoderTransformer(cat_cols), cat_cols),
            (numerical_preprocess, num_cols),
            remainder='passthrough')

        all_preprocess = make_pipeline(preprocess_pipe)

    #apply pipeline
    module_logger.info('Pipeline started')

    if method == 'fit':
        module_logger.info('Starting pipeline.fit')

        fe_df = build_feature_pipe.fit_transform(input_df)
        all_preprocess.fit(fe_df)

        X = []  #return empty array as only pipeline is fitted
        y = []  #return empty array as only pipeline is fitted
        module_logger.info('Pipeline.fit completed successfully')

    elif method == 'transform':

        module_logger.info('Starting pipeline.transform')

        #build_feature_pipe, all_preprocess cannot be None
        assert build_feature_pipe != None, module_logger.error(
            'Missing pipeline object build_feature_pipe.')
        assert all_preprocess != None, module_logger.error(
            'Missing pipeline object all_preprocess.')

        fe_df = build_feature_pipe.transform(input_df)
        module_logger.info('feature engineering + drop rows done')

        #Check if input_df has column CASE_STATUS, dataset during prediction will not have target variable so below code should be skipped
        if 'CASE_STATUS' in fe_df.columns.values:
            module_logger.info('Target column found.')

            assert fe_df[~fe_df.CASE_STATUS.isin(
                ['Certified', 'Denied'])].shape[0] == 0, module_logger.error(
                    'Unexpected values found in CASE_STATUS field.')
            y = fe_df.pop('CASE_STATUS')
            y.replace(['Certified', 'Denied'], [0, 1], inplace=True)

            module_logger.info('Target column separated')
        #if CASE_STATUS is not present, return empty array for y
        else:
            module_logger.info(
                'Target column not found. Returning empty array for y.')
            y = []

        X = all_preprocess.transform(fe_df)

        #Ensure that X has expected number of features
        assert X.shape[1] == 31, module_logger.exception(
            'Arrays X of shape [:,31] expected.')
        #Ensure that X and y have same number of rows
        assert X.shape[0] == y.shape[0], module_logger.exception(
            'Arrays X and y should have same number of rows.')

        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.transform completed successfully')

    elif method == 'fit_transform':
        module_logger.info('Starting pipeline.fit_transform')
        fe_df = build_feature_pipe.fit_transform(input_df)
        module_logger.info('feature engineering + drop rows done')

        assert fe_df[~fe_df.CASE_STATUS.isin(['Certified', 'Denied'])].shape[
            0] == 0, module_logger.error(
                'Unexpected values found in CASE_STATUS field.')
        y = fe_df.pop('CASE_STATUS')
        y.replace(['Certified', 'Denied'], [0, 1], inplace=True)
        module_logger.info('Target column separated')

        X = all_preprocess.fit_transform(fe_df)

        #Ensure that X has expected number of features
        assert X.shape[1] == 31, module_logger.exception(
            'Arrays X of shape [:,31] expected.')
        #Ensure that X and y have same number of rows
        assert X.shape[0] == y.shape[0], module_logger.exception(
            'Arrays X and y should have same number of rows.')

        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.fit_transform completed successfully')

    elif method == 'inverse':
        module_logger.info('Starting pipeline.inverse_transform')

        # all_preprocess cannot be None
        assert all_preprocess != None, module_logger.error(
            'Missing pipeline object all_preprocess.')

        X = all_preprocess.inverse_transform(input_df)
        y = []  # return empty array as inverse transform only applies to input features
        module_logger.info('drop columns + encoding done')
        module_logger.info('pipeline.inverse_transform completed successfully')

    module_logger.info('Building features complete.')

    #save pipeline when method is fit, fit_transform
    if method in ['fit', 'fit_transform']:
        dump(build_feature_pipe, open('./models/build_feature_pipe.pkl', 'wb'))
        dump(all_preprocess, open('./models/preprocess_pipe.pkl', 'wb'))
        module_logger.info('Pipeline saved.')

    return X, y
def load_data(data_path: str,
              history_size,
              horizon_size,
              historic_columns=['load', 'is_holiday', 'tempC'],
              horizon_columns=['is_holiday', 'tempC'],
              prediction_columns=['load'],
              splits=['train', 'validate', 'test'],
              shift=None,
              validation_split=None,
              batch_size=32,
              cycle_length=10,
              shuffle_buffer_size=1000,
              seed=42):
    """
    Loads the preprocessed CER data and build the dataset.

    :param      data_path:            The path to the folder containing the
                                      train.csv and test.csv
    :type       data_path:            str
    :param      history_size:         The number of time steps of the historic
                                      data a patch should contain
    :type       history_size:         int
    :param      horizon_size:         The number of time steps in the
                                      prediction horizon a step should contain
    :type       horizon_size:         int
    :param      historic_columns:     The column names to used as historic
                                      data.
    :type       historic_columns:     Array
    :param      horizon_columns:      The column names to be used as horizon
                                      data.
    :type       horizon_columns:      Array
    :param      prediction_columns:   The columns to predict
    :type       prediction_columns:   Array
    :param      splits:               The data splits to be generated. At least
                                      one of 'train', 'validate' or 'test'
    :type       splits:               Array
    :param      shift:                The amount of time steps by which the
                                      window moves on each iteration
    :type       shift:                int
    :param      validation_split:     The amount of data reserved from the
                                      training set for validation
    :type       validation_split:     float
    :param      batch_size:           The batch size
    :type       batch_size:           int
    :param      cycle_length:         The number of input elements that are
                                      processed concurrently
    :type       cycle_length:         int
    :param      shuffle_buffer_size:  The shuffle buffer size
    :type       shuffle_buffer_size:  int
    :param      seed:                 The seed used by the pseudo random
                                      generators
    :type       seed:                 int

    :returns:   A dict containing the windowed TensorFlow datasets generated
                from csv file in `data_path` for the given `spits`.
    :rtype:     dict
    """

    # common ##################################################################
    data = {}

    scalers = {
        'load': MinMaxScaler(feature_range=(0, 1)),
        'tempC': MinMaxScaler(feature_range=(-1, 1)),
        'is_holiday': MinMaxScaler(feature_range=(0, 1))
    }

    column_transformer = make_column_transformer(
        *[(scalers[k], [k]) for k in sorted(scalers.keys())])

    make_dataset = partial(WindowedTimeSeriesDataSet,
                           column_transformer=column_transformer,
                           history_size=history_size,
                           horizon_size=horizon_size,
                           historic_columns=historic_columns,
                           horizon_columns=horizon_columns,
                           prediction_columns=prediction_columns,
                           shift=shift,
                           batch_size=32,
                           cycle_length=cycle_length,
                           shuffle_buffer_size=shuffle_buffer_size,
                           seed=seed)

    # train data ##############################################################
    if 'train' in splits:
        if validation_split is not None:
            data_splitter = TimeSeriesSplit(
                1 - validation_split, TimeSeriesSplit.LEFT)
        else:
            data_splitter = None
        train_data_path = os.path.join(data_path, 'train.csv')

        data['train'] = make_dataset(file_path=train_data_path,
                                     data_splitter=data_splitter,
                                     fit_transformer=True)()

    # validation data #########################################################
    if 'validate' in splits and validation_split is not None:
        data_splitter = TimeSeriesSplit(
            validation_split, TimeSeriesSplit.RIGHT)
        train_data_path = os.path.join(data_path, 'train.csv')

        data['validate'] = make_dataset(file_path=train_data_path,
                                        data_splitter=data_splitter)()

    # test data ###############################################################
    if 'test' in splits:
        test_data_path = os.path.join(data_path, 'test.csv')
        data['test'] = make_dataset(file_path=test_data_path)()

    return data
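# Hypothetical usage sketch (not part of the original module). The enclosing
# loader's name is cut off in this excerpt, so `load_data` below is only a
# placeholder, and the window sizes are assumptions; the column names come
# from the scalers defined above. It illustrates how the returned dict of
# windowed datasets could be fed to a Keras model:
#
#     data = load_data(data_path='data/cer',
#                      history_size=48,
#                      horizon_size=24,
#                      historic_columns=['load', 'tempC', 'is_holiday'],
#                      horizon_columns=['tempC', 'is_holiday'],
#                      prediction_columns=['load'],
#                      splits=['train', 'validate'])
#     model.fit(data['train'], validation_data=data['validate'], epochs=10)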
Example #35
"""
@author: aswin
"""


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data and impute missing numeric values with the column means
datasheet = pd.read_csv("50_Startups_EDA.csv")
datasheet.fillna(datasheet.mean(), inplace=True)
x = datasheet.iloc[:, 0:-1].values
y = datasheet.iloc[:, 5].values

# One-hot encode the categorical column (index 4) and pass the rest through
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
A = make_column_transformer((OneHotEncoder(categories='auto'), [4]),
                            remainder="passthrough")
x = A.fit_transform(x)

# Train/test split and a plain linear regression baseline
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30,
                                                random_state=0)
from sklearn.linear_model import LinearRegression
prediction = LinearRegression()
prediction.fit(xtrain, ytrain)
result = prediction.predict(xtest)
print(prediction.score(xtrain, ytrain))
print(prediction.score(xtest, ytest))

# Backward elimination with statsmodels: prepend an intercept column, then
# drop one dummy column (index 1) before fitting OLS
import statsmodels.api as sm
x = np.append(arr=np.ones(shape=(x.shape[0], 1), dtype=int), values=x, axis=1)
xnew1 = np.array(x[:, [0, 2, 3, 4, 5, 6, 7]], dtype=float)
model = sm.OLS(y, xnew1)
results1 = model.fit()
print(results1.summary())
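# Hypothetical extension (not part of the original snippet): the backward
# elimination started above can be automated by repeatedly refitting OLS and
# dropping the predictor with the largest p-value until every p-value is
# below the chosen significance level. For simplicity this sketch may also
# drop the intercept column.
def backward_elimination(y, x, significance_level=0.05):
    cols = list(range(x.shape[1]))
    while True:
        ols = sm.OLS(y, x[:, cols]).fit()
        worst = int(np.argmax(ols.pvalues))
        if ols.pvalues[worst] <= significance_level:
            return cols, ols
        del cols[worst]

# selected_cols, final_model = backward_elimination(y, x)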
Example #36
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, precision_score, recall_score)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('./data.csv')
# Data processing
data = data.iloc[:, 1:]  # removes first column with id's
X_raw = data.iloc[:, :-1]  # creates feature matrix without churn
#X_raw = X_raw.drop(['PhoneService','MultipleLines', 'OnlineBackup','DeviceProtection','StreamingTV','StreamingMovies'], axis = 1)
''' Data pre-processing'''
encoder = OneHotEncoder(sparse=False)
column_trans = make_column_transformer(
    (encoder, [
        'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod'
    ]),
    remainder='passthrough')
#encoder.fit_transform(X_raw[['PaymentMethod']])
#encoder.categories_
# NEW feature matrix
X = column_trans.fit_transform(X_raw)

# Binary encode churn
target = data.iloc[:, -1:]
y = target.apply(LabelEncoder().fit_transform)
''' Train test split '''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
''' Random Forest Model searching '''
#'''------------------------------------------------------------------
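# Hypothetical sketch (the original snippet is truncated here): one common way
# to carry out the "Random Forest Model searching" announced above is a small
# grid search over the one-hot encoded feature matrix.
from sklearn.model_selection import GridSearchCV

rf_search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid={'n_estimators': [100, 300], 'max_depth': [None, 10, 20]},
    scoring='recall',
    cv=5)
rf_search.fit(X_train, y_train.values.ravel())
print(rf_search.best_params_, rf_search.best_score_)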
Example #37
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.svm import SVC
from sklearn.metrics import (precision_score, recall_score, confusion_matrix,
                             average_precision_score)
import pandas as pd

X_val = val[filtered_columns]
y_val = val['SepsisLabel']

categorical = X_train.dtypes == object

categorical['Gender'] = True

#Defining the pipeline

cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
cont_scale_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

preprocess_trans_scale = make_column_transformer(
    (cont_scale_pipeline, ~categorical),
    (cat_pipeline, categorical))

#Custom Score function

def score_model(model, metric_list, y_true, y_pred):
    metric_dict = {'precision' :  precision_score, 'recall' : recall_score, 
                 'confusion_matrix' : confusion_matrix, 
                 'avg_precision': average_precision_score}
    df = pd.DataFrame()
    df['model'] = [model]
    for metric in metric_list:
        df[metric] = [metric_dict[metric](y_true, y_pred)]
    return df
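# Hypothetical usage of score_model (not in the original excerpt), assuming a
# classifier `clf` already fitted on the training split defined elsewhere:
#
#     scores = score_model('logreg', ['precision', 'recall'], y_val,
#                          clf.predict(X_val))
#     print(scores)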

#Logistic Regression
numeric_features = [
    "rating", "height", "weight", "salary", "draft_year", "draft_round",
    "draft_peak"
]

categorical_features = ["team", "country"]

numeric_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

# Build a pipeline containing the column transformer and an SVC model
# Name it pipe_unbalanced and fit it on the training data
pipe_unbalanced = make_pipeline(preprocessor, SVC())
pipe_unbalanced.fit(X_train, y_train)

# Predict your values on the validation set
# Save them in an object named predicted_y
predicted_y = pipe_unbalanced.predict(X_valid)

# Using sklearn tools, calculate precision
# Save it in an object named precision
precision = precision_score(y_valid, predicted_y, pos_label="F").round(3)
print("precision: ", precision)

@pytest.mark.parametrize(
    "estimator",
    [
        LogisticRegression(max_iter=1000, random_state=0),
        GradientBoostingClassifier(random_state=0, n_estimators=5),
    ],
    ids=["estimator-brute", "estimator-recursion"],
)
@pytest.mark.parametrize(
    "preprocessor",
    [
        None,
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]),
        ),
        make_column_transformer(
            (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
            remainder="passthrough",
        ),
    ],
    ids=["None", "column-transformer", "column-transformer-passthrough"],
)
@pytest.mark.parametrize(
    "features",
    [[0, 2], [iris.feature_names[i] for i in (0, 2)]],
    ids=["features-integer", "features-string"],
)
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that partial dependence supports dataframes and pipelines
Example #40
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
Y = np.array(df.iloc[:, -1])
Y = Y.reshape(len(Y), 1)

############ Target Encoding ############
print("\t> Encoding Target...")
Y = pd.DataFrame(Y)
Y.loc[Y[0] != 'normal.', 0] = 1
Y.loc[Y[0] == 'normal.', 0] = 0
#Y[0].Weight = Y[0].Weight.astype('int64')
Y = np.array(Y)
Y = Y.astype(float)

############ Input Encoding for columns 1,2,3 ############
print("\t> Encoding Input...")
IE = make_column_transformer((OneHotEncoder(), ['1', '2', '3']),
                             remainder='passthrough')
IE.fit(X)
X = pd.DataFrame(IE.transform(X))

############ Train test split (80%, 20% ratio) ############
print("\t> Splitting into Train and Test Data...")
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=1)
Y_train = Y_train.reshape(len(Y_train), 1)
Y_test = Y_test.reshape(len(Y_test), 1)

############# Scaling Input #############
print("\t> Scaling Input...")
SCALE_IN = StandardScaler()
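# Hypothetical continuation (the original snippet is truncated here): the
# scaler defined above would typically be fitted on the training inputs only
# and then applied to both splits.
X_train = SCALE_IN.fit_transform(X_train)
X_test = SCALE_IN.transform(X_test)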