Example No. 1
def test_imputation_error_sparse_0(strategy):
    # check that an error is raised when missing_values=0 and the input is sparse
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
Example No. 2
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    # Test imputation on non-numeric data using "most_frequent" and "constant"
    # strategy
    X = np.array([
        [np.nan, np.nan, "a", "f"],
        [np.nan, "c", np.nan, "d"],
        [np.nan, "b", "d", np.nan],
        [np.nan, "c", "d", "h"],
    ], dtype=dtype)

    err_msg = "SimpleImputer does not support data"
    with pytest.raises(ValueError, match=err_msg):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(X).transform(X)
Example No. 3
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy)
        )
Example No. 4
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # copy=False, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
Example No. 5
def data_preprocessing(dataset):
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    # SimpleImputer expects 2D input, so pass the column as a 2D slice
    imputer = imputer.fit(X[:, 3:4])

    #X = imputer.fit_transform(X[:, 5]) Testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
Example No. 6
def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
Example No. 7
def test_imputation_most_frequent_objects(marker):
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    X_true = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker,
                            strategy="most_frequent")
    X_trans = imputer.fit(X).transform(X)

    assert_array_equal(X_trans, X_true)
Example No. 8
Using the dataset.reindex() and dataset.isnull() functions,
introduce missing data and retrieve the indices of the missing
values. Then replace the missing values with 0, for example.
Then remove those missing values.
 '''
df = df.fillna(0)
print(df)
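A minimal sketch of the remaining steps described above (retrieving the indices of the missing values with isnull() and then dropping them), assuming a pandas DataFrame df that still contains NaN entries; the name missing_idx is illustrative and not part of the original exercise:

missing_idx = df[df.isnull().any(axis=1)].index  # row indices with at least one missing value
print(missing_idx)
df = df.dropna()  # drop the remaining rows with missing values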
'''
Import the appropriate libraries.
Import the dataset (data-set).
Replace the missing values with the mean (SimpleImputer).
'''

s = SimpleImputer(missing_values=np.nan, strategy='mean')
#df1.iloc[:,:] = s.fit_transform(df1)
s = s.fit(df1[['Niv_Etude_Bac']])
df1['Niv_Etude_Bac'] = s.transform(df1[['Niv_Etude_Bac']])
s = s.fit(df1[['enfant_a_Charge']])
df1['enfant_a_Charge'] = s.transform(df1[['enfant_a_Charge']])
print(df1)
'''
Encode the categorical values (LabelEncoder)
'''
t = LabelEncoder()
df1['Solvable'] = t.fit_transform(df1['Solvable'])
print(df1)
'''
Split the dataset into training and test sets (Training and Test set).
'''
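A minimal sketch of the split announced above, assuming df1 from the previous steps with the encoded 'Solvable' column as the target; the 0.2 test size is illustrative:

from sklearn.model_selection import train_test_split

X = df1.drop('Solvable', axis=1)
y = df1['Solvable']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)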
Example No. 9
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

base = pd.read_csv('../data_bases/credit_data.csv')
base.loc[base.age < 0, 'age'] = 40.92

# 1) Separate the variable types (predictors and class)
previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

# 2) Apply the preprocessing

# Replace the missing values with the mean of the predictor values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

# Scaling
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Split the training data from the test data
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

# 3) Train the model
classificador = RandomForestClassifier(
    n_estimators=40, criterion='entropy',
    random_state=0)  # n_estimators = number of trees
classificador.fit(previsores_treinamento, classe_treinamento)
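The snippet imports confusion_matrix and accuracy_score but stops after fitting; a minimal sketch of the evaluation step, assuming the fitted classificador and the held-out previsores_teste / classe_teste from above (the name previsoes is illustrative):

previsoes = classificador.predict(previsores_teste)
print(confusion_matrix(classe_teste, previsoes))
print(accuracy_score(classe_teste, previsoes))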
Example No. 10
# load the csv file as a data frame
import numpy as np
import pandas as pd
from collections import Counter

dataframe = pd.read_csv('Pohang.csv')
X = dataframe.iloc[:, :-1].values
y = dataframe.iloc[:, 8].values

#Rescale data (between 0 and 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Taking care of missing data
from sklearn.impute import SimpleImputer
# create a SimpleImputer object called "imputer"
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(
    rescaledX[:, 1:8])  # the slice's upper bound is excluded, the lower bound is included
rescaledX[:, 1:8] = imputer.transform(rescaledX[:, 1:8])

# =============================================================================

# summarize the class distribution
target = dataframe.values[:, -1]
counter = Counter(target)
for k, v in counter.items():
    per = v / len(target) * 100
    print('Class=%s, Count=%s, Percentage=%.3f%%' % (k, v, per))

# summarize class distribution
print(X.shape, y.shape, Counter(y))

# Implementing SMOTE for the Imbalanced data in Multi-class classification
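The snippet ends at the SMOTE heading above; a minimal sketch of that step, assuming the imbalanced-learn package is installed and reusing the rescaledX and y arrays defined earlier:

from imblearn.over_sampling import SMOTE

oversample = SMOTE(random_state=0)
X_resampled, y_resampled = oversample.fit_resample(rescaledX, y)
print(Counter(y_resampled))  # class counts after oversampling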
Example No. 11
import pandas as pd
from torch import nn, optim
import torch.nn.functional as F
import torch.utils.data as data
import torch
import numpy as np
import io

from sklearn.model_selection import train_test_split

dataset = pd.read_csv('result_set-a.csv')
dataset = dataset.sample(frac=1)
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)
x = dataset[:, 0:41]
y = dataset[:, 41]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(x_train, y_train)

import joblib  # sklearn.externals.joblib has been removed; use the joblib package directly

# Save the model as a pickle in a file
joblib.dump(classifier, 'model.pkl')

y_pred = classifier.predict(x_test)
Example No. 12
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# import the CSV data into a DataFrame
dataset = pd.read_csv('Machine Learning basics/Data Preprocessing/Data.csv')
# matriX of independent variables
X = dataset.iloc[:, :-1].values
# array of dependent variable (output)
y = dataset.iloc[:, -1].values

# handling missing data
imputer = SimpleImputer()
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encoding categories:
# categorical data must be represented as numbers
# because ML models are mathematical
lblEncoder = LabelEncoder()
X[:, 0] = lblEncoder.fit_transform(X[:, 0])

# some categorical data has no order or hierarchy,
# it just needs a numerical format
# (OneHotEncoder's categorical_features argument was removed; ColumnTransformer selects the column instead)
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
if hasattr(X, "toarray"):  # densify if the encoder returned a sparse matrix
    X = X.toarray()
y = lblEncoder.fit_transform(y)

# splitting the dataset into training and test sets
Example No. 13
def imputer(df, dfv, dfk, target_col, imputer_dict):

    result = {}

    for i in imputer_dict:

        if imputer_dict[i]['Indicator'] == 'deleterows':
            if df[i].isna().sum() > 0:
                df = df[df[i].notna()]  # Series has no isfinite(); keep rows where the value is not NaN
                dfv = dfv[dfv[i].notna()]
                dfk = dfk[dfk[i].notna()]

        if imputer_dict[i]['Indicator'] == True:
            if df[i].isna().sum() > 0:
                df[i + '_null_ind'] = np.where(df[i].isna(), 1, 0)
                dfv[i + '_null_ind'] = np.where(dfv[i].isna(), 1, 0)
                dfk[i + '_null_ind'] = np.where(dfk[i].isna(), 1, 0)

        if imputer_dict[i]['mvi'] in ['mean', 'median', 'most_frequent']:
            imp = SimpleImputer(missing_values=np.nan,
                                strategy=imputer_dict[i]['mvi'],
                                add_indicator=False,
                                fill_value=None)
            imp.fit(df[[i]])
            result[i] = imp
            df.loc[:, i] = result[i].transform(df[[i]])
            dfv.loc[:, i] = result[i].transform(dfv[[i]])
            dfk.loc[:, i] = result[i].transform(dfk[[i]])

        if imputer_dict[i]['mvi'] == 'far_val':
            result[i] = df[i].max() * 100
            df[i] = np.where(df[i].isna(), result[i], df[i])
            dfv[i] = np.where(dfv[i].isna(), result[i], dfv[i])
            dfk[i] = np.where(dfk[i].isna(), result[i], dfk[i])

    ##### IterativeImputer (if none of the above applies, this is used) ######

    imp = IterativeImputer(
        max_iter=3,
        # the estimator is a hyperparameter; alternatives: Bayesian ridge, KNN, etc.
        estimator=ExtraTreesRegressor(),
        # maximum number of columns considered when predicting a missing value
        n_nearest_features=5,
    )

    dfvc = dfv.copy()
    dfv[target_col] = np.nan

    dfkc = dfk.copy()
    dfk[target_col] = np.nan

    dfcolumns = df.columns
    imp.fit(df)
    df = pd.DataFrame(imp.transform(df))
    df.columns = dfcolumns
    dfv = pd.DataFrame(imp.transform(dfv))
    dfv.columns = dfcolumns
    dfk = pd.DataFrame(imp.transform(dfk))
    dfk.columns = dfcolumns

    dfv[target_col] = np.array(dfvc[target_col])
    dfk[target_col] = np.nan

    for i in imputer_dict:
        if imputer_dict[i]['mvi'] == 'iterativeimputer':
            result[i] = imp

    print("Completed imputer - ", datetime.datetime.now())

    return df, dfv, dfk, result
Example No. 14
def transform(df):

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # 'verbose' was deprecated and later removed
    imputer = imputer.fit(df)
    df = imputer.transform(df)
    return df
Example No. 15
# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)

# only drop rows where NaN appear in specific columns (here: 'C')
df.dropna(subset=['C'])

print(df)

############################
# IMPUTING MISSING VALUES
############################

imr = SimpleImputer(missing_values=np.nan, strategy="mean")

imr = imr.fit(df)
imputed_data = imr.transform(df.values)
print(imputed_data)

############################
# HANDLING CATEGORICAL DATA
############################

df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
])

df.columns = ['color', 'size', 'price', 'classlabel']
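The snippet stops after defining the categorical example frame; a minimal sketch of the encoding it sets up, with an ordinal mapping for 'size' and a label encoding for 'classlabel' (the mapping values are illustrative):

size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df['classlabel'] = class_le.fit_transform(df['classlabel'])
print(df)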
Example No. 16
"""

ax = sns.heatmap(df.isnull()).set_title("Detecting missing values")
"""In this dataset, as many as 7 columns contain missing values.
There are many ways of dealing with so-called **missing data**. The simplest way to group these techniques is into those that remove data (individual observations or whole columns) and those that impute data. Depending on the kind of data at hand (time series or cross-sectional data, continuous or binary, etc.), we choose the appropriate methods.

Gender is the first column with missing data. Because of the small number of missing values (12), I decide to remove them. The same applies to Married. Self_Employed is a variable that contains many missing values, so I decide not to remove those.

For the remaining 3 columns, which had clearly more missing observations than the others, I apply a statistical strategy of filling in the data with the most frequent value of the given explanatory variable. This is a simple but effective approach that handles both continuous and categorical variables.
"""

# removing missing values
df = df.dropna(subset=["Gender", "Married", "Dependents", "Loan_Amount_Term"])
# replacing missing values with the most frequent ones
imp_mean = SimpleImputer(strategy='most_frequent')
imp_mean.fit(df)
imputed_data_india = imp_mean.transform(df)
imputed_data_india = pd.DataFrame(imputed_data_india, columns=col_names)
imputed_data_india['Dependents'] = imputed_data_india['Dependents'].replace(
    ['3+'], 3)
imputed_data_india['Dependents'] = imputed_data_india['Dependents'].astype(int)
df = imputed_data_india
# Type conversion
df['Gender'] = df['Gender'].astype(str)
df['Married'] = df['Married'].astype(str)
df['Education'] = df['Education'].astype(str)
df['Self_Employed'] = df['Self_Employed'].astype(str)
df['ApplicantIncome'] = df['ApplicantIncome'].astype(int)
df['CoapplicantIncome'] = df['CoapplicantIncome'].astype(int)
df['LoanAmount'] = df['LoanAmount'].astype(int)
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(int)
Example No. 17
import numpy as np
from sklearn.impute import SimpleImputer

data = [[1, 2], [np.nan, 3], [7, 6]]
inputer = SimpleImputer(missing_values=np.nan, strategy='mean')
inputer.fit(data)

print(inputer.transform(data))
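For reference, with strategy='mean' each missing entry is replaced by its column mean, so the NaN in the first column becomes (1 + 7) / 2 = 4.0:

# expected output:
# [[1. 2.]
#  [4. 3.]
#  [7. 6.]]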
Example No. 18
# -*- coding: utf-8 -*-
"""
Created on Wed Apr  4 00:58:56 2021

@author: Hp
"""

import numpy as np
import pandas as pd

dataset = pd.read_csv('clasificacion2.csv')
x = dataset.iloc[:, :7].values
y = dataset.iloc[:, 7].values

from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, :7])
x[:, :7] = imputer.transform(x[:, :7])
print(x)

from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
sc = StandardScaler()
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2)
X_train[:, :7] = sc.fit_transform(X_train[:, :7])
X_test[:, :7] = sc.transform(X_test[:, :7])
Example No. 19
class STMBplus_auto(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                   Hyperparams]):
    """
    A primitive that performs supervised structured feature selection to reduce input feature dimension. Input to this primitive should be a matrix of tabular numerical/categorical data, consisting of columns of features, and an array of labels. Output will be a reduced data matrix with metadata updated.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '9d1a2e58-5f97-386c-babd-5a9b4e9b6d6c',
        'version':
        rpi_d3m_primitives.__coreversion__,
        'name':
        'STMBplus_auto feature selector',
        'keywords': ['Feature Selection'],
        'description':
        'This primitive is a structured feature selection function based on the independence test',
        'source': {
            'name':
            rpi_d3m_primitives.__author__,
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://github.com/zijun-rpi/d3m-primitives/blob/master/STMBplus_auto.py',
                'https://github.com/zijun-rpi/d3m-primitives.git'
            ]
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'rpi_d3m_primitives',
            'version': rpi_d3m_primitives.__version__
        }],
        'python_path':
        'd3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI',
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.
            MINIMUM_REDUNDANCY_FEATURE_SELECTION
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.FEATURE_SELECTION
    })

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[
            str, base.DockerContainer]] = None
    ) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)
        self._index = None
        self._problem_type = 'classification'
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._cate_flag = None
        self._LEoutput = preprocessing.LabelEncoder()  # label encoder
        self._Imputer = SimpleImputer(missing_values=np.nan,
                                      strategy='mean')  # imputer
        self._nbins = self.hyperparams['nbins']
        self._Kbins = preprocessing.KBinsDiscretizer(
            n_bins=self._nbins, encode='ordinal', strategy='uniform'
        )  #self.hyperparams['Discretizer_Strategy']) #KbinsDiscretizer

    ## TO DO
    # select columns via semantic types
    # remove preprocessing
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:

        # set problem type
        metadata = outputs.metadata
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, 0))
        semantic_types = column_metadata.get('semantic_types', [])
        if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
            self._problem_type = 'classification'
            # set training labels
            self._LEoutput.fit(outputs)
            self._training_outputs = self._LEoutput.transform(outputs)

        else:
            self._problem_type = 'regression'

        # convert categorical values to numerical values in the training data
        metadata = inputs.metadata
        [m, n] = inputs.shape
        self._training_inputs = np.zeros((m, n))
        self._cate_flag = np.zeros((n, ))
        for column_index in metadata.get_elements(
            (metadata_base.ALL_ELEMENTS, )):
            if column_index is metadata_base.ALL_ELEMENTS:
                continue
            column_metadata = metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = column_metadata.get('semantic_types', [])
            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                LE = preprocessing.LabelEncoder()
                LE = LE.fit(inputs.iloc[:, column_index])
                self._training_inputs[:, column_index] = LE.transform(
                    inputs.iloc[:, column_index])
                self._cate_flag[column_index] = 1
            elif 'http://schema.org/Text' in semantic_types:
                pass
            else:
                temp = list(inputs.iloc[:, column_index].values)
                for i in np.arange(len(temp)):
                    if bool(temp[i]):
                        self._training_inputs[i, column_index] = float(temp[i])
                    else:
                        self._training_inputs[i, column_index] = float('nan')
                if not np.count_nonzero(
                        np.isnan(self._training_inputs[:, column_index])
                ) == 0:  # if there are missing values
                    if np.count_nonzero(
                            np.isnan(self._training_inputs[:, column_index])
                    ) == m:  # all missing
                        self._training_inputs[:, column_index] = np.zeros(
                            m, )  # replace with all zeros

        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError('Missing training data, or missing values exist.')

        ## impute missing values
        self._Imputer.fit(self._training_inputs)
        self._training_inputs = self._Imputer.transform(self._training_inputs)

        #        [m,n] = self._training_inputs.shape
        #        for column_index in range(n):
        #            if len(np.unique(self._training_inputs[:,column_index])) == 1:
        #                self._cate_flag[column_index] = 1

        ## discretize non-categorical values
        disc_training_inputs = self._training_inputs
        if not len(np.where(self._cate_flag == 0)[0]) == 0:
            self._Kbins.fit(
                self._training_inputs[:, np.where(
                    self._cate_flag == 0)[0]])  #find non-categorical values
            temp = self._Kbins.transform(
                self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp
        #start from zero

        Trainset = RelationSet(self._training_inputs,
                               self._training_outputs.reshape(-1, 1))
        discTrainset = RelationSet(disc_training_inputs,
                                   self._training_outputs.reshape(-1, 1))
        validSet, smallTrainSet = Trainset.split(
            self._training_inputs.shape[0] // 4)
        smallDiscTrainSet = discTrainset.split(
            self._training_inputs.shape[0] // 4)[1]
        model = STMB(Trainset,
                     discTrainset,
                     self._problem_type,
                     test_set=Trainset)
        index = model.select_features()
        self._index = []
        [
            m,
        ] = index.shape
        for ii in np.arange(m):
            if not len(np.unique(
                    self._training_inputs[:, index[ii].item()])) == 1:
                self._index.append(index[ii].item())
        self._fitted = True

        return CallResult(None)

    def produce(
        self,
        *,
        inputs: Inputs,
        timeout: float = None,
        iterations: int = None
    ) -> base.CallResult[Outputs]:  # inputs: m x n numpy array
        if self._fitted:
            output = inputs.iloc[:, self._index]
            output.metadata = utils.select_columns_metadata(
                inputs.metadata, columns=self._index)
            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')

    def get_params(self) -> None:
        pass

    def set_params(self) -> None:
        pass
Example No. 20
# all rows included, but from the columns we only need the 7th (6th index)

print('Original:' '\n=========')
print(dataset)

# ===================
# DATA PREPROCESSING |
# ===================

# -------------
# MISSING DATA |
# -------------
imputer = SimpleImputer(missing_values=np.nan, strategy='constant')
# handles missing data & replaces NaN values
# strategy argument 'constant' replaces missing values with fill_value (for string/object datatypes)
imputer = imputer.fit(X[:, 1:6])
# fits the imputer on X
# fit on the data to avoid data leakage during cross-validation
X[:, 1:6] = imputer.transform(X[:, 1:6])
# imputes all missing values in X

print("\nImputed:" "\n========")
print(X)
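A note on the 'constant' strategy used above: when fill_value is not supplied, SimpleImputer falls back to 0 for numerical data and the string 'missing_value' for string/object data; to control the replacement explicitly, the value can be passed directly (illustrative, not part of the original snippet):

# imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='unknown')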

# ------------------------------------------------------------------------
# CONVERT CATEGORICAL TEXT DATA INTO MODEL-UNDERSTANDABLE NUMERICAL DATA |
# ------------------------------------------------------------------------
labelencoder_X = LabelEncoder()
# encodes target labels with values between 0 and n_classes - 1
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# fit label encoder and return encoded labels
Example No. 21
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from openpyxl import Workbook, load_workbook

arac = load_workbook("ham veri.xlsx")
sheet = arac.active
sheet = arac['işlenmiş veri']

dataset = pd.read_excel('ham veri.xlsx')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 5:7])
X[:, 5:7] = imputer.transform(X[:, 5:7])
print(X)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, 3,
                                                                   4])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
Example No. 22
for year in range(2014, 2020):
    yearDf = pd.read_csv('data/playerTotals' + str(year) + '.csv', sep=',')
    names += [name + ' ' + str(year) for name in yearDf['name']]
    df = pd.concat([df, yearDf], axis=0, ignore_index=True)

X = df.values

#%% fill in blank columns
from sklearn.impute import SimpleImputer

print(df.isna().any())

imputer = SimpleImputer(missing_values=np.nan,
                        strategy='constant',
                        fill_value=0)
imputer.fit(X)
X = imputer.transform(X)

#%% add shot selection stats
df = pd.DataFrame(data=X, columns=df.columns)
shooting_ranges = [
    'At Rim', '3 to <10 ft', '10 to <16 ft', '16 ft to <3-pt', '3-pt'
]
playerFGA = pd.Series(np.zeros(len(df.index)))
for shot in shooting_ranges:
    playerFGA = playerFGA + df[shot + ' FGA']
for shot in shooting_ranges:
    df[shot + ' FGA%'] = df[shot + ' FGA'] / playerFGA

#%% save complete player totals
df.to_csv('data/completePlayerTotals.csv')
Example No. 23
    '0s=suspect Blood Donor': 0
})

# Split the data by class
blood_donor = dataset.iloc[:533, :].values
suspect_blood_donor = dataset.iloc[533:540, :].values
hepatitis = dataset.iloc[540:564, :].values
fibrosis = dataset.iloc[564:585, :].values
cirrhosis = dataset.iloc[585:, :].values

# Handle missing data per class
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

imputer = imputer.fit(blood_donor[:, 3:])
blood_donor[:, 3:] = imputer.transform(blood_donor[:, 3:])

imputer = imputer.fit(hepatitis[:, 3:])
hepatitis[:, 3:] = imputer.transform(hepatitis[:, 3:])

imputer = imputer.fit(fibrosis[:, 3:])
fibrosis[:, 3:] = imputer.transform(fibrosis[:, 3:])

imputer = imputer.fit(cirrhosis[:, 3:])
cirrhosis[:, 3:] = imputer.transform(cirrhosis[:, 3:])

# merge the arrays
z = np.concatenate(
    (blood_donor, suspect_blood_donor, hepatitis, fibrosis, cirrhosis), axis=0)
x = z[:, 1:]
Example No. 24
def main():
    datafile = input('Filename: ')
    print('\nKeep in mind that bad lines are skipped in original data format.')
    if datafile != 'test_plus':
        df = pd.read_csv(datafile,
                         sep="   ",
                         header=None,
                         error_bad_lines=False)
        print(df.shape)
        #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        #imp = imp.fit(df)
        #df = imp.transform(df)
        df = df.dropna()

    print('\nChosen filename: ', datafile)
    if datafile == "dataset_04.txt":
        df.columns = [
            'frame_number', 'T1_x', 'T1_y', '1_mx', '1_my', '1_ex', '1_ey',
            '2_mx', '2_my', '2_ex', '2_ey', '3_mx', '3_my', '3_ex', '3_ey',
            '4_mx', '4_my', '4_ex', '4_ey', 'orientation', 'label'
        ]
    elif datafile == 'test_1.txt':
        df.columns = [
            'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3',
            'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3',
            'chL4', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2',
            'AL3', 'AL4', 'orientation', 'label'
        ]
        #df = pd.DataFrame({'frame_number':df['frame_number'], 'T1_x': df['T1_x'], 'T1_y': df['T1_y'], 'angles': df['angle1']*df['angle2']*df['angle3']*df['angle4'], 'curvatures': df['cu1']*df['cu2']*df['cu3']*df['cu4']*pow(1000,4), 'chLs': df['chL1']*df['chL2']*df['chL3']*df['chL4'], 'thetas': df['theta1']*df['theta2']*df['theta3']*df['theta4'], 'ALs': df['AL1']*df['AL2']*df['AL3']*df['AL4'], 'orientation': df['orientation'], 'label': df['label']})
        print(df.head())
    elif datafile == 'test_plus':
        df1 = pd.read_csv('test_start_end_points.txt',
                          sep="   ",
                          header=None,
                          error_bad_lines=False)
        df2 = pd.read_csv('test_for_mid_points.txt',
                          sep="   ",
                          header=None,
                          error_bad_lines=False)
        df1 = df1.replace([np.inf, -np.inf], np.nan)
        df1 = df1.fillna(df1.mean())
        df2 = df2.replace([np.inf, -np.inf], np.nan)
        df2 = df2.fillna(df2.mean())
        df1 = df1.dropna()
        df2 = df2.dropna()
        df1.columns = [
            'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3',
            'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3',
            'chL4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1',
            'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'd11', 'd21', 'd31',
            'd41', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3',
            'h4', 'orientation', 'label'
        ]
        df2.columns = [
            'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3',
            'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3',
            'chL4', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2',
            'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'orientation', 'label'
        ]
        df = df1
        df['mid1'] = df2['angle1']
        df['mid2'] = df2['angle2']
        df['mid3'] = df2['angle3']
        df['mid4'] = df2['angle4']
        df = df.drop([
            'cu1', 'cu2', 'cu3', 'cu4', 'd_tot', 'theta1', 'theta2', 'theta3',
            'theta4', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3',
            'h4', 'orientation'
        ],
                     axis=1)
    elif datafile in ('test_start_end_points.txt', 'test_for_mid_points.txt'):
        df.columns = [
            'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3',
            'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3',
            'chL4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1',
            'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'd11', 'd21', 'd31',
            'd41', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3',
            'h4', 'orientation', 'label'
        ]
        #df = pd.DataFrame({'frame_number':df['frame_number'], 'T1_x': df['T1_x'], 'T1_y': df['T1_y'], 'angles': df['angle1']*df['angle2']*df['angle3']*df['angle4'], 'curvatures': df['cu1']*df['cu2']*df['cu3']*df['cu4'], 'chLs': df['chL1']*df['chL2']*df['chL3']*df['chL4'], 'thetas': df['theta1']*df['theta2']*df['theta3']*df['theta4'], 'ALs': df['AL1']*df['AL2']*df['AL3']*df['AL4'], 'As': df['A1']*df['A2']*df['A3']*df['A4'], 'orientation': df['orientation'], 'label': df['label']})
        print(df.head())

    print('Dataset features: ', df.columns)
    print(
        '--------------------------------------------------------------------------------------'
    )
    print('\nData information section.')

    print('\nDataFrame description:')
    print(df.describe())
    print('\nDataFrame info:')
    print(df.info())
    print(
        '--------------------------------------------------------------------------------------'
    )
    data_manipulation = input('Use data manipulation? Type "Yes" or "No": ')
    if data_manipulation == "Yes":
        manipulation_type = input(
            '\nWhich manipulation type do you want to use? Type answer here: ')
        if manipulation_type == "Angle-S1":
            print('Calculating direct angles between end points.')
            df = feature_derivation.angle(df)
        elif manipulation_type == "Angle-S2":
            print(
                'Calculating direct angles between end points AND subtracting them from flow orientation.'
            )
            df = feature_derivation.angle(df)
            df = feature_derivation.orientation_strategy(df)

        else:
            print(
                '\nDesired manipulation method has not yet been implemented. Exiting program.'
            )
            exit()
    else:
        print('\nData will not be manipulated in any way.')
        print(
            '--------------------------------------------------------------------------------------'
        )
    strat = input(
        '\nWhich stratagem do we use? 1: [50-50 + 50-50], 2: [50-50 + 10-90], 3: [10-90 + 10-90]: '
    )
    num_sample = input('\nWhat is the standard sample size? ')

    if int(strat) == 1:
        print('\nStratagem 1 chosen.')
        positives = df[df['label'] == 1]
        positives = positives.sample(n=int(int(num_sample) * 0.5),
                                     random_state=1)
        zeros = df[df['label'] == 0]
        zeros = zeros.sample(n=int(int(num_sample) * 0.5), random_state=1)
        frames = [positives, zeros]
        df = pd.concat(frames)
        labels = df['label']
        print(df.columns)
        inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], axis=1)
        input_train, input_test, labels_train, labels_test = train_test_split(
            inputs, labels, test_size=0.20, random_state=0)
    elif int(strat) == 2:
        print('\nStratagem 2 chosen.')
        positives = df[df['label'] == 1]

        positives = positives.sample(n=int(int(num_sample) * 0.5),
                                     random_state=1)
        print(positives.shape)
        zeros = df[df['label'] == 0]

        zeros_s = zeros.sample(n=int(int(num_sample) * 0.5), random_state=1)
        print(zeros_s.shape)
        frames = [positives, zeros_s]
        df = pd.concat(frames)
        labels = df['label']

        inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], axis=1)
        input_train, input_test, labels_train, labels_test = train_test_split(
            inputs, labels, test_size=0.20, random_state=0)
        positives_2 = positives.sample(n=int(int(num_sample) * 0.1),
                                       random_state=1)
        print(positives_2.shape)
        #zeros_2 = df['label']
        #print(zeros_2.shape)
        zeros_2 = zeros.sample(n=int(int(num_sample) * 0.9),
                               random_state=1,
                               replace=False)
        print(zeros_2.shape)
        frames_2 = [positives_2, zeros_2]
        temp_df = pd.concat(frames_2)
        labels_2 = temp_df['label']

        inputs_2 = temp_df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], axis=1)
        input_train_2, input_test, labels_train_2, labels_test = train_test_split(
            inputs_2, labels_2, test_size=0.20, random_state=0)

    elif int(strat) == 3:
        print('\nStratagem 3 chosen.')
        positives = df[df['label'] == 1]
        positives = positives.sample(n=int(int(num_sample) * 0.1),
                                     random_state=1)
        zeros = df[df['label'] == 0]
        zeros = zeros.sample(n=int(int(num_sample) * 0.9), random_state=1)
        frames = [positives, zeros]
        df = pd.concat(frames)
        labels = df['label']

        inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], axis=1)
        input_train, input_test, labels_train, labels_test = train_test_split(
            inputs, labels, test_size=0.20, random_state=0)
    else:
        print('\nInvalid stratagem. Exiting program.')
        exit()
    i = 0
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(input_train)
    input_train = imp.transform(input_train)
    input_test = imp.transform(input_test)
    solver_type = input('\nWhich ML solver do we use: ')
    print("\n------------------------------------------------------------")
    while i < 2:
        if i == 0:
            print('\nRunning the model with raw data (no scaling)...')
        #9,10,11,8,
        if i != 0:
            do_pca = input('Do the PCA? Type "Yes" or "No": ')
            if do_pca == "Yes":
                pca = PCA(n_components=2, svd_solver='arpack', random_state=0)
                principalComponents = pca.fit_transform(input_train)
                #labels_2 = pd.concat(labels_train, labels_train)
                label_df = pd.DataFrame({
                    'label': labels_train,
                    'label_2': labels_train
                })
                print('PCA', principalComponents.shape)
                print('labels', labels_train.shape)
                #print(labels_train.columns)
                finalDf = np.hstack((principalComponents, label_df))
                #finalDf =  pd.concat(principalComponents, labels_train)
                #save_pca = input('\nSave PCA data? Type "Yes" or "No": ')
                #if save_pca == "Yes":
                #print(finalDf.head())
                #print(finalDf.transpose().head())
                #    np.savetxt(r'C:\Users\oskar\eclipse-workspace\fEX\T1-feature-extraction-master\pca_data_test1.txt', finalDf, fmt='%d', delimiter=' ')
                print(
                    "\n------------------------------------------------------------"
                )
                print("\nNumber of features in the original data: ",
                      pca.n_features_)
                print(
                    pd.DataFrame(
                        pca.components_.transpose(),
                        index=[
                            'angle1', 'angle2', 'angle3', 'angle4', 'cu1',
                            'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3',
                            'chL4', 'd_tot', 'theta1', 'theta2', 'theta3',
                            'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'A1', 'A2',
                            'A3', 'A4', 'd11', 'd21', 'd31', 'd41', 'A1_tot',
                            'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3',
                            'h4', 'orientation', 'mid1', 'mid2', 'mid3', 'mid4'
                        ],
                        columns=['PC-1', 'PC-2']))
                print("Data variance ratio after PCA tranform: ",
                      pca.explained_variance_ratio_)
                print(
                    "\n------------------------------------------------------------"
                )
                vis_pca = input(
                    '\nVisualize PCA components? Type "Yes" or "No": ')
                if vis_pca == "Yes":
                    fig = plt.figure(figsize=(8, 8))
                    ax = fig.add_subplot(1, 1, 1)
                    ax.set_xlabel('Principal Component 1', fontsize=15)
                    ax.set_ylabel('Principal Component 2', fontsize=15)
                    ax.set_title('PCA: 2 component projection', fontsize=20)
                    targets = [0, 1]
                    colors = ['orangered', 'dodgerblue']
                    #print(finalDf.head())
                    for target, color in zip(targets, colors):
                        #finalDf_2 = finalDf[np.where(finalDf[:,2] == target)]
                        finalDf_2 = finalDf[finalDf[:, 2] == target]
                        if color == 'r':
                            alfa = 1.0
                            normi = 1.0
                        else:
                            alfa = 0.80
                            normi = 1.0
                        ax.scatter(finalDf_2[:, 0],
                                   finalDf_2[:, 1],
                                   c=color,
                                   alpha=alfa,
                                   marker='o',
                                   s=5.0,
                                   norm=normi)
                    ax.legend(targets)
                    ax.grid()
                    plt.show()
        clf = MLPClassifier(solver=solver_type,
                            alpha=1e-5,
                            hidden_layer_sizes=(9, 10, 2),
                            random_state=0,
                            max_iter=20000,
                            shuffle=True)
        clf.fit(input_train, labels_train)
        predicted = clf.predict(input_test)
        conf_matrix = metrics.confusion_matrix(labels_test, predicted)
        tn, fp, fn, tp = metrics.confusion_matrix(labels_test,
                                                  predicted).ravel()
        scores = cross_val_score(clf, input_test, labels_test, cv=5)

        print('Fitting of training data complete.')
        print('Predicting based on test data.')
        print("\nTraining set score: %f" %
              clf.score(input_train, labels_train))
        print("Test set score: %f" % clf.score(input_test, labels_test))
        print('Confusion matrix: \n')
        print('TN:', tn)
        print('TP: ', tp)
        print('FN: ', fn)
        print('FP:', fp)
        print('CROSS VALIDATION SCORES:', scores)
        print("\n------------------------------------------------------------")
        if i == 0:
            print('\nScaling the following features', inputs.columns)
            # fit the scaler on the training data only to avoid leakage into the test set
            scaler = StandardScaler().fit(input_train)
            input_train = scaler.transform(input_train)
            input_test = scaler.transform(input_test)
            print('\nRunning the same model with scaled data...')
        i = i + 1
    print('\nFinished.')
Example No. 25
# Read the credit_data dataset into df
df = pd.read_csv('CSVs/credit_data.csv')

# Replace the anomalous ages with the mean of the normal ones
df.loc[df.age < 0, 'age'] = df.age[df.age > 0].mean()

# Define the features as predictors
previsores = df.iloc[:, 1:4].values
classe = df.iloc[:, 4].values

# Instantiate the imputer class
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the algorithm on the existing values
imputer = imputer.fit(previsores)

# Fill in the cells that have missing values
previsores = imputer.transform(previsores)

# Instantiate the scaling class
scaler = StandardScaler()

# Standardize the values to the same scale
previsores = scaler.fit_transform(previsores)
"""
TRAIN/TEST SPLIT
"""

# Import the library to split the dataset into training and test sets
from sklearn.model_selection import train_test_split
Example No. 26
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
"""Importing Data Set"""

dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

print(x)

print(y)
"""Taking care of missing data"""

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

print(x)
"""#Encoding Categorical data
###Encoding the independent variable
"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

print(x)
"""###Encoding for dependent variables"""
Example No. 27
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

os.chdir("C:/Venkat/Personal/Trainings/Datasets/")
#creation of data frames from csv
titanic_train = pd.read_csv("Titanic_train.csv")
print(titanic_train.info())

#preprocessing stage
#The SimpleImputer class is for continuous numerical values, but it also supports categorical data represented
#as string values or pandas categoricals when using the 'most_frequent' or 'constant' strategy:
#SimpleImputer(strategy="most_frequent")

#impute missing values for continuous features
imputable_cont_features = ['Age', 'Fare']
#cont_imputer = preprocessing.Imputer()  # Imputer was removed from scikit-learn; use SimpleImputer instead
cont_imputer = SimpleImputer()  # default strategy is 'mean'
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(
    titanic_train[imputable_cont_features])

#impute missing values for categorical features
#cat_imputer = CategoricalImputer
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(titanic_train[['Embarked']])
#print(cat_imputer.fill_)
print(cat_imputer.statistics_)
titanic_train[['Embarked']] = cat_imputer.transform(titanic_train[['Embarked'
                                                                   ]])

le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
Example No. 28
def main():
    # data load
    df = pd.read_csv('./data/' + file_model + '.csv', header=0)
    ID = df.iloc[:, 0]
    y = df.iloc[:, -1]
    X = df.iloc[:, 1:-1]

    # preprocessing-1: one-hot encoding
    X_ohe = pd.get_dummies(X, dummy_na=True, columns=ohe_cols)
    X_ohe = X_ohe.dropna(axis=1, how='all')
    X_ohe_columns = X_ohe.columns.values

    # preprocessing-2: null imputation
    imp = SimpleImputer()
    imp.fit(X_ohe)
    X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
    print(X_ohe.shape)

    # preprocessing-3: feature selection
    selector = RFECV(estimator=RandomForestClassifier(n_estimators=100,
                                                      random_state=0),
                     step=0.05)
    selector.fit(X_ohe, y)
    X_ohe_selected = selector.transform(X_ohe)
    X_ohe_selected = pd.DataFrame(X_ohe_selected,
                                  columns=X_ohe_columns[selector.support_])
    print(X_ohe_selected.shape)
    X_ohe_selected.head()

    # preprocessing-4: preprocess the scoring data consistently with the model dataset
    if len(file_score) > 0:
        # load score data
        dfs = pd.read_csv('./data/' + file_score + '.csv', header=0)
        IDs = dfs.iloc[:, [0]]
        Xs = dfs.iloc[:, 1:-1]
        Xs_ohe = pd.get_dummies(Xs, dummy_na=True, columns=ohe_cols)
        cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)

        # consistent with columns set
        Xs_exp = pd.concat([cols_m, Xs_ohe])
        Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))] = \
            Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))].fillna(0, axis=1)
        Xs_exp = Xs_exp.drop(
            list(set(Xs_ohe.columns.values) - set(X_ohe_columns)), axis=1)

        # re-order the score data columns (reindex_axis was removed from pandas; use reindex)
        Xs_exp = Xs_exp.reindex(columns=X_ohe_columns)
        Xs_exp = pd.DataFrame(imp.transform(Xs_exp), columns=X_ohe_columns)
        Xs_exp_selected = Xs_exp.loc[:, X_ohe_columns[selector.support_]]

    # modeling
    clf.fit(X_ohe_selected, y.to_numpy().ravel())  # as_matrix() was removed from pandas; use to_numpy()
    joblib.dump(clf, './model/' + model_name + '.pkl')
    results = cross_val_score(clf, X_ohe_selected, y, scoring='roc_auc', cv=5)
    print('cv score:', np.average(results), '+-', np.std(results))

    # scoring
    if len(file_score) > 0:
        score = pd.DataFrame(clf.predict_proba(Xs_exp_selected)[:, 1],
                             columns=['pred_score'])
        IDs.join(score).to_csv('./data/' + model_name + '_' + file_score +
                               '_with_pred.csv',
                               index=False)

    # model profile
    imp = pd.DataFrame([clf.named_steps['est'].feature_importances_],
                       columns=X_ohe_columns[selector.support_])
    imp.T.to_csv('./data/' + model_name + '_feature_importances.csv',
                 index=True)
Example No. 29
import numpy as np

#Fetching the training dataset
import pandas as pd
train_data = pd.read_excel('../Dataset/Train_dataset(1).xlsx', 'Train_dataset')
test_data = pd.read_excel('../Dataset/Test_dataset_1.xlsx')
test_27March = pd.read_excel('../Dataset/Train_dataset(1).xlsx',
                             'Train_27March')

#Checking for missing values
#print(train_data.isnull().sum())

# Impute values with a simple method such as the column median
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(train_data[[
    'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate',
    'HDL cholesterol', 'Insurance', 'FT/month'
]])
X = pd.DataFrame(
    imp.transform(train_data[[
        'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate',
        'HDL cholesterol', 'Insurance', 'FT/month'
    ]]))
train_data[[
    'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate',
    'HDL cholesterol', 'Insurance', 'FT/month'
]] = imp.transform(train_data[[
    'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate',
    'HDL cholesterol', 'Insurance', 'FT/month'
]])

# Imputing categorical data by mode
Example No. 30
    # Import Data

    X = pd.read_csv("input/X.csv")
    print(X.shape)
    y = pd.read_csv("input/y.csv", squeeze=True)
    print(y.shape)

    train_metadata = pd.read_csv('input/metadata_train.csv')  # read train data
    test_metadata = pd.read_csv('input/metadata_test.csv')  # read test data

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=.2, stratify=y,
        random_state=1)  # split X, y into 80% train 20% valid

    my_imputer = SimpleImputer(strategy="median")
    my_imputer.fit(X_train)
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(my_imputer.transform(X_train)))
    X_valid = pd.DataFrame(scaler.transform(my_imputer.transform(X_valid)))

    def opt(X_train, y_train, X_test, y_test, trial):
        # param_list
        n_estimators = trial.suggest_int('n_estimators', 0, 1000)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
        # learning_rate = trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01)
        scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
        subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
        colsample_bytree = trial.suggest_discrete_uniform(
Example No. 31
    # plt.show()

    from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed; use pandas.plotting
    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    # scatter_matrix(housing[attributes], figsize=(12, 8))
    # plt.show()

    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]

    housing_num = housing.drop("ocean_proximity", axis=1)

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_num)
    X = imputer.transform(housing_num)

    housing_tr = pd.DataFrame(X, columns=housing_num.columns)

    # from sklearn.preprocessing import LabelEncoder
    # encoder = LabelEncoder()
    housing_cat = housing["ocean_proximity"]
    # housing_cat_encoded = encoder.fit_transform(housing_cat)

    # from sklearn.preprocessing import OneHotEncoder
    # encoder = OneHotEncoder()
    # housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
Example No. 32
# %%
housing_data = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# clean data
# housing_data.dropna(subset=["total_bedrooms"]) # first option -> drop rows with missing value
# housing_data.drop("total_bedroooms", axis = 1) # second option -> drop whole attribute
# median = housing_data["total_bedrooms"].median()
# housing_data["total_bedrooms"].fillna(median, inplace = True )

# %%

imputer = SimpleImputer(strategy="median")
housing_data_num = housing_data.drop("ocean_proximity", axis=1)
imputer.fit(housing_data_num)
imputer.statistics_

# %%
housing_data_num.median().values

# %%
transform_table = imputer.transform(housing_data_num)
housing_tr = pd.DataFrame(transform_table, columns=housing_data_num.columns)

# %%
encoder = LabelEncoder()
housing_cat = housing_data["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
Example No. 33
def fit_categorical_imputer(train_data: pd.DataFrame,
                            categorical_features: List[str]):
    categorical_imputer = SimpleImputer(strategy="most_frequent", copy=True)
    categorical_imputer.fit(train_data[categorical_features])
    return categorical_imputer
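A minimal usage sketch for the helper above (the frame and column name are illustrative, not from the original snippet):

import numpy as np
import pandas as pd

train = pd.DataFrame({"color": ["red", "blue", np.nan, "red"]})
cat_imputer = fit_categorical_imputer(train, ["color"])
train[["color"]] = cat_imputer.transform(train[["color"]])
print(train)  # the missing entry is filled with the most frequent value, "red"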
    "test_8": test_8,
    "test_9": test_9,
    "test_10": test_10,
    "test_11": test_11,
    "test_12": test_12,
    "test_13": test_13,
    "test_14": test_14,
    "test_charge": test_charge
})

### normalize data
train_X = dataset_train.drop('train_charge', axis=1)
test_X = dataset_test.drop('test_charge', axis=1)
#print(test_X)

sc.fit(train_X)

train_X_normalized = pd.DataFrame(sc.fit_transform(train_X),
                                  columns=train_X.columns.values)
test_X_normalized = sc.transform(test_X)
#print(test_X_normalized)
train_Y = dataset_train['train_charge']
test_Y = dataset_test['test_charge']
#print(train_X_normalized)
# print(train_Y)
### RFR regressor fit ###
'''
random_state: fixes the model's random state so that the same model is generated on every run
n_jobs: number of parallel processes (-1 uses all CPUs; the default None means 1)
max_features: about half of all features usually works best
max_depth: with few features or a small dataset, the default can be used
Example No. 35
def preprocess(data_dir='datasets/', imputation_type='mean'):
    # X is the complete data matrix
    missing_gt = 60
    print(os.getcwd())
    aps_training_df = pd.read_csv(data_dir + 'aps_failure_training_set.csv',
                                  error_bad_lines=False)
    aps_test_df = pd.read_csv(data_dir + 'aps_failure_test_set.csv',
                              error_bad_lines=False)
    print(aps_training_df.shape, aps_test_df.shape)
    print("=======Remove duplicates=================")
    aps_training_df.drop_duplicates(inplace=True)
    print(aps_training_df.shape, aps_test_df.shape)
    # aps_training_df = aps_training_df[aps_training_df['class']=='pos']
    print(aps_training_df.shape)
    print(aps_training_df.columns)
    #print(aps_training_df.isin(['na']).mean() * 100)
    print(aps_training_df.head())
    print('replacing "na" strings with NaN =========')
    aps_training_df.replace(r'na', np.nan, regex=False, inplace=True)
    aps_test_df.replace(r'na', np.nan, regex=False, inplace=True)

    print('===removing more than ' + str(missing_gt) +
          '% missing column=========')
    percent_missing = aps_training_df.isnull().sum() * 100 / len(
        aps_training_df)
    missing_value_df = pd.DataFrame({
        'column_name': aps_training_df.columns,
        'percent_missing': percent_missing
    })
    missing_value_df.sort_values('percent_missing',
                                 inplace=True,
                                 ascending=False)
    missing_gt_x = missing_value_df[
        missing_value_df['percent_missing'] > missing_gt].column_name

    print(missing_gt_x)
    aps_training_df = aps_training_df.drop(missing_gt_x, axis=1)
    aps_test_df = aps_test_df.drop(missing_gt_x, axis=1)

    print(
        "============================Remove missing rows from Traning data=========================================="
    )
    percent_missing_pos = (
        aps_training_df[aps_training_df['class'] == 'pos'].isnull().sum() /
        len(aps_training_df[aps_training_df['class'] == 'pos'])) * 100

    missing_value_df_pos = pd.DataFrame({
        'column_name': aps_training_df.columns,
        'percent_missing': percent_missing_pos
    })
    missing_value_df_pos.sort_values('percent_missing',
                                     inplace=True,
                                     ascending=False)
    prcent_row_missing = aps_training_df.isnull().sum(axis=1) * 100 / 170
    prcnt_50_row_missing = prcent_row_missing[prcent_row_missing > 50]
    print('More than 50% missing rows:::', len(prcnt_50_row_missing))
    #aps_training_df.drop(prcnt_50_row_missing.index, axis=0, inplace=True)
    print(
        "============================Remove missing rows from Test data =========================================="
    )
    percent_missing_pos = (
        aps_test_df[aps_test_df['class'] == 'pos'].isnull().sum() /
        len(aps_test_df[aps_test_df['class'] == 'pos'])) * 100

    missing_value_df_pos = pd.DataFrame({
        'column_name': aps_test_df.columns,
        'percent_missing': percent_missing_pos
    })
    missing_value_df_pos.sort_values('percent_missing',
                                     inplace=True,
                                     ascending=False)
    prcent_row_missing = aps_test_df.isnull().sum(axis=1) * 100 / 170
    prcnt_50_row_missing = prcent_row_missing[prcent_row_missing > 50]
    print('More than 50% missing rows:::', len(prcnt_50_row_missing))
    #aps_test_df.drop(prcnt_50_row_missing.index, axis=0, inplace=True)

    #intersection_list = list(set(missing_value_df_pos.index) & set(prcnt_50_row_missing.index))
    #print("==Intersection list::::::::::",intersection_list)
    #aps_training_df.drop(intersection_list, axis=1, inplace=True)
    #aps_test_df.drop(intersection_list, axis=1, inplace=True)

    print("Training and Test data-set shape after dropping features is ",
          aps_training_df.shape, aps_test_df.shape)

    # Print number of positive classes and number of negative classes in the training data-set
    print("Number of positive classes = ",
          sum(aps_training_df['class'] == 'pos'))
    print("Number of negative classes = ",
          sum(aps_training_df['class'] == 'neg'))
    print("*******************")

    print("===================Drop outliers=================")
    from scipy import stats

    def drop_numerical_outliers(df, z_thresh=3):
        # `constraints` is True for rows whose numeric columns all have a
        # z-score below the threshold
        constraints = df.select_dtypes(include=[np.number]) \
            .apply(lambda x: np.abs(stats.zscore(x)) < z_thresh) \
            .all(axis=1)
        # Return a copy with the rejected rows dropped (not an in-place drop)
        return df.drop(df.index[~constraints])

    # train = X_train
    # train['failure'] = Y_train

    #outlier = drop_numerical_outliers(aps_training_df)
    #print(outlier['class'].value_counts())
    #print(outlier.shape)

    # Extract features and labels from the training and test data-set
    y_train = aps_training_df[['class']].values
    x_train = aps_training_df.drop('class', axis=1)
    y_test = aps_test_df.loc[:, 'class'].values
    x_test = aps_test_df.drop('class', axis=1)

    columns = x_train.columns

    print('=======================remove duplicates====================')
    print('=============Missing Imputation=========')

    # Fill missing data in training and test data-set
    if (imputation_type == 'median'):
        imputer = SimpleImputer(strategy='median')
        imputer.fit(x_train.values)
        x_train = imputer.transform(x_train.values)
        x_test = imputer.transform(x_test.values)

    elif (imputation_type == 'knn'):
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(x_train.values)
        x_train = imputer.transform(x_train.values)
        x_test = imputer.transform(x_test.values)

    elif (imputation_type == 'mean'):
        imputer = SimpleImputer(strategy='mean')
        imputer.fit(x_train.values)
        x_train = imputer.transform(x_train.values)
        x_test = imputer.transform(x_test.values)

    else:
        x_train.fillna(-1, inplace=True)
        x_test.fillna(-1, inplace=True)

    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    x_train_df = pd.DataFrame(x_train, columns=columns)
    y_train_df = pd.DataFrame(y_train, columns=['failure'])

    x_test_df = pd.DataFrame(x_test, columns=columns)
    y_test_df = pd.DataFrame(y_test, columns=['failure'])

    #x_train_df, x_test_df = feature_selection(x_train_df,y_train_df,x_test_df)

    df_train_data = pd.concat([y_train_df, x_train_df], axis=1)
    df_train_data.to_csv("datasets/train_processed_aps_" + imputation_type +
                         ".csv")
    print(df_train_data.shape)

    df_test_data = pd.concat([y_test_df, x_test_df], axis=1)
    df_test_data.to_csv("datasets/test_processed_aps_" + imputation_type +
                        ".csv")
    print(df_test_data.shape)

    df_train_data['failure'] = df_train_data['failure'].replace(['neg', 'pos'],
                                                                [0, 1])
    df_test_data['failure'] = df_test_data['failure'].replace(['neg', 'pos'],
                                                              [0, 1])
    #gc.collect()
    print("Pre processing completed")

    return df_train_data, df_test_data
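# A minimal usage sketch for the function above, assuming the 'datasets/'
# directory layout it expects; the variable names are illustrative.
df_train, df_test = preprocess(data_dir='datasets/', imputation_type='median')
print(df_train['failure'].value_counts())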
Exemplo n.º 36
0
print(model.score(x_test, y_test))

print('\n')

#Do Predictions--------------------------------------------------------------
#read test file
new = pd.read_csv(r'testdata.csv')

#replace missing values with NaN
new = new.replace('[?]', numpy.nan, regex=True)

#replace numerical missing values with the mean
imputer = SimpleImputer(missing_values=numpy.nan, strategy='mean')

# fill A2 and A14
imputer = imputer.fit(new[['A2']])
new[['A2']] = imputer.transform(new[['A2']])

imputer = imputer.fit(new[['A14']])
new[['A14']] = imputer.transform(new[['A14']])

#replace non numerical missing values using forward filling
new = new.ffill(axis=0)

#label encode
new['A1'] = label_encoder.fit_transform(new['A1'])
new['A3'] = label_encoder.fit_transform(new['A3'])
new['A4'] = label_encoder.fit_transform(new['A4'])
new['A6'] = label_encoder.fit_transform(new['A6'])
new['A9'] = label_encoder.fit_transform(new['A9'])
new['A15'] = label_encoder.fit_transform(new['A15'])
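# A hedged sketch of the prediction step the comment above announces; it
# assumes the fitted `model` expects exactly the preprocessed columns in `new`.
predictions = model.predict(new)
print(predictions[:10])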
Exemplo n.º 37
0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

imputer = SimpleImputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Average CV score on the training set was: -6.068083376489832e+16
exported_pipeline = DecisionTreeRegressor(max_depth=3,
                                          min_samples_leaf=4,
                                          min_samples_split=18)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
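# A small hedged add-on (not part of the TPOT export): compare the held-out
# error with the CV score quoted above.
from sklearn.metrics import mean_squared_error

print(mean_squared_error(testing_target, results))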
Exemplo n.º 38
0
def main():
    np.set_printoptions(
        threshold=10)  # Summarize long ndarrays when printing instead of showing every element
    print('HOUSING_PATH=', HOUSING_PATH)
    print('HOUSING_URL=', HOUSING_URL)
    fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    print('After fetch_housing_data')
    housing = load_housing_data(HOUSING_PATH)
    print('After load_housing_data')
    print(housing.head())

    # INFO statement
    print("\nINFO statement:")
    print(housing.info())

    # Value counts
    print("\nValue counts:")
    print(housing["ocean_proximity"].value_counts())

    # "describe" statement for summary
    print("\nDESCRIBE statement:")
    print(housing.describe())

    # Plot data
    #housing.hist(bins=50,figsize =(20,15))
    #plt.show()

    # Test set sampling - random vs stratification
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                                   labels=[1, 2, 3, 4, 5])
    housing["income_cat"].hist()
    #plt.show()

    # Random test set
    # Not necessarily the best sampling method: e.g. sex can influence median income, so preserving the right male/female fraction is critical
    rand_train_set, rand_test_set = train_test_split(housing,
                                                     test_size=0.2,
                                                     random_state=42)

    # Stratification of data
    print("\nStratify housing data:")
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    print(split.split(housing, housing["income_cat"]))
    print(len(list(split.split(housing, housing["income_cat"]))))

    ic = 0
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        ic += 1
        print("ic = ", ic)
        print(len(train_index), train_index)
        print(len(test_index), test_index)
        #sys.exit()
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
        #strat_full_set=housing

    #rts = (rand_test_set["income_cat"].value_counts()/len(rand_test_set)).sort_index()
    #sts = (strat_test_set["income_cat"].value_counts()/len(strat_test_set)).sort_index()
    #sfs = (strat_full_set["income_cat"].value_counts()/len(strat_full_set)).sort_index()
    #print('rand_test:  \n{0}'.format(rts))
    #print('strat_test: \n{0}'.format(sts))
    #print('strat_full: \n{0}'.format(sfs))

    # Separate predictors and labels
    print("\nSeparate predictors and labels:")
    housing = strat_train_set.drop("median_house_value",
                                   axis=1)  # <- Predictor data
    housing_labels = strat_train_set["median_house_value"].copy()  # <- Labels
    housing_cat = housing[["ocean_proximity"]]  # Non-numeric categories
    print("housing_cat.head(10) = {}".format(housing_cat.head(10)))
    ''' Sklearn - Simple imputer '''
    imputer = SimpleImputer(strategy="median")
    housing_num_only = housing.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num_only)
    print("imputer.statistics_ = {0}".format(imputer.statistics_))
    print("housing_num_only.median() = {0}".format(housing_num_only.median()))
    X = imputer.transform(housing_num_only)
    housing_tr = pd.DataFrame(X, columns=housing_num_only.columns)
    print('housing_tr.info() : ')
    print(housing_tr.info())
    ''' Encoding '''
    print("housing_cat_encoded = {0}".format(housing_cat[:10]))
    # Ordinal encoder : replace categorical attributes into numbers
    # Issue with this method is the "distance" between the numerical values
    ##print("\nOrdinal encoder:")
    ##ordinal_encoder = OrdinalEncoder()
    ##housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    ##print("housing_cat_encoded = {0}".format(housing_cat_encoded[:10]))
    ##print("ordinal_encoder.categories_ = {0}".format(ordinal_encoder.categories_))

    # One-hot encoder: splits the category into binary (0/1) columns
    # This avoids the "distance" problem of the ordinal encoder
    # Output is a SciPy sparse matrix; use toarray() to convert it to a NumPy array
    print("\nOne-hot encoder:")
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
    #print("housing_cat_1hot = {0}".format(housing_cat_1hot))
    #print("housing_cat_1hot.toarray() = {0}".format(housing_cat_1hot.toarray()))

    # Attribute adder
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Transformation pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    #print("num_pipeline = {0}".format(type(num_pipeline)))
    housing_num_tr = num_pipeline.fit_transform(housing_num_only)
    #print("housing_num_tr = {0}".format(housing_num_tr))

    num_attribs = list(housing_num_only)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    print("housing.head(5): ", housing.head(5))
    housing_prepared = full_pipeline.fit_transform(housing)
    print("housing_parepared: ", pd.DataFrame(housing_prepared).iloc[:5])
    ''' Training and evaluating on the training set '''
    # Perform linear regression
    print('Linear regression:')
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    #print("housing.head(5): ",housing.head(5))
    ''' 
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    print("len(some_data): ",len(some_data))
    print("some_data: ",some_data)
    print("len(some_labels): ",len(some_labels))
    print("some_labels: ",some_labels)
    some_data_prepared = full_pipeline.transform(some_data)  # Output is a numpy array
    #print("some_data_prepared: ",pd.DataFrame(some_data_prepared))
    print("Labels: ",list(some_labels))
    '''
    print("Labels: ", housing_labels)
    #print("Labels: ",list(housing_labels))

    #np.set_printoptions(threshold=np.inf)

    # Compute RMSE
    #some_predictions = lin_reg.predict(some_data_prepared)
    #print("Predictions:", type(some_predictions))
    #print("len(Predictions):", len(some_predictions))
    #print("Predictions:", some_predictions)
    #lin_mse = mean_squared_error(some_labels,some_predictions)
    housing_predictions = lin_reg.predict(housing_prepared)
    print("len(housing_predictions):", len(housing_predictions))
    print("Predictions:", housing_predictions)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print('lin_rmse = {0}\n'.format(lin_rmse))

    # Perform Decision tree regressor
    print('Decision tree regressor:')
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Compute RMSE
    housing_predictions = tree_reg.predict(housing_prepared)
    print("Predictions:", housing_predictions)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print('tree_rmse = {0}\n'.format(tree_rmse))
    print('')
    ''' Cross-validation '''
    print('*** Cross-validation ***')
    print('  Decision tree:')
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    display_scores(tree_rmse_scores)
    print('')

    print('  Linear regression:')
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)
    print('')

    print('  Random forest regressor:')
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    print("Predictions:", housing_predictions)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print('forest_rmse = {0}\n'.format(forest_rmse))
    scores = cross_val_score(forest_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    forest_rmse_scores = np.sqrt(-scores)
    display_scores(forest_rmse_scores)
    print('')
    ''' Fine-tuning '''
    param_grid = [
        {
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    print('grid_search.best_params_ = {}'.format(grid_search.best_params_))

    print('grid_search.best_estimator_', grid_search.best_estimator_)

    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    feature_importances = grid_search.best_estimator_.feature_importances_
    print('feature_importances: \n{0}'.format(feature_importances))

    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print('sorted(fi,attr):')
    ftri_attribs = sorted(zip(feature_importances, attributes), reverse=True)
    for fa in ftri_attribs:
        print(fa)
    ''' Evaluate your system on the Test Set '''
    print('Evaluating the Test Set ')
    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    Y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(Y_test, final_predictions)
    print('final_mse = ', final_mse)
    final_rmse = np.sqrt(final_mse)  # => evaluates to 47,730.2
    print('final_rmse = ', final_rmse)

    return
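
# Assumed entry point so the walkthrough above runs as a script; the original
# excerpt ends before any such guard, so this line is an addition.
if __name__ == "__main__":
    main()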