Example #1
def test_imputation_error_sparse_0(strategy):
    # check that an error is raised when missing_values == 0 and the input is sparse
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
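# A minimal, self-contained sketch (hypothetical data) of the dense
# workaround the error above asks for: with missing_values=0, the implicit
# zeros of a sparse matrix are ambiguous, so SimpleImputer requires a dense
# array in that case.
import numpy as np
from scipy import sparse
from sklearn.impute import SimpleImputer

X_sp = sparse.csc_matrix(np.array([[0., 1.], [2., 0.]]))
imputer_dense = SimpleImputer(missing_values=0, strategy="mean")
X_imputed = imputer_dense.fit_transform(X_sp.toarray())  # densify first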
Example #2
def data_preprocessing(dataset):
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    # SimpleImputer expects 2-D input, so slice the column as X[:, 3:4]
    # rather than the 1-D X[:, 3]
    imputer = imputer.fit(X[:, 3:4])

    #X = imputer.fit_transform(X[:, 5]) Testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
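# The 2-D requirement above is a common pitfall. A self-contained sketch
# (hypothetical data) of the equivalent reshape-based form:
import numpy as np
from sklearn.impute import SimpleImputer

X_demo = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 5.0]])
col = X_demo[:, 1].reshape(-1, 1)        # 1-D column -> (n_samples, 1)
X_demo[:, 1] = SimpleImputer(strategy="mean").fit_transform(col).ravel()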
Example #3
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy)
        )
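# Pickling round-trips like the test above are also how a fitted imputer is
# persisted to disk. A hedged, self-contained sketch using joblib (which
# ships as a scikit-learn dependency); the file name is hypothetical:
import numpy as np
import joblib
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy="mean").fit(np.array([[1.0, np.nan], [3.0, 4.0]]))
joblib.dump(imp, "imputer.joblib")
imp_loaded = joblib.load("imputer.joblib")  # transforms identically to imp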
Example #4
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
Example #5
def test_mice_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       initial_strategy=strategy,
                       random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then mice will
    # only use the initial imputer for that feature at transform
    assert np.all(mice.transform(X_test)[:, 0] ==
                  initial_imputer.transform(X_test)[:, 0])
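# Note: MICEImputer only existed in pre-release 0.20 builds of scikit-learn;
# released versions ship the same idea as IterativeImputer (see the previous
# example). A sketch of the modern import, which must be enabled explicitly
# while the estimator is experimental:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer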
Example #6
def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
Example #7
    from pandas.plotting import scatter_matrix
    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    # scatter_matrix(housing[attributes], figsize=(12, 8))
    # plt.show()

    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]

    housing_num = housing.drop("ocean_proximity", axis=1)

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_num)
    X = imputer.transform(housing_num)

    housing_tr = pd.DataFrame(X, columns=housing_num.columns)

    # from sklearn.preprocessing import LabelEncoder
    # encoder = LabelEncoder()
    housing_cat = housing["ocean_proximity"]
    # housing_cat_encoded = encoder.fit_transform(housing_cat)

    # from sklearn.preprocessing import OneHotEncoder
    # encoder = OneHotEncoder()
    # housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    housing_cat_1hot = encoder.fit_transform(housing_cat)
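    # LabelBinarizer works here, but it was designed for target labels; since
    # scikit-learn 0.20, OneHotEncoder accepts string features directly. A
    # hedged sketch of the feature-side equivalent:
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(handle_unknown="ignore")
    housing_cat_1hot = encoder.fit_transform(housing[["ocean_proximity"]]).toarray()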
Example #8
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#code section

#data loading

veriler = pd.read_csv('eksikveriler.csv')

#scikit-learn: handling missing values

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

Yas = veriler.iloc[:, -2:-1].values

imputer = imputer.fit(
    Yas
)  #fit learns the mean value of the column(s) from the data

Yas = imputer.transform(Yas)  #transform fills the NaNs with what fit learned
print(Yas)

veriler.iloc[:, -2:-1] = Yas  #write the imputed values back into the DataFrame

print(veriler)
Example #9
# statistical imputation transform for the horse colic dataset
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url, header=None, na_values='?')
# split into input and output elements
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
# print total missing
print('Missing: %d' % sum(isnan(X).flatten()))
# define imputer
imputer = SimpleImputer(strategy='mean')
# fit on the dataset
imputer.fit(X)
# transform the dataset
Xtrans = imputer.transform(X)
# print total missing
print('Missing: %d' % sum(isnan(Xtrans).flatten()))
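# The fitted imputer exposes the learned per-column means through its public
# statistics_ attribute; a quick sanity check on the first few columns:
print('Learned means:', imputer.statistics_[:5])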
Example #10
#X = X.fillna(0) # instead of imputing
from sklearn.impute import SimpleImputer
#imputer = SimpleImputer()

y = df_modified['poi']

# separate financial and mail features to rescale
#X_financial = X.drop(email_features_list, axis=1)
X_financial = X[finance_features_list]
#X_financial['bonus/salary'] = X_financial['bonus'] / X_financial['salary'] # kinda leaky, but imagine we have the data necessary
#X_financial = X_financial.drop(['bonus', 'salary'], axis=1)

#X_financial = X_financial.fillna(0)
imputerF = SimpleImputer(strategy='median')
imputerF.fit(X_financial)
X_financial = imputerF.transform(X_financial)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#scalerF = MinMaxScaler()
scalerF = StandardScaler()
scalerF.fit(X_financial)
X_financial = scalerF.transform(X_financial)
X_financial = pd.DataFrame(X_financial, index=X.index.values)

X_mail = X[email_features_list]
X_mail['from_poi/from'] = X_mail['from_this_person_to_poi'] / X_mail[
    'from_messages']
X_mail['to_poi/to'] = (
    X_mail['from_poi_to_this_person'] +
    X_mail['shared_receipt_with_poi']) / X_mail['to_messages']
Example #11
previsores[:, 1] = lambelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = lambelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = lambelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = lambelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = lambelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = lambelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = lambelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = lambelEncoder_previsores.fit_transform(previsores[:, 13])

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='median')  # FILLS THE NULL (NaN) FIELDS WITH THE COLUMN MEDIAN
# note: len(previsores) counts rows, not columns; shape[1] covers every column
imputer = imputer.fit(previsores[:, 0:previsores.shape[1]])
previsores[:, 0:previsores.shape[1]] = imputer.transform(
    previsores[:, 0:previsores.shape[1]])

# ENCODE THE CLASS AS BINARY VALUES
labelEnconderClasse = LabelEncoder()
classe = labelEnconderClasse.fit_transform(classe)

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#SCALING (DATA STANDARDIZATION)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)
Example #12
import numpy as np
import pandas as pd
ds = pd.read_csv("Data.csv")
x = ds.iloc[:, 0:3].values
y = ds.iloc[:, 3].values
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
imp = imp.fit(x[:, 1:3])
x[:, 1:3] = imp.transform(x[:, 1:3])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# OneHotEncoder's categorical_features argument was removed in scikit-learn
# 0.22; a ColumnTransformer selects the column to one-hot encode instead
ct = ColumnTransformer([("encoder", OneHotEncoder(), [0])],
                       remainder="passthrough")
x = np.array(ct.fit_transform(x))
lec = LabelEncoder()
y = lec.fit_transform(y)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
Example #13
# Data Preprocessing

# Importing the libraries
# import matplotlib.pyplot as plt
# the pandas library offers data structures and operations for manipulating numerical tables and time series
import numpy as np
import pandas as pd

# Importing the dataset
df = pd.read_csv('Data.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, 3].values

# Taking care of missing data
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imp = imp.fit(X[:, 1:3])
X[:, 1:3] = imp.transform(X[:, 1:3])

# GroupBy of DataFrame test
# dfGroup = df.groupby('Purchased')
# for Purchased in dfGroup:
#     print(Purchased)
Example #14
def impute(train, validate, test, my_strategy, column_list):
    imputer = SimpleImputer(strategy=my_strategy)
    train[column_list] = imputer.fit_transform(train[column_list])
    validate[column_list] = imputer.transform(validate[column_list])
    test[column_list] = imputer.transform(test[column_list])
    return train, validate, test
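# A hypothetical usage sketch for the helper above (the DataFrames and the
# column names are placeholders, not part of the original example):
#
#     train, validate, test = impute(train, validate, test,
#                                    my_strategy="median",
#                                    column_list=["bedrooms", "bathrooms"])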
Example #15
test_num_group = num_imp.transform(test_num_group)  # transform only: the imputer was fitted on the training data
std_ = StandardScaler()
train_num_group = std_.fit_transform(train_num_group)
test_num_group = std_.transform(test_num_group)  # avoid re-fitting the scaler on the test data
train['y'] = train['y'].replace({'yes': True, 'no': False})
test['y'] = test['y'].replace({'yes': True, 'no': False})
train = train.drop(num_list, axis=1)
test = test.drop(num_list, axis=1)
one_hot_groupname = ['marital', 'day_of_week']
one_hot_train = train[one_hot_groupname]
one_hot_test = test[one_hot_groupname]
train = train.drop(one_hot_groupname, axis=1)
test = test.drop(one_hot_groupname, axis=1)
imp = SimpleImputer(strategy='most_frequent')
one_hot_train = imp.fit_transform(one_hot_train)
one_hot_test = imp.transform(one_hot_test)
one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot_train = one_hot.fit_transform(one_hot_train)
one_hot_train = one_hot_train.toarray()
one_hot_test = one_hot.transform(one_hot_test)
one_hot_test = one_hot_test.toarray()
encoder_columns = list(train.columns)
encoder_columns = encoder_columns[:-1]
for col_name in encoder_columns:
    df_change = train[[col_name, 'y']]
    df_change = df_change.groupby(col_name).mean().sort_values(
        'y').reset_index()
    num = 1
    match_dict = dict()
    for i in df_change.iloc[:, 0]:
        match_dict[i] = num
        num += 1  # rank the categories in target-mean order
Example #16
'''
    ... this information only helps in specific cases and in general
    introduces many weaknesses into the algorithm, so it is not taken
    into account. An example of when it does help: when the user is
    known before the purchase!
'''

# import the imputer - fill the NaN values with zero
from sklearn.impute import SimpleImputer
# the missing values [categories 2 and 3 have many nulls]
data.isnull().sum()
# fill the null values with 0 - the template
imputer = SimpleImputer(missing_values=np.nan,
                        strategy='constant',
                        fill_value=0)
imputer = imputer.fit(data.iloc[:, 9:11])
data.iloc[:, 9:11] = imputer.transform(data.iloc[:, 9:11])

# drop the first two columns of the dataset [user and product]
data.drop(data.columns[[0, 1]], axis=1, inplace=True)
'''
    - Because we need to remove outliers from certain columns, the dataset
    must contain numeric values rather than NaN (the remove_outlier
    function below expects numbers, not strings).
'''

# OUTLIERS
from pandas.api.types import is_numeric_dtype


def remove_outlier(df):
    low = .05
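# The scraped example cuts off here. A hedged sketch of how a quantile-based
# remove_outlier typically continues, assuming the .05/.95 bounds implied by
# low = .05 above:
def remove_outlier_sketch(df):
    low, high = .05, .95
    bounds = df.quantile([low, high], numeric_only=True)
    for col in bounds.columns:
        # keep only the rows inside the per-column quantile bounds
        df = df[(df[col] >= bounds.loc[low, col]) &
                (df[col] <= bounds.loc[high, col])]
    return df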
Example #17
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

dataset = pd.read_csv(r'04_dados_exercicio.csv')

features = dataset.iloc[:, :-1].values

classe = dataset.iloc[:, -1].values

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

imputer.fit(features[:, 2:4])
features[:, 2:4] = imputer.transform(features[:, 2:4])
columnTransformer = ColumnTransformer(transformers=[('encoder',
                                                     OneHotEncoder(), [1])],
                                      remainder='passthrough')
features = np.array(columnTransformer.fit_transform(features))
labelEncoder = LabelEncoder()
classe = labelEncoder.fit_transform(classe)

features_treinamento, features_teste, classe_treinamento, classe_teste = train_test_split(
    features, classe, test_size=0.15, random_state=1)

standardScaler = StandardScaler()

features_treinamento[:, 4:6] = standardScaler.fit_transform(
    features_treinamento[:, 4:6])
Example #18
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

# %% [markdown]
# # Step 3: Imputation
#
# ### Part A
#
# Use the next code cell to impute missing values with the mean value along each column.  Set the preprocessed DataFrames to `imputed_X_train` and `imputed_X_valid`.  Make sure that the column names match those in `X_train` and `X_valid`.

# %% [code]

# Fill in the lines below: imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

# Check your answers
step_3.a.check()

# %% [code]
# Lines below will give you a hint or solution code
# step_3.a.hint()
step_3.a.solution()

# %% [markdown]
# Run the next code cell without changes to obtain the MAE for this approach.
Example #19
# Define the feature matrix (previsores) and the class vector
previsores = df.iloc[:,1:4].values
classe = df.iloc[:, 4].values

# Instantiate the imputer class
imputer = SimpleImputer(
    missing_values = np.nan,
    strategy='mean'
    )

# Fit the algorithm on the existing values
imputer = imputer.fit(previsores[:, 0:3])

# Fill in the cells that have missing values
previsores = imputer.transform(previsores)

# Instantiate the StandardScaler class to scale the values
scaler = StandardScaler()
# Standardize the values onto the same scale
previsores = scaler.fit_transform(previsores)

"""
DIVISÃO TREINO E TESTE
"""

previsores_train, previsores_test, classe_train, classe_test = train_test_split(
        previsores,
        classe,
        test_size=0.25,
        random_state=0
)
Example #20
Eyas = eksikveriler[['yas']]

#sci - kit learn
# *** Fill the NaN (empty) values in the Yas column with the column mean ***
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Extract the relevant column (Yas) to be processed
Yas = eksikveriler.iloc[:, 3:4].values
#print(Yas)

imputer = imputer.fit(Yas)  # the fit function is used for learning
Yas = imputer.transform(
    Yas
)  # transform applies what was learned (replaces the NaN values with the mean)

# write the corrected values back and assign them to the veriler variable
eksikveriler.iloc[:, 3:4] = Yas
veriler = eksikveriler
#print(veriler)

# *** Converting categorical data to numeric ***

# extract the categorical column (ulke) from the data
ulke = veriler.iloc[:, 0:1].values
#print(ulke)

from sklearn import preprocessing
Example #21
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Read the input data from the external CSV
dataset = pd.read_csv('Haiti.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Taking care of missing data
from sklearn.impute import SimpleImputer
# creating an object of the SimpleImputer class named "imputer"
# (the old verbose argument was deprecated and later removed from SimpleImputer)
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X[:, 1:8]) # the slice's upper bound is excluded, the lower bound is included
X[:, 1:8] = imputer.transform(X[:, 1:8])


# Encoding the dependent Variable
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)


#Rescale data (between 0 and 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)


Example #22
dataset.head()
dataset.isna().any()
dataset.isna().sum()

dataset = dataset.drop(
    columns=['MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley'])

# Taking care of missing data
"""
dataset.fillna(value=dataset['LotFrontage'].mean(),inplace=True)

"""
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(dataset.iloc[:, [3, 25, 57]].values)
dataset[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']] = imputer.transform(
    dataset.iloc[:, [3, 25, 57]].values).astype('int32')
dataset.fillna(value='None Available', inplace=True)
dataset.isna().sum()

X = dataset.drop(columns='SalePrice')  #independent fields
y = dataset['SalePrice']  #label

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
Xnumeric = X.select_dtypes(include=numerics)

fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Independent Columns (Continuous values)',
             fontsize=20)
## Histograms
for i in range(1, Xnumeric.shape[1]):
    plt.subplot(7, 6, i)
Example #23
# input interval variables
numerical_inputs = list(df.select_dtypes(include=['int64', 'float32']).columns)
inputs = class_inputs + numerical_inputs

# Data engineering #

# Impute missings

categorical_imputer = SimpleImputer(
    missing_values='', strategy='most_frequent')
numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Impute categorical variables

categorical_imputer.fit(df[class_inputs])
categorical_imputed = categorical_imputer.transform(df[class_inputs])
df_categorical_imputed = pd.DataFrame(
    data=categorical_imputed, columns=class_inputs)

# Impute numerical variables

numerical_imputer.fit(df[numerical_inputs])
numerical_imputed = numerical_imputer.transform(df[numerical_inputs])
df_numerical_imputed = pd.DataFrame(
    data=numerical_imputed, columns=numerical_inputs)

# One-hot encoding

encoder = OneHotEncoder()
encoder.fit(categorical_imputed)
categorical_encoded = encoder.transform(categorical_imputed)
Example #24
imputed_X_train_plus = X.copy()
imputed_X_test_plus = test_X.copy()

# cols_with_missing = (col for col in X.columns
#                                  if X[col].isnull().any())
# for col in cols_with_missing:
#     imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
#
#
# cols_with_missing_test = (col for col in test_X.columns
#                                       if test_X[col].isnull().any())
# for col in cols_with_missing_test:
#     imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

model = RandomForestRegressor(n_estimators=20, random_state=0)
model.fit(imputed_X_train_plus, y)

# make predictions which we will submit.
test_preds = model.predict(imputed_X_test_plus)

# The lines below show how to save predictions in the format used for
# competition scoring.

output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
Example #25
print(X_train.Sex)

#understand OneHotEncoder() through an example
#turn a multi-dimensional array [0,2,1,1] into a column vector and feed it to OneHotEncoder
one = sp.OneHotEncoder()
print('onehotencoder starts here')
print(X_train.Sex.values)
print(X_train.Sex.values.reshape(-1, 1))
print(X_train.Sex.values.reshape(-1, 1).transpose())
enced = one.fit_transform(X_train.Sex.values.reshape(-1, 1))
print('enced starts here')
print(enced)
print(enced.toarray())
#unsure what index=df.Sex.index does
temp = pd.DataFrame(index=df.Sex.index,
                    columns='Sex-' + le.classes_,
                    data=enced.toarray())
print('temp starts here')
print(temp)
enced_data = pd.concat([X_train, temp], axis=1)
del enced_data['Sex']

from sklearn.impute import SimpleImputer
im = SimpleImputer(missing_values=np.nan, strategy='mean')
im.fit(enced_data)
im.transform(enced_data)  # note: this transformed result is discarded
print('enced_data here')
print(enced_data)
enced_data = im.fit_transform(enced_data)
print(enced_data)
Example #26
#---------------------------
#Drop Columns with Missing Values
# Get names of columns with missing values
cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
#---------------------------
#Imputation
from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
#---------------------------
#An Extension to Imputation
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
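# The step that usually follows the extension above (a hedged sketch
# mirroring the plain-imputation cell earlier in this example): impute the
# extended copies and restore the column names.
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns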
Example #27
import numpy as np
import pandas as pd
# ----------------------------------------------------
dataset = pd.read_csv('path/2.9 Ensemble Reg/houses.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# ----------------------------------------------------
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X)
X = imp.transform(X)
# ----------------------------------------------------
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=42)
# ----------------------------------------------------
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor.fit(X_train, y_train)
# ----------------------------------------------------
print(regressor.score(X_train, y_train))
print(regressor.score(X_test, y_test))
print("=" * 25)
print(regressor.feature_importances_)
print("=" * 25)
# ----------------------------------------------------
# Predicting a new result
y_pred = regressor.predict(X_test)
Example #28
# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
from sklearn.impute import SimpleImputer

missingvalues = SimpleImputer(missing_values=np.nan,
                              strategy='mean')  # (the verbose argument was removed from SimpleImputer)

missingvalues = missingvalues.fit(X[:, 1:3])

X[:, 1:3] = missingvalues.transform(X[:, 1:3])

# Encoding categorical data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# OneHotEncoder's categorical_features argument was removed in scikit-learn
# 0.22; a ColumnTransformer one-hot encodes column 0 instead
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

from sklearn.model_selection import train_test_split
Example #29
boykilo = veriler[['boy', 'kilo']]
print(boykilo)

#missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#replace the NaN values with the column mean
Yas = veriler.iloc[:, 1:4].values
#iloc : integer location
#[:, 1:4] => the colon fetches every row, then columns 1 up to (excluding) 4

print(Yas)

imputer = imputer.fit(Yas)  # => the learning step (Yas already holds columns 1:4)
Yas = imputer.transform(Yas)  # replace the missing values

# Converting categorical data - the encoder maps categorical values to numeric ones
ulke = veriler.iloc[:, 0:1].values
print(ulke)

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

ulke[:, 0] = le.fit_transform(veriler.iloc[:, 0])
print(ulke)

ohe = preprocessing.OneHotEncoder()
ulke = ohe.fit_transform(ulke).toarray()
print(ulke)
Example #30
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('3.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

print(x)

print(y)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

print(x)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

print(x)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Example #31
import os
import numpy as np
import pandas as pd

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

imp = SimpleImputer(missing_values=np.nan, strategy="mean")
imp = imp.fit(X_train)

X_train_imp = imp.transform(X_train)
model = RandomForestClassifier(n_estimators=1000, max_depth=32, random_state=1)
model.fit(X_train_imp, y)

X_test_imp = imp.transform(X_test)
predictions = model.predict(X_test_imp)

output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": predictions})
output.to_csv("my_submission.csv", index=False)
print("Your submission has been successfully saved!")
Example #32
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

df_original = pd.read_csv(r"C:\Users\Suzana\Desktop\Iris\iris2.csv")
df_missing = pd.read_csv(r"C:\Users\Suzana\Desktop\Iris\MCAR_30.csv")

# I only need the features
features_original = df_original.iloc[:, 0:4]
features_missing = df_missing.iloc[:, 1:5]

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(features_missing)
features_estimated = imp_mean.transform(features_missing)

# Find the indices of the missing values, for validation
indices_missing = np.argwhere(features_missing.isna().values)
indices_missing_hashable = map(tuple, indices_missing)
indices_set = set(indices_missing_hashable)
indices = list(indices_set)
print(indices)
print(len(indices))
s = 0
r = 0
for i in indices:
    s = s + abs(features_original.values.item(i) - features_estimated.item(i))
    r = r + (features_original.values.item(i) - features_estimated.item(i))**2
print('True MAE =', s / len(indices))
print('Fake MAE =', sqrt(r / len(indices)))

Example #33
X['Fecha'] = pd.to_datetime(X['Fecha'])  #IFE
X['Dia'] = pd.DatetimeIndex(X['Fecha']).day.astype('object')
X['Mes'] = pd.DatetimeIndex(X['Fecha']).month.astype('object')
X['Dia_Semana'] = (pd.DatetimeIndex(X['Fecha']).weekday + 1).astype(
    'object')  #FFE

variables_categoricas = X.dtypes.pipe(
    lambda x: x[x == 'object']).index  #this does belong in FE

num_cols = X.dtypes.pipe(lambda x: x[x != 'object']).index
for x in num_cols:
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(np.array(X[x]).reshape(-1, 1))
    X[x] = imp.transform(np.array(X[x]).reshape(-1, 1))

nominal_cols = X.dtypes.pipe(lambda x: x[x == 'object']).index
for x in nominal_cols:
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp.fit(np.array(X[x]).reshape(-1, 1))
    X[x] = imp.transform(np.array(X[x]).reshape(-1, 1))
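# Re-fitting a fresh SimpleImputer per column works, but a single
# ColumnTransformer is the more idiomatic route; a hedged sketch reusing the
# num_cols/nominal_cols index objects computed above:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

preprocess = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), list(num_cols)),
    ('cat', SimpleImputer(strategy='most_frequent'), list(nominal_cols)),
])
X_imputed = preprocess.fit_transform(X)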

x_mat = pd.get_dummies(X, columns=variables_categoricas, drop_first=True)  #FFE

#Start of modeling

indice_ent = X['Fecha'] <= '2019-11-30'  #belongs in the modeling step

variables_a_eliminar = ['Fecha', 'Año', 'Afluencia']  #belongs in the modeling step
Example #34
# In[ ]:


columns = train_data.columns[1:]


# ### Replace NaN values (imputation) and standardize data

# In[ ]:


# For class 1 data point
imp = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imp.fit_transform(train_data.iloc[:, 1:])
X_test = imp.transform(test_data.iloc[:, 1:])
pickle.dump(imp, open('imputer.pkl', 'wb'))

print("Number of NaN after imputation", np.count_nonzero(np.isnan(X_train)))


# standardizing data for better EDA and modeling

# In[ ]:


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
pickle.dump(scaler, open('scaler.pkl', 'wb'))
Example #35
import numpy as np
import pandas as pd

dataset = pd.read_csv('houses.csv')

dataset.head(20)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)

X = dataset[:, :-1]
y = dataset[:, -1]

X
y

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

X_train
X_test
y_train
Example #36
import pandas as pd

base = pd.read_csv('credit_data.csv')
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:,1:4].values
classe = base.iloc[:,4].values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputer = imputer.fit(previsores[:, 0:3])
previsores[:, 0:3] = imputer.transform(previsores[:, 0:3])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, classe, test_size=0.25, random_state=0)

# the original leaves the classifier blank; any scikit-learn estimator can
# be plugged in here, e.g. (as one hypothetical choice) Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
classificador = GaussianNB()
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

from collections import Counter
Counter(classe_teste)
Example #37
# train_X.isna().sum(axis = 0)

# %% [markdown]
# # Handling columns with missing data in the training set
# 1. Option 1: drop every row that has a missing value in some column
# 2. Option 2: replace missing numeric values with the mean (``mean``) of the whole column
#     * Approach 1: use the ``SimpleImputer`` class from ``sklearn.impute``
#     * Approach 2: perform the same column operations by hand
#
# See page 63 of Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow.

# %%
# Approach 1:
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_mean.fit(train_X["total_bedrooms"].values.reshape(-1, 1))
tmp_total_bedrooms = imputer_mean.transform(
    train_X['total_bedrooms'].values.reshape(-1, 1))

# %%
# Approach 2:
idx_null = train_X["total_bedrooms"].isnull()  # flag the missing entries
mean_total_bedrooms = train_X["total_bedrooms"][train_X["total_bedrooms"].isna(
) == False].mean()  # mean of the non-missing values
train_X["total_bedrooms"].fillna(
    mean_total_bedrooms,
    inplace=True)  # replace the missing rows with the mean value
print(train_X["total_bedrooms"][idx_null == True]
      )  # print the originally-missing rows to verify the result

# %%
print((train_X["total_bedrooms"] == tmp_total_bedrooms.squeeze()
       ).all())  # check whether Approach 1 and Approach 2 agree