Example #1
def test_imputation_error_invalid_strategy(strategy):
    X = np.ones((3, 5))
    X[0, 0] = np.nan

    with pytest.raises(ValueError, match=str(strategy)):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
Example #2
def test_changed_only():
    # Make sure the changed_only param is correctly used
    set_config(print_changed_only=True)
    lr = LogisticRegression(C=99)
    expected = """LogisticRegression(C=99)"""
    assert lr.__repr__() == expected

    # Check with a repr that doesn't fit on a single line
    lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
                            tol=1234, verbose=True)
    expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected

    imputer = SimpleImputer(missing_values=0)
    expected = """SimpleImputer(missing_values=0)"""
    assert imputer.__repr__() == expected

    # Defaults to np.NaN, trying with float('NaN')
    imputer = SimpleImputer(missing_values=float('NaN'))
    expected = """SimpleImputer()"""
    assert imputer.__repr__() == expected

    set_config(print_changed_only=False)
Example #3
def test_imputation_deletion_warning(strategy):
    X = np.ones((3, 5))
    X[:, 0] = np.nan

    with pytest.warns(UserWarning, match="Deleting"):
        imputer = SimpleImputer(strategy=strategy, verbose=True)
        imputer.fit_transform(X)
Example #4
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    X = np.array([["a", "b", 3],
                  [4, "e", 6],
                  ["g", "h", 9]], dtype=dtype)

    with pytest.raises(ValueError, match="non-numeric data"):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
Example #5
def test_imputation_constant_error_invalid_type(X_data, missing_value):
    # Verify that exceptions are raised on invalid fill_value type
    X = np.full((3, 5), X_data, dtype=float)
    X[0, 0] = missing_value

    with pytest.raises(ValueError, match="imputing numerical"):
        imputer = SimpleImputer(missing_values=missing_value,
                                strategy="constant",
                                fill_value="x")
        imputer.fit_transform(X)
Example #6
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
Example #7
def data_preprocessing(dataset):
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    imputer = imputer.fit(X[:, 3:4])  # fit expects a 2D array, so use a 2D column slice

    #X = imputer.fit_transform(X[:, 5]) Testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
Example #8
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    # Test imputation on non-numeric data using "most_frequent" and "constant"
    # strategy
    X = np.array([
        [np.nan, np.nan, "a", "f"],
        [np.nan, "c", np.nan, "d"],
        [np.nan, "b", "d", np.nan],
        [np.nan, "c", "d", "h"],
    ], dtype=dtype)

    err_msg = "SimpleImputer does not support data"
    with pytest.raises(ValueError, match=err_msg):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(X).transform(X)
Example #9
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
Example #10
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy)
        )
Example #11
def test_imputation_add_indicator(marker):
    X = np.array([
        [marker, 1,      5,       marker, 1],
        [2,      marker, 1,       marker, 2],
        [6,      3,      marker,  marker, 3],
        [1,      2,      9,       marker, 4]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 1., 0., 0., 1.],
        [2., 2., 1., 2., 0., 1., 0., 1.],
        [6., 3., 5., 3., 0., 0., 1., 1.],
        [1., 2., 9., 4., 0., 0., 0., 1.]
    ])

    imputer = SimpleImputer(missing_values=marker, add_indicator=True)
    X_trans = imputer.fit_transform(X)

    assert_allclose(X_trans, X_true)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
Example #12
 def __call__(self, data):
     from Orange.data.sql.table import SqlTable
     if isinstance(data, SqlTable):
         return Impute()(data)
     imputer = SimpleImputer(strategy=self.strategy)
     X = imputer.fit_transform(data.X)
     # Create new variables with appropriate `compute_value`, but
     # drop the ones which do not have valid `imputer.statistics_`
     # (i.e. all NaN columns). `sklearn.preprocessing.Imputer` already
     # drops them from the transformed X.
     features = [impute.Average()(data, var, value)
                 for var, value in zip(data.domain.attributes,
                                       imputer.statistics_)
                 if not np.isnan(value)]
     assert X.shape[1] == len(features)
     domain = Orange.data.Domain(features, data.domain.class_vars,
                                 data.domain.metas)
     new_data = data.transform(domain)
     new_data.X = X
     return new_data
Example #13
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
Example #14
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # copy=False, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
Example #15
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
Example #16
def test_imputation_constant_integer():
    # Test imputation using the constant strategy on integers
    X = np.array([
        [-1, 2, 3, -1],
        [4, -1, 5, -1],
        [6, 7, -1, -1],
        [8, 9, 0, -1]
    ])

    X_true = np.array([
        [0, 2, 3, 0],
        [4, 0, 5, 0],
        [6, 7, 0, 0],
        [8, 9, 0, 0]
    ])

    imputer = SimpleImputer(missing_values=-1, strategy="constant",
                            fill_value=0)
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
Example #17
def test_imputation_constant_object(marker):
    # Test imputation using the constant strategy on objects
    X = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker]
    ], dtype=object)

    X_true = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"]
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="constant",
                            fill_value="missing")
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
Example #18
def test_imputation_most_frequent_objects(marker):
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    X_true = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker,
                            strategy="most_frequent")
    X_trans = imputer.fit(X).transform(X)

    assert_array_equal(X_trans, X_true)
Example #19
def test_mice_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       initial_strategy=strategy,
                       random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then mice will
    # only use the initial imputer for that feature at transform
    assert np.all(mice.transform(X_test)[:, 0] ==
                  initial_imputer.transform(X_test)[:, 0])
Example #20
def test_imputation_constant_pandas(dtype):
    # Test imputation using the constant strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")

    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["missing_value", "i", "x", "missing_value"],
        ["a", "missing_value", "y", "missing_value"],
        ["a", "j", "missing_value", "missing_value"],
        ["b", "j", "x", "missing_value"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="constant")
    X_trans = imputer.fit_transform(df)

    assert_array_equal(X_trans, X_true)
Example #21
def test_imputation_error_sparse_0(strategy):
    # check that errors are raised when missing_values == 0 and the input is sparse
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
Example #22
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([
        [np.nan, 1.1, 0, np.nan],
        [1.2, np.nan, 1.3, np.nan],
        [0, 0, np.nan, np.nan],
        [1.4, 1.5, 0, np.nan]
    ])

    X_true = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1]
    ])

    X = array_constructor(X)

    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
Example #23
def _check_statistics(X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix
    imputer = SimpleImputer(missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
Example #24
             alpha=0.1)
plt.show()

# Data preparation
housing = strat_train_set.drop("median_house_value",
                               axis=1)  # drop() creates a copy; strat_train_set is unaffected
housing_labels = strat_train_set["median_house_value"].copy()

# Handle missing values
# housing.dropna(subset=["total_bedrooms"]) # drop the rows where total_bedrooms is missing
# housing.drop("total_bedrooms", axis=1) # drop the attribute entirely
# median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median) # fill the missing values with the median
# Use an imputer to handle the missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
print(imputer.statistics_)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# Handle text and categorical attributes
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()  # converts text labels into numbers
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
print(encoder.classes_)
Example #25
# Delete the 'fuel-system' column:
df = df.drop(columns=['fuel-system'])

# Replace the numeric names in the categorical column with its digits:
df['num-of-doors'] = df['num-of-doors'].replace(('two', 'four'), (2, 4))
df['cylinders'] = df['cylinders'].replace(
    ('two', 'three', 'four', 'five', 'six', 'eight', 'twelve'),
    (2, 3, 4, 5, 6, 8, 12))

X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

from sklearn.impute import SimpleImputer
# Handling missing numeric data in several columns:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X[:, [1, 15, 16, 18, 19]] = imp_median.fit_transform(X[:, [1, 15, 16, 18, 19]])
# Handling missing data in column 'num-of-doors':
imp_most_frequent = SimpleImputer(missing_values=np.nan,
                                  strategy='most_frequent')
X[:, [4]] = imp_most_frequent.fit_transform(X[:, [4]])

# Encoding categorical columns:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(sparse=False)
A = onehotencoder.fit_transform(X[:, [2]])
X = np.hstack((A, X[:, :2], X[:, 3:]))
X = X[:, 1:]
B = onehotencoder.fit_transform(X[:, [3]])
Example #26
    # plt.legend()
    # plt.show()

    from pandas.plotting import scatter_matrix
    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    # scatter_matrix(housing[attributes], figsize=(12, 8))
    # plt.show()

    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]

    housing_num = housing.drop("ocean_proximity", axis=1)

    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="median")
    imputer.fit(housing_num)
    X = imputer.transform(housing_num)

    housing_tr = pd.DataFrame(X, columns=housing_num.columns)

    # from sklearn.preprocessing import LabelEncoder
    # encoder = LabelEncoder()
    housing_cat = housing["ocean_proximity"]
    # housing_cat_encoded = encoder.fit_transform(housing_cat)

    # from sklearn.preprocessing import OneHotEncoder
    # encoder = OneHotEncoder()
    # housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

    from sklearn.preprocessing import LabelBinarizer
Example #27
print(missing_val_count_by_column[missing_val_count_by_column > 0])
# Get names of columns with missing values
cols_with_missing = [
    col for col in train_X.columns if train_X[col].isnull().any()
]

reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_val_X = val_X.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_train_X, reduced_val_X, train_y, val_y))

from sklearn.impute import SimpleImputer

# Imputation (Approach 2)
my_imputer = SimpleImputer()  #replace missing values with the mean value
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_X_valid = pd.DataFrame(my_imputer.transform(val_X))

# Imputation removed column names; put them back
imputed_X_train.columns = train_X.columns
imputed_X_valid.columns = val_X.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, train_y, val_y))

# An Extension to Imputation (Approach 3): keeping track of which values were imputed
# Make copy to avoid changing original data (when imputing)
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()
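# A hedged sketch of how Approach 3 typically continues; this part is not in
# the snippet above and mirrors the pattern spelled out in Example #50 below.
# `my_imputer_plus` is a name introduced here for illustration only.
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '_was_missing'] = val_X_plus[col].isnull()

my_imputer_plus = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer_plus.fit_transform(train_X_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer_plus.transform(val_X_plus))
imputed_X_train_plus.columns = train_X_plus.columns
imputed_X_valid_plus.columns = val_X_plus.columns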
Example #28
dataset = pd.read_csv("kidneyChronic.csv")

print(dataset.describe())
print(dataset.isnull().sum())

print(type(dataset))
dataset = dataset.replace(to_replace=r"\?", value=np.nan, regex=True)

print(dataset.isnull().sum())

x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 24].values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputerMode = SimpleImputer(strategy="most_frequent")
x[:, 0:5] = imputer.fit_transform(x[:, 0:5])
x[:, 5:9] = imputerMode.fit_transform(x[:, 5:9])
x[:, 9:18] = imputer.fit_transform(x[:, 9:18])
x[:, 18:24] = imputerMode.fit_transform(x[:, 18:24])

from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
x[:, 5] = labelencoder_X.fit_transform(x[:, 5])
x[:, 6] = labelencoder_X.fit_transform(x[:, 6])
x[:, 7] = labelencoder_X.fit_transform(x[:, 7])
x[:, 8] = labelencoder_X.fit_transform(x[:, 8])
x[:, 18] = labelencoder_X.fit_transform(x[:, 18])
x[:, 19] = labelencoder_X.fit_transform(x[:, 19])
x[:, 20] = labelencoder_X.fit_transform(x[:, 20])
Example #29
import numpy as np
from sklearn.impute import SimpleImputer

arr1 = np.array([[1, 3, 5, np.nan], [10, 14, 18, 19], [20, 34, 28, np.nan]])
print(f'{arr1}\n')

imp_mean = SimpleImputer()
transformed = imp_mean.fit_transform(arr1)
print(f"{transformed}\n")

imp_mean = SimpleImputer(strategy="most_frequent")
transformed = imp_mean.fit_transform(arr1)
print(f"{transformed}\n")

imp_constant = SimpleImputer(strategy='constant', fill_value=-1)
transformed = imp_constant.fit_transform(arr1)
print('{}\n'.format(repr(transformed)))
Example #30
# Import the required libraries  # TRAINING SET AND TEST SET
import numpy as np
import pandas as pd
 
# Import the data into Python
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values  #X holds the feature matrix (the independent variables)
y = dataset.iloc[:, 3].values    #y holds the target (the dependent variable)
 
# Handle missing data
from sklearn.impute import SimpleImputer #fill missing (NaN) values with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #applied to age and salary, i.e. X columns 1 and 2
imputer = imputer.fit(X[:, 1:3]) #fit on the columns that contain missing values
X[:, 1:3] = imputer.transform(X[:, 1:3])
 
# Encode the categorical data (convert it to numbers): the country feature and the "Purchased" target
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()                     # Can be removed, see the discussion below
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])     # Can be removed, see the discussion below
transformer = ColumnTransformer(
        [('Negara', OneHotEncoder(), [0])],
        remainder='passthrough')
X = np.array(transformer.fit_transform(X), dtype=float)
 
# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Split into a training set and a test set
from sklearn.model_selection import train_test_split
Example #31
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
# =============================================================================

#  ================= Resampling the imbalanced Label of "TakeOver" ========================================
#==========================================================================================================

# We create the preprocessing pipelines for both numeric and categorical data.
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

numeric_features = Cont_Filter_Cleaned
numeric_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = [
    'LeftLaneType', 'RightLaneType', 'Coming_AlarmType', 'NDTask'
]
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numeric_features), ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Separate input features and target
y = dataset.Takeover
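# A hedged sketch of the step described by the comment above; it is not part
# of the original snippet, and RandomForestClassifier is only an illustrative
# choice of estimator.
from sklearn.ensemble import RandomForestClassifier
clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier())])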
Example #32
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Data Preprocessing
'''Generally, you want to treat the test set as though you did not have it during training. 
Whatever transformations you do to the train set should be done to the test set before you make predictions. 
If you apply transformation before splitting and then split into train/test you are leaking data from your test set (that is supposed to be completely withheld) into your training set. 
This will yield extremely biased results on model performance.'''

# Impute missing values - done after train-test-splitting to prevent data leakage
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train[:, 1:3])
X_train[:, 1:3] = imputer.transform(X_train[:, 1:3])
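# Hedged addition (not in the original snippet): to avoid leakage, the test
# features are filled with the statistics learned from the training set only,
# i.e. the already-fitted imputer is reused rather than refit on X_test.
X_test[:, 1:3] = imputer.transform(X_test[:, 1:3])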

# Dummy coding the Independent Variable - we are using one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))

# Dummy coding the Dependent Variable - we are using label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
Example #33
# ip.dsfield.dscp [ 0.  4. 48. 32.  6. 46.  5. nan] most frequent?
# ip.len  unique values: 512 ## median?
# ip.flags [40.  0. 21. 20.  1. nan] most frequent?
# ip.frag_offset  unique values: 190 ## median?
# ip.ttl  unique values: 61 ## integer mean?
# ip.proto [ 1.  6. 17.  2. nan] ## most frequent?

#print ('\n\nColumn | NaN values (before imputing)')
#print ('\nTrain:')
#print (X_train_df.isnull ().sum ())
#print ('\nTest:')
#print (X_test_df.isnull ().sum ())

from sklearn.impute import SimpleImputer
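# Hedged sketch (not part of the original snippet): the two lists consumed by
# the loop below are defined outside this excerpt; illustrative values that
# follow the per-field comments at the top would be:
columsWithMissingValues = ['ip.dsfield.dscp', 'ip.len', 'ip.flags',
                           'ip.frag_offset', 'ip.ttl', 'ip.proto']
imputingStrategies = ['most_frequent', 'median', 'most_frequent',
                      'median', 'mean', 'most_frequent']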
for myColumn, myStrategy in zip(columsWithMissingValues, imputingStrategies):
    myImputer = SimpleImputer(missing_values=np.nan, strategy=myStrategy)
    myImputer.fit(X_train_df[myColumn].values.reshape(-1, 1))
    X_train_df[myColumn] = myImputer.transform(
        X_train_df[myColumn].values.reshape(-1, 1))
    X_val_df[myColumn] = myImputer.transform(X_val_df[myColumn].values.reshape(
        -1, 1))
    X_test_df[myColumn] = myImputer.transform(
        X_test_df[myColumn].values.reshape(-1, 1))

# Round ip.ttl
X_train_df['ip.ttl'] = X_train_df['ip.ttl'].round(decimals=0)
X_val_df['ip.ttl'] = X_val_df['ip.ttl'].round(decimals=0)
X_test_df['ip.ttl'] = X_test_df['ip.ttl'].round(decimals=0)

#print ('\n\nColumn | NaN values (before imputing)')
#print ('\nTrain:')
Example #34

dataset = pd.read_csv('Data.csv')

# take the independent values (the features)
x = dataset.iloc[:,:-1].values
# take the dependent values (the target)
y = dataset.iloc[:,3].values

print(x)
print(y)

# Handling the NaNs
from sklearn.impute import SimpleImputer
# replace the NaN values with the mean
imputer = SimpleImputer(strategy="mean")
# take the mean of age and salary
# and replace the NaN values
x[:,1:3] = imputer.fit_transform(x[:,1:3])
print(x)

# Encode the categorical data: replace the countries with 0, 1, 2, etc.
from sklearn import preprocessing
# create the data encoder
le_x = preprocessing.LabelEncoder()
# transform the countries into numeric values (France 0, Germany 1 and Spain 2)
x[:,0] = le_x.fit_transform(x[:,0])
print(x)
#x[:,0] = le_x.inverse_transform(list(x[:,0]))
#print(x)
Example #35
                    group_pli = np.hstack((group_r_pli, group_u_pli))
                    y_pli = np.hstack((y_r_pli, y_u_pli))

                    X = np.hstack((X_aec, X_pli))
                    if np.array_equal(y_aec, y_pli):
                        print("Y-values equal")
                        y = y_aec
                    if np.array_equal(group_aec, group_pli):
                        print("group-values equal")
                        group = group_aec

                final_acc_filename = commons.OUTPUT_DIR + f"models/final_SVC_{k}_c_{c}_resp_unres_{s}.pickle"

                #build pipeline with best model
                pipe = Pipeline([('imputer',
                                  SimpleImputer(missing_values=np.nan,
                                                strategy='mean')),
                                 ('scaler', StandardScaler()), ('CLF', clf)])

                accuracies, f1s, cms = classify_loso(X, y, group, pipe)

                clf_data = {
                    'accuracies': accuracies,
                    'f1s': f1s,
                    'cms': cms,
                    #'best_params': best_params,
                }

                final_acc_file = open(final_acc_filename, 'ab')
                pickle.dump(clf_data, final_acc_file)
                final_acc_file.close()
                print(sum(accuracies))
Example #36
        else:
            dx[index].append(x[i] - x[i + j])
            dy[index].append(y[i] - y[i + j])
        index += 1
    for j in range(1, 1 + n):
        if i + j >= len(x):
            dx[index].append(np.NaN)
            dy[index].append(np.NaN)
        else:
            dx[index].append(x[i] - x[i + j])
            dy[index].append(y[i] - y[i + j])
        index += 1

#fill = np.nanmax(dx) if np.nanmax(dx)>np.nanmax(dy) else np.nanmax(dy)

imp = SimpleImputer(missing_values=np.nan, strategy='constant')
dx = imp.fit_transform(dx)
dy = imp.fit_transform(dy)

dx = pd.DataFrame(
    np.array([standardize(d) for d in dx]).T,
    columns=["dx" + str(i) for i in range(0 - n, 1 + n) if i != 0])
dy = pd.DataFrame(
    np.array([standardize(d) for d in dy]).T,
    columns=["dy" + str(i) for i in range(0 - n, 1 + n) if i != 0])

features = dx.join(dy)
features = np.array(features)

points = np.array(data)[:, 1:3]
Example #37
df = pd.read_csv(
    "chronic_kidney_disease_W_header_missing_category_replaced_and_given_value.csv"
)
# replacing the class values (ckd = 1, notckd = 0)
df = df.replace('ckd', 1)
df = df.replace('notckd', 0)

X = df.iloc[:, 0:24]  # features vectors
# class labels: ckd = Chronic Kidney Disease, notckd = Not Chronic Kidney Disease
y = df.iloc[:, 24]

# Replace missing feature values; the 'median' strategy is used for the numerical features
X = X.replace('?', np.nan)
X.to_csv('X_test_file.csv')
y.to_csv('Y_test_file.csv')
imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(X)
X_imputed = imr.transform(X.values)

le = LabelEncoder()  # positive class = 1 (ckd), negative class = 0 (notckd)
y = le.fit_transform(y)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=7)  # , random_state = 7

# Z-score normalization
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)
Example #38
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values  # everything except the last column
y = dataset.iloc[:, -1].values  # only the last column

# take care of missing data
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])  # skip the text column
X[:, 1:3] = imputer.transform(
    X[:, 1:3])  # write the imputed array back into the selected columns

# encode the categories as a one-hot vector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# 1st element: a name for the transformation, 2nd: the transformer class, 3rd: the column(s) to encode
# remainder tells the transformer what to do with the remaining columns ('passthrough' keeps them)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# encode the target values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# split the data into training and test sets
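# Hedged sketch of the split announced by the comment above (truncated in the
# original snippet); test_size and random_state are illustrative values only.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     test_size=0.2,
                                                     random_state=0)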
Example #39
class TreeAugmentedNB_BayesianInf(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
     A primitive which does naive Bayes classification. During training, the input to this primitive should be a matrix of tabular numerical/categorical data and an array of labels. During testing, the input is a data matrix of numerical features only, and the output will be the predicted labels with generated metadata.
    """
    
    metadata = metadata_base.PrimitiveMetadata({
        'id': '2fa0afb2-1b7b-462d-a7c9-11b44efe9eb0',
        'version': rpi_d3m_primitives.__coreversion__,
        'name': 'Tree-Augmented Naive Bayes Classifier',
        'keywords': ['Tree-Augmented Naive Bayes','Bayesian Inference','Classification'],
        'description': 'This algorithm is an implementation of Tree-augmented Naive Bayes classification. Bayesian Inference is applied.',
        'source': {
            'name': rpi_d3m_primitives.__author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/zijun-rpi/d3m-primitives/blob/master/TreeAugmentNB_BayesianInf.py',
                'https://github.com/zijun-rpi/d3m-primitives.git'
                ]
        },
        'installation':[
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package': 'rpi_d3m_primitives',
	            'version': rpi_d3m_primitives.__version__
            }
        ],
        'python_path': 'd3m.primitives.classification.tree_augmented_naive_bayes.BayesianInfRPI',
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER],
        'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION
    })
    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: typing.Union[typing.Dict[str, base.DockerContainer]] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._index = None
        self._training_inputs = None
        self._training_outputs = None
        self._origin_inputs = None #for label encoder
        self._fitted = False
        self._cate_flag = None
        self._clf = Model(modelName='tan', bayesInf=1, PointInf=1, alpha=1, N0=self.hyperparams['N0']) #classifier
        self._LEoutput = preprocessing.LabelEncoder() #label encoder
        self._Imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') #imputer
        self._nbins = self.hyperparams['nbins']
        self._Kbins = preprocessing.KBinsDiscretizer(n_bins=self._nbins, encode='ordinal', strategy='uniform') #KbinsDiscretizer
        self._discTrainset = None
        
    
    def _store_target_columns_metadata(self, outputs: Outputs) -> None:
        outputs_length = outputs.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[Dict] = []

        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs.metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = list(column_metadata.get('semantic_types', []))
            if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types:
                semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            semantic_types = [semantic_type for semantic_type in semantic_types if semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget']
            column_metadata['semantic_types'] = semantic_types

            target_columns_metadata.append(column_metadata)
            
        self._target_columns_metadata = target_columns_metadata
        
    ##TO DO:
    #select columns via semantic types
    #remove preprocessing
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:

        ## Update semantic types and prepare it for predicted targets
        self._store_target_columns_metadata(outputs)
        
        ## memory original training inputs
        self._origin_inputs = inputs

        ## set training labels
        metadata = outputs.metadata
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, 0))
        semantic_types = column_metadata.get('semantic_types', [])
        if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
            self._LEoutput.fit(outputs)
            self._training_outputs = self._LEoutput.transform(outputs) #starting from zero
        
        ## convert categorical values to numerical values in training data
        metadata = inputs.metadata
        [m,n] = inputs.shape
        self._training_inputs = np.zeros((m,n))
        self._cate_flag = np.zeros((n,))
        for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
            if column_index is metadata_base.ALL_ELEMENTS: 
                continue
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = column_metadata.get('semantic_types', [])
            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                LE = preprocessing.LabelEncoder()
                LE = LE.fit(inputs.iloc[:,column_index])
                self._training_inputs[:,column_index] = LE.transform(inputs.iloc[:,column_index])
                self._cate_flag[column_index] = 1
            elif 'http://schema.org/Text' in semantic_types:
                pass
            else:
                temp = list(inputs.iloc[:, column_index].values)
                for i in np.arange(len(temp)):
                    if bool(temp[i]):
                        self._training_inputs[i,column_index] = float(temp[i])
                    else:
                        self._training_inputs[i,column_index] = float('nan')
                # imputer will remove the column with purely missing values
                if not np.count_nonzero(np.isnan(self._training_inputs[:, column_index])) == 0:  # if there is missing values
                    if np.count_nonzero(np.isnan(self._training_inputs[:, column_index])) == m:  # all missing
                        self._training_inputs[:, column_index] = np.zeros(m, )  # replace with all zeros

        self._fitted = False
    

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError('Missing training data, or missing values exist.')

        ## impute missing values
        self._Imputer.fit(self._training_inputs)
        self._training_inputs = self._Imputer.transform(self._training_inputs)

        ## discretize non-categorical values
        disc_training_inputs = self._training_inputs
        if not len(np.where(self._cate_flag == 0)[0]) == 0:
            self._Kbins.fit(self._training_inputs[:, np.where(self._cate_flag == 0)[0]]) #find non-categorical values
            temp = self._Kbins.transform(self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp
        # starting from zero

        ## get number of states for each feature and remove features with only one state
        discTrainset = RelationSet(disc_training_inputs, self._training_outputs.reshape(-1,1))
        discTrainset.getStateNo(self._cate_flag, self._nbins)
        discTrainset.remove()
        X_train = discTrainset.data
        Y_train = discTrainset.labels
        
        self._discTrainset = discTrainset
        stateNo = np.append(discTrainset.NUM_STATES, len(np.unique(Y_train)))

        ## fit the classifier
        self._clf.fit(X_train, Y_train, stateNo)
        self._fitted = True

        return CallResult(None)


    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:  # inputs: m x n numpy array
        if self._fitted:

            ## convert categorical values to numerical values in testing data
            metadata = inputs.metadata
            [m, n] = inputs.shape
            X_test = np.zeros((m, n))
            for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
                if column_index is metadata_base.ALL_ELEMENTS:
                    continue
                column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
                semantic_types = column_metadata.get('semantic_types', [])
                if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                    LE = preprocessing.LabelEncoder()
                    LE = LE.fit(self._origin_inputs.iloc[:, column_index]) #use training data to fit
                    X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                elif 'http://schema.org/Text' in semantic_types:
                    pass
                else:
                    temp = list(inputs.iloc[:, column_index].values)
                    for i in np.arange(len(temp)):
                        if bool(temp[i]):
                            X_test[i, column_index] = float(temp[i])
                        else:
                            X_test[i, column_index] = float('nan')

            ## impute testing data
            X_test = self._Imputer.transform(X_test)

            ## Kbins discretize for noncategorical values
            disc_X_test = X_test
            if not len(np.where(self._cate_flag == 0)[0]) == 0:
                temp = self._Kbins.transform(X_test[:, np.where(self._cate_flag == 0)[0]])
                disc_X_test[:,np.where(self._cate_flag == 0)[0]] = temp

            ## remove columns with only one state
            index_list = np.setdiff1d(np.arange(self._discTrainset.num_features), np.array(self._discTrainset.removeIdx))
            disc_X_test = disc_X_test[:, index_list]

            ## prediction
            output = self._clf.predict(disc_X_test)

            ## label decode
            output = self._LEoutput.inverse_transform(output)
            
            ## update metadata
            output = container.DataFrame(output, generate_metadata=False, source=self)
            output.metadata = inputs.metadata.clear(source=self, for_value=output, generate_metadata=True)
            
            for column_index, column_metadata in enumerate(self._target_columns_metadata):
                output.metadata = output.metadata.update_column(column_index, column_metadata, source=self)


            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')


    def get_params(self) -> None:
        pass


    def set_params(self) -> None:
        pass
Example #40
#data preprocessing
#importing the libraries
import numpy as np  #necessary libraries for machine learning models
import pandas as pd
import matplotlib.pyplot as mp

#import the datasets
datasets = pd.read_csv('Data.csv')  #to import the datasets
X = datasets.iloc[:, :-1].values  #independent variables
Y = datasets.iloc[:, 3].values  #dependent variables

#To take care of the missing values
from sklearn.impute import SimpleImputer  #library and class for missing data
imputer = SimpleImputer(missing_values=np.nan,
                        strategy='mean')  #to replace missing data with mean
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#encode the categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  #to convert categorical data to encoded values
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])  #dummyencoding
X = onehotencoder.fit_transform(X).toarray()
Example #41
def preprocess_input(X, le0, le1, le2, oHot, supplemental_label_data=pd.DataFrame()):
    
    # drop unwanted features
    
    unwanted_columns = ["playerName", "Season", "spacer1", "transferredSchools", "ORB", "DRB"]  # "\xa0"
    X = X.drop(unwanted_columns, axis=1)

    
    # organize features
    columns = list(X.columns)
    categorical = ["position", "School", "Conf"]
    #diff = lambda l1, l2: [x for x in l1 if x not in l2]
    #numerical = diff(columns, categorical) 
    
    
    # cast numerical data as floats
    for col in columns:
        if col not in categorical:
            X[col] = X[col].astype(float)
    
    
    
    # impute: fill in missing values
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X = X.values # convert to ndarray of objects 
    imputer = imputer.fit(X[:, 3:])
    X[:, 3:] = imputer.transform(X[:, 3:])
    
    
    
    # scale data #
    X_scaler = StandardScaler()
    X_scaler = X_scaler.fit(X[:, 3:])
    X[:, 3:] = X_scaler.transform(X[:, 3:])

    
    
    
    # encode categorical data #
    
    fit_data0 = X[:, 0]
    fit_data1 = X[:, 1]
    fit_data2 = X[:, 2]
    print(fit_data0)
    print(fit_data0.shape)
        
    if len(supplemental_label_data) > 0:
        
        fit_data0 = list(fit_data0)
        fit_data0.extend(list(supplemental_label_data.School))
        
        print("yale:",fit_data0.index('Yale'))
        fit_data1 = list(fit_data1)
        fit_data1.extend(list(supplemental_label_data.Conf))
        
        fit_data2 = list(fit_data2)
        fit_data2.extend(list(supplemental_label_data.position))

        
    if not le0:
        # label encode
        labelEncoder_X0 = LabelEncoder()
        labelEncoder_X0 = labelEncoder_X0.fit(fit_data0)
        X[:, 0] = labelEncoder_X0.transform(X[:, 0])
        
        labelEncoder_X1 = LabelEncoder()
        labelEncoder_X1 = labelEncoder_X1.fit(fit_data1)
        X[:, 1] = labelEncoder_X1.transform(X[:, 1])
        
        labelEncoder_X2 = LabelEncoder()
        labelEncoder_X2 = labelEncoder_X2.fit(fit_data2)
        X[:, 2] = labelEncoder_X2.transform(X[:, 2])
        
        # one hot encode
        oneHotEncoder = OneHotEncoder(categorical_features=[0,1,2], handle_unknown='ignore')
        oneHotEncoder = oneHotEncoder.fit(X)
        X = oneHotEncoder.transform(X)
    else:
        # label encoder with prev encoder
        X[:, 0] = le0.transform(X[:, 0])
        X[:, 1] = le1.transform(X[:, 1])
        X[:, 2] = le2.transform(X[:, 2])
        
        # one hot encode with prev encoder
        X = oHot.transform(X)
        
        return X
        
    
        

    return (X, labelEncoder_X0, labelEncoder_X1, labelEncoder_X2, oneHotEncoder)
Example #42
#import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load the data
veriler = pd.read_csv('eksikveriler.csv')
#print(veriler)

boy = veriler[['boy']]
#print(boy)

boykilo = veriler[['boy', 'kilo']]
print(boykilo)

#missing values

from sklearn.impute import SimpleImputer 

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#replace the NaN values with the mean value.
Yas = veriler.iloc[:,1:4].values
#iloc : integer location
#[:,1:4] => the colon fetches every row, and 1:4 selects columns 1 up to 4

print(Yas)

imputer = imputer.fit(Yas[:,1:4]) #=> performs the learning (fitting) step
Yas[:,1:4] = imputer.transform(Yas[:,1:4]) #replaces the missing values
Example #43
X = dataset.iloc[:, :-1].values
#X extracts the feature columns: [:, :-1] takes all rows (all 10 records)
#and every column except the last one

Y = dataset.iloc[:, 3].values
#Y takes the last column (index 3) across all rows
#DON'T FORGET THE .values ON Y AND X

#___________________________________________________________________________________________________#

#MISSING DATA TREATMENT

from sklearn.impute import SimpleImputer
#SimpleImputer is the replacement for the old Imputer and is used to fill in the missing data (mathematically)
#we have imported the class; now let's create an object of that class, "imputer"
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
#missing values are denoted by NaN in our data set, and strategy chooses how to fill them
imputer = imputer.fit(X[:, 1:3])
#here we fit the imputer on the columns that contain NaN
X[:, 1:3] = imputer.transform(X[:, 1:3])

#_____________________________________________________________________________________________________#
"""
ENCODING CATEGORICAL DATA
LabelEncoder = turns france, spain, germany into numbers
The problem with label encoding is that it implies an ordering: a higher encoded value looks like a "better" category
OneHotEncoder(converts to binary) =  https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f

"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
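# A hedged sketch of the typical next step; it is not part of the original
# snippet and follows the ColumnTransformer pattern used in Examples #32 and
# #54. `ct` and `labelencoder_Y` are names introduced here for illustration.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)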
Example #44
dataset = dataset.drop(['overall_rating'], axis=1)

# Note that the item description has quite a lot of unnecessary info, and the useful info
# is already captured in the cols product_specification, category tree, brand.
# Hence we can remove the description column as well
dataset = dataset.drop(['description'], axis=1)

# We can also remove any duplicated rows
duplicates = dataset[dataset.duplicated()]
dataset = dataset.drop(labels=duplicates.index, axis=0)
raw_dataset = raw_dataset.drop(labels=duplicates.index, axis=0)
summary_dataset = dataset.isnull().sum()
# Note that we can replace nan entries of retail & discounted price with average of that column
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean')
dataset.retail_price = imp.fit_transform(
    dataset.retail_price.to_numpy().reshape(-1, 1))
dataset.discounted_price = imp.fit_transform(
    dataset.discounted_price.to_numpy().reshape(-1, 1))

####
raw_dataset.retail_price = imp.fit_transform(
    raw_dataset.retail_price.to_numpy().reshape(-1, 1))
raw_dataset.discounted_price = imp.fit_transform(
    raw_dataset.discounted_price.to_numpy().reshape(-1, 1))

category_tree = []
for x in dataset.product_category_tree:
    x = x[2:-2]
    category_tree.append(x.split(" >>"))
Example #45
 def __init__(self):
     self.clf = Pipeline([
         ('imputer', SimpleImputer(strategy='most_frequent')),
         ('rf', RandomForestClassifier(max_depth=5, n_estimators=10))
     ])
Example #46
# X contains missing values
# print(X.isnull().any(axis=0))
# print(X.isnull().sum())

# # X.keys()
# # >> Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
# #        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
# #        'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
# #       dtype='object')

# Question 2: one-hot encode the features X and store the result in X_ohe.
# Question 3: impute the missing values contained in X_ohe and store the result under the name X_fin.
# Fit the imputer on X_dna, the dataset with the missing values dropped
# Store the data with missing values imputed into X_imp
imputer_mf = SimpleImputer(strategy="most_frequent")
X_dna = X.dropna()
X_imp = pd.DataFrame()
imputer_mf.fit(X_dna)

X_ohe = imputer_mf.transform(X)
X_ohe = pd.DataFrame(X_ohe, columns=X.columns)

# Apply one-hot encoding and standardization, and store the results in a dict
continuous = np.array(
    ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'])

ohe = OneHotEncoder(sparse=False)
mms = MinMaxScaler()
enc = {}
ans = 0
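# A hedged sketch of what the questions above ask for; this part is not in the
# original snippet. `categorical` and `X_fin` are names introduced here, and
# casting the continuous columns to float before scaling is an assumption.
categorical = [c for c in X_ohe.columns if c not in continuous]
X_fin = np.hstack([ohe.fit_transform(X_ohe[categorical]),
                   mms.fit_transform(X_ohe[continuous].astype(float))])
enc['ohe'], enc['mms'] = ohe, mms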
Example #47
test_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/test.csv")
test_data.head()

y = train_data["Survived"]

features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
X_test = my_imputer.fit_transform(X_test)

model1 = GaussianNB()
model1.fit(X, y)
model2 = RandomForestClassifier(max_depth=15, n_estimators=100, bootstrap=False, max_features= 'sqrt', min_samples_leaf=4, min_samples_split=10)
model2.fit(X, y)
model3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=2000)
model3.fit(X, y)
model4 = KNeighborsClassifier(3)
model4.fit(X, y)

labels_pred1 = model1.predict(X)
labels_pred2 = model2.predict(X)
labels_pred3 = model3.predict(X)
Example #48
act = act.loc[:, [
    'id', 'Bacteria', 'Fungi', 'Prokaryote', 'Virus', 'Cancer',
    'Immunomodulator'
]]
plat = pd.merge(paths, act, how='left', on='id')
## set cluster
clusters = [os.path.basename(file) for file in plat['fasta']]
plat = plat.assign(cluster=clusters)

# read fingerprints
fps = pd.read_csv(git_dir + "/data/platinum/PRISM_fingerprints_mean.csv.gz")
X = fps.set_index('id')
X = X.reindex(plat['id'])

# impute missing values
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputer.fit(X)
X = imputer.transform(X)

# build models
targets = [
    'Bacteria', 'Fungi', 'Prokaryote', 'Virus', 'Cancer', 'Immunomodulator'
]
for target in targets:
    print("building models for target: " + target)
    y = plat[target].values

    # set up model
    svc = SVC(C=0.01,
              gamma=0.1,
              kernel='poly',
print("True Outcome: {0}, False Outcome: {1}".format(diabetes_true_count,
                                                     diabetes_false_count))

#Missing Zeros without Outcome Column
print("How many Zero value are here: ")
print(dataSet.iloc[:, 0:8].eq(0).sum())

# Feature Column or Independent Variable
X = dataSet.iloc[:, :-1].values

#Dependent column or Predicted Class
y = dataSet.iloc[:, 8].values

# Fill zero entries with the mean of the column
from sklearn.impute import SimpleImputer
fill_values = SimpleImputer(missing_values=0, strategy='mean')
X[:, 1:8] = fill_values.fit_transform(X[:, 1:8])

#Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

#Feature Scaling

from sklearn.preprocessing import StandardScaler
scale_X = StandardScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
Example #50
def lesson_2():
    print_("LESSON 2: Missing values", 0, 1)
    # ----------------------------------
    # Example: Melbourne Housing dataset
    # ----------------------------------
    # Load data
    features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
        'Lattitude', 'Longtitude'
    ]
    X_train, X_valid, y_train, y_valid = load_data_for_lesson_2(features)

    # Build a random forest model
    forest_model = RandomForestRegressor(random_state=1)

    # --------------------------------------------
    # Approach 1: Drop Columns with Missing Values
    # --------------------------------------------
    # Get names of columns with missing values
    cols_with_missing = [
        col for col in X_train.columns if X_train[col].isnull().any()
    ]

    # Drop columns in training and validation data
    reduced_X_train = X_train.drop(cols_with_missing, axis=1)
    reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

    print("MAE from Approach 1 (Drop columns with missing values):")
    print(
        score_model(forest_model, reduced_X_train, reduced_X_valid, y_train,
                    y_valid))

    # ----------------------
    # Approach 2: Imputation
    # ----------------------
    # Imputation
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

    # Imputation removed column names; put them back
    imputed_X_train.columns = X_train.columns
    imputed_X_valid.columns = X_valid.columns

    print("\nMAE from Approach 2 (Imputation):")
    print(
        score_model(forest_model, imputed_X_train, imputed_X_valid, y_train,
                    y_valid))

    # --------------------------------------
    # Approach 3: An Extension to Imputation
    # --------------------------------------
    # We impute the missing values, while also keeping track of which values
    # were imputed

    # Make copy to avoid changing original data (when imputing)
    X_train_plus = X_train.copy()
    X_valid_plus = X_valid.copy()

    # Make new columns indicating what will be imputed
    for col in cols_with_missing:
        X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
        X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

    # Imputation
    my_imputer = SimpleImputer()
    imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
    imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

    # Imputation removed column names; put them back
    imputed_X_train_plus.columns = X_train_plus.columns
    imputed_X_valid_plus.columns = X_valid_plus.columns

    print("\nMAE from Approach 3 (An Extension to Imputation):")
    print_(
        score_model(forest_model, imputed_X_train_plus, imputed_X_valid_plus,
                    y_train, y_valid))

    # Shape of training data (num_rows, num_columns)
    print_("Shape of training data (num_rows, num_columns)", 0)
    print_(X_train.shape)

    # Number of missing values in each column of training data
    missing_val_count_by_column = (X_train.isnull().sum())
    print_("Number of missing values in each column of training data", 0)
    print_(missing_val_count_by_column[missing_val_count_by_column > 0])
Example #51
import numpy as np
import pandas as pd

dataset = pd.read_csv('houses.csv')

dataset.head(20)

from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)

X = dataset[:, :-1]
y = dataset[:, -1]

X
y

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
Example #52
test.drop("Name", axis=1, inplace=True)
test.info()
train.info()

sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset["Sex"] = dataset["Sex"].map(sex_mapping)

train.head()
barchart("Sex")

xtrain.info()
train["Age"].isnull().sum()

from sklearn.impute import SimpleImputer
si = SimpleImputer(missing_values=np.nan, strategy="median")
si = si.fit(train[['Age']])
train['Age'] = si.transform(train[['Age']])
train["Age"].isnull().sum()

from sklearn.impute import SimpleImputer
si = SimpleImputer(missing_values=np.nan, strategy="median")
si = si.fit(test[['Age']])
test['Age'] = si.transform(test[['Age']])
test['Age'].isnull().sum()

#test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True)

facet = sns.FacetGrid(train, hue="Survived", aspect=2)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train["Age"].max()))
Example #53
df = pd.read_csv("weatherHistory.csv")

categorical = df.select_dtypes(include=["object"]).keys()
print(categorical)
quantitative = df.select_dtypes(include=["float64"]).keys()
print(quantitative)

#checking if any quantitative feature has zero effect on temperature
df[quantitative].hist()

#Dropping 'Loud Cover' since it has zero effect
df = df.drop('Loud Cover', axis=1)

#SimpleImputer to replace 0 in pressure(millibars)
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy='median')
df.iloc[:, 9:10] = imp.fit_transform(df.iloc[:, 9:10])

imp = SimpleImputer(missing_values=np.nan, strategy="constant")
df.iloc[:, 2:3] = imp.fit_transform(df.iloc[:, 2:3])

X = df.iloc[:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]].values
Y = df.iloc[:, 3].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X[:, 1] = labelencoder_X.fit_transform(X[:, 1])
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])
X[:, 9] = labelencoder_X.fit_transform(X[:, 9])
ohe = OneHotEncoder(categorical_features=[0, 1, 2, 9])
Example #54
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values  # Get elements by ID
y = dataset.iloc[:, -1].values

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')
x = np.array(ct.fit_transform(x), dtype=float)

print(x)