def test_imputation_error_invalid_strategy(strategy):
    X = np.ones((3, 5))
    X[0, 0] = np.nan

    with pytest.raises(ValueError, match=str(strategy)):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
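# Several of the test functions in this section take fixtures such as
# `strategy`, `dtype`, or `marker`; their `@pytest.mark.parametrize`
# decorators appear to have been lost when the snippets were extracted.
# A minimal sketch of the assumed pattern (the parameter values below are
# illustrative, not the original suite's):
#
# @pytest.mark.parametrize("strategy", ["not_a_strategy", 101, None])
# def test_imputation_error_invalid_strategy(strategy):
#     ...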
def test_changed_only():
    # Make sure the changed_only param is correctly used
    set_config(print_changed_only=True)
    lr = LogisticRegression(C=99)
    expected = """LogisticRegression(C=99)"""
    assert lr.__repr__() == expected

    # Check with a repr that doesn't fit on a single line
    lr = LogisticRegression(C=99, class_weight=.4, fit_intercept=False,
                            tol=1234, verbose=True)
    expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected

    imputer = SimpleImputer(missing_values=0)
    expected = """SimpleImputer(missing_values=0)"""
    assert imputer.__repr__() == expected

    # Defaults to np.NaN, trying with float('NaN')
    imputer = SimpleImputer(missing_values=float('NaN'))
    expected = """SimpleImputer()"""
    assert imputer.__repr__() == expected

    set_config(print_changed_only=False)
def test_imputation_deletion_warning(strategy):
    X = np.ones((3, 5))
    X[:, 0] = np.nan

    with pytest.warns(UserWarning, match="Deleting"):
        imputer = SimpleImputer(strategy=strategy, verbose=True)
        imputer.fit_transform(X)
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    X = np.array([["a", "b", 3],
                  [4, "e", 6],
                  ["g", "h", 9]], dtype=dtype)

    with pytest.raises(ValueError, match="non-numeric data"):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
def test_imputation_constant_error_invalid_type(X_data, missing_value):
    # Verify that exceptions are raised on invalid fill_value type
    X = np.full((3, 5), X_data, dtype=float)
    X[0, 0] = missing_value

    with pytest.raises(ValueError, match="imputing numerical"):
        imputer = SimpleImputer(missing_values=missing_value,
                                strategy="constant", fill_value="x")
        imputer.fit_transform(X)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
def data_preprocessing(dataset):
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    # SimpleImputer expects 2D input, so keep the column as an (n, 1) slice
    imputer = imputer.fit(X[:, 3:4])
    # X = imputer.fit_transform(X[:, 5])  # testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    # Test imputation on non-numeric data using "most_frequent" and
    # "constant" strategy
    X = np.array([
        [np.nan, np.nan, "a", "f"],
        [np.nan, "c", np.nan, "d"],
        [np.nan, "b", "d", np.nan],
        [np.nan, "c", "d", "h"],
    ], dtype=dtype)

    err_msg = "SimpleImputer does not support data"
    with pytest.raises(ValueError, match=err_msg):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(X).transform(X)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
                    "(strategy = %s)" % (strategy)
        )
def test_imputation_add_indicator(marker):
    X = np.array([
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 1., 0., 0., 1.],
        [2., 2., 1., 2., 0., 1., 0., 1.],
        [6., 3., 5., 3., 0., 0., 1., 1.],
        [1., 2., 9., 4., 0., 0., 0., 1.]
    ])

    imputer = SimpleImputer(missing_values=marker, add_indicator=True)
    X_trans = imputer.fit_transform(X)

    assert_allclose(X_trans, X_true)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
def __call__(self, data):
    from Orange.data.sql.table import SqlTable
    if isinstance(data, SqlTable):
        return Impute()(data)

    imputer = SimpleImputer(strategy=self.strategy)
    X = imputer.fit_transform(data.X)

    # Create new variables with appropriate `compute_value`, but
    # drop the ones which do not have valid `imputer.statistics_`
    # (i.e. all-NaN columns). `sklearn.preprocessing.Imputer` already
    # drops them from the transformed X.
    features = [
        impute.Average()(data, var, value)
        for var, value in zip(data.domain.attributes, imputer.statistics_)
        if not np.isnan(value)
    ]
    assert X.shape[1] == len(features)

    domain = Orange.data.Domain(features, data.domain.class_vars,
                                data.domain.metas)
    new_data = data.transform(domain)
    new_data.X = X
    return new_data
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # copy=False, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
def test_imputation_constant_integer():
    # Test imputation using the constant strategy on integers
    X = np.array([
        [-1, 2, 3, -1],
        [4, -1, 5, -1],
        [6, 7, -1, -1],
        [8, 9, 0, -1]
    ])
    X_true = np.array([
        [0, 2, 3, 0],
        [4, 0, 5, 0],
        [6, 7, 0, 0],
        [8, 9, 0, 0]
    ])

    imputer = SimpleImputer(missing_values=-1, strategy="constant",
                            fill_value=0)
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
def test_imputation_constant_object(marker):
    # Test imputation using the constant strategy on objects
    X = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker]
    ], dtype=object)

    X_true = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"]
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="constant",
                            fill_value="missing")
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
def test_imputation_most_frequent_objects(marker):
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    X_true = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker,
                            strategy="most_frequent")
    X_trans = imputer.fit(X).transform(X)

    assert_array_equal(X_trans, X_true)
def test_mice_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       initial_strategy=strategy,
                       random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then mice will
    # only use the initial imputer for that feature at transform
    assert np.all(mice.transform(X_test)[:, 0] ==
                  initial_imputer.transform(X_test)[:, 0])
def test_imputation_constant_pandas(dtype):
    # Test imputation using the constant strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")

    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["missing_value", "i", "x", "missing_value"],
        ["a", "missing_value", "y", "missing_value"],
        ["a", "j", "missing_value", "missing_value"],
        ["b", "j", "x", "missing_value"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="constant")
    X_trans = imputer.fit_transform(df)

    assert_array_equal(X_trans, X_true)
def test_imputation_error_sparse_0(strategy):
    # check that errors are raised when missing_values = 0 and input is sparse
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
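# For contrast with the error path pinned down above: sparse input is
# supported when the missing entries are explicit np.nan values rather than
# implicit zeros. A small usage sketch (not from the test suite):
import numpy as np
from scipy import sparse
from sklearn.impute import SimpleImputer

X_nan = sparse.csc_matrix(np.array([[1.0, np.nan],
                                    [2.0, 3.0],
                                    [np.nan, 4.0]]))
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imputer.fit_transform(X_nan).toarray())  # nan replaced by column means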
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([
        [np.nan, 1.1, 0, np.nan],
        [1.2, np.nan, 1.3, np.nan],
        [0, 0, np.nan, np.nan],
        [1.4, 1.5, 0, np.nan]
    ])
    X_true = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1]
    ])

    X = array_constructor(X)
    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)
    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
             alpha=0.1)
plt.show()

# Prepare the data
housing = strat_train_set.drop("median_house_value", axis=1)
# drop() creates a copy of the data and does not modify strat_train_set
housing_labels = strat_train_set["median_house_value"].copy()

# Handle missing values
# housing.dropna(subset=["total_bedrooms"])  # drop the rows where total_bedrooms is missing
# housing.drop("total_bedrooms", axis=1)     # drop the attribute entirely
# median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median)   # fill missing values with the median

# Use an imputer to handle missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
print(imputer.statistics_)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# Handle text and categorical attributes
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()  # converts text labels into numbers
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
print(encoder.classes_)
# Delete the 'fuel-system' column:
df = df.drop(columns=['fuel-system'])

# Replace the number words in the categorical columns with digits:
df['num-of-doors'] = df['num-of-doors'].replace(('two', 'four'), (2, 4))
df['cylinders'] = df['cylinders'].replace(
    ('two', 'three', 'four', 'five', 'six', 'eight', 'twelve'),
    (2, 3, 4, 5, 6, 8, 12))

X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

from sklearn.impute import SimpleImputer

# Handling missing numeric data in several columns:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X[:, [1, 15, 16, 18, 19]] = imp_median.fit_transform(X[:, [1, 15, 16, 18, 19]])

# Handling missing data in column 'num-of-doors':
imp_most_frequent = SimpleImputer(missing_values=np.nan,
                                  strategy='most_frequent')
X[:, [4]] = imp_most_frequent.fit_transform(X[:, [4]])

# Encoding categorical columns:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X[:, 2] = labelencoder_X.fit_transform(X[:, 2])

onehotencoder = OneHotEncoder(sparse=False)
A = onehotencoder.fit_transform(X[:, [2]])
X = np.hstack((A, X[:, :2], X[:, 3:]))
X = X[:, 1:]
B = onehotencoder.fit_transform(X[:, [3]])
# plt.legend()
# plt.show()

from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in pandas 0.20

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
# scatter_matrix(housing[attributes], figsize=(12, 8))
# plt.show()

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

housing_num = housing.drop("ocean_proximity", axis=1)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
# housing_cat_encoded = encoder.fit_transform(housing_cat)
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder()
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

from sklearn.preprocessing import LabelBinarizer
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Get names of columns with missing values
cols_with_missing = [
    col for col in train_X.columns if train_X[col].isnull().any()
]

reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_val_X = val_X.drop(cols_with_missing, axis=1)
print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_train_X, reduced_val_X, train_y, val_y))

from sklearn.impute import SimpleImputer

# Imputation (Approach 2)
my_imputer = SimpleImputer()  # replaces missing values with the column mean
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_X_valid = pd.DataFrame(my_imputer.transform(val_X))

# Imputation removed column names; put them back
imputed_X_train.columns = train_X.columns
imputed_X_valid.columns = val_X.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, train_y, val_y))

# An Extension to Imputation (Approach 3): keeping track of which values
# were imputed (continued in the sketch below)
# Make copy to avoid changing original data (when imputing)
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()
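# The snippet above breaks off after making the copies. A plausible
# continuation of Approach 3, mirroring the complete version that appears
# later in this section (column names and `score_dataset` come from the
# surrounding snippet; this completion is an assumption):
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '_was_missing'] = val_X_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(train_X_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(val_X_plus))
imputed_X_train_plus.columns = train_X_plus.columns
imputed_X_valid_plus.columns = val_X_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus,
                    train_y, val_y))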
dataset = pd.read_csv("kidneyChronic.csv") print(dataset.describe()) print(dataset.isnull().sum()) print(type(dataset)) dataset = dataset.replace(to_replace="\?", value=np.nan, regex=True) print(dataset.isnull().sum()) x = dataset.iloc[:, :-1].values y = dataset.iloc[:, 24].values from sklearn.impute import SimpleImputer imputer = SimpleImputer() imputerMode = SimpleImputer(strategy="most_frequent") x[:, 0:5] = imputer.fit_transform(x[:, 0:5]) x[:, 5:9] = imputerMode.fit_transform(x[:, 5:9]) x[:, 9:18] = imputer.fit_transform(x[:, 9:18]) x[:, 18:24] = imputerMode.fit_transform(x[:, 18:24]) from sklearn.preprocessing import LabelEncoder labelencoder_X = LabelEncoder() x[:, 5] = labelencoder_X.fit_transform(x[:, 5]) x[:, 6] = labelencoder_X.fit_transform(x[:, 6]) x[:, 7] = labelencoder_X.fit_transform(x[:, 7]) x[:, 8] = labelencoder_X.fit_transform(x[:, 8]) x[:, 18] = labelencoder_X.fit_transform(x[:, 9]) x[:, 19] = labelencoder_X.fit_transform(x[:, 19]) x[:, 20] = labelencoder_X.fit_transform(x[:, 20])
import numpy as np
from sklearn.impute import SimpleImputer

arr1 = np.array([[1, 3, 5, np.nan],
                 [10, 14, 18, 19],
                 [20, 34, 28, np.nan]])
print(f'{arr1}\n')

imp_mean = SimpleImputer()
transformed = imp_mean.fit_transform(arr1)
print(f"{transformed}\n")

imp_mean = SimpleImputer(strategy="most_frequent")
transformed = imp_mean.fit_transform(arr1)
print(f"{transformed}\n")

imp_constant = SimpleImputer(strategy='constant', fill_value=-1)
transformed = imp_constant.fit_transform(arr1)
print('{}\n'.format(repr(transformed)))
# Import the required libraries
# TRAINING SET AND TEST SET
import numpy as np
import pandas as pd

# Import the data into Python
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values  # X holds the independent variables
y = dataset.iloc[:, 3].values    # y holds the dependent variable

# Handle missing data
from sklearn.impute import SimpleImputer
# fill the missing (nan) entries with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# age and salary live in columns 1 and 2, so fit on X[:, 1:3]
imputer = imputer.fit(X[:, 1:3])
# write the imputed values back into the rows with missing entries
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encode the categorical data: the country feature and the purchase target
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

labelencoder_X = LabelEncoder()  # can be dropped, see the discussion below
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])  # can be dropped, see the discussion below
transformer = ColumnTransformer(
    [('Negara', OneHotEncoder(), [0])],
    remainder='passthrough')
X = np.array(transformer.fit_transform(X), dtype=float)  # np.float was removed in NumPy 1.24

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Split into training set and test set
from sklearn.model_selection import train_test_split
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

# =============================================================================
# ================= Resampling the imbalanced label "TakeOver" ===============
# =============================================================================
# We create the preprocessing pipelines for both numeric and categorical data.
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

numeric_features = Cont_Filter_Cleaned
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('scaler', StandardScaler())])

categorical_features = [
    'LeftLaneType', 'RightLaneType', 'Coming_AlarmType', 'NDTask'
]
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Separate input features and target
y = dataset.Takeover
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Data Preprocessing
'''Generally, you want to treat the test set as though you did not have it
during training. Whatever transformations you do to the train set should be
done to the test set before you make predictions. If you apply transformations
before splitting and then split into train/test, you are leaking data from
your test set (which is supposed to be completely withheld) into your training
set. This will yield extremely biased results on model performance.'''

# Impute missing values - done after train-test splitting to prevent data leakage
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train[:, 1:3])
X_train[:, 1:3] = imputer.transform(X_train[:, 1:3])

# Dummy coding the Independent Variable - we are using one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))

# Dummy coding the Dependent Variable - we are using label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
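# The docstring's point -- fit on the train split, then apply the same
# fitted transformers to the test split -- is not carried out in the
# snippet itself. A minimal continuation under that assumption:
X_test[:, 1:3] = imputer.transform(X_test[:, 1:3])  # no re-fitting on test data
X_test = np.array(ct.transform(X_test))
y_test = le.transform(y_test)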
# ip.dsfield.dscp  [ 0.  4. 48. 32.  6. 46.  5. nan]  most frequent?
# ip.len           unique values: 512                 ## median?
# ip.flags         [40.  0. 21. 20.  1. nan]          most frequent?
# ip.frag_offset   unique values: 190                 ## median?
# ip.ttl           unique values: 61                  ## integer mean?
# ip.proto         [ 1.  6. 17.  2. nan]              ## most frequent?

# print('\n\nColumn | NaN values (before imputing)')
# print('\nTrain:')
# print(X_train_df.isnull().sum())
# print('\nTest:')
# print(X_test_df.isnull().sum())

from sklearn.impute import SimpleImputer

for myColumn, myStrategy in zip(columsWithMissingValues, imputingStrategies):
    myImputer = SimpleImputer(missing_values=np.nan, strategy=myStrategy)
    myImputer.fit(X_train_df[myColumn].values.reshape(-1, 1))
    X_train_df[myColumn] = myImputer.transform(
        X_train_df[myColumn].values.reshape(-1, 1))
    X_val_df[myColumn] = myImputer.transform(
        X_val_df[myColumn].values.reshape(-1, 1))
    X_test_df[myColumn] = myImputer.transform(
        X_test_df[myColumn].values.reshape(-1, 1))

# Round ip.ttl
X_train_df['ip.ttl'] = X_train_df['ip.ttl'].round(decimals=0)
X_val_df['ip.ttl'] = X_val_df['ip.ttl'].round(decimals=0)
X_test_df['ip.ttl'] = X_test_df['ip.ttl'].round(decimals=0)

# print('\n\nColumn | NaN values (after imputing)')
# print('\nTrain:')
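# The loop above relies on `columsWithMissingValues` and `imputingStrategies`,
# which are defined elsewhere in the script. A plausible definition,
# consistent with the per-column notes at the top of the snippet (the exact
# pairing is an assumption):
columsWithMissingValues = ['ip.dsfield.dscp', 'ip.len', 'ip.flags',
                           'ip.frag_offset', 'ip.ttl', 'ip.proto']
imputingStrategies = ['most_frequent', 'median', 'most_frequent',
                      'median', 'mean', 'most_frequent']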
dataset = pd.read_csv('Data.csv')

# take the independent values
x = dataset.iloc[:, :-1].values
# take the dependent values
y = dataset.iloc[:, 3].values
print(x)
print(y)

# Handle the NaN values
from sklearn.impute import SimpleImputer

# replace the NaN values with the mean
imputer = SimpleImputer(strategy="mean")
# take the mean of age and salary and replace the NaN values
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
print(x)

# Encode categorical data: replace the countries with 0, 1, 2, etc.
from sklearn import preprocessing

# create the label encoder
le_x = preprocessing.LabelEncoder()
# turn the countries into numeric values (France 0, Germany 1, Spain 2)
x[:, 0] = le_x.fit_transform(x[:, 0])
print(x)

# x[:, 0] = le_x.inverse_transform(list(x[:, 0]))
# print(x)
group_pli = np.hstack((group_r_pli, group_u_pli))
y_pli = np.hstack((y_r_pli, y_u_pli))

X = np.hstack((X_aec, X_pli))

if np.array_equal(y_aec, y_pli):
    print("Y-values equal")
    y = y_aec
if np.array_equal(group_aec, group_pli):
    print("group-values equal")
    group = group_aec

final_acc_filename = commons.OUTPUT_DIR + \
    f"models/final_SVC_{k}_c_{c}_resp_unres_{s}.pickle"

# build pipeline with best model
pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan,
                                           strategy='mean')),
                 ('scaler', StandardScaler()),
                 ('CLF', clf)])

accuracies, f1s, cms = classify_loso(X, y, group, pipe)

clf_data = {
    'accuracies': accuracies,
    'f1s': f1s,
    'cms': cms,
    # 'best_params': best_params,
}

final_acc_file = open(final_acc_filename, 'ab')
pickle.dump(clf_data, final_acc_file)
final_acc_file.close()

print(sum(accuracies))
        else:
            dx[index].append(x[i] - x[i + j])
            dy[index].append(y[i] - y[i + j])
        index += 1
    for j in range(1, 1 + n):
        if i + j >= len(x):
            dx[index].append(np.NaN)
            dy[index].append(np.NaN)
        else:
            dx[index].append(x[i] - x[i + j])
            dy[index].append(y[i] - y[i + j])
        index += 1

# fill = np.nanmax(dx) if np.nanmax(dx) > np.nanmax(dy) else np.nanmax(dy)
imp = SimpleImputer(missing_values=np.nan, strategy='constant')
dx = imp.fit_transform(dx)
dy = imp.fit_transform(dy)

dx = pd.DataFrame(
    np.array([standardize(d) for d in dx]).T,
    columns=["dx" + str(i) for i in range(0 - n, 1 + n) if i != 0])
dy = pd.DataFrame(
    np.array([standardize(d) for d in dy]).T,
    columns=["dy" + str(i) for i in range(0 - n, 1 + n) if i != 0])

features = dx.join(dy)
features = np.array(features)

points = np.array(data)[:, 1:3]
df = pd.read_csv(
    "chronic_kidney_disease_W_header_missing_category_replaced_and_given_value.csv"
)

# replacing the class values (ckd = 1, notckd = 0)
df = df.replace('ckd', 1)
df = df.replace('notckd', 0)

X = df.iloc[:, 0:24]  # feature vectors
# class labels: ckd = Chronic Kidney Disease, notckd = Not Chronic Kidney Disease
y = df.iloc[:, 24]

# Replace missing feature values; 'median' is used for the numerical features
X = X.replace('?', np.nan)
X.to_csv('X_test_file.csv')
y.to_csv('Y_test_file.csv')
imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(X)
X_imputed = imr.transform(X.values)

le = LabelEncoder()
# positive class = 1 (ckd), negative class = 0 (notckd)
y = le.fit_transform(y)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=7)

# Z-score normalization
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values  # everything except the last column
y = dataset.iloc[:, -1].values   # only the last column

# take care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])  # skip the text column
# write the imputed array back into the selected slice
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode the categories as one-hot vectors
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# 1st element: a name for the transformation, 2nd: the transformer class,
# 3rd: the column(s) to encode;
# remainder='passthrough' keeps the remaining columns untouched
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# encode the target labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# split the data into training and test sets
class TreeAugmentedNB_BayesianInf(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    A primitive which does naive Bayes classification. During training, the input to
    this primitive should be a matrix of tabular numerical/categorical data and an
    array of labels. During testing, the input is a data matrix of numerical features
    only, and the output will be the predicted labels with generated metadata.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id': '2fa0afb2-1b7b-462d-a7c9-11b44efe9eb0',
        'version': rpi_d3m_primitives.__coreversion__,
        'name': 'Tree-Augmented Naive Bayes Classifier',
        'keywords': ['Tree-Augmented Naive Bayes', 'Bayesian Inference', 'Classification'],
        'description': 'This algorithm is an implementation of Tree-augmented Naive Bayes classification. Bayesian Inference is applied.',
        'source': {
            'name': rpi_d3m_primitives.__author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/zijun-rpi/d3m-primitives/blob/master/TreeAugmentNB_BayesianInf.py',
                'https://github.com/zijun-rpi/d3m-primitives.git'
            ]
        },
        'installation': [
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package': 'rpi_d3m_primitives',
                'version': rpi_d3m_primitives.__version__
            }
        ],
        'python_path': 'd3m.primitives.classification.tree_augmented_naive_bayes.BayesianInfRPI',
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER],
        'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Union[typing.Dict[str, base.DockerContainer]] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)
        self._index = None
        self._training_inputs = None
        self._training_outputs = None
        self._origin_inputs = None  # for label encoder
        self._fitted = False
        self._cate_flag = None
        # classifier
        self._clf = Model(modelName='tan', bayesInf=1, PointInf=1, alpha=1,
                          N0=self.hyperparams['N0'])
        self._LEoutput = preprocessing.LabelEncoder()  # label encoder
        # imputer
        self._Imputer = SimpleImputer(missing_values=np.nan,
                                      strategy='most_frequent')
        self._nbins = self.hyperparams['nbins']
        # KBinsDiscretizer
        self._Kbins = preprocessing.KBinsDiscretizer(n_bins=self._nbins,
                                                     encode='ordinal',
                                                     strategy='uniform')
        self._discTrainset = None

    def _store_target_columns_metadata(self, outputs: Outputs) -> None:
        outputs_length = outputs.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[Dict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs.metadata.query_column(column_index))
            # Update semantic types and prepare it for predicted targets.
            semantic_types = list(column_metadata.get('semantic_types', []))
            if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types:
                semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            semantic_types = [semantic_type for semantic_type in semantic_types
                              if semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget']
            column_metadata['semantic_types'] = semantic_types
            target_columns_metadata.append(column_metadata)
        self._target_columns_metadata = target_columns_metadata

    # TO DO:
    # select columns via semantic types
    # remove preprocessing
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        # Update semantic types and prepare it for predicted targets
        self._store_target_columns_metadata(outputs)

        # memorize the original training inputs
        self._origin_inputs = inputs

        # set training labels
        metadata = outputs.metadata
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, 0))
        semantic_types = column_metadata.get('semantic_types', [])
        if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
            self._LEoutput.fit(outputs)
            self._training_outputs = self._LEoutput.transform(outputs)  # starting from zero

        # convert categorical values to numerical values in the training data
        metadata = inputs.metadata
        [m, n] = inputs.shape
        self._training_inputs = np.zeros((m, n))
        self._cate_flag = np.zeros((n,))
        for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
            if column_index is metadata_base.ALL_ELEMENTS:
                continue
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = column_metadata.get('semantic_types', [])
            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                LE = preprocessing.LabelEncoder()
                LE = LE.fit(inputs.iloc[:, column_index])
                self._training_inputs[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                self._cate_flag[column_index] = 1
            elif 'http://schema.org/Text' in semantic_types:
                pass
            else:
                temp = list(inputs.iloc[:, column_index].values)
                for i in np.arange(len(temp)):
                    if bool(temp[i]):
                        self._training_inputs[i, column_index] = float(temp[i])
                    else:
                        self._training_inputs[i, column_index] = float('nan')
                # the imputer would drop a column consisting purely of missing values
                if not np.count_nonzero(np.isnan(self._training_inputs[:, column_index])) == 0:
                    # there are missing values in this column
                    if np.count_nonzero(np.isnan(self._training_inputs[:, column_index])) == m:
                        # all values are missing: replace with all zeros
                        self._training_inputs[:, column_index] = np.zeros(m,)

        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs.any() == None or self._training_outputs.any() == None:
            raise ValueError('Missing training data, or missing values exist.')

        # impute missing values
        self._Imputer.fit(self._training_inputs)
        self._training_inputs = self._Imputer.transform(self._training_inputs)

        # discretize non-categorical values
        disc_training_inputs = self._training_inputs
        if not len(np.where(self._cate_flag == 0)[0]) == 0:
            # find the non-categorical columns
            self._Kbins.fit(self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            temp = self._Kbins.transform(self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp  # starting from zero

        # get the number of states for each feature and remove features with only one state
        discTrainset = RelationSet(disc_training_inputs,
                                   self._training_outputs.reshape(-1, 1))
        discTrainset.getStateNo(self._cate_flag, self._nbins)
        discTrainset.remove()
        X_train = discTrainset.data
        Y_train = discTrainset.labels
        self._discTrainset = discTrainset
        stateNo = np.append(discTrainset.NUM_STATES, len(np.unique(Y_train)))

        # fit the classifier
        self._clf.fit(X_train, Y_train, stateNo)

        self._fitted = True
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # inputs: m x n numpy array
        if self._fitted:
            # convert categorical values to numerical values in the testing data
            metadata = inputs.metadata
            [m, n] = inputs.shape
            X_test = np.zeros((m, n))
            for column_index in metadata.get_elements((metadata_base.ALL_ELEMENTS,)):
                if column_index is metadata_base.ALL_ELEMENTS:
                    continue
                column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
                semantic_types = column_metadata.get('semantic_types', [])
                if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types:
                    LE = preprocessing.LabelEncoder()
                    # use the training data to fit the encoder
                    LE = LE.fit(self._origin_inputs.iloc[:, column_index])
                    X_test[:, column_index] = LE.transform(inputs.iloc[:, column_index])
                elif 'http://schema.org/Text' in semantic_types:
                    pass
                else:
                    temp = list(inputs.iloc[:, column_index].values)
                    for i in np.arange(len(temp)):
                        if bool(temp[i]):
                            X_test[i, column_index] = float(temp[i])
                        else:
                            X_test[i, column_index] = float('nan')

            # impute the testing data
            X_test = self._Imputer.transform(X_test)

            # KBins-discretize the non-categorical values
            disc_X_test = X_test
            if not len(np.where(self._cate_flag == 0)[0]) == 0:
                temp = self._Kbins.transform(X_test[:, np.where(self._cate_flag == 0)[0]])
                disc_X_test[:, np.where(self._cate_flag == 0)[0]] = temp

            # remove the columns that had only one state
            index_list = np.setdiff1d(np.arange(self._discTrainset.num_features),
                                      np.array(self._discTrainset.removeIdx))
            disc_X_test = disc_X_test[:, index_list]

            # prediction
            output = self._clf.predict(disc_X_test)

            # label decoding
            output = self._LEoutput.inverse_transform(output)

            # update metadata
            output = container.DataFrame(output, generate_metadata=False, source=self)
            output.metadata = inputs.metadata.clear(source=self, for_value=output,
                                                    generate_metadata=True)
            for column_index, column_metadata in enumerate(self._target_columns_metadata):
                output.metadata = output.metadata.update_column(column_index, column_metadata,
                                                                source=self)
            return CallResult(output)
        else:
            raise ValueError('Model should be fitted first.')

    def get_params(self) -> None:
        pass

    def set_params(self) -> None:
        pass
# data preprocessing
# importing the libraries
import numpy as np  # libraries needed for machine learning models
import pandas as pd
import matplotlib.pyplot as mp

# import the dataset
datasets = pd.read_csv('Data.csv')
X = datasets.iloc[:, :-1].values  # independent variables
Y = datasets.iloc[:, 3].values    # dependent variable

# take care of the missing values
from sklearn.impute import SimpleImputer  # class for handling missing data

# replace missing data with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode the categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# dummy encoding; note: categorical_features was removed from OneHotEncoder
# in scikit-learn 0.22 -- newer code uses ColumnTransformer instead
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
def preprocess_input(X, le0, le1, le2, oHot,
                     supplemental_label_data=pd.DataFrame()):
    # drop unwanted features
    unwanted_columns = ["playerName", "Season", "spacer1",
                        "transferredSchools", "ORB", "DRB"]  # "\xa0"
    X = X.drop(unwanted_columns, axis=1)

    # organize features
    columns = list(X.columns)
    categorical = ["position", "School", "Conf"]
    # diff = lambda l1, l2: [x for x in l1 if x not in l2]
    # numerical = diff(columns, categorical)

    # cast numerical data as floats
    for col in columns:
        if col not in categorical:
            X[col] = X[col].astype(float)

    # impute: fill in missing values
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X = X.values  # convert to ndarray of objects
    imputer = imputer.fit(X[:, 3:])
    X[:, 3:] = imputer.transform(X[:, 3:])

    # scale data
    X_scaler = StandardScaler()
    X_scaler = X_scaler.fit(X[:, 3:])
    X[:, 3:] = X_scaler.transform(X[:, 3:])

    # encode categorical data
    fit_data0 = X[:, 0]
    fit_data1 = X[:, 1]
    fit_data2 = X[:, 2]
    print(fit_data0)
    print(fit_data0.shape)
    if len(supplemental_label_data) > 0:
        fit_data0 = list(fit_data0)
        fit_data0.extend(list(supplemental_label_data.School))
        print("yale:", fit_data0.index('Yale'))
        fit_data1 = list(fit_data1)
        fit_data1.extend(list(supplemental_label_data.Conf))
        fit_data2 = list(fit_data2)
        fit_data2.extend(list(supplemental_label_data.position))

    if not le0:
        # label encode
        labelEncoder_X0 = LabelEncoder()
        labelEncoder_X0 = labelEncoder_X0.fit(fit_data0)
        X[:, 0] = labelEncoder_X0.transform(X[:, 0])
        labelEncoder_X1 = LabelEncoder()
        labelEncoder_X1 = labelEncoder_X1.fit(fit_data1)
        X[:, 1] = labelEncoder_X1.transform(X[:, 1])
        labelEncoder_X2 = LabelEncoder()
        labelEncoder_X2 = labelEncoder_X2.fit(fit_data2)
        X[:, 2] = labelEncoder_X2.transform(X[:, 2])
        # one hot encode
        oneHotEncoder = OneHotEncoder(categorical_features=[0, 1, 2],
                                      handle_unknown='ignore')
        oneHotEncoder = oneHotEncoder.fit(X)
        X = oneHotEncoder.transform(X)
    else:
        # label encode with the previously fitted encoders
        X[:, 0] = le0.transform(X[:, 0])
        X[:, 1] = le1.transform(X[:, 1])
        X[:, 2] = le2.transform(X[:, 2])
        # one hot encode with the previously fitted encoder
        X = oHot.transform(X)
        # moved into this branch so the tuple return below is reachable
        return X

    return (X, labelEncoder_X0, labelEncoder_X1, labelEncoder_X2,
            oneHotEncoder)
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load the data
veriler = pd.read_csv('eksikveriler.csv')
# print(veriler)

boy = veriler[['boy']]
# print(boy)

boykilo = veriler[['boy', 'kilo']]
print(boykilo)

# missing values
from sklearn.impute import SimpleImputer

# replace the nan values with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

Yas = veriler.iloc[:, 1:4].values
# iloc: integer location
# [:, 1:4] => the colon fetches all rows, and columns 1 up to (but not including) 4
print(Yas)

imputer = imputer.fit(Yas[:, 1:4])  # learns the column statistics
Yas[:, 1:4] = imputer.transform(Yas[:, 1:4])  # replaces the missing values
X = dataset.iloc[:, :-1].values
# X extracts all rows, and :-1 takes every column except the last one
Y = dataset.iloc[:, 3].values
# Y takes the last column by its index 3, with all rows of that column
# DON'T FORGET THE .values ON X AND Y

# ___________________________________________________________________________ #

# MISSING DATA TREATMENT
from sklearn.impute import SimpleImputer
# SimpleImputer is the successor to Imputer and is used to fill in missing
# data (mathematically); we imported the class, now we create an object of it
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# missing values are denoted by nan in our dataset, and strategy is the way
# we want to fill them
imputer = imputer.fit(X[:, 1:3])
# here we fit the imputer so it can fill the values in place of nan
X[:, 1:3] = imputer.transform(X[:, 1:3])

# ___________________________________________________________________________ #

"""
ENCODING CATEGORICAL DATA
LabelEncoder = turns france, spain, germany into numerics.
The problem with label encoding is that it assumes that a higher encoded
value means a better category.
OneHotEncoder (converts to binary) =
https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f
"""
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
dataset = dataset.drop(['overall_rating'], axis=1)

# Note that the item description carries fairly unnecessary info, and the
# useful info is already captured in the product_specification, category
# tree and brand columns, hence we can remove the description column as well
dataset = dataset.drop(['description'], axis=1)

# We can also remove any duplicated rows
duplicates = dataset[dataset.duplicated()]
dataset = dataset.drop(labels=duplicates.index, axis=0)
raw_dataset = raw_dataset.drop(labels=duplicates.index, axis=0)

summary_dataset = dataset.isnull().sum()

# Note that we can replace nan entries of retail & discounted price with
# the average of that column
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
dataset.retail_price = imp.fit_transform(
    dataset.retail_price.to_numpy().reshape(-1, 1))
dataset.discounted_price = imp.fit_transform(
    dataset.discounted_price.to_numpy().reshape(-1, 1))
####
raw_dataset.retail_price = imp.fit_transform(
    raw_dataset.retail_price.to_numpy().reshape(-1, 1))
raw_dataset.discounted_price = imp.fit_transform(
    raw_dataset.discounted_price.to_numpy().reshape(-1, 1))

category_tree = []
for x in dataset.product_category_tree:
    x = x[2:-2]
    category_tree.append(x.split(" >>"))
def __init__(self):
    self.clf = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('rf', RandomForestClassifier(max_depth=5, n_estimators=10))
    ])
# X contains missing values
# print(X.isnull().any(axis=0))
# print(X.isnull().sum())
#
# X.keys()
# >> Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
#           'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
#           'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
#          dtype='object')

# Exercise 2: one-hot encode the features X and store the result in X_ohe.
# Exercise 3: impute the missing values in X_ohe and store the result as X_fin.

# Train the imputer on X_dna, the dataset with the missing rows dropped,
# and store the imputed data in X_imp
imputer_mf = SimpleImputer(strategy="most_frequent")
X_dna = X.dropna()
X_imp = pd.DataFrame()
imputer_mf.fit(X_dna)
X_ohe = imputer_mf.transform(X)
X_ohe = pd.DataFrame(X_ohe, columns=X.columns)

# Apply one-hot encoding and scaling, storing the results in a dict
continuous = np.array(
    ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
     'Loan_Amount_Term'])
ohe = OneHotEncoder(sparse=False)
mms = MinMaxScaler()
enc = {}
ans = 0
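# `ohe` and `mms` are set up above but never applied. A hypothetical
# completion of the exercise steps, assuming the categorical columns are
# everything outside `continuous` (not from the original notebook):
categorical = [c for c in X_ohe.columns if c not in continuous]
X_cat = pd.DataFrame(ohe.fit_transform(X_ohe[categorical]))
X_num = pd.DataFrame(mms.fit_transform(X_ohe[continuous]), columns=continuous)
X_fin = pd.concat([X_num, X_cat], axis=1)  # X_fin as asked for in Exercise 3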
test_data = pd.read_csv("C:/01_Projects/09_CriticalFormulasandTools/PythonScripts/TitanicData/test.csv")
test_data.head()

y = train_data["Survived"]
features = ["Pclass", "Sex", "Fare", "Age"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
# transform (rather than fit_transform) the test set, so it is imputed
# with the statistics learned on the training set
X_test = my_imputer.transform(X_test)

model1 = GaussianNB()
model1.fit(X, y)
model2 = RandomForestClassifier(max_depth=15, n_estimators=100,
                                bootstrap=False, max_features='sqrt',
                                min_samples_leaf=4, min_samples_split=10)
model2.fit(X, y)
model3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2),
                       random_state=1, max_iter=2000)
model3.fit(X, y)
model4 = KNeighborsClassifier(3)
model4.fit(X, y)

labels_pred1 = model1.predict(X)
labels_pred2 = model2.predict(X)
labels_pred3 = model3.predict(X)
act = act.loc[:, [
    'id', 'Bacteria', 'Fungi', 'Prokaryote', 'Virus', 'Cancer',
    'Immunomodulator'
]]
plat = pd.merge(paths, act, how='left', on='id')

## set cluster
clusters = [os.path.basename(file) for file in plat['fasta']]
plat = plat.assign(cluster=clusters)

# read fingerprints
fps = pd.read_csv(git_dir + "/data/platinum/PRISM_fingerprints_mean.csv.gz")
X = fps.set_index('id')
X = X.reindex(plat['id'])

# impute missing values
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputer.fit(X)
X = imputer.transform(X)

# build models
targets = [
    'Bacteria', 'Fungi', 'Prokaryote', 'Virus', 'Cancer', 'Immunomodulator'
]
for target in targets:
    print("building models for target: " + target)
    y = plat[target].values

    # set up model
    svc = SVC(C=0.01,
              gamma=0.1,
              kernel='poly',
print("True Outcome: {0}, False Outcome: {1}".format(diabetes_true_count, diabetes_false_count)) #Missing Zeros without Outcome Column print("How many Zero value are here: ") print(dataSet.iloc[:, 0:8].eq(0).sum()) # Feature Cloumn or Independent Variable X = dataSet.iloc[:, :-1].values #Dependent column or Predict Class y = dataSet.iloc[:, 8].values # Zero fill with mean of the column from sklearn.impute import SimpleImputer fill_values = SimpleImputer(missing_values=0, strategy='mean') X[:, 1:8] = fill_values.fit_transform(X[:, 1:8]) #Trin/Test Split from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0) #Feature Scaling from sklearn.preprocessing import StandardScaler scale_X = StandardScaler() X_train = scale_X.fit_transform(X_train) X_test = scale_X.transform(X_test)
def lesson_2():
    print_("LESSON 2: Missing values", 0, 1)

    # ----------------------------------
    # Example: Melbourne Housing dataset
    # ----------------------------------
    # Load data
    features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
        'Lattitude', 'Longtitude'
    ]
    X_train, X_valid, y_train, y_valid = load_data_for_lesson_2(features)

    # Build a random forest model
    forest_model = RandomForestRegressor(random_state=1)

    # --------------------------------------------
    # Approach 1: Drop Columns with Missing Values
    # --------------------------------------------
    # Get names of columns with missing values
    cols_with_missing = [
        col for col in X_train.columns if X_train[col].isnull().any()
    ]

    # Drop columns in training and validation data
    reduced_X_train = X_train.drop(cols_with_missing, axis=1)
    reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

    print("MAE from Approach 1 (Drop columns with missing values):")
    print(
        score_model(forest_model, reduced_X_train, reduced_X_valid, y_train,
                    y_valid))

    # ----------------------
    # Approach 2: Imputation
    # ----------------------
    # Imputation
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

    # Imputation removed column names; put them back
    imputed_X_train.columns = X_train.columns
    imputed_X_valid.columns = X_valid.columns

    print("\nMAE from Approach 2 (Imputation):")
    print(
        score_model(forest_model, imputed_X_train, imputed_X_valid, y_train,
                    y_valid))

    # --------------------------------------
    # Approach 3: An Extension to Imputation
    # --------------------------------------
    # We impute the missing values, while also keeping track of which values
    # were imputed

    # Make copy to avoid changing original data (when imputing)
    X_train_plus = X_train.copy()
    X_valid_plus = X_valid.copy()

    # Make new columns indicating what will be imputed
    for col in cols_with_missing:
        X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
        X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

    # Imputation
    my_imputer = SimpleImputer()
    imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
    imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

    # Imputation removed column names; put them back
    imputed_X_train_plus.columns = X_train_plus.columns
    imputed_X_valid_plus.columns = X_valid_plus.columns

    print("\nMAE from Approach 3 (An Extension to Imputation):")
    print_(
        score_model(forest_model, imputed_X_train_plus, imputed_X_valid_plus,
                    y_train, y_valid))

    # Shape of training data (num_rows, num_columns)
    print_("Shape of training data (num_rows, num_columns)", 0)
    print_(X_train.shape)

    # Number of missing values in each column of training data
    missing_val_count_by_column = (X_train.isnull().sum())
    print_("Number of missing values in each column of training data", 0)
    print_(missing_val_count_by_column[missing_val_count_by_column > 0])
import numpy as np
import pandas as pd

dataset = pd.read_csv('houses.csv')
dataset.head(20)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)  # returns a plain ndarray

X = dataset[:, :-1]
y = dataset[:, -1]
X
y

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
test.drop("Name", axis=1, inplace=True) test.info() train.info() sex_mapping = {"male": 0, "female": 1} for dataset in train_test_data: dataset["Sex"] = dataset["Sex"].map(sex_mapping) train.head() barchart("Sex") xtrain.info() train["Age"].isnull().sum() from sklearn.impute import SimpleImputer si = SimpleImputer(missing_values=np.nan, strategy="median") si = si.fit(train[['Age']]) train['Age'] = si.transform(train[['Age']]) train["Age"].isnull().sum() from sklearn.impute import SimpleImputer si = SimpleImputer(missing_values=np.nan, strategy="median") si = si.fit(test[['Age']]) test['Age'] = si.transform(test[['Age']]) test['Age'].isnull().sum() #test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True) facet = sns.FacetGrid(train, hue="Survived", aspect=2) facet.map(sns.kdeplot, 'Age', shade=True) facet.set(xlim=(0, train["Age"].max()))
df = pd.read_csv("weatherHistory.csv") categorical = df.select_dtypes(include=["object"]).keys() print(categorical) quantitative = df.select_dtypes(include=["float64"]).keys() print(quantitative) #checking if any quantative has zero effect on temperature df[quantitative].hist() #Dropping Loud Cover as zero effect df = df.drop('Loud Cover', axis=1) #SimpleImputer to replace 0 in pressure(millibars) from sklearn.impute import SimpleImputer imp = SimpleImputer(missing_values=0, strategy='median') df.iloc[:, 9:10] = imp.fit_transform(df.iloc[:, 9:10]) imp = SimpleImputer(missing_values=np.nan, strategy="constant") df.iloc[:, 2:3] = imp.fit_transform(df.iloc[:, 2:3]) X = df.iloc[:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]].values Y = df.iloc[:, 3].values from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder_X = LabelEncoder() X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) X[:, 1] = labelencoder_X.fit_transform(X[:, 1]) X[:, 2] = labelencoder_X.fit_transform(X[:, 2]) X[:, 9] = labelencoder_X.fit_transform(X[:, 9]) ohe = OneHotEncoder(categorical_features=[0, 1, 2, 9])
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values  # select columns by position
y = dataset.iloc[:, -1].values

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')
x = np.array(ct.fit_transform(x), dtype=float)  # np.float was removed in NumPy 1.24
print(x)