def test_imputation_error_sparse_0(strategy):
    """A sparse input with missing_values=0 must be rejected.

    SimpleImputer cannot treat 0 as missing on sparse data (explicit
    zeros are indistinguishable from implicit ones), so fit/transform
    should raise and ask for a dense array.
    """
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    # Fitting on the dense equivalent is fine; transforming sparse is not.
    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
def data_preprocessing(dataset):
    """Split *dataset* into features/target and mean-impute feature column 3.

    X = columns 2..12, Y = column 1 of the frame.  Mutates X in place;
    like the original, nothing is returned.
    """
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    # BUG FIX: SimpleImputer requires 2-D input; X[:, 3] is 1-D and
    # raises.  Use the (n, 1) slice X[:, 3:4] for fit and transform.
    imputer = imputer.fit(X[:, 3:4])
    #X = imputer.fit_transform(X[:, 5]) Testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
def test_imputation_pickle():
    """A fitted imputer must transform identically after a pickle round-trip."""
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)
    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
                    "(strategy = %s)" % (strategy)
        )
def test_iterative_imputer_missing_at_transform(strategy):
    """A feature with no missing values at fit time falls back to the
    initial (simple) strategy when missing values appear at transform."""
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0   # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
def test_mice_missing_at_transform(strategy):
    """MICE falls back to the initial strategy for a feature that had no
    missing values at fit time but gains them at transform time."""
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0   # definitely missing value in 0th column

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       initial_strategy=strategy,
                       random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then mice will
    # only use the initial imputer for that feature at transform
    assert np.all(mice.transform(X_test)[:, 0] ==
                  initial_imputer.transform(X_test)[:, 0])
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly
    """
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Use exact comparison for integer data, approximate for floats.
    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix.
    # BUG FIX: missing_values is keyword-only in current scikit-learn;
    # passing it positionally raises a TypeError.
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()
    assert_ae(imputer.statistics_, statistics,
              err_msg=err_msg.format(0, True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
# BUG FIX: pandas.tools.plotting was removed (pandas >= 0.25);
# scatter_matrix now lives in pandas.plotting.
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
# scatter_matrix(housing[attributes], figsize=(12, 8))
# plt.show()

# Derived ratio features.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# Median-impute the numeric columns (drop the lone categorical one first).
housing_num = housing.drop("ocean_proximity", axis=1)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
# housing_cat_encoded = encoder.fit_transform(housing_cat)
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder()
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

# One-hot encode the categorical column in a single step.
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# code section
# load the data
veriler = pd.read_csv('eksikveriler.csv')

# scikit-learn missing-value handling
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Yas = veriler.iloc[:, -2:-1].values
imputer = imputer.fit(Yas)  # fit() learns the column means
Yas = imputer.transform(Yas)  # transform() fills the NaNs with those means
print(Yas)

# write the repaired column back into the frame
veriler.iloc[:, -2:-1] = Yas
print(veriler)
# Statistical imputation transform for the horse colic dataset.
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer

# Load the dataset; '?' marks a missing value in this file.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url, header=None, na_values='?')

# Split into input and output elements (column 23 is the target).
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]

# Count missing cells before imputation.
print('Missing: %d' % sum(isnan(X).flatten()))

# Mean-impute every input column.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
Xtrans = imputer.transform(X)

# Count missing cells after imputation (expect 0).
print('Missing: %d' % sum(isnan(Xtrans).flatten()))
#X = X.fillna(0) # instead of imputing
from sklearn.impute import SimpleImputer
#imputer = SimpleImputer()

y = df_modified['poi']

# Separate financial and mail features to rescale them independently.
#X_financial = X.drop(email_features_list, axis=1)
X_financial = X[finance_features_list]
#X_financial['bonus/salary'] = X_financial['bonus'] / X_financial['salary'] # kinda leaky, but imagine we have the data necessary
#X_financial = X_financial.drop(['bonus', 'salary'], axis=1)
#X_financial = X_financial.fillna(0)

# Median-impute the financial features.
imputerF = SimpleImputer(strategy='median')
imputerF.fit(X_financial)
X_financial = imputerF.transform(X_financial)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#scalerF = MinMaxScaler()
scalerF = StandardScaler()
scalerF.fit(X_financial)
X_financial = scalerF.transform(X_financial)
X_financial = pd.DataFrame(X_financial, index=X.index.values)

# Mail features plus two derived poi-interaction ratios.
X_mail = X[email_features_list]
X_mail['from_poi/from'] = X_mail['from_this_person_to_poi'] / X_mail[
    'from_messages']
X_mail['to_poi/to'] = (
    X_mail['from_poi_to_this_person'] +
    X_mail['shared_receipt_with_poi']) / X_mail['to_messages']
# Label-encode every categorical feature column.
previsores[:, 1] = lambelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = lambelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = lambelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = lambelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = lambelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = lambelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = lambelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = lambelEncoder_previsores.fit_transform(previsores[:, 13])

from sklearn.impute import SimpleImputer

# Fill NaN cells with the column MEDIAN (the original comment said
# "media"/mean, but strategy='median' is what the code does).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# BUG FIX: len(previsores) is the ROW count, not the column count; the
# original slice previsores[:, 0:len(previsores)] only covered all
# columns by accident.  Impute every column explicitly.
imputer = imputer.fit(previsores[:, :])
previsores[:, :] = imputer.transform(previsores[:, :])

# Encode the target class as integers.
labelEnconderClasse = LabelEncoder()
classe = labelEnconderClasse.fit_transform(classe)

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Feature scaling (standardisation).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)
import numpy as np
import pandas as pd

ds = pd.read_csv("Data.csv")
x = ds.iloc[:, 0:3].values
y = ds.iloc[:, 3].values

# Median-impute the two numeric columns.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
imp = imp.fit(x[:, 1:3])
x[:, 1:3] = imp.transform(x[:, 1:3])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lec = LabelEncoder()
lec = lec.fit(x[:, 0])
x[:, 0] = lec.transform(x[:, 0])

# BUG FIX: OneHotEncoder(categorical_features=[0]) was removed from
# scikit-learn; use a ColumnTransformer on column 0 instead.
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))

lec = lec.fit(y)
y = lec.transform(y)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=0)
# Data Preprocessing

# Importing the libraries
# import matplotlib.pyplot as plt
# library pandas offers data structures and operations for manipulating
# numerical tables and time series
import numpy as np
import pandas as pd

# Importing the dataset
df = pd.read_csv('Data.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, 3].values

# Taking care of missing data: mean-impute columns 1 and 2.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X[:, 1:3])
X[:, 1:3] = imp.transform(X[:, 1:3])

# GroupBy of DataFrame test
# dfGroup = df.groupby('Purchased')
# for Purchased in dfGroup:
#     print(Purchased)
def impute(train, validate, test, my_strategy, column_list):
    """Impute *column_list* in all three splits with a shared imputer.

    The imputer is fitted on *train* only, so validation/test statistics
    never leak into the fill values.  Returns the three (mutated) frames.
    """
    filler = SimpleImputer(strategy=my_strategy)
    train[column_list] = filler.fit_transform(train[column_list])
    validate[column_list] = filler.transform(validate[column_list])
    test[column_list] = filler.transform(test[column_list])
    return train, validate, test
# BUG FIX: the test split was transformed with fit_transform, i.e. the
# imputer/scaler were re-fitted on the test distribution (data leakage).
# Assumes num_imp was fitted on train_num_group above — TODO confirm.
test_num_group = num_imp.transform(test_num_group)

std_ = StandardScaler()
train_num_group = std_.fit_transform(train_num_group)
# BUG FIX (same leak): apply the scaler fitted on the training data.
test_num_group = std_.transform(test_num_group)

# Map the yes/no target to booleans.
train['y'] = train['y'].replace({'yes': True, 'no': False})
test['y'] = test['y'].replace({'yes': True, 'no': False})
train = train.drop(num_list, axis=1)
test = test.drop(num_list, axis=1)

# Most-frequent-impute then one-hot encode the two categorical columns.
one_hot_groupname = ['marital', 'day_of_week']
one_hot_train = train[one_hot_groupname]
one_hot_test = test[one_hot_groupname]
train = train.drop(one_hot_groupname, axis=1)
test = test.drop(one_hot_groupname, axis=1)

imp = SimpleImputer(strategy='most_frequent')
one_hot_train = imp.fit_transform(one_hot_train)
one_hot_test = imp.transform(one_hot_test)

one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot_train = one_hot.fit_transform(one_hot_train)
one_hot_train = one_hot_train.toarray()
one_hot_test = one_hot.transform(one_hot_test)
one_hot_test = one_hot_test.toarray()

# Target (mean) encoding of the remaining columns: rank the categories
# by mean target value.  (Loop tail truncated in the source paste.)
encoder_columns = list(train.columns)
encoder_columns = encoder_columns[:-1]
for col_name in encoder_columns:
    df_change = train[[col_name, 'y']]
    df_change = df_change.groupby(col_name).mean().sort_values(
        'y').reset_index()
    num = 1
    match_dict = dict()
    for i in df_change.iloc[:, 0]:
        match_dict[i] = num
por ky informacion ndihmon vetem ne raste te caktuara, ne pergjithesi shkakton mangesi te shumta ne algoritem, per kete arsye nuk merren ne konsiderate. Shembull kur ndikon: nese dihet perdoruesi perkates perpara blerjes! ''' # importimi i librarise imputer - mbushja me zero e variablave te NaN from sklearn.impute import SimpleImputer # vlerat qe mungojne [kategoria 2 dhe 3 kane shume vlera null] data.isnull().sum() # mbush vlerat null me 0 - shablloni imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) imputer = imputer.fit(data.iloc[:, 9:11]) data.iloc[:, 9:11] = imputer.transform(data.iloc[:, 9:11]) # heqja e dy kolonave te para te datasetit [user dhe product] data.drop(data.columns[[0, 1]], axis=1, inplace=True) ''' - Per arsye se na duhet te fshijme outliers prej kolonave te caktuara, na duhet qe dataseti te jete me vlera numbers e jo NaN (per arsye se tek funksioni remove_outliers me poshte, kerkohen numra e jo stringje). ''' # OUTELIERS from pandas.api.types import is_numeric_dtype def remove_outlier(df): low = .05
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

dataset = pd.read_csv(r'04_dados_exercicio.csv')
features = dataset.iloc[:, :-1].values
classe = dataset.iloc[:, -1].values

# Mean-impute the numeric columns 2..3.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(features[:, 2:4])
features[:, 2:4] = imputer.transform(features[:, 2:4])

# One-hot encode column 1, keeping the other columns untouched.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough')
features = np.array(columnTransformer.fit_transform(features))

# Integer-encode the target classes.
labelEncoder = LabelEncoder()
classe = labelEncoder.fit_transform(classe)

features_treinamento, features_teste, classe_treinamento, classe_teste = train_test_split(
    features, classe, test_size=0.15, random_state=1)

# Standardise the two numeric columns of the training split.
standardScaler = StandardScaler()
features_treinamento[:, 4:6] = standardScaler.fit_transform(
    features_treinamento[:, 4:6])
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

# %% [markdown]
# # Step 3: Imputation
#
# ### Part A
#
# Use the next code cell to impute missing values with the mean value along
# each column. Set the preprocessed DataFrames to `imputed_X_train` and
# `imputed_X_valid`. Make sure that the column names match those in
# `X_train` and `X_valid`.

# %% [code]
# Fill in the lines below: imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

# Check your answers
step_3.a.check()

# %% [code]
# Lines below will give you a hint or solution code
# step_3.a.hint()
step_3.a.solution()

# %% [markdown]
# Run the next code cell without changes to obtain the MAE for this approach.
# Feature columns 1..3 are the predictors; column 4 is the class.
previsores = df.iloc[:, 1:4].values
classe = df.iloc[:, 4].values

# Instantiate the imputer (mean strategy over NaN cells).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means from the existing values.
imputer = imputer.fit(previsores[:, 0:3])
# Fill the cells that were missing.
previsores = imputer.transform(previsores)

# Standardise all predictor columns onto the same scale.
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

""" DIVISÃO TREINO E TESTE """
# BUG FIX: the pasted source was truncated before the closing parenthesis
# of this call; it is closed here so the script parses.
previsores_train, previsores_test, classe_train, classe_test = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)
Eyas = eksikveriler[['yas']]

# scikit-learn
# *** Fill the NaN (empty) values of the age column with the column mean ***
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Extract the column being processed (Yas / age).
Yas = eksikveriler.iloc[:, 3:4].values
#print(Yas)
imputer = imputer.fit(Yas)  # fit() learns the column mean
Yas = imputer.transform(
    Yas
)  # transform() applies what was learned (NaN -> column mean)

# Write the repaired values back and assign to `veriler`.
eksikveriler.iloc[:, 3:4] = Yas
veriler = eksikveriler
#print(veriler)

# *** Convert categorical data to numeric ***
# Split the categorical column (ulke / country) off the data.
ulke = veriler.iloc[:, 0:1].values
#print(ulke)
from sklearn import preprocessing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the input data from the external CSV.
dataset = pd.read_csv('Haiti.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 8].values

# Taking care of missing data
from sklearn.impute import SimpleImputer

# BUG FIX: the `verbose` parameter was deprecated and then removed from
# SimpleImputer; passing it raises on current scikit-learn.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:8])  # upper bound excluded, lower included
X[:, 1:8] = imputer.transform(X[:, 1:8])

# Encoding the dependent variable.
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

# Rescale data (between 0 and 1).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
dataset.head()
dataset.isna().any()
dataset.isna().sum()

# Drop the columns that are almost entirely missing.
dataset = dataset.drop(
    columns=['MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley'])

# Taking care of missing data
""" dataset.fillna(value=dataset['LotFrontage'].mean(),inplace=True) """
from sklearn.impute import SimpleImputer

# Mean-impute the three numeric columns with gaps.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(dataset.iloc[:, [3, 25, 57]].values)
dataset[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']] = imputer.transform(
    dataset.iloc[:, [3, 25, 57]].values).astype('int32')

# Everything still missing is categorical — mark it explicitly.
dataset.fillna(value='None Avialabe', inplace=True)
dataset.isna().sum()

X = dataset.drop(columns='SalePrice')  # independent fields
y = dataset['SalePrice']  # label

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
Xnumeric = X.select_dtypes(include=numerics)

fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Independent Columns(Continuous values)',
             fontsize=20)
## Histograms  (loop body truncated in the source paste)
for i in range(1, Xnumeric.shape[1]):
    plt.subplot(7, 6, i)
# Input interval variables.
numerical_inputs = list(
    df.select_dtypes(include=['int64', 'float32']).columns)
inputs = class_inputs + numerical_inputs

# Data engineering #

# Impute missings: empty string for categoricals, NaN for numerics.
categorical_imputer = SimpleImputer(
    missing_values='', strategy='most_frequent')
numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Impute categorical variables.
categorical_imputer.fit(df[class_inputs])
categorical_imputed = categorical_imputer.transform(df[class_inputs])
df_categorical_imputed = pd.DataFrame(
    data=categorical_imputed, columns=class_inputs)

# Impute numerical variables.
numerical_imputer.fit(df[numerical_inputs])
numerical_imputed = numerical_imputer.transform(df[numerical_inputs])
df_numerical_imputed = pd.DataFrame(
    data=numerical_imputed, columns=numerical_inputs)

# One-hot encoding of the imputed categoricals.
encoder = OneHotEncoder()
encoder.fit(categorical_imputed)
categorical_encoded = encoder.transform(categorical_imputed)
# Work on copies so the originals stay untouched.
imputed_X_train_plus = X.copy()
imputed_X_test_plus = test_X.copy()

# cols_with_missing = (col for col in X.columns
#                      if X[col].isnull().any())
# for col in cols_with_missing:
#     imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
#
#
# cols_with_missing_test = (col for col in test_X.columns
#                           if test_X[col].isnull().any())
# for col in cols_with_missing_test:
#     imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation: fit on train, apply to both splits.
my_imputer = SimpleImputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

model = RandomForestRegressor(n_estimators=20, random_state=0)
model.fit(imputed_X_train_plus, y)

# make predictions which we will submit.
test_preds = model.predict(imputed_X_test_plus)

# The lines below shows how to save predictions in format used for competition scoring
# Just uncomment them.
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
print(X_train.Sex)

# Understand OneHotEncoder() by example:
# feed the multi-dim array [0,2,1,1] as a column vector into OneHotEncoder.
one = sp.OneHotEncoder()
print('ここがonehotencoder')
print(X_train.Sex.values)
print(X_train.Sex.values.reshape(-1, 1))
print(X_train.Sex.values.reshape(-1, 1).transpose())
enced = one.fit_transform(X_train.Sex.values.reshape(1, -1).transpose())
print('ここからがenced')
print(enced)
print(enced.toarray())

# (original author note) index=df.Sex.index is unclear
temp = pd.DataFrame(index=df.Sex.index,
                    columns='Sex-' + le.classes_,
                    data=enced.toarray())
print('ここからがtemp')
print(temp)

enced_data = pd.concat([X_train, temp], axis=1)
del enced_data['Sex']

from sklearn.impute import SimpleImputer

im = SimpleImputer(missing_values=np.nan, strategy='mean')
im.fit(enced_data)
im.transform(enced_data)
print('ここがenced_data')
print(enced_data)
enced_data = im.fit_transform(enced_data)
print(enced_data)
#---------------------------
# Drop Columns with Missing Values

# Get names of columns with missing values
cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

#---------------------------
# Imputation
from sklearn.impute import SimpleImputer

# Fit on train, apply to both; default strategy is the column mean.
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

#---------------------------
# An Extension to Imputation

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
import numpy as np
import pandas as pd

# ----------------------------------------------------
dataset = pd.read_csv('path/2.9 Ensemble Reg/houses.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# ----------------------------------------------------
# Mean-impute all feature columns.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X)
X = imp.transform(X)

# ----------------------------------------------------
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=1 / 3,
                                                    random_state=42)

# ----------------------------------------------------
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor.fit(X_train, y_train)

# ----------------------------------------------------
print(regressor.score(X_train, y_train))
print(regressor.score(X_test, y_test))
print("=" * 25)
print(regressor.feature_importances_)
print("=" * 25)

# ----------------------------------------------------
# Predicting a new result
y_pred = regressor.predict(X_test)
# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
from sklearn.impute import SimpleImputer

# BUG FIX: the `verbose` parameter was removed from SimpleImputer and
# raises on current scikit-learn.
missingvalues = SimpleImputer(missing_values=np.nan, strategy='mean')
missingvalues = missingvalues.fit(X[:, 1:3])
X[:, 1:3] = missingvalues.transform(X[:, 1:3])

# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# BUG FIX: OneHotEncoder(categorical_features=[0]) was removed from
# scikit-learn; use a ColumnTransformer over column 0 instead.
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough')
X = np.array(onehotencoder.fit_transform(X))

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

from sklearn.model_selection import train_test_split
boykilo = veriler[['boy', 'kilo']]
print(boykilo)

# Missing data
from sklearn.impute import SimpleImputer

# Replace NaN values with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# iloc = integer location; [:, 1:4] -> all rows, columns 1 up to 4.
Yas = veriler.iloc[:, 1:4].values
print(Yas)
imputer = imputer.fit(Yas[:, 1:4])  # learn the column means
Yas[:, 1:4] = imputer.transform(Yas[:, 1:4])  # replace the missing cells

# Categorical conversion — encoders turn categorical data into numeric.
ulke = veriler.iloc[:, 0:1].values
print(ulke)

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
ulke[:, 0] = le.fit_transform(veriler.iloc[:, 0])
print(ulke)

ohe = preprocessing.OneHotEncoder()
ulke = ohe.fit_transform(ulke).toarray()
print(ulke)
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('3.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(x)
print(y)

# Mean-impute the numeric columns 1..2.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)

# One-hot encode column 0, pass the rest through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for filename in filenames:
    print(os.path.join(dirname, filename))

train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

test_data = pd.read_csv('../input/titanic/test.csv')

y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Mean-impute on the training split, then apply to both splits.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
imp = imp.fit(X_train)
X_train_imp = imp.transform(X_train)

model = RandomForestClassifier(n_estimators=1000,
                               max_depth=32,
                               random_state=1)
model.fit(X_train_imp, y)

X_test_imp = imp.transform(X_test)
predictions = model.predict(X_test_imp)

output = pd.DataFrame({"PassengerId": test_data.PassengerId,
                       "Survived": predictions})
output.to_csv("my_submission.csv", index=False)
print("Your submission has been successfully saved!")
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

df_original = pd.read_csv(r"C:\Users\Suzana\Desktop\Iris\iris2.csv")
df_missing = pd.read_csv(r"C:\Users\Suzana\Desktop\Iris\MCAR_30.csv")

# Only the feature columns are needed.
features_original = df_original.iloc[:, 0:4]
features_missing = df_missing.iloc[:, 1:5]

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(features_missing)
features_estimated = imp_mean.transform(features_missing)

# Find the indices of the imputed cells for validation.
indices_missing = np.argwhere(features_missing.isna().values)
indices_missing_hashable = map(tuple, indices_missing)
indices_set = set(indices_missing_hashable)
indices = list(indices_set)
print(indices)
print(len(indices))

# Accumulate absolute and squared errors over the imputed cells only.
s = 0
r = 0
for i in indices:
    s = s + abs(features_original.values.item(i) - features_estimated.item(i))
    r = r + (features_original.values.item(i) - features_estimated.item(i))**2

print('Prava MAE =', s / len(indices))
# NOTE(review): the source paste was truncated inside this call; the RMSE
# of the squared-error sum `r` is the natural completion — confirm
# against the original script.
print('Fake MAE =', sqrt(r / len(indices)))
X['Fecha'] = pd.to_datetime(X['Fecha'])  # IFE
X['Dia'] = pd.DatetimeIndex(X['Fecha']).day.astype('object')
X['Mes'] = pd.DatetimeIndex(X['Fecha']).month.astype('object')
X['Dia_Semana'] = (pd.DatetimeIndex(X['Fecha']).weekday + 1).astype(
    'object')  # FFE

# Categorical columns (belongs in FE).
variables_categoricas = X.dtypes.pipe(lambda x: x[x == 'object']).index

# Median-impute each numeric column, one column at a time.
num_cols = X.dtypes.pipe(lambda x: x[x != 'object']).index
for x in num_cols:
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(np.array(X[x]).reshape(-1, 1))
    X[x] = imp.transform(np.array(X[x]).reshape(-1, 1))

# Most-frequent-impute each categorical column.
nominal_cols = X.dtypes.pipe(lambda x: x[x == 'object']).index
for x in nominal_cols:
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imp.fit(np.array(X[x]).reshape(-1, 1))
    X[x] = imp.transform(np.array(X[x]).reshape(-1, 1))

x_mat = pd.get_dummies(X, columns=variables_categoricas,
                       drop_first=True)  # FFE

# Start of modelling.
indice_ent = X['Fecha'] <= '2019-11-30'  # belongs in modelling
variables_a_eliminar = ['Fecha', 'Año', 'Afluencia']  # belongs in modelling
# In[ ]:

columns = train_data.columns[1:]

# ### Replce NaN values (Imputation) and standardize data

# In[ ]:

# For class 1 data point: median-impute on train, reuse on test, and
# persist the fitted imputer for inference.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
X_train = imp.fit_transform(train_data.iloc[:, 1:])
X_test = imp.transform(test_data.iloc[:, 1:])
pickle.dump(imp, open('imputer.pkl', 'wb'))
print("Number of NaN after imputation", np.count_nonzero(np.isnan(X_train)))

# standardizing data for better EDA and modeling

# In[ ]:

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
pickle.dump(scaler, open('scaler.pkl', 'wb'))
import numpy as np
import pandas as pd

dataset = pd.read_csv('houses.csv')
dataset.head(20)

# Mean-impute the whole frame (all columns are numeric here).
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)

# Last column is the target; the rest are features.
X = dataset[:, :-1]
y = dataset[:, -1]
X
y

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
X_train
X_test
y_train
import pandas as pd

base = pd.read_csv('credit_data.csv')
# Replace the invalid negative ages with the (precomputed) mean age.
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
imputer = imputer.fit(previsores[:, 0:3])
# BUG FIX (consistency): transform the same slice the imputer was fitted
# on; the original transformed the whole array into a sliced target.
previsores[:, 0:3] = imputer.transform(previsores[:, 0:3])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = \
    train_test_split(previsores, classe, test_size=0.25, random_state=0)

# NOTE(review): the source here read the syntactically invalid
# "from import" — the classifier import/instantiation was lost in the
# paste.  GaussianNB is a plausible reconstruction; confirm against the
# original script.
from sklearn.naive_bayes import GaussianNB
classificador = GaussianNB()
classificador = classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

from collections import Counter
Counter(classe_teste)
# train_X.isna().sum(axis = 0)

# %% [markdown]
# # Handling columns with missing data on the Train set
# 1. Option 1: delete every row that has a missing column
# 2. Option 2: replace numeric values using the mean (``mean``) of the
#    whole column
#    * Approach 1: use the ``SimpleImputer`` class from ``sklearn.impute``
#    * Approach 2: perform the column operations manually
#
# See page 63 of Hands-on Machine Learning with Scikit-Learn, Keras, and
# TensorFlow.

# %%
# Approach 1:
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_mean.fit(train_X["total_bedrooms"].values.reshape(-1, 1))
tmp_total_bedrooms = imputer_mean.transform(
    train_X['total_bedrooms'].values.reshape(-1, 1))

# %%
# Approach 2:
idx_null = train_X["total_bedrooms"].isnull()  # locate the missing entries
mean_total_bedrooms = train_X["total_bedrooms"][train_X["total_bedrooms"].isna(
) == False].mean()  # mean of the non-missing values
train_X["total_bedrooms"].fillna(
    mean_total_bedrooms, inplace=True)  # fill the missing rows with the mean
print(train_X["total_bedrooms"][idx_null == True]
      )  # print the originally-missing rows to verify

# %%
print((train_X["total_bedrooms"] == tmp_total_bedrooms.squeeze()
       ).all())  # do Approach 1 and Approach 2 agree?