def _knn_impute(self):
    # KNNImputer expects a 2-D array, so reshape each 1-D column before imputing.
    # Note: with a single feature, rows missing it have no observed values to
    # compute distances from, so the column mean is imputed as a fallback.
    for col in self.target:
        k_impute = impute.KNNImputer()
        imputed = k_impute.fit_transform(self.df[col].values.reshape(-1, 1))
        self.output_df.loc[:, col] = imputed.ravel()
    return self.output_df
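# A minimal standalone sketch of the same per-column KNN imputation, using a
# hypothetical "Age" column (not from the original snippet):
import numpy as np
import pandas as pd
from sklearn import impute

df = pd.DataFrame({"Age": [22.0, np.nan, 35.0, 58.0, np.nan]})
imputer = impute.KNNImputer(n_neighbors=2)
df.loc[:, "Age"] = imputer.fit_transform(df[["Age"]]).ravel()
print(df)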
def fixData(trainFileName, testFileName, features, imputer="simple", strategy="mean"):
    print("Fixing Data\n")
    # Read files into pandas DataFrames
    training_data = pd.read_csv(trainFileName)
    testing_data = pd.read_csv(testFileName)
    featuresForDummies = ["Embarked", "Sex"]
    trainSurvived = training_data["Survived"]
    passengerID = testing_data["PassengerId"]
    # Keep only the selected features
    features2 = list(features)
    training_data = training_data[features2]
    testing_data = testing_data[features2]
    # One-hot encode the categorical features
    tr_data = pd.get_dummies(training_data, columns=featuresForDummies)
    te_data = pd.get_dummies(testing_data, columns=featuresForDummies)
    if imputer.lower() == "simple":
        imp = impute.SimpleImputer(missing_values=np.nan, strategy=strategy)
    elif imputer.lower() == "knn":
        imp = impute.KNNImputer(missing_values=np.nan)
    elif imputer.lower() == "iterative":
        # IterativeImputer is experimental and must be enabled explicitly
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        imp = impute.IterativeImputer(missing_values=np.nan, initial_strategy=strategy)
    else:
        raise ValueError('You did not enter a correct imputation method. '
                         'Correct imputation methods include: "Simple", "KNN", "Iterative"')
    # Fit and transform each set (note: train and test are imputed independently)
    dummied_test = imp.fit_transform(te_data)
    dummied_train = imp.fit_transform(tr_data)
    # Return the completed arrays
    return (dummied_test, dummied_train, trainSurvived, passengerID)
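# A minimal sketch of calling fixData; the file names and feature list below
# are hypothetical placeholders for Titanic-style CSVs:
features = ["Pclass", "Sex", "Age", "Fare", "Embarked"]
dummied_test, dummied_train, trainSurvived, passengerID = fixData(
    "train.csv", "test.csv", features, imputer="knn")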
def fit(self, X, y=None):
    X = X.copy()
    columns = X.columns.values
    indices = X.index
    # We already dealt with this in the preprocessing notebook: null values must
    # not shift the encoding values in the middle of the dataset; instead we want
    # a contiguous range of integer codes, with no gaps, for the imputer to use.
    # This is crucial for both the KNN and the Iterative imputer: the iterative
    # one works with numeric values that may well be fractional, so its output is
    # rounded at the end, and while KNN works on the integer codes, it ultimately
    # imputes the mean of the n nearest neighbors, which can again be fractional.
    # So, to make sure rounding never lands on a code that belongs to a null
    # value, we feed the ordinal encoder a row of nulls right at the start.
    null_values = pd.DataFrame(index=pd.Index([-1]), columns=columns,
                               data=[[np.nan for i in range(len(columns))]])
    X = pd.concat([null_values, X])
    self.ordinal_encoder = ce.ordinal.OrdinalEncoder(
        handle_missing="return_nan", handle_unknown="return_nan")
    X = self.ordinal_encoder.fit_transform(X)
    X = X[1:]
    if self.imputer_type == "knn":
        self.imputer = impute.KNNImputer()
        self.imputer.fit(X)
    elif self.imputer_type == "iterative":
        # Requires: from sklearn.experimental import enable_iterative_imputer
        self.imputer = impute.IterativeImputer(
            max_iter=20, random_state=42, initial_strategy="most_frequent",
            min_value=X.min(), max_value=X.max())
        try:
            self.imputer.fit(X)
        except (ValueError, np.linalg.LinAlgError):
            # One error was caught here where the function complained about NaNs.
            # The error is odd, though: it only fails the first time, then works...
            print("Caught an error where the function complained about NaNs; "
                  "oddly it only fails the first time, then succeeds.")
            self.imputer.fit(X)
    return self
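# A hedged sketch of the matching transform step, assuming the class applies
# the encoder and imputer stored by fit above; this method is a hypothetical
# completion, not part of the original snippet:
def transform(self, X):
    X = X.copy()
    encoded = self.ordinal_encoder.transform(X)
    imputed = self.imputer.transform(encoded)
    # Round back to integer codes so every imputed value maps to a real category
    return pd.DataFrame(np.rint(imputed), columns=X.columns, index=X.index)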
def _get_preprocessor(
        num_features: List[str],
        cat_features: List[str]) -> compose.ColumnTransformer:
    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])
    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])
    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor
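# A minimal usage sketch of _get_preprocessor on a toy DataFrame; the column
# names and values here are hypothetical:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "age": [25.0, np.nan, 40.0, 33.0, 51.0],
    "income": [50000.0, 62000.0, np.nan, 48000.0, 75000.0],
    "city": ["a", np.nan, "b", "a", "b"],
})
pre = _get_preprocessor(num_features=["age", "income"], cat_features=["city"])
X = pre.fit_transform(df)
print(X.shape)  # 5 rows; scaled+imputed numerics plus one-hot city columns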
def data_imputation(self):
    """Impute missing data (KNN imputer; the mean SimpleImputer variant is kept commented for reference)"""
    # KNN Imputer: train and test are imputed independently, each with its own imputer
    imp1 = impute.KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    self.training_data_X = imp1.fit_transform(self.training_data_X)
    imp2 = impute.KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    self.testing_data_X = imp2.fit_transform(self.testing_data_X)
    # np.savetxt("C:/Users/lihanmin/Desktop/data_processing/temp1.csv",
    #            self.training_data_X, delimiter=',')
    # Simple Imputer with 'mean' strategy
    # imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    # self.training_data_X = imp.fit_transform(self.training_data_X)
    # np.savetxt("C:/Users/lihanmin/Desktop/data_processing/temp2.csv",
    #            self.training_data_X, delimiter=',')
    if WRITE_TO_CSV:
        # Write into CSV
        training_set_temp = np.concatenate(
            (self.training_data_Y, self.training_data_X), axis=1)
        training_set = np.concatenate(
            (self.ar_training_set[:, 0:2], training_set_temp), axis=1)
        test_set_temp = np.concatenate(
            (self.testing_data_Y, self.testing_data_X), axis=1)
        test_set = np.concatenate(
            (self.ar_testing_set[:, 0:2], test_set_temp), axis=1)
        train_num = len(training_set)
        test_num = len(test_set)
        # print(train_num, test_num)
        # Restore column 7 from its 0/1 encoding to the original labels (女/男)
        for i in range(train_num):
            if training_set[i][7] == 0:
                training_set[i][7] = '女'
            else:
                training_set[i][7] = '男'
        for j in range(test_num):
            if test_set[j][7] == 0:
                test_set[j][7] = '女'
            else:
                test_set[j][7] = '男'
        # print(pd.__version__)
        # print(type(self.df_org_data.columns))
        # Get column names
        col_names = []
        for item in self.df_org_data.columns:
            col_names.append(item)
        # Merge
        if MERGE_TRAIN_TEST:
            total_length = len(self.ar_org_data)
            total_data = []
            idx_train = 0
            idx_test = 0
            # Interleave: every third row comes from the test set
            for i in range(total_length):
                if i % 3 == 0:
                    total_data.append(test_set[idx_test])
                    idx_test += 1
                else:
                    total_data.append(training_set[idx_train])
                    idx_train += 1
            total_data = pd.DataFrame(total_data)
            total_data.columns = col_names
            total_file = 'C:/Users/lihanmin/Desktop/data_processing/Imputed_data.csv'
            total_data.to_csv(total_file, sep=',', encoding='gbk', index=False)
            exit()
        # Whole data
        test_set = pd.DataFrame(test_set)
        training_set = pd.DataFrame(training_set)
        # Set names
        test_set.columns = col_names
        training_set.columns = col_names
        # Write
        train_file = 'C:/Users/lihanmin/Desktop/data_processing/train.csv'
        test_file = 'C:/Users/lihanmin/Desktop/data_processing/test.csv'
        test_set.to_csv(test_file, sep=',', encoding='gbk', index=False)
        training_set.to_csv(train_file, sep=',', encoding='gbk', index=False)
        exit()
    # Normalization
    self.training_data_X = preprocessing.scale(self.training_data_X)
    self.testing_data_X = preprocessing.scale(self.testing_data_X)
from sklearn import impute, preprocessing, compose, pipeline, linear_model, multioutput
from typing import List


def _get_preprocessor(
        num_features: List[str],
        cat_features: List[str]) -> compose.ColumnTransformer:
    """
    Returns a preprocessing pipeline adapted to the specified numerical
    and categorical features
    """
    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])
    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])
    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor


def get_lr_model(
        num_features: List[str],
        cat_features: List[str],
        C: float = 1.0) -> pipeline.Pipeline:
    """
def knn_imputer(self, col, neighbors=5):
    imputer = impute.KNNImputer(n_neighbors=neighbors)
    self.impute(col, imputer)
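# A usage sketch for the wrapper above; `cleaner` and the self.impute(col, imputer)
# helper it relies on are hypothetical, out-of-view pieces:
# cleaner.knn_imputer("Age", neighbors=3)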
# test_object_columns = {key: value for key, value in test_columns.items() if value == "object"}
# test_object_columns = list(test_object_columns.keys())
# Convert to dummy data
test_data = pd.get_dummies(test_data, columns=train_object_columns, drop_first=True)
full_data = pd.concat([train_data, test_data]).drop("SalePrice", axis=1)
# Apply imputer to fill missing values:
# imputer = IterativeImputer(max_iter=10, random_state=0)
imputer = impute.KNNImputer(n_neighbors=5)
# imputed_data = imputer.fit_transform(test_data.values)
# test_data = pd.DataFrame(imputed_data, columns=test_data.columns)
imputed_data = imputer.fit_transform(full_data.values)
test_data = pd.DataFrame(
    imputed_data[train_data.shape[0]:imputed_data.shape[0], :],
    columns=full_data.columns)
# Slice ends are exclusive, so :train_data.shape[0] (not - 1) keeps every training row
train_data = pd.DataFrame(imputed_data[:train_data.shape[0], :],
                          columns=full_data.columns)
train_data["SalePrice"] = save_prices  # *10e8
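# A quick sanity-check sketch for the split above, assuming train_data,
# test_data, and full_data from the snippet are still in scope:
assert len(train_data) + len(test_data) == len(full_data)
assert not test_data.isna().any().any()  # KNN imputation left no missing values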
import pandas as pd
from sklearn import impute

if __name__ == "__main__":
    imputer = impute.KNNImputer(copy=False)
    df_train = pd.read_csv("../input/train.csv")
    train_len = df_train.shape[0]
    df_test = pd.read_csv("../input/test.csv")
    test_len = df_test.shape[0]
    df = pd.concat([df_train, df_test], ignore_index=True)
    # Hypothetical completion of the truncated per-column loop: impute every
    # feature column with the KNNImputer created above
    feature_cols = df.drop(['id', 'target'], axis=1).columns
    df.loc[:, feature_cols] = imputer.fit_transform(df[feature_cols])
import numpy as np
from sklearn import impute

# create a random numpy array with 10 samples
# and 6 features, with values in the range 1 to 14
X = np.random.randint(1, 15, (10, 6))

# convert the array to float
X = X.astype(float)

# randomly set 10 elements to NaN (missing)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan

# use the 2 nearest neighbours to fill NaN values
knn_imputer = impute.KNNImputer(n_neighbors=2)
X_imputed = knn_imputer.fit_transform(X)

print(X)
print(X_imputed)
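# A follow-up sketch on the same X: distance-weighted neighbors, so closer
# rows contribute more to each filled value:
knn_weighted = impute.KNNImputer(n_neighbors=2, weights="distance")
print(knn_weighted.fit_transform(X))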
data['Margin'].update(pd.Series(Margin[0]))
data['Density'].update(pd.Series(Density[0]))
data['Severity'].update(pd.Series(Severity[0]))

# create the LabelEncoder object
le = preprocessing.LabelEncoder()
# convert the categorical columns into numeric
data["Severity"] = le.fit_transform(data["Severity"])
data["Shape"] = le.fit_transform(data["Shape"])

if preprocesado == 3:
    # create the LabelEncoder object
    le = preprocessing.LabelEncoder()
    # convert the categorical columns into numeric
    data["Severity"] = le.fit_transform(data["Severity"])
    data["Shape"] = le.fit_transform(data["Shape"])

    imputer = impute.KNNImputer()
    # KNNImputer expects a 2-D (n_samples, n_features) array, so impute all
    # columns together rather than wrapping each 1-D column in a single-row list
    cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
    data[cols] = imputer.fit_transform(data[cols])

print(data.shape)