def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    """Impute missing values in the categorical columns of ``inputs``.

    Missing entries are identified by the empty string.  Columns are selected
    via the ``use_columns`` hyperparam filtered to the ``CATEGORICALS``
    semantic types.  A column whose values are all missing makes the imputer
    raise ``ValueError``; depending on the ``error_on_empty`` hyperparam the
    error is either re-raised or the column is retried with the ``constant``
    strategy, which cannot fail.

    Returns a ``CallResult`` wrapping a copy of ``inputs`` with imputed
    columns (or ``inputs`` itself when no categorical columns are found).
    """
    logger.debug(f"Running {__name__}")

    # Determine columns to operate on.
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS
    )
    logger.debug(f"Found {len(cols)} categorical columns to evaluate")

    # Nothing to impute — pass the input through untouched.
    # (Fixed: the original tested `len(cols) is 0`; `is` compares object
    # identity, which is not a reliable equality test for integers.)
    if len(cols) == 0:
        return base.CallResult(inputs)

    imputer = CategoricalImputer(
        strategy=self.hyperparams["strategy"],
        fill_value=self.hyperparams["fill_value"],
        missing_values="",
        tie_breaking="first",
    )

    outputs = inputs.copy()
    failures: List[int] = []
    for c in cols:
        input_col = inputs.iloc[:, c]
        try:
            imputer.fit(input_col)
            outputs.iloc[:, c] = imputer.transform(input_col)
        except ValueError:
            # The imputer raises ValueError when all of a column's data is
            # missing: either surface it, or defer the column for the
            # constant-fill retry below.
            if self.hyperparams["error_on_empty"]:
                raise  # bare raise preserves the original traceback
            failures.append(c)

    # Retry columns that failed under the configured strategy (typically
    # 'most_frequent') using 'constant', which handles all-missing columns.
    # The retry imputer is only built when there is something to retry.
    if not self.hyperparams["error_on_empty"] and failures:
        imputer = CategoricalImputer(
            strategy="constant",
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        for f in failures:
            outputs_col = outputs.iloc[:, f]
            imputer.fit(outputs_col)
            outputs.iloc[:, f] = imputer.transform(outputs_col)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
# Load the held-out test set and combine it with the training data so the
# imputation statistics below are computed over the full sample.
titanic_test = pd.read_csv("titanic_test.csv")
titanic_test.shape  # notebook-style inspection; no side effect
titanic_all = pd.concat([titanic_train, titanic_test])
titanic_all.shape
titanic_all.info()

# Impute missing values for continuous features (mean of each column).
imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_all[imputable_cont_features])
titanic_all[imputable_cont_features] = cont_imputer.transform(
    titanic_all[imputable_cont_features])

# Impute missing values for categorical features (most frequent value).
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_all['Embarked'])
titanic_all['Embarked'] = cat_imputer.transform(titanic_all['Embarked'])

# Engineer family size: the passenger plus siblings/spouse plus parents/children.
titanic_all['FamilySize'] = titanic_all['SibSp'] + titanic_all['Parch'] + 1


def convert_family_size(size):
    """Bucket a numeric family size into a coarse categorical label."""
    if size == 1:
        return 'Single'
    if size <= 3:
        return 'Small'
    if size <= 6:
        return 'Medium'
    return 'Large'


titanic_all['FamilyCategory'] = titanic_all['FamilySize'].map(convert_family_size)
titanic_train = pd.read_csv(
    "C:/Users/tauseef.ur.rahman/Desktop/Python-Docs/Titanic/train.csv")
print(titanic_train.info())

# --- preprocessing stage ---

# Continuous features: fill missing entries with the column mean.
imputable_cont_features = ['Age']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
imputed_cont = cont_imputer.transform(titanic_train[imputable_cont_features])
titanic_train[imputable_cont_features] = imputed_cont

# Categorical features: fill missing entries with the most frequent value.
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

# Encode the string categories as integer labels.
le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

# Assemble the training feature matrix.
features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
# --- Item_Weight: impute missing values with the column mean ---
print ('Orignal #missing: %d'%sum(data['Item_Weight'].isnull()))
imputer= Imputer(missing_values='NaN',strategy='mean',axis=0)
imputer = imputer.fit(data[['Item_Weight']])
print(data[['Item_Weight']])
data[['Item_Weight']] = imputer.transform(data[['Item_Weight']])
print ('Final #missing: %d'%sum(data['Item_Weight'].isnull()))

# --- Outlet_Size: impute missing values with the mode ---
# Mode per outlet type, computed for reference only; the imputer below uses
# the global most-frequent value.
outlet_size_mode= data.pivot_table(values='Outlet_Size',columns='Outlet_Type',aggfunc=lambda x:x.mode())
imputer1= CategoricalImputer(missing_values='NaN',strategy='most_frequent')
# NOTE(review): fit on a Series but transform on a one-column DataFrame —
# inconsistent shapes; confirm this matches the imputer's expectations.
imputer1 = imputer1.fit(data['Outlet_Size'])
print(data['Outlet_Size'])
data[['Outlet_Size']] = imputer1.transform(data[['Outlet_Size']])
print(data['Outlet_Size'])

# --- Feature engineering ---
# Checking whether we should combine Outlet_Type categories: the mean sales
# per type are significantly different, so leave them as-is.
data.pivot_table(values='Item_Outlet_Sales',columns='Outlet_Type')

# Treat zero Item_Visibility as missing so it can be imputed later.
print(data[['Item_Visibility']])
# BUG FIX: the original passed a boolean mask wrapped in a list to
# `replace` and discarded the return value, so nothing was ever changed.
# Replace the literal value 0 with NaN and assign the result back.
data['Item_Visibility'] = data['Item_Visibility'].replace(0, np.nan)
# Import datasets
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

# Split into feature matrices and the training target vector.
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

# --- Missing values: training set ---
# Categorical columns 0, 1 and 4: fill with the most frequent value.
from sklearn_pandas import CategoricalImputer
imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

# Normalize the 'Dependents' column (index 2): map the sentinel '3+' to 3.
# FIX: the original looped over `range(0, 614)`, hard-coding the row count;
# a vectorized mask covers however many rows the frame actually has.
X_train[X_train[:, 2] == '3+', 2] = 3

# Numeric columns 2, 7, 8 and 9: fill with the column mean.
from sklearn.preprocessing import Imputer
imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:, [2, 7, 8, 9]])
def impute_categorical_features(df, features):
    """Fill missing values in the given categorical columns of ``df`` in place.

    Fits a most-frequent-value imputer on ``df[features]``, prints the fitted
    fill value(s), and writes the imputed columns back into ``df``.
    """
    imputer = CategoricalImputer()
    imputer.fit(df[features])
    print(imputer.fill_)
    df[features] = imputer.transform(df[features])
import numpy as np
import pandas as pd

# Load the loan-prediction train/test splits.
train = pd.read_csv("train_ctrUa4K.csv")
test = pd.read_csv("test_lAUu6dG.csv")

# Columns 1..11 are features; column 12 is the target (train set only).
X_train = train.iloc[:, 1:12].values
y_train = train.iloc[:, 12].values
X_test = test.iloc[:, 1:12].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# NOTE(review): the one-hot encoder is fit on the raw mixed-type matrix
# BEFORE any missing values are handled; encoders generally cannot handle
# NaN, so imputation should probably come first — confirm intent.
onehotencoder = OneHotEncoder()
X_train = onehotencoder.fit_transform(X_train).toarray()
labelencoder_y = LabelEncoder()
y_train = labelencoder_y.fit_transform(y_train)

# Taking care of missing data
from sklearn.preprocessing import Imputer
# NOTE(review): X_train has already been replaced by the (wider) one-hot
# encoded array above, so slicing columns 1:12 here no longer refers to the
# original feature columns — verify the intended ordering of these steps.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_train[:, 1:12])
X_train[:, 1:12] = imputer.transform(X_train[:, 1:12])

from sklearn_pandas import CategoricalImputer
# Fit only; the transform presumably follows in a later chunk of this file.
categorical_imputer = CategoricalImputer(missing_values='NaN',
                                         strategy='most_frequent')
categorical_imputer = categorical_imputer.fit(X_train[:, ])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the chronic-kidney-disease dataset and count missing values per column.
df = pd.read_csv("ChronicKidneyDiseaseFull.csv")
nulls_per_column = df.isnull().sum()
nulls_per_column  # notebook-style display of the per-column null counts

# Baseline strategies: drop any row / any column containing a NaN, and
# compare the resulting shapes against the original frame.
df_delete_rows = df.dropna(axis=0)
df_delete_rows.shape
df_delete_columns = df.dropna(axis=1)
df_delete_columns.shape

# Partition columns: object dtype -> categorical, everything else -> numeric.
categorical_variables_mask = df.dtypes == object
categorical_variables = df.columns[categorical_variables_mask]
numerical_variables = df.columns[~categorical_variables_mask]

from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import Imputer

# Median-impute the numeric columns; the result is a separate array and is
# not written back into df here (presumably handled further down the file).
numerical_imputer = Imputer(missing_values="NaN", strategy="median", copy=True)
numerical_imputer.fit(df[numerical_variables])
df_numerical_imputed = numerical_imputer.transform(df[numerical_variables])

# Most-frequent-impute the categorical columns (fit only at this point).
# NOTE(review): CategoricalImputer is fit on a multi-column DataFrame —
# confirm the installed version supports 2-D input rather than one Series.
categorical_imputer = CategoricalImputer(missing_values="NaN")
categorical_imputer.fit(df[categorical_variables])