def test_not_fitted():
    """Transforming with an imputer that was never fitted must raise NotFittedError."""
    unfitted = CategoricalImputer()
    sample = np.array(['a', 'b', 'b', None])
    with pytest.raises(NotFittedError):
        unfitted.transform(sample)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    """Impute missing values in the categorical columns of ``inputs``.

    Columns are selected via the ``use_columns`` hyperparam and the
    CATEGORICALS semantic types. Each selected column is imputed with the
    configured strategy; columns that fail (all values missing) are retried
    with the 'constant' strategy unless ``error_on_empty`` is set, in which
    case the original ValueError is re-raised.

    :param inputs: dataframe to impute (not mutated; a copy is returned)
    :param timeout: unused, part of the primitive interface
    :param iterations: unused, part of the primitive interface
    :return: CallResult wrapping the imputed dataframe
    """
    logger.debug(f"Running {__name__}")

    # determine columns to operate on
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS
    )
    logger.debug(f"Found {len(cols)} categorical columns to evaluate")

    # BUG FIX: original used `len(cols) is 0`, an identity comparison that
    # only works because CPython interns small ints; use equality instead.
    if len(cols) == 0:
        return base.CallResult(inputs)

    imputer = CategoricalImputer(
        strategy=self.hyperparams["strategy"],
        fill_value=self.hyperparams["fill_value"],
        missing_values="",
        tie_breaking="first",
    )
    outputs = inputs.copy()
    failures: List[int] = []
    for c in cols:
        input_col = inputs.iloc[:, c]
        try:
            imputer.fit(input_col)
            result = imputer.transform(input_col)
            outputs.iloc[:, c] = result
        except ValueError:
            # value error gets thrown when all data is missing
            if not self.hyperparams["error_on_empty"]:
                failures.append(c)
            else:
                # bare `raise` preserves the original traceback
                raise

    # for columns that failed using 'most_frequent' try again using 'constant'
    if not self.hyperparams["error_on_empty"]:
        imputer = CategoricalImputer(
            strategy="constant",
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        for f in failures:
            outputs_col = outputs.iloc[:, f]
            imputer.fit(outputs_col)
            result = imputer.transform(outputs_col)
            outputs.iloc[:, f] = result

    logger.debug(f"\n{outputs}")
    return base.CallResult(outputs)
def imputacion_variable_delegacion(X_train, X_test):
    """Impute the 'delegacion_inicio' variable with its mode.

    The imputer is fitted on the training set only and then reused on the
    test set, so no information leaks from test into train.

    :param X_train: training dataframe with a 'delegacion_inicio' column (mutated)
    :param X_test: test dataframe with a 'delegacion_inicio' column (mutated)
    :return: the (X_train, X_test) pair with the column imputed
    """
    # Training set: learn the mode and fill in one step.
    train_values = X_train.delegacion_inicio.values.reshape(X_train.shape[0], 1)
    delegacionInicio_imputer = CategoricalImputer(strategy='most_frequent')
    X_train['delegacion_inicio'] = delegacionInicio_imputer.fit_transform(train_values)

    # Test set: apply the mode learned from the training data.
    test_values = X_test.delegacion_inicio.values.reshape(X_test.shape[0], 1)
    X_test['delegacion_inicio'] = delegacionInicio_imputer.transform(test_values)

    return X_train, X_test
# Inspect shapes, then stack train and test so the imputers below are fitted
# on the combined value space of both datasets.
titanic_test.shape
titanic_all = pd.concat([titanic_train, titanic_test])
titanic_all.shape
titanic_all.info()

#impute missing values for continuous features
imputable_cont_features = ['Age','Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_all[imputable_cont_features])
titanic_all[imputable_cont_features] = cont_imputer.transform(titanic_all[imputable_cont_features])

#impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_all['Embarked'])
titanic_all['Embarked'] = cat_imputer.transform(titanic_all['Embarked'])

# Family size = the passenger plus siblings/spouses (SibSp) and
# parents/children (Parch) aboard.
titanic_all['FamilySize'] = titanic_all['SibSp'] + titanic_all['Parch'] + 1

def convert_family_size(size):
    """Bucket a numeric family size into 'Single'/'Small'/'Medium'/'Large'."""
    if(size == 1):
        return 'Single'
    elif(size <=3):
        return 'Small'
    elif(size <= 6):
        return 'Medium'
    else:
        return 'Large'

titanic_all['FamilyCategory'] = titanic_all['FamilySize'].map(convert_family_size)

# NOTE(review): this definition continues beyond the visible chunk.
def extract_title(name):
print(titanic_train.info()) #preprocessing stage #impute missing values for continuous features imputable_cont_features = ['Age'] cont_imputer = preprocessing.Imputer() cont_imputer.fit(titanic_train[imputable_cont_features]) print(cont_imputer.statistics_) titanic_train[imputable_cont_features] = cont_imputer.transform( titanic_train[imputable_cont_features]) #impute missing values for categorical features cat_imputer = CategoricalImputer() cat_imputer.fit(titanic_train['Embarked']) print(cat_imputer.fill_) titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked']) le_embarked = preprocessing.LabelEncoder() le_embarked.fit(titanic_train['Embarked']) print(le_embarked.classes_) titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked']) le_sex = preprocessing.LabelEncoder() le_sex.fit(titanic_train['Sex']) print(le_sex.classes_) titanic_train['Sex'] = le_sex.transform(titanic_train['Sex']) features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex'] X_train = titanic_train[features] y_train = titanic_train['Survived'] #create an instance of decision tree classifier type
#data['Item_Weight'] = data[['Item_Weight','Item_Identifier']].apply(impute_weight,axis=1).astype(float)

# Impute Item_Weight with the column mean.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(data[['Item_Weight']])
print(data[['Item_Weight']])
data[['Item_Weight']] = imputer.transform(data[['Item_Weight']])
print('Final #missing: %d' % sum(data['Item_Weight'].isnull()))

#Imputing Outlet_Size missing values with the mode
#Determining the mode
outlet_size_mode = data.pivot_table(values='Outlet_Size', columns='Outlet_Type',
                                    aggfunc=lambda x: x.mode())
imputer1 = CategoricalImputer(missing_values='NaN', strategy='most_frequent')
imputer1 = imputer1.fit(data['Outlet_Size'])
print(data['Outlet_Size'])
data[['Outlet_Size']] = imputer1.transform(data[['Outlet_Size']])
print(data['Outlet_Size'])

#Feature Engineering
#checking whether we should combine Outlet_Type or not
data.pivot_table(values='Item_Outlet_Sales', columns='Outlet_Type')
#values are significantly different so leave these

#Considering 0 item_visibility as missing we should impute missing values for these data
print(data[['Item_Visibility']])
#print ('Final #zeros: %d'%sum(data['Item_Visibility'] == 0))
# BUG FIX: the original called replace() without assigning its result
# (Series.replace is not in-place by default) and passed a boolean Series
# instead of the value 0, so the zeros were never converted to NaN and the
# mean imputation below had nothing to fill. Replace the literal value 0
# and assign the result back.
data['Item_Visibility'] = data['Item_Visibility'].replace(0, np.nan)
imputer2 = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer2 = imputer2.fit(data[['Item_Visibility']])
#Import dataset
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

#splitting dataset into training and test set
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

#Missing values
#--------------training set---------
from sklearn_pandas import CategoricalImputer
imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

# Normalize the 'Dependents'-style column: map the '3+' category to 3 so the
# column can be treated numerically.
# BUG FIX: the row count was hard-coded as 614; derive it from the data so
# the script keeps working if the training CSV changes size. The dead
# `else: continue` branch was removed.
for i in range(X_train.shape[0]):
    if X_train[:, 2][i] == '3+':
        X_train[:, 2][i] = 3

from sklearn.preprocessing import Imputer
imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:, [2, 7, 8, 9]])

#--------------test set------------------
def impute_categorical_features(df, features):
    """Fill missing values in the given categorical columns of *df* in place.

    Fits a CategoricalImputer on ``df[features]``, prints the learned fill
    value(s), and writes the transformed columns back onto the dataframe.
    """
    imputer = CategoricalImputer()
    imputer.fit(df[features])
    print(imputer.fill_)
    df[features] = imputer.transform(df[features])
# Load the Titanic training data (absolute Windows path with escaped backslashes).
titanic_train = pd.read_csv(
    'C:\\Users\\tauseef.ur.rahman\\Desktop\\Python-Docs\\Titanic\\train.csv')
print(titanic_train.info())

#Continous Imputer
cont_impute_feature = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[cont_impute_feature])
titanic_train[cont_impute_feature] = cont_imputer.transform(
    titanic_train[cont_impute_feature])

#Categorical Imputer
Cat_imputer = CategoricalImputer()
Cat_imputer.fit(titanic_train['Embarked'])
titanic_train['Embarked'] = Cat_imputer.transform(titanic_train['Embarked'])

#label Encoding
# Convert each categorical column to integer codes; a separate encoder is kept
# per column so the learned class mappings stay independent.
le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_Sex = preprocessing.LabelEncoder()
le_Sex.fit(titanic_train['Sex'])
titanic_train['Sex'] = le_Sex.transform(titanic_train['Sex'])

le_Pclass = preprocessing.LabelEncoder()
le_Pclass.fit(titanic_train['Pclass'])
titanic_train['Pclass'] = le_Pclass.transform(titanic_train['Pclass'])

# Columns used as model features.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']