def test_not_fitted():
    """
    If imputer is not fitted, NotFittedError is raised.
    """
    imp = CategoricalImputer()
    with pytest.raises(NotFittedError):
        imp.transform(np.array(['a', 'b', 'b', None]))
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_missing_values_param(input_type):
    data = ['x', 'y', 'a_missing', 'y']
    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imp = CategoricalImputer(missing_values='a_missing')
    Xt = imp.fit_transform(X)

    assert (Xt == np.array(['x', 'y', 'y', 'y'])).all()
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_default_fill_value_for_constant_strategy(input_type):
    data = ['a', np.nan, 'b', 'b']
    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imputer = CategoricalImputer(strategy='constant')
    Xt = imputer.fit_transform(X)

    assert imputer.fill_ == '?'
    assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_copy_param(input_type):
    data = ['a', np.nan, 'b', 'a']
    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data, dtype=object)

    imp = CategoricalImputer(copy=False)
    Xt = imp.fit_transform(X)

    Xe = np.array(['a', 'a', 'b', 'a'])
    assert (Xt == Xe).all()
    assert (X == Xe).all()
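# A minimal usage sketch (not part of the tests above), assuming the same
# sklearn_pandas.CategoricalImputer: with the default 'most_frequent'
# strategy the missing entry is filled with the most common category.
import numpy as np
from sklearn_pandas import CategoricalImputer

toy = np.asarray(['a', 'b', 'b', np.nan], dtype=object)  # hypothetical data
toy_imputer = CategoricalImputer()
print(toy_imputer.fit_transform(toy))  # expected: ['a' 'b' 'b' 'b']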
import pandas as pd

# Import dataset
df_train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

# Separate features and target from the pre-split training and test files
X_train = df_train.iloc[:, 1:-1].values
y_train = df_train.iloc[:, 12].values
X_test = df_test.iloc[:, 1:].values

# Missing values
# --------------training set---------
from sklearn_pandas import CategoricalImputer

imputer_train_cat = CategoricalImputer()
imputer_train_cat = imputer_train_cat.fit(X_train[:, [0, 1, 4]])
X_train[:, [0, 1, 4]] = imputer_train_cat.transform(X_train[:, [0, 1, 4]])

# Convert the '3+' category in column 2 to the number 3
for i in range(0, 614):
    if X_train[:, 2][i] == '3+':
        X_train[:, 2][i] = 3
    else:
        continue

# Legacy sklearn Imputer (replaced by SimpleImputer in newer versions)
from sklearn.preprocessing import Imputer
imputer_train_num = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train_num = imputer_train_num.fit(X_train[:, [2, 7, 8, 9]])
X_train[:, [2, 7, 8, 9]] = imputer_train_num.transform(X_train[:, [2, 7, 8, 9]])
# In[14]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# In[15]:
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', CategoricalImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse=False)),
])

# In[16]:
str_pipeline = Pipeline([
    ('selector', DataFrameSelector("Name")),
    ('str_finder', StringFinder()),
])

# In[17]:
pre_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
def predict():
    print("__________________________")
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn_pandas import CategoricalImputer
    import os as os
    import category_encoders as ce
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import matthews_corrcoef
    from sklearn.externals import joblib
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import make_pipeline
    import warnings
    warnings.filterwarnings("ignore")

    esd_df = pd.read_csv('/Users/encoreai/Desktop/new1.csv', encoding='iso-8859-1', sep=',', engine='python')
    list(esd_df.columns)
    esd_df.shape

    # Finding out the null / NaN values in the columns:
    # for _ in esd_df.columns:
    #     print("The number of null values in:{} == {}".format(_, esd_df[_].isnull().sum()))

    # Impute missing Doc_type values with the most frequent category
    # (the result must be reassigned, since CategoricalImputer copies by default)
    esd_array = esd_df['Doc_type'].values
    imputer = CategoricalImputer()
    esd_array = imputer.fit_transform(esd_array)
    esd_df["Error_detail"].fillna("No detail", inplace=True)
    # print(esd_df)
    esd_df = esd_df.drop(["Doc_type"], axis=1)
    esd_df['Doc_type'] = esd_array
    esd = esd_df.copy()

    # Binary-encode the categorical columns
    encoder_tc = ce.BinaryEncoder(cols=['Ticket_Category'])
    df_tc = encoder_tc.fit_transform(esd)
    encoder_et = ce.BinaryEncoder(cols=['Error_type'])
    df_et = encoder_et.fit_transform(df_tc)
    encoder_ed = ce.BinaryEncoder(cols=['Error_detail'])
    df_ed = encoder_ed.fit_transform(df_et)
    encoder_dt = ce.BinaryEncoder(cols=['Doc_type'])
    df_dt = encoder_dt.fit_transform(df_ed)

    # Next step is creating training and testing datasets:
    x = df_dt.drop(['Resolution'], axis='columns')
    x.shape
    y = df_dt['Resolution']
    y.shape
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    # print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    rf1 = RandomForestClassifier(criterion='entropy', n_estimators=100, max_features=3,
                                 oob_score=True, bootstrap=True, n_jobs=-1, random_state=1)
    # Model fit
    rf1.fit(x_train, y_train)
    row = x_test.head(1)
    # print(row)
    rf1_pred = rf1.predict(x_test)
    # print(rf1_pred)

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import matthews_corrcoef
    # Accuracy score
    # print('Accuracy Score:', accuracy_score(y_test, rf1_pred))
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test, rf1_pred)
    # print('Matthews_corrcoef for Model is:', mcc)

    # Feature importances
    features = df_dt.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22]]
    importances = rf1.feature_importances_
    indices = np.argsort(importances)
    plt.figure(1)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')

    oob_error = 1 - rf1.oob_score_
    # print(oob_error)  # 0.150

    # Grid search over the random forest hyperparameters
    params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': [50],
        'max_features': [2, 3, 4, 5, 6, 7, 8],
    }
    rf_gridcv = GridSearchCV(estimator=rf1, cv=5, param_grid=params, scoring='accuracy')
    rf_grid = rf_gridcv.fit(x_train, y_train)
    # print(rf_gridcv.best_params_)
    y_predrf = rf_gridcv.predict(x_test)
    # print(y_predrf)
    one_row1 = x_test.head(2)
    y_pred_one = rf_gridcv.predict(one_row1)
    # print(y_pred_one)
    x = x_test.head(1)
    # print(x)

    import pickle
    pickle.dump(rf_gridcv, open('model.pkl', 'wb'))
    model = pickle.load(open('model.pkl', 'rb'))

    print(model.predict(x))
    print("***********************************************")

    # Binary-encoded feature layout of a sample query row:
    # 'Ticket_Category_0' = 0
    # 'Ticket_Category_1' = 1
    # 'Ticket_Category_2' = 1
    # 'Ticket_Category_3' = 0
    # 'Error_type_0' = 0
    # 'Error_type_1' = 1
    # 'Error_type_2' = 0
    # 'Error_type_3' = 0
    # 'Error_type_4' = 1
    # 'Error_type_5' = 0
    # 'Error_type_6' = 0
    # 'Error_detail_0' = 0
    # 'Error_detail_1' = 0
    # 'Error_detail_2' = 0
    # 'Error_detail_3' = 0
    # 'Error_detail_4' = 0
    # 'Error_detail_5' = 1
    # 'Doc_type_0' = 0
    # 'Doc_type_1' = 0
    # 'Doc_type_2' = 1
    # 'Doc_type_3' = 1
    # 'Doc_type_4' = 1
    #
    # query = [0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,1]
    # prediction = model.predict(query)
    # prediction = jsonify({'prediction': list(prediction)})
    # print(prediction)

    prediction = model.predict(x)
    return jsonify({'prediction': list(prediction)})
def build_audit_na(classifier, name, with_proba=True, predict_proba_transformer=None,
                   apply_transformer=None, **pmml_options):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper(
        [(["Age"], [
            ContinuousDomain(missing_values=None, with_data=False),
            Alias(ExpressionTransformer("numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
                  name="flag_missing(Age, -999)"),
            Imputer(missing_values=-999)
        ])] +
        [(["Hours"], [
            ContinuousDomain(missing_values=None, with_data=False),
            Alias(ExpressionTransformer("numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
                  name="flag_missing(Hours, -999)"),
            Imputer(missing_values=-999)
        ])] +
        [(["Income"], [
            ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values",
                             low_value=5000, high_value=200000, with_data=False),
            Imputer()
        ])] +
        [(["Employment"], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(),
            StringNormalizer(function="uppercase"),
            LookupTransformer(employment_mapping, "OTHER"),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ])] +
        [([column], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(missing_values=None),
            StringNormalizer(function="lowercase"),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [
            CategoricalDomain(missing_values=None, with_data=False),
            CategoricalImputer(),
            StringNormalizer(function="uppercase"),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X),
                                   columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
df.describe()
df.describe().transpose()
df.info()
df.describe(include='O')

# Count missing values
df.isna().sum()

# Drop unwanted column
df = df.drop(['Loan_ID'], axis=1)

# Impute categorical variables, starting with Gender
df['Gender'].value_counts(dropna=False)  # shows the NaN count for Gender separately
from sklearn_pandas import CategoricalImputer
imputer = CategoricalImputer()
df['Gender'] = imputer.fit_transform(df['Gender'])

df['Married'].value_counts(dropna=False)
df['Married'] = imputer.fit_transform(df['Married'])

df['Dependents'].value_counts(dropna=False)
df['Dependents'] = imputer.fit_transform(df['Dependents'])

df['Self_Employed'].value_counts(dropna=False)
df['Self_Employed'] = imputer.fit_transform(df['Self_Employed'])

df['Credit_History'].value_counts(dropna=False)
df['Credit_History'] = imputer.fit_transform(df['Credit_History'])

df.isna().sum()  # only numeric columns should still have missing values

# Impute LoanAmount
df['LoanAmount'].isna().sum()
def impute_categorical(df, col_name):
    imputer = CategoricalImputer()
    df[col_name] = imputer.fit_transform(df[col_name])
    return df
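# Hedged usage sketch for the helper above on a hypothetical DataFrame;
# the 'color' column and its values are made up for illustration.
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'color': ['red', np.nan, 'blue', 'red']})
toy_df = impute_categorical(toy_df, 'color')
print(toy_df['color'].tolist())  # expected: ['red', 'red', 'blue', 'red']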
    'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'
]
cat = [name for name in adult.columns if name not in num + ['class']]

# adult.workclass = adult.workclass.str.strip()
# adult.education = adult.education.str.strip()

preprocess = DataFrameMapper(
    [(['age'], [Imputer(), StandardScaler()]),
     (['fnlwgt'], [Imputer(), StandardScaler()]),
     (['education_num'], [Imputer(), StandardScaler()]),
     (['capital_gain'], [Imputer(), StandardScaler()]),
     (['capital_loss'], [Imputer(), StandardScaler()]),
     (['hours_per_week'], [Imputer(), StandardScaler()]),
     (['workclass'], [CategoricalImputer(), LabelEncoder()]),
     (['education'], [CategoricalImputer(), LabelEncoder()]),
     (['marital_status'], [CategoricalImputer(), LabelEncoder()]),
     (['occupation'], [CategoricalImputer(), LabelEncoder()]),
     (['relationship'], [CategoricalImputer(), LabelEncoder()]),
     (['race'], [CategoricalImputer(), LabelEncoder()]),
     (['sex'], [CategoricalImputer(), LabelEncoder()]),
     (['native_country'], [CategoricalImputer(), LabelEncoder()])],
    df_out=True)

df = preprocess.fit_transform(adult)
y = df[["player_id", "goals"]].groupby("player_id").shift(-1) y = y.dropna(subset=["goals"]) train = pd.merge(X, y, left_index=True, right_index=True, suffixes=("", "_next")) target = "goals_next" X_train = train.drop(target, axis=1) y_train = train[target] mapper = DataFrameMapper( [ ("position", [CategoricalImputer(), LabelBinarizer()]), (["goals"], [SimpleImputer(), StandardScaler()]), (["assists"], [SimpleImputer(), StandardScaler()]), (["shots"], [SimpleImputer(), StandardScaler()]), (["ice_time"], [SimpleImputer(), StandardScaler()]), ], df_out=True, ) model = LinearRegression() pipe = make_pipeline(mapper, model) pipe.fit(X_train, y_train) with open("pickles/pipe.pkl", "wb") as f:
test_data = pd.read_csv('big_mart_test.csv')

"""Deal With Missing Data

The missingno library provides a neat way to showcase which variables have
missing data. This is done below using a bar chart. I will then fill the two
columns that have missing data: Item_Weight with the pandas fillna method
(using the column mean) and Outlet_Size with a CategoricalImputer.
"""

msno.bar(train_data)
msno.bar(test_data)

train_data['Item_Weight'].fillna(train_data['Item_Weight'].mean(), inplace=True)
test_data['Item_Weight'].fillna(test_data['Item_Weight'].mean(), inplace=True)

outlet_size_tr = train_data['Outlet_Size']
outlet_size_ts = test_data['Outlet_Size']
imputer1 = CategoricalImputer()
outlet_size_tr = imputer1.fit_transform(outlet_size_tr)
outlet_size_ts = imputer1.fit_transform(outlet_size_ts)

train_data = train_data.drop(['Outlet_Size'], axis=1)
train_data.insert(8, 'Outlet_Size', outlet_size_tr)
test_data = test_data.drop(['Outlet_Size'], axis=1)
test_data.insert(8, 'Outlet_Size', outlet_size_ts)

# Let's see if there are any columns we can drop
cor = train_data.corr()
cor["Item_Outlet_Sales"].sort_values(ascending=False)
# The year that an outlet was established has a very low correlation figure
def encodeCategoricalValuesPrediction(self, data):
    """
    Method Name: encodeCategoricalValuesPrediction
    Description: This method encodes all the categorical values in the prediction set.
    Output: A Dataframe which has all the categorical values encoded.
    On Failure: Raise Exception

    Written By: Ajinkya Abhang
    Version: 1.0
    Revisions: None
    """
    # Impute the categorical columns that contain missing values
    features_nan = [feature for feature in data.columns
                    if data[feature].isnull().sum() > 0 and data[feature].dtypes == 'O']
    imputer = CategoricalImputer()
    if len(features_nan) != 0:
        for cat_feature in features_nan:
            data[cat_feature] = imputer.fit_transform(data[cat_feature])

    # Impute the non-categorical columns with a KNN imputer
    numerical_with_nan = [feature for feature in data.columns
                          if data[feature].isnull().sum() > 1 and data[feature].dtypes != 'O']
    if len(numerical_with_nan) != 0:
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        data[numerical_with_nan] = imputer.fit_transform(data[numerical_with_nan])

    # Manually one-hot encode laundry_options and parking_options into the
    # dummy columns expected by the model; values not listed below stay NaN.
    df_new = pd.DataFrame({col: [np.nan] * data.shape[0] for col in [
        'laundry_options_1', 'laundry_options_2', 'laundry_options_3', 'laundry_options_4',
        'parking_options_1', 'parking_options_2', 'parking_options_3',
        'parking_options_4', 'parking_options_5', 'parking_options_6']})
    dat = pd.concat([data, df_new], axis=1)

    laundry_dummies = {
        'w/d in unit':        (0, 0, 0, 1),
        'w/d hookups':        (0, 0, 1, 0),
        'laundry on site':    (1, 0, 0, 0),
        'no laundry on site': (0, 1, 0, 0),
        'laundry in bldg':    (0, 0, 0, 0),
    }
    parking_dummies = {
        'carport':            (1, 0, 0, 0, 0, 0),
        'detached garage':    (0, 1, 0, 0, 0, 0),
        'no parking':         (0, 0, 1, 0, 0, 0),
        'off-street parking': (0, 0, 0, 1, 0, 0),
        'street parking':     (0, 0, 0, 0, 1, 0),
        'valet parking':      (0, 0, 0, 0, 0, 1),
        'attached garage':    (0, 0, 0, 0, 0, 0),
    }
    laundry_cols = ['laundry_options_1', 'laundry_options_2', 'laundry_options_3', 'laundry_options_4']
    parking_cols = ['parking_options_1', 'parking_options_2', 'parking_options_3',
                    'parking_options_4', 'parking_options_5', 'parking_options_6']
    for i in range(data.shape[0]):
        if dat['laundry_options'][i] in laundry_dummies:
            dat.loc[i, laundry_cols] = laundry_dummies[dat['laundry_options'][i]]
        if dat['parking_options'][i] in parking_dummies:
            dat.loc[i, parking_cols] = parking_dummies[dat['parking_options'][i]]

    dat.drop(['laundry_options', 'parking_options'], axis=1, inplace=True)
    return dat
def fit(self, X, y=None):
    # Note: the imputer is only instantiated here; it is not fitted to X
    # at this point.
    self.imputer = CategoricalImputer()
    return self
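# A hedged sketch (not the original author's code) of how such a transformer
# is often completed: fit one CategoricalImputer per column in fit() and
# reuse the fitted imputers in transform(), so unseen data is filled with the
# categories learned from the training data. The class name and the `columns`
# parameter are hypothetical.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn_pandas import CategoricalImputer

class PerColumnCategoricalImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # fit one imputer per configured column on the training data
        self.imputers_ = {col: CategoricalImputer().fit(X[col]) for col in self.columns}
        return self

    def transform(self, X):
        X = X.copy()
        for col, imputer in self.imputers_.items():
            X[col] = imputer.transform(X[col])
        return X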
le.fit(df['name'])
df['model'] = le.transform(df['name'])

# Train Test Split
target = 'price'
y = df[target]
X = df.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# DataFrame Mapper
mapper = DataFrameMapper(
    [
        # ('region', LabelBinarizer()),
        (['year'], StandardScaler()),
        # ('manufacturer', [CategoricalImputer(), LabelBinarizer()]),
        ('model', [CategoricalImputer()]),
        ('cylinders', [CategoricalImputer(), LabelBinarizer()]),
        ('fuel', [CategoricalImputer(), LabelBinarizer()]),
        (['odometer'], [SimpleImputer(), StandardScaler()]),
        # ('title_status', [CategoricalImputer(), LabelBinarizer()]),
        ('transmission', [CategoricalImputer(), LabelBinarizer()]),
        # (['vin'], StandardScaler()),
        # ('type', [CategoricalImputer(), LabelBinarizer()]),
        ('paint_color', [CategoricalImputer(), LabelBinarizer()]),
        ('condition', [CategoricalImputer(), LabelBinarizer()]),
    ], df_out=True)
def doprediction():
    info = request.data
    json_data = json.loads(info)
    meldrange = json_data["meldrange"]
    meldrange = float(meldrange)
    donor_data = json_data["donor"]
    dolen = len(donor_data)
    allrecip_data = json_data["allrecip"]
    allrecip_len = len(allrecip_data)

    donor_df = pd.DataFrame(data=donor_data[1:dolen], columns=donor_data[0])
    allrecip_df = pd.DataFrame(data=allrecip_data[1:allrecip_len], columns=allrecip_data[0])

    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    silentremove(filename)
    silentremove(filename2)
    donor_df.to_csv(filename, encoding='utf-8')
    allrecip_df.to_csv(filename2, encoding='utf-8')

    # start to impute --------------------------------------
    donor_df = pd.read_csv('datafile/donorfile.csv', index_col=0)
    recipient_df = pd.read_csv('datafile/recipfile.csv', index_col=0)
    id_df = pd.DataFrame(recipient_df[['recipient_id', 'FINAL_MELD_PELD_LAB_SCORE']])

    # Split categorical and numeric features for the donor and recipient frames
    X_cf_r = recipient_df.select_dtypes(include=['object'])
    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])
    X_cf_d = donor_df.select_dtypes(include=['object'])
    X_ncf_d = donor_df.select_dtypes(exclude=['object'])

    # Impute categorical features with the most frequent value
    imp_cat = CategoricalImputer()
    X_cf_r = pd.DataFrame(imp_cat.fit_transform(np.array(X_cf_r)), columns=X_cf_r.columns)
    imp_cat = CategoricalImputer()
    X_cf_d = pd.DataFrame(imp_cat.fit_transform(np.array(X_cf_d)), columns=X_cf_d.columns)

    # Impute numeric features with the mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_ncf_r)
    X_ncf_r = pd.DataFrame(imp.transform(X_ncf_r), columns=X_ncf_r.columns)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_ncf_d)
    X_ncf_d = pd.DataFrame(imp.transform(X_ncf_d), columns=X_ncf_d.columns)

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    # donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    if meldrange != 200:
        id_df = id_df.loc[(id_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) &
                          (id_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20)]
        recipient_df = recipient_df.loc[
            (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) &
            (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20.0)]

    X_cf_r = recipient_df.select_dtypes(include=['object'])
    X_ncf_r = recipient_df.select_dtypes(exclude=['object'])

    # Min-max scale the numeric features
    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_d.columns
    X_ncf_d = min_max_scaler.fit_transform(X_ncf_d)
    X_ncf_d = pd.DataFrame(X_ncf_d, columns=header)

    min_max_scaler = preprocessing.MinMaxScaler()
    header = X_ncf_r.columns
    X_ncf_r = min_max_scaler.fit_transform(X_ncf_r)
    X_ncf_r = pd.DataFrame(X_ncf_r, columns=header)
    X_ncf_r.index = X_cf_r.index

    recipient_df = pd.merge(X_ncf_r, X_cf_r, left_index=True, right_index=True)
    print("recipdf", recipient_df)
    donor_df = pd.merge(X_ncf_d, X_cf_d, left_index=True, right_index=True)

    filename = 'datafile/donorfile.csv'
    filename2 = 'datafile/recipfile.csv'
    filename3 = 'datafile/recipidfile.csv'
    silentremove(filename)
    silentremove(filename2)
    silentremove(filename3)
    donor_df.to_csv(filename, encoding='utf-8')
    print("meldrange", meldrange)

    # if meldrange != 200:
    #     id_df = id_df.loc[(id_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) & (id_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20)]
    #     recipient_df = recipient_df.loc[(recipient_df['FINAL_MELD_PELD_LAB_SCORE'] < meldrange) & (recipient_df['FINAL_MELD_PELD_LAB_SCORE'] >= meldrange - 20.0)]

    id_df = pd.DataFrame(id_df['recipient_id'], columns=['recipient_id'])
    recipient_df.to_csv(filename2, encoding='utf-8')
    id_df.to_csv(filename3, encoding='utf-8')

    import prediction
    match_score = prediction.matching()
    predict_score = prediction.predictscore()
    return json.dumps({'match': match_score, 'predict': predict_score})
# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], Imputer(strategy="median"))
     for numeric_feature in non_categorical_columns],
    input_df=True, df_out=True)

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer())
     for category_feature in categorical_columns],
    input_df=True, df_out=True)

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])

# Create full pipeline
pipeline = Pipeline([("featureunion", numeric_categorical_union),
# # Missing values
# =============================================================================
data_missing = dataset.isnull().sum()
print(data_missing)

# Numpy array for imputing missing values
X = dataset.iloc[:, :-1].values

# =============================================================================
# # Missing Categorical Values
# =============================================================================
from sklearn_pandas import CategoricalImputer
data = np.array(X[:, 8], dtype=object)
imputer = CategoricalImputer()
X[:, 8] = imputer.fit_transform(data)
dataset['Outlet_Size'] = X[:, 8]

# =============================================================================
# # Imputer for numeric values
# =============================================================================
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X[:, 1:2] = imputer.fit_transform(X[:, 1:2])
dataset['Item_Weight'] = X[:, 1:2]

# Check values in Item_Visibility
dataset.Item_Visibility.value_counts()

# Replace 0 with NaN
def test_invalid_strategy():
    """
    Raise an error if an invalid strategy is entered
    """
    with pytest.raises(ValueError):
        CategoricalImputer(strategy="not_a_supported_strategy")
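# Hedged illustration of the behaviour exercised by the tests above: the
# strategies used elsewhere in this suite ('most_frequent' and 'constant')
# are accepted, while an unknown strategy name raises ValueError as soon as
# the imputer is constructed, as asserted in test_invalid_strategy.
import numpy as np
from sklearn_pandas import CategoricalImputer

sample = np.asarray(['a', np.nan, 'b', 'b'], dtype=object)  # hypothetical data
for strategy in ('most_frequent', 'constant'):
    print(strategy, CategoricalImputer(strategy=strategy).fit_transform(sample))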
]])

y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

train = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))
target = 'goals_next'
X_train = train.drop(target, axis=1)
y_train = train[target]

mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]),
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]),
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

model = LinearRegression()
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)
score = pipe.score(X_train, y_train)

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)
def impute_categorical_features(df, features):
    feature_defs = []
    for col_name in features:
        feature_defs.append((col_name, CategoricalImputer()))

    mapper = DataFrameMapper(feature_defs, input_df=True, df_out=True)
    df[features] = mapper.fit_transform(df[features])
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_unwanted_spaces(self, data):
        """
        Method Name: remove_unwanted_spaces
        Description: This method removes the unwanted spaces from a pandas dataframe.
        Output: A pandas DataFrame after removing the spaces.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the remove_unwanted_spaces method of the Preprocessor class')
        self.data = data
        try:
            # strip leading/trailing spaces from every object (string) column
            self.df_without_spaces = self.data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
            self.logger_object.log(self.file_object, 'Unwanted spaces removal Successful. Exited the remove_unwanted_spaces method of the Preprocessor class')
            return self.df_without_spaces
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in remove_unwanted_spaces method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Unwanted space removal Unsuccessful. Exited the remove_unwanted_spaces method of the Preprocessor class')
            raise Exception()

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: This method removes the given columns from a pandas dataframe.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(labels=self.columns, axis=1)  # drop the labels specified in the columns
            self.logger_object.log(self.file_object, 'Column removal Successful. Exited the remove_columns method of the Preprocessor class')
            return self.useful_data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in remove_columns method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class')
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: This method separates the features and a Label column.
        Output: Returns two separate Dataframes, one containing features and the other containing Labels.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X = data.drop(labels=label_column_name, axis=1)  # drop the label column and keep the feature columns
            self.Y = data[label_column_name]  # filter the label column
            self.logger_object.log(self.file_object, 'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class')
            raise Exception()

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present in the pandas Dataframe or not.
        Output: Returns True if null values are present in the DataFrame, False if they are not present,
                and returns the list of columns for which null values are present.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if self.null_present:  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv')  # storing the null column information to file
            self.logger_object.log(self.file_object, 'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in is_null_present method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Finding missing values failed. Exited the is_null_present method of the Preprocessor class')
            raise Exception()

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: This method replaces all the missing values in the Dataframe using a CategoricalImputer,
                     falling back to the column mode where the imputer fails.
        Output: A Dataframe which has all the missing values imputed.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                try:
                    self.data[col] = self.imputer.fit_transform(self.data[col])
                except:
                    self.data[col] = data[col].fillna(data[col].mode()[0])
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise Exception()

    def scale_numerical_columns(self, data):
        """
        Method Name: scale_numerical_columns
        Description: This method scales the numerical values using the Standard scaler.
        Output: A dataframe with scaled values
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the scale_numerical_columns method of the Preprocessor class')
        self.data = data
        self.num_df = self.data[['months_as_customer', 'policy_deductable', 'umbrella_limit',
                                 'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
                                 'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
                                 'injury_claim', 'property_claim', 'vehicle_claim']]
        try:
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(self.num_df)
            self.scaled_num_df = pd.DataFrame(data=self.scaled_data, columns=self.num_df.columns, index=self.data.index)
            self.data.drop(columns=self.scaled_num_df.columns, inplace=True)
            self.data = pd.concat([self.scaled_num_df, self.data], axis=1)
            self.logger_object.log(self.file_object, 'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in scale_numerical_columns method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'scaling for numerical columns Failed. Exited the scale_numerical_columns method of the Preprocessor class')
            raise Exception()

    def encode_categorical_columns(self, data):
        """
        Method Name: encode_categorical_columns
        Description: This method encodes the categorical values to numeric values.
        Output: dataframe with categorical values converted to numerical values
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the encode_categorical_columns method of the Preprocessor class')
        self.data = data
        try:
            self.cat_df = self.data.select_dtypes(include=['object']).copy()
            # Ordinal / binary mappings
            self.cat_df['policy_csl'] = self.cat_df['policy_csl'].map({'100/300': 1, '250/500': 2.5, '500/1000': 5})
            self.cat_df['insured_education_level'] = self.cat_df['insured_education_level'].map(
                {'JD': 1, 'High School': 2, 'College': 3, 'Masters': 4, 'Associate': 5, 'MD': 6, 'PhD': 7})
            self.cat_df['incident_severity'] = self.cat_df['incident_severity'].map(
                {'Trivial Damage': 1, 'Minor Damage': 2, 'Major Damage': 3, 'Total Loss': 4})
            self.cat_df['insured_sex'] = self.cat_df['insured_sex'].map({'FEMALE': 0, 'MALE': 1})
            self.cat_df['property_damage'] = self.cat_df['property_damage'].map({'NO': 0, 'YES': 1})
            self.cat_df['police_report_available'] = self.cat_df['police_report_available'].map({'NO': 0, 'YES': 1})
            try:
                # code block for training
                self.cat_df['fraud_reported'] = self.cat_df['fraud_reported'].map({'N': 0, 'Y': 1})
                self.cols_to_drop = ['policy_csl', 'insured_education_level', 'incident_severity',
                                     'insured_sex', 'property_damage', 'police_report_available', 'fraud_reported']
            except:
                # code block for Prediction
                self.cols_to_drop = ['policy_csl', 'insured_education_level', 'incident_severity',
                                     'insured_sex', 'property_damage', 'police_report_available']
            # Using dummy encoding to encode the remaining categorical columns to numerical ones
            for col in self.cat_df.drop(columns=self.cols_to_drop).columns:
                self.cat_df = pd.get_dummies(self.cat_df, columns=[col], prefix=[col], drop_first=True)
            self.data.drop(columns=self.data.select_dtypes(include=['object']).columns, inplace=True)
            self.data = pd.concat([self.cat_df, self.data], axis=1)
            self.logger_object.log(self.file_object, 'encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in encode_categorical_columns method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'encoding for categorical columns Failed. Exited the encode_categorical_columns method of the Preprocessor class')
            raise Exception()

    def handle_imbalanced_dataset(self, x, y):
        """
        Method Name: handle_imbalanced_dataset
        Description: This method handles the imbalanced dataset to make it a balanced one.
        Output: new balanced feature and target columns
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the handle_imbalanced_dataset method of the Preprocessor class')
        try:
            self.rdsmple = RandomOverSampler()
            self.x_sampled, self.y_sampled = self.rdsmple.fit_sample(x, y)
            self.logger_object.log(self.file_object, 'dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class')
            return self.x_sampled, self.y_sampled
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in handle_imbalanced_dataset method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'dataset balancing Failed. Exited the handle_imbalanced_dataset method of the Preprocessor class')
            raise Exception()
)

# creation of data frames from csv
titanic_train = pd.read_csv("Titanic_train.csv")
print(titanic_train.info())

# preprocessing stage
# impute missing values for continuous features
imputable_cont_features = ['Age', 'Fare']
cont_imputer = preprocessing.Imputer()
cont_imputer.fit(titanic_train[imputable_cont_features])
print(cont_imputer.statistics_)
titanic_train[imputable_cont_features] = cont_imputer.transform(titanic_train[imputable_cont_features])

# impute missing values for categorical features
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

le_embarked = preprocessing.LabelEncoder()
le_embarked.fit(titanic_train['Embarked'])
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

le_pclass = preprocessing.LabelEncoder()
def processData():
    catFeatures = [
        'GENDER', 'ABO', 'LIFE_SUP_TCR', 'MALIG_TCR', 'EXC_HCC', 'EXC_CASE', 'PERM_STATE',
        'PREV_AB_SURG_TCR', 'BACT_PERIT_TCR', 'PORTAL_VEIN_TCR', 'TIPSS_TCR', 'WORK_INCOME_TCR',
        'INIT_DIALYSIS_PRIOR_WEEK', 'INIT_MELD_OR_PELD', 'FINAL_DIALYSIS_PRIOR_WEEK',
        'FINAL_MELD_OR_PELD', 'PERM_STATE_TRR', 'WORK_INCOME_TRR', 'MALIG_TRR', 'LIFE_SUP_TRR',
        'PORTAL_VEIN_TRR', 'PREV_AB_SURG_TRR', 'TIPSS_TRR', 'HBV_CORE', 'HBV_SUR_ANTIGEN',
        'HCV_SEROSTATUS', 'EBV_SEROSTATUS', 'HIV_SEROSTATUS', 'CMV_STATUS', 'CMV_IGG', 'CMV_IGM',
        'TXLIV', 'PREV_TX', 'DDAVP_DON', 'CMV_DON', 'HEP_C_ANTI_DON', 'HBV_CORE_DON',
        'HBV_SUR_ANTIGEN_DON', 'DON_TY', 'GENDER_DON', 'HOME_STATE_DON', 'NON_HRT_DON',
        'ANTIHYPE_DON', 'PT_DIURETICS_DON', 'PT_STEROIDS_DON', 'PT_T3_DON', 'PT_T4_DON',
        'VASODIL_DON', 'VDRL_DON', 'CLIN_INFECT_DON', 'EXTRACRANIAL_CANCER_DON', 'HIST_CIG_DON',
        'HIST_COCAINE_DON', 'DIABETES_DON', 'HIST_HYPERTENS_DON', 'HIST_OTH_DRUG_DON', 'ABO_DON',
        'INTRACRANIAL_CANCER_DON', 'SKIN_CANCER_DON', 'HIST_CANCER_DON', 'PT_OTH_DON',
        'HEPARIN_DON', 'ARGININE_DON', 'INSULIN_DON', 'DIAL_TX', 'ABO_MAT', 'AGE_GROUP', 'MALIG',
        'RECOV_OUT_US', 'TATTOOS', 'LI_BIOPSY', 'PROTEIN_URINE', 'CARDARREST_NEURO',
        'INOTROP_SUPPORT_DON', 'CDC_RISK_HIV_DON', 'HISTORY_MI_DON', 'CORONARY_ANGIO_DON',
        'LT_ONE_WEEK_DON'
    ]
    numFeatures = [
        'WGT_KG_DON_CALC', 'INIT_INR', 'ETHCAT_DON', 'ETHNICITY', 'DGN_TCR', 'REM_CD', 'INIT_AGE',
        'ALBUMIN_TX', 'BMI_DON_CALC', 'EXC_EVER', 'OTH_LIFE_SUP_TCR', 'FINAL_ASCITES',
        'WGT_KG_CALC', 'END_BMI_CALC', 'LISTYR', 'DDR1', 'FINAL_ALBUMIN', 'DB2', 'INIT_BMI_CALC',
        'CITIZENSHIP', 'DB1', 'EDUCATION', 'DAYSWAIT_CHRON', 'OTH_LIFE_SUP_TRR', 'MED_COND_TRR',
        'INIT_WGT_KG', 'MELD_PELD_LAB_SCORE', 'NUM_PREV_TX', 'INIT_SERUM_SODIUM', 'VENTILATOR_TCR',
        'TX_PROCEDUR_TY', 'LITYP', 'INIT_SERUM_CREAT', 'WGT_KG_TCR', 'TBILI_DON', 'HGT_CM_CALC',
        'SGOT_DON', 'ASCITES_TX', 'INIT_MELD_PELD_LAB_SCORE', 'ECD_DONOR', 'CREAT_TX',
        'INIT_ENCEPH', 'INIT_HGT_CM', 'PRI_PAYMENT_TRR', 'INIT_STAT', 'ARTIFICIAL_LI_TCR',
        'PT_CODE', 'WL_ID_CODE', 'INIT_ALBUMIN', 'ARTIFICIAL_LI_TRR', 'AGE_DON', 'ON_VENT_TRR',
        'PRI_PAYMENT_TCR', 'BLOOD_INF_DON', 'CREAT_DON', 'REGION', 'INIT_ASCITES',
        'HEMATOCRIT_DON', 'DIAB', 'TBILI_TX', 'FINAL_INR', 'AGE', 'FUNC_STAT_TRR', 'ETHCAT',
        'CITIZENSHIP_DON', 'DEATH_MECH_DON', 'FUNC_STAT_TCR', 'FINAL_SERUM_SODIUM', 'COD_CAD_DON',
        'FINAL_BILIRUBIN', 'BUN_DON', 'END_STAT', 'BMI_CALC', 'DDR2', 'FINAL_SERUM_CREAT',
        'HIST_DIABETES_DON', 'ENCEPH_TX', 'SHARE_TY', 'DA1', 'PH_DON', 'FINAL_MELD_PELD_LAB_SCORE',
        'BMI_TCR', 'INIT_BILIRUBIN', 'DISTANCE', 'SGPT_DON', 'PULM_INF_DON', 'HGT_CM_TCR',
        'TRANSFUS_TERM_DON', 'FINAL_ENCEPH', 'DIAG', 'DA2', 'HGT_CM_DON_CALC', 'URINE_INF_DON',
        'COLD_ISCH', 'INR_TX', 'DEATH_CIRCUM_DON', 'CANCER_SITE_DON'
    ]

    # Categorical pipeline
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(catFeatures)),
        ('imputer', CategoricalImputer()),
        ('cat_encoder', CategoricalEncoder("onehot-dense", handle_unknown='ignore')),
    ])

    # Numerical pipeline
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numFeatures)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    # Full pipeline
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

    train = pd.read_csv("train.csv")
    X_train = full_pipeline.fit_transform(train.loc[:, catFeatures + numFeatures])
    gstatusSixMonths_train = train["GSTATUS_SIX_MONTHS"].values
    gstatusOneYear_train = train["GSTATUS_ONE_YEAR"].values
    gstatusThreeYears_train = train["GSTATUS_THREE_YEARS"].values
    gstatus_train = train["GSTATUS"].values
    gtime_train = train["GTIME"].values
    Y_train = np.array([[gstatus_train[i], gtime_train[i]]
                        for i in range(len(gtime_train))])  # [is_not_censored, survival time]

    test = pd.read_csv("test.csv")
    X_test = full_pipeline.transform(test.loc[:, catFeatures + numFeatures])
    gstatusSixMonths_test = test["GSTATUS_SIX_MONTHS"].values
    gstatusOneYear_test = test["GSTATUS_ONE_YEAR"].values
    gstatusThreeYears_test = test["GSTATUS_THREE_YEARS"].values
    gstatus_test = test["GSTATUS"].values
    gtime_test = test["GTIME"].values
    Y_test = np.array([[gstatus_test[i], gtime_test[i]]
                       for i in range(len(gtime_test))])  # [is_not_censored, survival time]

    return X_train, Y_train, X_test, Y_test
class Preprocessor:
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def replaceInvalidValuesWithNull(self, data):
        # replace '?' placeholders with NaN so they can be imputed later
        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.NaN)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present in the pandas Dataframe or not.
        Output: Returns True if null values are present in the DataFrame, False if they are not present,
                and returns the list of columns for which null values are present.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if self.null_present:  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv')  # storing the null column information to file
            self.logger_object.log(self.file_object, 'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in is_null_present method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Finding missing values failed. Exited the is_null_present method of the Preprocessor class')
            raise Exception()

    def impute_missing_values(self, data, cols_with_missing_values):
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            # impute each column that has missing values with its most frequent category
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        self.logger_object.log(self.file_object, 'Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X = data.drop(labels=label_column_name, axis=1)
            self.Y = data[label_column_name]
            self.logger_object.log(self.file_object, 'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class')
            raise Exception()
def transform(self, X):
    # Note: a new CategoricalImputer is fitted on every dataset passed to
    # transform, so the fill value is re-learned from that dataset rather
    # than from the training data.
    for var in config.CAT_FEATURES:
        imputer = CategoricalImputer()
        X[var] = imputer.fit_transform(X[var])
    return X
# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], Imputer(strategy="median"))
     for numeric_feature in non_categorical_columns],
    input_df=True,
    df_out=True
)

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer())
     for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)

## Kidney disease case study II: Feature Union

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])
def test_missing_replacement():
    """
    Raise error if no replacement value specified and strategy='fixed_value'
    """
    with pytest.raises(ValueError):
        CategoricalImputer(strategy="fixed_value")
class Preprocessor:
    """
    This class shall be used to clean and transform the data before training.

    Written By: Arpit Kumar
    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object

    def remove_columns(self, data, columns):
        """
        Method Name: remove_columns
        Description: This method removes the given columns from a pandas dataframe.
        Output: A pandas DataFrame after removing the specified columns.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(labels=self.columns, axis=1)  # drop the labels specified in the columns
            self.logger_object.log(self.file_object, 'Column removal Successful. Exited the remove_columns method of the Preprocessor class')
            return self.useful_data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in remove_columns method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class')
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
        Method Name: separate_label_feature
        Description: This method separates the features and a Label column.
        Output: Returns two separate Dataframes, one containing features and the other containing Labels.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X = data.drop(labels=label_column_name, axis=1)  # drop the label column and keep the feature columns
            self.Y = data[label_column_name]  # filter the label column
            self.logger_object.log(self.file_object, 'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X, self.Y
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class')
            raise Exception()

    def dropUnnecessaryColumns(self, data, columnNameList):
        """
        Method Name: dropUnnecessaryColumns
        Description: This method drops the unwanted columns as discussed in EDA section.

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        data = data.drop(columnNameList, axis=1)
        return data

    def replaceInvalidValuesWithNull(self, data):
        """
        Method Name: replaceInvalidValuesWithNull
        Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self, data):
        """
        Method Name: is_null_present
        Description: This method checks whether there are null values present in the pandas Dataframe or not.
        Output: Returns True if null values are present in the DataFrame, False if they are not present,
                and returns the list of columns for which null values are present.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum()  # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i] > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(self.cols[i])
            if self.null_present:  # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv')  # storing the null column information to file
            self.logger_object.log(self.file_object, 'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in is_null_present method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Finding missing values failed. Exited the is_null_present method of the Preprocessor class')
            raise Exception()

    def encodeCategoricalValues(self, data):
        """
        Method Name: encodeCategoricalValues
        Description: This method encodes all the categorical values in the training set.
        Output: A Dataframe which has all the categorical values encoded.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        data["class"] = data["class"].map({'p': 1, 'e': 2})
        for column in data.drop(['class'], axis=1).columns:
            data = pd.get_dummies(data, columns=[column])
        return data

    def encodeCategoricalValuesPrediction(self, data):
        """
        Method Name: encodeCategoricalValuesPrediction
        Description: This method encodes all the categorical values in the prediction set.
        Output: A Dataframe which has all the categorical values encoded.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        for column in data.columns:
            data = pd.get_dummies(data, columns=[column])
        return data

    # def handleImbalanceDataset(self, X, Y):
    #     """
    #     Method Name: handleImbalanceDataset
    #     Description: This method handles the imbalance in the dataset by oversampling.
    #     Output: A Dataframe which is balanced now.
    #     On Failure: Raise Exception
    #
    #     Written By: Arpit Kumar
    #     Version: 1.0
    #     Revisions: None
    #     """
    #     rdsmple = RandomOverSampler()
    #     x_sampled, y_sampled = rdsmple.fit_sample(X, Y)
    #     return x_sampled, y_sampled

    def impute_missing_values(self, data, cols_with_missing_values):
        """
        Method Name: impute_missing_values
        Description: This method replaces all the missing values in the Dataframe using a CategoricalImputer.
        Output: A Dataframe which has all the missing values imputed.
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data = data
        self.cols_with_missing_values = cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise Exception()

    def get_columns_with_zero_std_deviation(self, data):
        """
        Method Name: get_columns_with_zero_std_deviation
        Description: This method finds out the columns which have a standard deviation of zero.
        Output: List of the columns with standard deviation of zero
        On Failure: Raise Exception

        Written By: Arpit Kumar
        Version: 1.0
        Revisions: None
        """
        self.logger_object.log(self.file_object, 'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class')
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            for x in self.columns:
                if self.data_n[x]['std'] == 0:  # check if standard deviation is zero
                    self.col_to_drop.append(x)  # prepare the list of columns with standard deviation zero
            self.logger_object.log(self.file_object, 'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class')
            return self.col_to_drop
        except Exception as e:
            self.logger_object.log(self.file_object, 'Exception occurred in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message: ' + str(e))
            self.logger_object.log(self.file_object, 'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class')
            raise Exception()
Y = df.groupby('id')[Y_COLUMNS].shift(-1)
X = df[X_COLUMNS]
X = X[~pd.isnull(Y).any(axis=1)]
Y = Y.dropna()
Y = Y.reset_index(drop=True)
X = X.reset_index(drop=True)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

mapper = DataFrameMapper(
    [(['age'], [SimpleImputer(), PolynomialFeatures(include_bias=False)]),
     (['position'], [CategoricalImputer(), LabelBinarizer()]),
     (['goals'], SimpleImputer()),
     (['assists'], SimpleImputer()),
     (['plus_minus'], SimpleImputer()),
     (['shots_on_goal'], SimpleImputer()),
     (['blocks'], SimpleImputer()),
     (['hits'], SimpleImputer())],
    df_out=False)

Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

from sklearn.linear_model import LinearRegression
multi_model = MultiOutputRegressor(LinearRegression())
multi_model.fit(Z_train, Y_train)

pd.DataFrame(Z_test).iloc[:, 8:].head()
pd.DataFrame(multi_model.predict(Z_test)).head()
multi_model.score(Z_test, Y_test)
def impute_categorical_features(df, features):
    # impute missing values for categorical features
    cat_imputer = CategoricalImputer()
    cat_imputer.fit(df[features])
    print(cat_imputer.fill_)
    df[features] = cat_imputer.transform(df[features])
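# A hedged per-column variant of the helper above, in case the imputer is
# meant to be applied one column at a time as in the other snippets in this
# section; the function name is hypothetical.
from sklearn_pandas import CategoricalImputer

def impute_categorical_features_per_column(df, features):
    for col in features:
        cat_imputer = CategoricalImputer()
        df[col] = cat_imputer.fit_transform(df[col])
        print(col, cat_imputer.fill_)
    return df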
le.fit(X_train['name'])
le.transform(X_train['name'])
le.fit(X_train['location'])
le.transform(X_train['location'])

# In[ ]:
mapper = DataFrameMapper([
    (['name'], [LabelBinarizer()]),
    (['location'], [LabelBinarizer()]),
    (['year'], [StandardScaler()]),
    (['kilometers_driven'], [SimpleImputer(), StandardScaler()]),
    (['fuel_type'], [CategoricalImputer(), LabelBinarizer()]),
    (['transmission'], [CategoricalImputer(), LabelBinarizer()]),
    (['owner_type'], [SimpleImputer(), StandardScaler()]),
    (['seats'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

# In[ ]:
Z_train = mapper.fit_transform(X_train)

# In[ ]:
Z_test = mapper.transform(X_test)