def PreprocessOriginalData(train, test, filename):
    """Fit the full preprocessing pipeline on the training set, apply it to
    the test set, and dump the resulting matrices to CSV.

    Pipeline (parameters fitted on train, reused on test): feature
    engineering via ``firststeps``, NA filling, empirical-Bayesian encoding
    of high-cardinality categorical columns, one-hot dummying against a
    unified header, and standardization.

    Parameters
    ----------
    train, test : objects exposing ``.X`` (DataFrame) and ``.y``.
        Deep-copied internally; the caller's objects are not mutated.
    filename : str
        Path prefix for the emitted ``*_X.csv`` / ``*_y.csv`` files.

    Returns
    -------
    tuple
        ``(mydummylist, myheader, encoding_lst)`` — the fitted dummy list,
        unified header, and encoded-column list, needed by
        ``KFoldsPreprocess`` so every CV fold produces an identically
        shaped design matrix.
    """
    print("preprocessing on whole training/testing set")
    # Work on copies so the inputs stay untouched.
    from copy import deepcopy
    train = deepcopy(train)
    test = deepcopy(test)
    test.X = pd.DataFrame.copy(test.X)

    # Feature engineering.
    train.X = firststeps(train.X)
    test.X = firststeps(test.X)

    # Fill NAs: fit the method on train, reuse it on test.
    train.X, NAmethod = PP.fillNA(train.X)
    test.X, _ = PP.fillNA(test.X, NAmethod)

    # Empirical Bayesian encoding for high-cardinality categorical columns
    # (suffix 'cat', >= 13 distinct values over train+test combined).
    # Hoisted: the original rebuilt the train+test concat once per column.
    combined = pd.concat([train.X, test.X])
    encoding_lst = [col for col in train.X.columns
                    if col.endswith('cat') and len(combined[col].unique()) >= 13]
    train.X, Encodingmethod = PP.encoding(train, encoding_lst)
    test.X, _ = PP.encoding(test, encoding_lst, param=Encodingmethod)

    # One-hot dummying: decide the dummy columns and a unified header from
    # the post-encoding combined frame so train and test share one layout.
    # (The original wrapped this section in a no-op ``if True:``.)
    combined = pd.concat([train.X, test.X], axis=0)
    mydummylist = PP.dummylist(combined)
    myheader = PP.makehead(combined, mydummylist)
    train.X = PP.dummy(PP.addhead(train.X, myheader), mydummylist)
    train.X = PP.rmhead(train.X)
    test.X = PP.dummy(PP.addhead(test.X, myheader), mydummylist)
    test.X = PP.rmhead(test.X)

    # Standardize: fit on train, apply to test.
    train.X, STDmethod = PP.standardize(train.X)
    test.X, _ = PP.standardize(test.X, STDmethod)

    print("saving %strain_X.csv (%d row * %d col)" % (filename, train.X.shape[0], train.X.shape[1]))
    train.X.to_csv(filename + "train_X.csv", index=False, float_format="%.5f")
    train.y.to_csv(filename + "train_y.csv", index=False, float_format="%.5f")
    print("saving %stest_X.csv (%d row * %d col)" % (filename, test.X.shape[0], test.X.shape[1]))
    test.X.to_csv(filename + "test_X.csv", index=False, float_format="%.5f")
    test.y.to_csv(filename + "test_y.csv", index=False, float_format="%.5f")
    return mydummylist, myheader, encoding_lst
def KFoldsPreprocess(train, test, mydummylist, myheader, encoding_lst, KFOLDS, filename):
    """Re-run the preprocessing pipeline inside each stratified CV fold and
    dump the per-fold matrices to CSV.

    Within every fold, the NA-fill, encoding, and standardization parameters
    are fitted on that fold's training cut only and then applied to its
    validation cut (avoiding leakage across the fold boundary), while the
    globally fitted ``mydummylist`` / ``myheader`` are reused so all folds
    share one column layout.

    Parameters
    ----------
    train : object exposing ``.X`` (DataFrame) and ``.y``; split into
        ``KFOLDS`` stratified folds (fixed ``random_state=10086`` for
        reproducibility).
    test : unused by this function.  # NOTE(review): kept for interface
        compatibility — confirm no caller relies on it before removing.
    mydummylist, myheader, encoding_lst :
        Configuration returned by ``PreprocessOriginalData``.
    KFOLDS : int
        Number of stratified folds.
    filename : str
        Path prefix for the emitted ``*_X_<i>.csv`` / ``*_y_<i>.csv`` files.
    """
    skf = StratifiedKFold(n_splits=KFOLDS, shuffle=True, random_state=10086)
    # Iterate the split generator directly — the original materialized it
    # into a list first for no benefit.
    for i, (train_idx, valid_idx) in enumerate(skf.split(train.X, train.y)):
        print("preprocessing on cv #%d" % i)
        train_this_cut = dataset(train.X.iloc[train_idx], train.y.iloc[train_idx])
        valid_this_cut = dataset(train.X.iloc[valid_idx], train.y.iloc[valid_idx])

        # Feature engineering on both cuts.
        train_this_cut.X = firststeps(train_this_cut.X)
        valid_this_cut.X = firststeps(valid_this_cut.X)

        # Fit fillNA / encoding / standardization on the training cut...
        train_this_cut.X, NAmethod = PP.fillNA(train_this_cut.X)
        train_this_cut.X, Encodingmethod = PP.encoding(train_this_cut, encoding_lst)
        train_this_cut.X = PP.dummy(PP.addhead(train_this_cut.X, myheader), mydummylist)
        train_this_cut.X = PP.rmhead(train_this_cut.X)
        train_this_cut.X, STDmethod = PP.standardize(train_this_cut.X)

        # ...then apply the fitted parameters to the validation cut.
        valid_this_cut.X, _ = PP.fillNA(valid_this_cut.X, NAmethod)
        valid_this_cut.X, _ = PP.encoding(valid_this_cut, encoding_lst, param=Encodingmethod)
        valid_this_cut.X = PP.dummy(PP.addhead(valid_this_cut.X, myheader), mydummylist)
        valid_this_cut.X = PP.rmhead(valid_this_cut.X)
        valid_this_cut.X, _ = PP.standardize(valid_this_cut.X, STDmethod)

        print("saving %strain_X_%d.csv (%d row * %d col)" % (filename, i, train_this_cut.X.shape[0], train_this_cut.X.shape[1]))
        train_this_cut.X.to_csv(filename + "train_X_%d.csv" % i, index=False, float_format="%.5f")
        train_this_cut.y.to_csv(filename + "train_y_%d.csv" % i, index=False, float_format="%.5f")
        print("saving %svalid_X_%d.csv (%d row * %d col)" % (filename, i, valid_this_cut.X.shape[0], valid_this_cut.X.shape[1]))
        valid_this_cut.X.to_csv(filename + "valid_X_%d.csv" % i, index=False, float_format="%.5f")
        valid_this_cut.y.to_csv(filename + "valid_y_%d.csv" % i, index=False, float_format="%.5f")