def run():
    """Preprocess the v1 training data.

    Loads training1_X / training1_Y, strips date columns, imputes and
    scales X, imputes Y and maps its percentage changes to categories,
    trims edge rows, then persists the processed CSVs and the fitted
    preprocessors for later prediction-time reuse.

    Side effects only (reads/writes files under ROOT); returns None.
    Relies on module-level names: pd, gPre, dump, removeDates,
    trimCount, yCtgArr, ROOT — assumed defined elsewhere in this file.
    """
    x_df_all = pd.read_csv(ROOT + '/training1_X.csv')
    ys_df_all = pd.read_csv(ROOT + '/training1_Y.csv')
    dfLen = len(x_df_all.index)

    # Date columns are not features; removed in place.
    removeDates(x_df_all)
    removeDates(ys_df_all)

    # --- preprocess X: impute, backfill remaining gaps with 0, scale.
    x_imputer = gPre.impute(x_df_all)
    dump(x_imputer, ROOT + '/results/preprocess/x_imputer.joblib')
    gPre.fillMissingFinal(x_df_all, value=0)
    x_scaler = gPre.scale(x_df_all)

    # --- preprocess Ys: impute, backfill, then categorize pct changes.
    ys_imputer = gPre.impute(ys_df_all)
    dump(ys_imputer, ROOT + '/results/preprocess/ys_imputer.joblib')
    gPre.fillMissingFinal(ys_df_all, value=0)
    ys_df_all = gPre.percentageChangeToCtg(ys_df_all, yCtgArr)

    # Trim rows at both edges (presumably rolling-feature warm-up;
    # trimCount is assumed to be a module-level constant — TODO confirm).
    x_df = x_df_all[trimCount:dfLen - trimCount]
    ys_df = ys_df_all[trimCount:dfLen - trimCount]

    # Save preprocessed data.
    x_df.to_csv(ROOT + '/preprocessed/training1_X.csv', index=False)
    ys_df.to_csv(ROOT + '/preprocessed/training1_Y.csv', index=False)

    # Persist the scaler best-effort: a failed dump is reported but must
    # not abort the run (the preprocessed CSVs above are already saved).
    try:
        dump(x_scaler, ROOT + '/results/preprocess/x_scaler.joblib')
    except Exception as e3:
        print('preprocess err pt 1: dump')
        print(e3)
def run(df):
    """Preprocess an X dataframe for the p1 pipeline.

    Delegates to gPre.run for the shared pipeline, then imputes missing
    values. The fitted imputer returned by gPre.impute is deliberately
    discarded here (presumably the imputation mutates df in place —
    TODO confirm against gPre).

    Returns whatever gPre.run produced for df.
    """
    res = gPre.run(df)
    gPre.impute(df)
    print("p1 preprocess")
    return res
def processY(df):
    """Preprocess a Y dataframe.

    Currently this is just imputation; feature extraction and further
    steps would slot in here. Returns the value produced by gPre.impute.
    """
    return gPre.impute(df)
def run():
    """v2 training flow.

    Loads the X, candidate-Y, and combined CSVs, preprocesses each
    (X/all via processX; Y via impute + discretize into 5 buckets),
    trims edge rows, then trains one model per usable Y column (capped
    at maxCount) and saves the accumulated linear-regression results.

    Side effects only (reads/writes CSVs, calls train()); returns None.
    Relies on module-level names: pd, gPre, processX, train, removeDates,
    ROOT, X_CSV_FILEPATH, YS_CSV_FILEPATH, ALL_CSV_FILEPATH,
    TRAINING_RESULTS_FILEPATH, linear_regression_results.
    """
    x_df_all = pd.read_csv(X_CSV_FILEPATH)
    ys_df_all = pd.read_csv(YS_CSV_FILEPATH)
    all_df_all = pd.read_csv(ALL_CSV_FILEPATH)

    trimCount = 8  # rows dropped at each edge before training
    dfLen = len(x_df_all.index)

    removeDates(x_df_all)
    removeDates(ys_df_all)

    # X-side preprocessing; the returned result holds the scalers etc.
    # for prediction-time use. Y is imputed but never scaled.
    preprocess_x_res = processX(x_df_all)

    gPre.impute(ys_df_all)
    gPre.finalImpute(ys_df_all)
    print('discretize df nan check 1:')
    print(ys_df_all.isnull().values.any())
    # Inspection dump of the imputed Ys.
    ys_df_all.to_csv(ROOT + '/data/test/ys_df_all_imputed.csv')
    print('ys_df_all:')
    print(ys_df_all)

    # Discretize Y into 5 buckets; myDiscretize returns a dataframe that
    # replaces ys_df_all from here on.
    y_discretize_res = gPre.myDiscretize(ys_df_all, 5)
    print(y_discretize_res)
    print(type(y_discretize_res))
    ys_df_all = y_discretize_res
    ys_df_all.to_csv(ROOT + '/data/test/ys_df_all_imputed_2.csv')
    y_discretize_res.to_csv(ROOT + '/data/test/ys_df_all_imputed_3.csv')

    preprocess_all_res = processX(all_df_all)

    # Trim warm-up rows from every frame.
    # NOTE(review): uses x_df_all's length for all three frames —
    # assumes they are row-aligned; confirm upstream.
    x_df = x_df_all[trimCount:dfLen - trimCount]
    ys_df = ys_df_all[trimCount:dfLen - trimCount]
    all_df = all_df_all[trimCount:dfLen - trimCount]

    # Inspection dumps of the trimmed frames.
    x_df.to_csv(ROOT + '/data/test/x_t1_imputed.csv')
    ys_df.to_csv(ROOT + '/data/test/ys_t1_imputed.csv')
    all_df.to_csv(ROOT + '/data/test/all_t1_imputed.csv')

    # Train: for each candidate Y column, fit models on x_df joined with
    # that column, stopping after maxCount successful columns.
    maxCount = 200
    count = 0
    for col in ys_df:
        try:
            if col != 'datetime' and col != 'dateObj':
                print('Using ' + str(col) + ' as Y.')
                train(str(col), x_df, ys_df[col])
                count += 1
        except Exception as e:
            # Best-effort sweep: one failing column must not stop the rest.
            print(e)
        if count == maxCount:
            break

    # Save results accumulated by train() (presumably appended to the
    # module-level linear_regression_results list — TODO confirm).
    linear_regression_results_df = pd.DataFrame(linear_regression_results)
    linear_regression_results_df.to_json(TRAINING_RESULTS_FILEPATH, orient='records', lines=True)