def run():
    """Preprocess the training data and persist both data and preprocessors.

    Reads ``training1_X.csv`` and ``training1_Y.csv`` from ``ROOT``, passes
    both frames through the ``gPre`` helpers, drops ``trimCount`` rows from
    each end, and writes the trimmed frames to ``ROOT/preprocessed``. The two
    imputers and the feature scaler are stored under
    ``ROOT/results/preprocess`` with ``dump``.
    """
    features = pd.read_csv(ROOT + '/training1_X.csv')
    targets = pd.read_csv(ROOT + '/training1_Y.csv')

    # The row count is taken from the feature frame before any preprocessing
    # and is reused below to trim both frames.
    n_rows = len(features.index)

    removeDates(features)
    removeDates(targets)

    # --- features (X) ---
    # gPre.impute hands back the imputer object, which is saved right away.
    # NOTE(review): the frame is presumably modified in place - confirm.
    feature_imputer = gPre.impute(features)
    dump(feature_imputer, ROOT + '/results/preprocess/x_imputer.joblib')
    gPre.fillMissingFinal(features, value=0)
    feature_scaler = gPre.scale(features)

    # --- targets (Ys) ---
    target_imputer = gPre.impute(targets)
    dump(target_imputer, ROOT + '/results/preprocess/ys_imputer.joblib')
    gPre.fillMissingFinal(targets, value=0)
    # Targets are mapped to categories using yCtgArr
    # (gPre.myDiscretize(ys_df_all, 5) was an earlier alternative).
    targets = gPre.percentageChangeToCtg(targets, yCtgArr)

    # Drop trimCount rows at the head and at the tail of both frames.
    keep_rows = slice(trimCount, n_rows - trimCount)
    trimmed_features = features[keep_rows]
    trimmed_targets = targets[keep_rows]

    # Save the preprocessed data.
    trimmed_features.to_csv(ROOT + '/preprocessed/training1_X.csv', index=False)
    trimmed_targets.to_csv(ROOT + '/preprocessed/training1_Y.csv', index=False)

    # Persist the scaler; a failure here is reported but not re-raised.
    try:
        dump(feature_scaler, ROOT + '/results/preprocess/x_scaler.joblib')
    except Exception as e3:
        print('preprocess err pt 1: dump')
        print(e3)
def run():
    """Predict every available target for the rows in ``predict_X.csv``.

    The features are read from ``ROOT/predict_X.csv`` and transformed with
    the imputer and scaler stored under ``ROOT/results/preprocess``. The
    target names are taken from the columns of
    ``ROOT/preprocessed/training1_Y.csv``; for each one the model at
    ``ROOT/results/models/<name>.joblib`` is loaded and used to produce the
    predicted value and the highest ``predict_proba`` value per row.

    Output:
        * ``ROOT/prediction/<name>.csv`` - one file per successfully
          predicted target (``datetime``, ``predict``, ``predict_maxP``).
        * ``ROOT/results/all_predictions.csv`` - all targets side by side.

    At most ``maxCount`` targets are processed. A target whose model cannot
    be loaded or applied is reported and skipped; no per-target file is
    written for it (previously a file holding only the ``datetime`` column
    was written in that case). Errors are printed, never re-raised.
    """
    x_df = pd.read_csv(ROOT + '/predict_X.csv')
    # Keep the timestamps before removeDates() is applied to the features.
    x_dfDatetime = x_df['datetime']
    removeDates(x_df)

    # Load the preprocessors saved by the preprocessing step.
    x_imputer = load(ROOT + '/results/preprocess/x_imputer.joblib')
    x_scaler = load(ROOT + '/results/preprocess/x_scaler.joblib')

    # Apply the already-fitted preprocessors, in the same order as training.
    gPre.imputeNoFit(x_df, imputer=x_imputer)
    gPre.fillMissingFinal(x_df, value=0)
    gPre.scaleNoFit(x_df, x_scaler)

    # The training Y file tells us which targets can be predicted.
    train_ys_df = pd.read_csv(ROOT + '/preprocessed/training1_Y.csv')
    count = 0
    total = maxCount

    allResDf = pd.DataFrame()
    allResDf['datetime'] = x_dfDatetime

    for col in train_ys_df:  # col is the Y name
        # Loop guards cannot fail, so they stay outside the try block.
        if col == 'datetime' or col == 'dateObj':
            continue
        if count == total:
            break
        count += 1

        yName = str(col)
        print('Predicting ' + yName + '. (' + str(count) + '/' + str(total) + ')')

        try:
            model = load(ROOT + '/results/models/' + yName + '.joblib')
            pred = model.predict(x_df)
            predP = model.predict_proba(x_df)
            # Highest class probability for every row.
            predPMaxs = [max(probaRow) for probaRow in predP]

            resDf = pd.DataFrame()
            resDf['datetime'] = x_dfDatetime
            resDf['predict'] = pred
            allResDf[yName + '_predict'] = pred
            resDf['predict_maxP'] = predPMaxs
            allResDf[yName + '_predict_maxP'] = predPMaxs
        except Exception as e:
            print('Predict ' + yName + ' err pt1: ')
            print(e)
            # Nothing usable was produced: skip the per-target file instead
            # of writing an incomplete one.
            continue

        try:
            resDf.to_csv(ROOT + '/prediction/' + yName + '.csv', index=False)
        except Exception as e:
            print('Predict ' + yName + ' err pt3: ')
            print(e)

    try:
        allResDf.to_csv(ROOT + '/results/all_predictions.csv', index=False)
    except Exception as e:
        print('pt4: Write allResDf err: ')
        print(e)