def run():
    """Preprocess the ``training1`` X/Y CSV files and persist the results.

    Steps, in order:
      1. Read ``training1_X.csv`` and ``training1_Y.csv`` from ``ROOT`` and
         pass both frames through ``removeDates``.
      2. X: ``gPre.impute`` -> ``gPre.fillMissingFinal(value=0)`` ->
         ``gPre.scale``.
      3. Y: ``gPre.impute`` -> ``gPre.fillMissingFinal(value=0)`` ->
         ``gPre.percentageChangeToCtg`` with ``yCtgArr``.
      4. Drop ``trimCount`` rows from both ends of each frame and write the
         remainder to ``ROOT/preprocessed/``.
      5. Dump the objects returned by ``gPre.impute`` and ``gPre.scale`` under
         ``ROOT/results/preprocess/``.

    NOTE(review): ``removeDates``, ``gPre.impute``, ``gPre.fillMissingFinal``
    and ``gPre.scale`` are called without using a returned frame, so they
    presumably modify the frame in place -- confirm against their definitions.
    """
    features_all = pd.read_csv(ROOT + '/training1_X.csv')
    targets_all = pd.read_csv(ROOT + '/training1_Y.csv')

    # Row count is taken from X (before any preprocessing) and reused to trim
    # both frames with the same bounds.
    row_total = features_all.shape[0]

    for frame in (features_all, targets_all):
        removeDates(frame)

    # --- features (X) ---
    feature_imputer = gPre.impute(features_all)
    # The imputer is dumped immediately, outside the guarded section below.
    dump(feature_imputer, ROOT + '/results/preprocess/x_imputer.joblib')
    gPre.fillMissingFinal(features_all, value=0)
    feature_scaler = gPre.scale(features_all)

    # --- targets (Ys) ---
    target_imputer = gPre.impute(targets_all)
    dump(target_imputer, ROOT + '/results/preprocess/ys_imputer.joblib')
    gPre.fillMissingFinal(targets_all, value=0)
    # Alternative that was tried earlier: gPre.myDiscretize(targets_all, 5)
    targets_all = gPre.percentageChangeToCtg(targets_all, yCtgArr)

    # Trim the leading/trailing "bad" rows with one shared slice.
    keep_rows = slice(trimCount, row_total - trimCount)
    features_trimmed = features_all[keep_rows]
    targets_trimmed = targets_all[keep_rows]

    # Save the preprocessed data.
    features_trimmed.to_csv(ROOT + '/preprocessed/training1_X.csv',
                            index=False)
    targets_trimmed.to_csv(ROOT + '/preprocessed/training1_Y.csv',
                           index=False)

    # Persist the scaler; a failure here is reported but not re-raised.
    try:
        dump(feature_scaler, ROOT + '/results/preprocess/x_scaler.joblib')
    except Exception as dump_error:
        print('preprocess err pt 1: dump', dump_error, sep='\n')
예제 #2
0
def run():
    """Predict every available target for ``predict_X.csv`` and save results.

    Steps, in order:
      1. Read ``ROOT/predict_X.csv``, keep its ``datetime`` column aside and
         pass the frame through ``removeDates``.
      2. Load the imputer and scaler saved under ``ROOT/results/preprocess/``
         and apply ``gPre.imputeNoFit``, ``gPre.fillMissingFinal(value=0)``
         and ``gPre.scaleNoFit`` so the input goes through the same steps as
         the training data.
      3. For each column of ``ROOT/preprocessed/training1_Y.csv`` (skipping
         ``datetime`` / ``dateObj``, at most ``maxCount`` targets), load
         ``ROOT/results/models/<target>.joblib`` and compute the predicted
         value plus the largest ``predict_proba`` entry per row.
      4. Write one CSV per target to ``ROOT/prediction/<target>.csv`` and a
         combined CSV to ``ROOT/results/all_predictions.csv``.

    Errors for a single target are printed and that target is skipped: no
    per-target CSV is written for it and it adds no columns to the combined
    file. (Previously a failed target still produced a CSV holding only the
    ``datetime`` column, which was easy to mistake for a real result.)

    NOTE(review): the ``gPre.*NoFit`` helpers and ``removeDates`` are called
    without using a return value, so they presumably modify ``x_df`` in
    place -- confirm against their definitions.
    """
    x_df = pd.read_csv(ROOT + '/predict_X.csv')
    # Grab the timestamps before removeDates touches the frame; they label
    # the rows of every output file.
    x_dfDatetime = x_df['datetime']
    removeDates(x_df)

    # Load the preprocessors persisted by the training preprocessing step.
    x_imputer = load(ROOT + '/results/preprocess/x_imputer.joblib')
    x_scaler = load(ROOT + '/results/preprocess/x_scaler.joblib')

    # Preprocess using the same sequence of steps as for the training data.
    gPre.imputeNoFit(x_df, imputer=x_imputer)
    gPre.fillMissingFinal(x_df, value=0)
    gPre.scaleNoFit(x_df, x_scaler)

    # The preprocessed training Y file tells us which targets can be
    # predicted: one model file is expected per column.
    train_ys_df = pd.read_csv(ROOT + '/preprocessed/training1_Y.csv')
    total = maxCount
    allResDf = pd.DataFrame()
    allResDf['datetime'] = x_dfDatetime

    count = 0
    for col in train_ys_df:
        # Date columns are not prediction targets.
        if col == 'datetime' or col == 'dateObj':
            continue
        # Stop once the configured number of targets has been processed.
        if count == total:
            break
        count += 1
        # Bound before the try block so the error handlers can always use it.
        yName = str(col)
        print('Predicting ' + yName + '. (' + str(count) + '/' +
              str(total) + ')')

        try:
            model = load(ROOT + '/results/models/' + yName + '.joblib')
            pred = model.predict(x_df)
            predP = model.predict_proba(x_df)
            # Confidence of the prediction: highest probability in each row.
            predPMaxs = [max(probaRow) for probaRow in predP]

            resDf = pd.DataFrame()
            resDf['datetime'] = x_dfDatetime
            resDf['predict'] = pred
            allResDf[yName + '_predict'] = pred
            resDf['predict_maxP'] = predPMaxs
            allResDf[yName + '_predict_maxP'] = predPMaxs
        except Exception as e:
            print('Predict ' + yName + ' err pt1: ')
            print(e)
            # Nothing usable was produced for this target, so do not write
            # a per-target file for it.
            continue

        try:
            resDf.to_csv(ROOT + '/prediction/' + yName + '.csv', index=False)
        except Exception as e:
            print('Predict ' + yName + ' err pt3: ')
            print(e)

    try:
        allResDf.to_csv(ROOT + '/results/all_predictions.csv', index=False)
    except Exception as e:
        print('pt4: Write allResDf err: ')
        print(e)