def run():
    """Preprocess the training1 X/Y CSVs and persist the data plus fitted preprocessors.

    Reads the raw training1 X and Y CSVs, imputes and scales X, imputes Y and
    maps its percentage changes to categories, trims ``trimCount`` rows from
    both ends of each frame, then writes the preprocessed CSVs and dumps the
    fitted imputers/scaler for prediction-time reuse.

    NOTE(review): relies on module-level names (pd, ROOT, trimCount, yCtgArr,
    removeDates, gPre, dump) — confirm they are defined before this is called.
    """
    x_df_all = pd.read_csv(ROOT + '/training1_X.csv')
    ys_df_all = pd.read_csv(ROOT + '/training1_Y.csv')
    dfLen = len(x_df_all.index)
    # Drop date columns in place before numeric preprocessing.
    removeDates(x_df_all)
    removeDates(ys_df_all)

    # Preprocess X: impute, zero-fill any remaining gaps, then scale.
    x_imputer = gPre.impute(x_df_all)
    gPre.fillMissingFinal(x_df_all, value=0)
    x_scaler = gPre.scale(x_df_all)

    # Preprocess Ys: impute, zero-fill, then bucketize percentage changes
    # into categories defined by yCtgArr.
    ys_imputer = gPre.impute(ys_df_all)
    gPre.fillMissingFinal(ys_df_all, value=0)
    ys_df_all = gPre.percentageChangeToCtg(ys_df_all, yCtgArr)

    # Trim rows at both ends (presumably edge rows carry incomplete
    # rolling/derived features — TODO confirm).
    x_df = x_df_all[trimCount:dfLen - trimCount]
    ys_df = ys_df_all[trimCount:dfLen - trimCount]
    # Save preprocessed data.
    x_df.to_csv(ROOT + '/preprocessed/training1_X.csv', index=False)
    ys_df.to_csv(ROOT + '/preprocessed/training1_Y.csv', index=False)
    # Persist all fitted preprocessors together, best-effort: a dump failure
    # is logged but does not abort the run (previously the imputer dumps were
    # unprotected while only the scaler dump was guarded).
    try:
        dump(x_imputer, ROOT + '/results/preprocess/x_imputer.joblib')
        dump(ys_imputer, ROOT + '/results/preprocess/ys_imputer.joblib')
        dump(x_scaler, ROOT + '/results/preprocess/x_scaler.joblib')
    except Exception as e3:
        print('preprocess err pt 1: dump')
        print(e3)
# Example #2
def run(df):
    """Run the shared preprocess pipeline on *df*, then impute it in place.

    Returns whatever ``gPre.run`` returns (presumably fitted preprocessors —
    confirm against gPre). NOTE(review): the imputer returned by
    ``gPre.impute`` is discarded here, so it cannot be reused at prediction
    time — verify that is intended.
    """
    res = gPre.run(df)
    gPre.impute(df)
    print("p1 preprocess")
    return res
# Example #3
def processY(df):
    """Apply all Y-side preprocessing (currently imputation only) to *df*.

    Returns the result of ``gPre.impute`` unchanged.
    """
    return gPre.impute(df)
def run():
    """End-to-end v2 training flow: load, preprocess, discretize Y, train per Y column.

    Reads the X / candidate-Ys / combined CSVs, preprocesses them, trims edge
    rows, writes intermediate CSVs for inspection, trains one model per usable
    Y column (up to ``maxCount``), then saves the collected results from the
    module-level ``linear_regression_results`` as line-delimited JSON.

    NOTE(review): depends on module-level names (pd, ROOT, X_CSV_FILEPATH,
    YS_CSV_FILEPATH, ALL_CSV_FILEPATH, TRAINING_RESULTS_FILEPATH, removeDates,
    processX, gPre, train, linear_regression_results) — confirm they exist.
    """
    # # v2 flow
    # train_df: temp df storing joined x_df and picked y
    # read csv to df
    #   x_df: X df
    #   ys_df: poss Ys df
    x_df_all = pd.read_csv(X_CSV_FILEPATH)
    ys_df_all = pd.read_csv(YS_CSV_FILEPATH)
    all_df_all = pd.read_csv(ALL_CSV_FILEPATH)
    # Rows trimmed from each end after preprocessing (presumably edge rows
    # carry incomplete rolling features — TODO confirm).
    trimCount = 8
    # dfLen is taken from X only; assumes ys/all frames have the same length
    # as X — TODO confirm.
    dfLen = len(x_df_all.index)
    # Drop date columns in place before numeric preprocessing.
    removeDates(x_df_all)
    removeDates(ys_df_all)

    # preprocess
    #   standardize scale
    #     x_df
    #     returned scaler keep for prediction use
    #     no scaling for y

    # scalers, ... obtained in preo=process
    preprocess_x_res = processX(x_df_all)
    # preprocess_y_res = processX(ys_df_all)
    # Y path: impute in place in two passes, then sanity-check for NaNs.
    gPre.impute(ys_df_all)
    gPre.finalImpute(ys_df_all)
    print('discretize df nan check 1:')
    print(ys_df_all.isnull().values.any())
    # print(ys_df_all.isnull().values)
    # Debug dump of the imputed Ys before discretization.
    ys_df_all.to_csv(ROOT + '/data/test/ys_df_all_imputed.csv')
    print('ys_df_all:')
    print(ys_df_all)
    # y_discretize_res = gPre.discretize(ys_df_all, 5)
    # Discretize Y values into 5 bins; result replaces ys_df_all below.
    y_discretize_res = gPre.myDiscretize(ys_df_all, 5)
    print(y_discretize_res)
    print(type(y_discretize_res))
    # ys_df_all = pd.DataFrame(data=y_discretize_res,index=ys_df_all.index, columns=ys_df_all.columns)
    # ys_df_all = y_discretize_res
    # ys_df_all = pd.DataFrame(y_discretize_res.toarray())
    # ys_df_all = pd.DataFrame(y_discretize_res, index=ys_df_all.index, columns=ys_df_all.columns)
    ys_df_all = y_discretize_res
    # NOTE(review): the next two writes serialize the same object
    # (ys_df_all is y_discretize_res at this point) to two files — likely
    # leftover debugging; confirm both files are still needed.
    ys_df_all.to_csv(ROOT + '/data/test/ys_df_all_imputed_2.csv')
    y_discretize_res.to_csv(ROOT + '/data/test/ys_df_all_imputed_3.csv')
    preprocess_all_res = processX(all_df_all)

    # Trim edge rows from all three frames using X's original length.
    x_df = x_df_all[trimCount:dfLen - trimCount]
    ys_df = ys_df_all[trimCount:dfLen - trimCount]
    all_df = all_df_all[trimCount:dfLen - trimCount]

    # inspect x_df
    x_df.to_csv(ROOT + '/data/test/x_t1_imputed.csv')
    ys_df.to_csv(ROOT + '/data/test/ys_t1_imputed.csv')
    all_df.to_csv(ROOT + '/data/test/all_t1_imputed.csv')

    # train
    #   foreach col in ys_df:
    #     the col is y picked
    #     train_df = join x_df and picked col
    #     df of x and a y picked
    #     try diff MLs on the df
    # Cap the number of Y columns trained on.
    maxCount = 200
    count = 0
    for col in ys_df:
        # train_df = pd.concat([df1, df2], axis=1)
        try:
            # Skip the date bookkeeping columns; everything else is a
            # candidate Y.
            if (col != 'datetime' and col != 'dateObj'):
                print('Using ' + str(col) + ' as Y.')
                # train_df = x_df.copy()
                # train_df['Y'] = ys_df[col]
                train(str(col), x_df, ys_df[col])
                # NOTE(review): count only advances when train() succeeds,
                # so failed columns do not consume the maxCount budget —
                # confirm that is intended.
                count += 1
        except Exception as e:
            # Best-effort: a failing column is logged and skipped.
            print(e)
        if count == maxCount:
            break

    # save results
    # linear_regression_results is a module-level accumulator (presumably
    # appended to inside train() — TODO confirm); persist it as
    # line-delimited JSON records.
    linear_regression_results_df = pd.DataFrame(linear_regression_results)
    linear_regression_results_df.to_json(TRAINING_RESULTS_FILEPATH,
                                         orient='records',
                                         lines=True)