Example #1
# NOTE: imports assumed from the surrounding project; `a_utils` is a
# project-local utilities module used by this snippet.
from datetime import timedelta

import a_utils


def data_gen_process_env(*args, **kwargs):

    # logger
    log = kwargs['logger']
    try:
        # work on a copy of the input data frame (read from the database upstream)
        df = kwargs['df'].copy()

        # smooth the data
        # df = a_utils.dfsmoothing(df=df, column_names=list(df.columns))
        df.clip(lower=0, inplace=True)  # remove values below 0 in all columns (a by-product of smoothing)

        # aggregate data
        rolling_sum_target, rolling_mean_target = [], []
        for col_name in df.columns:
            if kwargs['agg'][col_name] == 'sum':
                rolling_sum_target.append(col_name)
            else:
                rolling_mean_target.append(col_name)

        df[rolling_sum_target] = a_utils.window_sum(
            df, window_size=6, column_names=rolling_sum_target)
        df[rolling_mean_target] = a_utils.window_mean(
            df, window_size=6, column_names=rolling_mean_target)
        df = a_utils.dropNaNrows(df)

        # Sample the data at period intervals
        df = a_utils.sample_timeseries_df(df, period=6)

        # scale the columns: here we will use min-max
        df[df.columns] = kwargs['scaler'].minmax_scale(df, df.columns,
                                                       df.columns)

        # create the sat-oat (sat minus oat) feature
        df['sat-oat'] = df['sat'] - df['oat']

        # create avg_stpt column
        stpt_cols = [ele for ele in df.columns if 'vrf' in ele]
        df['avg_stpt'] = df[stpt_cols].mean(axis=1)
        # drop individual set point cols
        df.drop(columns=stpt_cols, inplace=True)

        # select retrain range of the data
        time_start_of_train = df.index[-1] - timedelta(
            weeks=kwargs['retrain_range_rl_weeks'])
        df = df.loc[time_start_of_train:, :]

        # save the data frame
        df.to_pickle(kwargs['save_path'] + 'env_data/env_data.pkl')

    except Exception as e:
        log.error('ENV Data Generator Module: %s', str(e))
        log.debug(e, exc_info=True)
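A minimal, hypothetical call sketch for data_gen_process_env: the keyword names mirror the kwargs the function reads, while the sample frame, its column names, the scaler stub and the save_path layout are illustrative assumptions, not part of the original project.

import logging

import numpy as np
import pandas as pd

# Illustrative inputs: a time-indexed frame with a few plausible column names,
# and a scaler stub matching the minmax_scale(df, input_cols, output_cols)
# call made inside the function.
idx = pd.date_range('2021-01-01', periods=2016, freq='5min')
sensor_df = pd.DataFrame(np.random.rand(len(idx), 4), index=idx,
                         columns=['sat', 'oat', 'vrf_stpt_1', 'vrf_stpt_2'])

class MinMaxScalerStub:
    def minmax_scale(self, df, input_cols, output_cols):
        sub = df[list(input_cols)]
        return (sub - sub.min()) / (sub.max() - sub.min())

data_gen_process_env(
    df=sensor_df,
    logger=logging.getLogger('env_gen'),
    agg={col: 'mean' for col in sensor_df.columns},  # 'sum' or 'mean' per column
    scaler=MinMaxScalerStub(),
    save_path='./output/',          # assumed to contain an env_data/ subfolder
    retrain_range_rl_weeks=1,
)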
Example #2
# NOTE: imports assumed from the surrounding project; `a_utils` is a
# project-local utilities module and the `to_categorical` import path is assumed.
from datetime import timedelta

import numpy as np
from tensorflow.keras.utils import to_categorical

import a_utils


def data_gen_process_vlv(*args, **kwargs):

    # logger
    log = kwargs['logger']
    try:
        # work on a copy of the input data frame (read from the database upstream)
        df = kwargs['df'].copy()

        # smooth the data
        # df = a_utils.dfsmoothing(df=df, column_names=list(df.columns))
        df.clip(lower=0, inplace=True)  # remove values below 0 in all columns (a by-product of smoothing)

        # aggregate data
        rolling_sum_target, rolling_mean_target = [], []
        for col_name in df.columns:
            if kwargs['agg'][col_name] == 'sum':
                rolling_sum_target.append(col_name)
            else:
                rolling_mean_target.append(col_name)

        df[rolling_sum_target] = a_utils.window_sum(
            df, window_size=6, column_names=rolling_sum_target)
        df[rolling_mean_target] = a_utils.window_mean(
            df, window_size=6, column_names=rolling_mean_target)
        df = a_utils.dropNaNrows(df)

        # Sample the data at period intervals
        df = a_utils.sample_timeseries_df(df, period=6)

        # scale the columns: here we will use min-max
        df[df.columns] = kwargs['scaler'].minmax_scale(df, df.columns,
                                                       df.columns)

        # create the sat-oat (sat minus oat) feature
        df['sat-oat'] = df['sat'] - df['oat']

        # add binary classification column
        df['vlv'] = 1.0
        df.loc[df['hwe'] <= 0.001, ['vlv']] = 0

        # determine the split point for the test data (the last 10 weeks here)
        t_train_end = df.index[-1] - timedelta(weeks=10)
        test_df = df.loc[t_train_end:, :]
        splitvalue = test_df.shape[0]

        # create train and test/validate data
        X_test, X_train, y_test, y_train = a_utils.df_2_arrays(
            df=df,
            predictorcols=['oat', 'oah', 'wbt', 'sat-oat'],
            outputcols=['vlv'],
            lag=0,
            scaling=False,
            scaler=None,
            scaleX=True,
            scaleY=True,
            split=splitvalue,
            shuffle=False,
            reshaping=True,
            input_timesteps=1,
            output_timesteps=1,
        )

        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)

        # save test ids for later plots
        # idx_end = -max(X_test.shape[1],y_test.shape[1])
        # idx_start = idx_end - X_test.shape[0] + 1
        # test_idx = df.index[[ i for i in range(idx_start, idx_end+1, 1) ]]
        # test_info = {'test_idx' : [str(i) for i in test_idx], 'year_num': kwargs['year_num'], 'week_num':kwargs['week_num'] }
        # with open(kwargs['save_path']+'vlv_data/vlv_test_info.txt', 'a') as ifile:
        # 	ifile.write(json.dumps(test_info)+'\n')

        np.save(kwargs['save_path'] + 'vlv_data/vlv_X_train.npy', X_train)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_X_val.npy', X_test)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_y_train.npy', y_train)
        np.save(kwargs['save_path'] + 'vlv_data/vlv_y_val.npy', y_test)

    except Exception as e:
        log.error('VLV Data Generator Module: %s', str(e))
        log.debug(e, exc_info=True)
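The four arrays saved above can later be reloaded for training or evaluating the valve classifier; a short sketch follows (the file layout mirrors the np.save calls, and save_path stands for whatever was passed in kwargs['save_path']).

import numpy as np

save_path = './output/'  # same value that was passed as kwargs['save_path']
X_train = np.load(save_path + 'vlv_data/vlv_X_train.npy')
y_train = np.load(save_path + 'vlv_data/vlv_y_train.npy')
X_val = np.load(save_path + 'vlv_data/vlv_X_val.npy')
y_val = np.load(save_path + 'vlv_data/vlv_y_val.npy')
# y_train / y_val are one-hot encoded (two classes) by to_categorical above
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)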