예제 #1
0
def generate_files_for_station(stat, index_column, cut_off_date,
                               forecast_period, list_of_lags):
    """Build and write the train/test CSV files for a single station.

    The lagged feature frame is produced by
    ``dg.generate_time_dependent_features`` from the module-level ``df``,
    split on the ``Date`` column at ``cut_off_date``, and written into the
    directory ``STN_<stat>`` four times: raw train/test (comma separated,
    with header) and normalised train/test (space separated, no header).
    The normalisation config is derived from the training split only and
    is also written to ``nzr_config.yaml`` in the same directory.

    Args:
        stat: Station identifier; it is formatted with ``str()`` into the
            directory name and the column names.
        index_column: Passed through to the feature generator.
        cut_off_date: Rows with ``Date`` strictly before this value go to
            the training set, rows with ``Date`` at or after it go to the
            test set.
        forecast_period: Passed through to the feature generator.
        list_of_lags: Passed through to the feature generator.

    Returns:
        None. The results are the files written to disk.

    Raises:
        ValueError: If any of the columns that are expected to be removed
            is absent from the generated frame (``list.remove``).

    NOTE(review): the target column names below hard-code ``_7_`` rather
    than using ``forecast_period`` -- confirm this matches the naming used
    by the feature generator when a different period is requested.
    """
    out_dir = 'STN_%s' % str(stat)
    ensure_dir(out_dir)

    # Column holding the raw PM10 reading for this station; it is both the
    # series being forecast and one of the retained output columns.
    pm10_column = 'STN_%s_PM10' % str(stat)
    generated = dg.generate_time_dependent_features(
        df, index_column, pm10_column, forecast_period, list_of_lags)

    # Chronological split: strictly-before goes to train, the rest to test.
    train_part = generated[generated['Date'] < cut_off_date]
    test_part = generated[generated['Date'] >= cut_off_date]

    # Column selection. The index, the date and every target variant are
    # taken out; the raw reading and the VALUE target are then re-appended
    # so that they end up as the last two columns. The DIFF style targets
    # stay out because the data is treated as generally stationary (no
    # overall trend).
    value_target = 'TARGET_STN_%s_PM10_7_VALUE' % str(stat)
    diff_target = 'TARGET_STN_%s_PM10_7_DIFF' % str(stat)
    prop_diff_target = 'TARGET_STN_%s_PM10_7_PROP_DIFF' % str(stat)

    kept_columns = train_part.columns.tolist()
    for dropped in ('No', 'Date', pm10_column,
                    value_target, diff_target, prop_diff_target):
        kept_columns.remove(dropped)
    kept_columns += [pm10_column, value_target]

    # Un-normalised output, header row included.
    train_raw = train_part[kept_columns]
    test_raw = test_part[kept_columns]
    train_raw.to_csv(folder_path(out_dir, '/train.csv')
                     if False else out_dir + '/train.csv',
                     header=True, index=False, encoding='utf-8')
    test_raw.to_csv(out_dir + '/test.csv',
                    header=True, index=False, encoding='utf-8')

    # Normalisation parameters come from the training split only and are
    # persisted alongside the data.
    nzr_config = nzr.create_normalization_config(train_raw)
    nzr.write_field_config(nzr_config, value_target,
                           out_dir + '/nzr_config.yaml')

    train_scaled = nzr.normalize(train_raw, nzr_config, [])
    test_scaled = nzr.normalize(test_raw, nzr_config, [])

    # Normalised output: space separated and without a header row.
    train_scaled.to_csv(out_dir + '/train_normalised.csv',
                        sep=' ', header=False, index=False,
                        encoding='utf-8')
    test_scaled.to_csv(out_dir + '/test_normalised.csv',
                       sep=' ', header=False, index=False,
                       encoding='utf-8')
# Binary indicator columns derived from the `cbwd` string column: 'S' is
# matched against its first character, 'E' and 'W' against its second.
# The source column is dropped once the flags exist.
df2['S'] = np.where(df2['cbwd'].str.slice(0, 1) == 'S', 1, 0)
df2['E'] = np.where(df2['cbwd'].str.slice(1, 2) == 'E', 1, 0)
df2['W'] = np.where(df2['cbwd'].str.slice(1, 2) == 'W', 1, 0)
df2.drop('cbwd', axis=1, inplace=True)

# A PM2.5 reading of exactly zero is treated as a potential measurement
# limit error and swapped for a small nominal value; every other reading
# is kept as it is.
default_value = 0.01
df2["pm2.5"] = np.where(df2["pm2.5"] != 0, df2["pm2.5"], default_value)

# Settings handed to the time-dependent feature generator.
index_column = "No"
forecast_column = "pm2.5"
forecast_period = 24
list_of_lags = [1, 2, 24, 48]

new_df = dg.generate_time_dependent_features(
    df2, index_column, forecast_column, forecast_period, list_of_lags)

# `.loc` slices by label and includes both end points, so the row labelled
# `trainset` belongs to the training set and the test set begins at the
# following label.
trainset = 30000
train_df = new_df.loc[0:trainset]
test_df = new_df.loc[trainset + 1:]

# ###########################################################################################################
# FULL UN-NORMALISED OUTPUT: EVERY TARGET COLUMN IS KEPT AND A HEADER ROW IS WRITTEN
# ###########################################################################################################
train_df.to_csv('sets/Train_24_hour_full.csv',
                header=True,
                index=False,
                encoding='utf-8',
                sep=',')
test_df.to_csv('sets/Test_24_hour_full.csv',