def generate_files_for_station(stat, index_column, cut_off_date, forecast_period, list_of_lags):
    """
    Generate the train/test files for one station and write them to disk.

    The module-level frame ``df`` is passed to
    ``dg.generate_time_dependent_features`` with the station's PM10 column
    as the forecast column. The result is split on its ``Date`` column and
    written into the folder ``STN_<stat>`` four times: un-normalised
    train/test CSVs with headers, and normalised, space-separated
    train/test files without headers. The normalisation config is built
    from the training portion only and is also written into that folder.

    :param stat: Station identifier; ``str(stat)`` is interpolated into the
        folder name and the column names.
    :param index_column: Forwarded to ``dg.generate_time_dependent_features``.
    :param cut_off_date: Rows with ``Date`` strictly below this value form
        the training set; rows with ``Date`` at or above it form the test set.
    :param forecast_period: Forwarded to ``dg.generate_time_dependent_features``.
    :param list_of_lags: Forwarded to ``dg.generate_time_dependent_features``.
    :return: None. The function is used for its file-writing side effects.
    :raises ValueError: from ``list.remove`` if any of the columns to be
        dropped is absent from the generated frame.
    """
    station = str(stat)
    out_dir = 'STN_%s' % station
    ensure_dir(out_dir)

    pm10_col = 'STN_%s_PM10' % station
    featured = dg.generate_time_dependent_features(
        df, index_column, pm10_col, forecast_period, list_of_lags
    )

    # Two independent comparisons: a row whose Date satisfies neither
    # (e.g. a missing date) ends up in neither split.
    train_part = featured[featured['Date'] < cut_off_date]
    test_part = featured[featured['Date'] >= cut_off_date]

    # NOTE(review): the target names hard-code a period of 7 while
    # forecast_period is a parameter -- presumably the generator embeds the
    # period in the column name, in which case any other period would make
    # the removal below fail. Confirm against dg before generalising.
    target_value = 'TARGET_STN_%s_PM10_7_VALUE' % station
    target_diff = 'TARGET_STN_%s_PM10_7_DIFF' % station
    target_prop_diff = 'TARGET_STN_%s_PM10_7_PROP_DIFF' % station

    # Column selection: drop the index, the date and every target variant,
    # then put the raw PM10 value and the VALUE target back as the last two
    # columns. The DIFF targets are left out because the data in this
    # problem is generally stationary (no overall trend).
    columns = train_part.columns.tolist()
    for name in ('No', 'Date', pm10_col, target_value, target_diff, target_prop_diff):
        columns.remove(name)
    columns += [pm10_col, target_value]

    train_raw = train_part.loc[:, columns]
    test_raw = test_part.loc[:, columns]

    # Un-normalised version: default separator, header row included.
    with_header = dict(encoding='utf-8', index=False, header=True)
    train_raw.to_csv(out_dir + '/train.csv', **with_header)
    test_raw.to_csv(out_dir + '/test.csv', **with_header)

    # The normalisation config is created from the training data and then
    # applied to both splits.
    nzr_config = nzr.create_normalization_config(train_raw)
    nzr.write_field_config(nzr_config, target_value, out_dir + '/nzr_config.yaml')

    # The third argument is an empty list in both calls; its meaning is
    # defined by nzr.normalize -- TODO confirm.
    train_scaled = nzr.normalize(train_raw, nzr_config, [])
    test_scaled = nzr.normalize(test_raw, nzr_config, [])

    # Normalised version: space-separated, no header row.
    headerless = dict(sep=' ', encoding='utf-8', index=False, header=False)
    train_scaled.to_csv(out_dir + '/train_normalised.csv', **headerless)
    test_scaled.to_csv(out_dir + '/test_normalised.csv', **headerless)
df2['S'] = np.where(df2.cbwd.str[0:1] == 'S', 1, 0) df2['E'] = np.where(df2.cbwd.str[1:2] == 'E', 1, 0) df2['W'] = np.where(df2.cbwd.str[1:2] == 'W', 1, 0) df2.drop(["cbwd"], axis=1, inplace=True) # WHERE PM2.5 IS ZERO - POTENTIAL MEASUREMENT LIMIT ERROR - REPLACE WITH NOMINAL SMALL VALUE default_value = 0.01 df2["pm2.5"] = np.where(df2["pm2.5"] == 0, default_value, df2["pm2.5"]) index_column = "No" forecast_column = "pm2.5" forecast_period = 24 list_of_lags = [1, 2, 24, 48] new_df = dg.generate_time_dependent_features(df2, index_column, forecast_column, forecast_period, list_of_lags) trainset = 30000 train_df = new_df.loc[0:trainset, :] test_df = new_df.loc[trainset + 1:, :] # ########################################################################################################### # WRITE OUT THE FULL UN-NORMALISED VERSION WITH ALL TARGETS AND HEADERS # ########################################################################################################### train_df.to_csv('sets/Train_24_hour_full.csv', sep=',', encoding='utf-8', index=False, header=True) test_df.to_csv('sets/Test_24_hour_full.csv',