def main():
    # preprocess data / ETL / interpolation / data curation
    (data_df, train_forecast_ids, normal_forecast_ids, linear_interpolation,
     last_window, submit_zeroes, submit_averages) = preprocess_data()

    best_1d_model = 'weights/bl_1d_1e2_best.pth.tar'
    best_1h_model = 'weights/bl_1h_1e2_best.pth.tar'
    best_15m_model = 'weights/bl_15m_3lyr_best.pth.tar'

    model_1d = E2ELSTM_day(in_sequence_len=30,
                           out_sequence_len=30,
                           features_meta_total=43,
                           features_ar_total=1,
                           meta_hidden_layer_length=30,
                           ar_hidden_layer_length=30,
                           meta_hidden_layers=2,
                           ar_hidden_layers=2,
                           lstm_dropout=0,
                           classifier_hidden_length=256)
    model_1h = E2ELSTM(in_sequence_len=192,
                       out_sequence_len=192,
                       features_meta_total=72,
                       features_ar_total=1,
                       meta_hidden_layer_length=192,
                       ar_hidden_layer_length=192,
                       meta_hidden_layers=2,
                       ar_hidden_layers=2,
                       lstm_dropout=0,
                       classifier_hidden_length=512)
    model_15m = E2ELSTM(in_sequence_len=192,
                        out_sequence_len=192,
                        features_meta_total=72,
                        features_ar_total=1,
                        meta_hidden_layer_length=192,
                        ar_hidden_layer_length=192,
                        meta_hidden_layers=3,
                        ar_hidden_layers=3,
                        lstm_dropout=0,
                        classifier_hidden_length=512)

    checkpoint = torch.load(best_1d_model)
    model_1d.load_state_dict(checkpoint['state_dict'])
    print("model_1d => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    checkpoint = torch.load(best_1h_model)
    model_1h.load_state_dict(checkpoint['state_dict'])
    print("model_1h => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    checkpoint = torch.load(best_15m_model)
    model_15m.load_state_dict(checkpoint['state_dict'])
    print("model_15m => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    submission_df = pd.read_csv('../data/forecast/submission_format.csv')
    submission_df = submission_df.set_index('obs_id')

    model_1d.cuda()
    model_1h.cuda()
    model_15m.cuda()

    model_1d.eval()
    model_1h.eval()
    model_15m.eval()

    stat_cols = ['forecast_id', 'wrmse_val', 'r2_val']
    stat_df = pd.DataFrame(columns=stat_cols)

    # select only series we marked as trainable
    # negation is for speed only
    trainable_df = data_df[~data_df.ForecastId.isin(
        list(set(data_df.ForecastId.unique()) - set(train_forecast_ids)))]

    print('Predicting for 1 day series ...')

    temp_features = ['Temperature']
    hol_emb_features = ['Holiday']
    time_emb_features = ['month', 'day', 'dow']
    target = ['Value']
    predictors = temp_features + hol_emb_features + time_emb_features

    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='1_day',
                                 in_sequence_len=30,
                                 out_sequence_len=30,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors)
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='1_day',
                                       in_sequence_len=30,
                                       out_sequence_len=30,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors)

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 5)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # predict first 30 time points
            output1 = output

            # then predict the remaining points using data we have
            predict_len = predict_dataset.df[
                (predict_dataset.df.ForecastId == forecast_id)
                & (predict_dataset.df.is_train == 0)].shape[0]
            remaining_len = predict_len - len(output1)

            # use our preds as AR values for final prediction
            # predict more values
            test_X_sequences_ar = output1
            test_X_sequences_meta = predict_dataset.df[
                (predict_dataset.df.ForecastId == forecast_id)
                & (predict_dataset.df.is_train == 0)].iloc[
                    -len(output1) * 2:][predictors].values
            test_X_sequences_meta = test_X_sequences_meta.copy()

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 5)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # predict the next 30 time points
            output2 = output

            # stitch both windows, keeping only enough of the second window
            # to reach predict_len
            truncate_len = predict_len - len(output1) - len(output2)
            final_output = np.hstack((output1, output2[-truncate_len:]))
            final_output = (final_output * predict_dataset.std_dict[forecast_id]
                            + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = final_output
            pbar.update(1)

    # forecast loop - evaluate on the last sequence of the trainval dataset
    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 5)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 1)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)

    print('Predicting for 1 hour series ...')

    temp_features = ['Temperature']
    hol_emb_features = ['Holiday']
    time_emb_features = ['year', 'month', 'day', 'hour', 'minute', 'dow']
    target = ['Value']
    predictors = temp_features + hol_emb_features + time_emb_features

    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='1_hour',
                                 in_sequence_len=192,
                                 out_sequence_len=192,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors)
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='1_hour',
                                       in_sequence_len=192,
                                       out_sequence_len=192,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors)

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 8)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1h(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # keep only the necessary length
            output = output[-len_diff:]
            output = (output * predict_dataset.std_dict[forecast_id]
                      + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = output
            pbar.update(1)

    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 8)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 1)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1h(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)

    print('Predicting for 15 min series ...')

    # same predictors as for the 1 hour series
    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='15_mins',
                                 in_sequence_len=192,
                                 out_sequence_len=192,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors)
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='15_mins',
                                       in_sequence_len=192,
                                       out_sequence_len=192,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors)

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 8)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_15m(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # keep only the necessary length
            output = output[-len_diff:]
            output = (output * predict_dataset.std_dict[forecast_id]
                      + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = output
            pbar.update(1)

    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 8)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 1)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_15m(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)

    # submit zeroes and averages
    print('Submitting averages ... ')
    with tqdm.tqdm(total=len(submit_averages)) as pbar:
        for forecast_id in submit_averages:
            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = data_df.loc[
                                  data_df.ForecastId == forecast_id, 'Value'].mean()
            pbar.update(1)

    print('Submitting zeroes ... ')
    with tqdm.tqdm(total=len(submit_zeroes)) as pbar:
        for forecast_id in submit_zeroes:
            submission_df.loc[submission_df.ForecastId == forecast_id, 'Value'] = 0
            pbar.update(1)

    stat_df.to_csv('forest_stats_{}.csv'.format(LOGNUMBER))
    submission_df['Value'] = submission_df['Value'].fillna(value=0)
    submission_df.to_csv('../submissions/forecast_{}.csv'.format(LOGNUMBER))
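# WRMSE() is called in the evaluation loops above but is not defined in this
# snippet. Below is a minimal sketch, assuming a plain weighted root mean
# squared error with uniform weights by default; the actual competition metric
# may weight observations differently, so treat this as an illustration only.
import numpy as np


def wrmse(y_true, y_pred, weights=None):
    """Weighted RMSE: sqrt(sum(w * (y - y_hat)**2) / sum(w))."""
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if weights is None:
        # uniform weights reduce this to ordinary RMSE
        weights = np.ones_like(y_true)
    weights = np.asarray(weights, dtype=np.float64)
    return float(np.sqrt(np.sum(weights * (y_true - y_pred) ** 2) / np.sum(weights)))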
def main():
    global args, best_val_loss
    global logger

    # preprocess data / ETL / interpolation / data curation
    # data_df,train_forecast_ids,normal_forecast_ids,linear_interpolation,last_window,submit_zeroes,submit_averages = preprocess_data()

    # suppress pandas warnings
    # do not do this in production!
    pd.set_option('mode.chained_assignment', None)

    # read all pre-calculated objects
    data_df = pd.read_feather('../data/forecast/data_df_feather')
    with open('train_forecast_ids.pkl', 'rb') as f:
        train_forecast_ids = pickle.load(f)
    with open('normal_forecast_ids.pkl', 'rb') as f:
        normal_forecast_ids = pickle.load(f)
    with open('linear_interpolation.pkl', 'rb') as f:
        linear_interpolation = pickle.load(f)
    with open('use_last_window.pkl', 'rb') as f:
        use_last_window = pickle.load(f)
    with open('submit_zeroes.pkl', 'rb') as f:
        submit_zeroes = pickle.load(f)
    with open('submit_averages.pkl', 'rb') as f:
        submit_averages = pickle.load(f)

    # override - exclude last window series
    # take last window from previous submit
    train_forecast_ids = normal_forecast_ids + linear_interpolation

    # features we use
    if args.series_type == '1_day':
        temp_features = ['Temperature']
        hol_emb_features = ['Holiday']
        time_emb_features = ['month', 'day', 'dow']
        target = ['Value']
        predictors = temp_features + hol_emb_features + time_emb_features

        model = E2ELSTM_day(in_sequence_len=args.inp_seq,
                            out_sequence_len=args.out_seq,
                            features_meta_total=args.features_meta,
                            features_ar_total=args.features_ar,
                            meta_hidden_layer_length=args.lstm_meta_hid_feat,
                            ar_hidden_layer_length=args.lstm_ar_hid_feat,
                            meta_hidden_layers=args.lstm_meta_hid_lyr,
                            ar_hidden_layers=args.lstm_ar_hid_lyr,
                            lstm_dropout=args.lstm_dropout,
                            classifier_hidden_length=args.mlp_hid_lyr)
    else:
        temp_features = ['Temperature']
        hol_emb_features = ['Holiday']
        time_emb_features = ['year', 'month', 'day', 'hour', 'minute', 'dow']
        target = ['Value']
        predictors = temp_features + hol_emb_features + time_emb_features

        # E2EGRU or E2ELSTM
        model = EncoderDecoderGRU(in_sequence_len=args.inp_seq,
                                  out_sequence_len=args.out_seq,
                                  features_meta_total=args.features_meta,
                                  features_ar_total=args.features_ar,
                                  meta_hidden_layer_length=args.lstm_meta_hid_feat,
                                  ar_hidden_layer_length=args.lstm_ar_hid_feat,
                                  meta_hidden_layers=args.lstm_meta_hid_lyr,
                                  ar_hidden_layers=args.lstm_ar_hid_lyr,
                                  lstm_dropout=args.lstm_dropout,
                                  classifier_hidden_length=args.mlp_hid_lyr,
                                  use_output=args.use_output)

    # model.cuda()
    model = torch.nn.DataParallel(model).cuda()

    # select only series we marked as trainable
    # negation is for speed only
    trainable_df = data_df[~data_df.ForecastId.isin(
        list(set(data_df.ForecastId.unique()) - set(train_forecast_ids)))]

    train_dataset = S2SDataset(df=trainable_df,
                               series_type=args.series_type,
                               in_sequence_len=args.inp_seq,
                               out_sequence_len=args.out_seq,
                               target='Value',
                               mode='train',
                               split_mode='random',
                               predictors=predictors,
                               val_size=args.val_size)
    val_dataset = S2SDataset(df=trainable_df,
                             series_type=args.series_type,
                             in_sequence_len=args.inp_seq,
                             out_sequence_len=args.out_seq,
                             target='Value',
                             mode='val',
                             split_mode='random',
                             predictors=predictors,
                             val_size=args.val_size)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             drop_last=False)

    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                  mode='min',
                                  factor=0.1,
                                  patience=4,
                                  verbose=True,
                                  threshold=1e-3,
                                  min_lr=1e-6)

    for epoch in range(args.start_epoch, args.epochs):
        # adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        val_loss = validate(val_loader, model, criterion)

        scheduler.step(val_loss)

        # add code for early stopping here

        #============ TensorBoard logging ============#
        # Log the scalar values
        if args.tensorboard:
            info = {
                'train_epoch_loss': train_loss,
                'valid_epoch_loss': val_loss,
            }
            for tag, value in info.items():
                logger.scalar_summary(tag, value, epoch + 1)

        # remember the best validation loss and save checkpoint
        is_best = val_loss < best_val_loss
        best_val_loss = min(val_loss, best_val_loss)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'optimizer': optimizer.state_dict(),
                'state_dict': model.state_dict(),
                'best_val_loss': best_val_loss,
            }, is_best,
            'weights/{}_checkpoint.pth.tar'.format(str(args.lognumber)),
            'weights/{}_best.pth.tar'.format(str(args.lognumber)))
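# save_checkpoint() is used in the training loop above but not shown here.
# A minimal sketch following the common PyTorch ImageNet-example convention
# (an assumption, not this repo's actual helper): always persist the latest
# state, and copy it aside when it is the best so far.
import shutil

import torch


def save_checkpoint(state, is_best, filename, best_filename):
    # store the most recent checkpoint unconditionally
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(filename, best_filename)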
def main():
    # suppress pandas warnings
    # do not do this in production!
    pd.set_option('mode.chained_assignment', None)

    # read all pre-calculated objects
    data_df = pd.read_feather('../data/forecast/data_df_feather_ar_values')
    with open('train_forecast_ids.pkl', 'rb') as f:
        train_forecast_ids = pickle.load(f)
    with open('normal_forecast_ids.pkl', 'rb') as f:
        normal_forecast_ids = pickle.load(f)
    with open('linear_interpolation.pkl', 'rb') as f:
        linear_interpolation = pickle.load(f)
    with open('use_last_window.pkl', 'rb') as f:
        use_last_window = pickle.load(f)
    with open('submit_zeroes.pkl', 'rb') as f:
        submit_zeroes = pickle.load(f)
    with open('submit_averages.pkl', 'rb') as f:
        submit_averages = pickle.load(f)

    # override - exclude last window series
    # take last window from previous submit
    train_forecast_ids = normal_forecast_ids + linear_interpolation

    best_1d_model = 'weights/bl_1d_1e2_best.pth.tar'
    # the 1 hour and 15 min series share one encoder-decoder checkpoint
    best_1h_model = 'weights/15m1h_encdec_w192_hid512_2lyr_7ar_best.pth.tar'
    best_15m_model = 'weights/15m1h_encdec_w192_hid512_2lyr_7ar_best.pth.tar'

    model_1d = E2ELSTM_day(in_sequence_len=30,
                           out_sequence_len=30,
                           features_meta_total=43,
                           features_ar_total=1,
                           meta_hidden_layer_length=30,
                           ar_hidden_layer_length=30,
                           meta_hidden_layers=2,
                           ar_hidden_layers=2,
                           lstm_dropout=0,
                           classifier_hidden_length=256)
    model_1h = EncoderDecoderGRU(in_sequence_len=192,
                                 out_sequence_len=192,
                                 features_meta_total=72,
                                 features_ar_total=7,
                                 meta_hidden_layer_length=512,
                                 ar_hidden_layer_length=512,
                                 meta_hidden_layers=2,
                                 ar_hidden_layers=2,
                                 lstm_dropout=0,
                                 classifier_hidden_length=1024)
    model_15m = EncoderDecoderGRU(in_sequence_len=192,
                                  out_sequence_len=192,
                                  features_meta_total=72,
                                  features_ar_total=7,
                                  meta_hidden_layer_length=512,
                                  ar_hidden_layer_length=512,
                                  meta_hidden_layers=2,
                                  ar_hidden_layers=2,
                                  lstm_dropout=0,
                                  classifier_hidden_length=1024)

    # the encoder-decoder checkpoints were saved from DataParallel models
    model_1h = torch.nn.DataParallel(model_1h)
    model_15m = torch.nn.DataParallel(model_15m)

    checkpoint = torch.load(best_1d_model)
    model_1d.load_state_dict(checkpoint['state_dict'])
    print("model_1d => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    checkpoint = torch.load(best_1h_model)
    model_1h.load_state_dict(checkpoint['state_dict'])
    print("model_1h => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    checkpoint = torch.load(best_15m_model)
    model_15m.load_state_dict(checkpoint['state_dict'])
    print("model_15m => loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    submission_df = pd.read_csv('../data/forecast/submission_format.csv')
    submission_df = submission_df.set_index('obs_id')

    model_1d.cuda()
    model_1h.cuda()
    model_15m.cuda()

    model_1d.eval()
    model_1h.eval()
    model_15m.eval()

    stat_cols = ['forecast_id', 'wrmse_val', 'r2_val']
    stat_df = pd.DataFrame(columns=stat_cols)

    # select only series we marked as trainable
    # negation is for speed only
    trainable_df = data_df[~data_df.ForecastId.isin(
        list(set(data_df.ForecastId.unique()) - set(train_forecast_ids)))]

    print('Predicting for 1 day series ...')

    temp_features = ['Temperature']
    ar_features = ['Value']
    hol_emb_features = ['Holiday']
    time_emb_features = ['month', 'day', 'dow']
    target = ['Value']
    predictors = temp_features + hol_emb_features + time_emb_features

    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='1_day',
                                 in_sequence_len=30,
                                 out_sequence_len=30,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors,
                                 ar_features=ar_features)
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='1_day',
                                       in_sequence_len=30,
                                       out_sequence_len=30,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors,
                                       ar_features=ar_features)

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 5)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # predict first 30 time points
            output1 = output

            # then predict the remaining points using data we have
            predict_len = predict_dataset.df[
                (predict_dataset.df.ForecastId == forecast_id)
                & (predict_dataset.df.is_train == 0)].shape[0]
            remaining_len = predict_len - len(output1)

            # use our preds as AR values for final prediction
            # predict more values
            test_X_sequences_ar = output1
            test_X_sequences_meta = predict_dataset.df[
                (predict_dataset.df.ForecastId == forecast_id)
                & (predict_dataset.df.is_train == 0)].iloc[
                    -len(output1) * 2:][predictors].values
            test_X_sequences_meta = test_X_sequences_meta.copy()

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 5)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 1)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # predict the next 30 time points
            output2 = output

            # stitch both windows, keeping only enough of the second window
            # to reach predict_len
            truncate_len = predict_len - len(output1) - len(output2)
            final_output = np.hstack((output1, output2[-truncate_len:]))
            final_output = (final_output * predict_dataset.std_dict[forecast_id]
                            + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = final_output
            pbar.update(1)

    # forecast loop - evaluate on the last sequence of the trainval dataset
    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 5)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 1)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1d(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)

    print('Predicting for 1 hour series ...')

    temp_features = ['Temperature']
    ar_features = ['Value', 'Value1', 'Value4', 'Value12',
                   'Value24', 'Value96', 'Value168']
    hol_emb_features = ['Holiday']
    time_emb_features = ['year', 'month', 'day', 'hour', 'minute', 'dow']
    target = ['Value']
    predictors = temp_features + hol_emb_features + time_emb_features

    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='1_hour',
                                 in_sequence_len=192,
                                 out_sequence_len=192,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors,
                                 ar_features=ar_features)
    """
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='1_hour',
                                       in_sequence_len=192,
                                       out_sequence_len=192,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors,
                                       ar_features=ar_features)
    """

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 8)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 7)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1h(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # keep only the necessary length
            output = output[-len_diff:]
            output = (output * predict_dataset.std_dict[forecast_id]
                      + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = output
            pbar.update(1)

    """
    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 8)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 7)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_1h(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)
    """

    print('Predicting for 15 min series ...')

    # same predictors and AR features as for the 1 hour series
    predict_dataset = S2SDataset(df=trainable_df,
                                 series_type='15_mins',
                                 in_sequence_len=192,
                                 out_sequence_len=192,
                                 target='Value',
                                 mode='test',
                                 split_mode='random',
                                 predictors=predictors,
                                 ar_features=ar_features)
    """
    predict_dataset_wrmse = S2SDataset(df=trainable_df,
                                       series_type='15_mins',
                                       in_sequence_len=192,
                                       out_sequence_len=192,
                                       target='Value',
                                       mode='evaluate_wrmse',
                                       split_mode='random',
                                       predictors=predictors,
                                       ar_features=ar_features)
    """

    print('Dataset length is {}'.format(len(predict_dataset.forecast_ids)))

    with tqdm.tqdm(total=len(predict_dataset.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset.forecast_ids):
            i = predict_dataset.forecast_ids.index(forecast_id)
            test_X_sequences_meta, test_X_sequences_ar, len_diff = predict_dataset[i]

            # into PyTorch format
            test_X_sequences_meta = torch.from_numpy(test_X_sequences_meta).view(1, -1, 8)
            test_X_sequences_ar = torch.from_numpy(test_X_sequences_ar).view(1, -1, 7)

            # transform data from Batch x Window x Etc into Batch x Etc format
            test_X_sequences_ar = test_X_sequences_ar.float()
            test_X_sequences_temp = test_X_sequences_meta[:, :, 0:1].float()
            test_X_sequences_meta = test_X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(test_X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(test_X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(test_X_sequences_ar).cuda(async=True)

            # compute output
            output = model_15m(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()

            # keep only the necessary length
            output = output[-len_diff:]
            output = (output * predict_dataset.std_dict[forecast_id]
                      + predict_dataset.mean_dict[forecast_id])

            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = output
            pbar.update(1)

    """
    with tqdm.tqdm(total=len(predict_dataset_wrmse.forecast_ids)) as pbar:
        for i, forecast_id in enumerate(predict_dataset_wrmse.forecast_ids):
            i = predict_dataset_wrmse.forecast_ids.index(forecast_id)
            X_sequences_ar, X_sequences_meta, y_sequences = predict_dataset_wrmse[i]

            X_sequences_meta = torch.from_numpy(X_sequences_meta).view(1, -1, 8)
            X_sequences_ar = torch.from_numpy(X_sequences_ar).view(1, -1, 7)

            y_true = (y_sequences.reshape(-1) * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            # transform data from Batch x Window x Etc into Batch x Etc format
            X_sequences_ar = X_sequences_ar.float()
            X_sequences_temp = X_sequences_meta[:, :, 0:1].float()
            X_sequences_meta = X_sequences_meta[:, :, 1:].long()

            x_temp_var = torch.autograd.Variable(X_sequences_temp).cuda(async=True)
            x_meta_var = torch.autograd.Variable(X_sequences_meta).cuda(async=True)
            x_ar_var = torch.autograd.Variable(X_sequences_ar).cuda(async=True)

            # compute output
            output = model_15m(x_temp_var, x_meta_var, x_ar_var)
            output = output[0, :].data.cpu().numpy()
            output = (output * predict_dataset_wrmse.std_dict[forecast_id]
                      + predict_dataset_wrmse.mean_dict[forecast_id])

            wrmse_val = WRMSE(y_true, output)
            r2_score_val = metrics.r2_score(y_true, output)
            stat_df = stat_df.append(pd.DataFrame(
                [dict(zip(stat_cols, [forecast_id, wrmse_val, r2_score_val]))]))
            pbar.update(1)
    """

    # submit zeroes and averages
    print('Submitting averages ... ')
    with tqdm.tqdm(total=len(submit_averages)) as pbar:
        for forecast_id in submit_averages:
            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = data_df.loc[
                                  data_df.ForecastId == forecast_id, 'Value'].mean()
            pbar.update(1)

    print('Submitting zeroes ... ')
    with tqdm.tqdm(total=len(submit_zeroes)) as pbar:
        for forecast_id in submit_zeroes:
            submission_df.loc[submission_df.ForecastId == forecast_id, 'Value'] = 0
            pbar.update(1)

    print('Using short sequence data from other model for {} series'.format(
        len(use_last_window)))
    previous_preds = pd.read_csv('../submissions/blended_lstm_forests.csv')
    with tqdm.tqdm(total=len(use_last_window)) as pbar:
        for forecast_id in use_last_window:
            # .values avoids index alignment between the two DataFrames;
            # rows are assumed to come in the same order per ForecastId
            submission_df.loc[submission_df.ForecastId == forecast_id,
                              'Value'] = previous_preds.loc[
                                  previous_preds.ForecastId == forecast_id,
                                  'Value'].values
            pbar.update(1)

    # stat_df.to_csv('forest_stats_{}.csv'.format(LOGNUMBER))
    submission_df['Value'] = submission_df['Value'].fillna(value=0)
    submission_df.to_csv('../submissions/forecast_{}.csv'.format(LOGNUMBER))
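# Hypothetical entry point, assuming each main() above lives in its own
# script; within a single file only the last definition of main() would run.
if __name__ == '__main__':
    main()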