# --- Rollout / evaluation loop ---
# Runs n_episodes episodes against test_env, storing transitions in `memory`
# and accumulating total reward (avg_r) and total step count (avg_t).
avg_t = 0
avg_r = 0
for epi in range(1, n_episodes + 1):
    print("episode {} start!".format(epi))
    obs = test_env.reset()
    done = False
    while not done:
        t = 0
        action_list = []
        # Collect up to T_horizon transitions per rollout segment.
        while t < T_horizon:
            # NOTE(review): assumes a gym-style env API and that agent.model.pi
            # returns a 1 x n_actions probability tensor — confirm against the model.
            prob = agent.model.pi(torch.FloatTensor(obs).unsqueeze(0).to(device))
            action = Categorical(prob).sample().item()
            obs_prime, reward, done, _ = test_env.step(action)
            if reward is None:
                # Best-effort skip of invalid transitions. NOTE(review): `t` is
                # not advanced here, so a persistently-None reward would spin
                # forever — confirm the env can only return None transiently.
                continue
            action_list.append(action)
            memory.put((obs, action, reward, obs_prime,
                        prob.squeeze()[action].detach().item(), done))
            obs = obs_prime
            avg_r += reward
            avg_t += 1
            t += 1
            # BUG FIX: stop the horizon loop as soon as the episode terminates.
            # The original kept stepping a finished environment until t reached
            # T_horizon, which is undefined behavior for gym-style envs.
            if done:
                break
# --- Training loop (policy-gradient rollout collection) ---
avg_rewards = [0]
# train() enables dropout during action sampling; eval() would disable it.
policy_net.train()
for episode in range(max_episode_num):
    state = train_env.reset()
    log_probs = []
    rewards = []
    profits = []
    hold_profits = []
    for steps in range(train_env.steps_left):
        # Render only occasionally to keep training fast.
        if steps % 500 == 0:
            train_env.render()
        action, log_prob = policy_net.get_action(state, device)
        new_state, reward, done, _ = train_env.step(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        profits.append(train_env._get_profit())
        hold_profits.append(train_env._get_hold_profit())
        if done:
            # Evaluate model: per-episode mean reward plus a 10-episode moving average.
            all_rewards.append(np.mean(rewards))
            avg_rewards.append(np.mean(all_rewards[-10:]))
            # Checkpoint when the episode reward improved on the previous one.
            # The len() guard prevents an IndexError on [-2] for the very first episode.
            if len(all_rewards) > 1 and all_rewards[-1] > all_rewards[-2]:
                torch.save(policy_net.state_dict(), 'model/state_dict3.pt')
                print(f'Reward increased ({episode - 1} --> {episode}).\nSaving model ...')
            # BUG FIX: leave the step loop once the episode terminates; the
            # original kept stepping the finished env and re-appending stats.
            break
        # BUG FIX: the original never advanced `state`, so the policy was always
        # queried with the initial observation. Mirrors the evaluation loop.
        state = new_state
# --- Evaluation pass over the test environment ---
# eval() disables dropout so the policy behaves deterministically w.r.t. dropout.
policy_net.eval()
state = test_env.reset()
log_probs, rewards = [], []
profits, hold_profits = [], []
for step in range(test_env.steps_left):
    # Periodic render to visualise progress without slowing every step.
    if step % 500 == 0:
        test_env.render()
    # No gradients are needed at evaluation time.
    with torch.no_grad():
        action, log_prob = policy_net.get_action(state, device)
    next_state, reward, done, _ = test_env.step(action)
    log_probs.append(log_prob)
    rewards.append(reward)
    profits.append(test_env._get_profit())
    hold_profits.append(test_env._get_hold_profit())
    if done:
        # Episode over: report the session summary and stop.
        print('\nTrading session terminated:')
        print(f"total reward: {np.round(np.sum(rewards), decimals=3)}")
        print(f"bot profit: {profits[-1]}")
        print(f"hold profit: {hold_profits[-1]}")
        print(f"steps: {step}")
        break
    state = next_state