# --- Evaluate the trained policy on the test environment ---

# Load the best saved model weights onto the active device.
model_name = 'model/state_dict2.pt'
policy_net.load_state_dict(torch.load(model_name, map_location=device))
policy_net.to(device)
policy_net.eval()  # eval mode: disables dropout during evaluation

state = test_env.reset()
log_probs = []
rewards = []
profits = []
hold_profits = []

for steps in range(test_env.steps_left):
    # Periodic render so the run can be monitored without flooding output.
    if steps % 500 == 0:
        test_env.render()

    # No gradients needed at evaluation time.
    with torch.no_grad():
        action, log_prob = policy_net.get_action(state, device)
    new_state, reward, done, _ = test_env.step(action)

    log_probs.append(log_prob)
    rewards.append(reward)
    profits.append(test_env._get_profit())
    hold_profits.append(test_env._get_hold_profit())

    # BUG FIX: advance the observation. The original never updated `state`,
    # so the policy acted on the initial observation on every step.
    state = new_state

    if done:
        print('\nTrading session terminated:')
        print(f"total reward: {np.round(np.sum(rewards), decimals=3)}")
        print(f"bot profit: {profits[-1]}")
        print(f"hold profit: {hold_profits[-1]}")
        # BUG FIX: stop once the episode ends; stepping a finished gym-style
        # env is undefined behavior.
        break
# --- REINFORCE-style training loop ---
max_episode_num = 1_000
all_rewards = [0]   # per-episode mean reward (seeded with 0 for the first comparison)
avg_rewards = [0]   # 10-episode moving average of all_rewards
policy_net.train()  # train mode: enables dropout

for episode in range(max_episode_num):
    state = train_env.reset()
    log_probs = []
    rewards = []
    profits = []
    hold_profits = []

    for steps in range(train_env.steps_left):
        # Periodic render so long runs can be monitored.
        if steps % 500 == 0:
            train_env.render()

        action, log_prob = policy_net.get_action(state, device)
        new_state, reward, done, _ = train_env.step(action)

        log_probs.append(log_prob)
        rewards.append(reward)
        profits.append(train_env._get_profit())
        hold_profits.append(train_env._get_hold_profit())

        # BUG FIX: advance the observation. The original never updated
        # `state`, so the policy acted on the initial observation forever.
        state = new_state

        if done:
            # Evaluate model: track per-episode mean reward and its
            # 10-episode moving average.
            all_rewards.append(np.mean(rewards))
            avg_rewards.append(np.mean(all_rewards[-10:]))
            if all_rewards[-1] > all_rewards[-2]:
                # NOTE(review): the source was truncated here. Checkpointing
                # the improved model is the conventional action and matches
                # the path loaded by the evaluation cell — confirm intent.
                torch.save(policy_net.state_dict(), 'model/state_dict2.pt')
            break  # episode finished; start the next one
c_out=test_env.action_space.n, seq_len=test_env.observation_space.shape[1]) #acer = ACER(model=model, memory=memory, config=acer_config) if os.path.exists('./save/model.m5'): model.load_state_dict(torch.load('./save/model.m5')) avg_t = 0 avg_r = 0 obs = test_env.reset() done = False while True: prob = model.pi(torch.FloatTensor(obs).unsqueeze(0)) action = Categorical(prob).sample().item() obs_prime, reward, done, _ = test_env.step(action) if reward is None: continue print("current price : {}".format(test_env._get_current_price())) print("action : {}".format(action)) print("prob : {}".format(prob)) test_env.render(mode='human') obs = obs_prime print('Finished.')