import os

import torch
from torch.distributions import Categorical

# Build the model from the test environment's dimensions and wrap it in a PPO agent.
model = MODEL(c_in=test_env.observation_space.shape[0],
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])
model = model.to(device)
agent = PPO(model=model, memory=memory, config=config, device=device)

# Resume from a saved checkpoint if one exists.
if os.path.exists('./save/model.m5'):
    agent.model.load_state_dict(torch.load('./save/model.m5'))

avg_t = 0
avg_r = 0
for epi in range(1, n_episodes + 1):
    print("episode {} start!".format(epi))
    obs = test_env.reset()
    done = False
    while not done:
        t = 0
        action_list = []
        while t < T_horizon:
            # Sample an action from the current policy distribution.
            prob = agent.model.pi(torch.FloatTensor(obs).unsqueeze(0).to(device))
            action = Categorical(prob).sample().item()
            obs_prime, reward, done, _ = test_env.step(action)
            if reward is None:
                continue
            action_list.append(action)
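# A minimal sketch, not the source's PPO class: the clipped surrogate objective
# that PPO optimizes from rollouts like the one above. `eps_clip` and the
# log-probability/advantage inputs are illustrative assumptions.
def ppo_clip_loss(new_log_probs, old_log_probs, advantages, eps_clip=0.2):
    # Probability ratio between the current policy and the rollout-time policy.
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages
    # Clipped objective: take the pessimistic (minimum) surrogate and negate it
    # so a gradient-descent optimizer maximizes expected advantage.
    return -torch.min(surr1, surr2).mean()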
# device = torch.device('cpu')
print(f"Device used: {device}")

policy_net = PolicyNetwork(input_dim, output_dim1, output_dim2, hidden_dim,
                           n_layers=lstm_layers)

# Loading the best model
model_name = 'model/state_dict3.pt'
policy_net.load_state_dict(torch.load(model_name))
policy_net.to(device)

max_episode_num = 1_000
all_rewards = [0]
avg_rewards = [0]

# train() tells the model how to treat dropout (train: uses dropout, eval: does not).
policy_net.train()

for episode in range(max_episode_num):
    state = train_env.reset()
    log_probs = []
    rewards = []
    profits = []
    hold_profits = []
    for steps in range(train_env.steps_left):
        if steps % 500 == 0:
            train_env.render()
        # Sample an action and keep its log-probability for the policy-gradient update.
        action, log_prob = policy_net.get_action(state, device)
        new_state, reward, done, _ = train_env.step(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        profits.append(train_env._get_profit())
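# A hedged sketch, not shown in the source, of the REINFORCE-style update such a
# loop typically ends with: discounted returns are computed from `rewards`, then
# each stored log-probability is weighted by its return. `optimizer` and `gamma`
# are assumptions for illustration.
def update_policy(log_probs, rewards, optimizer, gamma=0.99):
    # Discounted return G_t for every step, accumulated backwards through the episode.
    returns = []
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    # Normalizing returns keeps the gradient scale comparable across episodes.
    returns = (returns - returns.mean()) / (returns.std() + 1e-9)
    # Policy-gradient loss: maximizing expected return = minimizing -log_prob * G_t.
    loss = torch.stack([-lp * G for lp, G in zip(log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()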
print(f"Device used: {device}") policy_net = PolicyNetwork(input_dim, output_dim1, output_dim2, hidden_dim, n_layers=lstm_layers) # Loading the best model model_name = 'model/state_dict2.pt' policy_net.load_state_dict(torch.load(model_name)) policy_net.to(device) policy_net.eval( ) # to tell the model how to treat dropout (train: uses dropout, eval: do not use dropout) state = test_env.reset() log_probs = [] rewards = [] profits = [] hold_profits = [] for steps in range(test_env.steps_left): if steps % 500 == 0: test_env.render() with torch.no_grad(): action, log_prob = policy_net.get_action(state, device) new_state, reward, done, _ = test_env.step(action) log_probs.append(log_prob) rewards.append(reward)