def test_model(net: model.DQNModel, device: torch.device,
               gw_config) -> Tuple[float, float]:
    test_env = magent.GridWorld(gw_config, map_size=MAP_SIZE)
    deer_handle, tiger_handle = test_env.get_handles()

    def reset_env():
        test_env.reset()
        test_env.add_walls(method="random",
                           n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
        test_env.add_agents(deer_handle, method="random", n=COUNT_DEERS)
        test_env.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)

    env = data.MAgentEnv(test_env, tiger_handle, reset_env_func=reset_env)
    preproc = model.MAgentPreprocessor(device)
    agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(),
                                device, preprocessor=preproc)

    obs = env.reset()
    steps = 0
    rewards = 0.0

    while True:
        actions = agent(obs)[0]
        obs, r, dones, _ = env.step(actions)
        steps += len(obs)
        rewards += sum(r)
        if dones[0]:
            break

    return rewards / COUNT_TIGERS, steps / COUNT_TIGERS
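What follows is not part of the original listing, but a minimal sketch of how such a test function could be called periodically from a training loop. The SummaryWriter, the TEST_EVERY_BATCH interval, and the maybe_test helper are hypothetical names introduced only for illustration:

from torch.utils.tensorboard import SummaryWriter

TEST_EVERY_BATCH = 1000   # assumed evaluation interval, not from the listing


def maybe_test(batch_idx: int, net: model.DQNModel, device: torch.device,
               gw_config, writer: SummaryWriter):
    # Evaluate the greedy policy every TEST_EVERY_BATCH training batches
    # and log the per-tiger averages returned by test_model() above.
    if batch_idx % TEST_EVERY_BATCH != 0:
        return
    mean_reward, mean_steps = test_model(net, device, gw_config)
    writer.add_scalar("test_reward", mean_reward, batch_idx)
    writer.add_scalar("test_steps", mean_steps, batch_idx)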
def test_model(net_deer: model.DQNModel, net_tiger: model.DQNModel,
               device: torch.device, gw_config) -> \
        Tuple[float, float, float, float]:
    test_env = magent.GridWorld(gw_config, map_size=MAP_SIZE)
    deer_handle, tiger_handle = test_env.get_handles()

    def reset_env():
        test_env.reset()
        test_env.add_walls(method="random",
                           n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
        test_env.add_agents(deer_handle, method="random", n=COUNT_DEERS)
        test_env.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)

    # Both wrappers share the same underlying GridWorld; the deer wrapper
    # is created as the slave view, so the shared world is reset and
    # stepped only once per iteration.
    deer_env = data.MAgentEnv(test_env, deer_handle,
                              reset_env_func=reset_env, is_slave=True)
    tiger_env = data.MAgentEnv(test_env, tiger_handle,
                               reset_env_func=reset_env)
    preproc = model.MAgentPreprocessor(device)
    deer_agent = ptan.agent.DQNAgent(
        net_deer, ptan.actions.ArgmaxActionSelector(),
        device, preprocessor=preproc)
    tiger_agent = ptan.agent.DQNAgent(
        net_tiger, ptan.actions.ArgmaxActionSelector(),
        device, preprocessor=preproc)

    t_obs = tiger_env.reset()
    d_obs = deer_env.reset()
    deer_steps = 0
    deer_rewards = 0.0
    tiger_steps = 0
    tiger_rewards = 0.0

    while True:
        d_actions = deer_agent(d_obs)[0]
        t_actions = tiger_agent(t_obs)[0]
        d_obs, d_r, d_dones, _ = deer_env.step(d_actions)
        t_obs, t_r, t_dones, _ = tiger_env.step(t_actions)
        tiger_steps += len(t_obs)
        tiger_rewards += sum(t_r)
        if t_dones[0]:
            break
        deer_steps += len(d_obs)
        deer_rewards += sum(d_r)
        if d_dones[0]:
            break

    return deer_rewards / COUNT_DEERS, deer_steps / COUNT_DEERS, \
           tiger_rewards / COUNT_TIGERS, tiger_steps / COUNT_TIGERS
def test_model(net: model.DQNModel, device: torch.device,
               gw_config) -> Tuple[float, float, float, float]:
    test_env = magent.GridWorld(gw_config, map_size=MAP_SIZE)
    group_a, group_b = test_env.get_handles()

    def reset_env():
        test_env.reset()
        test_env.add_walls(method="random",
                           n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
        test_env.add_agents(group_a, method="random", n=COUNT_AGENTS_1)
        test_env.add_agents(group_b, method="random", n=COUNT_AGENTS_2)

    # The two wrappers share one GridWorld: group A is the slave view,
    # group B is the master and also enforces the episode length cap.
    env_a = data.MAgentEnv(test_env, group_a,
                           reset_env_func=reset_env, is_slave=True)
    env_b = data.MAgentEnv(test_env, group_b,
                           reset_env_func=reset_env,
                           steps_limit=MAX_EPISODE)
    preproc = model.MAgentPreprocessor(device)
    # Both groups are driven by the same network (self-play evaluation).
    agent_a = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(),
                                  device, preprocessor=preproc)
    agent_b = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(),
                                  device, preprocessor=preproc)

    a_obs = env_a.reset()
    b_obs = env_b.reset()
    a_steps = 0
    a_rewards = 0.0
    b_steps = 0
    b_rewards = 0.0

    while True:
        a_actions = agent_a(a_obs)[0]
        b_actions = agent_b(b_obs)[0]
        a_obs, a_r, a_dones, _ = env_a.step(a_actions)
        b_obs, b_r, b_dones, _ = env_b.step(b_actions)
        a_steps += len(a_obs)
        a_rewards += sum(a_r)
        if a_dones[0]:
            break
        b_steps += len(b_obs)
        b_rewards += sum(b_r)
        if b_dones[0]:
            break

    return a_rewards / COUNT_AGENTS_1, a_steps / COUNT_AGENTS_1, \
           b_rewards / COUNT_AGENTS_2, b_steps / COUNT_AGENTS_2
net_deer = model.DQNModel(
    deer_obs.spaces[0].shape, deer_obs.spaces[1].shape,
    m_env.get_action_space(deer_handle)[0]).to(device)
tgt_net_deer = ptan.agent.TargetNet(net_deer)
print(net_deer)
net_tiger = model.DQNModel(
    tiger_obs.spaces[0].shape, tiger_obs.spaces[1].shape,
    m_env.get_action_space(tiger_handle)[0]).to(device)
tgt_net_tiger = ptan.agent.TargetNet(net_tiger)
print(net_tiger)

action_selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=PARAMS.epsilon_start)
epsilon_tracker = common.EpsilonTracker(action_selector, PARAMS)
preproc = model.MAgentPreprocessor(device)
# One agent object drives both groups: GroupDQNAgent applies net_deer and
# net_tiger to their respective groups' observations.
agent = model.GroupDQNAgent([net_deer, net_tiger], action_selector,
                            device, preprocessor=preproc)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, PARAMS.gamma, vectorized=True)
# Separate replay buffers (created without an attached experience source)
# and separate optimizers, since the two networks are trained independently.
deer_buffer = ptan.experience.ExperienceReplayBuffer(
    None, PARAMS.replay_size)
tiger_buffer = ptan.experience.ExperienceReplayBuffer(
    None, PARAMS.replay_size)
deer_optimizer = optim.Adam(net_deer.parameters(), lr=PARAMS.learning_rate)
tiger_optimizer = optim.Adam(net_tiger.parameters(),
                             lr=PARAMS.learning_rate)
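As an illustration rather than the book's code, here is a heavily hedged sketch of how one training iteration might route transitions into the two buffers and update each network. split_by_group() is a hypothetical helper (the real routing depends on how the combined environment orders the two groups), and common.calc_loss_dqn, epsilon_tracker.frame(), PARAMS.replay_initial, PARAMS.batch_size, and PARAMS.target_net_sync are assumed to exist in the chapter's library with the usual book-style behavior:

for step_idx, exp in enumerate(exp_source):
    # Hypothetical routing: decide which group each transition belongs to.
    deer_exps, tiger_exps = split_by_group(exp)
    for e in deer_exps:
        deer_buffer._add(e)     # ptan's internal add, used when no source is attached
    for e in tiger_exps:
        tiger_buffer._add(e)
    epsilon_tracker.frame(step_idx)    # assumed epsilon-decay helper

    if len(deer_buffer) < PARAMS.replay_initial:
        continue

    # Independent DQN updates for the two networks.
    for buf, net_, tgt_, opt_ in (
            (deer_buffer, net_deer, tgt_net_deer, deer_optimizer),
            (tiger_buffer, net_tiger, tgt_net_tiger, tiger_optimizer)):
        opt_.zero_grad()
        loss = common.calc_loss_dqn(
            buf.sample(PARAMS.batch_size), net_, tgt_.target_model,
            gamma=PARAMS.gamma, device=device)   # signature assumed
        loss.backward()
        opt_.step()

    if step_idx % PARAMS.target_net_sync == 0:
        tgt_net_deer.sync()
        tgt_net_tiger.sync()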