def test(self, episodes, render=False, log=False, record=False):
    """Run `episodes` evaluation episodes with exploration disabled; optionally render,
    log per-agent rewards to TensorBoard, and record videos. Returns the per-agent mean
    episode reward."""
    self.model.eval()
    env = self.env
    if record:
        env = Monitor(self.env_fn(), directory=os.path.join(self.path, 'recordings'),
                      force=True, video_callable=lambda episode_id: True)

    with torch.no_grad():
        test_rewards = []
        total_test_steps = 0
        for ep in range(episodes):
            terminal = False
            obs_n = env.reset()
            step = 0
            ep_reward = [0 for _ in range(self.model.n_agents)]

            while not terminal:
                if render:
                    env.render()

                torch_obs_n = torch.FloatTensor(obs_n).to(self.device).unsqueeze(0)
                action_n = self._select_action(self.model, torch_obs_n, explore=False)
                next_obs_n, reward_n, done_n, info = env.step(action_n)
                terminal = all(done_n) or step >= self.episode_max_steps

                obs_n = next_obs_n
                step += 1
                for i, r_n in enumerate(reward_n):
                    ep_reward[i] += r_n

            total_test_steps += step
            test_rewards.append(ep_reward)

        test_rewards = np.array(test_rewards).mean(axis=0)

        if log:  # log - test
            for i, r_n in enumerate(test_rewards):
                self.writer.add_scalar('agent_{}/eval_reward'.format(i), r_n, self._step_iter)
            self.writer.add_scalar('_overall/eval_reward', sum(test_rewards), self._step_iter)
            self.writer.add_scalar('_overall/test_ep_steps', total_test_steps / episodes,
                                   self._step_iter)

    if record:
        env.close()

    return test_rewards
def test(self, episodes, render=False, log=False, record=False):
    """Evaluation loop for the thought-sharing model: each agent encodes its observation
    into a "thought", thoughts are averaged with the right-hand neighbour for
    `self.share_iter` rounds, and greedy actions are taken from the resulting policies.
    Returns the per-agent mean episode reward."""
    self.model.eval()
    env = self.env
    if record:
        env = Monitor(self.env_fn(), directory=os.path.join(self.path, 'recordings'),
                      force=True, video_callable=lambda episode_id: True)

    with torch.no_grad():
        test_rewards = []
        total_test_steps = 0
        for ep in range(episodes):
            terminal = False
            obs_n = env.reset()
            step = 0
            ep_reward = [0 for _ in range(self.model.n_agents)]
            self.model.init_hidden(device=self.device)

            while not terminal:
                if render:
                    env.render()

                torch_obs_n = torch.FloatTensor(obs_n).to(self.device).unsqueeze(0)

                # encode every agent's observation into a "thought"
                thoughts = []
                for agent_i in range(self.model.n_agents):
                    thoughts.append(self.model.agent(agent_i).get_thought(torch_obs_n[:, agent_i]))

                # share thoughts: average each agent's thought with its right-hand neighbour
                for i in range(self.share_iter):
                    for agent_i in range(self.model.n_agents):
                        thoughts[agent_i] = (thoughts[agent_i]
                                             + thoughts[(agent_i + 1) % len(thoughts)]) / 2
                thoughts = torch.stack(thoughts)

                action_n = []
                for agent_i in range(self.model.n_agents):
                    # assuming every other agent is a neighbour as of now
                    _neighbours = list(range(self.model.n_agents))
                    _neighbours.remove(agent_i)

                    logits = self.model.agent(agent_i)(thoughts[agent_i])
                    prob = F.softmax(logits, dim=1)
                    action = prob.argmax(1).item()
                    # action = prob.multinomial(num_samples=1).detach().item()

                    if log and step == 0 and ep == 0:
                        log_prob = F.log_softmax(logits, dim=1)
                        entropy = -(log_prob * prob).sum(1)
                        self.writer.add_scalar('agent_{}/entropy'.format(agent_i),
                                               entropy, self._step_iter)
                    action_n.append(action)

                next_obs_n, reward_n, done_n, info = env.step(action_n)
                terminal = all(done_n) or step >= self.episode_max_steps

                obs_n = next_obs_n
                step += 1
                for i, r_n in enumerate(reward_n):
                    ep_reward[i] += r_n

            total_test_steps += step
            test_rewards.append(ep_reward)

        test_rewards = np.array(test_rewards).mean(axis=0)

        if log:  # log - test
            for i, r_n in enumerate(test_rewards):
                self.writer.add_scalar('agent_{}/eval_reward'.format(i), r_n, self._step_iter)
            self.writer.add_scalar('_overall/eval_reward', sum(test_rewards), self._step_iter)
            self.writer.add_scalar('_overall/test_ep_steps', total_test_steps / episodes,
                                   self._step_iter)

    if record:
        env.close()

    return test_rewards
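# The neighbour-averaging "thought sharing" step used inline in test() above, pulled out as a
# minimal standalone sketch. The helper name `share_thoughts` is illustrative only (it does not
# exist in the source); it assumes `thoughts` is a list of equally shaped torch tensors, one per
# agent, and it mirrors the in-place update of the inline loop.
def share_thoughts(thoughts, share_iter):
    """Average each agent's thought with its right-hand neighbour (wrapping around) for
    `share_iter` rounds; later agents see already-updated neighbours, exactly as in the
    inline loop of test()."""
    n = len(thoughts)
    for _ in range(share_iter):
        for agent_i in range(n):
            thoughts[agent_i] = (thoughts[agent_i] + thoughts[(agent_i + 1) % n]) / 2
    return thoughts

# example with hypothetical shapes: shared = share_thoughts([torch.randn(1, 64) for _ in range(3)], 2)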
parser = argparse.ArgumentParser(description='Interactive Agent for ma-gym')
parser.add_argument('--env', default='Checkers-v0',
                    help='Name of the environment (default: %(default)s)')
parser.add_argument('--episodes', type=int, default=1,
                    help='episodes (default: %(default)s)')
args = parser.parse_args()

print("Enter the actions of all agents together and press enter "
      "(e.g. '11<enter>' means action 1 for agent 1 and action 1 for agent 2)")
env = gym.make('ma_gym:{}'.format(args.env))
env = Monitor(env, directory='recordings', force=True)
for ep_i in range(args.episodes):
    done_n = [False for _ in range(env.n_agents)]
    ep_reward = 0

    obs_n = env.reset()
    env.render()
    while not all(done_n):
        action_n = [int(_) for _ in input('Action:')]
        obs_n, reward_n, done_n, _ = env.step(action_n)
        ep_reward += sum(reward_n)
        env.render()
    print('Episode #{} Reward: {}'.format(ep_i, ep_reward))
env.close()
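# Usage note (the filename below is an assumption, not given in this excerpt): if the script
# above is saved as interactive_agent.py it can be run with, e.g.,
#   python interactive_agent.py --env Checkers-v0 --episodes 2
# Each 'Action:' prompt then expects one digit per agent, e.g. '11' for action 1 by both agents.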
class Tester:
    """Loads the requested model ids, runs them in the configured ma-gym environment for up to
    `step_number` steps, and reports per-agent move and win counts."""

    def __init__(self, models, env_name="PongDuel-v0", render=True, video=True,
                 step_number=1000, log_after_steps=200, log_on_win=True):
        self._models = models  # list of model ids to test; "all" expands to every registered model
        if "all" in self._models:
            self._models = [i for i in registered_models["all"]]
        self._render = render
        self._video = video
        self._step_number = step_number
        self._log_after_steps = log_after_steps
        self._log_on_win = log_on_win
        self._env_name = env_name
        self._env = None

    def log_score(self, step, score, msg=""):
        print("Step: {0:05d} Score: {1}".format(step, score), end="")
        if msg != "":
            print(" [{}]".format(msg))
        else:
            print("")

    def run_tests(self):
        print("Running tests for model IDs: {}".format(self._models))
        print("-" * 10)
        models_score_summary = {}
        for model_id in self._models:
            print("Selected model_id: {}".format(model_id))
            model = AutoLoadModel(model_id)
            score = {"agent_0": {"moves": 0, "wins": 0},
                     "agent_1": {"moves": 0, "wins": 0}}

            self._env = gym.make(self._env_name)
            if self._video:
                if type(model_id) is list:
                    model_id = "{}_VS_{}".format(model_id[0], model_id[1])
                output_directory = "recordings/{}".format(model_id)
                self._env = Monitor(self._env, directory=output_directory,
                                    video_callable=lambda episode_id: True, force=True)

            obs_n = self._env.reset()
            for step in range(self._step_number):
                # render env
                if self._render:
                    self._env.render()

                # select actions
                actions, actions_as_list = model.get_agents_actions(obs_n)

                # update moves counter
                for an in ["agent_0", "agent_1"]:
                    if actions[an] in [1, 2]:
                        score[an]["moves"] += 1

                # execute actions
                obs_n, reward_n, done_n, info = self._env.step(actions_as_list)

                # update score
                if any(reward_n):
                    score["agent_0"]["wins"] += reward_n[0]
                    score["agent_1"]["wins"] += reward_n[1]
                    if self._log_on_win:
                        self.log_score(step, score, "win")
                models_score_summary[model_id] = score

                if step % self._log_after_steps == 0:
                    self.log_score(step, score)

                if all(done_n):
                    break

            self.log_score(step, score, "end")
            print("-" * 10)
            self._env.close()

        # Score summary
        print("Summary:")
        for k, v in models_score_summary.items():
            n_moves = 0
            n_wins = 0
            print("Model {}:".format(k))
            for a, b in v.items():
                print(a, b)
                n_moves += b["moves"]
                n_wins += b["wins"]
            print("Average move count: {}".format(n_moves / 2))
            print("Total move count: {}".format(n_moves))
            print("Total win count: {}".format(n_wins))
            print("")
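# Hedged usage sketch for Tester: assumes the module defining Tester, AutoLoadModel and
# registered_models is importable, and that the requested model ids are registered (the "all"
# shortcut expands to every entry in registered_models["all"], as in __init__).
if __name__ == '__main__':
    tester = Tester(models=["all"],        # or a list of specific model ids
                    env_name="PongDuel-v0",
                    render=False,          # disable on-screen rendering for batch runs
                    video=False,           # True writes videos under recordings/<model_id>
                    step_number=1000,
                    log_after_steps=200,
                    log_on_win=True)
    tester.run_tests()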