import argparse

from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py',
                        help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation, info_callback=None,
                        shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env, i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
        # display rewards
        #for agent in env.world.agents:
        #    print(agent.name + " reward: %0.3f" % env._get_reward(agent))
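# A minimal scripted-rollout sketch against the same MultiAgentEnv API, useful for
# smoke-testing a scenario without a keyboard. The one-hot action encoding below is
# an assumption: it matches MultiAgentEnv's default discrete-action handling, but a
# scenario configured differently may expect another action format.
import numpy as np

def random_rollout(scenario_path='simple.py', steps=100):
    scenario = scenarios.load(scenario_path).Scenario()
    env = MultiAgentEnv(scenario.make_world(), scenario.reset_world,
                        scenario.reward, scenario.observation)
    obs_n = env.reset()
    for _ in range(steps):
        act_n = []
        for space in env.action_space:
            onehot = np.zeros(space.n)  # assumes Discrete per-agent action spaces
            onehot[np.random.randint(space.n)] = 1.0
            act_n.append(onehot)
        obs_n, reward_n, done_n, _ = env.step(act_n)
        env.render()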
import time
from pathlib import Path

import imageio
import torch
from torch.autograd import Variable

# Project-local helpers (DirectoryManager, load_dict_from_json, set_seeds,
# init_from_save, make_env, EpisodeRecorder and the scripted policies) are
# assumed to be importable from this repository's utility modules.


def evaluate(config):
    DirectoryManager.root = Path(config.root)
    if config.seed_num is None:
        all_seeds = list(
            (DirectoryManager.root / config.storage_name /
             f"experiment{config.experiment_num}").iterdir())
        # take the numeric suffix of e.g. "seed1" (str.strip('seed') would strip
        # any leading/trailing 's', 'e', 'd' characters, not the prefix "seed")
        config.seed_num = all_seeds[0].stem[len('seed'):]

    # Creates paths and directories
    seed_path = (DirectoryManager.root / config.storage_name /
                 f"experiment{config.experiment_num}" / f"seed{config.seed_num}")
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)

    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / f'model_ep{config.incremental}.pt'
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks
    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects
    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents
    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward
    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
    env_params['use_max_speed'] = False

    # Initializes model and environment
    set_seeds(config.rollout_seed)
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()
        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()
        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))
        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))
        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()
        if config.save_gifs:
            # accumulate frames across episodes into a single gif
            frames = [] if ep_i == 0 else frames
            frames.append(env.render('rgb_array')[0])
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions = algorithm.select_action(
                torch_obs, is_exploring=config.noise_scale is not None)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # steps forward in the environment
            obs, rewards, done, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)

            # record frames
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human', close=False)

            # done is a per-agent list, so check all agents rather than its truthiness
            if all(done) and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        total_reward.append(ep_recorder.get_total_reward())

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)
        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}"
               f"_seed{config.seed_num}_{gif_num}.gif").exists():
            gif_num += 1
        imageio.mimsave(
            str(gif_path /
                f"{env_params['env_name']}__experiment{config.experiment_num}"
                f"_seed{config.seed_num}_{gif_num}.gif"),
            frames, duration=ifi)
    env.close()

    return total_reward
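# A minimal driver sketch for evaluate(): every config field the function reads is
# listed explicitly so nothing is missing at attribute-access time. All values are
# illustrative assumptions, not the project's actual defaults.
from argparse import Namespace

eval_config = Namespace(
    root='save',                 # DirectoryManager root directory
    storage_name='experiments',  # <root>/<storage_name>/experiment<N>/seed<M>
    experiment_num=1,
    seed_num=None,               # None -> pick the first seed directory found
    rollout_seed=1,
    incremental=None,            # e.g. 100 -> load incrementals/model_ep100.pt
    last_model=False,            # False -> load the '*best.pt' checkpoint
    shuffle_landmarks=None,      # None -> keep the saved world_params value
    color_objects=None,
    small_agents=None,
    individual_reward=None,
    use_dense_rewards=None,
    runner_prey=False,
    rusher_predators=False,
    pendulum_agent=None,
    interactive_agent=None,
    env_name='spread',           # only checked by the '*tag' policy asserts
    render=True,
    save_gifs=False,
    n_episodes=10,
    episode_length=100,
    noise_scale=None,            # None -> act deterministically
    interrupt_episode=True,
    fps=30,
)

if __name__ == '__main__':
    total_reward = evaluate(eval_config)
    print(f"episode rewards: {total_reward}")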
import pickle
import sys
import time

import numpy as np
import tensorflow as tf

import maddpg.common.tf_util as U
from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

# make_env and get_trainers are assumed to be defined as in the MADDPG training
# script this function is adapted from.


def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')

        # reload the scenario so the interactive environment below can be built
        # (mirrors the loading convention used by make_env)
        scenario = scenarios.load(arglist.scenario + '.py').Scenario()
        # create world
        world = scenario.make_world()
        # create multiagent environment
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, info_callback=None,
                            shared_viewer=True)
        env.window_pos = 'right'
        # render call to create viewer window (necessary only for interactive policies)
        env.render()
        # create an interactive policy for the last agent only
        policy = InteractivePolicy(env, -1)
        # execution loop
        obs_n = env.reset()
        while True:
            # query for action from each agent's policy
            act_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]  # trained policies
            act_n[-1] = policy.action(obs_n[-1])  # interactive keyboard policy
            # step environment
            new_obs_n, reward_n, done_n, info_n = env.step(act_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], act_n[i], reward_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(reward_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for game over
            try:
                if scenario.game_over:
                    sys.exit(0)
            except AttributeError:
                pass

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # render all agent views
            time.sleep(arglist.delay)
            env.render()

            # display rewards
            #for agent in env.world.agents:
            #    print(agent.name + " reward: %0.3f" % env._get_reward(agent))

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                U.save_state(arglist.plots_dir, saver=saver)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
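# A possible parse_args for play(), covering every arglist field read above plus the
# 'delay' used to slow rendering. Flag names and defaults are assumptions modeled on
# the MADDPG reference trainer, not this project's actual CLI.
import argparse

def parse_args():
    parser = argparse.ArgumentParser("Interactive play with trained MADDPG agents")
    parser.add_argument("--scenario", type=str, default="simple")
    parser.add_argument("--max-episode-len", type=int, default=25)
    parser.add_argument("--num-episodes", type=int, default=60000)
    parser.add_argument("--num-adversaries", type=int, default=0)
    parser.add_argument("--good-policy", type=str, default="maddpg")
    parser.add_argument("--adv-policy", type=str, default="maddpg")
    parser.add_argument("--save-dir", type=str, default="/tmp/policy/")
    parser.add_argument("--save-rate", type=int, default=1000)
    parser.add_argument("--load-dir", type=str, default="")
    parser.add_argument("--restore", action="store_true", default=False)
    parser.add_argument("--display", action="store_true", default=False)
    parser.add_argument("--benchmark", action="store_true", default=False)
    parser.add_argument("--benchmark-iters", type=int, default=100000)
    parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/")
    parser.add_argument("--exp-name", type=str, default="exp")
    parser.add_argument("--delay", type=float, default=0.0)  # seconds slept per render
    return parser.parse_args()

if __name__ == '__main__':
    play(parse_args())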
import os.path as osp
import pickle
import time

import imageio
import numpy as np
import torch
from matplotlib import cm
from torch.autograd import Variable

# As above, project-local helpers (DirectoryManager, load_dict_from_json,
# init_from_save, make_env, EpisodeRecorder, onehot_from_logits, U.directory_tree
# and the scripted policies) are assumed to be importable from this repository.


def evaluate(config):
    if config.seed_num is None:
        all_seeds = list(
            (DirectoryManager.root / config.storage_name /
             f"experiment{config.experiment_num}").iterdir())
        # take the numeric suffix of e.g. "seed1" (see note in the first evaluate)
        config.seed_num = all_seeds[0].stem[len('seed'):]

    # Creates paths and directories
    seed_path = (DirectoryManager.root / config.storage_name /
                 f"experiment{config.experiment_num}" / f"seed{config.seed_num}")
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)

    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / f'model_ep{config.incremental}.pt'
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks
    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects
    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents
    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward
    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
    env_params['use_max_speed'] = False

    # Initializes model and environment
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()
        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()
        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))
        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))
        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []
    all_episodes_agent_embeddings = []
    all_episodes_coach_embeddings = []
    all_trajs = []
    override_color = None
    color_agents = True

    if env_params['env_name'] == 'bounce':
        env.agents[0].size = 1. * env.agents[0].size
        env.world.overwrite = config.overwrite
    elif env_params['env_name'] == 'spread':
        color_agents = False
    elif env_params['env_name'] == 'compromise':
        env.agents[0].lightness = 0.9
        env.world.landmarks[0].lightness = 0.9
        env.agents[1].lightness = 0.5
        env.world.landmarks[1].lightness = 0.5
        # cmo = plt.cm.get_cmap('viridis')
        # override_color = [np.array(cmo(float(i) / float(2))[:3]) for i in range(2)]
        env.world.overwrite = config.overwrite

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        agent_embeddings = []
        coach_embeddings = []
        traj = []
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()
        if config.save_gifs:
            # accumulate frames across episodes into a single gif
            frames = [] if ep_i == 0 else frames
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions (and per-agent embeddings) as torch Variables
            torch_actions, torch_embed = algorithm.select_action(
                torch_obs,
                is_exploring=config.noise_scale is not None,
                return_embed=True)
            torch_total_obs = torch.cat(torch_obs, dim=-1)
            coach_embed = onehot_from_logits(algorithm.coach.model(torch_total_obs))
            coach_embeddings.append(coach_embed.data.numpy().squeeze())
            # convert actions and embeddings to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            embeds = [emb.data.numpy().squeeze() for emb in torch_embed]
            agent_embeddings.append(embeds)
            # steps forward in the environment
            next_obs, rewards, dones, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)
            traj.append((obs, actions, next_obs, rewards, dones))
            obs = next_obs

            # color agents according to their current embedding
            colors = list(cm.get_cmap('Set1').colors[:len(embeds[0])])
            if override_color is not None:
                colors[0] = override_color[0]
                colors[2] = override_color[1]
            if color_agents:
                for agent, emb in zip(env.agents, embeds):
                    agent.color = colors[np.argmax(emb)]

            # record frames
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')

            if all(dones) and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        # print(ep_recorder.get_total_reward())
        total_reward.append(ep_recorder.get_total_reward())
        all_episodes_agent_embeddings.append(agent_embeddings)
        all_episodes_coach_embeddings.append(coach_embeddings)
        all_trajs.append(traj)

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)
        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}"
               f"_seed{config.seed_num}_{gif_num}.gif").exists():
            gif_num += 1
        imageio.mimsave(
            str(gif_path /
                f"{env_params['env_name']}__experiment{config.experiment_num}"
                f"_seed{config.seed_num}_{gif_num}.gif"),
            frames, duration=ifi)
    env.close()

    embeddings = {
        'agents': all_episodes_agent_embeddings,
        'coach': all_episodes_coach_embeddings
    }
    save_folder = dir_manager.experiment_dir if config.save_to_exp_folder else dir_manager.seed_dir
    embeddings_path = U.directory_tree.uniquify(save_folder / f"{config.file_name_to_save}.pkl")
    trajs_path = osp.splitext(embeddings_path)[0] + "_trajs.pkl"

    with open(embeddings_path, 'wb') as fp:
        pickle.dump(embeddings, fp)
    with open(trajs_path, 'wb') as fp:
        pickle.dump(all_trajs, fp)

    return total_reward, str(embeddings_path)
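# Usage sketch for the artifacts written above: the embeddings pickle path is this
# evaluate()'s second return value, and the dict keys ('agents', 'coach') match the
# structure saved there. The config object is assumed to be built as for the first
# evaluate(), plus the extra fields read here (overwrite, save_to_exp_folder,
# file_name_to_save).

if __name__ == '__main__':
    total_reward, embeddings_path = evaluate(eval_config)
    with open(embeddings_path, 'rb') as fp:
        embeddings = pickle.load(fp)
    agent_embed = embeddings['agents']  # [episode][step][agent] -> embedding array
    coach_embed = embeddings['coach']   # [episode][step] -> coach one-hot vector
    print(f"episodes: {len(agent_embed)}, episode rewards: {total_reward}")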