def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(str(model_path))
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array', close=False)[0])
        env.render('human', close=False)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array', close=False)[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human', close=False)
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)
    env.close()
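# A minimal, hypothetical entry point for run() above. The actual argument
# parser is not part of this snippet; the flag names below simply mirror the
# config fields the function reads and are assumptions, not the repo's CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_id', help='name of the scenario/environment')
    parser.add_argument('model_name', help='model directory under ./models')
    parser.add_argument('run_num', type=int)
    parser.add_argument('--incremental', type=int, default=None,
                        help='load incremental/model_ep<N>.pt instead of model.pt')
    parser.add_argument('--save_gifs', action='store_true')
    parser.add_argument('--n_episodes', type=int, default=10)
    parser.add_argument('--episode_length', type=int, default=25)
    parser.add_argument('--fps', type=int, default=30)
    run(parser.parse_args())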
def init_env():
    env = make_env(env_id, discrete_action=discrete_action, benchmark=True)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def run(config):
    # Load model
    if not os.path.exists(config.model_cp_path):
        sys.exit("Path to the model checkpoint %s does not exist" %
                 config.model_cp_path)

    # Load scenario config
    sce_conf = {}
    if config.sce_conf_path is not None:
        with open(config.sce_conf_path) as cf:
            sce_conf = json.load(cf)
            print('Special config for scenario:', config.env_path)
            print(sce_conf)

    # Initiate env
    env = make_env(config.env_path, sce_conf,
                   discrete_action=config.discrete_action)

    # Create model
    num_in_pol = env.observation_space[0].shape[0]
    if config.discrete_action:
        num_out_pol = env.action_space[0].n
    else:
        num_out_pol = env.action_space[0].shape[0]
    policy = PolicyNetwork(num_in_pol, num_out_pol, config.hidden_dim,
                           discrete_action=config.discrete_action)
    policy.load_state_dict(torch.load(config.model_cp_path))
    policy.eval()

    for ep_i in range(config.n_episodes):
        obs = env.reset()
        episode_reward = 0.0
        for step_i in range(config.episode_length):
            # Rearrange observations to fit in the model
            torch_obs = Variable(torch.Tensor(np.vstack(obs)),
                                 requires_grad=False)
            actions = policy(torch_obs)
            # Convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in actions]
            next_obs, rewards, dones, infos = env.step(agent_actions)
            episode_reward += sum(rewards) / sce_conf['nb_agents']
            env.render()
            if dones[0]:
                break
            obs = next_obs
        print(f'Episode {ep_i + 1} finished after {step_i + 1} steps '
              f'with return {episode_reward}.')
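# PolicyNetwork is defined elsewhere in the codebase. Below is a minimal
# sketch of a network matching the constructor calls used here and in the
# CMA-ES trainer (num_in_pol, num_out_pol, hidden_dim, discrete_action).
# This is an assumption for illustration, not the actual class.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PolicyNetwork(nn.Module):
    def __init__(self, num_in_pol, num_out_pol, hidden_dim,
                 discrete_action=True):
        super().__init__()
        self.fc1 = nn.Linear(num_in_pol, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_out_pol)
        self.discrete_action = discrete_action

    def forward(self, obs):
        h = F.relu(self.fc1(obs))
        out = self.fc2(h)
        # action probabilities for discrete spaces, bounded values otherwise
        return (F.softmax(out, dim=-1) if self.discrete_action
                else torch.tanh(out))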
def run(config): """ :param config: """ # model_dir = Path('./models') / config.env_id / config.model_name env = make_env(config.env_id) np.random.seed(config.seed) torch.manual_seed(config.seed) if all([hasattr(a, 'adversary') for a in env.agents]): agent_types = [ 'adversary' if a.adversary else 'agent' for a in env.agents ] else: agent_types = ['agent' for _ in env.agents] maddpg = MADDPG.init_from_env(env, agent_types, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim) replay_buffer = ReplayBuffer(config.buffer_length, maddpg.num_agent) for ep_i in range(config.n_episodes): print("Episodes %i of %i" % (ep_i + 1, config.n_episodes)) observations = env.reset() for et_i in range(config.episode_length): torch_observations = [ torch.from_numpy(observations[i]).float() for i in range(maddpg.num_agent) ] torch_agent_actions = maddpg.step(torch_observations) agent_actions = [ action.data.numpy() for action in torch_agent_actions ] next_observations, rewards, dones, infos = env.step(agent_actions) replay_buffer.push_data(observations, agent_actions, rewards, next_observations, dones) observations = next_observations if replay_buffer.get_size() >= config.batch_size: for a_i in range(maddpg.num_agent): sample = replay_buffer.sample(config.batch_size) maddpg.update(sample, agent_i=a_i) maddpg.update_all_agent() print("Episode rewards ") print(replay_buffer.get_episode_rewards(config.episode_length)) env.close()
def __init__(self, args, discrete_action, benchmark=False):
    self.args = args
    self.scenario_name = args.scenario
    if self.scenario_name in mujoco:
        args.scenario_env = 'mujoco'
        args.finish_at_max = False
        self.env = gym.make(self.scenario_name)
    else:
        args.scenario_env = 'openai'
        args.finish_at_max = True
        self.env = make_env(self.scenario_name, benchmark, discrete_action)
        if self.args.discrete_action:
            self.action_dims = [aspace.n for aspace in self.env.action_space]
        if all([hasattr(a, 'adversary') for a in self.env.agents]):
            self.agent_types = ['adversary' if a.adversary else 'agent'
                                for a in self.env.agents]
        else:
            self.agent_types = ['agent' for _ in self.env.agents]
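# `mujoco` is referenced above but not defined in this snippet; it is
# presumably a collection of MuJoCo task names against which the scenario
# name is matched, e.g. (assumption):
mujoco = {'HalfCheetah-v2', 'Ant-v2', 'Hopper-v2', 'Walker2d-v2'}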
def evaluate(config):
    DirectoryManager.root = Path(config.root)
    if config.seed_num is None:
        all_seeds = list(
            (DirectoryManager.root / config.storage_name /
             f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')

    # Creates paths and directories
    seed_path = (DirectoryManager.root / config.storage_name /
                 f"experiment{config.experiment_num}" /
                 f"seed{config.seed_num}")
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)

    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in
    # multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks
    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects
    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents
    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward
    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
    env_params['use_max_speed'] = False

    # Initializes model and environment
    set_seeds(config.rollout_seed)
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()
        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()
        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))
        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[
            config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))
        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = \
            interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()
        if config.save_gifs:
            frames = [] if ep_i == 0 else frames
            frames.append(env.render('rgb_array')[0])
        if config.render:
            env.render('human')
        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions = algorithm.select_action(
                torch_obs,
                is_exploring=False if config.noise_scale is None else True)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # steps forward in the environment
            obs, rewards, done, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)

            # record frames
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human', close=False)

            if done and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        total_reward.append(ep_recorder.get_total_reward())

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)
        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}"
               f"_seed{config.seed_num}_{gif_num}.gif").exists():
            gif_num += 1
        imageio.mimsave(
            str(gif_path /
                f"{env_params['env_name']}__experiment{config.experiment_num}"
                f"_seed{config.seed_num}_{gif_num}.gif"),
            frames, duration=ifi)
    env.close()
    return total_reward
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))

    # Checkpoints to evaluate: one statistical run per saved model
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]

    stat_run_all_models = []
    for r in rrange:
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' % r)
        if config.save_gifs:
            gif_path = model_path.parent / 'gifs'
            gif_path.mkdir(exist_ok=True)
        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        # START EPISODES
        stat_return_list = []
        for ep_i in range(config.n_episodes):  # number of stat runs
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()

            # For RNN history buffer
            obs_tminus_0 = copy(obs)
            obs_tminus_1 = copy(obs)
            obs_tminus_2 = copy(obs)
            obs_tminus_3 = copy(obs)
            obs_tminus_4 = copy(obs)
            obs_tminus_5 = copy(obs)
            # TODO: obs_history shape differs from main.py, so parameterize it
            # based on "obs". It differs because main.py can run multiple
            # threads and therefore has an extra dimension.
            obs_history = np.empty([3, 108])
            next_obs_history = np.empty([3, 108])

            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            # env.render('human')

            # START TIME-STEPS
            episode_reward = 0
            for t_i in range(config.episode_length):
                # Populate current history for RNN
                for a in range(3):  # env.nagents
                    obs_history[a][:] = np.concatenate(
                        (obs_tminus_0[a][:], obs_tminus_1[a][:],
                         obs_tminus_2[a][:], obs_tminus_3[a][:],
                         obs_tminus_4[a][:], obs_tminus_5[a][:]))
                # obs_history now holds 6 timesteps of history per agent
                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                rnn_torch_obs = [
                    Variable(torch.Tensor(obs_history[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(rnn_torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                next_obs, rewards, dones, infos = env.step(actions)
                # get the global reward
                episode_reward += rewards[0][0]

                # Update histories
                obs_tminus_5 = copy(obs_tminus_4)
                obs_tminus_4 = copy(obs_tminus_3)
                obs_tminus_3 = copy(obs_tminus_2)
                obs_tminus_2 = copy(obs_tminus_1)
                obs_tminus_1 = copy(obs_tminus_0)
                obs_tminus_0 = copy(next_obs)

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                # env.render('human')

            # end of an episode
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                                frames, duration=ifi)

            # end of episodes (one stat-run)
            stat_return_list.append(episode_reward / config.episode_length)

        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    # model_path = config.path
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, config.benchmark,
                   discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    if config.save_gifs:
        frames = []
    agent_info = [[[]]]
    reward_info = []
    trajectories = []
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        episode_rewards = np.zeros((config.episode_length, maddpg.nagents))
        current_trajectory = []
        current_entities = []
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            current_entities.append(cur_state_ent)
            cur_state = env.getState()
            current_trajectory.append(cur_state)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.store_traj:
                cur_state = env.getState()
                current_trajectory.append(cur_state)
            if config.benchmark:
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(infos['n'])
            if config.sparse_reward:
                if t_i == 0:
                    total = np.array(rewards)
                if t_i != config.episode_length - 1:
                    total = total + np.array(rewards)
                    rewards = list(np.zeros(len(rewards)))
                else:
                    rewards = list(total)
            episode_rewards[t_i] = rewards
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if config.save_gifs:
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
            env.render('human')
        agent_info.append([[]])
        mean_rewards = np.mean(episode_rewards, axis=0)
        reward_info.append(mean_rewards)
        if config.store_traj:
            trajectories.append([current_entities, current_trajectory])

    if config.save_gifs:
        gif_num = 0
        while (gif_path / ('%i.gif' % gif_num)).exists():
            gif_num += 1
        imageio.mimsave(str(gif_path / ('%i.gif' % gif_num)),
                        frames, duration=ifi)

    run_dir = model_path.parent
    if config.benchmark:
        with open(run_dir / 'eval_info.pkl', 'wb') as fp:
            pickle.dump(agent_info, fp)
        with open(run_dir / 'eval_rew.pkl', 'wb') as fp:
            pickle.dump(reward_info, fp)
    if config.store_traj:
        with open(run_dir / 'static_trajectories_eval.pkl', 'wb') as fp:
            pickle.dump(trajectories, fp)
    env.close()
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    print("\n" + str(model_path) + "\n\n\n")

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    # START EPISODES
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()

        # For RNN history buffer
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)
        # TODO: obs_history shape differs from main.py, so parameterize it
        # based on "obs". It differs because main.py can run multiple threads
        # and therefore has an extra dimension.
        obs_history = np.empty([3, 108])
        next_obs_history = np.empty([3, 108])

        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        # START TIME-STEPS
        for t_i in range(config.episode_length):
            # Populate current history for RNN
            for a in range(3):  # env.nagents
                obs_history[a][:] = np.concatenate(
                    (obs_tminus_0[a][:], obs_tminus_1[a][:],
                     obs_tminus_2[a][:], obs_tminus_3[a][:],
                     obs_tminus_4[a][:], obs_tminus_5[a][:]))
            # obs_history now holds 6 timesteps of history per agent
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            rnn_torch_obs = [Variable(torch.Tensor(obs_history[i]).view(1, -1),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(rnn_torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            next_obs, rewards, dones, infos = env.step(actions)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)
            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')

        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)
    env.close()
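# The six obs_tminus_* variables above implement a fixed 6-step observation
# history by hand. An equivalent, easier-to-parameterize sketch using
# collections.deque — a refactoring suggestion, not code from this repo:
from collections import deque

history_len = 6
obs_hist = deque([copy(obs)] * history_len, maxlen=history_len)

def stacked_obs(agent_i):
    # Concatenates the agent's last `history_len` observations, newest
    # first, matching the manual concatenation order above.
    return np.concatenate([past[agent_i] for past in obs_hist])

# After each env.step(...):
#     obs_hist.appendleft(copy(next_obs))
# The deque drops the oldest entry automatically.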
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))

    if config.save_gifs:
        gif_path = original_model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    # Model numbers in folder for stat runs
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]

    stat_run_all_models = []
    for r in rrange:
        print("Model :" + str(r))
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' % r)
        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        stat_return_list = []
        for ep_i in range(config.n_episodes):
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            # env.render('human')
            episode_reward = 0
            for t_i in range(config.episode_length):
                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(obs[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                obs, rewards, dones, infos = env.step(actions)
                # get the global reward
                episode_reward += rewards[0][0]
                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                # env.render('human')
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                                frames, duration=ifi)
            # end of episodes (one stat-run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
def init_env():
    env = make_env(env_id, discrete_action=discrete_action)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    env = make_env(config.env_id, discrete_action=False)
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])
    agent_init_params = {'num_in_pol': env.observation_space[0].shape[0],
                         'num_out_pol': num_out_pol,
                         'num_vars': 3}
    maddpg = MADDPG(agent_init_params, nagents=3,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    save_dict = torch.load(model_path)
    maddpg.agents.load_params(save_dict['agent_params'])
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        # one hidden/cell state row per directed agent pair
        rnn_hidden = (
            torch.zeros(1, config.n_rollout_threads *
                        maddpg.nagents * (maddpg.nagents - 1),
                        config.hidden_dim),
            torch.zeros(1, config.n_rollout_threads *
                        maddpg.nagents * (maddpg.nagents - 1),
                        config.hidden_dim)
        )
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions, new_rnn_hidden = maddpg.step(torch_obs, rnn_hidden,
                                                        explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions.cpu()]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            rnn_hidden = new_rnn_hidden
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)
    env.close()
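# Note on rnn_hidden above: the pair of zero tensors matches the (h_0, c_0)
# layout torch.nn.LSTM expects, i.e. shape (num_layers, batch, hidden_dim),
# with one batch row per directed agent pair:
# n_rollout_threads * nagents * (nagents - 1). A shape check under that
# assumption (the LSTM itself lives inside MADDPG and is not shown here):
#
#     lstm = torch.nn.LSTM(input_size, config.hidden_dim)  # hypothetical
#     out, (h_n, c_n) = lstm(inputs, rnn_hidden)
#
# where `inputs` has shape (seq_len, batch, input_size).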
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    os.makedirs(run_dir)
    # logger = SummaryWriter(str(log_dir))

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_env(config.env_id, discrete_action=True)
    num_agents = env.n
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # if config.emergency:
    #     env.switch_emergency()
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    # All tensorboard methods removed; replaced with print and pickle.
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))
        if config.emergency:
            env.switch_emergency()
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        t_start = time.time()
        prev_obs = None
        act_n_t_minus_1 = None
        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 12x4x4
                    if config.env_id in ('wall', 'strong_wind', 'wall_expos'):
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert False
                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 12x4
                    env.oracle_update()
                    # obs: 12x4x20
                    # emerg_n: 12x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]
            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs, dones)
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            if config.CCR:
                if act_n_t_minus_1:
                    for i in range(model.nagents):
                        for j in range(model.nagents):
                            # ccr_activates[-1] += 1
                            intrinsic_reward = np.linalg.norm(
                                next_obs[:, i, 2:4] - obs[:, j, 2:4],
                                axis=-1) - np.linalg.norm(
                                    obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1)
                            intrinsic_reward /= (1 + np.linalg.norm(
                                obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1))
                            intrinsic_reward *= (emerg_n[:, j] - emerg_n[:, i])
                            rewards[:, i] += 10 * intrinsic_reward / np.sqrt(
                                num_agents)
                act_n_t_minus_1 = actions
            prev_obs = obs
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=None)
                    model.update_policies(sample, logger=None)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ls_num_collision = env.get_collision_and_zero_out()
        collisions.append(np.array(ls_num_collision).mean())
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        ep_rews = np.array(ep_rews).mean()
        # save model, display training output
        print("episodes: {}, mean episode reward: {}, "
              "mean number of collisions with wall: {}, ccr activates: {}, "
              "success numbers: {}, time: {}".format(
                  ep_i, ep_rews, np.mean(collisions[-config.save_rate:]),
                  np.mean(ccr_activates[-config.save_rate:]),
                  np.mean(success_nums[-config.save_rate:]),
                  round(time.time() - t_start, 3)))
        # Keep track of final episode reward
        final_ep_rewards.append(ep_rews)
        # final_ep_activates.append(np.mean(ccr_activates[-config.save_rate:]))
        final_ep_collisions.append(np.mean(collisions[-config.save_rate:]))
        final_ep_success_nums.append(np.mean(success_nums[-config.save_rate:]))

        if ep_i % config.save_rate == 0:
            x_axis = np.arange(0, ep_i + 1, step=12)
            # plot reward data
            rew_file_name = run_dir / 'rewards.png'
            plt.plot(x_axis, final_ep_rewards)
            plt.xlabel('training episode')
            plt.ylabel('reward')
            plt.savefig(rew_file_name)
            plt.clf()

            collision_file_name = run_dir / 'collisions.png'
            plt.plot(x_axis, final_ep_collisions)
            plt.xlabel('training episode')
            plt.ylabel('number of collisions')
            plt.savefig(collision_file_name)
            plt.clf()

            success_file_name = run_dir / 'successes.png'
            plt.plot(x_axis, final_ep_success_nums)
            plt.xlabel('training episode')
            plt.ylabel('success numbers')
            plt.savefig(success_file_name)
            plt.clf()

            rew_file_name = run_dir / 'rewards.pkl'
            collision_file_name = run_dir / 'collisions.pkl'
            success_nums_file_name = run_dir / 'success_nums.pkl'
            # activates_file_name = run_dir / 'activates.pkl'
            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            with open(collision_file_name, 'wb') as fp:
                pickle.dump(final_ep_collisions, fp)
            # with open(activates_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_activates, fp)
            with open(success_nums_file_name, 'wb') as fp:
                pickle.dump(final_ep_success_nums, fp)
            plt.clf()

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
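# The CCR intrinsic reward computed in the training loop above is, for an
# ordered agent pair (i, j), with positions x taken from obs[:, :, 2:4] and
# emergency scores e = emerg_n:
#
#     r_int(i, j) = (||x_i' - x_j|| - ||x_i - x_j||) / (1 + ||x_i - x_j||)
#                   * (e_j - e_i)
#
# where x_i' is agent i's position after the step. Agent i's reward then
# receives 10 * r_int / sqrt(num_agents) summed over all j; r_int is positive
# when the change in distance to j and the emergency gap (e_j - e_i) have the
# same sign.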
def test(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        # runs the newest
        run_num = max(exst_run_nums)
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes):
        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')
        t_start = time.time()
        prev_obs = None
        act_n_t_minus_1 = None
        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])
                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 1x4x4
                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert False
                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 1x4
                    env.oracle_update()
                    # obs: 1x4x20
                    # emerg_n: 1x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]
            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs, dones)
            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(
                    Variable(torch.Tensor(obs[:, i]), requires_grad=False))
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment (single env here)
            actions = [[ac[0] for ac in agent_actions]]
            next_obs, rewards, dones, infos = env.step(actions[0])
            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)
            if config.CCR:
                act_n_t_minus_1 = actions
            prev_obs = obs
            obs = next_obs
            t += 1
            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue
    env.close()
def run(config):
    model_path = (Path('../models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    gif_path = (model_path.parent / 'stats' if not config.mixed_policies
                else model_path.parent / 'stats_mixed')
    gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('../models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    n_movable_agents = sum([1 if a.movable else 0 for a in env.agents])
    n_speaking_agents = sum([0 if a.silent else 1 for a in env.agents])
    all_positions = np.zeros((config.n_episodes, config.episode_length,
                              n_movable_agents, env.world.dim_p))
    all_communications = np.zeros((config.n_episodes, config.episode_length,
                                   n_speaking_agents, env.world.dim_c))
    all_actions = np.zeros((config.n_episodes, config.episode_length,
                            len(env.agents), env.world.dim_c))
    obs_space = sum([obsp.shape[0] for obsp in env.observation_space])
    all_obs = np.zeros((config.n_episodes, config.episode_length, obs_space))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # env.agents[1].state.p_pos = np.array([0., 0.])
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else
                Variable(torch.Tensor(obs[i]), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            all_positions[ep_i, t_i] = env.get_positions()
            all_communications[ep_i, t_i] = env.get_communications()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # actions[0] = np.array([0., 0., 0., 0., 0.], dtype=np.float32)
            # actions[0][ep_i] = 1.
            obs, rewards, dones, infos = env.step(actions)
            all_actions[ep_i, t_i, :, :] = actions
            all_obs[ep_i, t_i, :] = np.concatenate(np.asarray(obs))
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i, t_i, :, :len(infos['n'][-1])] = \
                    np.array(infos['n'])
    env.close()

    if config.save_stats:
        stats_path = (model_path.parent / 'stats' if not config.mixed_policies
                      else model_path.parent / 'stats_mixed')
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
        save(f'{stats_path}/all_communications.npy', all_communications)
        save(f'{stats_path}/all_actions.npy', all_actions)
        save(f'{stats_path}/all_observations.npy', all_obs)
def run(config):
    # Load model
    if config.model_dir is not None:
        model_path = os.path.join(config.model_dir, "model.pt")
        sce_conf_path = os.path.join(config.model_dir, "sce_config.json")
    elif config.model_cp_path is not None and config.sce_conf_path is not None:
        model_path = config.model_cp_path
        sce_conf_path = config.sce_conf_path
    else:
        print("ERROR with model paths: you need to provide the path of either "
              "the model directory (--model_dir) or the model checkpoint and "
              "the scenario config (--model_cp_path and --sce_conf_path).")
        exit(1)
    if not os.path.exists(model_path):
        sys.exit("Path to the model checkpoint %s does not exist" % model_path)

    maddpg = MADDPG.init_from_save(model_path)
    maddpg.prep_rollouts(device='cpu')

    # Load scenario config
    sce_conf = {}
    if sce_conf_path is not None:
        with open(sce_conf_path) as cf:
            sce_conf = json.load(cf)
            print('Special config for scenario:', config.env_path)
            print(sce_conf)

    # Seed env
    seed = config.seed if config.seed is not None else np.random.randint(1e9)
    np.random.seed(seed)
    print("Creating environment with seed", seed)

    # Create environment
    env = make_env(config.env_path, discrete_action=config.discrete_action,
                   sce_conf=sce_conf)

    for ep_i in range(config.n_episodes):
        obs = env.reset()
        rew = 0
        for step_i in range(config.episode_length):
            # rearrange observations to be per agent
            torch_obs = [
                Variable(torch.Tensor(obs[a]).unsqueeze(0),
                         requires_grad=False)
                for a in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().squeeze() for ac in torch_agent_actions]
            # Environment step
            next_obs, rewards, dones, infos = env.step(actions)
            print(rewards)
            rew += rewards[0]
            time.sleep(config.step_time)
            env.render()
            if dones[0]:
                break
            obs = next_obs
        print(f'Episode {ep_i + 1} finished after {step_i + 1} steps with '
              f'return {rew}.')
    print("SEED was", seed)
def run(config):
    # Get paths for saving logs and model
    run_dir, model_cp_path, log_dir = get_paths(config)

    # Init summary writer
    logger = SummaryWriter(str(log_dir))

    # Load scenario config
    sce_conf = load_scenario_config(config, run_dir)
    nb_agents = sce_conf['nb_agents']

    # Initiate env
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    env = make_env(config.env_path, sce_conf,
                   discrete_action=config.discrete_action)

    # Create model
    num_in_pol = env.observation_space[0].shape[0]
    if config.discrete_action:
        num_out_pol = env.action_space[0].n
    else:
        num_out_pol = env.action_space[0].shape[0]
    policy = PolicyNetwork(num_in_pol, num_out_pol, config.hidden_dim,
                           discrete_action=config.discrete_action)
    policy.eval()

    # Create the CMA-ES trainer
    es = cma.CMAEvolutionStrategy(np.zeros(get_num_params(policy)), 1,
                                  {'seed': config.seed})

    t = 0
    for ep_i in tqdm(range(0, config.n_episodes, es.popsize)):
        # Ask for candidate solutions
        solutions = es.ask()
        # Perform one episode for each solution
        tell_rewards = []
        for i in range(len(solutions)):
            # Load solution in model
            load_array_in_model(solutions[i], policy)
            # Reset env
            obs = env.reset()
            episode_reward = 0.0
            for et_i in range(config.episode_length):
                # Rearrange observations to fit in the model
                torch_obs = Variable(torch.Tensor(np.vstack(obs)),
                                     requires_grad=False)
                actions = policy(torch_obs)
                # Convert actions to numpy arrays
                agent_actions = [ac.data.numpy() for ac in actions]
                next_obs, rewards, dones, infos = env.step(agent_actions)
                episode_reward += sum(rewards) / nb_agents
                if dones[0]:
                    break
                obs = next_obs
            # CMA-ES minimizes, so pass the negated return
            tell_rewards.append(-episode_reward)
        # Update CMA-ES model
        es.tell(solutions, tell_rewards)

        # Log rewards
        logger.add_scalar('agent0/mean_episode_rewards',
                          -sum(tell_rewards) / es.popsize, ep_i)

        # Save model
        if ep_i % config.save_interval < es.popsize:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            save_model(policy,
                       run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            save_model(policy, model_cp_path)

    save_model(policy, model_cp_path)
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
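# get_num_params and load_array_in_model are defined elsewhere in the
# codebase. A minimal sketch of what they might look like, assuming the flat
# CMA-ES solution vector follows the order of policy.parameters():
import numpy as np
import torch

def get_num_params(model):
    # Total number of scalar parameters in the model.
    return sum(p.numel() for p in model.parameters())

def load_array_in_model(array, model):
    # Copies a flat numpy array into the model's parameters, in order.
    idx = 0
    with torch.no_grad():
        for p in model.parameters():
            n = p.numel()
            p.copy_(torch.as_tensor(array[idx:idx + n],
                                    dtype=p.dtype).view_as(p))
            idx += n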
def init_env():
    env = make_env(env_id, discrete_action=discrete_action, mode=mode)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def run(config):
    cover_ratio = []
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))
    # torch.manual_seed(run_num)
    # np.random.seed(run_num)

    env = make_env(config.env_id, benchmark=BENCHMARK, discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
               config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)
            # cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
    # NOTE: the original file keeps a large commented-out training block here
    # (periodic critic/policy updates, reward logging, and incremental model
    # saving, mirroring the training script above); it is disabled in this
    # evaluation-only run.
    env.close()
def evaluate(config):
    if config.seed_num is None:
        all_seeds = list(
            (DirectoryManager.root / config.storage_name /
             f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')

    # Creates paths and directories
    seed_path = (DirectoryManager.root / config.storage_name /
                 f"experiment{config.experiment_num}" /
                 f"seed{config.seed_num}")
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)

    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in
    # multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks
    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects
    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents
    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward
    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
    env_params['use_max_speed'] = False

    # Initializes model and environment
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()
        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()
        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))
        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[
            config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))
        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = \
            interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []
    all_episodes_agent_embeddings = []
    all_episodes_coach_embeddings = []
    all_trajs = []
    overide_color = None
    color_agents = True

    if env_params['env_name'] == 'bounce':
        env.agents[0].size = 1. * env.agents[0].size
        env.world.overwrite = config.overwrite
    elif env_params['env_name'] == 'spread':
        color_agents = False
    elif env_params['env_name'] == 'compromise':
        env.agents[0].lightness = 0.9
        env.world.landmarks[0].lightness = 0.9
        env.agents[1].lightness = 0.5
        env.world.landmarks[1].lightness = 0.5
        # cmo = plt.cm.get_cmap('viridis')
        env.world.overwrite = config.overwrite
        # overide_color = [np.array(cmo(float(i) / float(2))[:3])
        #                  for i in range(2)]
        # set_seeds_env(2, env)

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        # set_seeds(2)
        # set_seeds_env(2, env)
        agent_embeddings = []
        coach_embeddings = []
        traj = []
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()
        if config.save_gifs:
            frames = None
        if config.render:
            env.render('human')
        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions, torch_embed = algorithm.select_action(
                torch_obs,
                is_exploring=False if config.noise_scale is None else True,
                return_embed=True)
            torch_total_obs = torch.cat(torch_obs, dim=-1)
            coach_embed = onehot_from_logits(
                algorithm.coach.model(torch_total_obs))
            coach_embeddings.append(coach_embed.data.numpy().squeeze())
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            embeds = [emb.data.numpy().squeeze() for emb in torch_embed]
            agent_embeddings.append(embeds)
            # steps forward in the environment
            next_obs, rewards, dones, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)
            traj.append((obs, actions, next_obs, rewards, dones))
            obs = next_obs
            colors = list(cm.get_cmap('Set1').colors[:len(embeds[0])])
            if overide_color is not None:
                colors[0] = overide_color[0]
                colors[2] = overide_color[1]
            if color_agents:
                for agent, emb in zip(env.agents, embeds):
                    agent.color = colors[np.argmax(emb)]

            # record frames
            if config.save_gifs:
                frames = [] if frames is None else frames
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')

            if all(dones) and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        # print(ep_recorder.get_total_reward())
        total_reward.append(ep_recorder.get_total_reward())
        all_episodes_agent_embeddings.append(agent_embeddings)
        all_episodes_coach_embeddings.append(coach_embeddings)
        all_trajs.append(traj)

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)
        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}"
               f"_seed{config.seed_num}_{gif_num}.gif").exists():
            gif_num += 1
        imageio.mimsave(
            str(gif_path /
                f"{env_params['env_name']}__experiment{config.experiment_num}"
                f"_seed{config.seed_num}_{gif_num}.gif"),
            frames, duration=ifi)
    env.close()

    embeddings = {
        'agents': all_episodes_agent_embeddings,
        'coach': all_episodes_coach_embeddings
    }
    save_folder = (dir_manager.experiment_dir if config.save_to_exp_folder
                   else dir_manager.seed_dir)
    embeddings_path = U.directory_tree.uniquify(
        save_folder / f"{config.file_name_to_save}.pkl")
    trajs_path = osp.splitext(embeddings_path)[0] + "_trajs.pkl"
    with open(embeddings_path, 'wb') as fp:
        pickle.dump(embeddings, fp)
    with open(trajs_path, 'wb') as fp:
        pickle.dump(all_trajs, fp)
    return total_reward, str(embeddings_path)
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal, Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from tensorboardX import SummaryWriter
from utils.make_env import make_env
from torch.autograd import Variable
import imageio

# Parameters
gamma = 0.95
render = False
seed = 1
log_interval = 10

env = make_env("simple_spread", discrete_action=True)
num_state = env.observation_space[0].shape[0]
num_action = env.action_space[0].n
# torch.manual_seed(seed)
# env.seed(seed)

Transition = namedtuple(
    'Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])


class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(num_state, 100)
        self.action_head = nn.Linear(100, num_action)

    def forward(self, x):
        # The body of forward was cut off in the source; a minimal completion
        # consistent with the two layers above (assumption):
        x = F.relu(self.fc1(x))
        action_prob = F.softmax(self.action_head(x), dim=-1)
        return action_prob
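# A minimal usage sketch for the Actor above, assuming one shared policy
# evaluated per agent observation as in the surrounding script. It shows how
# the imported Categorical distribution and the Transition tuple fit together:
actor = Actor()
obs_n = env.reset()
probs = actor(torch.tensor(obs_n[0], dtype=torch.float32).unsqueeze(0))
dist = Categorical(probs)
action = dist.sample()            # sampled discrete action index
log_prob = dist.log_prob(action)  # fills Transition.a_log_prob later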
def init_env():
    env = make_env(env_id)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def init_env():
    env = make_env(env_id, discrete_action=discrete_action)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def init_env():
    env = make_env(**kwargs)
    env.seed(kwargs['seed'] + rank * 1000)
    np.random.seed(kwargs['seed'] + rank * 1000)
    return env
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = (model_path.parent / 'gifs' if not config.mixed_policies
                    else model_path.parent / 'gifs_mixed')
        gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('./models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.world.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    all_positions = np.zeros(
        (config.n_episodes, config.episode_length, maddpg.nagents, 2))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else
                Variable(torch.Tensor(obs[i]), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            all_positions[ep_i, t_i] = env.get_positions()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
                # frames.append(env.world.viewers[0].render(return_rgb_array=True))
                # ^ uncomment if local views are visible
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i, t_i, :, :len(infos['n'][-1])] = \
                    np.array(infos['n'])
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)
    env.close()

    if config.save_stats:
        stats_path = (model_path.parent / 'stats' if not config.mixed_policies
                      else model_path.parent / 'stats_mixed')
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
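# The .npy files written above can be inspected later with numpy; the exact
# path depends on where the run's stats directory was created, e.g.:
import numpy as np

all_infos = np.load('stats/all_infos.npy')          # (n_episodes, ep_len, nagents, 10)
all_positions = np.load('stats/all_positions.npy')  # (n_episodes, ep_len, nagents, 2)
print(all_positions.mean(axis=(0, 1)))              # mean position per agent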
def init_env():
    env = make_env(original_drug_smile, original_target, Hyperparams, atoms_,
                   model_to_explain, original_drug, original_target_aff,
                   pred_aff, device, cof)
    return env