Example #1
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    # print(config.save_gifs)
    # print(model_path.parent)
    # print(type(model_path.parent))

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(str(model_path))
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            # print(len(env.render('rgb_array', close=False)))
            # print(type(env.render('rgb_array', close=False)))
            # print(env.render('rgb_array', close=False))
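            # render('rgb_array') returns a list with one frame per viewer; keep the
            # first (assumption based on the multiagent particle env render API)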
            frames.append(env.render('rgb_array', close=False)[0])
        env.render('human', close=False)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array', close=False)[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human', close=False)
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=ifi)

    env.close()
Example #2
def init_env():
    env = make_env(env_id,
                   discrete_action=discrete_action,
                   benchmark=True)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    return env
def run(config):
    # Load model
    if not os.path.exists(config.model_cp_path):
        sys.exit("Path to the model checkpoint %s does not exist" %
                 config.model_cp_path)

    # Load scenario config
    sce_conf = {}
    if config.sce_conf_path is not None:
        with open(config.sce_conf_path) as cf:
            sce_conf = json.load(cf)
            print('Special config for scenario:', config.env_path)
            print(sce_conf)

    # Initiate env
    env = make_env(config.env_path, sce_conf, 
                   discrete_action=config.discrete_action)

    # Create model
    num_in_pol = env.observation_space[0].shape[0]
    if config.discrete_action:
        num_out_pol = env.action_space[0].n
    else:
        num_out_pol = env.action_space[0].shape[0]
    policy = PolicyNetwork(num_in_pol, num_out_pol, config.hidden_dim,  
                           discrete_action=config.discrete_action)
    policy.load_state_dict(torch.load(config.model_cp_path))
    policy.eval()

    for ep_i in range(config.n_episodes):
        obs = env.reset()
        episode_reward = 0.0
        for step_i in range(config.episode_length):
            # Rearrange observations to fit in the model
            torch_obs = Variable(torch.Tensor(np.vstack(obs)),
                                    requires_grad=False)
            
            actions = policy(torch_obs)

            # Convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in actions]

            next_obs, rewards, dones, infos = env.step(agent_actions)

            episode_reward += sum(rewards) / sce_conf['nb_agents']

            env.render()

            if dones[0]:
                break

            obs = next_obs
        
        print(f'Episode {ep_i + 1} finished after {step_i + 1} steps with return {episode_reward}.')
Example #4
def run(config):
    """Train MADDPG agents on the configured environment.

    :param config: parsed command-line arguments
    """
    # model_dir = Path('./models') / config.env_id / config.model_name
    env = make_env(config.env_id)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if all([hasattr(a, 'adversary') for a in env.agents]):
        agent_types = [
            'adversary' if a.adversary else 'agent' for a in env.agents
        ]
    else:
        agent_types = ['agent' for _ in env.agents]

    maddpg = MADDPG.init_from_env(env,
                                  agent_types,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.num_agent)

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        observations = env.reset()

        for et_i in range(config.episode_length):
            torch_observations = [
                torch.from_numpy(observations[i]).float()
                for i in range(maddpg.num_agent)
            ]
            torch_agent_actions = maddpg.step(torch_observations)
            agent_actions = [
                action.data.numpy() for action in torch_agent_actions
            ]
            next_observations, rewards, dones, infos = env.step(agent_actions)

            replay_buffer.push_data(observations, agent_actions, rewards,
                                    next_observations, dones)

            observations = next_observations

            if replay_buffer.get_size() >= config.batch_size:
                for a_i in range(maddpg.num_agent):
                    sample = replay_buffer.sample(config.batch_size)
                    maddpg.update(sample, agent_i=a_i)
                maddpg.update_all_agent()
        print("Episode rewards ")
        print(replay_buffer.get_episode_rewards(config.episode_length))

    env.close()
Example #5
    def __init__(self, args, discrete_action, benchmark=False):
        self.args = args
        self.scenario_name = args.scenario
        if self.scenario_name in mujoco:
            args.scenario_env = 'mujoco'
            args.finish_at_max = False
            self.env = gym.make(self.scenario_name)
        else:
            args.scenario_env = 'openai'
            args.finish_at_max = True
            self.env = make_env(self.scenario_name, benchmark, discrete_action)

            if self.args.discrete_action:
                self.action_dims = [aspace.n for aspace in self.env.action_space]

            if all([hasattr(a, 'adversary') for a in self.env.agents]):
                self.agent_types = ['adversary' if a.adversary else 'agent' for a in
                                    self.env.agents]
            else:
                self.agent_types = ['agent' for _ in self.env.agents]
Example #6
def evaluate(config):
    DirectoryManager.root = Path(config.root)

    if config.seed_num is None:
        all_seeds = list((DirectoryManager.root / config.storage_name /
                          f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')
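        # note: str.strip('seed') removes the characters 's', 'e', 'd' from both
        # ends; it works here only because the remaining run id is all digits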

    # Creates paths and directories

    seed_path = DirectoryManager.root / config.storage_name / f"experiment{config.experiment_num}" / f"seed{config.seed_num}"
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)
    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks

    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects

    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents

    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward

    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
        env_params['use_max_speed'] = False

    # Initializes model and environment
    set_seeds(config.rollout_seed)
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)
    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()

        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()

        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))

        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[
            config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))

        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[
            config.interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()

        if config.save_gifs:
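            # frames from every episode are accumulated and written to a single
            # gif after the episode loop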
            frames = [] if ep_i == 0 else frames
            frames.append(env.render('rgb_array')[0])
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions = algorithm.select_action(
                torch_obs, is_exploring=config.noise_scale is not None)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # steps forward in the environment
            obs, rewards, done, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)

            # record frames
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human', close=False)

            if done and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        total_reward.append(ep_recorder.get_total_reward())

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)

        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
               ).exists():
            gif_num += 1
        imageio.mimsave(str(
            gif_path /
            f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
        ),
                        frames,
                        duration=ifi)
    env.close()

    return total_reward
Example #7
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))
    # if config.incremental is not None:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' %
    #                                                config.incremental)
    # else:
    #     model_path = model_path / 'model.pt'
    #
    # print(model_path)

    ###########################################################################
    #                      FORCE MODEL PATH                                   #
    ###########################################################################
    model_path_list = []
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]

    # FOR EACH MODEL, DO STATISTICAL RUNS
    # for r in rrange:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' % r)

    ######################  SAVING STAT RUNS FOR EACH MODEL ###################
    stat_run_all_models = []

    for r in rrange:
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' %
                                                            r)
        if config.save_gifs:
            gif_path = model_path.parent / 'gifs'
            gif_path.mkdir(exist_ok=True)

        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        #####################################################################################################
        #                             CONFIGURATION FOR STATISTICAL RUNS (EPISODES)
        #####################################################################################################
        #####################################################################################################
        #                                       START EPISODES                                              #
        #####################################################################################################
        stat_return_list = []
        for ep_i in range(config.n_episodes):  # number of stat runs
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            # For RNN history buffer
            obs_tminus_0 = copy(obs)
            obs_tminus_1 = copy(obs)
            obs_tminus_2 = copy(obs)
            obs_tminus_3 = copy(obs)
            obs_tminus_4 = copy(obs)
            obs_tminus_5 = copy(obs)

            # TODO: obs_history shape different from main.py, so parameterize it based on "obs"
            # It is different because main.py can run multiple threads, so has an extra dimension
            obs_history = np.empty([3, 108])
            next_obs_history = np.empty([3, 108])
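            # A sketch of the parameterization suggested by the TODO above (assuming
            # obs is a list of per-agent 1-D arrays and 6 past steps are stacked):
            # obs_history = np.empty([len(obs), 6 * obs[0].shape[0]])
            # next_obs_history = np.empty([len(obs), 6 * obs[0].shape[0]])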

            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            #env.render('human')

            ##################################################################################################
            #                                       START TIME-STEPS                                         #
            ##################################################################################################
            episode_reward = 0
            for t_i in range(config.episode_length):

                # Populate current history for RNN
                for a in range(3):  # env.nagents
                    #obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                    obs_history[a][:] = np.concatenate(
                        (obs_tminus_0[a][:], obs_tminus_1[a][:],
                         obs_tminus_2[a][:], obs_tminus_3[a][:],
                         obs_tminus_4[a][:], obs_tminus_5[a][:]))
                    # Now, obs_history has the last 6 observations for each agent

                calc_start = time.time()

                # rearrange observations to be per agent, and convert to torch Variable
                rnn_torch_obs = [
                    Variable(torch.Tensor(obs_history[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(rnn_torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                next_obs, rewards, dones, infos = env.step(actions)

                # get the global reward
                episode_reward += rewards[0][0]

                # Update histories
                obs_tminus_5 = copy(obs_tminus_4)
                obs_tminus_4 = copy(obs_tminus_3)
                obs_tminus_3 = copy(obs_tminus_2)
                obs_tminus_2 = copy(obs_tminus_1)
                obs_tminus_1 = copy(obs_tminus_0)
                obs_tminus_0 = copy(next_obs)
                # --------------------------------------#

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                #env.render('human')
                # end of an episode

            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' %
                                                (gif_num, ep_i))),
                                frames,
                                duration=ifi)
            # end of episodes (one stat-run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
Example #8
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    #model_path = config.path
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)
    
    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, config.benchmark, discrete_action=maddpg.discrete_action)
    print(type(env))
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    if config.save_gifs:
        frames = []
    agent_info = [[[]]]
    reward_info = []
    trajectories = []
    
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames.append(env.render('rgb_array')[0])
            env.render('human')
        episode_rewards = np.zeros((config.episode_length, maddpg.nagents))
        current_trajectory = []
        current_entities = []
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            current_entities.append(cur_state_ent)
            cur_state = env.getState()
            current_trajectory.append(cur_state)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.store_traj:
                cur_state = env.getState()
                current_trajectory.append(cur_state)

            if config.benchmark:
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(infos['n'])
            if config.sparse_reward:
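                # rewards are zeroed at intermediate steps and the accumulated
                # total is handed out only at the last step (sparse-reward evaluation)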
                if t_i == 0:
                    total = np.array(rewards)
                if t_i != config.episode_length - 1:
                    total = total + np.array(rewards)
                    rewards = list(np.zeros(len(rewards)))
                else:
                    rewards = list(total)
            episode_rewards[t_i] = rewards
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if config.save_gifs:
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')
        agent_info.append([[]])
        mean_rewards = np.mean(episode_rewards, axis=0)
        reward_info.append(mean_rewards)
        if config.store_traj:
            trajectories.append([current_entities, current_trajectory])
    if config.save_gifs:
        gif_num = 0
        while (gif_path / ('%i.gif' % gif_num)).exists():
            gif_num += 1
        imageio.mimsave(str(gif_path / ('%i.gif' % gif_num)),
                        frames, duration=ifi)
    run_dir = model_path.parent 
    if config.benchmark:
        with open(run_dir / 'eval_info.pkl', 'wb') as fp:
            pickle.dump(agent_info, fp)
    with open(run_dir / 'eval_rew.pkl', 'wb') as fp:
        pickle.dump(reward_info, fp)
    if config.store_traj:
        with open(run_dir / 'static_trajectories_eval.pkl', 'wb') as fp:
            pickle.dump(trajectories, fp)
    env.close()
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'


    print("\n"+str(model_path)+"\n\n\n")

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    #####################################################################################################
    #                                       START EPISODES                                              #
    #####################################################################################################

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # For RNN history buffer
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)

        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)

        # TODO: obs_history shape different from main.py, so parameterize it based on "obs"
        # It is different because main.py can run multiple threads, so has an extra dimension
        obs_history = np.empty([3, 108])
        next_obs_history = np.empty([3, 108])

        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        ##################################################################################################
        #                                       START TIME-STEPS                                         #
        ##################################################################################################

        for t_i in range(config.episode_length):

            # Populate current history for RNN
            for a in range(3):  # env.nagents
                # obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                obs_history[a][:] = np.concatenate(
                    (obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:],
                     obs_tminus_3[a][:], obs_tminus_4[a][:], obs_tminus_5[a][:]))
                # Now, obs_history has the last 6 observations for each agent

            calc_start = time.time()

            # rearrange observations to be per agent, and convert to torch Variable
            rnn_torch_obs = [Variable(torch.Tensor(obs_history[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(rnn_torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            next_obs, rewards, dones, infos = env.step(actions)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)

            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)
            # --------------------------------------#

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
Example #10
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))
    # if config.incremental is not None:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' %
    #                                                config.incremental)
    # else:
    #     model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = original_model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    # Model numbers in folder for stat runs
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]
    stat_run_all_models = []

    for r in rrange:
        print("Model :" + str(r))
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' %
                                                            r)
        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        stat_return_list = []
        for ep_i in range(config.n_episodes):
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            #env.render('human')

            episode_reward = 0
            for t_i in range(config.episode_length):
                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(obs[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                obs, rewards, dones, infos = env.step(actions)

                # get the global reward
                episode_reward += rewards[0][0]

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                #env.render('human')
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' %
                                                (gif_num, ep_i))),
                                frames,
                                duration=ifi)
            # end of episodes (one-stat-run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
Example #11
def init_env():
    env = make_env(env_id, discrete_action=discrete_action)
    env.seed(seed + rank * 1000)
    np.random.seed(seed + rank * 1000)
    print(type(env))
    return env
Example #12
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    env = make_env(config.env_id, discrete_action=False)
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])
    
    agent_init_params = {'num_in_pol': env.observation_space[0].shape[0],
                        'num_out_pol': num_out_pol,
                        'num_vars': 3}
    maddpg = MADDPG(agent_init_params, 
                    nagents = 3,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    save_dict = torch.load(model_path)
    maddpg.agents.load_params(save_dict['agent_params'])
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        # Initial recurrent state: an (h0, c0) pair of zeros, which suggests an LSTM.
        # The batch dimension of n_rollout_threads * nagents * (nagents - 1) presumably
        # holds one slot per ordered agent pair (an assumption; not documented here).
        rnn_hidden = (torch.zeros(1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1), config.hidden_dim),
                      torch.zeros(1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1), config.hidden_dim))
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions, new_rnn_hidden = maddpg.step(torch_obs, rnn_hidden, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions.cpu()]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            rnn_hidden = new_rnn_hidden
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
Example #13
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    #log_dir = run_dir / 'logs'
    os.makedirs(run_dir)
    #logger = SummaryWriter(str(log_dir))

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    env = make_env(config.env_id, discrete_action=True)
    num_agents = env.n
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)

    # if config.emergency:
    #     env.switch_emergency()

    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        #print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                ep_i + 1 + config.n_rollout_threads,
        #                                config.n_episodes))
        if config.emergency:
            env.switch_emergency()
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1)
                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 12x4x4

                    if config.env_id in ('wall', 'strong_wind', 'wall_expos'):
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 12x4
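                    # emerg_n: per-agent "emergency" score, i.e. the squared deviation
                    # from the oracle-predicted state plus the scenario-specific
                    # observation term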

                    env.oracle_update()

                    # obs: 12x4x20
                    # emerg_n: 12x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]
                            #print(obs[:, agent_i, -agent_j])
                            #print(emerg_n[:, agent_j])
            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions)

            if config.CCR:
                if act_n_t_minus_1:
                    for i in range(model.nagents):
                        for j in range(model.nagents):
                            # ccr_activates[-1] += 1
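                            # shaping term: change in distance between agent i's next
                            # state and agent j's current state (columns 2:4, presumably
                            # positions), normalized by the current distance and weighted
                            # by the emergency gap emerg_j - emerg_i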
                            intrinsic_reward = np.linalg.norm(
                                next_obs[:, i, 2:4] - obs[:, j, 2:4],
                                axis=-1) - np.linalg.norm(
                                    obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1)
                            intrinsic_reward /= (1 + np.linalg.norm(
                                obs[:, i, 2:4] - obs[:, j, 2:4], axis=-1))
                            intrinsic_reward *= (emerg_n[:, j] - emerg_n[:, i])
                            rewards[:, i] += 10 * intrinsic_reward / np.sqrt(
                                num_agents)
                            """
                            if (len(episode_rewards) == 2 or len(episode_rewards) == 2000 or len(episode_rewards) == 5000) and episode_step % 5 == 0:
                                Ls[i].append('      intrinsic reward = ' + str(intrinsic_reward) + '\n')
                            """
                            # if i == j: continue
                            # emerg_invalid = ~((emerg_n[:,j] > emerg_n[:,i]) & (emerg_n[:,j] > 0))
                            # ccr_activates[-1] += (~emerg_invalid).sum()
                            # intrinsic_reward = np.linalg.norm(next_obs[:,i,2:4] - obs[:,j,2:4], axis=-1) - np.linalg.norm(obs[:,i,2:4] - obs[:,j,2:4], axis=-1)
                            # intrinsic_reward[emerg_invalid] = 0
                            # rewards[:,i] += 10 * intrinsic_reward

                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += config.n_rollout_threads
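            # train once the buffer holds a full batch, roughly every steps_per_update
            # environment steps (t grows by n_rollout_threads per loop iteration)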
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=None)
                    model.update_policies(sample, logger=None)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ls_num_collision = env.get_collision_and_zero_out()

        collisions.append(np.array(
            ls_num_collision).mean())  # might need to convert to np.int

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        ep_rews = np.array(ep_rews).mean()
        # save model, display training output

        print(
            "episodes: {}, mean episode reward: {}, mean number of collisions with wall: {}, ccr activates: {}, success numbers: {}, time: {}"
            .format(ep_i, ep_rews, np.mean(collisions[-config.save_rate:]),
                    np.mean(ccr_activates[-config.save_rate:]),
                    np.mean(success_nums[-config.save_rate:]),
                    round(time.time() - t_start, 3)))

        # Keep track of final episode reward
        final_ep_rewards.append(ep_rews)
        # final_ep_activates.append(np.mean(ccr_activates[-config.save_rate:]))
        final_ep_collisions.append(np.mean(collisions[-config.save_rate:]))
        final_ep_success_nums.append(np.mean(success_nums[-config.save_rate:]))
        if ep_i % config.save_rate == 0:
            x_axis = np.arange(0, ep_i + 1, step=12)
            # plot reward data
            rew_file_name = run_dir / 'rewards.png'

            plt.plot(x_axis, final_ep_rewards)
            plt.xlabel('training episode')
            plt.ylabel('reward')
            #plt.legend()
            plt.savefig(rew_file_name)

            plt.clf()

            collision_file_name = run_dir / 'collisions.png'

            plt.plot(x_axis, final_ep_collisions)
            plt.xlabel('training episode')
            plt.ylabel('number of collisions')
            #plt.legend()
            plt.savefig(collision_file_name)

            plt.clf()

            # activates_file_name = run_dir / 'activates.png'

            # plt.plot(x_axis, final_ep_activates)
            # plt.xlabel('training episode')
            # plt.ylabel('CCR activates')
            # #plt.legend()
            # plt.savefig(activates_file_name)

            # plt.clf()

            success_file_name = run_dir / 'successes.png'

            plt.plot(x_axis, final_ep_success_nums)
            plt.xlabel('training episode')
            plt.ylabel('success numbers')
            #plt.legend()
            plt.savefig(success_file_name)

            plt.clf()

            rew_file_name = run_dir
            collision_file_name = run_dir
            success_nums_file_name = run_dir
            activates_file_name = run_dir

            rew_file_name /= 'rewards.pkl'
            collision_file_name /= 'collisions.pkl'
            success_nums_file_name /= 'success_nums.pkl'
            # activates_file_name /= 'activates.pkl'

            with open(rew_file_name, 'wb') as fp:
                pickle.dump(final_ep_rewards, fp)
            with open(collision_file_name, 'wb') as fp:
                pickle.dump(final_ep_collisions, fp)

            # with open(activates_file_name, 'wb') as fp:
            #     pickle.dump(final_ep_activates, fp)

            with open(success_nums_file_name, 'wb') as fp:
                pickle.dump(final_ep_success_nums, fp)

            plt.clf()

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
Example #14
def test(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        # runs the newest
        run_num = max(exst_run_nums)

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes):

        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])

                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)

                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 1x4x4

                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 1x4

                    env.oracle_update()

                    # obs: 1x4x20
                    # emerg_n: 1x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            #print(obs)
            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(
                    Variable(torch.Tensor(obs[:, i]), requires_grad=False))
            # print(torch_obs)
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[0] for ac in agent_actions]]

            # rearrange actions to be per environment
            #actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions[0])

            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)

            if config.CCR:
                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += 1

            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue

    env.close()
def run(config):
    model_path = (Path('../models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    gif_path = model_path.parent / 'stats' if not config.mixed_policies else model_path.parent / 'stats_mixed'
    gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('../models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id,
                   benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    n_movable_agents = sum([1 if a.movable else 0 for a in env.agents])
    n_speaking_agents = sum([0 if a.silent else 1 for a in env.agents])
    all_positions = np.zeros((config.n_episodes, config.episode_length,
                              n_movable_agents, env.world.dim_p))
    all_communications = np.zeros((config.n_episodes, config.episode_length,
                                   n_speaking_agents, env.world.dim_c))
    all_actions = np.zeros((config.n_episodes, config.episode_length,
                            len(env.agents), env.world.dim_c))
    obs_space = sum([obsp.shape[0] for obsp in env.observation_space])
    all_obs = np.zeros((config.n_episodes, config.episode_length, obs_space))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # env.agents[1].state.p_pos = np.array([0., 0.])
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else Variable(torch.Tensor(obs[i]),
                                                      requires_grad=False)
                for i in range(maddpg.nagents)
            ]

            all_positions[ep_i, t_i] = env.get_positions()
            all_communications[ep_i, t_i] = env.get_communications()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # actions[0] = np.array([0., 0., 0., 0., 0.], dtype=np.float32)
            # actions[0][ep_i] = 1.
            obs, rewards, dones, infos = env.step(actions)

            all_actions[ep_i, t_i, :, :] = actions
            all_obs[ep_i, t_i, :] = np.concatenate(np.asarray(obs))

            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i,
                          t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])

    env.close()

    if config.save_stats:
        stats_path = model_path.parent / 'stats' if not config.mixed_policies else model_path.parent / 'stats_mixed'
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
        save(f'{stats_path}/all_communications.npy', all_communications)
        save(f'{stats_path}/all_actions.npy', all_actions)
        save(f'{stats_path}/all_observations.npy', all_obs)
def run(config):
    # Load model
    if config.model_dir is not None:
        model_path = os.path.join(config.model_dir, "model.pt")
        sce_conf_path = os.path.join(config.model_dir, "sce_config.json")
    elif config.model_cp_path is not None and config.sce_conf_path is not None:
        model_path = config.model_cp_path
        sce_conf_path = config.sce_conf_path
    else:
        print("ERROR with model paths: you need to provide the path of either "
              "the model directory (--model_dir) or the model checkpoint and "
              "the scenario config (--model_cp_path and --sce_conf_path).")
        exit(1)
    if not os.path.exists(model_path):
        sys.exit("Path to the model checkpoint %s does not exist" % model_path)
    maddpg = MADDPG.init_from_save(model_path)
    maddpg.prep_rollouts(device='cpu')

    # Load scenario config
    sce_conf = {}
    if sce_conf_path is not None:
        with open(sce_conf_path) as cf:
            sce_conf = json.load(cf)
            print('Special config for scenario:', config.env_path)
            print(sce_conf)

    # Seed env
    seed = config.seed if config.seed is not None else np.random.randint(1e9)
    np.random.seed(seed)
    print("Creating environment with seed", seed)

    # Create environment
    env = make_env(config.env_path,
                   discrete_action=config.discrete_action,
                   sce_conf=sce_conf)

    for ep_i in range(config.n_episodes):
        obs = env.reset()
        rew = 0
        for step_i in range(config.episode_length):
            # rearrange observations to be per agent
            torch_obs = [
                Variable(torch.Tensor(obs[a]).unsqueeze(0),
                         requires_grad=False) for a in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().squeeze() for ac in torch_agent_actions]

            # Environment step
            next_obs, rewards, dones, infos = env.step(actions)
            print(rewards)
            rew += rewards[0]

            time.sleep(config.step_time)
            env.render()

            if dones[0]:
                break
            obs = next_obs

        print(f'Episode {ep_i + 1} finished after {step_i + 1} steps '
              f'with return {rew}.')
    print("SEED was", seed)
Example #17
def run(config):
    # Get paths for saving logs and model
    run_dir, model_cp_path, log_dir = get_paths(config)

    # Init summary writer
    logger = SummaryWriter(str(log_dir))

    # Load scenario config
    sce_conf = load_scenario_config(config, run_dir)
    nb_agents = sce_conf['nb_agents']

    # Initiate env
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    env = make_env(config.env_path,
                   sce_conf,
                   discrete_action=config.discrete_action)

    # Create model
    num_in_pol = env.observation_space[0].shape[0]
    if config.discrete_action:
        num_out_pol = env.action_space[0].n
    else:
        num_out_pol = env.action_space[0].shape[0]
    policy = PolicyNetwork(num_in_pol,
                           num_out_pol,
                           config.hidden_dim,
                           discrete_action=config.discrete_action)
    policy.eval()

    # Create the CMA-ES trainer
    es = cma.CMAEvolutionStrategy(np.zeros(get_num_params(policy)), 1,
                                  {'seed': config.seed})
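    # each generation, es.ask() proposes es.popsize candidate parameter vectors;
    # es.tell() expects costs to minimise, hence the negated episode rewards below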

    t = 0
    for ep_i in tqdm(range(0, config.n_episodes, es.popsize)):

        # Ask for candidate solutions
        solutions = es.ask()

        # Perform one episode for each solution
        tell_rewards = []
        for i in range(len(solutions)):
            # Load solution in model
            load_array_in_model(solutions[i], policy)

            # Reset env
            obs = env.reset()
            episode_reward = 0.0
            for et_i in range(config.episode_length):
                # Rearrange observations to fit in the model
                torch_obs = Variable(torch.Tensor(np.vstack(obs)),
                                     requires_grad=False)

                actions = policy(torch_obs)
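                # a single shared policy maps the stacked per-agent observations
                # to one action row per agent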

                # Convert actions to numpy arrays
                agent_actions = [ac.data.numpy() for ac in actions]

                next_obs, rewards, dones, infos = env.step(agent_actions)

                episode_reward += sum(rewards) / nb_agents

                if dones[0]:
                    break

                obs = next_obs
            tell_rewards.append(-episode_reward)

        # Update CMA-ES model
        es.tell(solutions, tell_rewards)

        # Log rewards
        logger.add_scalar('agent0/mean_episode_rewards',
                          -sum(tell_rewards) / es.popsize, ep_i)

        # Save model
        if ep_i % config.save_interval < es.popsize:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            save_model(
                policy,
                run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            save_model(policy, model_cp_path)

    save_model(policy, model_cp_path)
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #18
 def init_env():
     env = make_env(env_id, discrete_action=discrete_action, mode=mode)
     env.seed(seed + rank * 1000)
     np.random.seed(seed + rank * 1000)
     return env
Example #19
def run(config):
    cover_ratio = []

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))

    #    torch.manual_seed(run_num)
    #    np.random.seed(run_num)
    #env = make_parallel_env(, config.n_rollout_threads, run_num)
    env = make_env(config.env_id,
                   benchmark=BENCHMARK,
                   discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

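    # load previously trained weights from a hardcoded checkpoint before rolling out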
    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # agent_actions[0][5]=1
            # agent_actions[1][5]=1
            # agent_actions[2][5]=1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)

            # # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()

            #if et_i == config.episode_length - 1:
            #print(infos)
            #print(type(infos['cover_ratio']))
            #cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #print(infos)

            #            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
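            # NOTE: the replay-buffer push and SAC updates below are disabled
            # (wrapped in a triple-quoted string), so this loop only renders rollouts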
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):

                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t, " update_count:", update_count)

                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        logger.export_scalars_to_json(str(log_dir / 'summary.json'))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()
Example #20
def evaluate(config):
    if config.seed_num is None:
        all_seeds = list((DirectoryManager.root / config.storage_name /
                          f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')

    # Creates paths and directories

    seed_path = DirectoryManager.root / config.storage_name / f"experiment{config.experiment_num}" / f"seed{config.seed_num}"
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)
    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks

    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects

    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents

    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward

    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
        env_params['use_max_speed'] = False

    # Initializes model and environment
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()

        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()

        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))

        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[
            config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))

        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[
            config.
            interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []
    all_episodes_agent_embeddings = []
    all_episodes_coach_embeddings = []
    all_trajs = []

    overide_color = None

    color_agents = True

    if env_params['env_name'] == 'bounce':
        env.agents[0].size = 1. * env.agents[0].size
        env.world.overwrite = config.overwrite
    elif env_params['env_name'] == 'spread':
        color_agents = False
    elif env_params['env_name'] == 'compromise':
        env.agents[0].lightness = 0.9
        env.world.landmarks[0].lightness = 0.9
        env.agents[1].lightness = 0.5
        env.world.landmarks[1].lightness = 0.5
        # cmo = plt.cm.get_cmap('viridis')
        env.world.overwrite = config.overwrite
        # overide_color = [np.array(cmo(float(i) / float(2))[:3]) for i in range(2)]

    # set_seeds_env(2, env)
    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        # set_seeds(2)
        # set_seeds_env(2, env)
        agent_embeddings = []
        coach_embeddings = []
        traj = []
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()

        if config.save_gifs:
            frames = None
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions, torch_embed = algorithm.select_action(
                torch_obs,
                is_exploring=False if config.noise_scale is None else True,
                return_embed=True)
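            # the coach scores the concatenation of all agents' observations;
            # keep its one-hot embedding for later analysis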
            torch_total_obs = torch.cat(torch_obs, dim=-1)
            coach_embed = onehot_from_logits(
                algorithm.coach.model(torch_total_obs))
            coach_embeddings.append(coach_embed.data.numpy().squeeze())
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            embeds = [emb.data.numpy().squeeze() for emb in torch_embed]
            agent_embeddings.append(embeds)
            # steps forward in the environment
            next_obs, rewards, dones, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)
            traj.append((obs, actions, next_obs, rewards, dones))
            obs = next_obs
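            # recolor each agent according to the argmax of its embedding (Set1 palette)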
            colors = list(cm.get_cmap('Set1').colors[:len(embeds[0])])
            if overide_color is not None:
                colors[0] = overide_color[0]
                colors[2] = overide_color[1]
            if color_agents:
                for agent, emb in zip(env.agents, embeds):
                    agent.color = colors[np.argmax(emb)]

            # record frames
            if config.save_gifs:
                frames = [] if frames is None else frames
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')

            if all(dones) and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        # print(ep_recorder.get_total_reward())
        total_reward.append(ep_recorder.get_total_reward())
        all_episodes_agent_embeddings.append(agent_embeddings)
        all_episodes_coach_embeddings.append(coach_embeddings)
        all_trajs.append(traj)

    # Saves a gif of the last rendered episode (frames is reset at each episode start)
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)

        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
               ).exists():
            gif_num += 1
        imageio.mimsave(str(
            gif_path /
            f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
        ),
                        frames,
                        duration=ifi)
    env.close()

    embeddings = {
        'agents': all_episodes_agent_embeddings,
        'coach': all_episodes_coach_embeddings
    }

    save_folder = dir_manager.experiment_dir if config.save_to_exp_folder else dir_manager.seed_dir
    embeddings_path = U.directory_tree.uniquify(
        save_folder / f"{config.file_name_to_save}.pkl")
    trajs_path = osp.splitext(embeddings_path)[0] + "_trajs.pkl"

    with open(embeddings_path, 'wb') as fp:
        pickle.dump(embeddings, fp)

    with open(trajs_path, 'wb') as fp:
        pickle.dump(all_trajs, fp)

    return total_reward, str(embeddings_path)
Example #21
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal, Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from tensorboardX import SummaryWriter
from utils.make_env import make_env
from torch.autograd import Variable
import imageio

# Parameters
gamma = 0.95
render = False
seed = 1
log_interval = 10

env = make_env("simple_spread", discrete_action=True)
num_state = env.observation_space[0].shape[0]
num_action = env.action_space[0].n
#torch.manual_seed(seed)
#env.seed(seed)
Transition = namedtuple(
    'Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])


class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(num_state, 100)
        self.action_head = nn.Linear(100, num_action)

    def forward(self, x):
        # plausible completion of the truncated snippet: ReLU hidden layer
        # followed by a softmax over the discrete action logits
        x = F.relu(self.fc1(x))
        action_prob = F.softmax(self.action_head(x), dim=1)
        return action_prob
Example #22
 def init_env():
     env = make_env(env_id)
     env.seed(seed + rank * 1000)
     np.random.seed(seed + rank * 1000)
     return env
Example #23
 def init_env():
     env = make_env(env_id, discrete_action=discrete_action)
     env.seed(seed + rank * 1000)
     np.random.seed(seed + rank * 1000)
     # pdb.set_trace()
     return env
Example #24
 def init_env():
     env = make_env(**kwargs)
     env.seed(kwargs['seed'] + rank * 1000)
     np.random.seed(kwargs['seed'] + rank * 1000)
     return env
Example #25
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs' if not config.mixed_policies else model_path.parent / 'gifs_mixed'
        gif_path.mkdir(exist_ok=True)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('./models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id,
                   benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.world.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
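    # pre-allocate rollout statistics: per-step benchmark infos (up to 10 fields)
    # and (x, y) positions for every agent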
    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    all_positions = np.zeros(
        (config.n_episodes, config.episode_length, maddpg.nagents, 2))
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else Variable(torch.Tensor(obs[i]),
                                                      requires_grad=False)
                for i in range(maddpg.nagents)
            ]

            all_positions[ep_i, t_i] = env.get_positions()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
                # frames.append(env.world.viewers[0].render(return_rgb_array = True)) uncomment if local views visible
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i,
                          t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])

        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=ifi)

    env.close()

    if config.save_stats:
        stats_path = model_path.parent / 'stats' if not config.mixed_policies else model_path.parent / 'stats_mixed'
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
Example #26
 def init_env():
     env = make_env(original_drug_smile, original_target, Hyperparams,
                    atoms_, model_to_explain, original_drug,
                    original_target_aff, pred_aff, device, cof)
     return env