Example #1
# imports added for completeness; they follow the multiagent-particle-envs package layout
import argparse

from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env,i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
        # display rewards
        #for agent in env.world.agents:
        #    print(agent.name + " reward: %0.3f" % env._get_reward(agent))
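
Examples #2 to #4 below build the environment through a `make_env` helper instead of instantiating `MultiAgentEnv` directly. A minimal sketch of such a helper, modeled on the standard make_env.py from multiagent-particle-envs (the variant used below additionally accepts `use_discrete_action`, `use_max_speed` and `world_params`, which are project-specific), could look like this:

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

def make_env(scenario_name, benchmark=False):
    # load the scenario module and build its world
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    # benchmark runs also pass scenario.benchmark_data as the info callback
    if benchmark:
        return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                             scenario.observation, scenario.benchmark_data)
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation)
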
Example #2
def evaluate(config):
    DirectoryManager.root = Path(config.root)

    if config.seed_num is None:
        all_seeds = list((DirectoryManager.root / config.storage_name /
                          f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')

    # Creates paths and directories

    seed_path = DirectoryManager.root / config.storage_name / f"experiment{config.experiment_num}" / f"seed{config.seed_num}"
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)
    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks

    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects

    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents

    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward

    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
        env_params['use_max_speed'] = False

    # Initializes model and environment
    set_seeds(config.rollout_seed)
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)
    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()

        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()

        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))

        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))

        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []

    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()

        if config.save_gifs:
            frames = [] if ep_i == 0 else frames
            frames.append(env.render('rgb_array')[0])
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions = algorithm.select_action(
                torch_obs, is_exploring=config.noise_scale is not None)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # steps forward in the environment
            obs, rewards, done, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)

            # record frames
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human', close=False)

            if done and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        total_reward.append(ep_recorder.get_total_reward())

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)

        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
               ).exists():
            gif_num += 1
        imageio.mimsave(str(
            gif_path /
            f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
        ),
                        frames,
                        duration=ifi)
    env.close()

    return total_reward
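
The `config` argument above is just an attribute container. A minimal sketch of one way to build it and call `evaluate` (the field names are exactly those read by the function body; the values shown are illustrative only):

from argparse import Namespace

config = Namespace(
    # experiment location
    root='./storage', storage_name='my_storage', experiment_num=1, seed_num=None,
    # model selection
    incremental=None, last_model=False,
    # optional world_params overrides
    shuffle_landmarks=None, color_objects=None, small_agents=None,
    individual_reward=None, use_dense_rewards=None,
    # scripted opponents / manual control
    env_name='spread', runner_prey=False, rusher_predators=False,
    pendulum_agent=None, interactive_agent=None,
    # rollout settings
    rollout_seed=1, noise_scale=None, n_episodes=10, episode_length=100,
    fps=30, render=True, save_gifs=False, interrupt_episode=True)

total_reward = evaluate(config)
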
Example #3
def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')

        # load the scenario and create the world (assumes `multiagent.scenarios`
        # is imported as `scenarios` and arglist.scenario names the scenario script,
        # as in Example #1)
        scenario = scenarios.load(arglist.scenario).Scenario()
        world = scenario.make_world()
        # create multiagent environment
        env = MultiAgentEnv(world,
                            scenario.reset_world,
                            scenario.reward,
                            scenario.observation,
                            info_callback=None,
                            shared_viewer=True)
        env.window_pos = 'right'
        # render call to create viewer window (necessary only for interactive policies)
        env.render()
        # create interactive policies for one agent
        policy = InteractivePolicy(env, -1)
        # execution loop
        obs_n = env.reset()
        while True:
            # query for action from each agent's policy
            act_n = [agent.action(obs)
                     for agent, obs in zip(trainers, obs_n)]  # trained policy
            act_n[-1] = policy.action(obs_n[-1])  # interactive keyboard policy
            # step environment
            new_obs_n, reward_n, done_n, info_n = env.step(act_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], act_n[i], reward_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(reward_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for game over
            try:
                if scenario.game_over:
                    sys.exit(0)
            except AttributeError:
                pass

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # render all agent views
            time.sleep(arglist.delay)
            env.render()
            # display rewards
            # for agent in env.world.agents:
            #     print(agent.name + " reward: %0.3f" % env._get_reward(agent))

            # update all trainers (this snippet runs the update step unconditionally)
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                U.save_state(arglist.plots_dir, saver=saver)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
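
Once enough episodes have been played, `play()` dumps the running mean rewards to `<plots_dir><exp_name>_rewards.pkl` and `<plots_dir><exp_name>_agrewards.pkl`. A short sketch for plotting that training curve afterwards (the file path is illustrative):

import pickle
import matplotlib.pyplot as plt

# illustrative path: play() writes to arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
with open('./plots/my_experiment_rewards.pkl', 'rb') as fp:
    final_ep_rewards = pickle.load(fp)  # one mean reward per save_rate episodes

plt.plot(final_ep_rewards)
plt.xlabel('checkpoint (every save_rate episodes)')
plt.ylabel('mean episode reward')
plt.show()
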
Example #4
def evaluate(config):
    if config.seed_num is None:
        all_seeds = list((DirectoryManager.root / config.storage_name /
                          f"experiment{config.experiment_num}").iterdir())
        config.seed_num = all_seeds[0].stem.strip('seed')

    # Creates paths and directories

    seed_path = DirectoryManager.root / config.storage_name / f"experiment{config.experiment_num}" / f"seed{config.seed_num}"
    dir_manager = DirectoryManager.init_from_seed_path(seed_path)
    if config.incremental is not None:
        model_path = dir_manager.incrementals_dir / (
            f'model_ep{config.incremental}.pt')
    elif config.last_model:
        last_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and not path.stem.endswith('best')
        ]
        assert len(last_models) == 1
        model_path = last_models[0]
    else:
        best_models = [
            path for path in dir_manager.seed_dir.iterdir()
            if path.suffix == ".pt" and path.stem.endswith('best')
        ]
        assert len(best_models) == 1
        model_path = best_models[0]

    # Retrieves world_params if there were any (see make_world function in multiagent.scenarios)
    if (dir_manager.seed_dir / 'world_params.json').exists():
        world_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'world_params.json'))
    else:
        world_params = {}

    # Overwrites world_params if specified
    if config.shuffle_landmarks is not None:
        world_params['shuffle_landmarks'] = config.shuffle_landmarks

    if config.color_objects is not None:
        world_params['color_objects'] = config.color_objects

    if config.small_agents is not None:
        world_params['small_agents'] = config.small_agents

    if config.individual_reward is not None:
        world_params['individual_reward'] = config.individual_reward

    if config.use_dense_rewards is not None:
        world_params['use_dense_rewards'] = config.use_dense_rewards

    # Retrieves env_params (see multiagent.environment.MultiAgentEnv)
    if (dir_manager.seed_dir / 'env_params.json').exists():
        env_params = load_dict_from_json(
            str(dir_manager.seed_dir / 'env_params.json'))
    else:
        env_params = {}
        env_params['use_max_speed'] = False

    # Initializes model and environment
    algorithm = init_from_save(model_path)
    env = make_env(scenario_name=env_params['env_name'],
                   use_discrete_action=algorithm.use_discrete_action,
                   use_max_speed=env_params['use_max_speed'],
                   world_params=world_params)

    if config.render:
        env.render()

    if config.runner_prey:
        # makes sure the environment involves a prey
        assert config.env_name.endswith('tag')
        runner_policy = RunnerPolicy()

        for agent in env.world.agents:
            if agent.adversary:
                agent.action_callback = runner_policy.action

    if config.rusher_predators:
        # makes sure the environment involves predators
        assert config.env_name.endswith('tag')
        rusher_policy = RusherPolicy()

        for agent in env.world.agents:
            if not agent.adversary:
                agent.action_callback = rusher_policy.action

    if config.pendulum_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.pendulum_agent in list(range(len(env.world.agents)))

        pendulum_policy = DoublePendulumPolicy()
        env.world.agents[config.pendulum_agent].action_callback = pendulum_policy.action

    if config.interactive_agent is not None:
        # makes sure the agent to be controlled has a valid id
        assert config.interactive_agent in list(range(len(env.world.agents)))

        interactive_policy = InteractivePolicy(env, viewer_id=0)
        env.world.agents[config.interactive_agent].action_callback = interactive_policy.action

    algorithm.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    total_reward = []
    all_episodes_agent_embeddings = []
    all_episodes_coach_embeddings = []
    all_trajs = []

    override_color = None

    color_agents = True

    if env_params['env_name'] == 'bounce':
        env.agents[0].size = 1. * env.agents[0].size
        env.world.overwrite = config.overwrite
    elif env_params['env_name'] == 'spread':
        color_agents = False
    elif env_params['env_name'] == 'compromise':
        env.agents[0].lightness = 0.9
        env.world.landmarks[0].lightness = 0.9
        env.agents[1].lightness = 0.5
        env.world.landmarks[1].lightness = 0.5
        # cmo = plt.cm.get_cmap('viridis')
        env.world.overwrite = config.overwrite
        # override_color = [np.array(cmo(float(i) / float(2))[:3]) for i in range(2)]

    # set_seeds_env(2, env)
    # EPISODES LOOP
    for ep_i in range(config.n_episodes):
        # set_seeds(2)
        # set_seeds_env(2, env)
        agent_embeddings = []
        coach_embeddings = []
        traj = []
        ep_recorder = EpisodeRecorder(stuff_to_record=['reward'])

        # Resets the environment
        obs = env.reset()

        if config.save_gifs:
            frames = None
        if config.render:
            env.render('human')

        if not algorithm.soft:
            # Resets exploration noise
            algorithm.scale_noise(config.noise_scale)
            algorithm.reset_noise()

        # STEPS LOOP
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(algorithm.nagents)
            ]
            # get actions as torch Variables
            torch_actions, torch_embed = algorithm.select_action(
                torch_obs,
                is_exploring=config.noise_scale is not None,
                return_embed=True)
            torch_total_obs = torch.cat(torch_obs, dim=-1)
            coach_embed = onehot_from_logits(
                algorithm.coach.model(torch_total_obs))
            coach_embeddings.append(coach_embed.data.numpy().squeeze())
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            embeds = [emb.data.numpy().squeeze() for emb in torch_embed]
            agent_embeddings.append(embeds)
            # steps forward in the environment
            next_obs, rewards, dones, infos = env.step(actions)
            ep_recorder.add_step(None, None, rewards, None)
            traj.append((obs, actions, next_obs, rewards, dones))
            obs = next_obs
            colors = list(cm.get_cmap('Set1').colors[:len(embeds[0])])
            if override_color is not None:
                colors[0] = override_color[0]
                colors[2] = override_color[1]
            if color_agents:
                for agent, emb in zip(env.agents, embeds):
                    agent.color = colors[np.argmax(emb)]

            # record frames
            if config.save_gifs:
                frames = [] if frames is None else frames
                frames.append(env.render('rgb_array')[0])

            if config.render or config.save_gifs:
                # Enforces the fps config
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render('human')

            if all(dones) and config.interrupt_episode:
                if config.render:
                    time.sleep(2)
                break

        # print(ep_recorder.get_total_reward())
        total_reward.append(ep_recorder.get_total_reward())
        all_episodes_agent_embeddings.append(agent_embeddings)
        all_episodes_coach_embeddings.append(coach_embeddings)
        all_trajs.append(traj)

    # Saves gif of all the episodes
    if config.save_gifs:
        gif_path = dir_manager.storage_dir / 'gifs'
        gif_path.mkdir(exist_ok=True)

        gif_num = 0
        while (gif_path /
               f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
               ).exists():
            gif_num += 1
        imageio.mimsave(str(
            gif_path /
            f"{env_params['env_name']}__experiment{config.experiment_num}_seed{config.seed_num}_{gif_num}.gif"
        ),
                        frames,
                        duration=ifi)
    env.close()

    embeddings = {
        'agents': all_episodes_agent_embeddings,
        'coach': all_episodes_coach_embeddings
    }

    save_folder = dir_manager.experiment_dir if config.save_to_exp_folder else dir_manager.seed_dir
    embeddings_path = U.directory_tree.uniquify(
        save_folder / f"{config.file_name_to_save}.pkl")
    trajs_path = osp.splitext(embeddings_path)[0] + "_trajs.pkl"

    with open(embeddings_path, 'wb') as fp:
        pickle.dump(embeddings, fp)

    with open(trajs_path, 'wb') as fp:
        pickle.dump(all_trajs, fp)

    return total_reward, str(embeddings_path)
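
This version of `evaluate` additionally returns the path of the pickled embeddings. A minimal sketch for reading the saved artifacts back, reusing a config like the one sketched after Example #2 (the `_trajs.pkl` suffix mirrors the code above):

import os.path as osp
import pickle

total_reward, embeddings_path = evaluate(config)

# embeddings is a dict with 'agents' and 'coach' keys, one list of embeddings per episode
with open(embeddings_path, 'rb') as fp:
    embeddings = pickle.load(fp)

# trajectories are stored next to the embeddings; each step is an
# (obs, actions, next_obs, rewards, dones) tuple
trajs_path = osp.splitext(embeddings_path)[0] + "_trajs.pkl"
with open(trajs_path, 'rb') as fp:
    all_trajs = pickle.load(fp)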