Example #1
    def __init__(self, config_env):

        self.name = 'ssd'
        self.config = config_env
        self.dim_obs = [self.config.obs_height, self.config.obs_width, 3]
        self.max_steps = self.config.max_steps

        self.cleaning_penalty = self.config.cleaning_penalty
        # Original space (not necessarily in this order, see
        # the original ssd files):
        # no-op, up, down, left, right, turn-ccw, turn-cw, penalty, clean
        if (self.config.disable_left_right_action
                and self.config.disable_rotation_action):
            self.l_action = 4
            self.cleaning_action_idx = 3
            # up, down, no-op, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 8}
        elif self.config.disable_left_right_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # up, down, no-op, rotate cw, rotate ccw, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8}
        elif self.config.disable_rotation_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # left, right, up, down, no-op, clean
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8}
        else:  # full action space except penalty beam
            self.l_action = 8
            self.cleaning_action_idx = 7
            # Don't allow penalty beam
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8}

        self.obs_cleaned_1hot = self.config.obs_cleaned_1hot

        self.n_agents = self.config.n_agents

        if self.config.map_name == 'cleanup_small_sym':
            ascii_map = maps.CLEANUP_SMALL_SYM
        elif self.config.map_name == 'cleanup_10x10_sym':
            ascii_map = maps.CLEANUP_10x10_SYM

        self.env = CleanupEnv(
            ascii_map=ascii_map,
            num_agents=self.n_agents,
            render=False,
            shuffle_spawn=self.config.shuffle_spawn,
            global_ref_point=self.config.global_ref_point,
            view_size=self.config.view_size,
            random_orientation=self.config.random_orientation,
            cleanup_params=self.config.cleanup_params,
            beam_width=self.config.beam_width)

        # length of action input to learned reward function
        if self.config.obs_cleaned_1hot:
            self.l_action_for_r = 2
        else:
            self.l_action_for_r = self.l_action

        self.steps = 0
Example #2
    def __init__(self, env_name='harvest', num_agents=1):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR,
                                  num_agents=num_agents,
                                  render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            print('Error! Not a valid environment type')
            return

        self.num_agents = num_agents

        self.agent_policies = []
        self.agents = list(self.env.agents.values())
        # print(agents[0].action_space)
        self.action_dim = self.agents[0].action_space.n
        for _ in range(num_agents):
            # TODO: currently uses only 1 frame; later look back over several (e.g. 4) frames, or use an RNN/LSTM
            neural_net = ConvFC(
                conv_in_channels=3,  # Harvest-specific input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
                conv_out_channels=3,
                input_size=15,
                hidden_size=64,
                output_size=self.action_dim)
            self.agent_policies.append(
                DQNAgent(0, self.action_dim - 1, neural_net))

        self.env.reset()
Example #3
    def __init__(self, env_name='cleanup'):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(num_agents=5, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=5, render=True)
        else:
            print('Error! Not a valid environment type')
            return

        self.env.reset()
Example #4
 def env_creator(_):
     ascii_map = CLEANUP_MAP
     if cleanup_map == 'small':
         ascii_map = CLEANUP_MAP_SMALL
     created_env = CleanupEnv(ascii_map=ascii_map,
                              num_agents=num_agents,
                              ir_param_list=ir_param_list,
                              hit_penalty=hit_penalty,
                              fire_cost=fire_cost)
     return created_env
Example #5
    def __init__(self, args):
        self.env_name = args.env
        if self.env_name == "harvest":
            print("Initializing Harvest environment")
            self.env = HarvestEnv(num_agents=5)
        elif self.env_name == "cleanup":
            print("Initializing Cleanup environment")
            self.env = CleanupEnv(num_agents=5)
        elif self.env_name == "switch":
            print("Initializing Switch environment")
            self.env = SwitchEnv(args, num_agents=1)
        else:
            print("Error! Not a valid environment type")
            return

        self.env.reset()
Example #6
from social_dilemmas.envs.harvest import HarvestEnv
from social_dilemmas.envs.cleanup import CleanupEnv
import numpy as np
FIRING_CLEANUP_MAP = [
    '@@@@@@',
    '@    @',
    '@HHP @',
    '@RH  @',
    '@H P @',
    '@@@@@@',
]
CLEANUP_VIEW_SIZE = 1

n_agents = 2
n_states = (CLEANUP_VIEW_SIZE * 2 + 1) * (CLEANUP_VIEW_SIZE * 2 + 1) * 3
world = CleanupEnv(ascii_map=FIRING_CLEANUP_MAP, num_agents=n_agents)
world.reset()
rand_action = np.random.randint(9, size=n_agents)
obs, rew, dones, info = world.step({'agent-0': rand_action[0],
                                    'agent-1': rand_action[1]})
for key, value in obs.items():
    obs[key] = value.flatten()

def contactSta(stadict, mode):
    sta = []
    for key, value in stadict.items():
        if mode == 's':
            value = value.flatten()
        sta.append(value)
    return sta
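
A possible follow-up (not part of the original example; the variable names are illustrative): the flattened per-agent observations can be gathered with contactSta and concatenated into one joint vector.

# Illustrative only: gather the flattened per-agent observations into a
# single joint-state vector (hypothetical usage, not from the source).
sta = contactSta(obs, mode='s')    # list with one flat array per agent
joint_state = np.concatenate(sta)  # 1-D vector covering both agents
print(joint_state.shape)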
Example #7
 def env_creator(env_config):
     return CleanupEnv(env_config)
Example #8
def setup(env,
          hparams,
          algorithm,
          train_batch_size,
          num_cpus,
          num_gpus,
          num_agents,
          num_symbols,
          grid_search,
          use_gpus_for_workers=False,
          use_gpu_for_driver=False,
          num_workers_per_device=1):

    obs_space = None
    act_space = None
    if env == 'harvest':
        obs_space = HarvestEnv.observation_space(num_agents, num_symbols)
        act_space = HarvestEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return HarvestEnv(env_config)
    else:
        obs_space = CleanupEnv.observation_space(num_agents, num_symbols)
        act_space = CleanupEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return CleanupEnv(env_config)

    env_name = env + "_env"
    register_env(env_name, env_creator)

    # register the custom model
    ModelCatalog.register_custom_model(MODEL_NAME, ObedienceLSTM)

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return None, obs_space, act_space, {'custom_model': MODEL_NAME}

    # Setup with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # gets the A3C trainer and its default config
    # source at https://github.com/ray-project/ray/blob/d537e9f0d8b84414a2aba7a7d0a68d59241f1490/rllib/agents/a3c/a3c.py
    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = env_creator
    config['env_config']['env_name'] = env_name
    # config['env_config']['run'] = algorithm
    config['callbacks']['on_postprocess_traj'] = on_postprocess_traj

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "sample_batch_size": 50,
        # "batch_mode": "complete_episodes",
        # "metrics_smoothing_episodes": 1,
        "vf_loss_coeff": 0.1,
        "horizon": 1000,
        "gamma": 0.99,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": num_gpus,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policies": policy_graphs,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "model": {
            "custom_model": MODEL_NAME,
            # "custom_preprocessor": "nothing",
            "use_lstm": False,
            "custom_options": {
                "num_agents": num_agents,
                "num_symbols": num_symbols,
                "fcnet_hiddens": [32, 32],
                "cell_size": 128,
            },
            "conv_filters": [[6, [3, 3], 1]],
            # "lstm_cell_size": 128
        },
        "env_config": {
            "num_agents": num_agents,
            "num_symbols": num_symbols,
            "obedience_weight": .001,
            "leadership_weight": .001,
        },
    })

    if args.algorithm == "PPO":
        config.update({
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 500,
            "vf_loss_coeff": 1e-4
        })

    if grid_search:
        pass

    return algorithm, env_name, config
Example #9
 def env_creator(_):
     return CleanupEnv(num_agents=num_agents)
Example #10
def setup(env,
          hparams,
          algorithm,
          train_batch_size,
          num_cpus,
          num_gpus,
          num_agents,
          use_gpus_for_workers=False,
          use_gpu_for_driver=False,
          num_workers_per_device=1):

    if env == 'harvest':

        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)

        single_env = HarvestEnv()
    elif env == "harvest_comm":

        def env_creator(_):
            return HarvestCommEnv(num_agents=num_agents)

        single_env = HarvestCommEnv()
    else:

        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)

        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        }
    })
    return algorithm, env_name, config
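
The example stops at returning (algorithm, env_name, config); below is a minimal driver sketch, assuming the older Ray Tune run_experiments API that this code base targets. The hparams values, resource counts, and experiment name are illustrative guesses, not taken from the original example.

import ray
from ray.tune import run_experiments

# Hypothetical driver; hparams values, resource counts and the experiment
# name are illustrative, not taken from the original example.
hparams = {'lr_init': 1e-3, 'lr_final': 1e-4, 'entropy_coeff': 1e-3}
alg_run, env_name, config = setup('cleanup', hparams, 'A3C',
                                  train_batch_size=30000, num_cpus=4,
                                  num_gpus=0, num_agents=5)

ray.init(num_cpus=4)
run_experiments({
    'cleanup_baseline': {
        'run': alg_run,
        'env': env_name,
        'config': config,
        'stop': {'training_iteration': 100},
    }
})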
Example #11
class Controller(object):
    def __init__(self, env_name='cleanup'):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(num_agents=5, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=5, render=True)
        else:
            print('Error! Not a valid environment type')
            return

        self.env.reset()

        # TODO: initialize agents here

    def rollout(self, horizon=50, save_path=None):
        """ Rollout several timesteps of an episode of the environment.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, will save each frame to disk at this
                location.
        """
        rewards = []
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(horizon)
        ]

        for i in range(horizon):
            agents = list(self.env.agents.values())
            action_dim = agents[0].action_space.n
            rand_action = np.random.randint(action_dim, size=5)
            obs, rew, dones, info = self.env.step({
                'agent-0': rand_action[0],
                'agent-1': rand_action[1],
                'agent-2': rand_action[2],
                'agent-3': rand_action[3],
                'agent-4': rand_action[4]
            })

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(i).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[i] = rgb_arr.astype(np.uint8)
            observations.append(obs['agent-0'])
            rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self,
                       horizon=50,
                       path=None,
                       render_type='pretty',
                       fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Either 'pretty' (renders each frame to disk first) or 'fast' (builds the video directly from the RGB frames).
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        if render_type == 'pretty':
            image_path = os.path.join(path, 'frames/')
            if not os.path.exists(image_path):
                os.makedirs(image_path)

            rewards, observations, full_obs = self.rollout(
                horizon=horizon, save_path=image_path)
            utility_funcs.make_video_from_image_dir(path,
                                                    image_path,
                                                    fps=fps,
                                                    video_name=video_name)

            # Clean up images
            shutil.rmtree(image_path)
        else:
            rewards, observations, full_obs = self.rollout(horizon=horizon)
            utility_funcs.make_video_from_rgb_imgs(full_obs,
                                                   path,
                                                   fps=fps,
                                                   video_name=video_name)
Example #12
class Controller(object):
    def __init__(self, env_name='harvest', num_agents=1):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR,
                                  num_agents=num_agents,
                                  render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            print('Error! Not a valid environment type')
            return

        self.num_agents = num_agents

        self.agent_policies = []
        self.agents = list(self.env.agents.values())
        # print(agents[0].action_space)
        self.action_dim = self.agents[0].action_space.n
        for _ in range(num_agents):
            # TODO: currently uses only 1 frame; later look back over several (e.g. 4) frames, or use an RNN/LSTM
            neural_net = ConvFC(
                conv_in_channels=3,  # Harvest-specific input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
                conv_out_channels=3,
                input_size=15,
                hidden_size=64,
                output_size=self.action_dim)
            self.agent_policies.append(
                DQNAgent(0, self.action_dim - 1, neural_net))

        self.env.reset()

    def process_experiences(self,
                            id,
                            i,
                            obs,
                            action_dict,
                            rew,
                            next_obs,
                            dones,
                            train_agents=False):
        # print(id)
        # print(i)
        agent_i = "agent-{}".format(i)
        self.agent_policies[i].push_experience(
            reshape_obs_for_convfc(obs[agent_i][0]),
            action_dict[agent_i],
            rew[agent_i],
            reshape_obs_for_convfc(next_obs[agent_i][0]),
            # using the next observation without the reward info for now; can modify later, this is just a test
            dones[agent_i])

        if train_agents:
            self.agent_policies[i].q_learn_update()

    # def train_parallel_agents(self, id, obs, action_dict, rew, next_obs, dones):
    #     for i in range(self.num_agents):
    #         # torch.multiprocessing.spawn(self.train_agent, args=(i, obs, action_dict, rew, next_obs, dones))
    #         self.train_agent(id, i, obs, action_dict, rew, next_obs, dones)

    def rollout(self,
                horizon,
                train_every=100,
                save_path=None,
                train_agents=True,
                print_act=False):
        """ Rollout several timesteps of an episode of the environment.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, will save each frame to disk at this
                location.
        """

        rewards = np.zeros(self.num_agents)
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(horizon)
        ]

        init_obs = self.env.reset()
        # print(init_obs)
        obs = init_obs

        for time_step in range(horizon):
            # print(time_step )
            action_dim = self.action_dim

            # Single agent hardcoded for now

            hard_coded = False
            if hard_coded:
                action_cycle = 40
                prep_time = 4 + 2  #10
                single_obs = obs["agent-{}".format(0)][0]
                if time_step < prep_time - 2:
                    # print(single_obs)
                    # print(single_obs.shape)
                    # print(single_obs[7][7])
                    #
                    # print(single_obs[7][6])
                    # print(single_obs[6][7])
                    # print(single_obs[7][8])
                    # print(single_obs[8][7])
                    # if single_obs[8][7].sum() == 540 and single_obs[7][6].sum() == 540: # 200
                    if (single_obs[6][7].sum() == 540
                            and single_obs[7][8].sum() == 540):  # 200
                        # if single_obs[6][7].sum() == 540 and single_obs[7][6].sum() == 540: # 100
                        # if single_obs[8][7].sum() == 540 and single_obs[7][8].sum() == 540: # 100
                        action = 4
                    # elif single_obs[7][9].sum() == 0 and single_obs[5][7].sum() == 0: # lower and left empty
                    #     action = 5
                    else:
                        action = 6  # got lazy, just keep turning otherwise
                    # action = 5
                # elif time_step == prep_time - 3:
                #     # print(single_obs[7][6])
                #     # print(single_obs[6][7])
                #     # print(single_obs[7][8])
                #     # print(single_obs[8][7])
                #     action=2 # first up movement, start the cycle
                elif time_step == prep_time - 2:
                    # print(single_obs[7][6])
                    # print(single_obs[6][7])
                    # print(single_obs[7][8])
                    # print(single_obs[8][7])
                    action = 1  # first left movement, starts the cycle (note: left/right appear swapped in this env)
                    # Around 450 reward is roughly optimal in this env.
                elif time_step == prep_time - 1:
                    # print(single_obs[7][6])
                    # print(single_obs[6][7])
                    # print(single_obs[7][8])
                    # print(single_obs[8][7])
                    action = 2  # up again, for some reason
                else:
                    # if time_step == prep_time:
                    # print(single_obs[7][6])
                    # print(single_obs[6][7])
                    # print(single_obs[7][8])
                    # print(single_obs[8][7])
                    # Assumes up orientation
                    if (time_step - prep_time) % action_cycle < 16:
                        action = 1  # left
                    elif (time_step - prep_time) % action_cycle < 20:
                        action = 2
                    elif (time_step - prep_time) % action_cycle < 36:
                        action = 0  # right
                    elif (time_step - prep_time) % action_cycle < 40:
                        action = 3  # down
                    # print(action)

                actions = [action]

            action_dict = {}

            if not hard_coded:
                actions = []
                if train_agents:
                    # for i in range(self.num_agents):
                    #     print(i)
                    #     action = self.agent_policies[i].act(reshape_obs_for_convfc(obs["agent-{}".format(i)]), print_act=print_act)
                    # actions.append(action)
                    actions = [
                        self.agent_policies[i].act(reshape_obs_for_convfc(
                            obs["agent-{}".format(i)][0]),
                                                   print_act=print_act)
                        for i in range(self.num_agents)
                    ]
                else:
                    # can choose eps=0 or something else after
                    actions = [
                        self.agent_policies[i].act(reshape_obs_for_convfc(
                            obs["agent-{}".format(i)][0]),
                                                   print_act=print_act)
                        for i in range(self.num_agents)
                    ]

            for i in range(self.num_agents):
                agent_i = "agent-{}".format(i)
                action_dict[agent_i] = actions[i]
                # if train_agents:
                #     # print(ray.get(self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i]))))
                #     action_dict[agent_i] = self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i]))
                # else:
                #     action_dict[agent_i] = self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i]), epsilon=0)
                #     # 1, obs[agent_i].shape[2], obs[agent_i].shape[0], obs[agent_i].shape[1] )) # batch size = 1 for 1 obs right now...

            next_obs, rew, dones, info = self.env.step(action_dict)

            if not hard_coded:
                if train_agents:
                    for i in range(self.num_agents):
                        if ((time_step + 1) % train_every == 0):
                            self.process_experiences(0,
                                                     i,
                                                     obs,
                                                     action_dict,
                                                     rew,
                                                     next_obs,
                                                     dones,
                                                     train_agents=True)
                        else:
                            self.process_experiences(0,
                                                     i,
                                                     obs,
                                                     action_dict,
                                                     rew,
                                                     next_obs,
                                                     dones,
                                                     train_agents=False)

            obs = next_obs

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(time_step).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[time_step] = rgb_arr.astype(np.uint8)

            # rewards.append(rew)
            observations.append(obs)
            for i in range(self.num_agents):
                agent_i = "agent-{}".format(i)
                rewards[i] += rew[agent_i]
            # observations.append(obs['agent-0'])
            # rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self, horizon=50, path=None, fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Can be 'pretty' or 'fast'. Impliciations obvious.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        # if render_type == 'pretty':
        #     image_path = os.path.join(path, 'frames/')
        #     if not os.path.exists(image_path):
        #         os.makedirs(image_path)
        #
        #     rewards, observations, full_obs = self.rollout(
        #         horizon=horizon, save_path=image_path, train_agents=False)
        #     utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
        #                                             video_name=video_name)
        #
        #     # Clean up images
        #     shutil.rmtree(image_path)
        # else:
        rewards, observations, full_obs = self.rollout(horizon=horizon,
                                                       train_agents=False,
                                                       print_act=False)
        utility_funcs.make_video_from_rgb_imgs(full_obs,
                                               path,
                                               fps=fps,
                                               video_name=video_name)
        return rewards
Example #13
 def env_creator(_):
     return CleanupEnv(
         num_agents=num_agents,
         return_agent_actions=True,
         use_collective_reward=args.use_collective_reward,
     )
Example #14
 def __call__(self):
     return CleanupEnv(ascii_map=FIRING_CLEANUP_MAP,
                       num_agents=self._num_agents)
Example #15
 def __call__(self):
     return CleanupEnv()
Example #16
class Env(object):
    def __init__(self, config_env):

        self.name = 'ssd'
        self.config = config_env
        self.dim_obs = [self.config.obs_height, self.config.obs_width, 3]
        self.max_steps = self.config.max_steps

        self.cleaning_penalty = self.config.cleaning_penalty
        # Original space (not necessarily in this order, see
        # the original ssd files):
        # no-op, up, down, left, right, turn-ccw, turn-cw, penalty, clean
        if (self.config.disable_left_right_action
                and self.config.disable_rotation_action):
            self.l_action = 4
            self.cleaning_action_idx = 3
            # up, down, no-op, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 8}
        elif self.config.disable_left_right_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # up, down, no-op, rotate cw, rotate ccw, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8}
        elif self.config.disable_rotation_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # left, right, up, down, no-op, clean
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8}
        else:  # full action space except penalty beam
            self.l_action = 8
            self.cleaning_action_idx = 7
            # Don't allow penalty beam
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8}

        self.obs_cleaned_1hot = self.config.obs_cleaned_1hot

        self.n_agents = self.config.n_agents

        if self.config.map_name == 'cleanup_small_sym':
            ascii_map = maps.CLEANUP_SMALL_SYM
        elif self.config.map_name == 'cleanup_10x10_sym':
            ascii_map = maps.CLEANUP_10x10_SYM

        self.env = CleanupEnv(
            ascii_map=ascii_map,
            num_agents=self.n_agents,
            render=False,
            shuffle_spawn=self.config.shuffle_spawn,
            global_ref_point=self.config.global_ref_point,
            view_size=self.config.view_size,
            random_orientation=self.config.random_orientation,
            cleanup_params=self.config.cleanup_params,
            beam_width=self.config.beam_width)

        # length of action input to learned reward function
        if self.config.obs_cleaned_1hot:
            self.l_action_for_r = 2
        else:
            self.l_action_for_r = self.l_action

        self.steps = 0

    def process_obs(self, obs_dict):

        return [obs / 256.0 for obs in obs_dict.values()]

    def reset(self):
        """Resets the environemnt.

        Returns:
            List of agent observations
        """
        obs = self.env.reset()
        self.steps = 0

        return self.process_obs(obs)

    def step(self, actions):
        """Takes a step in env.
        
        Args:
            actions: list of integers

        Returns:
            List of observations, list of rewards, done, info
        """
        actions = [self.map_to_orig[a] for a in actions]
        actions_dict = {
            'agent-%d' % idx: actions[idx]
            for idx in range(self.n_agents)
        }

        # all objects returned by env.step are dicts
        obs_next, rewards, dones, info = self.env.step(actions_dict)
        self.steps += 1

        obs_next = self.process_obs(obs_next)
        rewards = list(rewards.values())
        if self.cleaning_penalty > 0:
            for idx in range(self.n_agents):
                if actions[idx] == 8:
                    rewards[idx] -= self.cleaning_penalty

        # done = dones['__all__']  # apparently they hardcode done to False
        done = dones['__all__'] or self.steps == self.max_steps

        return obs_next, rewards, done, info

    def render(self):

        self.env.render()
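
Downstream usage is not shown in the example; below is a minimal interaction sketch, assuming a config_env object supplied by the surrounding project (it must provide the fields read in __init__ above). Everything else follows directly from reset() and step().

import numpy as np

# Assumes `config_env` is constructed elsewhere in the project; this loop is
# only meant to illustrate the reset()/step() contract of the wrapper above.
env = Env(config_env)
list_obs = env.reset()
total_rewards = np.zeros(env.n_agents)
done = False
while not done:
    # Sample actions in the reduced action space; step() remaps them to the
    # original SSD action ids via map_to_orig.
    actions = list(np.random.randint(env.l_action, size=env.n_agents))
    list_obs, rewards, done, info = env.step(actions)
    total_rewards += rewards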