def env_creator(_):
    ascii_map = CLEANUP_MAP
    if cleanup_map == 'small':
        ascii_map = CLEANUP_MAP_SMALL
    created_env = CleanupEnv(ascii_map=ascii_map,
                             num_agents=num_agents,
                             ir_param_list=ir_param_list,
                             hit_penalty=hit_penalty,
                             fire_cost=fire_cost)
    return created_env

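# Hedged usage sketch (not in the original snippet): a creator like the one
# above is normally registered with RLlib by name so remote workers can rebuild
# the environment themselves. The name 'cleanup_env' is an assumption;
# cleanup_map, num_agents, ir_param_list, hit_penalty and fire_cost are assumed
# to be defined in the enclosing scope the creator closes over.
from ray.tune.registry import register_env

register_env('cleanup_env', env_creator)
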
def __init__(self, args):
    self.env_name = args.env
    if self.env_name == "harvest":
        print("Initializing Harvest environment")
        self.env = HarvestEnv(num_agents=5)
    elif self.env_name == "cleanup":
        print("Initializing Cleanup environment")
        self.env = CleanupEnv(num_agents=5)
    elif self.env_name == "switch":
        print("Initializing Switch environment")
        self.env = SwitchEnv(args, num_agents=1)
    else:
        print("Error! Not a valid environment type")
        return
    self.env.reset()

from social_dilemmas.envs.harvest import HarvestEnv
from social_dilemmas.envs.cleanup import CleanupEnv
import numpy as np

FIRING_CLEANUP_MAP = [
    '@@@@@@',
    '@    @',
    '@HHP @',
    '@RH  @',
    '@H P @',
    '@@@@@@',
]

CLEANUP_VIEW_SIZE = 1

n_agents = 2
n_states = (CLEANUP_VIEW_SIZE * 2 + 1) * (CLEANUP_VIEW_SIZE * 2 + 1) * 3

world = CleanupEnv(ascii_map=FIRING_CLEANUP_MAP, num_agents=2)
world.reset()

rand_action = np.random.randint(9, size=2)
obs, rew, dones, info = world.step({'agent-0': rand_action[0],
                                    'agent-1': rand_action[1]})
for key, value in obs.items():
    value = value.flatten()
    obs[key] = value


def contactSta(stadict, mode):
    sta = []
    for key, value in stadict.items():
        if mode == 's':
            value = value.flatten()
        sta.append(value)
    return sta

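# Hedged usage sketch (not in the original snippet): contactSta appears to
# collect per-agent observations into a list, flattening each one when
# mode == 's'. Applied to the step output above it might be used like this;
# the concatenated length equals n_agents * n_states only if the env's view
# size actually matches CLEANUP_VIEW_SIZE, which is an assumption here.
flat_states = contactSta(obs, 's')
joint_state = np.concatenate(flat_states)
print(joint_state.shape)
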
def env_creator(env_config):
    return CleanupEnv(env_config)

def setup(env, hparams, algorithm, train_batch_size, num_cpus, num_gpus,
          num_agents, num_symbols, grid_search, use_gpus_for_workers=False,
          use_gpu_for_driver=False, num_workers_per_device=1):

    obs_space = None
    act_space = None
    if env == 'harvest':
        obs_space = HarvestEnv.observation_space(num_agents, num_symbols)
        act_space = HarvestEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return HarvestEnv(env_config)
    else:
        obs_space = CleanupEnv.observation_space(num_agents, num_symbols)
        act_space = CleanupEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return CleanupEnv(env_config)

    env_name = env + "_env"
    register_env(env_name, env_creator)

    # register the custom model
    ModelCatalog.register_custom_model(MODEL_NAME, ObedienceLSTM)

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return None, obs_space, act_space, {'custom_model': MODEL_NAME}

    # Setup with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # gets the A3C trainer and its default config, source at
    # https://github.com/ray-project/ray/blob/d537e9f0d8b84414a2aba7a7d0a68d59241f1490/rllib/agents/a3c/a3c.py
    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = env_creator
    config['env_config']['env_name'] = env_name
    # config['env_config']['run'] = algorithm
    config['callbacks']['on_postprocess_traj'] = on_postprocess_traj

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "sample_batch_size": 50,
        # "batch_mode": "complete_episodes",
        # "metrics_smoothing_episodes": 1,
        "vf_loss_coeff": 0.1,
        "horizon": 1000,
        "gamma": 0.99,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": num_gpus,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policies": policy_graphs,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "model": {
            "custom_model": MODEL_NAME,
            # "custom_preprocessor": "nothing",
            "use_lstm": False,
            "custom_options": {
                "num_agents": num_agents,
                "num_symbols": num_symbols,
                "fcnet_hiddens": [32, 32],
                "cell_size": 128,
            },
            "conv_filters": [[6, [3, 3], 1]],
            # "lstm_cell_size": 128
        },
        "env_config": {
            "num_agents": num_agents,
            "num_symbols": num_symbols,
            "obedience_weight": .001,
            "leadership_weight": .001,
        },
    })

    if args.algorithm == "PPO":
        config.update({
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 500,
            "vf_loss_coeff": 1e-4
        })

    if args.grid_search:
        pass

    return algorithm, env_name, config

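# Hedged usage sketch (not in the original snippet): setup() returns the
# trainer name, the registered env name and a ready config, which are typically
# handed straight to Ray Tune. 'args' and the 'hparams' dict (with 'lr_init',
# 'lr_final' and 'entropy_coeff' keys) are assumed to come from the surrounding
# training script; the stopping criterion is illustrative.
import ray
from ray import tune

ray.init(num_cpus=args.num_cpus)
alg_run, env_name, config = setup(args.env, hparams, args.algorithm,
                                  args.train_batch_size, args.num_cpus,
                                  args.num_gpus, args.num_agents,
                                  args.num_symbols, args.grid_search)
tune.run(alg_run, name=env_name, config=config,
         stop={"training_iteration": 500})
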
def env_creator(_):
    return CleanupEnv(num_agents=num_agents)

def setup(env, hparams, algorithm, train_batch_size, num_cpus, num_gpus,
          num_agents, use_gpus_for_workers=False, use_gpu_for_driver=False,
          num_workers_per_device=1):

    if env == 'harvest':
        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)
        single_env = HarvestEnv()
    elif env == "harvest_comm":
        def env_creator(_):
            return HarvestCommEnv(num_agents=num_agents)
        single_env = HarvestCommEnv()
    else:
        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)
        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        }
    })
    return algorithm, env_name, config

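# Hedged example (not in the original snippet): both setup() variants above
# read learning-rate and entropy settings from an 'hparams' dict. A minimal
# dict with the expected keys looks like this; the values are placeholders,
# not the authors' tuned settings.
hparams = {
    'lr_init': 1e-3,
    'lr_final': 1e-4,
    'entropy_coeff': 1e-3,
}
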
class Controller(object):

    def __init__(self, env_name='cleanup'):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(num_agents=5, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=5, render=True)
        else:
            print('Error! Not a valid environment type')
            return
        self.env.reset()
        # TODO: initialize agents here

    def rollout(self, horizon=50, save_path=None):
        """
        Rollout several timesteps of an episode of the environment.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, will save each frame to disk at this location.
        """
        rewards = []
        observations = []
        shape = self.env.world_map.shape
        full_obs = [np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
                    for i in range(horizon)]

        for i in range(horizon):
            agents = list(self.env.agents.values())
            action_dim = agents[0].action_space.n
            rand_action = np.random.randint(action_dim, size=5)
            obs, rew, dones, info = self.env.step({
                'agent-0': rand_action[0],
                'agent-1': rand_action[1],
                'agent-2': rand_action[2],
                'agent-3': rand_action[3],
                'agent-4': rand_action[4]
            })

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' + str(i).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[i] = rgb_arr.astype(np.uint8)
            observations.append(obs['agent-0'])
            rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self, horizon=50, path=None, render_type='pretty', fps=8):
        """
        Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Can be 'pretty' or 'fast'. Implications obvious.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
        if not os.path.exists(path):
            os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        if render_type == 'pretty':
            image_path = os.path.join(path, 'frames/')
            if not os.path.exists(image_path):
                os.makedirs(image_path)

            rewards, observations, full_obs = self.rollout(
                horizon=horizon, save_path=image_path)
            utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
                                                    video_name=video_name)

            # Clean up images
            shutil.rmtree(image_path)
        else:
            rewards, observations, full_obs = self.rollout(horizon=horizon)
            utility_funcs.make_video_from_rgb_imgs(full_obs, path, fps=fps,
                                                   video_name=video_name)

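# Hedged usage sketch (not in the original snippet): driving the Controller
# above end to end by rendering a short random rollout to video. The horizon
# and fps values are illustrative, not the authors' settings.
if __name__ == '__main__':
    controller = Controller(env_name='cleanup')
    controller.render_rollout(horizon=100, render_type='fast', fps=8)
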
class Controller(object):

    def __init__(self, env_name='harvest', num_agents=1):
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR,
                                  num_agents=num_agents, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            print('Error! Not a valid environment type')
            return
        self.num_agents = num_agents
        self.agent_policies = []
        self.agents = list(self.env.agents.values())
        # print(agents[0].action_space)
        self.action_dim = self.agents[0].action_space.n
        for _ in range(num_agents):
            # TODO right now only using 1 frame; update later to look back x (e.g. 4) frames, later RNN/LSTM
            neural_net = ConvFC(
                conv_in_channels=3,  # harvest specific input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
                conv_out_channels=3,
                input_size=15,
                hidden_size=64,
                output_size=self.action_dim)
            self.agent_policies.append(
                DQNAgent(0, self.action_dim - 1, neural_net))
        self.env.reset()

    def process_experiences(self, id, i, obs, action_dict, rew, next_obs,
                            dones, train_agents=False):
        # print(id)
        # print(i)
        agent_i = "agent-{}".format(i)
        self.agent_policies[i].push_experience(
            reshape_obs_for_convfc(obs[agent_i][0]),
            action_dict[agent_i],
            rew[agent_i],
            # using the next obs without the reward info here; can modify later, this is just a test
            reshape_obs_for_convfc(next_obs[agent_i][0]),
            dones[agent_i])
        if train_agents:
            self.agent_policies[i].q_learn_update()

    # def train_parallel_agents(self, id, obs, action_dict, rew, next_obs, dones):
    #     for i in range(self.num_agents):
    #         # torch.multiprocessing.spawn(self.train_agent, args=(i, obs, action_dict, rew, next_obs, dones))
    #         self.train_agent(id, i, obs, action_dict, rew, next_obs, dones)

    def rollout(self, horizon, train_every=100, save_path=None,
                train_agents=True, print_act=False):
        """
        Rollout several timesteps of an episode of the environment.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, will save each frame to disk at this location.
""" rewards = np.zeros(self.num_agents) observations = [] shape = self.env.world_map.shape full_obs = [ np.zeros((shape[0], shape[1], 3), dtype=np.uint8) for i in range(horizon) ] init_obs = self.env.reset() # print(init_obs) obs = init_obs for time_step in range(horizon): # print(time_step ) action_dim = self.action_dim # Single agent hardcoded for now hard_coded = False if hard_coded: action_cycle = 40 prep_time = 4 + 2 #10 single_obs = obs["agent-{}".format(0)][0] if time_step < prep_time - 2: # print(single_obs) # print(single_obs.shape) # print(single_obs[7][7]) # # print(single_obs[7][6]) # print(single_obs[6][7]) # print(single_obs[7][8]) # print(single_obs[8][7]) # if single_obs[8][7].sum() == 540 and single_obs[7][6].sum() == 540: # 200 if single_obs[6][7].sum() == 540 and single_obs[7][8].sum( ) == 540: # 200 # if single_obs[6][7].sum() == 540 and single_obs[7][6].sum() == 540: # 100 # if single_obs[8][7].sum() == 540 and single_obs[7][8].sum() == 540: # 100 action = 4 # elif single_obs[7][9].sum() == 0 and single_obs[5][7].sum() == 0: # lower and left empty # action = 5 else: action = 6 # got lazy, just keep turning otherwise # action = 5 # elif time_step == prep_time - 3: # # print(single_obs[7][6]) # # print(single_obs[6][7]) # # print(single_obs[7][8]) # # print(single_obs[8][7]) # action=2 # first up movement, start the cycle elif time_step == prep_time - 2: # print(single_obs[7][6]) # print(single_obs[6][7]) # print(single_obs[7][8]) # print(single_obs[8][7]) action = 1 #0 # first left movement, start the cycle # left and right are wrong? Yeah they messed it up # Um anyway... around 450 is optimal in this env. elif time_step == prep_time - 1: # print(single_obs[7][6]) # print(single_obs[6][7]) # print(single_obs[7][8]) # print(single_obs[8][7]) action = 2 # up again for smoe reason else: # if time_step == prep_time: # print(single_obs[7][6]) # print(single_obs[6][7]) # print(single_obs[7][8]) # print(single_obs[8][7]) # Assumes up orientation if (time_step - prep_time) % action_cycle < 16: action = 1 # left elif (time_step - prep_time) % action_cycle < 20: action = 2 elif (time_step - prep_time) % action_cycle < 36: action = 0 # right elif (time_step - prep_time) % action_cycle < 40: action = 3 # down # print(action) actions = [action] action_dict = {} if not hard_coded: actions = [] if train_agents: # for i in range(self.num_agents): # print(i) # action = self.agent_policies[i].act(reshape_obs_for_convfc(obs["agent-{}".format(i)]), print_act=print_act) # actions.append(action) actions = [ self.agent_policies[i].act(reshape_obs_for_convfc( obs["agent-{}".format(i)][0]), print_act=print_act) for i in range(self.num_agents) ] else: # can choose eps=0 or something else after actions = [ self.agent_policies[i].act(reshape_obs_for_convfc( obs["agent-{}".format(i)][0]), print_act=print_act) for i in range(self.num_agents) ] for i in range(self.num_agents): agent_i = "agent-{}".format(i) action_dict[agent_i] = actions[i] # if train_agents: # # print(ray.get(self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i])))) # action_dict[agent_i] = self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i])) # else: # action_dict[agent_i] = self.agent_policies[i].act.remote(reshape_obs_for_convfc(obs[agent_i]), epsilon=0) # # 1, obs[agent_i].shape[2], obs[agent_i].shape[0], obs[agent_i].shape[1] )) # batch size = 1 for 1 obs right now... 
            next_obs, rew, dones, info = self.env.step(action_dict)

            if not hard_coded:
                if train_agents:
                    for i in range(self.num_agents):
                        if (time_step + 1) % train_every == 0:
                            self.process_experiences(0, i, obs, action_dict,
                                                     rew, next_obs, dones,
                                                     train_agents=True)
                        else:
                            self.process_experiences(0, i, obs, action_dict,
                                                     rew, next_obs, dones,
                                                     train_agents=False)

            obs = next_obs
            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' + str(time_step).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[time_step] = rgb_arr.astype(np.uint8)
            # rewards.append(rew)
            observations.append(obs)
            for i in range(self.num_agents):
                agent_i = "agent-{}".format(i)
                rewards[i] += rew[agent_i]
            # observations.append(obs['agent-0'])
            # rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self, horizon=50, path=None, fps=8):
        """
        Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Can be 'pretty' or 'fast'. Implications obvious.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
        if not os.path.exists(path):
            os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        # if render_type == 'pretty':
        #     image_path = os.path.join(path, 'frames/')
        #     if not os.path.exists(image_path):
        #         os.makedirs(image_path)
        #
        #     rewards, observations, full_obs = self.rollout(
        #         horizon=horizon, save_path=image_path, train_agents=False)
        #     utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
        #                                             video_name=video_name)
        #
        #     # Clean up images
        #     shutil.rmtree(image_path)
        # else:
        rewards, observations, full_obs = self.rollout(horizon=horizon,
                                                       train_agents=False,
                                                       print_act=False)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path, fps=fps,
                                               video_name=video_name)
        return rewards

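# Hedged training-loop sketch (not in the original snippet): one way to drive
# the DQN Controller above is to run several training episodes and then render
# an evaluation rollout. Episode count, horizon and train_every are
# illustrative values, not the authors' settings.
if __name__ == '__main__':
    controller = Controller(env_name='harvest', num_agents=1)
    for episode in range(10):
        ep_rewards, _, _ = controller.rollout(horizon=1000, train_every=100,
                                              train_agents=True)
        print('episode', episode, 'reward per agent:', ep_rewards)
    eval_rewards = controller.render_rollout(horizon=500)
    print('eval reward per agent:', eval_rewards)
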
def env_creator(_):
    return CleanupEnv(
        num_agents=num_agents,
        return_agent_actions=True,
        use_collective_reward=args.use_collective_reward,
    )

def __call__(self):
    return CleanupEnv(ascii_map=FIRING_CLEANUP_MAP,
                      num_agents=self._num_agents)

def __call__(self):
    return CleanupEnv()

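# Hedged usage sketch (not in the original snippets): callable factories like
# the two __call__ methods above are instantiated once and then invoked
# wherever a fresh env is needed, e.g. inside a test fixture. The class name
# CleanupEnvFactory and its num_agents argument are hypothetical stand-ins for
# whichever class owns the __call__ above.
factory = CleanupEnvFactory(num_agents=2)
env = factory()
obs = env.reset()
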
class Env(object):

    def __init__(self, config_env):
        self.name = 'ssd'
        self.config = config_env
        self.dim_obs = [self.config.obs_height, self.config.obs_width, 3]
        self.max_steps = self.config.max_steps
        self.cleaning_penalty = self.config.cleaning_penalty

        # Original action space (not necessarily in this order, see
        # the original ssd files):
        # no-op, up, down, left, right, turn-ccw, turn-cw, penalty, clean
        if (self.config.disable_left_right_action and
                self.config.disable_rotation_action):
            self.l_action = 4
            self.cleaning_action_idx = 3
            # up, down, no-op, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 8}
        elif self.config.disable_left_right_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # up, down, no-op, rotate cw, rotate ccw, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8}
        elif self.config.disable_rotation_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # left, right, up, down, no-op, clean
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8}
        else:  # full action space except penalty beam
            self.l_action = 8
            self.cleaning_action_idx = 7
            # Don't allow penalty beam
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8}

        self.obs_cleaned_1hot = self.config.obs_cleaned_1hot
        self.n_agents = self.config.n_agents

        if self.config.map_name == 'cleanup_small_sym':
            ascii_map = maps.CLEANUP_SMALL_SYM
        elif self.config.map_name == 'cleanup_10x10_sym':
            ascii_map = maps.CLEANUP_10x10_SYM
        self.env = CleanupEnv(
            ascii_map=ascii_map, num_agents=self.n_agents, render=False,
            shuffle_spawn=self.config.shuffle_spawn,
            global_ref_point=self.config.global_ref_point,
            view_size=self.config.view_size,
            random_orientation=self.config.random_orientation,
            cleanup_params=self.config.cleanup_params,
            beam_width=self.config.beam_width)

        # length of action input to learned reward function
        if self.config.obs_cleaned_1hot:
            self.l_action_for_r = 2
        else:
            self.l_action_for_r = self.l_action

        self.steps = 0

    def process_obs(self, obs_dict):
        return [obs / 256.0 for obs in list(obs_dict.values())]

    def reset(self):
        """Resets the environment.

        Returns:
            List of agent observations
        """
        obs = self.env.reset()
        self.steps = 0

        return self.process_obs(obs)

    def step(self, actions):
        """Takes a step in env.

        Args:
            actions: list of integers

        Returns:
            List of observations, list of rewards, done, info
        """
        actions = [self.map_to_orig[a] for a in actions]
        actions_dict = {'agent-%d' % idx: actions[idx]
                        for idx in range(self.n_agents)}

        # all objects returned by env.step are dicts
        obs_next, rewards, dones, info = self.env.step(actions_dict)
        self.steps += 1

        obs_next = self.process_obs(obs_next)
        rewards = list(rewards.values())
        if self.cleaning_penalty > 0:
            for idx in range(self.n_agents):
                if actions[idx] == 8:
                    rewards[idx] -= self.cleaning_penalty

        # done = dones['__all__']  # apparently they hardcode done to False
        done = dones['__all__'] or self.steps == self.max_steps

        return obs_next, rewards, done, info

    def render(self):
        self.env.render()

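# Hedged rollout sketch (not in the original snippet): exercising the Env
# wrapper above with random actions. 'config_env' is assumed to be a config
# object exposing every attribute read in __init__ (map_name, n_agents,
# view_size, cleanup_params, and so on).
import numpy as np

env = Env(config_env)
obs = env.reset()
done = False
while not done:
    actions = list(np.random.randint(0, env.l_action, size=env.n_agents))
    obs, rewards, done, info = env.step(actions)
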