def wrap_adv_noise_ball(env_name, our_idx, multi_venv, adv_noise_params, deterministic):
    """Wrap `multi_venv` so that our agent's actions become perturbations inside an
    epsilon-ball around the actions of a fixed base policy."""
    adv_noise_agent_val = adv_noise_params["noise_val"]
    base_policy_path = adv_noise_params["base_path"]
    base_policy_type = adv_noise_params["base_type"]
    base_policy = load_policy(
        policy_path=base_policy_path,
        policy_type=base_policy_type,
        env=multi_venv,
        env_name=env_name,
        index=our_idx,
    )

    base_action_space = multi_venv.action_space.spaces[our_idx]
    # Scale the base action space by noise_val to obtain the epsilon-ball of
    # perturbations our agent is allowed to add to the base policy's actions.
    adv_noise_action_space = Box(
        low=adv_noise_agent_val * base_action_space.low,
        high=adv_noise_agent_val * base_action_space.high,
    )
    multi_venv = MergeAgentVecEnv(
        venv=multi_venv,
        policy=base_policy,
        replace_action_space=adv_noise_action_space,
        merge_agent_idx=our_idx,
        deterministic=deterministic,
    )
    return multi_venv
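# Example (a hypothetical sketch, not taken from any config in this repo): the
# expected shape of adv_noise_params follows from the key accesses above.
#
#   adv_noise_params = {
#       "noise_val": 0.1,    # scale of the epsilon-ball relative to the base action space
#       "base_path": "1",    # forwarded to load_policy as policy_path
#       "base_type": "zoo",  # forwarded to load_policy as policy_type
#   }
#   multi_venv = wrap_adv_noise_ball(
#       env_name, our_idx, multi_venv, adv_noise_params, deterministic=False
#   )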
def create_multi_agent_curried_policy_wrapper(
    mon_dir, env_name, num_envs, embed_index, max_steps, state_shape=None, add_zoo=False, num_zoo=5
):
    def episode_limit(env):
        return time_limit.TimeLimit(env, max_episode_steps=max_steps)

    def env_fn(i):
        return make_env(env_name, seed=42, i=i, out_dir=mon_dir, pre_wrappers=[episode_limit])

    # Bind i at definition time (i=i); a bare `lambda: env_fn(i)` would make every
    # environment use the final loop value of i.
    vec_env = make_dummy_vec_multi_env([lambda i=i: env_fn(i) for i in range(num_envs)])
    zoo = load_policy(
        policy_path="1",
        policy_type="zoo",
        env=vec_env,
        env_name=env_name,
        index=1 - embed_index,
        transparent_params=None,
    )
    half_env = FakeSingleSpacesVec(vec_env, agent_id=embed_index)
    policies = [
        _get_constant_policy(
            half_env, constant_value=half_env.action_space.sample(), state_shape=state_shape
        )
        for _ in range(10)
    ]
    if add_zoo:
        policies += [zoo] * num_zoo
    policy_wrapper = MultiPolicyWrapper(policies=policies, num_envs=num_envs)
    vec_env = CurryVecEnv(
        venv=vec_env, policy=policy_wrapper, agent_idx=embed_index, deterministic=False
    )
    vec_env = FlattenSingletonVecEnv(vec_env)
    yield vec_env, policy_wrapper, zoo
    policy_wrapper.close()
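# Example consumption of the generator above (a sketch; wrapping it with
# contextlib.contextmanager and the chosen env name are assumptions, not part of this module):
#
#   from contextlib import contextmanager
#
#   curried_env = contextmanager(create_multi_agent_curried_policy_wrapper)
#   with curried_env("/tmp/mon", "multicomp/SumoAnts-v0", num_envs=2,
#                    embed_index=0, max_steps=100) as (venv, wrapper, zoo):
#       obs = venv.reset()
#   # On exit the generator resumes past the yield, so policy_wrapper.close() runs.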
def __init__(
    self,
    venv,
    env_name,
    use_debug,
    victim_index,
    victim_path,
    victim_type,
    transparent_params,
    lb_mul,
    lb_num,
    lb_path,
    lb_type,
):
    super().__init__(venv)
    self.lb_num = lb_num
    self.lb_mul = lb_mul
    if transparent_params is None:
        raise ValueError("LookbackRewardVecWrapper assumes transparent policies and venvs.")
    self.transparent_params = transparent_params
    self.victim_index = victim_index
    # Lookback base policy controlling our (non-victim) agent index.
    self._policy = load_policy(
        lb_type,
        lb_path,
        self.venv.unwrapped,
        env_name,
        1 - victim_index,
        transparent_params=None,
    )
    # Per-environment rollout bookkeeping.
    self._action = None
    self._obs = None
    self._state = None
    self._new_lb_state = None
    self._dones = [False] * self.num_envs
    self.ep_lens = np.zeros(self.num_envs).astype(int)
    self.lb_tuples = self._create_lb_tuples(
        env_name, use_debug, victim_index, victim_path, victim_type
    )
    self.use_debug = use_debug
    if self.use_debug:
        # Create a debug file for this venv and also every lookback venv ordinally.
        self.debug_files = [open(f"debug{i}.pkl", "wb") for i in range(self.lb_num + 1)]
        self.get_debug_venv().set_debug_file(self.debug_files[0])
def _create_lb_tuples(self, env_name, use_debug, victim_index, victim_path, victim_type):
    """Create lookback data structures, used to compare our episode rollouts against
    those of an environment where a lookback base policy acted instead.

    Params victim_index, victim_path and victim_type are the same as in
    policy_loader.load_policy.
    :param use_debug: (bool) use DummyVecEnv instead of SubprocVecEnv.
    :return: (list<LookbackTuple>) lb_tuples
    """
    def env_fn(i):
        return make_env(
            env_name, 0, i, out_dir="data/lookbacks/", pre_wrappers=[OldMujocoResettableWrapper]
        )

    lb_tuples = []
    for _ in range(self.lb_num):
        make_vec_env = make_dummy_vec_multi_env if use_debug else make_subproc_vec_multi_env
        # Bind i at definition time (i=i) so each environment gets its own index.
        multi_venv = make_vec_env([lambda i=i: env_fn(i) for i in range(self.num_envs)])
        if use_debug:
            multi_venv = DebugVenv(multi_venv)
        victim = load_policy(
            policy_path=victim_path,
            policy_type=victim_type,
            env=multi_venv,
            env_name=env_name,
            index=victim_index,
            transparent_params=self.transparent_params,
        )
        multi_venv = TransparentCurryVecEnv(
            venv=multi_venv, policy=victim, agent_idx=victim_index, deterministic=True
        )
        single_venv = FlattenSingletonVecEnv(multi_venv)
        data_dict = {"state": None, "action": None, "info": defaultdict(dict)}
        lb_tuples.append(LookbackTuple(venv=single_venv, data=data_dict))
    return lb_tuples
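# LookbackTuple is assumed to be a simple (venv, data) container along these lines
# (a sketch; the actual definition lives elsewhere in the codebase):
#
#   from collections import namedtuple
#   LookbackTuple = namedtuple("LookbackTuple", ["venv", "data"])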
def maybe_embed_agent(
    multi_venv,
    our_idx,
    scheduler,
    log_callbacks,
    env_name,
    embed_types,
    embed_paths,
    embed_index,
    embed_noise,
    embed_noise_params,
    adv_noise_params,
    transparent_params,
    lookback_params,
):
    if len(embed_types) > 0:
        deterministic = lookback_params is not None
        # If we are actually training an epsilon-ball noise agent on top of a zoo agent
        if adv_noise_params["noise_val"] is not None:
            multi_venv = wrap_adv_noise_ball(
                env_name,
                our_idx,
                multi_venv,
                adv_noise_params=adv_noise_params,
                deterministic=deterministic,
            )

        embedded_policies = []
        # If we're loading multiple embedded agents
        for embed_type, embed_path in zip(embed_types, embed_paths):
            embedded_policies.append(
                load_policy(
                    policy_path=embed_path,
                    policy_type=embed_type,
                    env=multi_venv,
                    env_name=env_name,
                    index=embed_index,
                    transparent_params=transparent_params,
                )
            )
        if embed_noise:
            for i in range(len(embedded_policies)):
                embedded = apply_embedded_agent_wrapper(
                    embedded=embedded_policies[i],
                    noise_params=embed_noise_params,
                    scheduler=scheduler,
                )
                log_callbacks.append(LoggerOnlyLogCallback(embedded))
                embedded_policies[i] = embedded

        if len(embedded_policies) > 1:
            embedded_policy = MultiPolicyWrapper(embedded_policies, num_envs=multi_venv.num_envs)
        else:
            embedded_policy = embedded_policies[0]

        # Curry the embedded agent
        cls = TransparentCurryVecEnv if transparent_params is not None else CurryVecEnv
        multi_venv = cls(
            venv=multi_venv,
            policy=embedded_policy,
            agent_idx=embed_index,
            deterministic=deterministic,
        )
    return multi_venv
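# Example call (hypothetical arguments; a sketch of embedding a single fixed zoo
# opponent without noise or lookback, not a configuration used in this repo):
#
#   multi_venv = maybe_embed_agent(
#       multi_venv,
#       our_idx=0,
#       scheduler=None,             # unused when embed_noise=False
#       log_callbacks=[],
#       env_name="multicomp/SumoHumans-v0",
#       embed_types=["zoo"],
#       embed_paths=["1"],
#       embed_index=1,
#       embed_noise=False,
#       embed_noise_params=None,
#       adv_noise_params={"noise_val": None},  # skip the epsilon-ball wrapper
#       transparent_params=None,
#       lookback_params=None,       # so the embedded policy acts stochastically
#   )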
def score_agent(
    _run,
    _seed,
    env_name,
    agent_a_path,
    agent_b_path,
    agent_a_type,
    agent_b_type,
    record_traj,
    record_traj_params,
    transparent_params,
    num_env,
    videos,
    video_params,
    mask_agent_index,
    noisy_agent_index,
    noisy_agent_magnitude,
    mask_agent_noise,
):
    save_dir = video_params["save_dir"]
    if videos:
        if save_dir is None:
            score_ex_logger.info(
                "No directory provided for saving videos; using a tmpdir instead,"
                " but videos will be saved to Sacred run directory"
            )
            tmp_dir = tempfile.TemporaryDirectory(prefix="score-videos")
            save_dir = tmp_dir.name
        else:
            tmp_dir = None
        video_dirs = [osp.join(save_dir, str(i)) for i in range(num_env)]

    agent_wrappers = {}
    if mask_agent_index is not None:
        mask_agent_kwargs = {}
        if mask_agent_noise is not None:
            mask_agent_kwargs["noise_magnitude"] = mask_agent_noise
        agent_wrappers = make_mask_agent_wrappers(env_name, mask_agent_index, **mask_agent_kwargs)

    video_params = utils.sacred_copy(video_params)  # Sacred issue #499

    def env_fn(i):
        env = make_env(env_name, _seed, i, None, agent_wrappers=agent_wrappers)
        if videos:
            if video_params["annotated"]:
                if "multicomp" in env_name:
                    assert num_env == 1, "pretty videos requires num_env=1"
                    env = AnnotatedGymCompete(
                        env,
                        env_name,
                        agent_a_type,
                        agent_a_path,
                        agent_b_type,
                        agent_b_path,
                        mask_agent_index,
                        **video_params["annotation_params"],
                    )
                else:
                    warnings.warn(f"Annotated videos not supported for environment '{env_name}'")
            env = VideoWrapper(env, video_dirs[i], video_params["single_file"])
        return env

    env_fns = [functools.partial(env_fn, i) for i in range(num_env)]
    if num_env > 1:
        venv = make_subproc_vec_multi_env(env_fns)
    else:
        venv = make_dummy_vec_multi_env(env_fns)

    if record_traj:
        venv = TrajectoryRecorder(venv, record_traj_params["agent_indices"])

    if venv.num_agents == 1 and agent_b_path != "none":
        raise ValueError("Set agent_b_path to 'none' if environment only uses one agent.")

    agent_paths = [agent_a_path, agent_b_path]
    agent_types = [agent_a_type, agent_b_type]
    zipped = list(zip(agent_types, agent_paths))
    agents = [
        load_policy(policy_type, policy_path, venv, env_name, i, transparent_params)
        for i, (policy_type, policy_path) in enumerate(zipped[: venv.num_agents])
    ]

    if noisy_agent_index is not None:
        agents[noisy_agent_index] = NoisyAgentWrapper(
            agents[noisy_agent_index], noise_annealer=lambda: noisy_agent_magnitude
        )

    score = get_empirical_score(venv, agents)

    for agent in agents:
        if agent.sess is not None:
            agent.sess.close()

    if record_traj:
        save_paths = venv.save(save_dir=record_traj_params["save_dir"])
        for save_path in save_paths:
            score_ex.add_artifact(save_path, name="victim_activations.npz")

    venv.close()

    if videos:
        for env_video_dir in video_dirs:
            added = False
            for file_path in os.listdir(env_video_dir):
                added |= _save_video_or_metadata(env_video_dir, file_path)
            if not added:
                raise FileNotFoundError(f"No video artifacts found in path {env_video_dir}.")

        if tmp_dir is not None:
            tmp_dir.cleanup()

        for observer in score_ex.observers:
            if hasattr(observer, "dir"):
                _clean_video_directory_structure(observer)

    return score