class Multi2SingleEnv(Wrapper):
    def __init__(self, env, env_name, agent, agent_idx, shaping_params,
                 scheduler, total_step, norm=True, retrain_victim=False,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8,
                 mix_agent=False, mix_ratio=0.5, _agent=None):
        """Convert a two-agent environment into a single-agent environment.

        :param: env: two-agent environment.
        :param: env_name: environment name.
        :param: agent: victim agent.
        :param: agent_idx: victim agent index.
        :param: shaping_params: reward-shaping parameters.
        :param: scheduler: annealing scheduler.
        :param: total_step: total number of training steps.
        :param: norm: whether to normalize the victim agent's observations and returns.
        :param: retrain_victim: whether to retrain the victim agent.
        :param: clip_obs: observation clip value.
        :param: clip_reward: reward clip value.
        :param: gamma: discount factor.
        :param: epsilon: additive coefficient to avoid division by zero.
        :param: mix_agent: whether to mix the adversarial agent with a normal agent.
        :param: mix_ratio: sampling ratio of the adversarial agent.
        :param: _agent: normal (non-adversarial) agent used when mixing.
        """
        Wrapper.__init__(self, env)
        self.env_name = env_name
        self.agent = agent
        self.reward = 0
        # observation dimensionality
        self.observation_space = env.observation_space.spaces[0]
        # action dimensionality
        self.action_space = env.action_space.spaces[0]
        self.total_step = total_step
        # normalize the victim agent's observations and returns
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.obs_rms_next = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.ret_abs_rms = RunningMeanStd(shape=())
        self.done = False
        self.mix_agent = mix_agent
        self.mix_ratio = mix_ratio
        self._agent = _agent
        # determine which policy acts: normal or adversarial
        self.is_advagent = True
        # time step count
        self.cnt = 0
        self.agent_idx = agent_idx
        self.norm = norm
        self.retrain_victim = retrain_victim
        self.shaping_params = shaping_params
        self.scheduler = scheduler
        # normalization hyperparameters
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_agents = 2
        self.outcomes = []
        # return - total discounted reward.
        self.ret = np.zeros(1)
        self.ret_abs = np.zeros(1)

    def step(self, action):
        """Get the reward, observation, and information at each step.

        :param: action: action of the adversarial agent at this time step.
        :return: ob: adversarial agent observation of the next step.
        :return: reward: adversarial agent reward of the next step.
        :return: done: flag indicating whether the episode has finished.
        :return: info: adversarial agent winning information.
        """
        self.cnt += 1
        self.oppo_ob = self.ob.copy()
        self.obs_rms.update(self.oppo_ob)
        self.oppo_ob = np.clip(
            (self.oppo_ob - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
            -self.clip_obs, self.clip_obs)

        if self.retrain_victim:
            if not self.agent.adv_loadnorm:
                self_action = self.agent.act(observation=self.oppo_ob[None, :],
                                             reward=self.reward,
                                             done=self.done).flatten()
            else:
                self_action = self.agent.act(observation=self.ob[None, :],
                                             reward=self.reward,
                                             done=self.done).flatten()
        # mix agent
        elif self.mix_agent and not self.is_advagent:
            self_action = self._agent.act(observation=self.ob,
                                          reward=self.reward,
                                          done=self.done)
        else:
            self_action = self.agent.act(observation=self.ob,
                                         reward=self.reward,
                                         done=self.done)
        # note: current observation
        self.action = self_action
        # combine agents' actions
        if self.agent_idx == 0:
            actions = (self_action, action)
        else:
            actions = (action, self_action)
        # obtain needed information from the environment.
        obs, rewards, dones, infos = self.env.step(actions)

        if dones[0] and 'Ant' in self.env_name:
            if infos[0]['reward_remaining'] == 0:
                infos[0]['reward_remaining'] = -1000
            if infos[1]['reward_remaining'] == 0:
                infos[1]['reward_remaining'] = -1000

        # separate victim and adversarial information.
        if self.agent_idx == 0:
            # victim is agent 0; adversary is agent 1
            self.ob, ob = obs
            self.reward, reward = rewards
            self.done, done = dones
            self.info, info = infos
        else:
            # victim is agent 1; adversary is agent 0
            ob, self.ob = obs
            reward, self.reward = rewards
            done, self.done = dones
            info, self.info = infos
        done = func(done)

        self.oppo_ob_next = self.ob.copy()
        self.obs_rms_next.update(self.oppo_ob_next)
        self.oppo_ob_next = np.clip(
            (self.oppo_ob_next - self.obs_rms_next.mean) / np.sqrt(self.obs_rms_next.var + self.epsilon),
            -self.clip_obs, self.clip_obs)

        # Save and normalize the victim observation and return.
        # self.oppo_reward = self.reward
        # self.oppo_reward = -1.0 * self.info['reward_remaining'] * 0.01
        # self.abs_reward = info['reward_remaining'] * 0.01 - self.info['reward_remaining'] * 0.01
        frac_remaining = max(1 - self.cnt / self.total_step, 0)
        self.oppo_reward = apply_reward_shapping(self.info, self.shaping_params,
                                                 self.scheduler, frac_remaining)
        self.abs_reward = apply_reward_shapping(info, self.shaping_params,
                                                self.scheduler, frac_remaining)
        self.abs_reward = self.abs_reward - self.oppo_reward

        if self.norm:
            self.ret = self.ret * self.gamma + self.oppo_reward
            self.ret_abs = self.ret_abs * self.gamma + self.abs_reward
            self.oppo_reward, self.abs_reward = self._normalize_(
                self.ret, self.ret_abs, self.oppo_reward, self.abs_reward)
            if self.done:
                self.ret[0] = 0
                self.ret_abs[0] = 0

        if done:
            if 'winner' in self.info:
                # the opponent (the agent that is not being trained) wins.
                info['loser'] = True
            if self.is_advagent and self.retrain_victim:
                # mark trajectories produced by the adversarial agent.
                info['adv_agent'] = True
        return ob, reward, done, info

    def _normalize_(self, ret, ret_abs, reward, abs_reward):
        """Normalize and clip the shaped rewards with running return statistics.

        :param: ret: discounted return of the shaped (opponent) reward.
        :param: ret_abs: discounted return of the absolute reward.
        :param: reward: shaped reward.
        :param: abs_reward: absolute reward.
        :return: reward: normalized and clipped shaped reward.
        :return: abs_reward: normalized and clipped absolute reward.
        """
        self.ret_rms.update(ret)
        reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                         -self.clip_reward, self.clip_reward)
        # update the absolute-return statistics
        self.ret_abs_rms.update(ret_abs)
        abs_reward = np.clip(
            abs_reward / np.sqrt(self.ret_abs_rms.var + self.epsilon),
            -self.clip_reward, self.clip_reward)
        return reward, abs_reward

    def reset(self):
        """Reset the environment and both agents.

        :return: ob: initial observation of the adversarial agent.
        """
        self.cnt = 0
        self.reward = 0
        self.done = False
        self.ret = np.zeros(1)
        self.ret_abs = np.zeros(1)
        # reset the agent (clears the recurrent states h and c)
        self.agent.reset()
        if self._agent is not None:
            self._agent.reset()
        # sample which policy plays this episode according to mix_ratio
        # (mix_ratio is the ratio of adv_agent to norm_agent episodes)
        if self.mix_ratio == 0.5:
            self.is_advagent = not self.is_advagent
        else:
            self.is_advagent = random.uniform(0, 1) < self.mix_ratio
        if self.agent_idx == 1:
            ob, self.ob = self.env.reset()
        else:
            self.ob, ob = self.env.reset()
        return ob
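# --- usage sketch -----------------------------------------------------------
# A minimal sketch (not part of the training pipeline) of wrapping a
# two-agent environment so the adversary can be trained with a standard
# single-agent API. `env`, `victim_agent`, `shaping_params`, and `scheduler`
# are placeholders for objects created elsewhere in this project; only
# Multi2SingleEnv itself is taken from this file.
def _example_multi2single_usage(env, env_name, victim_agent,
                                shaping_params, scheduler):
    # agent_idx=0 makes agent 0 the victim; the adversary controls agent 1.
    single_env = Multi2SingleEnv(env, env_name, victim_agent, agent_idx=0,
                                 shaping_params=shaping_params,
                                 scheduler=scheduler, total_step=int(1e7))
    ob = single_env.reset()
    # the adversary acts; the victim's action is computed inside step().
    adv_action = single_env.action_space.sample()
    ob, reward, done, info = single_env.step(adv_action)
    return ob, reward, done, info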
class VecNormalize(VecEnvWrapper):
    """
    A moving-average, normalizing wrapper for a vectorized environment.
    It supports saving and loading the moving averages.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether or not to update the moving averages
    :param norm_obs: (bool) Whether to normalize observations or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observations
    :param clip_reward: (float) Max absolute value for discounted rewards
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """

    def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8,
                 mean_mask=None):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape,
                                      mean_mask=mean_mask)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])

    def step_wait(self):
        """
        Apply a sequence of actions to the sequence of environments:
        actions -> (observations, rewards, news),
        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        self.old_obs = obs
        obs = self._normalize_observation(obs)
        if self.norm_reward:
            if self.training:
                self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.clip_reward, self.clip_reward)
        self.ret[news] = 0
        return obs, rews, news, infos

    def _normalize_observation(self, obs):
        """
        :param obs: (numpy tensor) raw observations
        :return: normalized and clipped observations
        """
        if self.norm_obs:
            if self.training:
                self.obs_rms.update(obs)
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def get_original_obs(self):
        """
        Returns the unnormalized observation.

        :return: (numpy float)
        """
        return self.old_obs

    def reset(self, indices=None, *args, **kwargs):
        """
        Reset all environments.
        """
        obs = self.venv.reset(indices, *args, **kwargs)
        if len(np.array(obs).shape) == 1:  # for when num_cpu is 1
            self.old_obs = [obs]
        else:
            self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        return self._normalize_observation(obs)

    def save_running_average(self, path, suffix=None):
        """
        :param path: (str) path to log dir
        :param suffix: (str) suffix appended to the file names
        """
        file_names = ['obs_rms', 'ret_rms']
        if suffix is not None:
            file_names = [f + suffix for f in file_names]
        for rms, name in zip([self.obs_rms, self.ret_rms], file_names):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path, suffix=None):
        """
        :param path: (str) path to log dir
        :param suffix: (str) suffix appended to the file names
        """
        for name in ['obs_rms', 'ret_rms']:
            open_name = name
            if suffix is not None:
                open_name += suffix
            with open("{}/{}.pkl".format(path, open_name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
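# --- usage sketch -----------------------------------------------------------
# A minimal sketch of persisting and restoring the normalization statistics of
# VecNormalize between training and evaluation. `venv` and `log_dir` are
# placeholders; the '_final' suffix is arbitrary.
def _example_vecnormalize_checkpoint(venv, log_dir):
    train_env = VecNormalize(venv, training=True, norm_obs=True, norm_reward=True)
    train_env.reset()
    # ... training rollouts update obs_rms/ret_rms on every step_wait() ...
    # persist the RunningMeanStd objects as <log_dir>/obs_rms_final.pkl etc.
    train_env.save_running_average(log_dir, suffix='_final')
    # at evaluation time, reuse the statistics without updating them further.
    eval_env = VecNormalize(venv, training=False)
    eval_env.load_running_average(log_dir, suffix='_final')
    return eval_env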
class RossettaVecNormalize(VecEnvWrapper):
    """
    A moving-average, normalizing wrapper for a vectorized environment.
    It is pickleable and saves the moving averages and configuration
    parameters. The wrapped environment `venv` is not saved, and must be
    restored manually with `set_venv` after being unpickled.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether or not to update the moving averages
    :param norm_obs: (bool) Whether to normalize observations or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observations
    :param clip_reward: (float) Max absolute value for discounted rewards
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """

    def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = None
        self.old_rews = None

    def __getstate__(self):
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # these attributes are not pickleable
        del state['venv']
        del state['class_attributes']
        # these attributes depend on the above and so we would prefer not to pickle
        del state['ret']
        return state

    def __setstate__(self, state):
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state: (dict)
        """
        self.__dict__.update(state)
        assert 'venv' not in state
        self.venv = None

    def set_venv(self, venv):
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_envs`.

        :param venv: (VecEnv)
        """
        if self.venv is not None:
            raise ValueError(
                "Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)
        if self.obs_rms.mean.shape != self.observation_space.shape:
            raise ValueError("venv is incompatible with current statistics.")
        self.ret = np.zeros(self.num_envs)

    def step_wait(self):
        """
        Apply a sequence of actions to the sequence of environments:
        actions -> (observations, rewards, news),
        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_rews = rews
        # if self.training:
        #     self.obs_rms.update(obs)
        # obs = self.normalize_obs(obs)
        if self.training:
            self._update_reward(rews)
        rews = self.normalize_reward(rews)
        self.ret[news] = 0
        return obs, rews, news, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.ret = self.ret * self.gamma + reward
        self.ret_rms.update(self.ret)

    def normalize_obs(self, obs: np.ndarray) -> np.ndarray:
        """
        Normalize observations using this VecNormalize's observation statistics.

        Calling this method does not update statistics.
        """
        if self.norm_obs:
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's reward statistics.

        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                             -self.clip_reward, self.clip_reward)
        return reward

    def get_original_obs(self) -> np.ndarray:
        """
        Returns an unnormalized version of the observations from the most
        recent step or reset.
        """
        return self.old_obs.copy()

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_rews.copy()

    def reset(self):
        """
        Reset all environments.
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        if self.training:
            self._update_reward(self.ret)
        # return self.normalize_obs(obs)
        return obs

    @staticmethod
    def load(load_path, venv):
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return: (VecNormalize)
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)

    def save_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn(
            "Usage of `save_running_average` is deprecated. Please "
            "use `save` or pickle instead.", DeprecationWarning)
        for rms, name in zip([self.obs_rms, self.ret_rms],
                             ['obs_rms', 'ret_rms']):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn(
            "Usage of `load_running_average` is deprecated. Please "
            "use `load` or pickle instead.", DeprecationWarning)
        for name in ['obs_rms', 'ret_rms']:
            with open("{}/{}.pkl".format(path, name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
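# --- usage sketch -----------------------------------------------------------
# A minimal sketch of the pickle round trip supported by RossettaVecNormalize:
# the wrapper is saved without its venv and re-attached after loading.
# `venv` and the save path are placeholders.
def _example_rossetta_roundtrip(venv, save_path="vec_normalize.pkl"):
    norm_env = RossettaVecNormalize(venv, training=True)
    # ... training rollouts update the reward statistics via step_wait() ...
    norm_env.save(save_path)  # pickles statistics and config, not the venv
    # load() unpickles the wrapper and calls set_venv() to re-attach the venv.
    restored = RossettaVecNormalize.load(save_path, venv)
    return restored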
class bVecNormalize(VecEnv):
    def __init__(self, venv, ob=True, st=True, ret=True, clipob=10.,
                 clipst=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnv.__init__(self,
                        observation_space=venv.observation_space,
                        state_space=venv.state_space,
                        action_space=venv.action_space)
        print('bullet vec normalize initialization.')
        self.venv = venv
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.st_rms = RunningMeanStd(
            shape=self.state_space.shape) if st else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.clipst = clipst
        self.cliprew = cliprew
        self.ret = np.zeros(1)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action, z, skel):
        return self.step_norm(action, z, skel)

    def step_norm(self, action, z, skel):
        """
        Apply a sequence of actions to the sequence of environments:
        actions -> (observations, rewards, news),
        where 'news' is a boolean vector indicating whether each element is new.
        """
        # calls the step() defined in each robot
        obs, state, rews, done, infos = self.venv.step(action, z, skel)
        true_rews = copy.deepcopy(rews)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        state = self._stfilt(state)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, state, rews, done, infos, true_rews

    def step_broadcast(self, action):
        # calls the step() defined in each robot
        res, obs, state, rews, done, infos = self.venv.step_broadcast(action)
        true_rews = copy.deepcopy(rews)
        for a in range(self.venv.num_agent):
            self.ret = self.ret * self.gamma + rews[a]
            obs[a] = self._obfilt(obs[a])
            state[a] = self._stfilt(state[a])
            if self.ret_rms:
                self.ret_rms.update(self.ret)
                rews[a] = np.clip(
                    rews[a] / np.sqrt(self.ret_rms.var + self.epsilon),
                    -self.cliprew, self.cliprew)
        return res, obs, state, rews, done, infos, true_rews

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
        return obs

    def _stfilt(self, state):
        if self.st_rms:
            self.st_rms.update(state)
            state = np.clip((state - self.st_rms.mean) / np.sqrt(self.st_rms.var + self.epsilon),
                            -self.clipst, self.clipst)
        return state

    def reset(self, z, skel):
        obs, state = self.venv.reset(z, skel)
        return self._obfilt(obs), self._stfilt(state)

    def reset_broadcast(self):
        obs, state = self.venv.reset_broadcast()
        for i in range(self.venv.num_agent):
            obs[i] = self._obfilt(obs[i])
            state[i] = self._stfilt(state[i])
        return obs, state

    def get_vrep_scene_path(self):
        return self.venv.get_vrep_scene_path()

    def initialize_robot(self, clientID):
        self.venv.initialize_robot(clientID)
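# --- usage sketch -----------------------------------------------------------
# A minimal sketch of the running-mean/variance normalization applied by
# _obfilt and _stfilt, shown with a stand-alone RunningMeanStd. It assumes the
# baselines-style interface (update(batch), .mean, .var); the array values
# below are made up for illustration.
def _example_running_norm(clipob=10., epsilon=1e-8):
    rms = RunningMeanStd(shape=(3,))
    obs_batch = np.array([[1.0, 2.0, 3.0],
                          [2.0, 4.0, 6.0]])
    rms.update(obs_batch)  # update running statistics from the batch
    # standardize and clip with the same formula used by _obfilt / _stfilt
    return np.clip((obs_batch - rms.mean) / np.sqrt(rms.var + epsilon),
                   -clipob, clipob)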