Example #1
class Multi2SingleEnv(Wrapper):
    def __init__(self,
                 env,
                 env_name,
                 agent,
                 agent_idx,
                 shaping_params,
                 scheduler,
                 total_step,
                 norm=True,
                 retrain_victim=False,
                 clip_obs=10.,
                 clip_reward=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 mix_agent=False,
                 mix_ratio=0.5,
                 _agent=None):
        """ from multi-agent environment to single-agent environment.
        :param: env: two-agent environment.
        :param: agent: victim agent.
        :param: agent_idx: victim agent index.
        :param: shaping_params: shaping parameters.
        :param: scheduler: anneal scheduler.
        :param: norm: normalize agent or not.
        :param: retrain_victim: retrain victim agent or not.
        :param: clip_obs: observation clip value.
        :param: clip_rewards: reward clip value.
        :param: gamma: discount factor.
        :param: epsilon: additive coefficient.
        """
        Wrapper.__init__(self, env)
        self.env_name = env_name
        self.agent = agent
        self.reward = 0
        # observation dimensionality
        self.observation_space = env.observation_space.spaces[0]
        # action dimensionality
        self.action_space = env.action_space.spaces[0]
        self.total_step = total_step

        # normalize the victim agent's obs and rets
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.obs_rms_next = RunningMeanStd(shape=self.observation_space.shape)

        self.ret_rms = RunningMeanStd(shape=())
        self.ret_abs_rms = RunningMeanStd(shape=())

        self.done = False
        self.mix_agent = mix_agent
        self.mix_ratio = mix_ratio

        self._agent = _agent
        # flag for which policy plays the opponent slot: adversarial (True) or normal (False)
        self.is_advagent = True

        # time step count
        self.cnt = 0
        self.agent_idx = agent_idx
        self.norm = norm
        self.retrain_victim = retrain_victim

        self.shaping_params = shaping_params
        self.scheduler = scheduler

        # set normalize hyper
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward

        self.gamma = gamma
        self.epsilon = epsilon

        self.num_agents = 2
        self.outcomes = []

        # return - total discounted reward.
        self.ret = np.zeros(1)
        self.ret_abs = np.zeros(1)

    def step(self, action):
        """get the reward, observation, and information at each step.
        :param: action: action of adversarial agent at this time.
        :return: obs: adversarial agent observation of the next step.
        :return: rew: adversarial agent reward of the next step.
        :return: dones: adversarial agent flag of whether the game finished or not.
        :return: infos: adversarial agent winning information.
        """

        self.cnt += 1
        self.oppo_ob = self.ob.copy()
        self.obs_rms.update(self.oppo_ob)
        self.oppo_ob = np.clip((self.oppo_ob - self.obs_rms.mean) /
                               np.sqrt(self.obs_rms.var + self.epsilon),
                               -self.clip_obs, self.clip_obs)
        if self.retrain_victim:
            if not self.agent.adv_loadnorm:
                self_action = self.agent.act(observation=self.oppo_ob[None, :],
                                             reward=self.reward,
                                             done=self.done).flatten()
            else:
                self_action = self.agent.act(observation=self.ob[None, :],
                                             reward=self.reward,
                                             done=self.done).flatten()
            # mix agent
            if self.mix_agent and not self.is_advagent:
                self_action = self._agent.act(observation=self.ob,
                                              reward=self.reward,
                                              done=self.done)
        else:
            self_action = self.agent.act(observation=self.ob,
                                         reward=self.reward,
                                         done=self.done)
        # record the victim's action at the current step
        self.action = self_action

        # combine agents' actions
        if self.agent_idx == 0:
            actions = (self_action, action)
        else:
            actions = (action, self_action)

        # obtain needed information from the environment.
        obs, rewards, dones, infos = self.env.step(actions)

        if dones[0] and 'Ant' in self.env_name:
            if infos[0]['reward_remaining'] == 0:
                infos[0]['reward_remaining'] = -1000
            if infos[1]['reward_remaining'] == 0:
                infos[1]['reward_remaining'] = -1000

        # separate victim and adversarial information.
        if self.agent_idx == 0:  # vic is 0; adv is 1
            self.ob, ob = obs
            self.reward, reward = rewards
            self.done, done = dones
            self.info, info = infos
        else:  # vic is 1; adv is 0
            ob, self.ob = obs
            reward, self.reward = rewards
            done, self.done = dones
            info, self.info = infos
        done = func(done)  # 'func' is a helper defined elsewhere in the module that post-processes the done flag

        self.oppo_ob_next = self.ob.copy()
        self.obs_rms_next.update(self.oppo_ob_next)
        self.oppo_ob_next = np.clip(
            (self.oppo_ob_next - self.obs_rms_next.mean) /
            np.sqrt(self.obs_rms_next.var + self.epsilon), -self.clip_obs,
            self.clip_obs)

        # Save and normalize the victim observation and return.
        # self.oppo_reward = self.reward
        # self.oppo_reward = -1.0 * self.info['reward_remaining'] * 0.01
        # self.abs_reward =  info['reward_remaining'] * 0.01 - self.info['reward_remaining'] * 0.01

        frac_remaining = max(1 - self.cnt / self.total_step, 0)

        self.oppo_reward = apply_reward_shapping(self.info,
                                                 self.shaping_params,
                                                 self.scheduler,
                                                 frac_remaining)
        self.abs_reward = apply_reward_shapping(info, self.shaping_params,
                                                self.scheduler, frac_remaining)
        self.abs_reward = self.abs_reward - self.oppo_reward

        if self.norm:
            self.ret = self.ret * self.gamma + self.oppo_reward
            self.ret_abs = self.ret_abs * self.gamma + self.abs_reward
            self.oppo_reward, self.abs_reward = self._normalize_(
                self.ret, self.ret_abs, self.oppo_reward, self.abs_reward)
            if self.done:
                self.ret[0] = 0
                self.ret_abs[0] = 0

        if done:
            if 'winner' in self.info:  # the opponent (the agent that is not being trained) won.
                info['loser'] = True
            if self.is_advagent and self.retrain_victim:  # mark trajectories played by the adversarial agent
                info['adv_agent'] = True
        return ob, reward, done, info

    def _normalize_(self, ret, ret_abs, reward, abs_reward):
        """
        :param: obs: observation.
        :param: ret: return.
        :param: reward: reward.
        :return: obs: normalized and cliped observation.
        :return: reward: normalized and cliped reward.
        """
        self.ret_rms.update(ret)
        reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                         -self.clip_reward, self.clip_reward)
        # update the ret_abs
        self.ret_abs_rms.update(ret_abs)
        abs_reward = np.clip(
            abs_reward / np.sqrt(self.ret_abs_rms.var + self.epsilon),
            -self.clip_reward, self.clip_reward)

        return reward, abs_reward

    def reset(self):
        """reset everything.
        :return: ob: reset observation.
        """
        self.cnt = 0
        self.reward = 0
        self.done = False
        self.ret = np.zeros(1)
        self.ret_abs = np.zeros(1)
        # reset the agent
        # reset the h and c
        self.agent.reset()
        if self._agent is not None:
            self._agent.reset()

        # sample which policy plays the opponent slot this episode;
        # mix_ratio is the fraction of episodes played by the adversarial agent
        if self.mix_ratio == 0.5:
            self.is_advagent = not self.is_advagent
        else:
            self.is_advagent = random.uniform(0, 1) < self.mix_ratio

        if self.agent_idx == 1:
            ob, self.ob = self.env.reset()
        else:
            self.ob, ob = self.env.reset()
        return ob
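
A minimal usage sketch for this wrapper follows. The environment factory, the victim loader, and the shaping configuration live elsewhere in the project, so make_two_agent_env, load_victim_agent, SHAPING_PARAMS, and annealer below are hypothetical placeholders (and the environment id is only illustrative), not the project's real API.

# hedged sketch: factory/loader names and the env id are assumptions
env = make_two_agent_env('multicomp/SumoAnts-v0')   # two-agent gym env
victim = load_victim_agent(env, idx=0)              # pretrained victim policy

single_env = Multi2SingleEnv(env,
                             env_name='multicomp/SumoAnts-v0',
                             agent=victim,
                             agent_idx=0,
                             shaping_params=SHAPING_PARAMS,
                             scheduler=annealer,
                             total_step=20000000)

ob = single_env.reset()
done = False
while not done:
    # the adversarial policy (not shown) would choose this action
    action = single_env.action_space.sample()
    ob, reward, done, info = single_env.step(action)
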
Example #2
class VecNormalize(VecEnvWrapper):
    """
    A moving-average, normalizing wrapper for a vectorized environment.
    It supports saving and loading the running averages.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether or not to update the moving averages
    :param norm_obs: (bool) Whether to normalize observation or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observation
    :param clip_reward: (float) Max absolute value for discounted reward
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """
    def __init__(self,
                 venv,
                 training=True,
                 norm_obs=True,
                 norm_reward=True,
                 clip_obs=10.,
                 clip_reward=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 mean_mask=None):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape,
                                      mean_mask=mean_mask)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        self.old_obs = obs
        obs = self._normalize_observation(obs)
        if self.norm_reward:
            if self.training:
                self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.clip_reward, self.clip_reward)
        self.ret[news] = 0
        return obs, rews, news, infos

    def _normalize_observation(self, obs):
        """
        :param obs: (numpy tensor) observation to normalize
        :return: (numpy tensor) the observation, normalized and clipped if norm_obs is enabled
        """
        if self.norm_obs:
            if self.training:
                self.obs_rms.update(obs)
            obs = np.clip((obs - self.obs_rms.mean) /
                          np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def get_original_obs(self):
        """
        returns the unnormalized observation

        :return: (numpy float)
        """
        return self.old_obs

    def reset(self, indices=None, *args, **kwargs):
        """
        Reset all environments
        """
        obs = self.venv.reset(indices, *args, **kwargs)
        if len(np.array(obs).shape) == 1:  # for when num_cpu is 1
            self.old_obs = [obs]
        else:
            self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        return self._normalize_observation(obs)

    def save_running_average(self, path, suffix=None):
        """
        :param path: (str) path to log dir
        :param suffix: (str) suffix to file
        """
        file_names = ['obs_rms', 'ret_rms']
        if suffix is not None:
            file_names = [f + suffix for f in file_names]
        for rms, name in zip([self.obs_rms, self.ret_rms], file_names):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path, suffix=None):
        """
        :param path: (str) path to log dir
        :param suffix: (str) suffix to file
        """

        file_names = ['obs_rms', 'ret_rms']
        for name in file_names:
            open_name = name
            if suffix is not None:
                open_name += suffix
            with open("{}/{}.pkl".format(path, open_name),
                      'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
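
Both the observation and reward paths in this wrapper apply the same transform, clip((x - mean) / sqrt(var + epsilon), -clip, clip), with rewards scaled by the running variance of the discounted return rather than of the raw reward. A short usage sketch, assuming a hypothetical make_vec_env() factory for the underlying vectorized environment; note that this variant's reset() forwards an indices argument, so the wrapped env must accept one.

venv = make_vec_env()                    # hypothetical VecEnv factory
norm_venv = VecNormalize(venv, norm_obs=True, norm_reward=True)

obs = norm_venv.reset()
for _ in range(1000):
    # step() comes from the VecEnv base class and dispatches to step_wait()
    actions = [norm_venv.action_space.sample() for _ in range(norm_venv.num_envs)]
    obs, rews, dones, infos = norm_venv.step(actions)

# persist the running statistics and restore them in a later run
norm_venv.save_running_average('/tmp/vecnorm_logs', suffix='_run1')
norm_venv.load_running_average('/tmp/vecnorm_logs', suffix='_run1')
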
Example #3
class RossettaVecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.

    It is pickleable; pickling saves the moving averages and configuration parameters.
    The wrapped environment `venv` is not saved, and must be restored manually with
    `set_venv` after being unpickled.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether or not to update the moving averages
    :param norm_obs: (bool) Whether to normalize observation or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observation
    :param clip_reward: (float) Max absolute value for discounted reward
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """
    def __init__(self,
                 venv,
                 training=True,
                 norm_obs=True,
                 norm_reward=True,
                 clip_obs=10.,
                 clip_reward=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = None
        self.old_rews = None

    def __getstate__(self):
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnvs may not be pickleable."""
        state = self.__dict__.copy()
        # these attributes are not pickleable
        del state['venv']
        del state['class_attributes']
        # these attributes depend on the above and so we would prefer not to pickle
        del state['ret']
        return state

    def __setstate__(self, state):
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state: (dict)"""
        self.__dict__.update(state)
        assert 'venv' not in state
        self.venv = None

    def set_venv(self, venv):
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_envs`.

        :param venv: (VecEnv)
        """
        if self.venv is not None:
            raise ValueError(
                "Trying to set venv of already initialized VecNormalize wrapper."
            )
        VecEnvWrapper.__init__(self, venv)
        if self.obs_rms.mean.shape != self.observation_space.shape:
            raise ValueError("venv is incompatible with current statistics.")
        self.ret = np.zeros(self.num_envs)

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_rews = rews

        # if self.training:
        #     self.obs_rms.update(obs)
        # obs = self.normalize_obs(obs)

        if self.training:
            self._update_reward(rews)
        rews = self.normalize_reward(rews)

        self.ret[news] = 0
        return obs, rews, news, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.ret = self.ret * self.gamma + reward
        self.ret_rms.update(self.ret)

    def normalize_obs(self, obs: np.ndarray) -> np.ndarray:
        """
        Normalize observations using this VecNormalize's observations statistics.
        Calling this method does not update statistics.
        """
        if self.norm_obs:
            obs = np.clip((obs - self.obs_rms.mean) /
                          np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's rewards statistics.
        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                             -self.clip_reward, self.clip_reward)
        return reward

    def get_original_obs(self) -> np.ndarray:
        """
        Returns an unnormalized version of the observations from the most recent
        step or reset.
        """
        return self.old_obs.copy()

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_rews.copy()

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        if self.training:
            self._update_reward(self.ret)
        # return self.normalize_obs(obs)
        return obs

    @staticmethod
    def load(load_path, venv):
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return: (VecNormalize)
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)

    def save_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn(
            "Usage of `save_running_average` is deprecated. Please "
            "use `save` or pickle instead.", DeprecationWarning)
        for rms, name in zip([self.obs_rms, self.ret_rms],
                             ['obs_rms', 'ret_rms']):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn(
            "Usage of `load_running_average` is deprecated. Please "
            "use `load` or pickle instead.", DeprecationWarning)
        for name in ['obs_rms', 'ret_rms']:
            with open("{}/{}.pkl".format(path, name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
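
Because __getstate__ drops venv (and the derived ret buffer), restoring a pickled RossettaVecNormalize is a two-step process: unpickle, then re-attach an environment via set_venv, which is what the load() helper does. A minimal round-trip sketch, again using a hypothetical make_vec_env() factory:

venv = make_vec_env()                                  # hypothetical factory
norm_venv = RossettaVecNormalize(venv, norm_obs=True, norm_reward=True)

# ... training updates obs_rms / ret_rms here ...

norm_venv.save('/tmp/vec_normalize.pkl')

# later, possibly in a fresh process: load the statistics and attach a new venv
new_venv = make_vec_env()
restored = RossettaVecNormalize.load('/tmp/vec_normalize.pkl', new_venv)
assert restored.venv is new_venv
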
class bVecNormalize(VecEnv):
    def __init__(self,
                 venv,
                 ob=True,
                 st=True,
                 ret=True,
                 clipob=10.,
                 clipst=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnv.__init__(self,
                        observation_space=venv.observation_space,
                        state_space=venv.state_space,
                        action_space=venv.action_space)
        print('bullet vec normalize initialization. ')
        self.venv = venv
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.st_rms = RunningMeanStd(
            shape=self.state_space.shape) if st else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.clipst = clipst
        self.cliprew = cliprew
        self.ret = np.zeros(1)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action, z, skel):
        return self.step_norm(action, z, skel)

    def step_norm(self, action, z, skel):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, state, rews, done, infos = self.venv.step(
            action, z, skel)  # calls the step() defined on each robot
        true_rews = copy.deepcopy(rews)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        state = self._stfilt(state)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)

        return obs, state, rews, done, infos, true_rews

    def step_broadcast(self, action):
        res, obs, state, rews, done, infos = self.venv.step_broadcast(
            action)  # calls the step() defined on each robot
        true_rews = copy.deepcopy(rews)
        for a in range(self.venv.num_agent):
            self.ret = self.ret * self.gamma + rews[a]
            obs[a] = self._obfilt(obs[a])
            state[a] = self._stfilt(state[a])

            if self.ret_rms:
                self.ret_rms.update(self.ret)
                rews[a] = np.clip(
                    rews[a] / np.sqrt(self.ret_rms.var + self.epsilon),
                    -self.cliprew, self.cliprew)

        return res, obs, state, rews, done, infos, true_rews

    def _obfilt(self, obs):
        if self.ob_rms:
            # statistics are updated only while return normalization is enabled
            if self.ret_rms:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
        return obs

    def _stfilt(self, state):
        if self.st_rms:
            # statistics are updated only while return normalization is enabled
            if self.ret_rms:
                self.st_rms.update(state)
            state = np.clip((state - self.st_rms.mean) /
                            np.sqrt(self.st_rms.var + self.epsilon),
                            -self.clipst, self.clipst)
        return state

    def reset(self, z, skel):
        obs, state = self.venv.reset(z, skel)
        return self._obfilt(obs), self._stfilt(state)

    def reset_broadcast(self):
        obs, state = self.venv.reset_broadcast()
        for i in range(self.venv.num_agent):
            obs[i] = self._obfilt(obs[i])
            state[i] = self._stfilt(state[i])
        return obs, state

    def get_vrep_scene_path(self):
        return self.venv.get_vrep_scene_path()

    def initialize_robot(self, clientID):
        self.venv.initialize_robot(clientID)
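
All four wrappers above delegate their statistics to RunningMeanStd. In the baselines-style implementations this class keeps a running mean and variance using the parallel-batch combination rule (Chan et al.). The sketch below shows that update in isolation; it is not the exact class these examples import (in particular, the mean_mask argument used in Example #2 is project-specific and omitted here).

import numpy as np

class RunningMeanStd:
    """Running mean and variance over batches of samples (baselines-style sketch)."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # accept a single sample or a batch; reshape to (batch, *shape)
        x = np.asarray(x, dtype=np.float64).reshape(-1, *self.mean.shape)
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        batch_count = x.shape[0]

        # combine (mean, var, count) of the stored and incoming batches
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count

        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count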