class PpoOptimizer(object):
    envs = None

    def __init__(self, *, scope, ob_space, ac_space, stochpol, ent_coef, gamma, lam,
                 nepochs, lr, cliprange, nminibatches, normrew, normadv, use_news,
                 ext_coeff, int_coeff, nsteps_per_seg, nsegs_per_env, dynamics):
        self.dynamics = dynamics
        with tf.variable_scope(scope):
            self.use_recorder = True
            self.n_updates = 0
            self.scope = scope
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol
            self.nepochs = nepochs
            self.lr = lr
            self.cliprange = cliprange
            self.nsteps_per_seg = nsteps_per_seg
            self.nsegs_per_env = nsegs_per_env
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.lam = lam
            self.normrew = normrew
            self.normadv = normadv
            self.use_news = use_news
            self.ext_coeff = ext_coeff
            self.int_coeff = int_coeff

            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret = tf.placeholder(tf.float32, [None, None])
            self.ph_rews = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])

            neglogpac = self.stochpol.pd.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd.entropy())
            vpred = self.stochpol.vpred

            vf_loss = 0.5 * tf.reduce_mean((vpred - self.ph_ret) ** 2)
            ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
            negadv = -self.ph_adv
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange,
                                                   1.0 + self.ph_cliprange)
            pg_loss_surr = tf.maximum(pg_losses1, pg_losses2)
            pg_loss = tf.reduce_mean(pg_loss_surr)
            ent_loss = (-ent_coef) * entropy
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(tf.to_float(tf.abs(pg_losses2 - pg_loss_surr) > 1e-6))

            self.total_loss = pg_loss + ent_loss + vf_loss
            self.to_report = {
                'tot': self.total_loss,
                'pg': pg_loss,
                'vf': vf_loss,
                'ent': entropy,
                'approxkl': approxkl,
                'clipfrac': clipfrac
            }

    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        # only rank 0 initializes; all ranks then receive the variables via broadcast
        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def stop_interaction(self):
        for env in self.envs:
            env.close()

    def calculate_advantages(self, rews, use_news, gamma, lam):
        nsteps = self.rollout.nsteps
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
            nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
            if not use_news:
                nextnew = 0
            nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
            nextnotnew = 1 - nextnew
            delta = rews[:, t] + gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
            self.buf_advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
        self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds

    def update(self):
        if self.normrew:
            rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews, use_news=self.use_news, gamma=self.gamma, lam=self.lam)
        info = dict(
            advmean=self.buf_advs.mean(),
            advstd=self.buf_advs.std(),
            retmean=self.buf_rets.mean(),
            retstd=self.buf_rets.std(),
            vpredmean=self.rollout.buf_vpreds.mean(),
            vpredstd=self.rollout.buf_vpreds.std(),
            ev=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel()),
            rew_mean=np.mean(self.rollout.buf_rews),
            recent_best_ext_ret=self.rollout.current_max
        )
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])

        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_ret, resh(self.buf_rets)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        ph_buf.extend([
            (self.dynamics.last_ob,
             self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
        ])
        mblossvals = []
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

        # only the losses of the first minibatch are reported
        mblossvals = [mblossvals[0]]
        info.update(zip(['opt_' + ln for ln in self.loss_names],
                        np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0)
                     for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow

        return info

    def step(self):
        self.rollout.collect_rollout()
        update_info = self.update()
        print("Update info:", update_info)
        return {'update': update_info}

    def get_var_values(self):
        return self.stochpol.get_var_values()

    def set_var_values(self, vv):
        self.stochpol.set_var_values(vv)
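
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): a standalone NumPy
# version of the GAE recursion implemented by calculate_advantages() above,
# run on toy buffers. The variable names mirror the rollout buffers
# (buf_vpreds, buf_news, ...), but the data here is random and purely
# illustrative.
# ---------------------------------------------------------------------------
import numpy as np

nenvs, nsteps, gamma, lam = 2, 4, 0.99, 0.95
rews = np.random.randn(nenvs, nsteps).astype(np.float32)
vpreds = np.random.randn(nenvs, nsteps).astype(np.float32)
vpred_last = np.random.randn(nenvs).astype(np.float32)     # bootstrap value after the segment
news = np.zeros((nenvs, nsteps), np.float32)               # 1.0 where an episode ended
new_last = np.zeros(nenvs, np.float32)

advs = np.zeros((nenvs, nsteps), np.float32)
lastgaelam = 0
for t in range(nsteps - 1, -1, -1):
    nextnew = news[:, t + 1] if t + 1 < nsteps else new_last
    nextvals = vpreds[:, t + 1] if t + 1 < nsteps else vpred_last
    nextnotnew = 1 - nextnew
    delta = rews[:, t] + gamma * nextvals * nextnotnew - vpreds[:, t]
    advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
rets = advs + vpreds                                        # targets fed to ph_ret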
class VecNormalize(VecEnvWrapper):
    """
    A moving-average, normalizing wrapper for vectorized environments.
    It supports saving and loading the moving averages.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether to update the moving averages or not
    :param norm_obs: (bool) Whether to normalize observations or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observations
    :param clip_reward: (float) Max absolute value for discounted reward
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """

    def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])

    def step_wait(self):
        """
        Apply a sequence of actions to the sequence of environments.
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        self.old_obs = obs
        obs = self._normalize_observation(obs)
        if self.norm_reward:
            if self.training:
                self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.clip_reward, self.clip_reward)
        self.ret[news] = 0
        return obs, rews, news, infos

    def _normalize_observation(self, obs):
        """
        :param obs: (numpy tensor)
        """
        if self.norm_obs:
            if self.training:
                self.obs_rms.update(obs)
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
            return obs
        else:
            return obs

    def get_original_obs(self):
        """
        Returns the unnormalized observation.

        :return: (numpy float)
        """
        return self.old_obs

    def reset(self):
        """
        Reset all environments.
        """
        obs = self.venv.reset()
        if len(np.array(obs).shape) == 1:  # for when num_cpu is 1
            self.old_obs = [obs]
        else:
            self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        return self._normalize_observation(obs)

    def save_running_average(self, path):
        """
        :param path: (str) path to log dir
        """
        for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path):
        """
        :param path: (str) path to log dir
        """
        for name in ['obs_rms', 'ret_rms']:
            with open("{}/{}.pkl".format(path, name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))
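
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file). It assumes a
# Gym environment id ("CartPole-v1") and a stable-baselines-style DummyVecEnv;
# adjust the imports to wherever VecEnvWrapper/DummyVecEnv live in this
# codebase.
# ---------------------------------------------------------------------------
import os
import gym
from stable_baselines.common.vec_env import DummyVecEnv   # assumed location

venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
venv = VecNormalize(venv, norm_obs=True, norm_reward=True, clip_obs=10.)

obs = venv.reset()                                         # already normalized
obs, rews, news, infos = venv.step([venv.action_space.sample()])

os.makedirs("./logs", exist_ok=True)
venv.save_running_average("./logs")    # writes ./logs/obs_rms.pkl and ./logs/ret_rms.pkl
venv.load_running_average("./logs")    # restores them into this (or a fresh) wrapper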
class bVecNormalize(VecEnv):
    def __init__(self, venv, ob=True, st=True, ret=True, clipob=10., clipst=10.,
                 cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnv.__init__(self,
                        observation_space=venv.observation_space,
                        state_space=venv.state_space,
                        action_space=venv.action_space)
        print('bullet vec normalize initialization.')
        self.venv = venv
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.st_rms = RunningMeanStd(shape=self.state_space.shape) if st else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.clipst = clipst
        self.cliprew = cliprew
        self.ret = np.zeros(1)
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action, z, skel):
        return self.step_norm(action, z, skel)

    def step_norm(self, action, z, skel):
        """
        Apply an action to the wrapped environments and normalize the results.

        Returns (obs, state, rews, done, infos, true_rews), where `true_rews`
        are the unnormalized rewards.
        """
        # calls the step() defined by each robot
        obs, state, rews, done, infos = self.venv.step(action, z, skel)
        true_rews = copy.deepcopy(rews)
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        state = self._stfilt(state)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        return obs, state, rews, done, infos, true_rews

    def step_broadcast(self, action):
        # calls the step() defined by each robot
        res, obs, state, rews, done, infos = self.venv.step_broadcast(action)
        true_rews = copy.deepcopy(rews)
        for a in range(self.venv.num_agent):
            self.ret = self.ret * self.gamma + rews[a]
            obs[a] = self._obfilt(obs[a])
            state[a] = self._stfilt(state[a])
            if self.ret_rms:
                self.ret_rms.update(self.ret)
                rews[a] = np.clip(rews[a] / np.sqrt(self.ret_rms.var + self.epsilon),
                                  -self.cliprew, self.cliprew)
        return res, obs, state, rews, done, infos, true_rews

    def _obfilt(self, obs):
        if self.ob_rms:
            # update the running observation statistics before normalizing
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def _stfilt(self, state):
        if self.st_rms:
            # update the running state statistics before normalizing
            self.st_rms.update(state)
            state = np.clip((state - self.st_rms.mean) / np.sqrt(self.st_rms.var + self.epsilon),
                            -self.clipst, self.clipst)
            return state
        else:
            return state

    def reset(self, z, skel):
        obs, state = self.venv.reset(z, skel)
        return self._obfilt(obs), self._stfilt(state)

    def reset_broadcast(self):
        obs, state = self.venv.reset_broadcast()
        for i in range(self.venv.num_agent):
            obs[i] = self._obfilt(obs[i])
            state[i] = self._stfilt(state[i])
        return obs, state

    def get_vrep_scene_path(self):
        return self.venv.get_vrep_scene_path()

    def initialize_robot(self, clientID):
        self.venv.initialize_robot(clientID)
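
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the normalization that
# _obfilt()/_stfilt() apply, written out with a tiny hand-rolled running
# mean/std class so it runs without the project's RunningMeanStd. The
# _ToyRMS class is an assumption that mirrors the usual baselines-style
# parallel mean/variance update.
# ---------------------------------------------------------------------------
import numpy as np

class _ToyRMS:
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4

    def update(self, x):
        # merge batch statistics into the running statistics (Chan et al. formula)
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = self.var * self.count + batch_var * batch_count \
             + delta ** 2 * self.count * batch_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

rms, clipob, eps = _ToyRMS(shape=(3,)), 10., 1e-8
obs = np.random.randn(1, 3) * 5 + 2                         # one "observation" batch
rms.update(obs)                                             # what ob_rms.update(obs) does
norm_obs = np.clip((obs - rms.mean) / np.sqrt(rms.var + eps), -clipob, clipob)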
class VecNormalize(VecEnvWrapper):
    """
    A moving-average, normalizing wrapper for vectorized environments.

    It is pickleable, which will save moving averages and configuration parameters.
    The wrapped environment `venv` is not saved, and must be restored manually with
    `set_venv` after being unpickled.

    :param venv: (VecEnv) the vectorized environment to wrap
    :param training: (bool) Whether to update the moving averages or not
    :param norm_obs: (bool) Whether to normalize observations or not (default: True)
    :param norm_reward: (bool) Whether to normalize rewards or not (default: True)
    :param clip_obs: (float) Max absolute value for observations
    :param clip_reward: (float) Max absolute value for discounted reward
    :param gamma: (float) discount factor
    :param epsilon: (float) To avoid division by zero
    """

    def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
                 clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])

    def __getstate__(self):
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # these attributes are not pickleable
        del state['venv']
        del state['class_attributes']
        # these attributes depend on the above and so we would prefer not to pickle
        del state['ret']
        return state

    def __setstate__(self, state):
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state: (dict)
        """
        self.__dict__.update(state)
        assert 'venv' not in state
        self.venv = None

    def set_venv(self, venv):
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_env`.

        :param venv: (VecEnv)
        """
        if self.venv is not None:
            raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)
        if self.obs_rms.mean.shape != self.observation_space.shape:
            raise ValueError("venv is incompatible with current statistics.")
        self.ret = np.zeros(self.num_envs)

    def step_wait(self):
        """
        Apply a sequence of actions to the sequence of environments.
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        self.old_obs = obs
        obs = self._normalize_observation(obs)
        if self.norm_reward:
            if self.training:
                self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.clip_reward, self.clip_reward)
        self.ret[news] = 0
        return obs, rews, news, infos

    def _normalize_observation(self, obs):
        """
        :param obs: (numpy tensor)
        """
        if self.norm_obs:
            if self.training:
                self.obs_rms.update(obs)
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
            return obs
        else:
            return obs

    def get_original_obs(self):
        """
        Returns the unnormalized observation.

        :return: (numpy float)
        """
        return self.old_obs

    def reset(self):
        """
        Reset all environments.
        """
        obs = self.venv.reset()
        if len(np.array(obs).shape) == 1:  # for when num_cpu is 1
            self.old_obs = [obs]
        else:
            self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        return self._normalize_observation(obs)

    @staticmethod
    def load(load_path, venv):
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return: (VecNormalize)
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)

    def save_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn("Usage of `save_running_average` is deprecated. Please "
                      "use `save` or pickle instead.", DeprecationWarning)
        for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']):
            with open("{}/{}.pkl".format(path, name), 'wb') as file_handler:
                pickle.dump(rms, file_handler)

    def load_running_average(self, path):
        """
        :param path: (str) path to log dir

        .. deprecated:: 2.9.0
            This function will be removed in a future version
        """
        warnings.warn("Usage of `load_running_average` is deprecated. Please "
                      "use `load` or pickle instead.", DeprecationWarning)
        for name in ['obs_rms', 'ret_rms']:
            with open("{}/{}.pkl".format(path, name), 'rb') as file_handler:
                setattr(self, name, pickle.load(file_handler))