def retrieve_all(self):
    """Return the occupied part of every buffer and reset the counters.

    The ``adv`` and ``pos`` buffers are normalized in place over the
    occupied range, using the (mean, std) pair returned by
    ``mpi_statistics_scalar`` for each of them.

    Returns:
        A list ``[obs, act, adv, pos, ret, lgt]`` where every entry is the
        corresponding buffer indexed by ``slice(0, ptr)`` (``ptr`` being the
        value held before it was reset).

    Raises:
        AssertionError: if ``self.eps`` differs from ``self.max_batch``.
    """
    # presumably: the expected number of episodes has been stored -- confirm
    assert self.eps == self.max_batch

    # Capture the occupied range before the write pointer is cleared.
    filled = slice(0, self.ptr)
    self.ptr = self.eps = 0

    # Both statistics are gathered before either buffer is rewritten.
    adv_mean, adv_std = mpi_statistics_scalar(self.adv[filled])
    pos_mean, pos_std = mpi_statistics_scalar(self.pos[filled])

    self.adv[filled] = (self.adv[filled] - adv_mean) / adv_std
    self.pos[filled] = (self.pos[filled] - pos_mean) / pos_std

    buffer_names = ("obs", "act", "adv", "pos", "ret", "lgt")
    return [getattr(self, name)[filled] for name in buffer_names]
def get_stats(self, key):
    """
    Lets an algorithm ask the logger for statistics of a diagnostic.

    The values stored under ``key`` in ``self.epoch_dict`` are passed to
    ``mpi_statistics_scalar`` (called with its default arguments) and its
    result is returned unchanged. Unlike ``log_tabular``, the stored values
    are not cleared.
    """
    stored = self.epoch_dict[key]
    head = stored[0]
    # A list of non-scalar arrays is flattened into one array; anything
    # else (e.g. a list of scalars) is handed over as-is.
    if isinstance(head, np.ndarray) and len(head.shape) > 0:
        flat = np.concatenate(stored)
    else:
        flat = stored
    return mpi_statistics_scalar(flat)
def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False):
    """
    Log a single value, or summary statistics of a stored diagnostic.

    Args:
        key (string): Name of the diagnostic. When the diagnostic's values
            were previously saved in ``self.epoch_dict``, this must be the
            key they were saved under.

        val: A value to log directly. Leave as ``None`` to log statistics
            of the values stored under ``key`` instead.

        with_min_and_max (bool): If true, also log ``'Max' + key`` and
            ``'Min' + key``.

        average_only (bool): If true, log the mean under ``key`` itself and
            skip the ``'Std' + key`` entry; otherwise the mean is logged
            under ``'Average' + key``.

    After logging, ``self.epoch_dict[key]`` is reset to an empty list.
    """
    if val is None:
        stored = self.epoch_dict[key]
        head = stored[0]
        # Flatten a list of non-scalar arrays; pass anything else through.
        if isinstance(head, np.ndarray) and len(head.shape) > 0:
            flat = np.concatenate(stored)
        else:
            flat = stored
        stats = mpi_statistics_scalar(flat, with_min_and_max=with_min_and_max)

        mean_label = key if average_only else 'Average' + key
        super().log_tabular(mean_label, stats[0])
        if not average_only:
            super().log_tabular('Std' + key, stats[1])
        if with_min_and_max:
            # stats[2] is logged as the minimum and stats[3] as the maximum.
            super().log_tabular('Max' + key, stats[3])
            super().log_tabular('Min' + key, stats[2])
    else:
        super().log_tabular(key, val)

    self.epoch_dict[key] = []
def get(self):
    """
    Hand back the full contents of the buffer and rewind it.

    The advantage buffer is normalized by subtracting the mean and dividing
    by the std returned from ``mpi_statistics_scalar``; the normalized array
    replaces ``self.adv_buf``. ``ptr`` and ``path_start_idx`` are reset to 0.

    Returns:
        Tuple ``(obs_buf, act_buf, adv_buf, ret_buf, logp_buf)``.

    Raises:
        AssertionError: if the buffer is not full (``ptr != max_size``).
    """
    # Only a completely filled buffer may be read out.
    assert self.ptr == self.max_size
    self.ptr = 0
    self.path_start_idx = 0

    # Advantage normalization trick.
    mean, std = mpi_statistics_scalar(self.adv_buf)
    self.adv_buf = (self.adv_buf - mean) / std

    return (
        self.obs_buf,
        self.act_buf,
        self.adv_buf,
        self.ret_buf,
        self.logp_buf,
    )
def sample(self):
    """Get values from the buffer for training.

    Resets ``ptr`` and ``path_start_idx`` to 0 and replaces
    ``self.advantages`` with a normalized copy (mean subtracted, divided by
    std, both taken from ``mpi_statistics_scalar``).

    Returns:
        Dictionary with keys ``obs``, ``act``, ``ret``, ``adv`` and ``logp``,
        each value converted with ``torch.as_tensor(..., dtype=torch.float32)``.

    Raises:
        AssertionError: if ``self.ptr`` differs from ``self.size``.
    """
    assert self.ptr == self.size
    self.ptr = 0
    self.path_start_idx = 0

    mean, std = mpi_statistics_scalar(self.advantages)
    self.advantages = (self.advantages - mean) / std

    fields = (
        ("obs", self.observations),
        ("act", self.actions),
        ("ret", self.returns),
        ("adv", self.advantages),
        ("logp", self.logp),
    )
    batch = {}
    for name, array in fields:
        batch[name] = torch.as_tensor(array, dtype=torch.float32)
    return batch
def get(self):
    """
    Hand back the full contents of the buffer as tensors and rewind it.

    The advantage buffer is normalized by subtracting the mean and dividing
    by the std returned from ``mpi_statistics_scalar``; the normalized array
    replaces ``self.adv_buf``. ``ptr`` and ``path_start_idx`` are reset to 0.

    Returns:
        Dictionary with keys ``obs``, ``act``, ``ret``, ``adv`` and ``logp``,
        each value converted with ``torch.as_tensor(..., dtype=torch.float32)``.

    Raises:
        AssertionError: if the buffer is not full (``ptr != max_size``).
    """
    # Only a completely filled buffer may be read out.
    assert self.ptr == self.max_size
    self.ptr = self.path_start_idx = 0

    # Advantage normalization trick.
    mean, std = mpi_statistics_scalar(self.adv_buf)
    self.adv_buf = (self.adv_buf - mean) / std

    names = ("obs", "act", "ret", "adv", "logp")
    arrays = (self.obs_buf, self.act_buf, self.ret_buf, self.adv_buf, self.logp_buf)
    return {
        name: torch.as_tensor(array, dtype=torch.float32)
        for name, array in zip(names, arrays)
    }
def vpg(env, ac_kwargs=None, seed=0, steps_per_epoch=4000, epochs=50,
        gamma=0.99, lam=0.97, max_ep_len=1000, save_freq=10):
    """Run the vanilla policy gradient training loop on ``env``.

    Each epoch collects ``steps_per_epoch / num_procs()`` environment steps
    on this process into a ``VPGBuffer``, calls ``actor_critic.update(buf)``,
    and prints the mean/std of the episode returns finished in that epoch
    (process 0 only). After the last epoch, process 0 plots the mean return
    against the total number of environment interactions.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``observation_space`` and ``action_space``.
        ac_kwargs (dict or None): Keyword arguments forwarded to
            ``ActorCritic``. ``None`` is treated as an empty dict. The
            caller's dict is not modified; ``action_space`` is added to a
            copy.
        seed (int): Base random seed; offset by ``10000 * proc_id()``.
        steps_per_epoch (int): Environment steps per epoch, summed over all
            processes.
        epochs (int): Number of epochs to run.
        gamma: Forwarded to ``VPGBuffer``.
        lam: Forwarded to ``VPGBuffer``.
        max_ep_len (int): An episode is treated as finished once it reaches
            this many steps.
        save_freq (int): Accepted for interface compatibility; not used by
            this function.
    """
    seed += 10000 * proc_id()
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Create actor-critic agent and synchronize it.
    # Copy so that the default of None works and the caller's dict is left
    # untouched when action_space is injected.
    ac_kwargs = dict(ac_kwargs) if ac_kwargs is not None else {}
    ac_kwargs['action_space'] = env.action_space
    actor_critic = ActorCritic(**ac_kwargs)

    # Experience buffer
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Main loop: collect experience in env and update/log each epoch.
    # o for observation, r for reward, d for done
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    all_ep_ret = []
    summary_ep_ret = []
    totalEnvInteracts = []

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, logp_t, v_t = actor_critic(o.reshape(1, -1))

            # save and log
            a = a.numpy()[0]
            buf.store(o, a, r, v_t, logp_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal and proc_id() == 0:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                # NOTE(review): v_t was computed for the observation *before*
                # this step, not for the observation just returned by
                # env.step -- confirm this is the intended bootstrap value.
                last_val = r if d else v_t
                buf.finish_path(last_val)
                if terminal:
                    all_ep_ret.append(ep_ret)
                # reset environment
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform VPG update!
        actor_critic.update(buf)

        mean, std = mpi_statistics_scalar(all_ep_ret)
        all_ep_ret = []
        if proc_id() == 0:
            print(f'epoch {epoch}: mean {mean}, std {std}')

        summary_ep_ret.append(mean)
        totalEnvInteracts.append((epoch + 1) * steps_per_epoch)

    if proc_id() == 0:
        plt.plot(totalEnvInteracts, summary_ep_ret)
        plt.grid(True)
        plt.show()