def _encode_sample(self, idxes):
    """Assemble a batch of transitions for the given buffer indices."""
    # Non-observation fields can be gathered directly by fancy indexing.
    batch = nest.map_structure(lambda x: x[idxes], self.data)

    def _stack(frames):
        # Prepend a batch axis to each frame and concatenate along it.
        return np.concatenate([f[np.newaxis, :] for f in frames], 0)

    current = [self._encode_observation(i) for i in idxes]
    batch['obs'] = nest.map_structure(_stack, nest.zip_structure(*current))
    following = [self._encode_observation(i + 1) for i in idxes]
    batch['next_obs'] = nest.map_structure(_stack,
                                           nest.zip_structure(*following))
    return batch
def _get_env_ob_norm(env, steps, eps):
    """Estimate observation normalization statistics via random actions."""
    observation = env.reset()
    collected = [observation]
    for _ in range(steps):
        observation, _, done, _ = env.step(env.action_space.sample())
        if done:
            observation = env.reset()
        collected.append(observation)
    # Concatenate all collected observations leaf-wise, then pair each
    # leaf with its space so the stat helpers can use space metadata.
    stacked = nest.map_structure(np.concatenate,
                                 nest.zip_structure(*collected))
    paired = nest.zip_structure(stacked, unpack_space(env.observation_space))
    return (nest.map_structure(_compute_mean, paired),
            nest.map_structure(_compute_std(eps), paired))
def _encode_sample(self, idxes):
    """Build a batched sample dict for the requested indices."""
    def _stack(frames):
        # Prepend a batch axis to each frame and concatenate along it.
        return np.concatenate([f[np.newaxis, :] for f in frames], 0)

    current = [self._encode_observation(i) for i in idxes]
    batch = {'obs': nest.map_structure(_stack, nest.zip_structure(*current))}
    # Gather the remaining stored fields by fancy indexing.
    for key, values in self.data.items():
        batch[key] = values[idxes]
    following = [self._encode_observation(i + 1) for i in idxes]
    batch['next_obs'] = nest.map_structure(_stack,
                                           nest.zip_structure(*following))
    return batch
def _get_venv_ob_norm(env, steps, eps):
    """Estimate observation normalization stats for a vectorized env."""
    observation = env.reset()
    collected = [observation]
    # Each env.step yields num_envs observations, so divide steps to keep
    # the total sample count comparable to the single-env variant.
    for _ in range(steps // env.num_envs):
        actions = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        observation, _, done, _ = env.step(actions)
        if np.any(done):
            observation = env.reset(force=False)
        collected.append(observation)
    stacked = nest.map_structure(np.concatenate,
                                 nest.zip_structure(*collected))
    paired = nest.zip_structure(stacked, unpack_space(env.observation_space))
    return (nest.map_structure(_compute_mean, paired),
            nest.map_structure(_compute_std(eps), paired))
def store_observation(self, obs):
    """Store a single observation in the buffer at the next available index.

    Overwrites old observations if necessary.

    Parameters
    ----------
    obs: nest of np.array

    Returns
    -------
    idx: int
        Index at which the obs is stored. To be used for `store_effect`
        later.
    """
    if self.obs is None:
        # Lazily allocate storage matching the structure of the first obs.
        self.obs = nest.map_structure(self._init_obs_data, obs)

    def _write(item):
        storage, value = item
        storage[self.next_idx] = value

    nest.map_structure(_write, nest.zip_structure(self.obs, obs))
    stored_at = self.next_idx
    # Advance the ring-buffer cursor, wrapping around at capacity.
    self.next_idx = (self.next_idx + 1) % self.size
    self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
    return stored_at
def store_effect(self, idx, step_data):
    """Store effects of the action taken after observing obs stored at idx.

    The reason `store_observation` and `store_effect` is broken up into
    two functions is so that one can call `encode_recent_observation`
    in between.

    Parameters
    ----------
    idx: int
        Index in buffer of recent observation
        (returned by `store_observation`).
    step_data: dict
        The data to store in the buffer.

    Raises
    ------
    ValueError
        If `step_data` does not have the same structure as previously
        stored data.
    """
    if self.data == {}:
        # First call defines the storage layout.
        self._init_replay_data(step_data)
    if not nest.has_same_structure(self.data, step_data):
        # Fixed grammar of the error message (was "must the same").
        raise ValueError("The data passed to ReplayBuffer must be the same"
                         " at all time steps.")

    def _insert(item):
        buffer, x = item
        buffer[idx] = x

    nest.map_structure(_insert, nest.zip_structure(self.data, step_data))
def _reset_done_envs(self):
    """Reset only the environments whose episodes have finished."""
    per_env_obs = []
    for i in range(self.num_envs):
        transition = self.transitions[i]
        if transition is None or transition[2]:
            # No stored transition yet, or the episode ended: reset.
            self.transitions[i] = None
            per_env_obs.append(self.envs[i].reset())
        else:
            # Episode still running: keep the last observation.
            per_env_obs.append(transition[0])
    return nest.map_structure(np.stack, nest.zip_structure(*per_env_obs))
def log_prob(self, ac):
    """Log prob.

    Sums the per-distribution log-probabilities of `ac` over the nest.
    """
    pairs = nest.zip_structure(self.dists, ac)
    log_probs = nest.map_structure(lambda pair: pair[0].log_prob(pair[1]),
                                   pairs)
    return sum(nest.flatten(log_probs))
def forward(self, ob, ac):
    """Forward."""
    ob = self.obs_filter.get_value_fn_observation(ob)
    ob = nest.map_structure(lambda z: z.float(), ob)
    if self.device is None:
        # Lazily infer the device from the first observation tensor.
        self.device = nest.flatten(ob)[0].device
        # NOTE(review): the collapsed source is ambiguous about whether
        # these conversions sit inside this `if`; placed here so the
        # cached normalization stats are converted once — confirm.
        self.ac_mean = nest.map_structure(self._to_torch, self.ac_mean)
        self.ac_std = nest.map_structure(self._to_torch, self.ac_std)
    # combine actions, but action observations are normalized so we have to
    # unnormalize them first
    combined_ac = nest.map_structure(
        self._unnorm_action,
        nest.zip_structure(ob['action'], self.ac_mean, self.ac_std))
    # NOTE(review): `ac` appears to be scaled by max_torque before being
    # added to the base torque — confirm expected range with callers.
    combined_ac['torque'] = (combined_ac['torque']
                             + self.params.max_torque * ac)
    ob['action'] = nest.map_structure(
        self._norm_action,
        nest.zip_structure(combined_ac, self.ac_mean, self.ac_std))
    return self.net(self.embedding(ob))
def _encode_sample(self, idxes):
    """Batch n-step sequences of observations and stored data.

    `batch['obs']` gets n + 1 entries (one extra for the final obs);
    every other key gets n entries.
    """
    def _stack(frames):
        # Prepend a batch axis to each frame and concatenate along it.
        return np.concatenate([f[np.newaxis, :] for f in frames], 0)

    batch = {'obs': []}
    for key in self.data:
        batch[key] = []
    for offset in range(self.n):
        step_obs = [self._encode_observation(i + offset) for i in idxes]
        batch['obs'].append(
            nest.map_structure(_stack, nest.zip_structure(*step_obs)))
        shifted = [i + offset for i in idxes]
        for key in self.data:
            batch[key].append(self.data[key][shifted])
    final_obs = [self._encode_observation(i + self.n) for i in idxes]
    batch['obs'].append(
        nest.map_structure(_stack, nest.zip_structure(*final_obs)))
    return batch
def _get_env_ob_norm(env, steps):
    """Compute per-leaf mean/std of observations gathered with random
    actions."""
    observation = env.reset()
    collected = [observation]
    for _ in range(steps):
        observation, _, done, _ = env.step(env.action_space.sample())
        if done:
            observation = env.reset()
        collected.append(observation)
    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))
    mean = nest.map_structure(lambda arr: np.mean(arr, axis=0), stacked)
    std = nest.map_structure(lambda arr: np.std(arr, axis=0), stacked)
    return mean, std
def insert(self, step_data):
    """Insert new data into storage.

    Transfers to the correct device if needed.
    """
    if self.data is None:
        # First insert defines the storage layout.
        self.init_data(step_data)
    if self.rollout_complete:
        raise ValueError("Tried to insert data when the rollout is "
                         " complete. Call rollout.reset() to reset.")
    if self.step >= self.num_steps:
        # Grow storage when the rollout runs longer than anticipated.
        self.extend_storage()
    if set(step_data.keys()) != self.keys:
        raise ValueError("The same data must be provided at every step.")

    def _copy_data(item):
        # Copy one leaf tensor into this step's storage slot, moving it
        # to the storage device first if needed.
        storage, step_data = item
        if step_data.device != self.device:
            storage[self.step].copy_(step_data.to(self.device))
        else:
            storage[self.step].copy_(step_data)

    def _check_shape(data, key):
        # Every leaf must be batched over processes on dim 0.
        if data.shape[0] != self.num_processes:
            raise ValueError(f"data '{key}' is expected to have its "
                             f"0th dimension equal to the number "
                             f"of processes: {self.num_processes}")

    for k in self.keys:
        nest.map_structure(partial(_check_shape, key=k), step_data[k])
        nest.map_structure(_copy_data,
                           nest.zip_structure(self.data[k], step_data[k]))
    if self.step == 0:
        # Start of a rollout: reset the accumulated return trackers.
        self.data['return'].fill_(0.)
        self.data['q_mc'].fill_(0.)
        done = torch.zeros_like(self.data['done'][0])
    else:
        done = self.data['done'][self.step - 1]
    # Mask out rewards for processes whose episode already ended.
    if len(step_data['reward'].shape) == 2:
        # 2-D rewards: broadcast the done mask over the reward dimension.
        r = torch.logical_not(done.unsqueeze(-1)) * step_data['reward'].to(
            self.device)
    else:
        r = torch.logical_not(done) * step_data['reward'].to(self.device)
    self.data['return'] += r
    self.sequence_lengths += torch.logical_not(step_data['done'].cpu())
    self.step = self.step + 1
    # The rollout finishes once every process reports done.
    self.rollout_complete = bool(torch.all(step_data['done']))
def update(self, mean, var, count):
    # Fold a new batch of (mean, var, count) statistics into the running
    # estimate.
    if self.count == 0:
        # First batch: adopt its statistics directly.
        self.mean = mean
        self.var = var
        self.count = count
    else:
        # `_update` reads batch_count/new_count from the instance, so they
        # must be set before mapping over the nests.
        self.batch_count = count
        self.new_count = count + self.count
        nest.map_structure(
            self._update,
            nest.zip_structure(self.mean, self.var, mean, var)
        )
        self.count = self.new_count
def _get_venv_ob_norm(env, steps):
    """Compute observation mean/std from random actions in a vec env."""
    observation = env.reset()
    collected = [observation]
    # Each step produces num_envs observations, so divide steps to keep
    # the total sample count comparable to the single-env variant.
    for _ in range(steps // env.num_envs):
        actions = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        observation, _, done, _ = env.step(actions)
        if np.any(done):
            observation = env.reset(force=False)
        collected.append(observation)
    stacked = nest.map_structure(np.concatenate,
                                 nest.zip_structure(*collected))
    mean = nest.map_structure(lambda arr: np.mean(arr, axis=0), stacked)
    std = nest.map_structure(lambda arr: np.std(arr, axis=0), stacked)
    return mean, std
def step_wait(self):
    # Step only the environments whose episodes are still in progress;
    # finished environments keep reporting their last stored transition.
    active = [False for _ in range(self.num_envs)]
    for e in range(self.num_envs):
        if self.transitions[e] is None or not self.transitions[e][
                2]:  # no stored transition yet, or episode NOT over
            action = nest.map_structure(lambda ac: ac[e], self.actions)
            self.transitions[e] = self.envs[e].step(action)
            active[e] = True
    obs, rs, dones, infos = zip(*self.transitions)
    for e, info in enumerate(infos):
        # Flag which envs actually took a step this call.
        info['active'] = active[e]
    obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
    return obs, np.stack(rs), np.stack(dones), infos
def _get_venv_ob_norm(env, steps):
    """Estimate observation mean/std using only the first sub-environment.

    Only collect obs from the first environment. This is hacky and
    inefficient but is the simplest solution given that environments sync
    their resets.
    """
    def _first(x):
        return x[0]

    ob = env.reset()
    collected = [nest.map_structure(_first, ob)]
    for _ in range(steps):
        actions = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        ob, _, done, _ = env.step(actions)
        if done[0]:
            ob = env.reset()
        collected.append(nest.map_structure(_first, ob))
    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))
    mean = nest.map_structure(lambda arr: np.mean(arr, axis=0), stacked)
    std = nest.map_structure(lambda arr: np.std(arr, axis=0), stacked)
    return mean, std
def _normalize(self, obs):
    """Return normalized observations, computing stats on first use."""
    if not self.should_norm:
        return obs
    if self.mean is None or self.std is None:
        # Stats not yet computed; derive them lazily.
        self.find_norm_params()
    obs = nest.map_structure(np.asarray, obs)
    obs = nest.map_structure(np.float32, obs)
    if not nest.has_same_structure(self.mean, obs):
        raise ValueError("mean and obs do not have the same structure!")

    def _whiten(item):
        value, mu, sigma = item
        return (value - mu) / sigma

    return nest.map_structure(_whiten,
                              nest.zip_structure(obs, self.mean, self.std))
def step(self, action):
    """Step."""
    ob, reward, done, info = self.venv.step(action)

    def _clear_done(frames):
        # Zero the stacked frames of any environment that just finished.
        for env_idx, finished in enumerate(done):
            if finished:
                frames[env_idx] = 0
        return frames

    self.frames = nest.map_structure(_clear_done, self.frames)
    self.frames = nest.map_structure(
        lambda item: self._add_new_observation(*item),
        nest.zip_structure(self.frames, ob))
    # Return a copy so callers cannot mutate the internal frame stack.
    stacked_ob = nest.map_structure(lambda x: x.copy(), self.frames)
    self._dones = np.logical_or(done, self._dones)
    return stacked_ob, reward, done, info
def reset(self, force=True):
    """Reset."""
    ob = self.venv.reset(force=force)

    def _clear(frames):
        # A forced reset clears every env's frames; otherwise only the
        # envs marked done since the last reset.
        if force:
            frames[:] = 0
        else:
            frames[self._dones] = 0
        return frames

    self.frames = nest.map_structure(_clear, self.frames)
    self.frames = nest.map_structure(
        lambda item: self._add_new_observation(*item),
        nest.zip_structure(self.frames, ob))
    self._dones[:] = False
    # Return a copy so callers cannot mutate the internal frame stack.
    return nest.map_structure(lambda x: x.copy(), self.frames)
def sample(self, batch_size):
    """Sample `batch_size` different transitions.

    Parameters
    ----------
    batch_size: int
        How many transitions to sample.

    Returns
    -------
    batched data: dict
        a dictionary containing batched observations, next_observations,
        action, reward, done, and other data stored in the replay buffer.
    """
    sizes = self._get_sizes(batch_size)
    sub_batches = []
    for buffer, size in zip(self.buffers, sizes):
        if size > 0:
            sub_batches.append(buffer.sample(size))
    # Concatenate the per-buffer samples leaf-wise along the batch axis.
    return nest.map_structure(lambda parts: np.concatenate(parts, axis=0),
                              nest.zip_structure(*sub_batches))
def get_observation(self, obs, base_action):
    """Prepare an observation for the network.

    Adds the base action, a batch dimension, normalization, and converts
    to torch tensors on the model's device.
    """
    # Add base action to obs
    obs = self._add_action_to_obs(obs, base_action)
    # Add batch dim
    obs = nest.map_structure(lambda x: x[None], obs)

    def _maybe_norm(item):
        # Normalize only leaves that have statistics available.
        value, mu, sigma = item
        if mu is None:
            return value
        return (value - mu) / sigma

    obs = nest.map_structure(
        _maybe_norm,
        nest.zip_structure(obs, self.ob_norm['mean'], self.ob_norm['std']))
    # convert to torch tensors
    return nest.map_structure(
        lambda x: torch.from_numpy(x).to(self.device), obs)
def _flatten_obs(obs):
    """Stack a non-empty list/tuple of observation nests into batched
    arrays."""
    assert isinstance(obs, (list, tuple))
    assert len(obs) > 0
    stacked = nest.zip_structure(*obs)
    return nest.map_structure(np.stack, stacked)
def reset(self, force=True):
    """Reset environments; when `force` is False, reset only the finished
    ones."""
    if not force:
        return self._reset_done_envs()
    obs = [self.envs[e].reset() for e in range(self.num_envs)]
    # Drop all cached transitions after a full reset.
    self.transitions = [None] * self.num_envs
    return nest.map_structure(np.stack, nest.zip_structure(*obs))
def encode_recent_observation(self):
    """Encode the most recent observation of every sub-buffer, stacked
    leaf-wise."""
    encoded = [b.encode_recent_observation() for b in self.buffers]
    return nest.map_structure(np.stack, nest.zip_structure(*encoded))
def kl(self, other):
    """KL divergence."""
    def _pairwise_kl(pair):
        ours, theirs = pair
        return ours.kl(theirs)

    kls = nest.map_structure(_pairwise_kl,
                             nest.zip_structure(self.dists, other.dists))
    return sum(nest.flatten(kls))