def store_observation(self, obs):
    """Write one observation into the buffer and advance the write cursor.

    The observation storage is created lazily on the first call by mapping
    ``self._init_obs_data`` over ``obs``. The cursor wraps around at
    ``self.size``, so old observations are overwritten once the buffer is
    full.

    Parameters
    ----------
    obs: nest of np.array
        The observation to store.

    Returns
    -------
    idx: int
        Index at which the obs is stored. To be used for `store_effect`
        later.
    """
    if self.obs is None:
        self.obs = nest.map_structure(self._init_obs_data, obs)

    slot = self.next_idx

    def _write(pair):
        storage, value = pair
        storage[slot] = value

    nest.map_structure(_write, nest.zip_structure(self.obs, obs))

    # Advance the cursor (with wrap-around) and track the fill level.
    self.next_idx = (slot + 1) % self.size
    self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
    return slot
def init_rollout_storage(self):
    """Reset the env, probe the actor once and set up recurrent state.

    The actor output must contain the keys ``"action"`` and ``"value"``.
    If it also contains a non-None ``"state"``, the manager is flagged as
    recurrent and a zero-filled initial state with matching shape/dtype is
    created on ``self.device``.

    Raises
    ------
    ValueError
        If ``"action"`` or ``"value"`` is missing from the actor output.
    """
    def _as_tensor(arr):
        return torch.from_numpy(arr).to(self.device)

    def _zeros_like_state(s):
        return torch.zeros(size=s.shape, device=self.device, dtype=s.dtype)

    self._ob = nest.map_structure(_as_tensor, self.env.reset())
    data = self.act(self._ob)

    if 'action' not in data:
        raise ValueError('the key "action" must be in the dict returned '
                         'act_fn')
    if 'value' not in data:
        raise ValueError('the key "value" must be in the dict returned '
                         'act_fn')

    state = data['state'] if 'state' in data else None

    self.recurrent = state is not None
    if self.recurrent:
        self.init_state = nest.map_structure(_zeros_like_state, state)
    else:
        self.init_state = None

    self._state = self.init_state
    self._initialized = True
def store_effect(self, idx, step_data):
    """Store the effects of the action taken after observing obs at `idx`.

    `store_observation` and `store_effect` are separate functions so that
    `encode_recent_observation` can be called in between.

    Parameters
    ----------
    idx: int
        Index in buffer of recent observation (returned by
        `store_observation`).
    step_data: dict
        The data to store in the buffer. On the first call (while
        ``self.data`` is still an empty dict) it is also used to
        initialize the storage via ``self._init_replay_data``.

    Raises
    ------
    ValueError
        If `step_data` does not have the same structure as the storage.
    """
    if self.data == {}:
        self._init_replay_data(step_data)

    structures_match = nest.has_same_structure(self.data, step_data)
    if not structures_match:
        raise ValueError("The data passed to ReplayBuffer must the same"
                         " at all time steps.")

    def _write(pair):
        storage, value = pair
        storage[idx] = value

    nest.map_structure(_write, nest.zip_structure(self.data, step_data))
def env_step_and_store_transition(self):
    """Take one env step and record the transition in the replay buffer.

    The current observation (with its leading batch dimension stripped) is
    stored first, the buffer's encoded recent observation is fed to the
    actor, and the actor outputs together with reward and done are stored
    at the same index. The env is reset when the step reports done.
    """
    if self._ob is None:
        self.manual_reset()

    def _strip_batch(x):
        return x[0]

    def _batched_tensor(x):
        # [None] adds a leading batch dimension.
        return torch.from_numpy(x).to(self.device)[None]

    current = nest.map_structure(_strip_batch, self._ob)
    idx = self.buffer.store_observation(current)
    encoded = self.buffer.encode_recent_observation()

    with torch.no_grad():
        data = self.act(nest.map_structure(_batched_tensor, encoded))
        for key, value in data.items():
            data[key] = value.cpu().numpy()

    self._ob, reward, done, _ = self.env.step(data['action'])
    data['reward'] = reward
    data['done'] = done

    # Remove batch dimensions.
    for key, value in data.items():
        data[key] = value[0]

    self.buffer.store_effect(idx, data)
    if done:
        self._ob = self.env.reset()
def _state_reset(self, dones):
    """Zero, in place, the recurrent state entries selected by `dones`.

    Does nothing when the manager is not recurrent.
    """
    if not self.recurrent:
        return

    def _zero_selected(state):
        state[0, dones].zero_()

    nest.map_structure(_zero_selected, self._state)
def step_async(self, actions):
    """Validate and remember the actions for the next `step_wait`.

    Raises
    ------
    ValueError
        If any leaf of `actions` is not a numpy array.
    """
    def _require_ndarray(leaf):
        if isinstance(leaf, np.ndarray):
            return
        raise ValueError("You must pass actions as nested numpy arrays"
                         " to DummyVecEnv.")

    nest.map_structure(_require_ndarray, actions)
    self.actions = actions
def main():
    """Roll out a policy (or the zero-action base policy) and save a video.

    Command line flags: ``-t`` checkpoint timestep (defaults to the result
    of ``get_best_eval()``), ``-n`` number of episodes, ``--base`` to
    visualize the base policy. Returns early without rendering if the
    output file already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, default=None,
                        help="checkpoint timestep")
    parser.add_argument('-n', type=int, default=1,
                        help="number of episodes")
    parser.add_argument('--base', default=False, action='store_true',
                        help="visualize the base_policy")
    args = parser.parse_args()

    t = args.t if args.t is not None else get_best_eval()
    env, pi, device, obs, ckpt = _load_env_and_policy('/logdir', t)

    def _to_torch(x):
        return torch.from_numpy(x).to(device)

    def _to_numpy(x):
        return x.cpu().numpy()

    video_dir = '/logdir/video'
    os.makedirs(video_dir, exist_ok=True)

    # Start from a clean scratch directory for the frames.
    tmp_dir = os.path.join(video_dir, 'tmp')
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

    video_name = 'base_policy.mp4' if args.base else f'{ckpt:09d}.mp4'
    output_path = os.path.join(video_dir, video_name)
    if os.path.exists(output_path):
        return

    video_writer = VideoWriter(img_dir=tmp_dir)
    for _ in range(args.n):
        obs = env.reset()
        video_writer.add_frame(env.render(mode='rgb_array'))
        done = False
        while not done:
            if args.base:
                # Zero action shaped like the torque entry of the obs.
                action = np.zeros_like(obs['action']['torque'])
            else:
                obs = nest.map_structure(_to_torch, obs)
                with torch.no_grad():
                    action = pi(obs).action
                    action = nest.map_structure(_to_numpy, action)
            obs, _, done, _ = env.step(action)
            video_writer.add_frame(env.render(mode='rgb_array'))
    video_writer.make_video(output_path)
def _encode_sample(self, idxes):
    """Build a batch of stored data plus encoded obs / next_obs.

    Every entry of ``self.data`` is indexed with `idxes`; ``'obs'`` and
    ``'next_obs'`` hold the encoded observations at ``idx`` and
    ``idx + 1`` respectively, stacked along a new leading axis.
    """
    def _stack(frames):
        return np.concatenate([f[np.newaxis, :] for f in frames], 0)

    def _collate(indices):
        encoded = [self._encode_observation(i) for i in indices]
        return nest.map_structure(_stack, nest.zip_structure(*encoded))

    batch = nest.map_structure(lambda x: x[idxes], self.data)
    batch['obs'] = _collate(idxes)
    batch['next_obs'] = _collate([i + 1 for i in idxes])
    return batch
def _get_env_ob_norm(env, steps):
    """Estimate per-element observation mean and std from random actions.

    Steps `env` `steps` times with actions sampled from its action space
    (resetting whenever an episode ends) and computes statistics over the
    collected observations, including the initial one.

    Returns
    -------
    (mean, std): nests with the same structure as an observation.
    """
    collected = [env.reset()]
    for _ in range(steps):
        ob, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        collected.append(ob)

    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))

    def _mean(x):
        return np.mean(x, axis=0)

    def _std(x):
        return np.std(x, axis=0)

    mean = nest.map_structure(_mean, stacked)
    std = nest.map_structure(_std, stacked)
    return mean, std
def insert(self, step_data):
    """Insert one step of data into storage.

    Data is copied to ``self.device`` when it lives elsewhere. Besides
    copying, this accumulates the per-process ``'return'`` (masked by the
    previous step's ``'done'``), updates ``self.sequence_lengths``, and
    marks the rollout complete once every entry of ``step_data['done']``
    is set.

    Raises
    ------
    ValueError
        If the rollout is already complete, if the keys differ from those
        seen before, or if a leaf's 0th dimension is not
        ``self.num_processes``.
    """
    if self.data is None:
        self.init_data(step_data)
    if self.rollout_complete:
        raise ValueError("Tried to insert data when the rollout is "
                         " complete. Call rollout.reset() to reset.")
    if self.step >= self.num_steps:
        self.extend_storage()
    if set(step_data.keys()) != self.keys:
        raise ValueError("The same data must be provided at every step.")

    def _check_shape(data, key):
        if data.shape[0] != self.num_processes:
            raise ValueError(f"data '{key}' is expected to have its "
                             f"0th dimension equal to the number "
                             f"of processes: {self.num_processes}")

    def _copy_data(item):
        storage, value = item
        on_device = value.device == self.device
        source = value if on_device else value.to(self.device)
        storage[self.step].copy_(source)

    for k in self.keys:
        nest.map_structure(partial(_check_shape, key=k), step_data[k])
        nest.map_structure(_copy_data,
                           nest.zip_structure(self.data[k], step_data[k]))

    if self.step == 0:
        # Fresh rollout: clear the accumulators; nothing was done before.
        self.data['return'].fill_(0.)
        self.data['q_mc'].fill_(0.)
        done = torch.zeros_like(self.data['done'][0])
    else:
        done = self.data['done'][self.step - 1]

    # Only processes that were not done at the previous step accumulate
    # reward. A 2-d reward needs the mask broadcast over its last axis.
    reward = step_data['reward']
    if len(reward.shape) == 2:
        mask = torch.logical_not(done.unsqueeze(-1))
    else:
        mask = torch.logical_not(done)
    r = mask * reward.to(self.device)

    self.data['return'] += r
    self.sequence_lengths += torch.logical_not(step_data['done'].cpu())
    self.step = self.step + 1
    self.rollout_complete = bool(torch.all(step_data['done']))
def _get_env_ob_norm(env, steps, eps):
    """Collect observations with random actions and compute mean / std.

    Observations (initial one included) are concatenated leaf-wise, zipped
    with the unpacked observation space, and passed to ``_compute_mean``
    and ``_compute_std(eps)``.

    Returns
    -------
    (mean, std): nests produced by the two compute helpers.
    """
    collected = [env.reset()]
    for _ in range(steps):
        ob, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        collected.append(ob)

    joined = nest.map_structure(np.concatenate,
                                nest.zip_structure(*collected))
    data = nest.zip_structure(joined, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, data)
    std = nest.map_structure(_compute_std(eps), data)
    return mean, std
def _get_venv_ob_norm(env, steps):
    """Estimate observation mean / std on a vectorized env.

    Runs ``steps // env.num_envs`` vectorized steps with randomly sampled
    actions, calling ``env.reset(force=False)`` whenever any sub-env
    reports done, and computes statistics over all collected observations.

    Returns
    -------
    (mean, std): nests with the same structure as an observation.
    """
    def _random_actions():
        return np.array([env.action_space.sample()
                         for _ in range(env.num_envs)])

    collected = [env.reset()]
    for _ in range(steps // env.num_envs):
        ob, _reward, done, _info = env.step(_random_actions())
        if np.any(done):
            ob = env.reset(force=False)
        collected.append(ob)

    joined = nest.map_structure(np.concatenate,
                                nest.zip_structure(*collected))
    mean = nest.map_structure(lambda x: np.mean(x, axis=0), joined)
    std = nest.map_structure(lambda x: np.std(x, axis=0), joined)
    return mean, std
def update(self, mean, var, count):
    """Fold batch statistics into the running statistics.

    While ``self.count`` is zero the batch statistics are adopted as-is.
    Otherwise ``self.batch_count`` and ``self.new_count`` are set for use
    by ``self._update``, which is mapped over the zipped
    (mean, var, batch mean, batch var) leaves, and ``self.count`` is
    advanced afterwards.
    """
    if self.count == 0:
        self.mean, self.var, self.count = mean, var, count
        return

    self.batch_count = count
    self.new_count = count + self.count
    zipped = nest.zip_structure(self.mean, self.var, mean, var)
    nest.map_structure(self._update, zipped)
    self.count = self.new_count
def step_async(self, actions):
    """Send each not-yet-done worker its slice of `actions`.

    Raises
    ------
    ValueError
        If any leaf of `actions` is not a numpy array.
    """
    self._assert_not_closed()

    def _require_ndarray(leaf):
        if isinstance(leaf, np.ndarray):
            return
        raise ValueError("You must pass actions as nested numpy arrays"
                         " to SubprocVecEnv.")

    nest.map_structure(_require_ndarray, actions)

    for i, remote in enumerate(self.remotes):
        if not self._dones[i]:
            # Pick the i-th entry of every action leaf for worker i.
            action = nest.map_structure(lambda ac: ac[i], actions)
            remote.send(('step', action))
    self.waiting = True
def rollout_step(self):
    """Compute one environment step and insert it into storage.

    Queries the actor, steps the env with the action converted to numpy,
    and assembles the step data (obs, action, reward, done, vpred and any
    extra actor outputs). When the rollout length is reached or an env was
    truncated, ``gamma * next value`` is added to the reward of the
    affected envs; at the end of a rollout all envs are additionally
    marked done and the recurrent state of done envs is reset.
    """
    state_in = self._state if self.recurrent else None
    with torch.no_grad():
        outs = self.act(self._ob, state_in=state_in)

    def _to_numpy(ac):
        return ac.cpu().numpy()

    def _to_torch(o):
        return torch.from_numpy(o).to(self.device)

    cpu_action = nest.map_structure(_to_numpy, outs['action'])
    ob, r, done, infos = self.env.step(cpu_action)

    data = {
        'obs': self._ob,
        'action': outs['action'],
        'reward': torch.from_numpy(r).float().to(self.device),
        'done': torch.from_numpy(done).to(self.device),
        'vpred': outs['value'],
    }
    # Pass through any additional actor outputs unchanged.
    handled = ('action', 'value', 'state')
    for key in outs:
        if key not in handled:
            data[key] = outs[key]

    self._ob = nest.map_structure(_to_torch, ob)
    if self.recurrent:
        self._state = outs['state']
    self._step += 1

    truncated = self._get_truncated_envs(infos)
    if self._dones is not None:
        # Ignore truncation for envs that were already done last step.
        truncated = truncated & torch.logical_not(self._dones)

    at_end_of_rollout = (self.rollout_length
                         and self._step >= self.rollout_length)

    if at_end_of_rollout or torch.any(truncated):
        next_vpred = self._get_next_value()
        if self.rollout_length:
            assert self._step <= self.rollout_length

        if at_end_of_rollout:
            self._state_reset(data['done'])
            to_augment = torch.logical_not(data['done']) | truncated
            data['done'][:] = True
        else:
            to_augment = truncated

        if torch.any(to_augment):
            data['reward'][to_augment] += (
                self.gamma * next_vpred[to_augment])

    self._dones = data['done']
    self.storage.insert(data)
def _encode_sample(self, idxes):
    """Assemble a batch dict for the given buffer indices.

    ``'obs'`` holds the encoded observations at `idxes`, every key of
    ``self.data`` holds that entry indexed with `idxes`, and
    ``'next_obs'`` holds the encoded observations at ``idx + 1``.
    """
    def _stack(frames):
        return np.concatenate([f[np.newaxis, :] for f in frames], 0)

    def _collate(indices):
        encoded = [self._encode_observation(i) for i in indices]
        return nest.map_structure(_stack, nest.zip_structure(*encoded))

    batch = {'obs': _collate(idxes)}
    for key, values in self.data.items():
        batch[key] = values[idxes]
    batch['next_obs'] = _collate([i + 1 for i in idxes])
    return batch
def step_wait(self):
    """Step every env that is still running and return stacked results.

    An env is stepped when it has no stored transition yet or when its
    stored transition does not report done; otherwise its last transition
    is reused. Each info dict gets an ``'active'`` flag telling whether
    the env was stepped in this call.

    Returns
    -------
    (obs, rewards, dones, infos)
    """
    active = [False] * self.num_envs
    for e in range(self.num_envs):
        previous = self.transitions[e]
        # Step only if there is no transition yet or the episode is
        # still running (done flag at index 2 is unset).
        if previous is None or not previous[2]:
            action = nest.map_structure(lambda ac: ac[e], self.actions)
            self.transitions[e] = self.envs[e].step(action)
            active[e] = True

    obs, rs, dones, infos = zip(*self.transitions)
    for info, was_stepped in zip(infos, active):
        info['active'] = was_stepped

    obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
    return obs, np.stack(rs), np.stack(dones), infos
def _get_venv_ob_norm(env, steps, eps):
    """Collect observations from a vectorized env and compute mean / std.

    Runs ``steps // env.num_envs`` vectorized steps with randomly sampled
    actions, calling ``env.reset(force=False)`` whenever any sub-env
    reports done. The concatenated observations are zipped with the
    unpacked observation space and handed to ``_compute_mean`` and
    ``_compute_std(eps)``.

    Returns
    -------
    (mean, std): nests produced by the two compute helpers.
    """
    def _random_actions():
        return np.array([env.action_space.sample()
                         for _ in range(env.num_envs)])

    collected = [env.reset()]
    for _ in range(steps // env.num_envs):
        ob, _reward, done, _info = env.step(_random_actions())
        if np.any(done):
            ob = env.reset(force=False)
        collected.append(ob)

    joined = nest.map_structure(np.concatenate,
                                nest.zip_structure(*collected))
    data = nest.zip_structure(joined, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, data)
    std = nest.map_structure(_compute_std(eps), data)
    return mean, std
def __call__(self, ob):
    """Run the network on a numpy observation and return a numpy action.

    The observation is converted to tensors on ``self.device``. When a
    state is stored it is passed to the network, and if the output has a
    ``state_out`` attribute it replaces the stored state.
    """
    def _to_torch(o):
        return torch.from_numpy(o).to(self.device)

    def _to_numpy(x):
        return x.cpu().numpy()

    with torch.no_grad():
        ob = nest.map_structure(_to_torch, ob)
        out = self.net(ob) if self.state is None else self.net(ob, self.state)
        if hasattr(out, 'state_out'):
            self.state = out.state_out
        return nest.map_structure(_to_numpy, out.action)
def _get_venv_ob_norm(env, steps):
    """Estimate observation mean / std using only the first sub-env.

    Only observations from environment 0 are collected. This is hacky and
    inefficient, but it is the simplest solution given that the
    environments sync their resets.

    Returns
    -------
    (mean, std): nests with the same structure as a single observation.
    """
    def _first_env(batched_ob):
        return nest.map_structure(lambda x: x[0], batched_ob)

    collected = [_first_env(env.reset())]
    for _ in range(steps):
        actions = np.array([env.action_space.sample()
                            for _ in range(env.num_envs)])
        ob, _reward, done, _info = env.step(actions)
        if done[0]:
            ob = env.reset()
        collected.append(_first_env(ob))

    stacked = nest.map_structure(np.stack, nest.zip_structure(*collected))
    mean = nest.map_structure(lambda x: np.mean(x, axis=0), stacked)
    std = nest.map_structure(lambda x: np.std(x, axis=0), stacked)
    return mean, std
def _normalize(self, obs):
    """Return ``(obs - mean) / std`` leaf-wise, or `obs` if disabled.

    Normalization parameters are computed on demand via
    ``self.find_norm_params()`` when either is missing. Observations are
    converted to float32 arrays before normalizing.

    Raises
    ------
    ValueError
        If ``self.mean`` and `obs` do not have the same structure.
    """
    if not self.should_norm:
        return obs

    if self.mean is None or self.std is None:
        self.find_norm_params()

    obs = nest.map_structure(np.asarray, obs)
    obs = nest.map_structure(np.float32, obs)
    if not nest.has_same_structure(self.mean, obs):
        raise ValueError("mean and obs do not have the same structure!")

    def _standardize(triple):
        value, mu, sigma = triple
        return (value - mu) / sigma

    zipped = nest.zip_structure(obs, self.mean, self.std)
    return nest.map_structure(_standardize, zipped)
def __init__(self, venv, norm=True, steps=10000, mean=None, std=None,
             eps=1e-2, log=True, log_prob=0.01):
    """Init.

    Parameters
    ----------
    venv:
        The vectorized environment to wrap (passed to the parent class).
    norm: bool
        Stored as ``self.should_norm``.
    steps: int
        Stored as ``self.steps``. Presumably the number of env steps used
        to estimate normalization parameters; confirm against
        ``find_norm_params``.
    mean, std: nest of np.array, optional
        Precomputed normalization parameters. They are used only when both
        are given; `std` is clipped from below at `eps`.
    eps: float
        Lower bound applied to the provided `std`.
    log: bool
        Stored as ``self.log``.
    log_prob: float
        Stored as ``self.log_prob``.

    Raises
    ------
    ValueError
        If `mean` and `std` are both given but differ in structure.
    """
    super().__init__(venv)
    self.steps = steps
    self.should_norm = norm
    self.eps = eps
    self.log = log
    self.log_prob = log_prob
    self.t = 0
    self._eval = False
    self.mean = None
    self.std = None
    # The `np.bool` alias was removed from NumPy (1.24); the builtin
    # `bool` gives the same dtype on every NumPy version.
    self._dones = np.zeros(self.num_envs, dtype=bool)

    if mean is not None and std is not None:
        if not nest.has_same_structure(mean, std):
            raise ValueError("mean and std must have the same structure.")
        self.mean = mean
        self.std = nest.map_structure(
            lambda x: np.maximum(x, self.eps), std)
def _encode_observation(self, idx):
    """Encode the observation at `idx` together with its history.

    For every leaf of ``self.obs`` the last ``self.obs_history_len``
    frames ending at `idx` are concatenated along axis 0. The history
    starts after the most recent done flag inside the window, and missing
    frames are replaced by zeros.
    """
    def _encode(ob, idx):
        end_idx = idx + 1  # make noninclusive
        start_idx = end_idx - self.obs_history_len

        # If there were never enough obs in the buffer for context.
        if start_idx < 0 and self.num_in_buffer != self.size:
            start_idx = 0

        # Start the context right after the last done flag in the window.
        for j in range(start_idx, end_idx - 1):
            if self.data['done'][j % self.size]:
                start_idx = j + 1

        missing_context = self.obs_history_len - (end_idx - start_idx)

        # Zero padding is needed for missing context, or the window wraps
        # around the boundary of the buffer.
        if start_idx < 0 or missing_context > 0:
            frames = [np.zeros_like(ob[0]) for _ in range(missing_context)]
            frames.extend(ob[j % self.size]
                          for j in range(start_idx, end_idx))
            return np.concatenate(frames, 0)

        # Contiguous fast path: this optimization has the potential to
        # save about 30% compute time.
        trailing_shape = ob.shape[2:]
        return ob[start_idx:end_idx].reshape(-1, *trailing_shape)

    return nest.map_structure(partial(_encode, idx=idx), self.obs)
def __call__(self, ob, state_in=None):
    """Produce decision from model.

    The policy is run deterministically while ``self.t`` is below
    ``self.policy_training_start``. The L1 norm of the action (summed
    over dim 1, averaged over the batch) is logged per action leaf, and
    ``self.t`` is advanced by the batch size.

    Returns
    -------
    dict with keys ``'action'``, ``'value'``, ``'logp'``, ``'dist'`` and,
    when the policy returns a truthy ``state_out``, ``'state'``.
    """
    warming_up = self.t < self.policy_training_start
    if warming_up:
        outs = self.pi(ob, state_in, deterministic=True)
    else:
        outs = self.pi(ob, state_in)

    def _l1_norm(ac):
        return ac.abs().sum(dim=1).mean()

    residual_norm = nest.map_structure(_l1_norm, outs.action)
    if isinstance(residual_norm, torch.Tensor):
        # Single-tensor action: log first, then advance the step count.
        logger.add_scalar('actor/l1_residual_norm', residual_norm, self.t,
                          time.time())
        self.t += outs.action.shape[0]
    else:
        # Nested action: advance the step count, then log each entry.
        self.t += nest.flatten(outs.action)[0].shape[0]
        for name, norm in residual_norm.items():
            logger.add_scalar(f'actor/{name}_residual_norm', norm, self.t,
                              time.time())

    data = {
        'action': outs.action,
        'value': self.vf(ob).value,
        'logp': outs.dist.log_prob(outs.action),
        'dist': outs.dist.to_tensors(),
    }
    if outs.state_out:
        data['state'] = outs.state_out
    return data
def get_norm_params(n, difficulty, use_domain_rand):
    """Build a training env and compute its observation norm parameters.

    The env is created with zero ``max_torque`` / ``max_position`` (all
    residual actions set to 0), `n` is assigned to ``env.steps`` and
    ``env.find_norm_params()`` is run.

    Returns
    -------
    (mean, var, env): ``env.mean``, the squared ``env.std`` (leaves that
    are None stay None), and the env itself.
    """
    if difficulty < 4:
        term_fn = 'position_close_to_goal'
    else:
        term_fn = 'pos_and_rot_close_to_goal'

    env = make_training_env(
        32, MPPGStateMachine, difficulty, 'torque_and_position',
        frameskip=3,
        sim=True,
        visualization=False,
        reward_fn='competition_reward',
        termination_fn=term_fn,
        initializer='training_init',
        episode_length=3750,
        monitor=False,
        seed=0,
        norm_observations=True,
        max_torque=0.0,
        max_position=0.0,  # set all residual actions to 0
        denylist_states=['FailureState'],
        domain_randomization=use_domain_rand
    )
    env.steps = n
    env.find_norm_params()

    def _variance(std):
        if std is None:
            return None
        return std ** 2

    return env.mean, nest.map_structure(_variance, env.std), env
def step(self):
    """Run one optimization step on the next batch.

    When the data iterator is exhausted, the epoch counter is incremented,
    the iterator is dropped (it is recreated on the next call) and no
    update is performed. Otherwise the negative mean log-likelihood of the
    batch actions is minimized and logged.

    Returns
    -------
    int: the current value of ``self.epochs``.
    """
    # Fetch the next batch, creating the iterator lazily.
    if self._diter is None:
        self._diter = iter(self.dtrain)
    try:
        batch = next(self._diter)
    except StopIteration:
        self.epochs += 1
        self._diter = None
        return self.epochs

    ob, ac = nest.map_structure(lambda x: x.to(self.device), batch)

    # Compute the loss.
    self.model.train()
    loss = -self.model(ob).log_prob(ac).mean()
    logger.add_scalar('train/loss', loss.detach().cpu().numpy(), self.t,
                      time.time())

    # Update the model.
    self.opt.zero_grad()
    loss.backward()
    self.opt.step()

    # Advance the sample counter by the batch size, capped at the number
    # of samples left before the next multiple of len(self.data).
    n_data = len(self.data)
    self.t += min(n_data - (self.t % n_data), self.batch_size)
    return self.epochs
def store_observation(self, obs):
    """Store the i-th slice of `obs` in the i-th buffer.

    Returns
    -------
    list: the index returned by each buffer's `store_observation`.
    """
    inds = [
        buf.store_observation(nest.map_structure(lambda x: x[i], obs))
        for i, buf in enumerate(self.buffers)
    ]
    self._update_num_in_buffer()
    return inds
def sample(self, batch_size):
    """Sample a batch from the buffer as tensors on ``self.device``."""
    def _to_torch(x):
        return torch.from_numpy(x).to(self.device)

    numpy_batch = self.buffer.sample(batch_size)
    return nest.map_structure(_to_torch, numpy_batch)
def _reset_done_envs(self):
    """Reset finished (or never stepped) envs and return stacked obs.

    Envs whose stored transition is None or reports done are reset and
    their transition is cleared; for the others the observation of the
    stored transition is reused.
    """
    obs = []
    for e in range(self.num_envs):
        transition = self.transitions[e]
        still_running = transition is not None and not transition[2]
        if still_running:
            obs.append(transition[0])
        else:
            self.transitions[e] = None
            obs.append(self.envs[e].reset())
    return nest.map_structure(np.stack, nest.zip_structure(*obs))
def log_prob(self, ac):
    """Return the sum of the component distributions' log probs of `ac`."""
    def _component_log_prob(pair):
        dist, action = pair
        return dist.log_prob(action)

    paired = nest.zip_structure(self.dists, ac)
    per_component = nest.map_structure(_component_log_prob, paired)
    return sum(nest.flatten(per_component))