def select_action(self, obs):
    if self.is_continuous:
        if self._share_net:
            mu, log_std, value = self.net(obs, rnncs=self.rnncs)  # [B, A]
            self.rnncs_ = self.net.get_rnncs()
        else:
            mu, log_std = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            self.rnncs_ = self.actor.get_rnncs()
            value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
        dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
        action = dist.sample().clamp(-1, 1)  # [B, A]
        log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    else:
        if self._share_net:
            logits, value = self.net(obs, rnncs=self.rnncs)  # [B, A], [B, 1]
            self.rnncs_ = self.net.get_rnncs()
        else:
            logits = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            self.rnncs_ = self.actor.get_rnncs()
            value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
        norm_dist = td.Categorical(logits=logits)
        action = norm_dist.sample()  # [B,]
        log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    acts_info = Data(action=action,
                     value=value,
                     log_prob=log_prob + th.finfo().eps)
    if self.use_rnn:
        acts_info.update(rnncs=self.rnncs)
    return action, acts_info
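
# A minimal, standalone sketch of the continuous sampling path used above,
# showing the shape bookkeeping of Independent(Normal, 1). The batch size
# B=4 and action dim A=2 are made-up illustration values, not part of the
# library code.
#
#     import torch as th
#     import torch.distributions as td
#
#     B, A = 4, 2
#     mu, log_std = th.zeros(B, A), th.full((B, A), -0.5)
#
#     # Independent(..., 1) treats the last dim as an event dim, so
#     # log_prob sums over the action dimension and returns shape [B].
#     dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
#     action = dist.sample().clamp(-1, 1)             # [B, A]
#     log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
#     assert action.shape == (B, A) and log_prob.shape == (B, 1)
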
def select_action(self, obs):
    output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
    self.rnncs_ = self.actor.get_rnncs()
    if self.is_continuous:
        mu, log_std = output  # [B, A]
        dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
        action = dist.sample().clamp(-1, 1)  # [B, A]
    else:
        logits = output  # [B, A]
        norm_dist = td.Categorical(logits=logits)
        action = norm_dist.sample()  # [B,]
    acts_info = Data(action=action)
    if self.use_rnn:
        acts_info.update(rnncs=self.rnncs)
    return action, acts_info
def episode_step(self, obs: Data, env_rets: Data, begin_mask: np.ndarray):
    super().episode_step()
    if self._store:
        exps = Data(obs=obs,
                    # [B, ] => [B, 1]
                    reward=env_rets.reward[:, np.newaxis],
                    obs_=env_rets.obs_fs,
                    done=env_rets.done[:, np.newaxis],
                    begin_mask=begin_mask)
        exps.update(self._acts_info)
        self._buffer.add({self._agent_id: exps})

    idxs = np.where(env_rets.done)[0]
    self._pre_act[idxs] = 0.
    self.rnncs = self.rnncs_
    if self.rnncs is not None:
        for k in self.rnncs.keys():
            self.rnncs[k][idxs] = 0.
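
# A standalone sketch of the per-copy reset pattern at the end of
# episode_step: when environment copy i reports done, its recurrent cell
# state is zeroed so the next episode starts from a clean hidden state.
# The key names and shapes below are illustrative, not the library's.
#
#     import numpy as np
#
#     n_copies, hidden = 4, 8
#     rnncs = {'hx': np.ones((n_copies, hidden)),
#              'cx': np.ones((n_copies, hidden))}
#     done = np.array([False, True, False, True])
#
#     idxs = np.where(done)[0]      # indices of finished copies, e.g. [1, 3]
#     for k in rnncs.keys():
#         rnncs[k][idxs] = 0.       # zero only the finished copies
#     assert rnncs['hx'][1].sum() == 0 and rnncs['hx'][0].sum() == hidden
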
def select_action(self, obs):
    # [B, P], [B, P, A], [B, P]
    (q, pi, beta) = self.net(obs, rnncs=self.rnncs)
    self.rnncs_ = self.net.get_rnncs()
    options_onehot = F.one_hot(self.options, self.options_num).float()  # [B, P]
    options_onehot_expanded = options_onehot.unsqueeze(-1)  # [B, P, 1]
    pi = (pi * options_onehot_expanded).sum(-2)  # [B, A]
    if self.is_continuous:
        mu = pi  # [B, A]
        log_std = self.log_std[self.options]  # [B, A]
        dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
        action = dist.sample().clamp(-1, 1)  # [B, A]
        log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    else:
        logits = pi  # [B, A]
        norm_dist = td.Categorical(logits=logits)
        action = norm_dist.sample()  # [B,]
        log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    value = q_o = (q * options_onehot).sum(-1, keepdim=True)  # [B, 1]
    beta_adv = q_o - ((1 - self.eps) * q.max(-1, keepdim=True)[0]
                      + self.eps * q.mean(-1, keepdim=True))  # [B, 1]
    max_options = q.argmax(-1)  # [B, P] => [B, ]
    beta_probs = (beta * options_onehot).sum(-1)  # [B, P] => [B,]
    beta_dist = td.Bernoulli(probs=beta_probs)
    # a sample < 1 keeps the current option; a sample of 1 switches the option
    new_options = th.where(beta_dist.sample() < 1, self.options, max_options)
    self.new_options = th.where(self._done_mask, max_options, new_options)
    self.oc_mask = (self.new_options == self.options).float()
    acts_info = Data(action=action,
                     value=value,
                     log_prob=log_prob + th.finfo().eps,
                     beta_advantage=beta_adv + self.dc,
                     last_options=self.options,
                     options=self.new_options,
                     reward_offset=-((1 - self.oc_mask) * self.dc).unsqueeze(-1))
    if self.use_rnn:
        acts_info.update(rnncs=self.rnncs)
    return action, acts_info
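
# A standalone sketch of the option-switching rule above (option-critic
# style): a Bernoulli "termination" sample of 0 keeps the current option,
# while a sample of 1 switches to the greedy option argmax_w Q(s, w).
# B=4 copies and P=3 options are illustrative values only.
#
#     import torch as th
#     import torch.distributions as td
#
#     B, P = 4, 3
#     q = th.randn(B, P)                     # option values Q(s, w)
#     options = th.tensor([0, 1, 2, 0])      # currently active option per copy
#     beta_probs = th.full((B,), 0.3)        # termination prob of the active option
#
#     terminate = td.Bernoulli(probs=beta_probs).sample()  # 0. or 1. per copy
#     max_options = q.argmax(-1)                            # greedy option, [B]
#     new_options = th.where(terminate < 1, options, max_options)
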
def select_action(self, obs):
    output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
    self.rnncs_ = self.actor.get_rnncs()
    value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
    if self.is_continuous:
        mu, log_std = output  # [B, A]
        dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
        action = dist.sample().clamp(-1, 1)  # [B, A]
        log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    else:
        logits = output  # [B, A]
        logp_all = logits.log_softmax(-1)  # [B, A]
        norm_dist = td.Categorical(logits=logp_all)
        action = norm_dist.sample()  # [B,]
        log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]
    acts_info = Data(action=action,
                     value=value,
                     log_prob=log_prob + th.finfo().eps)
    if self.use_rnn:
        acts_info.update(rnncs=self.rnncs)
    if self.is_continuous:
        acts_info.update(mu=mu, log_std=log_std)
    else:
        acts_info.update(logp_all=logp_all)
    return action, acts_info
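
# A standalone sketch of what storing logp_all (the old policy's full
# log-softmax) enables: presumably an exact KL divergence between the old
# and new discrete policies at update time, as used by trust-region-style
# updates. B=4 and A=3 are illustrative values, not from the library.
#
#     import torch as th
#
#     B, A = 4, 3
#     logp_all_old = th.randn(B, A).log_softmax(-1)  # stored at rollout time
#     logp_all_new = th.randn(B, A).log_softmax(-1)  # recomputed at update time
#
#     # KL(old || new) = sum_a p_old(a) * (log p_old(a) - log p_new(a))
#     kl = (logp_all_old.exp()
#           * (logp_all_old - logp_all_new)).sum(-1, keepdim=True)  # [B, 1]
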
def get_obs(self, behavior_names=None, only_obs=False):
    """
    Parse the information returned by the environment and split it into four
    parts: vector observations, visual observations, reward, and the done signal.
    """
    behavior_names = behavior_names or self.behavior_names
    whole_done = np.full(self._n_copies, False)
    whole_info_max_step = np.full(self._n_copies, False)
    all_obs_fa, all_obs_fs = {}, {}
    all_reward = {}
    for bn in behavior_names:
        ps = []
        # TODO: optimize
        while True:
            ds, ts = self.env.get_steps(bn)
            if len(ts):
                ps.append(ts)
            if len(ds) == self._n_copies:
                break
            elif len(ds) == 0:
                # some environments are done, but the others are not yet
                self.env.step()
            else:
                raise ValueError(
                    f'agents number error. Expected 0 or {self._n_copies}, received {len(ds)}'
                )
        obs_fs, reward = ds.obs, ds.reward
        obs_fa = deepcopy(obs_fs)
        done = np.full(self._n_copies, False)
        begin_mask = np.full(self._n_copies, False)
        info_max_step = np.full(self._n_copies, False)
        info_real_done = np.full(self._n_copies, False)
        for ts in ps:  # TODO: optimize
            _ids = ts.agent_id
            reward[_ids] = ts.reward
            info_max_step[_ids] = ts.interrupted  # terminated because the episode reached its max step count
            # ignore dones caused by max_step; only record dones caused by failure/success
            info_real_done[_ids[~ts.interrupted]] = True
            done[_ids] = True
            begin_mask[_ids] = True
            # zip: vector, visual, ...
            for _obs, _tobs in zip(obs_fa, ts.obs):
                _obs[_ids] = _tobs
        if self._real_done:
            done = np.array(info_real_done)

        _obs_fa = Data()
        _obs_fs = Data()
        if len(self._vector_idxs[bn]) > 0:
            _obs_fa.update(vector={
                f'vector_{i}': obs_fa[vi]
                for i, vi in enumerate(self._vector_idxs[bn])
            })
            _obs_fs.update(vector={
                f'vector_{i}': obs_fs[vi]
                for i, vi in enumerate(self._vector_idxs[bn])
            })
        if len(self._visual_idxs[bn]) > 0:
            _obs_fa.update(visual={
                f'visual_{i}': obs_fa[vi]
                for i, vi in enumerate(self._visual_idxs[bn])
            })
            _obs_fs.update(visual={
                f'visual_{i}': obs_fs[vi]
                for i, vi in enumerate(self._visual_idxs[bn])
            })
        all_obs_fa[bn] = _obs_fa
        all_obs_fs[bn] = _obs_fs
        all_reward[bn] = reward
        whole_done = np.logical_or(whole_done, done)
        whole_info_max_step = np.logical_or(whole_info_max_step, info_max_step)

    if only_obs:
        all_obs_fa.update({
            'global': Data(begin_mask=np.full((self._n_copies, 1), True))
        })
        return all_obs_fa
    else:
        rets = {}
        for bn in self.behavior_names:
            rets[bn] = Data(obs_fa=all_obs_fa[bn],
                            obs_fs=all_obs_fs[bn],
                            reward=all_reward[bn],
                            done=whole_done,
                            info=dict(max_step=whole_info_max_step))
        rets.update({'global': Data(begin_mask=begin_mask[:, np.newaxis])})  # [B, 1]
        return rets
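
# A standalone, numpy-only sketch of the decision/terminal-step merge that
# get_obs performs, without the ML-Agents DecisionSteps/TerminalSteps
# objects. The agent ids, rewards, and the interrupted flags are made up
# for illustration.
#
#     import numpy as np
#
#     n_copies = 4
#     reward = np.zeros(n_copies)              # rewards from the decision step
#     done = np.full(n_copies, False)
#     info_max_step = np.full(n_copies, False)
#     info_real_done = np.full(n_copies, False)
#
#     # pretend copies 1 and 3 produced terminal steps; copy 3 hit the time limit
#     term_ids = np.array([1, 3])
#     term_reward = np.array([1.0, -0.2])
#     interrupted = np.array([False, True])    # True => ended by max_step
#
#     reward[term_ids] = term_reward
#     info_max_step[term_ids] = interrupted
#     info_real_done[term_ids[~interrupted]] = True  # "real" dones exclude time limits
#     done[term_ids] = True
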