def _decide_action(self, s, g):
    """Pick an action (and its log-probability) for state `s` and goal `g`.

    The actor is evaluated with `self.sess` and `self.graph` installed as
    the defaults. `self._exp_action` is updated as a side effect: it holds
    the result of `self._enable_stoch_policy()` when that is falsy,
    otherwise the outcome of `MathUtil.flip_coin` called with
    `self.exp_params_curr.rate`.

    Args:
        s: State passed through to `self._eval_actor`.
        g: Goal passed through to `self._eval_actor`.

    Returns:
        A tuple `(a, logp)` holding element 0 of each of the two values
        returned by `self._eval_actor` (presumably the single entry of a
        batch of size one -- confirm against `_eval_actor`).
    """
    with self.sess.as_default(), self.graph.as_default():
        stoch_enabled = self._enable_stoch_policy()
        # Only flip the coin when the stochastic policy is enabled; when it
        # is not, keep whatever falsy value the check returned.
        self._exp_action = (
            MathUtil.flip_coin(self.exp_params_curr.rate)
            if stoch_enabled
            else stoch_enabled
        )
        actions, logps = self._eval_actor(s, g, self._exp_action)

    return actions[0], logps[0]
def _decide_action(self, s, g):
    """Pick an action for state `s` and goal `g`, optionally adding noise.

    The actor is evaluated with `self.sess` and `self.graph` installed as
    the defaults, and element 0 of its output is used as the action. When
    `self._enable_stoch_policy()` is truthy and
    `MathUtil.flip_coin(self.exp_params_curr.rate)` succeeds
    (epsilon-greedy), standard-normal noise scaled by
    `self.exp_params_curr.noise` and then by `self.a_norm.std` is added to
    the action in place.

    Side effects:
        Sets `self._exp_action` to True when noise was applied, and to
        False otherwise.

    Args:
        s: State passed through to `self._eval_actor`.
        g: Goal passed through to `self._eval_actor`.

    Returns:
        A tuple `(a, logp)`. `logp` is 0 when no noise was applied;
        otherwise it is `self._calc_action_logp` evaluated on the noise
        before the `self.a_norm.std` scaling.
    """
    with self.sess.as_default(), self.graph.as_default():
        self._exp_action = False
        action = self._eval_actor(s, g)[0]
        log_prob = 0

        # epsilon-greedy: the coin is only flipped when the stochastic
        # policy is enabled (short-circuit `and`).
        explore = self._enable_stoch_policy() and MathUtil.flip_coin(
            self.exp_params_curr.rate
        )
        if explore:
            scaled_noise = np.random.randn(*action.shape)
            scaled_noise *= self.exp_params_curr.noise
            # In-place add keeps the dtype/buffer of the actor output.
            action += scaled_noise * self.a_norm.std

            log_prob = self._calc_action_logp(scaled_noise)
            self._exp_action = True

    return action, log_prob