def select_action(self, observation: types.ObservationType) -> types.ActionData:
    """Pick an action epsilon-greedily, sampling from the policy otherwise.

    A uniform random number is drawn first. If it falls below
    ``self.epsilon`` the action is drawn uniformly from
    ``self.action_space``; otherwise the action is drawn from
    ``self.action_space`` using the policy's distribution for
    ``observation`` as the sampling probabilities.

    Args:
        observation: Observation forwarded to ``self.policy.get_distribution``.

    Returns:
        An ``ActionData`` wrapping the selected action.
    """
    explore = np.random.uniform() < self.epsilon
    if explore:
        # Exploration branch: the policy is not consulted at all.
        chosen = np.random.choice(self.action_space)
    else:
        action_probs = self.policy.get_distribution(observation)
        chosen = np.random.choice(self.action_space, p=action_probs)
    return types.ActionData(action=chosen)
def select_action(self, observation: types.ObservationType) -> types.ActionData:
    """Record the observation, then pick an action epsilon-greedily.

    The observation is always appended to ``self._memory_buffer`` before
    any decision is made, so the buffer is updated on exploratory steps
    too. With probability ``self.epsilon`` a uniformly random element of
    ``self.action_space`` is returned; otherwise the buffer contents are
    converted to an array, passed to the policy, and the argmax of the
    resulting distribution is returned.

    Args:
        observation: Latest observation; stored in the memory buffer.

    Returns:
        An ``ActionData`` wrapping the selected action. On the greedy
        branch this is the index returned by ``np.argmax``.
    """
    self._memory_buffer.append(observation)

    explore = np.random.uniform() < self.epsilon
    if explore:
        chosen = np.random.choice(self.action_space)
    else:
        # The policy receives the whole buffered history, not just the
        # latest observation.
        history = np.asarray([*self._memory_buffer])
        scores = self.policy.get_distribution(history)
        chosen = np.argmax(scores)
    return types.ActionData(action=chosen)
def select_action(self, observation: types.ObservationType) -> types.ActionData:
    """Pick the argmax of the policy distribution after adding Gaussian noise.

    Zero-mean normal noise with standard deviation ``self.sigma`` is drawn
    with one sample per entry of ``self.action_space`` and added to the
    distribution returned by the policy for ``observation``. The index of
    the largest perturbed value is returned as the action.

    Args:
        observation: Observation forwarded to ``self.policy.get_distribution``.

    Returns:
        An ``ActionData`` wrapping the index chosen by ``np.argmax``.
    """
    distribution = self.policy.get_distribution(observation)
    noise = np.random.normal(
        loc=0., scale=self.sigma, size=len(self.action_space))
    # Add out-of-place: an in-place `+=` would mutate whatever object the
    # policy handed back (possibly cached or shared), fail to cast on an
    # integer-dtype array, and *extend* a plain list rather than add to it.
    noisy_distribution = distribution + noise
    return types.ActionData(action=np.argmax(noisy_distribution))