def react(self, h, a):
    """Return the next percept for action ``a`` and update the state.

    The current state is recorded (as a one-element list) through
    ``self.sm.hm.record`` before anything else happens.

    Args:
        h: History object; ``h.stats`` is looked up under this object's
            class name and then under the ``':)'`` key (default ``0.0``).
        a: The action to react to.

    Returns:
        ``':('`` when ``a`` is not the entry of ``self.optimal_actions``
        for the current state; otherwise whatever ``grl.epsilon_sample``
        draws from the percept space.
    """
    own_stats = h.stats.get(type(self).__name__, dict())
    happy_ratio = own_stats.get(':)', 0.0)

    self.sm.hm.record([self.sm.state])
    state = self.sm.state

    acted_optimally = a == self.optimal_actions[state]
    if not acted_optimally:
        # A wrong action leaves the state untouched.
        return ':('

    # The third argument shrinks as the recorded ':)' ratio grows; its exact
    # meaning is defined by grl.epsilon_sample.
    e_next = grl.epsilon_sample(self.pm.percept_space, ':)', 1 - happy_ratio)
    if e_next == ':)':
        # Step to the following state, wrapping around at the end.
        self.sm.state = (state + 1) % len(self.sm.state_space)
    return e_next
def react(self, h, a):
    """Return the next percept for action ``a`` and update the state.

    The current state is recorded (as a one-element list) through
    ``self.sm.hm.record`` before anything else happens.

    Args:
        h: History object; ``h.stats`` is looked up under this object's
            class name and then under the ``'#'`` key (default
            ``self.pmin``).
        a: The action to react to.

    Returns:
        ``'@'`` (after resetting the state to ``0``) when ``a`` is not the
        entry of ``self.optimal_actions`` for the current state; otherwise
        whatever ``grl.epsilon_sample`` draws from the percept space.
    """
    own_stats = h.stats.get(type(self).__name__, dict())
    bang_ratio = own_stats.get('#', self.pmin)

    self.sm.hm.record([self.sm.state])
    state = self.sm.state

    acted_optimally = a == self.optimal_actions[state]
    if not acted_optimally:
        # A wrong action sends the domain back to state 0.
        self.sm.state = 0
        return '@'

    # The third argument shrinks as the recorded '#' ratio grows; its exact
    # meaning is defined by grl.epsilon_sample.
    e_next = grl.epsilon_sample(self.pm.percept_space, '#', 1 - bang_ratio)

    # The state only advances when the percept matches the one paired with
    # the current state: '#' in state 0, '@' in state 1.
    matches_state = (e_next == '#' and state == 0) or (e_next == '@' and state == 1)
    if matches_state:
        self.sm.state = (state + 1) % len(self.sm.state_space)
    return e_next
def act(self, h):
    """Refresh the policy/value tables, then pick an action for ``h``.

    Args:
        h: History passed to ``self.hm.state`` to obtain the state index.

    Returns:
        The result of ``grl.epsilon_sample`` over ``self.am.action_space``,
        given the argmax of the policy row for the current state and
        ``self.xpl``.
    """
    # One call to grl.PITabular with steps=1 and vi_steps=1; the result
    # replaces the stored policy and value estimates.
    refreshed = grl.PITabular(
        self.p, self.r, self.v, self.pi,
        g=self.g, steps=1, vi_steps=1,
    )
    self.pi, self.v = refreshed

    # Oracle Alert!
    # The state lookup is handed self.oracle as its q_func.
    state = self.hm.state(h, g=self.g, q_func=self.oracle)

    actions = self.am.action_space
    preferred = self.pi[state].argmax()
    return grl.epsilon_sample(actions, preferred, self.xpl)
def reset(self):
    """Replace the current state with a fresh draw from the state space.

    The draw is made by ``grl.epsilon_sample`` called with only the state
    space as its argument.
    """
    candidates = self.sm.state_space
    self.sm.state = grl.epsilon_sample(candidates)
def setup(self):
    """Define the state and action spaces and draw an initial state.

    The state space is ``['s-left', 's-right']`` and the action space is
    ``['left', 'right']``. The initial state is drawn by
    ``grl.epsilon_sample`` called with only the state space.
    """
    state_manager = self.sm
    state_manager.state_space = ['s-left', 's-right']
    self.am.action_space = ['left', 'right']
    state_manager.state = grl.epsilon_sample(state_manager.state_space)
def act(self, h):
    """Return a draw from ``self.am.actions``; the history is ignored.

    NOTE(review): this reads ``self.am.actions``, whereas the other agents
    in this file read ``self.am.action_space`` -- confirm that the
    attribute exists on the action manager.

    Args:
        h: History; not used by this method.

    Returns:
        The result of ``grl.epsilon_sample`` called with only the actions.
    """
    candidates = self.am.actions
    return grl.epsilon_sample(candidates)
def act(self, h):
    """Choose, store and return an action for the state derived from ``h``.

    Args:
        h: History passed to ``self.hm.state`` to obtain the state key.

    Returns:
        The action stored on ``self.am.action``.
    """
    state = self.hm.state(h)
    actions = self.am.action_space

    # max() picks the largest entry of self.Q[state]; index 1 of that entry
    # is used as the preferred action (presumably entries are
    # (value, action) pairs -- verify against where Q is filled).
    top_entry = max(self.Q[state])
    preferred = top_entry[1]

    self.am.action = grl.epsilon_sample(actions, preferred, 0.1)
    return self.am.action
def start(self, e=None, order=1):
    """Store ``order``, draw a first action and return it.

    Args:
        e: Not used by this method.
        order: Value stored on ``self.order``.

    Returns:
        The action stored on ``self.am.action``, drawn by
        ``grl.epsilon_sample`` called with only the action space.
    """
    self.order = order
    first_action = grl.epsilon_sample(self.am.action_space)
    self.am.action = first_action
    return self.am.action