def run(self):
    """Run self.T training episodes of on-policy TD(lambda) control.

    Structure matches Sarsa(lambda) with linear function approximation:
    per step, every feature's eligibility trace is decayed, the active
    features of (s, a) (from _phi) get an accumulating +1, and the TD
    error is fed to self._update_theta.  After each episode the current
    value estimate (self.extract_q()) is appended to
    self.learning_curve together with the episode index.

    Returns:
        self (allows call chaining).
    """
    t = 0
    while t < self.T:
        # Traces are per-episode: start each episode from zero.
        self.e_trace.clear()
        s = generate_initial_state()
        a = self._choose_action(s)
        while True:
            # Decay all traces by lambda.  No discount factor appears
            # anywhere here -- presumably gamma == 1 for this task
            # (TODO confirm against the environment definition).
            for i in range(len(_feature_space)):
                self.e_trace[i] *= self.lambda_
            # Accumulating traces: bump each feature active for (s, a).
            f_a = _phi(s, a)
            for i in f_a:
                self.e_trace[i] += 1
            s1, r = step(s, a)
            # TD error is built in two parts: delta = r - q(s, a) now...
            delta = r - self.q(s, a)
            if is_episode_terminated(r, a):
                # Terminal transition: the target is just r, so delta
                # is already complete -- update and end the episode.
                self._update_theta(delta)
                break
            s = s1
            a = self._choose_action(s)
            # ...and + q(s', a') once the next action has been chosen
            # (on-policy bootstrap).
            delta += self.q(s, a)
            self._update_theta(delta)
        self.learning_curve.append((t, self.extract_q()))
        t += 1
    return self
def _update_state(self, s, a, r, s1, a1):
    """Apply one tabular Sarsa(lambda) backup for (s, a, r, s1, a1).

    Computes the TD error r + Q(s1, a1) - Q(s, a) (bootstrapping with
    0.0 on terminal transitions), bumps the accumulating trace for
    (s, a), then sweeps the entire (state, action) table: each Q entry
    moves by alpha * delta * trace and each trace decays by lambda.
    Finally the visit counters are refreshed, the per-pair step size is
    set to alpha = 1/N(s, a), and eta = N0/(N0 + N(s)) is recomputed
    (presumably the exploration rate used by action selection --
    TODO confirm).
    """
    # Bootstrap value of the successor pair; zero past the terminal state.
    if is_episode_terminated(r, a):
        q_next = 0.0
    else:
        q_next = self.Q[(s1, a1)]
    delta = r + q_next - self.Q[(s, a)]

    # Accumulating eligibility trace for the pair just visited.
    self.e_trace[(s, a)] += 1

    # Full-table sweep: credit every pair proportionally to its trace,
    # then decay the trace.
    for card in range(1, NUMBER_OF_CARDS + 1):
        for points in range(1, MAX_POINTS + 1):
            state = card, points
            for action in ACTIONS:
                key = (state, action)
                self.Q[key] += self.alpha[key] * delta * self.e_trace[key]
                self.e_trace[key] *= self.lambda_

    # Visit counts drive the decaying schedules below.  Note alpha for
    # (s, a) is refreshed AFTER the sweep, so this backup used the
    # step size from the previous visit.
    self.N[s] += 1
    self.N[(s, a)] += 1
    self.alpha[(s, a)] = 1 / self.N[(s, a)]
    self.eta[s] = self.N0 / (self.N0 + self.N[s])