Example #1
    def run(self):
        """
        Run SARSA(lambda) for self.T episodes and record the learning curve.
        """
        t = 0

        while t < self.T:
            # Reset the eligibility traces at the start of each episode.
            self.e_trace.clear()
            s = generate_initial_state()
            a = self._choose_action(s)

            while True:
                # Decay every trace by lambda (gamma = 1 here).
                for i in range(len(_feature_space)):
                    self.e_trace[i] *= self.lambda_

                # Indices of the binary features active for (s, a).
                f_a = _phi(s, a)

                for i in f_a:
                    self.e_trace[i] += 1

                # Take one environment step and start accumulating the
                # TD error: delta = r - q(s, a) so far.
                s1, r = step(s, a)
                delta = r - self.q(s, a)

                if is_episode_terminated(r, a):
                    # Terminal step: the successor value is zero.
                    self._update_theta(delta)
                    break

                # Non-terminal step: choose a', complete the SARSA error
                # delta = r + q(s', a') - q(s, a), then update the weights.
                s = s1
                a = self._choose_action(s)
                delta += self.q(s, a)
                self._update_theta(delta)
            self.learning_curve.append((t, self.extract_q()))
            t += 1
        return self
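
Example #1 relies on module-level helpers (generate_initial_state, step, is_episode_terminated, _phi, _feature_space) that this page does not show. For a self-contained picture of the same technique, here is a minimal SARSA(lambda) sketch with binary features on a toy 5-state chain; every name and constant below is illustrative and assumed, not taken from the project.

import random
from collections import defaultdict

# Toy setup: 5 chain states, two actions, illustrative constants only.
N_STATES = 5
ACTIONS = (0, 1)            # 0 = step left, 1 = step right
LAMBDA, ALPHA, EPS = 0.9, 0.1, 0.1

def phi(s, a):
    # One active binary feature per (state, action) pair.
    return [s * len(ACTIONS) + a]

theta = [0.0] * (N_STATES * len(ACTIONS))

def q(s, a):
    return sum(theta[i] for i in phi(s, a))

def choose_action(s):
    # Epsilon-greedy over the linear value estimates.
    if random.random() < EPS:
        return random.choice(ACTIONS)
    return max(ACTIONS, key=lambda act: q(s, act))

for episode in range(500):
    e_trace = defaultdict(float)
    s = 0
    a = choose_action(s)
    while True:
        # Decay all traces, then bump the features active for (s, a).
        for i in list(e_trace):
            e_trace[i] *= LAMBDA
        for i in phi(s, a):
            e_trace[i] += 1.0
        s1 = min(s + 1, N_STATES - 1) if a == 1 else max(s - 1, 0)
        r = 1.0 if s1 == N_STATES - 1 else 0.0
        done = s1 == N_STATES - 1
        delta = r - q(s, a)
        if not done:
            a1 = choose_action(s1)
            delta += q(s1, a1)          # gamma = 1, matching the snippet
        # Gradient step along the eligibility traces.
        for i, z in e_trace.items():
            theta[i] += ALPHA * delta * z
        if done:
            break
        s, a = s1, a1

The trace handling mirrors the snippet above: decay all traces, bump the active features, then step the weights along the traces.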
Example #2
File: question3.py Project: terry2012/RL
    def _update_state(self, s, a, r, s1, a1):
        # SARSA TD error with gamma = 1: delta = r + Q(s', a') - Q(s, a),
        # where Q(s', a') is taken as 0.0 on the terminal step.
        q = self.Q[(s1, a1)] if not is_episode_terminated(r, a) else 0.0
        delta = r + q - self.Q[(s, a)]
        # Accumulating eligibility trace for the visited pair.
        self.e_trace[(s, a)] += 1
        # Sweep the whole tabular state-action space: move every entry
        # along its trace, then decay the trace by lambda.
        for i in range(1, NUMBER_OF_CARDS+1):
            for j in range(1, MAX_POINTS+1):
                st = i, j
                for act in ACTIONS:
                    self.Q[(st, act)] += self.alpha[(st, act)] * delta * self.e_trace[(st, act)]
                    self.e_trace[(st, act)] *= self.lambda_
        # Visit counts drive the schedules: alpha(s, a) = 1 / N(s, a)
        # and eta(s) = N0 / (N0 + N(s)).
        self.N[s] += 1
        self.N[(s, a)] += 1
        self.alpha[(s, a)] = 1/self.N[(s, a)]
        self.eta[s] = self.N0/(self.N0 + self.N[s])
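
The episode loop that calls _update_state does not appear on this page. Assuming the same style of module-level helpers as in Example #1 (generate_initial_state, step, is_episode_terminated), a hypothetical driver method could look like the following; eta[s] is presumably the epsilon used by _choose_action's epsilon-greedy schedule, but that is inferred, not shown here.

    # Hypothetical driver; not from question3.py. Assumes the helper
    # functions generate_initial_state, step, and is_episode_terminated
    # exist in this module, as in Example #1.
    def run_episode(self):
        s = generate_initial_state()
        a = self._choose_action(s)
        while True:
            s1, r = step(s, a)
            if is_episode_terminated(r, a):
                # Terminal step: a1 is never read inside _update_state,
                # because the Q(s', a') lookup is skipped on termination.
                self._update_state(s, a, r, s1, None)
                break
            a1 = self._choose_action(s1)
            self._update_state(s, a, r, s1, a1)
            s, a = s1, a1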