Example #1
    def __call__(self, percept):
        s1, r1 = percept
        mdp = self.mdp
        R, P, terminals, pi = mdp.reward, mdp.P, mdp.terminals, self.pi
        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U

        if s1 not in self.visited:  # Reward is only known for visited states.
            U[s1] = R[s1] = r1
            self.visited.add(s1)
        if s is not None:
            Nsa[(s, a)] += 1
            Ns1_sa[(s1, s, a)] += 1
            # for each t such that Ns′|sa [t, s, a] is nonzero
            for t in [res for (res, state, act), freq in Ns1_sa.items()
                      if (state, act) == (s, a) and freq != 0]:
                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]

        self.U = policy_evaluation(pi, U, mdp)
        self.Nsa, self.Ns1_sa = Nsa, Ns1_sa
        if s1 in terminals:
            self.s = self.a = None
        else:
            self.s, self.a = s1, self.pi[s1]
        return self.a
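
This appears to be the PassiveADPAgent.__call__ from aima-python's rl.py. It presupposes state set up in the agent's constructor, which the listing omits; below is a minimal sketch of that state (attribute names are taken from the example, the class name and the defaultdict choice are assumptions):

from collections import defaultdict

class PassiveADPAgentSketch:
    """Hypothetical constructor showing the attributes the example relies on."""

    def __init__(self, pi, mdp):
        self.pi = pi                    # fixed policy being evaluated
        self.mdp = mdp                  # model MDP with an editable transition table P
        self.U = {}                     # learned utilities
        self.Nsa = defaultdict(int)     # counts of (s, a) pairs
        self.Ns1_sa = defaultdict(int)  # counts of observed (s1, s, a) transitions
        self.s = self.a = None          # previous state and action
        self.visited = set()            # states whose reward has been observed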
Example #2
    def __call__(self, percept):
        """What you need to do:
        1. update the transistion mdp.P by current <s,a,r>, here a is in the MDP, <s,r> is in the percept
        2. update the value function self.U and policy self.pi by policy_evaluation--Implemented in mdp.py
        3. through the policy get the current action self.a (self.pi is pre-defined)
        Eventually, you only need to get the value function. It is a non-learning agent.
        Tips: How to deal with the terminal states ?"""
        s1, r1 = percept
        mdp = self.mdp
        R, P, terminals, pi = mdp.reward, mdp.P, mdp.terminals, self.pi
        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U

        if s1 not in self.visited:  # Reward is only known for visited states.
            U[s1] = R[s1] = r1
            self.visited.add(s1)
        if s is not None:
            Nsa[(s, a)] += 1
            Ns1_sa[(s1, s, a)] += 1
            # for each t such that Ns′|sa [t, s, a] is nonzero
            for t in [
                    res for (res, state, act), freq in Ns1_sa.items()
                    if (state, act) == (s, a) and freq != 0
            ]:
                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]

        self.U = policy_evaluation(pi, U, mdp)
        self.Nsa, self.Ns1_sa = Nsa, Ns1_sa
        if s1 in terminals:
            self.s = self.a = None
        else:
            self.s, self.a = s1, self.pi[s1]
        return self.a
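
A passive agent like this is driven by repeatedly handing it (state, reward) percepts until it returns None at a terminal state; in aima-python this is what run_single_trial in rl.py does. A minimal sketch of such a loop, assuming an AIMA-style MDP with R(state) and init, plus a hypothetical sample_next_state helper that draws s' from the true model:

def run_trial(agent, true_mdp, sample_next_state):
    """Drive the agent through one trial of the true MDP.

    sample_next_state(mdp, s, a) is a hypothetical helper that samples the
    successor state from the real transition model."""
    s = true_mdp.init
    while True:
        percept = (s, true_mdp.R(s))   # the agent only ever sees (state, reward)
        a = agent(percept)             # updates the learned model, returns pi[s] or None
        if a is None:                  # terminal state reached
            break
        s = sample_next_state(true_mdp, s, a)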
Example #3
 def program(self, percept):
     s1,r1 = percept
     mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa
     if s1 not in mdp.reward: # mdp.reward also tracks the visited states
         U[s1] = mdp.reward[s1] = r1
     if s is not None:
         Nsa[s][a] += 1
         Ns_sa[s][a][s1] += 1
         for t in Ns_sa[s][a]:
             if Ns_sa[s][a][t] > 0:
                 self.mdp.T_set((s,a,t), Ns_sa[s][a][t]/Nsa[s][a])
     self.U = policy_evaluation(self.pi, U, mdp)  # store the re-evaluated utilities on the agent
     if s1 in mdp.terminals:
         self.s = self.a = None
     else:
         self.s, self.a = s1, self.pi[s1]
     return self.a
Example #4
 def program(self, percept):
     s1, r1 = percept
     mdp, U, s, a, Nsa, Ns_sa = self.mdp, self.U, self.s, self.a, self.Nsa, self.Ns_sa
     if s1 not in mdp.reward:  # mdp.reward also tracks the visited states
         U[s1] = mdp.reward[s1] = r1
     if s is not None:
         Nsa[s][a] += 1
         Ns_sa[s][a][s1] += 1
         for t in Ns_sa[s][a]:
             if Ns_sa[s][a][t] > 0:
                 self.mdp.T_set((s, a, t), Ns_sa[s][a][t] / Nsa[s][a])
     self.U = policy_evaluation(self.pi, U, mdp)  # store the re-evaluated utilities on the agent
     if s1 in mdp.terminals:
         self.s = self.a = None
     else:
         self.s, self.a = s1, self.pi[s1]
     return self.a
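
The two variants above index their counters as Nsa[s][a] and Ns_sa[s][a][s1] and write the learned probabilities through an mdp.T_set hook, none of which appears in the listing. A sketch of what that supporting structure might look like (the class and its internals are assumptions; only the T_set call is taken from the examples):

from collections import defaultdict

# Counters indexed the way these two variants expect: Nsa[s][a] and Ns_sa[s][a][s1].
Nsa = defaultdict(lambda: defaultdict(int))
Ns_sa = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

class EditableModelMDP:
    """Hypothetical model MDP exposing the T_set hook the examples call."""

    def __init__(self):
        self.P = {}          # learned transition probabilities, keyed by (s, a)
        self.reward = {}     # rewards of visited states
        self.terminals = set()

    def T_set(self, key, prob):
        # Record P(t | s, a) = prob in the editable table.
        s, a, t = key
        self.P.setdefault((s, a), {})[t] = prob

    def T(self, s, a):
        # Successor distribution in the (prob, state) form policy_evaluation expects.
        return [(p, t) for t, p in self.P.get((s, a), {}).items()]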
Example #5
    def __call__(self, percept):
        s1, r1 = percept
        self.mdp.states.add(s1)  # Model keeps track of visited states.
        R, P, mdp, pi = self.mdp.reward, self.mdp.P, self.mdp, self.pi
        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U

        if s1 not in R:  # Reward is only available for visited states.
            U[s1] = R[s1] = r1
        if s is not None:
            Nsa[(s, a)] += 1
            Ns1_sa[(s1, s, a)] += 1
            # for each t such that Ns′|sa [t, s, a] is nonzero
            for t in [res for (res, state, act), freq in Ns1_sa.items()
                      if (state, act) == (s, a) and freq != 0]:
                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]

        self.U = policy_evaluation(pi, U, mdp)  # store the re-evaluated utilities on the agent
        if s1 in mdp.terminals:
            self.s = self.a = None
        else:
            self.s, self.a = s1, self.pi[s1]
        return self.a
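
For concreteness, here is a tiny worked run of the maximum-likelihood update all of these examples share: after (s, a) is observed to lead to B twice and to C once, the learned model becomes P(B | s, a) = 2/3 and P(C | s, a) = 1/3. The state and action names below are made up for illustration.

from collections import defaultdict

Nsa, Ns1_sa, P = defaultdict(int), defaultdict(int), defaultdict(dict)
s, a = 'A', 'right'
for s1 in ['B', 'B', 'C']:                    # three observed outcomes of taking a in s
    Nsa[(s, a)] += 1
    Ns1_sa[(s1, s, a)] += 1
    for t in [res for (res, st, ac) in Ns1_sa if (st, ac) == (s, a)]:
        P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]

print(P[(s, a)])   # {'B': 0.666..., 'C': 0.333...}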