import numpy as np

# Assumes GridWorld, rewards, actions, death, goal, gamma and delta are
# defined at module level by the surrounding grid-world example.


class IterativePolicyEvaluation(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        # Start every state value at zero and collect the non-terminal states.
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S
        # Probability of each action under the uniform random policy being evaluated.
        self.dynamic_p = 1.0 / len(self.actions)

    def value_step(self):
        # One sweep of the Bellman expectation backup over all non-terminal states.
        diff = 0
        for s in self.S:
            new_v = 0
            old_v = self.V[s]
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                if self.grid.game_over(s_new):
                    # Terminal successor: only the immediate reward counts.
                    new_v += self.dynamic_p * r
                else:
                    new_v += self.dynamic_p * (r + gamma * self.V[s_new])
            self.V[s] = new_v
            diff = max(diff, np.abs(old_v - new_v))
        return diff

    def policy_evaluation(self):
        self.initialize_V()
        while True:
            diff = self.value_step()
            if diff < delta:
                # Converged: the largest per-state update is below the threshold.
                self.print_values()
                return

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")
class PolicyIteration(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        # Start every state value at zero and collect the non-terminal states.
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S

    def initialize_P(self):
        P = {}
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                if (i, j) not in [death, goal]:
                    P[(i, j)] = np.random.choice(self.actions)
        self.P = P

    def value_step(self):
        # One sweep of the Bellman backup for the current deterministic policy P.
        diff = 0
        for s in self.S:
            old_v = self.V[s]
            a = self.P[s]
            self.grid.set_state(s)
            self.grid.move(a)
            s_new = self.grid.current_state()
            r = self.rewards.get(s_new, 0)
            self.V[s] = r + gamma * self.V[s_new]
            diff = max(diff, np.abs(old_v - self.V[s]))
        return diff

    def policy_evaluation(self):
        # Sweep until the value function of the current policy stops changing.
        while True:
            diff = self.value_step()
            if diff < delta:
                return

    def improvement_step(self):
        # Greedy improvement: pick the action with the highest one-step lookahead value.
        policy_stable = True
        for s in self.S:
            old_action = self.P[s]
            action_values = []
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                new_v = r + gamma * self.V[s_new]
                action_values.append(new_v)

            idx = np.argmax(action_values)
            new_action = self.actions[idx]
            self.P[s] = new_action
            if old_action != new_action:
                policy_stable = False

        return policy_stable

    def policy_iteration(self):
        self.initialize_V()
        self.initialize_P()
        print("Initial Policy:")
        self.print_policy()
        while True:
            self.policy_evaluation()
            if self.improvement_step():
                # The policy no longer changes, so it is stable.
                print("Final Policy:")
                self.print_policy()

                print("Final Values:")
                self.print_values()
                return

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")

    def print_policy(self):
        for i in range(self.grid.rows):
            print("---------------------------")
            for j in range(self.grid.cols):
                a = self.P.get((i, j), '#')
                print("  %s  |" % a, end="")
            print("")
        print("------------------------")
Example #3
from grid_world import GridWorld

gw = GridWorld(num_rows=4, num_columns=4)
gw.current_state = 14          # place the agent in state 14 of the 4x4 grid
print(gw.step('RIGHT'))        # take one step to the right and print the result
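
Example #3 uses a different GridWorld (from grid_world.py) with a step-style interface: the state is a single integer attribute and step takes an action name. The sketch below only illustrates how such an interface might behave, assuming states are numbered row by row from 0 and that moves are clipped at the grid border; the actual grid_world.py (including what step returns) may differ.

# Hypothetical sketch, not the actual grid_world.py.
class StepGridWorld(object):
    MOVES = {'UP': (-1, 0), 'DOWN': (1, 0), 'LEFT': (0, -1), 'RIGHT': (0, 1)}

    def __init__(self, num_rows=4, num_columns=4):
        self.num_rows = num_rows
        self.num_columns = num_columns
        self.current_state = 0            # states 0 .. rows * cols - 1, row by row

    def step(self, action):
        row, col = divmod(self.current_state, self.num_columns)
        d_row, d_col = self.MOVES[action]
        row = min(max(row + d_row, 0), self.num_rows - 1)    # stay inside the grid
        col = min(max(col + d_col, 0), self.num_columns - 1)
        self.current_state = row * self.num_columns + col
        return self.current_state

gw = StepGridWorld(num_rows=4, num_columns=4)
gw.current_state = 14
print(gw.step('RIGHT'))   # state 14 is one cell left of the bottom-right corner, so this prints 15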