Example #1
from collections import defaultdict

# Env, EspionGreedyPolicy, RandomPolicy and get_episode are helpers from the
# surrounding project (see the sketch of get_episode after this example).

def on_mc():
    # On-policy every-visit Monte Carlo control with an epsilon-greedy policy.
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)   # action-value estimates Q(s, a)
    R = defaultdict(list)    # returns observed for each (s, a) pair
    for i in range(5000):
        G = 0
        states = get_episode(env, policy)
        # Walk the episode backwards, accumulating the discounted return G
        # (discount factor 0.9) and averaging it into Q(s, a).
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            R[(s0, a)].append(G)
            Q[(s0, a)] = sum(R[(s0, a)]) / len(R[(s0, a)])

        # Policy improvement: make the policy greedy with respect to the
        # updated Q in every state visited during the episode.
        for (s0, a, s1, r) in reversed(states):
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)

    # Read out the learnt policy: greedy action per state, 'ter' for terminals.
    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'

    env.render(Pi)
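
The examples on this page call a project-specific get_episode helper whose implementation is not shown. A minimal sketch of what it presumably does, assuming Env exposes init(), step() and is_t() exactly as they are used above (the real helper may differ):

def get_episode(env, policy):
    # Roll out one episode under the given policy and return it as a list of
    # (s0, a, s1, r) transition tuples, oldest first.
    episode = []
    s0 = env.init()
    while not env.is_t(s0):
        a = policy.get_a(s0)
        s1, r = env.step(a)
        episode.append((s0, a, s1, r))
        s0 = s1
    return episode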
Example #2
def off_mc():
    # Off-policy Monte Carlo control with weighted importance sampling:
    # the behaviour policy is RandomPolicy, the target policy Pi is greedy
    # with respect to Q.
    env = Env(6)
    policy = RandomPolicy(env.actions())
    C = defaultdict(float)   # cumulative importance-sampling weights
    Q = defaultdict(float)   # action-value estimates Q(s, a)
    Pi = {}                  # greedy target policy
    for i in range(10000):
        G = 0
        W = 1.0   # importance-sampling weight of the current episode tail
        states = get_episode(env, policy)
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            # Weighted importance-sampling update of Q(s0, a) towards G.
            C[(s0, a)] += W
            Q[(s0, a)] += W / C[(s0, a)] * (G - Q[(s0, a)])
            # Keep the target policy greedy with respect to Q.
            Pi[s0] = max([(x, Q[(s0, x)]) for x in env.actions()],
                         key=lambda x: x[1])[0]
            # If the behaviour action differs from the greedy action, the
            # target policy would never have taken it, so earlier steps of
            # the episode contribute nothing: stop here.
            if a != Pi[s0]:
                break
            # The target policy is deterministic, so W grows by 1 / b(a | s0).
            W = W / policy.get_p(s0, a)

    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
Example #3
def sarsa():
    # Tabular SARSA with an epsilon-greedy policy, step size 0.9 and
    # discount factor 0.9.
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)
    for i in range(5000):
        s0 = env.init()
        if env.is_t(s0):
            continue
        a0 = policy.get_a(s0)
        while not env.is_t(s0):
            s, r = env.step(a0)
            a = policy.get_a(s)
            # SARSA update of Q(s0, a0) from the transition (s0, a0, r, s, a).
            Q[(s0, a0)] += 0.9 * (r + 0.9 * Q[(s, a)] - Q[(s0, a0)])
            # Make the policy greedy with respect to the freshly updated
            # Q(s0, .) before moving on to the next state.
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)
            s0 = s
            a0 = a

    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'

    env.render(Pi)
from sklearn.linear_model import SGDRegressor


class Sarsa(object):
    # SARSA with linear function approximation: Q(s, a) is represented by an
    # SGDRegressor over one-hot state/action features instead of a table.
    def __init__(self, size=4):
        self.grid_size = size
        self.env = Env(self.grid_size)
        self.a_id = dict([(a, i) for i, a in enumerate(self.env.actions())])
        self.policy = EspionGreedyPolicy(self.env.actions(),
                                         range(self.grid_size**2))

    def get_f(self, s, a):
        # One-hot feature vector: the first grid_size**2 entries encode the
        # state, the last 4 entries encode the action.
        f = [0] * (self.grid_size**2 + 4)
        f[s] = 1
        f[self.grid_size**2 + self.a_id[a]] = 1
        return f

    def sarsa(self):
        policy = self.policy
        Q = SGDRegressor()
        # Prime the regressor with one sample so predict()/partial_fit() work.
        f = self.get_f(1, 'left')
        Q.fit([f], [1])
        for i in range(500):
            s0 = self.env.init()
            if self.env.is_t(s0):
                continue
            a0 = policy.get_a(s0)
            while not self.env.is_t(s0):
                s, r = self.env.step(a0)
                a = policy.get_a(s)
                f0 = self.get_f(s0, a0)
                f = self.get_f(s, a)
                # Semi-gradient SARSA target for (s0, a0): step size 0.9,
                # discount factor 0.9.
                target = Q.predict([f0])[0] + 0.9 * (
                    r + 0.9 * Q.predict([f])[0] - Q.predict([f0])[0])
                Q.partial_fit([f0], [target])
                # Greedy policy improvement in s0 under the approximate Q.
                mm = [(x, Q.predict([self.get_f(s0, x)])[0])
                      for x in self.env.actions()]
                action = max(mm, key=lambda x: x[1])[0]
                policy.set_max(s0, action)
                s0 = s
                a0 = a

        Pi = {}
        for i in range(self.grid_size**2):
            Pi[i] = policy.get_m(i)
        for t in self.env.get_t():
            Pi[t] = 'ter'

        self.env.render(Pi)
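
A typical way to run the examples above, assuming Env and the policy classes are importable from the same project:

if __name__ == '__main__':
    on_mc()                 # Example #1: on-policy Monte Carlo control
    off_mc()                # Example #2: off-policy Monte Carlo control
    sarsa()                 # Example #3: tabular SARSA
    Sarsa(size=4).sarsa()   # SARSA with linear function approximation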