Example #1
File: part3.py Project: dteoh/easy21
def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    # Tabular Sarsa(lambda) with accumulating eligibility traces.
    # `lamba` is spelled this way to avoid Python's `lambda` keyword.
    q_sa = {}  # Q(s, a) action-value estimates
    n_s = {}   # N(s): state visit counts, drives the epsilon schedule
    n_sa = {}  # N(s, a): state-action visit counts, drives the step size

    for n in range(num_episodes):
        e_sa = {}  # eligibility traces, reset at the start of each episode
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))
        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1  # count the visit to s

            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next,
                                           calculate_epsilon(n_s, s_next))

            sa = s + (a, )
            sa_next = s_next + (a_next, )
            qsa = q_sa.get(sa, 0)
            qsa_next = q_sa.get(sa_next, 0)

            nsa = n_sa.get(sa, 0) + 1
            n_sa[sa] = nsa

            # One-step Sarsa TD error and accumulating-trace bump; the
            # update below uses the step size 1 / N(s, a).
            delta = reward + gamma * qsa_next - qsa
            e_sa[sa] = e_sa.get(sa, 0) + 1
            # Update every state-action pair in proportion to its trace,
            # then decay the traces. The loop variables are renamed so they
            # do not shadow the episode's current (s, a).
            for (s_i, a_i) in generate_all_state_action_pairs():
                sa_i = s_i + (a_i, )
                q_sa[sa_i] = q_sa.get(sa_i, 0) + (delta * e_sa.get(sa_i, 0)) / nsa
                e_sa[sa_i] = gamma * lamba * e_sa.get(sa_i, 0)

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, q_sa

    if not yield_progress:
        yield num_episodes, q_sa
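
Because sarsa_lambda is written as a generator, it does no work until it is iterated, even when yield_progress is False. A minimal driver sketch, assuming the project's State, step, epsilon_greedy_action, calculate_epsilon, and generate_all_state_action_pairs helpers are in scope (calculate_epsilon presumably implements the same n0 / (n0 + N(s)) schedule spelled out in Example #2):

# Drive the generator; every yield is an (episodes_done, q_sa) pair.
final_q = None
for episodes_done, q_sa in sarsa_lambda(num_episodes=1000, lamba=0.5,
                                        yield_progress=True):
    final_q = q_sa
    if episodes_done % 100 == 0:
        print('episodes:', episodes_done, 'Q entries:', len(q_sa))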
Example #2
def mc_control(num_episodes=10000):
    # GLIE Monte-Carlo control: every-visit MC evaluation paired with an
    # epsilon-greedy policy whose epsilon decays with state visit counts.
    q_sa = {}   # Q(s, a) action-value estimates
    p = {}      # policy: state -> action selection probabilities
    n_s = {}    # N(s) state visit counts
    n_sa = {}   # N(s, a) state-action visit counts
    n0 = 100    # epsilon schedule constant: epsilon = n0 / (n0 + N(s))

    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []   # states visited this episode
        episode_sa = []  # state-action pairs visited this episode

        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()

            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)

            ns = n_s.get(s, 0)
            n_s[s] = ns + 1

            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1

        # GLIE MC evaluation: Easy21 pays no intermediate rewards, so the
        # final reward is the episode return. Update every visited pair
        # towards it with step size 1 / N(s, a).
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)

        # Improve the policy: make it epsilon-greedy in the updated Q, with
        # epsilon = n0 / (n0 + N(s)) decaying as a state is visited more.
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)

            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs
    return q_sa
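
A typical way to inspect the result is to collapse the returned table into a state-value function. A small post-processing sketch, assuming the keys are (player, dealer, action) tuples as constructed above:

q_sa = mc_control(num_episodes=500000)

# V(s) = max over actions of Q(s, a); sa[:-1] strips the action from the key.
v_s = {}
for sa, q in q_sa.items():
    s = sa[:-1]
    v_s[s] = max(v_s.get(s, q), q)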
Example #3
File: part4.py Project: dteoh/easy21
from operator import itemgetter  # needed for the sort in greedy() below

def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    # Sarsa(lambda) with linear function approximation: Q(s, a) is the sum of
    # the weights of the active binary coarse-coded (cuboid) features.

    # Set up the coarse codes; weights start at zero via dict defaults.
    action_codes = {}
    for action in list(Action):
        action_fns = []
        for dealer_interval in [(1,4), (4,7), (7,10)]:
            for player_interval in [(1,6), (4,9), (7,12), (10,15), (13,18), (16,21)]:
                cuboid_fn = create_cuboid_fn(dealer_interval, player_interval, action)
                action_fns.append(cuboid_fn)
        action_codes[action] = action_fns
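    # Together these intervals define 3 x 6 = 18 overlapping cuboids per
    # action, 36 binary features in total.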

    def greedy(s, w):
        # Pick the action whose active features carry the largest total weight.
        p, d = s
        action_values = []
        for a in list(Action):
            value = 0
            for cuboid_fn in action_codes[a]:
                if cuboid_fn(p, d, a):
                    value += w.get(cuboid_fn, 0)
            action_values.append((a, value))
        action_values.sort(key=itemgetter(1), reverse=True)
        return action_values[0][0]

    def e_greedy(s, w, epsilon=0.05):
        # Epsilon-greedy over the approximate Q, with a fixed exploration rate.
        a_best = greedy(s, w)
        selection_probs = []
        default_p = epsilon / len(Action)
        for a in list(Action):
            if a is a_best:
                selection_probs.append(1 - epsilon + default_p)
            else:
                selection_probs.append(default_p)
        return sample_action(selection_probs)

    def f_sa(s, a):
        # Yield the cuboid features active for the state-action pair (s, a).
        p, d = s
        for cuboid_fn in action_codes[a]:
            if cuboid_fn(p, d, a):
                yield cuboid_fn

    def compile_q_sa(w):
        # Materialise a full Q(s, a) table from the feature weights.
        q_sa = {}
        for (p, d), a in generate_all_state_action_pairs():
            sa = (p, d, a)
            val = 0
            for i in f_sa((p, d), a):
                val += w.get(i, 0)
            q_sa[sa] = val
        return q_sa

    w_f = {}  # feature weights, keyed by cuboid function; missing key = 0
    for n in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = e_greedy(s, w_f)
        z_f = {}  # per-feature eligibility traces, reset each episode
        while not state.terminal:
            state, reward = step(state, a)
            # TD error: start from the reward, subtract the current estimate
            # q(s, a), and bump the traces of the active features.
            delta = reward
            for i in f_sa(s, a):
                delta = delta - w_f.get(i, 0)
                z_f[i] = z_f.get(i, 0) + 1
            if state.terminal:
                # Terminal transition: the target is the reward alone.
                for i, zi in z_f.items():
                    w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                break
            s_next = state.as_tuple()
            a_next = e_greedy(s_next, w_f)
            # Complete the TD error with gamma * q(s', a'), then move the
            # weights along the traces and decay the traces.
            for i in f_sa(s_next, a_next):
                delta = delta + gamma * w_f.get(i, 0)
            for i, zi in z_f.items():
                w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                z_f[i] = gamma * lamba * zi
            s = s_next
            a = a_next
        if yield_progress:
            yield n + 1, compile_q_sa(w_f)

    if not yield_progress:
        yield num_episodes, compile_q_sa(w_f)
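
compile_q_sa makes it straightforward to score the approximation against a tabular baseline. A comparison sketch, assuming mc_control from Example #2 is importable and both tables use the same (player, dealer, action) key layout:

# Hypothetical check: mean-squared error between the LFA Q and an MC baseline.
q_mc = mc_control(num_episodes=500000)
q_lfa = None
for _, q in lfa_sarsa_lambda(num_episodes=10000, lamba=0.5):
    q_lfa = q  # with yield_progress=False, the single yield is the final Q

keys = set(q_mc) | set(q_lfa)
mse = sum((q_lfa.get(k, 0) - q_mc.get(k, 0)) ** 2 for k in keys) / len(keys)
print('MSE vs MC baseline:', mse)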