Example #1
def grid_sampling(theta, cm, K, Ke, N, epsilon):
    # One cross-entropy-method iteration: sample K candidate policy parameter
    # vectors from N(theta, cm), evaluate each over N episodes, then refit the
    # mean and covariance to the Ke elite candidates.
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            grid = Grid()
            grid.pi_params = theta_list[x].reshape(23, 4)
            grid.softmax()
            epi = GridEpisode(grid)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))

    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(92)                # 92 = 23 states x 4 actions
    cm_final = epsilon * np.identity(92)      # epsilon * I keeps the covariance positive definite
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
def grid_evaluate(table, N):
    # Average return of a fixed 23 x 4 softmax policy table over N episodes.
    avg_reward = 0
    for i in range(N):
        g = Grid()
        g.pi_params = table
        g.softmax()
        epi = GridEpisode(g)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
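
The two helpers above cover one cross-entropy-method iteration (grid_sampling) and policy evaluation (grid_evaluate). A minimal outer loop that chains them could look like the sketch below; the iteration count and the K, Ke, N, and epsilon values are illustrative assumptions, not taken from the original example.

# Hypothetical CEM driver loop (assumed hyperparameters).
theta = np.zeros(92)
cm = np.identity(92)
for it in range(20):                                   # assumed number of iterations
    theta, cm, J = grid_sampling(theta, cm, K=20, Ke=5, N=10, epsilon=2.0)
    print('iteration', it, 'elite mean return', J)
print('final policy return:', grid_evaluate(theta.reshape(23, 4), 100))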
Example #3
def multi_grid_episode(table, l):
    # Worker for parallel evaluation: run one episode for each index in l and
    # push every episode's return onto the shared queue grid_q.
    for i in l:
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        grid_q.put(epi.run_all_steps())
    return 0
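
multi_grid_episode is a worker that reports each episode's return through a shared queue named grid_q rather than returning it. A minimal driver sketch follows, assuming grid_q is a module-level multiprocessing.Queue defined next to multi_grid_episode and a fork start method so the workers inherit it; the process count and chunking are likewise assumptions.

import multiprocessing as mp
import numpy as np

grid_q = mp.Queue()   # assumed: shared, module-level queue used by the workers

def parallel_grid_evaluate(table, N, workers=4):
    # split the N episode indices across worker processes
    chunks = np.array_split(np.arange(N), workers)
    procs = [mp.Process(target=multi_grid_episode, args=(table, chunk))
             for chunk in chunks]
    for p in procs:
        p.start()
    # drain exactly one return per episode, then join the workers
    rewards = [grid_q.get() for _ in range(N)]
    for p in procs:
        p.join()
    return sum(rewards) / N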
Example #4
def grid_evaluate(t, N):
    # Average return of a flat 92-dimensional parameter vector t, reshaped
    # into the 23 x 4 policy table, over N episodes.
    reward_l = []
    table = t.reshape(23, 4)

    for i in range(N):
        # concurrent_eval(theta_list, x, result_list, N)
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    # REINFORCE (Monte Carlo policy gradient): roll out one episode per epoch,
    # then move theta along the discounted return observed from each visited
    # state-action pair. eps(x) is the softmax temperature schedule.
    estimated_rewards = np.zeros(epoch)

    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # print(epoch)

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1

        # delta_j = 0
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]),
                  actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma

        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
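
A usage sketch for reinforce_grid; the learning rate and the decaying schedule passed as eps (treated here as the softmax temperature fed to estimation.softmax) are illustrative assumptions.

rewards = reinforce_grid(lr=0.01, eps=lambda x: max(0.1, 1.0 - 0.01 * x), epoch=100)
print('mean return over training:', rewards.mean())
print('final-episode return:', rewards[-1])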
Example #6
def td_grid(lrs):
    # TD(0) policy evaluation under a uniform softmax policy: for each step
    # size alpha, learn state values over 100 episodes, then measure the mean
    # squared TD error over another 100 episodes.
    tabular = np.zeros(23 * 4)

    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()

    print('gridworld td')

    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # update tabular in 100 loops
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1

        # calculate td in another 100 loops
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            # the TD error at the terminal absorbing state is zero
            td_list.append(0)

        print('square td = ', np.mean(np.array(td_list)))
        alpha_result.append(np.mean(np.array(td_list)))

    print('##########################')
    return alpha_result
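
A usage sketch that sweeps td_grid over a few step sizes and keeps the one with the smallest mean squared TD error; the alpha grid is an assumption.

lrs = [0.5, 0.1, 0.05, 0.01]              # assumed step-size grid
sq_td = td_grid(lrs)
best_alpha = lrs[int(np.argmin(sq_td))]
print('best alpha:', best_alpha, ', squared TD error:', min(sq_td))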
Example #7
def sarsa_grid(lr, eps, epoch=100, searchbound=400):
    # SARSA (on-policy TD control) with a softmax behaviour policy whose
    # temperature follows the schedule eps(x).
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    q = np.zeros((23, 4))

    for x in range(epoch):
        s = grid.d_zero()

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = pe.softmax(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]

        while s != [5, 5]:
            # print(q)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # choose new_a from new_s using policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * q[grid.get_index(new_s),
                                   actions.index(new_a)] -
                q[grid.get_index(s), actions.index(a)])
            s = new_s
            a = new_a
        # evaluate the softmax policy derived from q and record the episode return
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ',
              eps(x))
        # decay *= decay_rate

    return estimated_rewards
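
A usage sketch for sarsa_grid; since eps(x) is passed to pe.softmax, it is treated here as a temperature that decays over episodes, and both the schedule and the learning rate are assumptions.

temperature = lambda x: max(0.05, 2.0 * (0.95 ** x))   # assumed decay schedule
sarsa_rewards = sarsa_grid(lr=0.1, eps=temperature, epoch=100, searchbound=400)
print('SARSA last-10 mean return:', sarsa_rewards[-10:].mean())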
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    # Q-learning (off-policy TD control) with an epsilon-greedy behaviour
    # policy; eps(x) is the exploration schedule.
    q = np.zeros((23, 4))

    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    for x in range(epoch):
        s = grid.d_zero()
        while s != [5, 5]:
            # Choose a from s using a policy derived from q;
            pi_temp = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)]) -
                q[grid.get_index(s), actions.index(a)])
            s = new_s

        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x],
              ', epsilon: ', eps(x))

    return estimated_rewards
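
qlearning_grid differs from sarsa_grid only in being off-policy: the target bootstraps from the max over actions in s′ rather than from the action actually taken. A usage sketch with an assumed epsilon-greedy decay schedule:

epsilon = lambda x: max(0.05, 0.95 ** x)               # assumed exploration schedule
q_rewards = qlearning_grid(lr=0.1, eps=epsilon, epoch=100, searchbound=400)
print('Q-learning last-10 mean return:', q_rewards[-10:].mean())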
def cartpole_evaluate(table, N):
    # Average return of a fixed CartPole policy table over N episodes.
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N


tic = time.time()

theta = np.ones(92) * 0.25
theta_f = grid_param_sampling(theta, 0.5, 200)
grid = Grid()
grid.pi_params = theta_f.reshape(23, 4)
grid.softmax()
episode = GridEpisode(grid)

print('optimized reward: ', episode.run_all_steps())
print('optimized theta: ', theta_f.reshape(23, 4))

# theta = np.ones(8) * 0.25
# theta_f = cartpole_sampling(theta, 0.5, 500)
# cartpole = CartPole()
# cartpole.pi_params = theta_f.reshape(4, 2)
# episode = CartPoleEpisode(cartpole)

# print('optimized reward: ', episode.run_all_steps())
# print('optimized theta: ', theta_f.reshape(4, 2))
Example #10
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    # SARSA(lambda) with accumulating eligibility traces; l is the trace-decay
    # parameter lambda and eps(x) the exploration schedule.
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()

        # e ← 0
        e = np.zeros((23, 4))

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]

        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # e ← γλe + ∂qw(s,a)/∂qw;
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γqw(s′,a′) − qw(s,a);
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] - q[grid.get_index(s), actions.index(a)]
            # w ← w + αδe;
            q += lr * delta * e

            s = new_s
            a = new_a
        # evaluate the policy derived from q and record the episode return
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
        # decay *= decay_rate

    return estimated_rewards
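
A sketch that sweeps the trace-decay parameter of sarsa_lambda_grid; the lambda grid, exploration schedule, and learning rate are assumptions. With l = 0 the traces vanish immediately and the update reduces to the one-step SARSA rule above.

epsilon = lambda x: max(0.05, 0.95 ** x)               # assumed exploration schedule
for lam in [0.0, 0.5, 0.9]:                            # assumed lambda grid
    r = sarsa_lambda_grid(lr=0.1, l=lam, eps=epsilon, epoch=100)
    print('lambda =', lam, ', last-10 mean return:', r[-10:].mean())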
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    # One-step actor-critic: the critic learns state values with TD(0) and the
    # actor updates theta using the TD error as the advantage estimate.
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-v arbitrarily
    v = np.zeros(23)
    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            # a ∼ π(s, ·);
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # Critic update using TD(0)
            # δ ← r + γ v(s′) − v(s);
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[
                grid.get_index(s)]
            # v(s) ← v(s) + αδ;
            v[grid.get_index(s)] += lr * delta

            # Actor update: the TD error δ serves as the advantage estimate
            theta[grid.get_index(s), actions.index(a)] += lr * delta

            s = new_s
            count += 1
        # evaluate the current softmax policy and record the episode return
        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
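
A usage sketch for actor_critic_grid; the learning rate and the temperature schedule are assumptions, and the same step size is applied to both the critic and the actor because the function exposes a single lr.

temperature = lambda x: max(0.1, 1.0 - 0.01 * x)       # assumed softmax temperature schedule
ac_rewards = actor_critic_grid(lr=0.05, eps=temperature, epoch=100, searchbound=400)
print('actor-critic last-10 mean return:', ac_rewards[-10:].mean())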