Example #1
def grid_sampling(theta, cm, K, Ke, N, epsilon):
    # Sample K candidate parameter vectors from N(theta, cm), evaluate each one
    # over N gridworld episodes, and refit the mean and covariance on the Ke
    # best candidates (one cross-entropy-method style update step).
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            grid = Grid()
            grid.pi_params = theta_list[x].reshape(23, 4)
            grid.softmax()
            epi = GridEpisode(grid)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))

    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(92)            # 23 states x 4 actions = 92 parameters
    cm_final = epsilon * np.identity(92)  # epsilon * I regularizes the refit covariance
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
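grid_sampling performs a single sample-evaluate-refit step, so in practice it would presumably be called in a loop that feeds each iteration's mean and covariance back in. A minimal sketch under that assumption (the iteration count and hyperparameters are placeholders, and Grid/GridEpisode are assumed to be importable from the surrounding project):

import numpy as np

# Hypothetical outer loop around grid_sampling: each call is one
# cross-entropy-style update of the 92-dimensional policy parameters.
theta = np.ones(92) * 0.25        # 23 states x 4 actions, flattened
cm = np.identity(92)              # initial covariance
for it in range(20):              # placeholder iteration count
    theta, cm, J = grid_sampling(theta, cm, K=50, Ke=10, N=5, epsilon=0.5)
    print('iteration', it, 'mean elite return', J)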
Example #2
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    # Tabular Q-learning on the 23-state, 4-action gridworld. lr is the learning
    # rate; eps is a callable epsilon schedule evaluated at each episode index.
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    q = np.zeros((23, 4))

    for x in range(epoch):
        s = grid.d_zero()

        while s != [5, 5]:
            # choose new_a from new_s using policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # print(q)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)]) -
                q[grid.get_index(s), actions.index(a)])
            s = new_s
        # using q function to estimate the reward and add it to estimated_reward
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ',
              eps(x))
        # decay *= decay_rate

    return estimated_rewards
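eps is used as a function of the episode index, so a caller has to pass a schedule rather than a constant. A usage sketch under that assumption (the decay schedule, learning rate, and episode count below are placeholders):

import numpy as np

# Hypothetical call with a decaying epsilon schedule; eps(x) is evaluated
# at every episode index x inside qlearning_grid.
eps_schedule = lambda x: max(0.05, 0.98 ** x)
rewards = qlearning_grid(lr=0.1, eps=eps_schedule, epoch=200, searchbound=400)
print('mean reward over the last 20 episodes:', np.mean(rewards[-20:]))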
Example #3
def grid_evaluate(table, N):
    # Average return over N gridworld episodes of the softmax policy defined by
    # a (23, 4) parameter table.
    avg_reward = 0
    for i in range(N):
        g = Grid()
        g.pi_params = table
        g.softmax()
        epi = GridEpisode(g)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
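Because grid_evaluate assigns the table to pi_params and then calls softmax(), the table is interpreted as softmax logits rather than probabilities. One plausible use, sketched below, is scoring a learned Q table directly as policy logits (the q placeholder stands in for a table produced by, e.g., the Q-learning example above):

import numpy as np

# Hypothetical: treat a learned 23x4 Q table as softmax logits and measure
# the average return of the resulting policy over 100 episodes.
q = np.zeros((23, 4))             # placeholder for a learned Q table
print('average return:', grid_evaluate(q, 100))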
Example #4
def multi_grid_episode(table, l):
    # Run one gridworld episode per element of l and push each return onto the
    # module-level queue grid_q (shared with the caller, e.g. a multiprocessing.Queue).
    # total_reward = 0
    for i in l:
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        grid_q.put(epi.run_all_steps())
    return 0
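multi_grid_episode pushes each episode's return onto a module-level grid_q queue, so it is presumably meant to be driven by several worker processes. A minimal sketch under that assumption, using a fork-based multiprocessing setup in which workers inherit the module-level queue (the chunk split and episode count are placeholders):

import multiprocessing as mp
import numpy as np

# Hypothetical driver: grid_q must exist at module level before the workers
# start so that multi_grid_episode can push results onto it.
grid_q = mp.Queue()
table = np.zeros((23, 4))                       # placeholder policy parameters
chunks = [range(0, 50), range(50, 100)]         # 100 episodes split over 2 workers

workers = [mp.Process(target=multi_grid_episode, args=(table, c)) for c in chunks]
for w in workers:
    w.start()
rewards = [grid_q.get() for _ in range(100)]    # drain the queue before joining
for w in workers:
    w.join()
print('average return:', sum(rewards) / len(rewards))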
Example #5
def grid_evaluate(t, N):
    # Variant of the evaluation helper above: t is a flat 92-dimensional
    # parameter vector that is reshaped into the (23, 4) policy table.
    reward_l = []
    table = t.reshape(23, 4)

    for i in range(N):
        # concurrent_eval(theta_list, x, result_list, N)
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N
Example #6
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    # Tabular SARSA(lambda) with accumulating eligibility traces. l is the
    # trace-decay parameter lambda; eps is a callable epsilon schedule.
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()

        # e ← 0
        e = np.zeros((23, 4))

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]

        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # e ← γλe + ∂qw(s,a)/∂qw;
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γqw(s′,a′) − qw(s,a);
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] - q[grid.get_index(s), actions.index(a)]
            # w ← w + αδe;
            q += lr * delta * e

            s = new_s
            a = new_a
        # using q function to estimate the reward and add it to estimated_reward
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
        # decay *= decay_rate

    return estimated_rewards
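As with the Q-learning example, eps is a callable schedule and l is the trace-decay parameter lambda, so a small comparison sweep might look like the following (the lambda grid, learning rate, and schedule are placeholders):

import numpy as np

# Hypothetical sweep over trace-decay values for sarsa_lambda_grid.
eps_schedule = lambda x: max(0.05, 0.98 ** x)
for lam in (0.0, 0.5, 0.9):
    rewards = sarsa_lambda_grid(lr=0.1, l=lam, eps=eps_schedule, epoch=100)
    print('lambda =', lam, 'mean of last 10 rewards:', np.mean(rewards[-10:]))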
Example #7
def td_grid(lrs):
    # Tabular TD(0) policy evaluation: for each learning rate in lrs, fit state
    # values over 100 episodes, then report the mean squared TD error over
    # another 100 episodes.
    tabular = np.zeros(23 * 4)

    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()

    print('gridworld td')

    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # update tabular in 100 loops
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1

        # calculate td in another 100 loops
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            td_list.append(0)  # count the terminal step as zero TD error

        print('square td = ', np.mean(np.array(td_list)))
        alpha_result.append(np.mean(np.array(td_list)))

    print('##########################')
    return alpha_result
def cartpole_evaluate(table, N):
    # NOTE: the def line for this helper was missing from the snippet; the name
    # is hypothetical and the signature is reconstructed from the body below.
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N


tic = time.time()

theta = np.ones(92) * 0.25
theta_f = grid_param_sampling(theta, 0.5, 200)
grid = Grid()
grid.pi_params = theta_f.reshape(23, 4)
grid.softmax()
episode = GridEpisode(grid)

print('optimized reward: ', episode.run_all_steps())
print('optimized theta: ', theta_f.reshape(23, 4))

# theta = np.ones(8) * 0.25
# theta_f = cartpole_sampling(theta, 0.5, 500)
# cartpole = CartPole()
# cartpole.pi_params = theta_f.reshape(4, 2)
# episode = CartPoleEpisode(cartpole)

# print('optimized reward: ', episode.run_all_steps())
# print('optimized theta: ', theta_f.reshape(4, 2))

toc = time.time()
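The script above only exercises grid_param_sampling; td_grid from Example #7 would be driven in the same style, e.g. by sweeping a few learning rates and timing the run with the same tic/toc pattern (the alpha values below are placeholders):

import time

# Hypothetical sweep over TD learning rates, mirroring the script's timing style.
tic = time.time()
squared_td_errors = td_grid([0.5, 0.1, 0.05, 0.01])
toc = time.time()

print('mean squared TD error per alpha: ', squared_td_errors)
print('elapsed: ', toc - tic)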