Example #1: tabular Q-learning on the gridworld
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    q = np.zeros((23, 4))

    for x in range(epoch):
        s = grid.d_zero()

        while s != [5, 5]:
            # choose a from s using the softmax policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # print(q)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)]) -
                q[grid.get_index(s), actions.index(a)])
            s = new_s
        # evaluate the softmax policy derived from q and record the episode return
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ',
              eps(x))
        # decay *= decay_rate

    return estimated_rewards
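
A minimal call sketch for the function above (illustrative values only; Grid, GridEpisode, and the pe/estimation helpers are assumed to be imported elsewhere in this project, and eps is expected to be a callable mapping the episode index to an exploration parameter):

# hypothetical epsilon schedule and step size, for illustration only
eps_schedule = lambda x: max(0.05, 0.5 * 0.95 ** x)
qlearning_rewards = qlearning_grid(lr=0.1, eps=eps_schedule, epoch=100, searchbound=400)
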
Example #2: TD(0) value estimation and mean squared TD error across step sizes
def td_grid(lrs):
    tabular = np.zeros(23 * 4)

    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()

    print('gridworld td')

    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # estimate the state values with TD(0) over 100 episodes
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1

        # measure the squared TD error over another 100 episodes
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            # append a zero error term at the end of each episode
            td_list.append(0)

        mse_td = np.mean(np.array(td_list))
        print('square td = ', mse_td)
        alpha_result.append(mse_td)

    print('##########################')
    return alpha_result
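
A usage sketch for the step-size sweep above (the alpha values are illustrative); td_grid returns one mean squared TD error per alpha, so the smallest entry points to the best-behaved step size for this fixed policy:

alphas = [0.5, 0.25, 0.1, 0.05, 0.01]  # illustrative step sizes
mse_per_alpha = td_grid(alphas)
best_alpha = alphas[int(np.argmin(mse_per_alpha))]
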
Example #3: SARSA(λ), REINFORCE, and actor-critic on the gridworld
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()

        # e ← 0
        e = np.zeros((23, 4))

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]

        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # e ← γλe + ∂q_w(s,a)/∂w  (accumulating eligibility trace);
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γ q_w(s′,a′) − q_w(s,a);
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] - q[grid.get_index(s), actions.index(a)]
            # w ← w + αδe;
            q += lr * delta * e

            s = new_s
            a = new_a
        # evaluate the epsilon-greedy policy derived from q and record the episode return
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
        # decay *= decay_rate

    return estimated_rewards
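
A call sketch for SARSA(λ) above (illustrative values; the helpers come from the surrounding project). The second argument l is the trace-decay parameter λ: l=0 recovers one-step SARSA, while values near 1 spread credit further back along the episode.

eps_schedule = lambda x: max(0.05, 1.0 / (x + 1))  # hypothetical epsilon decay
sarsa_rewards = sarsa_lambda_grid(lr=0.1, l=0.8, eps=eps_schedule, epoch=100)
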
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)

    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # print(epoch)

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1

        # Policy update: for each visited (s_t, a_t), accumulate the discounted
        # return G_t = Σ_k γ^k r_{t+k} and add lr * γ^t * G_t to that
        # state-action preference; decay tracks γ^t across time steps.
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]),
                  actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma

        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
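
The nested loop in reinforce_grid recomputes the discounted return from scratch at every time step, which is O(T^2) per episode. A single backward pass builds the same G_t values in O(T); the helper below is a sketch that could replace the inner loop over j, so the update would read g = returns[i]:

def discounted_returns(rewards, gamma):
    # backward recursion: G_t = r_t + gamma * G_{t+1}
    g = 0.0
    returns = []
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return returns
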
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-v arbitrarily
    v = np.zeros(23)
    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            # a ∼ π(s, ·);
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # Critic update: one-step TD error
            # δ ← r + γ v(s′) − v(s);
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[
                grid.get_index(s)]
            # v(s) ← v(s) + αδ;
            v[grid.get_index(s)] += lr * delta

            # Actor update: θ(s,a) ← θ(s,a) + αδ;
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            # print(theta)

            s = new_s
            count += 1
        # evaluate the softmax policy derived from theta and record the episode return
        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
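
A driver sketch that runs the two policy-gradient variants above side by side (illustrative schedule and step sizes; eps is passed through to estimation.softmax exactly as the functions above use it, as a schedule evaluated per episode):

eps_schedule = lambda x: max(0.05, 1.0 / (x + 1))  # hypothetical schedule
reinforce_rewards = reinforce_grid(lr=0.05, eps=eps_schedule, epoch=100)
actor_critic_rewards = actor_critic_grid(lr=0.05, eps=eps_schedule, epoch=100)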