alpha = ALPHA / t2

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t) # epsilon-greedy
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
      else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
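
The snippets that update model.theta (this one and several below) rely on a linear Q-function approximator and a getQs helper that the excerpts never show. A minimal sketch of what they might look like, assuming a simple one-hot feature map over (state, action) pairs rather than the richer hand-engineered features a real implementation would use:

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set for the gridworld

class Model:
    """Linear approximation Q(s, a) = theta . x(s, a); a sketch, not the original class."""
    def __init__(self, rows=3, cols=4):
        self.rows, self.cols = rows, cols
        D = rows * cols * len(ALL_POSSIBLE_ACTIONS)  # one feature per (state, action) pair
        self.theta = np.random.randn(D) / np.sqrt(D)

    def sa2x(self, s, a):
        # hypothetical one-hot encoding of the (state, action) pair
        x = np.zeros(len(self.theta))
        i, j = s
        a_idx = ALL_POSSIBLE_ACTIONS.index(a)
        x[(i * self.cols + j) * len(ALL_POSSIBLE_ACTIONS) + a_idx] = 1.0
        return x

    def predict(self, s, a):
        return self.theta.dot(self.sa2x(s, a))

    def grad(self, s, a):
        # for a linear model, the gradient w.r.t. theta is just the feature vector
        return self.sa2x(s, a)

def getQs(model, s):
    # build {action: Q(s, a)} so max_dict() can pick the greedy action
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}
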
Example #2
        if it % 100 == 0:
            t += 1e-2
        if it % 2000 == 0:
            print("it:", it)

        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Q[s])[0]
        biggest_change = 0
        while not grid.is_game_over():
            a = random_action(a, eps=0.5 / t)  # Epsilon-greedy

            r = grid.move(a)
            s2 = grid.current_state()

            # we will update Q(s,a) AS we experience the episode
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]
            a2, max_q_s2a2 = max_dict(Q[s2])
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))
    if it % 100 == 0:
      t += 1e-2
    if it % 2000 == 0:
      print "it:", it

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a, _ = max_dict(Q[s])
    biggest_change = 0
    while not grid.game_over():
      a = random_action(a, eps=0.5/t) # epsilon-greedy
      # random action also works, but slower since you can bump into walls
      # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
      r = grid.move(a)
      s2 = grid.current_state()

      # adaptive learning rate
      alpha = ALPHA / update_counts_sa[s][a]
      update_counts_sa[s][a] += 0.005

      # we will update Q(s,a) AS we experience the episode
      old_qsa = Q[s][a]
      # the difference between SARSA and Q-Learning is with Q-Learning
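
The comment above is cut off mid-sentence; the point (repeated in full in a later snippet) is that SARSA bootstraps on the action actually taken next, while Q-Learning bootstraps on the greedy action whether or not it is taken. A sketch of the two tabular updates side by side, wrapped in hypothetical helper functions (the names are mine, not from the snippets):

def sarsa_update(Q, s, a, r, s2, a2, alpha, gamma):
    # on-policy: a2 is the (epsilon-greedy) action that will actually be executed in s2
    return Q[s][a] + alpha * (r + gamma * Q[s2][a2] - Q[s][a])

def q_learning_update(Q, s, a, r, s2, alpha, gamma):
    # off-policy: bootstrap on max_a' Q(s2, a'), even if the behaviour policy
    # ends up taking a different action in the next step
    max_q_s2a2 = max(Q[s2].values())
    return Q[s][a] + alpha * (r + gamma * max_q_s2a2 - Q[s][a])
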
Example #4
        states_actions_returns = play_game(grid, policy)
        seen_state_action_pairs = set()
        for state, action, G in states_actions_returns:
            # First-visit
            sa = (state, action)
            if sa not in seen_state_action_pairs:
                old_q = Q[state][action]
                returns[sa].append(G)
                Q[state][action] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[state][action]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # Update policy argmax
        for state in policy:
            action, _ = max_dict(Q[state])
            policy[state] = action

    plt.plot(deltas)
    plt.show()

    print('Final Policy:')
    print_policy(policy, grid)
    print('')

    V = {}
    for state, Qs in Q.items():
        V[state] = max_dict(Q[state])[1]

    print('Final Values:')
    print_values(V, grid)
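
Every excerpt leans on a small gridworld environment object whose definition is never shown. A rough sketch of the interface these snippets call, with the board size, reward layout, and wall handling assumed purely for illustration (the course's real grid code is richer):

class Grid:
    """Minimal gridworld exposing only the methods used in the snippets; a sketch."""
    def __init__(self, rows=3, cols=4, start=(2, 0)):
        self.rows, self.cols = rows, cols
        self.start = start
        self.i, self.j = start
        self.rewards = {(0, 3): 1, (1, 3): -1}  # assumed terminal rewards
        # actions[s] lists the moves available from each non-terminal state
        self.actions = {
            (i, j): ('U', 'D', 'L', 'R')
            for i in range(rows) for j in range(cols)
            if (i, j) not in self.rewards
        }

    def set_state(self, s):
        self.i, self.j = s

    def reset(self):
        self.i, self.j = self.start
        return self.start

    def current_state(self):
        return (self.i, self.j)

    def is_terminal(self, s):
        return s not in self.actions

    def game_over(self):
        return self.current_state() not in self.actions

    # one of the snippets uses this alternative name
    def is_game_over(self):
        return self.game_over()

    def move(self, a):
        # crude dynamics: take the step if it stays on the board, otherwise stay put
        i, j = self.i, self.j
        if a == 'U': i -= 1
        elif a == 'D': i += 1
        elif a == 'L': j -= 1
        elif a == 'R': j += 1
        if 0 <= i < self.rows and 0 <= j < self.cols:
            self.i, self.j = i, j
        return self.rewards.get((self.i, self.j), 0)
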
        alpha = ALPHA / t2

        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # get Q(s) so we can choose the first action
        Qs = getQs(model, s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5 / t)  # epsilon-greedy
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            # we need the next action as well since Q(s,a) depends on Q(s',a')
            # if s2 not in policy then it's a terminal state, all Q are 0
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                model.theta += alpha * (r - model.predict(s, a)) * model.grad(
                    s, a)
            else:
                # not terminal
                Qs2 = getQs(model, s2)
    alpha = ALPHA / t2

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t) # epsilon-greedy
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
      else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2, maxQs2a2 = max_dict(Qs2)
        seen_state_action_pairs = set()
        for s, a, G in states_actions_returns:
            # check if we have already seen s
            # called first-visit MC policy evaluation
            sa = (s, a)
            if sa not in seen_state_action_pairs:
                old_Q = Q[s][a]
                returns[sa].append(G)
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_Q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # update policy
        for s in list(policy):
            policy[s] = max_dict(Q[s])[0]

    plt.plot(deltas)
    plt.show()

    print("final policy:")
    print_policy(policy, grid)

    # Find V
    V = {}
    for s, Qs in Q.items():
        V[s] = max_dict(Qs)[1]

    print("values:")
    print_values(V, grid)
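
The Monte Carlo snippets iterate over states_actions_returns produced by a play_game (or play_episode) helper that the excerpts omit. A sketch of what such a helper typically does, using exploring starts (random start state and random first action) and computing the returns G by working backwards through the episode; the gamma value and step cap are assumptions:

import numpy as np

def play_game(grid, policy, gamma=0.9, max_steps=100):
    """Roll out one episode and return a list of (s, a, G) triples; a sketch of the omitted helper."""
    # exploring starts: random start state and random first action so every
    # (s, a) pair has some chance of being visited
    start_states = list(grid.actions.keys())
    s = start_states[np.random.choice(len(start_states))]
    grid.set_state(s)
    a = np.random.choice(grid.actions[s])

    states_actions_rewards = [(s, a, 0)]
    steps = 0
    while True:
        r = grid.move(a)
        s = grid.current_state()
        steps += 1
        if grid.game_over() or steps >= max_steps:
            states_actions_rewards.append((s, None, r))
            break
        a = policy[s]
        states_actions_rewards.append((s, a, r))

    # walk backwards: the terminal state's value is 0, and every earlier (s, a)
    # gets the discounted return of everything that followed it
    G = 0
    first = True
    states_actions_returns = []
    for s, a, r in reversed(states_actions_rewards):
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + gamma * G
    states_actions_returns.reverse()
    return states_actions_returns
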
Example #8
        seen_state_action_pairs = set()
        for s, a, G in states_actions_returns:
            # check if we have already seen s
            # called "first-visit" MC policy evaluation
            sa = (s, a)
            if sa not in seen_state_action_pairs:
                old_q = Q[s][a]
                returns[sa].append(G)
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # calculate new policy pi(s) = argmax[a]{ Q(s,a) }
        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a

    plt.plot(deltas)
    plt.show()

    # find the optimal state-value function
    # V(s) = max[a]{ Q(s,a) }
    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)
    deltas = []
    for it in range(100):
        print("\n\n\n Iteration: {}".format(it))
        print("Q: {}".format(Q))
        print("State-Action Count: {}".format(update_counts_sa))
        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a, _ = max_dict(Q[s])
        print("Max Q Action for State {} is: {}".format(s, a))
        biggest_change = 0
        while not grid.game_over():
            a = random_action(a, eps=0.5 / t)  # epsilon-greedy,
            # random action also works, but slower since you can bump into walls
            # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
            r = grid.move(a)
            s2 = grid.current_state()
            print("\nCurrent State: {}".format(s))
            print("Action: {}".format(a))
            print("Reward: {}".format(r))
            print("Next State: {}".format(s2))

            # adaptive learning rate
            alpha = ALPHA / update_counts_sa[s][a]
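
random_action implements the epsilon-greedy choice the comments keep referring to, but its definition is not in any excerpt. A sketch consistent with how it is called, random_action(a, eps=0.5/t):

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set

def random_action(a, eps=0.1):
    # keep the greedy action a with probability 1 - eps,
    # otherwise explore with a uniformly random action
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)
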
Example #10
        alpha = ALPHA / t2

        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # get Q(s) so we can choose the first action
        Qs = getQs(model, s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5 / t)  # epsilon-greedy
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            # we need the next action as well since Q(s,a) depends on Q(s',a')
            # if s2 not in policy then it's a terminal state, all Q are 0
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                model.theta += alpha * (r - model.predict(s, a)) * model.grad(
                    s, a)
            else:
                # not terminal
                Qs2 = getQs(model, s2)
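
Several of the approximation snippets are truncated right after Qs2 = getQs(model, s2). What follows in each of them is the non-terminal semi-gradient SARSA update; wrapped as a hypothetical helper (the name is mine), it amounts to:

def semigradient_sarsa_step(model, s, a, r, s2, a2, alpha, gamma=0.9):
    """One semi-gradient SARSA(0) update for a linear Q approximator; a sketch."""
    # the bootstrap target uses the action a2 chosen (epsilon-greedily) for s2;
    # at a terminal s2 the target collapses to just r, as in the branch shown above
    target = r + gamma * model.predict(s2, a2)
    model.theta += alpha * (target - model.predict(s, a)) * model.grad(s, a)
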
Example #11
    if it % 100 == 0:
      t += 1e-2
    if it % 2000 == 0:
      print "it:", it

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Q[s])[0]
    a = random_action(a, eps=0.5/t)
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      a2 = max_dict(Q[s2])[0]
      a2 = random_action(a2, eps=0.5/t) # epsilon-greedy

      # we will update Q(s,a) AS we experience the episode
      alpha = ALPHA / update_counts_sa[s][a]
      update_counts_sa[s][a] += 0.005
      old_qsa = Q[s][a]
def epsilon_greedy(Q, s, eps=0.1):
    if np.random.random() < eps:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
    else:
        a_opt = max_dict(Q[s])[0]
        return a_opt
            print("it:", it)

        # begin a new episode
        s = grid.reset()
        episode_reward = 0
        while not grid.game_over():
            # perform action and get next state + reward
            a = epsilon_greedy(Q, s, eps=0.1)
            r = grid.move(a)
            s2 = grid.current_state()

            # update reward
            episode_reward += r

            # update Q(s,a)
            maxQ = max_dict(Q[s2])[1]
            Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * maxQ - Q[s][a])

            # we would like to know how often Q(s) has been updated too
            update_counts[s] = update_counts.get(s, 0) + 1

            # next state becomes current state
            s = s2

        # log the reward for this episode
        reward_per_episode.append(episode_reward)

    plt.plot(reward_per_episode)
    plt.title("reward_per_episode")
    plt.show()
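
max_dict is used throughout these examples to pull out both the argmax action and the max value from a Q[s] dictionary, but it is never defined in the excerpts. A sketch consistent with calls like a, max_q = max_dict(Q[s]):

def max_dict(d):
    # return the (key, value) pair with the largest value, e.g. the greedy
    # action and its Q-value for a dictionary of the form {action: Q(s, a)}
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val
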
Example #14
     update_counts_sa[s] = {}
     for a in ALL_POSSIBLE_ACTIONS:
         update_counts_sa[s][a] = 1.0
         
 t = 1.0
 deltas = []
 for it in range(10000):
     if it % 100 == 0:
         t += 10e-3
     if it % 2000 == 0:
         print('it:', it)
         
     s = (2,0)
     grid.set_state(s)
     
     a = max_dict(Q[s])[0]
     a = random_action(a, eps = 0.5/t)            
     biggest_change = 0
     while not grid.game_over():
         r = grid.move(a)
         s1 = grid.current_state()
         
         a1 = max_dict(Q[s1])[0]
         a1 = random_action(a1, eps = 0.5/t)
         
         alpha = ALPHA/update_counts_sa[s][a]
         update_counts_sa[s][a] += 0.005
         old_qsa = Q[s][a]
         
         Q[s][a] = Q[s][a] + alpha*(r + GAMMA*Q[s1][a1] - Q[s][a])  # use the adaptive alpha computed above
         
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
      # check if we have already seen s
      # called "first-visit" MC policy evaluation
      sa = (s, a)
      if sa not in seen_state_action_pairs:
        old_q = Q[s][a]
        returns[sa].append(G)
        Q[s][a] = np.mean(returns[sa])
        biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
        seen_state_action_pairs.add(sa)
    deltas.append(biggest_change)

    # calculate new policy pi(s) = argmax[a]{ Q(s,a) }
    for s in policy.keys():
      a, _ = max_dict(Q[s])
      policy[s] = a

  plt.plot(deltas)
  plt.show()

  # find the optimal state-value function
  # V(s) = max[a]{ Q(s,a) }
  V = {}
  for s in policy.keys():
    V[s] = max_dict(Q[s])[1]

  print("final values:")
  print_values(V, grid)
  print("final policy:")
  print_policy(policy, grid)
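
The tabular snippets assume Q, update_counts, and the per-pair counters update_counts_sa were initialized over every state beforehand; only fragments of that setup appear in the excerpts. A sketch of the full setup, with grid assumed to be already constructed and the ALPHA and GAMMA values assumed rather than taken from the excerpts:

GAMMA = 0.9   # discount factor (assumed value)
ALPHA = 0.1   # base learning rate (assumed value)
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# enumerate every state; this relies on the actions/rewards dictionaries
# used elsewhere in the snippets and in the Grid sketch above
all_states = set(grid.actions.keys()) | set(grid.rewards.keys())

Q = {}
update_counts = {}
update_counts_sa = {}
for s in all_states:
    Q[s] = {}
    update_counts_sa[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Q[s][a] = 0                     # terminal states simply stay at 0
        update_counts_sa[s][a] = 1.0    # start at 1 so ALPHA / count is well defined
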
Example #16
    rewards = []
    for it in range(10000):  # we start playing the game
        if it % 100 == 0:
            t += 1e-2  # how often and by how much we increase t is a HYPERPARAMETER
        if it % 1000 == 0:
            print("it:", it, "/10000")

        # instead of generating an episode, we will play an episode within this loop

        s = (2, 0)  # starting position
        grid.set_state(s)

        # the first (s, r) tuple is the start state with reward 0 (no reward for simply starting the game)
        # the value of the terminal state is 0 by definition, so there is no need to update it

        a = max_dict(Q[s])[0]
        a = random_action(
            a, eps=0.5 / t
        )  # epsilon-greedy: the action is chosen between the best available action and a random exploratory action

        delta = 0
        reward = 0

        while not grid.game_over():

            r = grid.move(a)
            s2 = grid.current_state()

            a2 = max_dict(Q[s2])[0]  # we need the next action for Q(s',a')
            a2 = random_action(a2, eps=0.5 / t)
Example #17
        if i % 1000 == 0:
            print(i)

        delta = 0
        # Policy Evaluation Step
        states_actions_returns = play_episode(grid, pi)
        seen_state_actions_pairs = set()
        for s, a, G in states_actions_returns:
            sa = (s, a)
            if sa not in seen_state_actions_pairs:
                old_q = Q[s][a]
                returns[sa].append(G)
                Q[s][a] = np.mean(returns[sa])
                delta = max(delta, np.abs(old_q - Q[s][a]))
                seen_state_actions_pairs.add(sa)
        deltas.append(delta)

        # Policy Improvement Step
        for s in pi.keys():
            pi[s] = max_dict(Q[s])[0]

    plt.plot(deltas)
    plt.show()

    print('Done')
    print_policy(pi, grid)

    V = {}
    for s, Qs in Q.items():
        V[s] = max_dict(Q[s])[1]
    print_values(V, grid)
Example #18
        for a in ALL_POSSIBLE_ACTIONS:
            update_counts_sa[s][a] = 1.0

    # repeat until convergence
    t = 1.0
    deltas = []
    for it in range(10000):
        if it % 100 == 0:
            t += 10e-3
        if it % 2000 == 0:
            print('it:', it)

        s = (2, 0)
        grid.set_state(s)

        a = max_dict(Q[s])[0]
        a = random_action(a, eps=0.5 / t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            a2 = max_dict(Q[s2])[0]
            a2 = random_action(a2, eps=0.5 / t)

            # update Q(s,a) as we experience the episode
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
Example #19
        Q[s] = {}
        update_counts_sa[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a] = 0
            update_counts_sa[s][a] = 1.0

    t = 1.0
    delta = []
    for it in range(10000):
        if it % 100 == 0:
            t += 10e-3

        s = (2, 0)
        grid.set_state(s)

        a = max_dict(Q[s])[0]
        biggest_change = 0
        while not grid.game_over():
            a = random_action(a, eps=0.5 / t)
            r = grid.move(a)
            s2 = grid.current_state()

            a2, max_q2s2 = max_dict(Q[s2])
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q2s2 - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            update_counts[s] = update_counts.get(s, 0) + 1
Example #20
        if it % 100 == 0:
            t += 1e-2
        if it % 2000 == 0:
            print("it:", it)

        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a, _ = max_dict(Q[s])
        biggest_change = 0
        while not grid.game_over():
            a = random_action(a, eps=0.5 / t)  # epsilon-greedy
            # random action also works, but slower since you can bump into walls
            # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
            r = grid.move(a)
            s2 = grid.current_state()

            # we will update Q(s,a) AS we experience the episode
            old_qsa = Q[s][a]
            # the difference between SARSA and Q-Learning is with Q-Learning
            # we will use this max[a']{ Q(s',a')} in our update
            # even if we do not end up taking this action in the next step
            a2, max_q_s2a2 = max_dict(Q[s2])
            Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * max_q_s2a2 - Q[s][a])
Example #21
            s = s2
            a = a2

        # log the reward for this episode
        reward_per_episode.append(episode_reward)

    plt.plot(reward_per_episode)
    plt.title("reward_per_episode")
    plt.show()

    # determine the policy from Q*
    # find V* from Q*
    policy = {}
    V = {}
    for s in grid.actions.keys():
        a, max_q = max_dict(Q[s])
        policy[s] = a
        V[s] = max_q

    # what's the proportion of time we spend updating each part of Q?
    print("update counts:")
    total = np.sum(list(update_counts.values()))
    for k, v in update_counts.items():
        update_counts[k] = float(v) / total
    print_values(update_counts, grid)

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(policy, grid)
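
print_values and print_policy are display utilities that none of the excerpts define. A rough sketch of their behaviour, assuming the grid exposes rows and cols as in the Grid sketch earlier and that the dictionaries are keyed by (row, col):

def print_values(V, grid):
    # print a per-state number (V, or the update-count proportions above) as a table
    for i in range(grid.rows):
        print("-" * 6 * grid.cols)
        row = ""
        for j in range(grid.cols):
            v = V.get((i, j), 0)
            row += (" %.2f|" % v) if v >= 0 else ("%.2f|" % v)
        print(row)
    print("-" * 6 * grid.cols)

def print_policy(P, grid):
    # print the chosen action for each state, blank for terminal states
    for i in range(grid.rows):
        print("-" * 6 * grid.cols)
        row = ""
        for j in range(grid.cols):
            row += "  %s  |" % P.get((i, j), " ")
        print(row)
    print("-" * 6 * grid.cols)
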