        (2, 2): 'U',
        (2, 3): 'L',
    }

    model = Model()
    deltas = []

    # repeat until convergence
    k = 1.0
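    # k grows slowly so the effective step size alpha = ALPHA / k decays over time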
    for it in range(20000):
        if it % 10 == 0:
            k += 0.01
        alpha = ALPHA / k
        biggest_change = 0

        # generate an episode using pi
        states_and_rewards = play_game(grid, policy)
        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward for simply starting the game)
        # the last (s, r) tuple is the terminal state and the final reward
        # the value of the terminal state is by definition 0, so we
        # don't care about updating it

        for t in range(len(states_and_rewards) - 1):
            s, _ = states_and_rewards[t]
            s2, r = states_and_rewards[t+1]
            # We update V(s) as we experience the episode
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                target = r
            else:
                target = r + GAMMA * model.predict(s2)
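            # semi-gradient update: theta <- theta + alpha * (target - V_hat(s)) * grad V_hat(s)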
            model.theta += alpha*(target - model.predict(s))*model.grad(s)
            biggest_change = max(biggest_change, np.abs(old_theta - model.theta).sum())
        deltas.append(biggest_change)

    plt.plot(deltas)
    plt.show()
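The loop above relies on a Model object exposing theta, predict(s), and grad(s). A minimal sketch of a linear approximator with that interface is given below; the specific feature map s2x is an illustrative assumption, not necessarily the one used in the full script.

import numpy as np

class Model:
    def __init__(self):
        # one weight per feature, small random initialization
        self.theta = np.random.randn(4) / 2

    def s2x(self, s):
        # hand-crafted features for a grid position s = (row, col); illustrative only
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    def predict(self, s):
        # V_hat(s; theta) = theta . x(s)
        return self.theta.dot(self.s2x(s))

    def grad(self, s):
        # for a linear model, the gradient w.r.t. theta is just the feature vector
        return self.s2x(s)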
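The episode data comes from play_game(grid, policy), which (per the comments above) returns a list of (state, reward) tuples: the start state paired with a reward of 0, then one tuple per step through the terminal state. A sketch consistent with that description follows; the start state (2, 0) and the grid methods set_state, game_over, move, and current_state are assumptions about the surrounding gridworld code.

def play_game(grid, policy):
    # returns a list of (state, reward) tuples from one episode under the given policy
    s = (2, 0)  # assumed start state
    grid.set_state(s)
    states_and_rewards = [(s, 0)]  # no reward for simply starting the game
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards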