def main():
    env = standard_grid()
    vs = monte_carlo_evaluation(standard_grid, win_policy)
    render_vs(env, vs)

    env = negative_grid()
    vs = monte_carlo_evaluation(negative_grid, win_policy)
    render_vs(env, vs)
Example #2
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print_values(grid.rewards, grid)
    V, policy, deltas = monte_carlo(grid)
    print_values(V, grid)
    print_policy(policy, grid)
    plt.plot(deltas)
    plt.show()
def main():
  env = standard_grid()
  qs = monte_carlo_control(standard_grid, epsilon_soft_greedy)
  render_qs_policy(env, qs)
  print()
  env = negative_grid()
  qs = monte_carlo_control(negative_grid, epsilon_soft_greedy)
  render_qs_policy(env, qs)
def main():
    print('Standard Grid')
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)

    print('Negative Grid:')
    env = negative_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
Example #5
def main():
    print('Standard Grid')
    env = standard_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))

    print('Negative Grid:')
    env = negative_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))
Example #6
def first_visit_monte_carlo_prediction(pi, N):
    V = {}
    all_returns = {}  # state -> list of returns observed on first visits
    for i in range(N):
        visited_states = set()
        states_and_returns = play_episode(standard_grid(), pi)
        for s, g in states_and_returns:
            if s not in visited_states:
                visited_states.add(s)
                if s not in all_returns:
                    all_returns[s] = []
                all_returns[s].append(g)
                V[s] = np.mean(all_returns[s])
    return V
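
The prediction code above leans on a play_episode helper that isn't shown in this snippet. Judging from how its output is consumed here, and from the return-computation fragments further down, a minimal sketch could look like this; the start state, the discount factor, and the exact env methods are assumptions:

GAMMA = 0.9  # assumed discount factor

def play_episode(env, pi, start_state=(2, 0)):
    # Hypothetical helper: roll out one episode under policy pi, then walk the
    # rewards backwards to turn them into returns (G = r + GAMMA * G),
    # returning (state, return) pairs in the order the states were visited.
    env.set_state(start_state)
    s = env.current_state()
    states_and_rewards = [(s, 0)]  # no reward precedes the first state
    while not env.game_over():
        r = env.move(pi[s])        # take the policy's action, observe the reward
        s = env.current_state()
        states_and_rewards.append((s, r))
    G = 0
    first = True
    states_and_returns = []
    for s, r in reversed(states_and_rewards):
        if first:
            first = False          # the terminal state's return is 0 by definition
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()
    return states_and_returns
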
Example #7
def negative_grid(step_cost=-0.1):
    # in this game we want to try to minimize the number of moves
    # so we will penalize every move
    g = standard_grid()
    g.rewards.update({
        (0, 0): step_cost,
        (0, 1): step_cost,
        (0, 2): step_cost,
        (1, 0): step_cost,
        (1, 2): step_cost,
        (2, 0): step_cost,
        (2, 1): step_cost,
        (2, 2): step_cost,
        (2, 3): step_cost,
    })
    return g
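
To see the effect, you can print the resulting reward table with the same print_values helper used in the other examples; every non-terminal, non-wall cell should now carry the step penalty instead of 0:

g = negative_grid(step_cost=-0.1)
print("rewards:")
print_values(g.rewards, g)
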
def main():
  grid = standard_grid(obey_prob=1.0, step_cost=None)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  V, policy, deltas = monte_carlo(grid)

  print("final values:")
  print_values(V, grid)
  print("final policy:")
  print_policy(policy, grid)

  plt.plot(deltas)
  plt.show()
Example #9
def temporal_difference(alpha=0.1, gamma=0.9):
    env = standard_grid()
    V = {}
    policy = initial_policy(env)
    states = env.all_states()
    for s in states:
        V[s] = 0

    for i in range(2000):
        # play one episode from the start state and collect (state, reward) pairs
        s_r = play_game(env, policy, (2, 0))
        for t in range(len(s_r) - 1):
            s, r = s_r[t]
            s1, r1 = s_r[t + 1]
            # TD(0) update: V(s) <- V(s) + alpha * [r' + gamma * V(s') - V(s)]
            V[s] = V[s] + alpha * (r1 + gamma * V[s1] - V[s])

    return (env, V, policy)
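
play_game is another helper that isn't included in this snippet. Based on how s_r is consumed above, a rough sketch (assuming the same grid-world env methods used elsewhere in these examples) might be:

def play_game(env, policy, start_state):
    # Hypothetical helper: follow `policy` from `start_state` until the episode
    # ends, returning the visited (state, immediate reward) pairs in order.
    env.set_state(start_state)
    s = env.current_state()
    states_and_rewards = [(s, 0)]  # no reward before the first state
    while not env.game_over():
        r = env.move(policy[s])
        s = env.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
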
Example #10
def q_learning(episodes=2000, initial_state=(2, 0), alpha=0.1, gamma=0.9):
    env = standard_grid()
    Q = initial_Q(env, initial_value=0)

    for episode in range(episodes):
        s = initial_state
        env.set_state(s)

        while not env.game_over():
            # pick an action around the current greedy choice
            # (choose_action presumably adds exploration)
            a = choose_action(env.all_actions, max_dict(Q[s])[0])
            s2 = env.move(a)             # next state
            r = env.get_state_reward()   # reward for the transition
            a2 = max_dict(Q[s2])[0]      # greedy action in the next state (Q-learning target)
            Q[s][a] = Q[s][a] + alpha * (r + gamma * Q[s2][a2] - Q[s][a])
            s = s2
    return (Q, env)
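
The q_learning function relies on max_dict and choose_action, which are defined elsewhere in that repository. Given how they are called (max_dict(Q[s])[0] for the argmax key, [1] for the max value), plausible sketches are shown below; the epsilon value and the exploration scheme in choose_action are assumptions:

import random

def max_dict(d):
    # return the (key, value) pair with the largest value,
    # matching how max_dict(Q[s])[0] and max_dict(Q[s])[1] are used above
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

def choose_action(all_actions, greedy_action, eps=0.1):
    # hypothetical epsilon-greedy selection: usually exploit the greedy action,
    # occasionally explore a uniformly random one
    if random.random() < eps:
        return random.choice(list(all_actions))
    return greedy_action
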
Example #11
def main():
    # To run this with the Python interpreter directly, replace "def main():"
    # with "if __name__ == '__main__':".
    # Create the environment
    env = standard_grid(obey_prob=0.9, step_cost=None)
    # Create the agent
    agent = QLearningAgent(env.all_states(), CONST_ACTION_LST)
    # Learn a policy by playing many episodes, with Q-learning updating it along the way
    for episode in range(10000):
        env.set_state(CONST_START_STATE)
        state = env.current_state()
        while True:
            action = agent.get_action(state)
            reward = env.move(action)
            next_state = env.current_state()
            agent.learn(state, action, reward, next_state)
            if env.game_over():
                break
            state = next_state
    print(agent.Q)
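
The QLearningAgent class itself is not part of this snippet. A rough sketch that fits the calls above (a constructor taking the state list and an action list, get_action, and learn) follows; the epsilon-greedy exploration, learning rate, and discount factor are all assumptions:

import random

class QLearningAgent:
    def __init__(self, states, actions, alpha=0.1, gamma=0.9, eps=0.1):
        self.actions = list(actions)
        self.alpha, self.gamma, self.eps = alpha, gamma, eps
        self.Q = {s: {a: 0.0 for a in self.actions} for s in states}

    def get_action(self, state):
        # epsilon-greedy over the current Q estimates
        if random.random() < self.eps:
            return random.choice(self.actions)
        return max(self.Q[state], key=self.Q[state].get)

    def learn(self, state, action, reward, next_state):
        # tabular Q-learning update: bootstrap from the best action in next_state
        best_next = max(self.Q[next_state].values())
        self.Q[state][action] += self.alpha * (
            reward + self.gamma * best_next - self.Q[state][action])
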
        if env.is_terminal(env.current_state()):
            target = reward
        else:
            target = reward + discount * next_est

        # semi-gradient TD(0) update: move the weights toward the target
        # along the current state's feature vector x
        theta = theta + alpha * (target - c_est) * x
    return theta


def semi_gradient_td(create_env, policy, episodes=100000):
    theta = np.random.randn(4) / 2

    for _ in range(episodes):
        theta = td_episode(create_env, policy, theta)

    return theta


if __name__ == '__main__':
    env = standard_grid()
    theta = semi_gradient_td(standard_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)

    print()

    env = negative_grid()
    theta = semi_gradient_td(negative_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
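
preprocess_features and get_value are not shown in this excerpt. Since theta has four components, one plausible feature map is an affine function of the grid coordinates; both functions below are assumptions along those lines:

import numpy as np

def preprocess_features(s):
    # hypothetical 4-dimensional feature vector for state s = (row, col)
    r, c = s
    return np.array([r, c, r * c, 1.0])

def get_value(env, theta, featurize):
    # approximate V(s) = theta . x(s) for every state so it can be rendered
    # like the tabular value functions in the other examples
    return {s: float(theta.dot(featurize(s))) for s in env.all_states()}
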
Example #13
    elif len(sys.argv) > 1:
        try:
            obey_prob = float(sys.argv[1])
            step_cost = 0
        except ValueError:
            print("Bad arguments. Usage: python " + sys.argv[0] +
                  " obey_prob(float) step_cost(float)")
            sys.exit()
    else:
        step_cost = 0
        obey_prob = 1.0

    # build the grid with the requested obey_prob and step_cost
    # (a negative step cost penalizes every move and should encourage a shorter path to the goal)
    grid = standard_grid(obey_prob=obey_prob, step_cost=step_cost)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    values = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, values)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(values, grid)
    print("optimal policy:")
Example #14
def calculate_greedy_policy(grid, V):
  policy = initialize_random_policy()
  # find a policy that leads to optimal value function
  for s in policy.keys():
    grid.set_state(s)
    # loop through all possible actions to find the best current action
    best_a, _ = best_action_value(grid, V, s)
    policy[s] = best_a
  return policy


if __name__ == '__main__':
  # this grid gives a reward of -0.4 for every non-terminal state
  # we want to see if this will encourage finding a shorter path to the goal
  grid = standard_grid(obey_prob=1, step_cost=-.4)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # calculate accurate values for each square
  V = calculate_values(grid)

  # calculate the optimum policy based on our values
  policy = calculate_greedy_policy(grid, V)

  # our goal here is to verify that we get the same answer as with policy iteration
  print("values:")
  print_values(V, grid)
  print("policy:")
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid gives a reward of -0.05 for every non-terminal state
    # (and actions are only obeyed with probability 0.8)
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=0.8, step_cost=-0.05)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
    # the value of the terminal state is 0 by definition
    # we should ignore the first state we encounter
    # and ignore the last G, which is meaningless since it doesn't correspond to any move
    if first:
      first = False
    else:
      states_and_returns.append((s, G))
    G = r + GAMMA*G
  states_and_returns.reverse() # we want it to be in order of state visited
  return states_and_returns
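
The backward loop above is just the recursive definition of the return: the terminal state's return is zero, and each earlier state's return is its next reward plus the discounted return that follows,

    G_T = 0, \qquad G_t = r_{t+1} + \gamma \, G_{t+1}

which is why the list is built in reverse and flipped at the end.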


if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # state -> action
  # found by policy_iteration_random on standard_grid
  # MC method won't get exactly this, but should be close
  # values:
  # ---------------------------
  #  0.43|  0.56|  0.72|  0.00|
  # ---------------------------
  #  0.33|  0.00|  0.21|  0.00|
  # ---------------------------
  #  0.25|  0.18|  0.11| -0.17|
Example #17
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid has no per-step cost, but the agent only obeys the chosen action with probability 0.8
    # we want to see how this stochasticity changes the values and the greedy policy
    grid = standard_grid(obey_prob=0.8, step_cost=None)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
Example #18
                                    reversed(states[:-1]),
                                    reversed(states[1:]), reversed(rewards)):
            new_value = self.state_values[s] + 1/(np.log(t+2)) * \
                        (r + self.discount_factor * self.state_values[s_prime] - self.state_values[s])
            deltas[i] = np.abs(new_value - self.state_values[s])
            self.state_values[s] = new_value

    def solve_prediction_problem(self, max_iter=10000):

        state_values = {}
        for t in tqdm(range(max_iter)):
            states, actions, rewards = self.play_game()
            self.update_state_value_function(states, rewards, t)
            if t % 1000 == 0:
                state_values[t] = copy.deepcopy(self.state_values)
        return state_values


if __name__ == "__main__":

    a = Agent(grid_world.standard_grid(), policy='random', discount_factor=1.0)
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)
    a = Agent(grid_world.standard_grid(), policy='win-from-start')
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)
Example #19
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a
    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas


if __name__ == '__main__':
    grid = standard_grid(obey_prob=1.0, step_cost=None)

    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
def main():
    env = standard_grid()
    theta = monte_carlo_evaluation(standard_grid, eps_win_policy)
    print(theta)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
def main():
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
    for s, r in reversed(states_and_rewards):
        print("State, Immediate Reward: {},{}".format(s, r))
        if first:
            first = False
        else:
            print("State, Future Reward: {},{}".format(s, G))
            states_and_returns.append((s, G))
        G = r + GAMMA*G
    states_and_returns.reverse()
    print(states_and_returns)
    return states_and_returns


if __name__ == '__main__':
    # create standard_grid
    grid = standard_grid()

    # print rewards
    print("Rewards:")
    print_values(grid.rewards, grid)

    # state-> action is the policy
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
Example #23
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid gives a reward of -2 for every non-terminal state, and actions are only obeyed with probability 0.5
    # we want to see if the large step cost still encourages a short path to the goal despite the noisy moves
    grid = standard_grid(obey_prob=0.5, step_cost=-2)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")