Example #1
import numpy as np

def monte_carlo(grid):
    """ 
    Functions runs games and updates the value function monte carlo style
    Args: grid: The grid world
    Return: Value gunction, Policy fuction, Delta list
    """
    #Create a random policy
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(CONST_ACTION_LST)

    #Init the Q(s,a) function and the returns lists
    Q = {}
    returns = {}
    for s in grid.non_terminal_states():
        Q[s] = {}
        for a in CONST_ACTION_LST:
            Q[s][a] = 0
            returns[(s, a)] = []

    # log the biggest change of the Q values in each episode
    deltas = []
    # run the Monte Carlo approximation for the specified number of episodes
    for t in range(CONST_N_EPISODES):
        if t % 1000 == 0:
            print(t)

        biggest_change = 0
        states_actions_returns = play_a_game(grid, policy)

        # calculate Q(s,a)
        seen_state_action_pairs = set()
        for s, a, G in states_actions_returns:
            # skip (state, action) pairs we have already seen (first-visit instead of every-visit)
            sa = (s, a)
            if sa not in seen_state_action_pairs:
                returns[sa].append(G)
                old_q = Q[s][a]
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # calculate new policy p[s] = argmax[a]{Q(s,a)}
        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a

    # calculate values for each state (just to print and compare)
    # V(s) = max[a]{ Q(s,a) }
    V = {}
    for s in policy.keys():
        # max_dict was given by utils and is basically argmax for a python dict
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas
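
The policy-improvement and value-extraction steps above lean on a max_dict utility, described in the code comment as an argmax over a Python dict. A minimal sketch of such a helper (an assumption about the utility, not the original code) could look like this:

# Hypothetical sketch of the max_dict helper assumed above:
# it returns the (key, value) pair with the largest value in a dict.
def max_dict(d):
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

With a helper like this, a, _ = max_dict(Q[s]) picks the greedy action and max_dict(Q[s])[1] gives the corresponding state value.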
def monte_carlo(grid):
  # initialize a random policy
  policy = {}
  for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

  # initialize Q(s,a) and returns
  Q = {}
  returns = {} # dictionary of (state, action) -> list of returns we've received
  states = grid.non_terminal_states()
  for s in states:
    Q[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
      Q[s][a] = 0
      returns[(s,a)] = []
  
  # keep track of how much our Q values change each episode so we can know when it converges
  deltas = []
  # repeat for the number of episodes specified (enough that it converges)
  for t in range(N_EPISODES):
    if t % 1000 == 0:
      print(t)

    # generate an episode using the current policy
    biggest_change = 0
    states_actions_returns = play_game(grid, policy)

    # calculate Q(s,a)
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
      # check if we have already seen this (s, a) pair
      # (first-visit Monte Carlo, rather than every-visit)
      sa = (s, a)
      if sa not in seen_state_action_pairs:
        returns[sa].append(G)
        old_q = Q[s][a]
        # the new Q[s][a] is the sample mean of all our returns for that (state, action)
        Q[s][a] = np.mean(returns[sa])
        biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
        seen_state_action_pairs.add(sa)
    deltas.append(biggest_change)

    # calculate new policy pi(s) = argmax[a]{ Q(s,a) }
    for s in policy.keys():
      a, _ = max_dict(Q[s])
      policy[s] = a
  
  # calculate values for each state (just to print and compare)
  # V(s) = max[a]{ Q(s,a) }
  V = {}
  for s in policy.keys():
    V[s] = max_dict(Q[s])[1]
  
  return V, policy, deltas
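
Both versions consume states_actions_returns as (state, action, return) triples produced by play_a_game / play_game under the current policy. That helper is not shown here; a hedged sketch of just the return computation, assuming one finished episode is available as (state, action, reward) steps (with the reward following the action) and an assumed discount factor GAMMA, might be:

# Hypothetical helper: convert one episode of (state, action, reward) steps
# into the (state, action, G) triples that monte_carlo consumes.
# GAMMA is an assumed discount factor, not taken from the source.
GAMMA = 0.9

def returns_from_episode(steps):
    G = 0
    states_actions_returns = []
    # walk the episode backwards so G accumulates the discounted future reward
    for s, a, r in reversed(steps):
        G = r + GAMMA * G
        states_actions_returns.append((s, a, G))
    states_actions_returns.reverse()
    return states_actions_returns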
def monte_carlo(grid):
    # start from a random policy
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initialize Q(s,a) and the per-(state, action) return lists
    Q = {}
    returns = {}
    states = grid.non_terminal_states()
    for s in states:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a] = 0
            returns[(s, a)] = []

    # track the largest Q-value change seen in each episode
    deltas = []
    for t in range(N_EPISODES):
        if t % 1000 == 0:
            print(t)

        biggest_change = 0
        states_actions_returns = play_game(grid, policy)

        # first-visit Monte Carlo: only the first occurrence of each (s, a) counts
        seen_state_action_pairs = set()

        for s, a, G in states_actions_returns:
            sa = (s, a)
            if sa not in seen_state_action_pairs:
                returns[sa].append(G)
                old_q = Q[s][a]
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # policy improvement: pi(s) = argmax_a Q(s, a)
        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a
    # V(s) = max_a Q(s, a), extracted just for printing and comparison
    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas
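
As a usage sketch, calling any of these functions could look like the following; the standard_grid factory, action list, and episode count are assumptions standing in for whatever grid-world utilities the original code imports:

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set
N_EPISODES = 10000                           # assumed episode budget

if __name__ == '__main__':
    grid = standard_grid()  # assumed grid-world factory from the same utils
    V, policy, deltas = monte_carlo(grid)
    print("final values:", V)
    print("final policy:", policy)
    # deltas shrinking toward zero is a rough sign the Q-values have converged
    print("last few Q-value changes:", deltas[-5:])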