    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

V = {}
returns = {}  # dictionary of state -> list of returns we've received
states = grid.all_states()
for s in states:
    if s in grid.actions:
        # non-terminal state: we will collect sampled returns here
        returns[s] = []
    else:
        # terminal state: its value is 0 by definition
        V[s] = 0

# first-visit Monte Carlo: play episodes and average the return that
# follows the first visit to each state
for t in range(100):
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
        if s not in seen_states:
            returns[s].append(G)
            V[s] = np.mean(returns[s])
            seen_states.add(s)

print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)
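The loop above relies on a helper play_game(grid, policy) that plays one episode and returns a list of (state, return) pairs. If that helper is defined elsewhere, that version is the one that matters; the sketch below only illustrates what it is assumed to do. The grid methods it calls (set_state, current_state, move, game_over), the random start state, and the discount factor GAMMA are assumptions, not part of the code shown here.

import numpy as np

GAMMA = 0.9  # discount factor; the actual value used is an assumption

def play_game(grid, policy):
    # Start each episode from a randomly chosen non-terminal state so that
    # every state is eventually visited under the fixed, deterministic policy.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    states_and_rewards = [(s, 0)]  # no reward for simply starting in s
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))

    # Work backwards so each state is paired with the return that followed it:
    # G_t = r_{t+1} + GAMMA * G_{t+1}
    G = 0
    first = True
    states_and_returns = []
    for s, r in reversed(states_and_rewards):
        if first:
            first = False  # terminal state: its value is 0, record no return
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()
    return states_and_returns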
for s in states:
    if len(actions[states.index(s)]) != 0:
        # Choose randomly one of the allowed next positions
        random_index = np.random.choice(np.arange(len(actions[states.index(s)])))
        policy_list.append(actions[states.index(s)][random_index])
    else:
        # Terminal or unreachable positions have no further action
        policy_list.append(' ')

# Create a dictionary, keys: position, value: next position
policy = dict(zip(states, policy_list))

print("The initial random policy is:")
print_policy(policy, grid)  # Print the initial policy
print("")

#######################################
### initialize the values V(s) randomly ####
V = {}
for s in states:
    if len(actions[states.index(s)]) != 0:
        # Non-terminal, reachable position: random initial value
        V[s] = np.random.random()
    else:
        # Terminal and unreachable positions have no further action, so value 0
        V[s] = 0

print("The values are initialized randomly, terminal and unreachable positions have value 0:")
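Both loops above index a separate actions list with states.index(s), so they assume states and actions are parallel lists prepared earlier: entry i of actions holds the allowed next positions from the position stored at entry i of states, with an empty list for terminal or unreachable positions, and policy_list starts out empty. A small illustrative setup is sketched below; the positions and moves shown are assumptions, not the actual grid.

# Parallel lists assumed by the loops above; the contents are illustrative only.
states = [(0, 0), (0, 1), (0, 2), (0, 3)]
actions = [
    [(0, 1)],          # from (0, 0) the only allowed next position is (0, 1)
    [(0, 0), (0, 2)],  # from (0, 1) you may step left or right
    [(0, 1), (0, 3)],  # from (0, 2) you may step left or right
    [],                # (0, 3) is terminal: no further action
]
policy_list = []  # initialized empty before the policy loop above fills it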