def play_game(grid: Grid, policy):
    # Reset the game to start at a random position. We need to do this
    # because, under the current deterministic policy, we would never
    # end up at certain states.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    states_and_rewards = [(state, 0)]  # List of tuples (state, reward)
    while not grid.is_game_over():
        action = policy[state]
        action = random_action(action)
        reward = grid.move(action)
        state = grid.current_state()
        states_and_rewards.append((state, reward))

    # Calculate the returns, G, by working backwards from the terminal state
    G = 0
    states_and_returns = []
    first = True
    for state, reward in reversed(states_and_rewards):
        # The value of the terminal state is 0, so we ignore it; we can
        # also ignore the final G.
        if first:
            first = False
        else:
            states_and_returns.append((state, G))
        G = reward + gamma * G
    states_and_returns.reverse()  # Restore the order in which states were visited
    return states_and_returns
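# A minimal usage sketch (an assumption, not part of the original listing):
# first-visit Monte Carlo prediction with the exploring-starts play_game
# above. V(s) is estimated as the sample mean of the returns observed for s.
# N_EPISODES, V, and the `returns` bookkeeping are hypothetical names; grid,
# policy, and np are assumed to be defined as in the surrounding code.
N_EPISODES = 10_000
V = {}
returns = {}  # state -> list of observed returns
for _ in range(N_EPISODES):
    seen = set()
    for s, G in play_game(grid, policy):
        if s not in seen:  # first-visit: count each state once per episode
            returns.setdefault(s, []).append(G)
            V[s] = np.mean(returns[s])
            seen.add(s)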
def play_game(grid: Grid, policy):
    # Start at the designated start state
    state = (2, 0)
    grid.set_state(state)
    action = random_action(policy[state])

    states_actions_rewards = [(state, action, 0)]
    while True:
        reward = grid.move(action)
        state = grid.current_state()
        if grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = random_action(policy[state])
            states_actions_rewards.append((state, action, reward))

    # Calculate the returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # The value of the terminal state is 0, so we ignore it; we can
        # also ignore the final G.
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma * G
    states_actions_returns.reverse()  # Restore the order in which states were visited
    return states_actions_returns
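# A minimal usage sketch (an assumption, not part of the original listing):
# first-visit Monte Carlo estimation of Q(s, a) by averaging the returns
# observed for each (state, action) pair. Q and `returns` are hypothetical
# names; grid, policy, and np are assumed to be defined as in the
# surrounding code. Terminal tuples with a None action never show up here,
# since the backward pass in play_game drops them.
N_EPISODES = 10_000
Q = {}
returns = {}  # (state, action) -> list of observed returns
for _ in range(N_EPISODES):
    seen = set()
    for s, a, G in play_game(grid, policy):
        sa = (s, a)
        if sa not in seen:  # first-visit: count each pair once per episode
            returns.setdefault(sa, []).append(G)
            Q[sa] = np.mean(returns[sa])
            seen.add(sa)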
def play_game(grid: Grid, policy: dict):
    # Returns a list of states and corresponding rewards (not returns, G)
    # Start at the designated start state
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]  # List of tuples (state, reward)
    while not grid.is_game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
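# A minimal usage sketch (an assumption, not from the original listing):
# because this version returns raw rewards rather than computed returns, it
# lends itself to an incremental, bootstrapped update such as TD(0). ALPHA
# and V are hypothetical names; gamma, grid, and policy are assumed to be
# defined as in the surrounding code.
ALPHA = 0.1
V = {}
for _ in range(10_000):
    states_and_rewards = play_game(grid, policy)
    for t in range(len(states_and_rewards) - 1):
        s, _ = states_and_rewards[t]
        s2, r = states_and_rewards[t + 1]
        # TD(0): move V(s) toward the bootstrapped target r + gamma * V(s')
        v_s = V.get(s, 0.0)
        V[s] = v_s + ALPHA * (r + gamma * V.get(s2, 0.0) - v_s)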
def play_game(grid: Grid, policy):
    # Reset the game to start at a random position. We need to do this
    # because, under the current deterministic policy, we would never
    # end up at certain states.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    action = np.random.choice(ALL_POSSIBLE_ACTIONS)  # First action is uniformly random
    states_actions_rewards = [(state, action, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        reward = grid.move(action)
        num_steps += 1
        state = grid.current_state()
        if state in seen_states:
            # Hack so we don't end up in an infinitely long episode
            # bumping into a wall
            reward = -10. / num_steps
            states_actions_rewards.append((state, None, reward))
            break
        elif grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = policy[state]
            states_actions_rewards.append((state, action, reward))
            seen_states.add(state)

    # Calculate the returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # The value of the terminal state is 0, so we ignore it; we can
        # also ignore the final G.
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma * G
    states_actions_returns.reverse()  # Restore the order in which states were visited
    return states_actions_returns
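# A minimal usage sketch (an assumption, not from the original listing):
# Monte Carlo control with exploring starts. Each episode updates Q by
# first-visit averaging, then the policy is made greedy with respect to Q.
# Q, returns, and max_dict are hypothetical names; grid, policy, and np are
# assumed to be defined as in the surrounding code.
def max_dict(d):
    # Return the (key, value) pair with the largest value
    return max(d.items(), key=lambda kv: kv[1])

Q = {}        # state -> {action: value}
returns = {}  # (state, action) -> list of observed returns
for _ in range(10_000):
    seen = set()
    for s, a, G in play_game(grid, policy):
        sa = (s, a)
        if sa not in seen:  # first-visit: count each pair once per episode
            returns.setdefault(sa, []).append(G)
            Q.setdefault(s, {})[a] = np.mean(returns[sa])
            seen.add(sa)
    # Policy improvement: act greedily with respect to the current Q
    for s in Q:
        policy[s] = max_dict(Q[s])[0]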