# NOTE(review): this chunk begins mid-function — the enclosing `def` (an
# episode-playing / returns-computation routine, presumably `play_game`) is not
# visible here, and the leading statements are presumably inside a step loop
# whose header was also lost. Indentation below is reconstructed; confirm
# against the original file. The chunk is truncated again at the end, inside
# the `policy` dict literal.

    # Follow the policy when the current position has an entry; otherwise
    # fall back to a uniformly random legal action (exploring behavior).
    action_to_take = policy[grid.pos] if grid.pos in policy else np.random.choice(grid.possible_actions())
    reward = grid.player_take_action(action_to_take)
    # Record (state, reward) pairs for the episode trace.
    state_and_rewards.append((grid.pos, reward))

    # Walk the episode backwards accumulating discounted returns:
    # G_t = r_{t+1} + discount * G_{t+1}.
    # NOTE(review): `alpha` is used here as the discount factor — the
    # conventional name is gamma; `alpha` usually denotes a learning rate.
    # Verify this matches the rest of the project.
    G = 0
    states_and_returns = []
    for (state, reward) in reversed(state_and_rewards):
        G = reward + alpha * G
        states_and_returns.append((state, G))
    # Re-reverse into chronological order; `[1:]` appears to drop the first
    # entry — presumably because the terminal/start bookkeeping pairs a state
    # with the reward of *entering* it. TODO confirm the off-by-one intent.
    return list(reversed(states_and_returns))[1:]


if __name__ == "__main__":
    # Stochastic gridworld: the chosen action succeeds with probability 0.8.
    grid = standard_grid(success_prob=0.8)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V
    print("Initialize steps")
    alpha = 0.9
    # NOTE(review): 10e-4 == 1e-3, not 1e-4 — possibly a typo for 1e-4.
    epsilon = 10e-4
    iterations = 0
    V = {state: 0 for state in states}
    # Visit counts start at 1 (presumably to avoid division by zero when
    # averaging returns — TODO confirm against the averaging code below).
    counts = {state: 1 for state in states}
    # Fixed policy mapping state -> Action(name, row_delta, col_delta).
    # NOTE(review): the source is truncated here — the dict literal is never
    # closed in this chunk.
    policy = {
        (2, 0): Action("U", -1, 0),
        (1, 0): Action("U", -1, 0),
        (0, 0): Action("R", 0, 1),
        (0, 1): Action("R", 0, 1),
# Policy iteration on a stochastic gridworld (0.5 success probability,
# -0.1 step reward).
# NOTE(review): this chunk is truncated at the end, inside the inner
# policy-evaluation loop — indentation below is reconstructed from a
# whitespace-mangled source.
import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
# NOTE(review): star import shadows/duplicates the explicit `standard_grid`
# import above; kept as-is since `print_values`/`print_policy` come from it.
from rl.gridworld.utils import *

if __name__ == "__main__":
    # `standard_grid(0.5, -0.1)` — presumably (success_prob, normal_reward);
    # TODO confirm the positional parameter order against `standard_grid`.
    grid = standard_grid(0.5, -0.1)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V, policy
    print("Initialize steps")
    # `alpha` acts as the discount factor here (conventionally gamma).
    alpha = 0.9
    # NOTE(review): 10e-4 == 1e-3, not 1e-4 — possibly a typo for 1e-4.
    epsilon = 10e-4
    V = {state: 0 for state in states}

    # Start from a uniformly random policy over each state's legal actions.
    policy = {}
    for state in possible_states:
        grid.set_player_position(state)
        policy[state] = np.random.choice(grid.possible_actions())
    print_values(V, grid)
    print_policy(policy, grid)

    # repeat until policy converged
    while True:
        # policy evaluation
        # NOTE(review): source truncated here — the evaluation loop body is
        # not visible in this chunk.
        while True:
# Policy iteration on the default (deterministic) gridworld, with an
# iteration counter. Near-duplicate of the sibling policy-iteration script;
# NOTE(review): this chunk is truncated right after the `# policy evaluation`
# comment — indentation below is reconstructed from a whitespace-mangled
# source.
import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
# NOTE(review): star import duplicates the explicit `standard_grid` import;
# kept because `print_values`/`print_policy` come from it.
from rl.gridworld.utils import *

if __name__ == "__main__":
    # Default grid — presumably deterministic transitions; TODO confirm
    # `standard_grid`'s defaults.
    grid = standard_grid()
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V, policy
    print("Initialize steps")
    # `alpha` acts as the discount factor here (conventionally gamma).
    alpha = 0.9
    # NOTE(review): 10e-4 == 1e-3, not 1e-4 — possibly a typo for 1e-4.
    epsilon = 10e-4
    iterations = 0
    V = {state: 0 for state in states}

    # Start from a uniformly random policy over each state's legal actions.
    policy = {}
    for state in possible_states:
        grid.set_player_position(state)
        policy[state] = np.random.choice(grid.possible_actions())
    print_values(V, grid)
    print_policy(policy, grid)

    # repeat until policy converged
    while True:
        iterations += 1
        # policy evaluation
        # NOTE(review): source truncated here — the evaluation loop is not
        # visible in this chunk.
# Value iteration on a gridworld with a -0.1 per-step reward.
# NOTE(review): this chunk is truncated inside the action loop — the Bellman
# backup that presumably follows is not visible. Indentation below is
# reconstructed from a whitespace-mangled source.
import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
# NOTE(review): star import duplicates the explicit `standard_grid` import;
# kept because `print_values` comes from it.
from rl.gridworld.utils import *

if __name__ == "__main__":
    grid = standard_grid(normal_reward=-0.1)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V
    print("Initialize steps")
    # `alpha` acts as the discount factor here (conventionally gamma).
    alpha = 0.9
    # NOTE(review): 10e-4 == 1e-3, not 1e-4 — possibly a typo for 1e-4.
    epsilon = 10e-4
    iterations = 0
    V = {state: 0 for state in states}
    print_values(V, grid)

    # repeat until policy converged
    # value iteration
    while True:
        iterations += 1
        # Track the largest value change this sweep (presumably compared to
        # `epsilon` as the convergence test — TODO confirm, loop is cut off).
        biggest_change = -math.inf
        for state in possible_states:
            old_v = V[state]
            grid.set_player_position(state)
            # Maximize over actions: V(s) = max_a E[r + alpha * V(s')].
            new_v = -math.inf
            for action in grid.possible_actions():
                # Reset position before evaluating each action, since taking
                # an action presumably mutates the grid's player position.
                # NOTE(review): source truncated here.
                grid.set_player_position(state)