def __init__(self):
    self.actions = ["up", "down", "left", "right"]
    self.num_actions = len(self.actions)
    self.grid_world = GridWorld()

    # initial state values
    self.state_values = {}
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            self.state_values[(i, j)] = 0  # set initial value to 0

    # map each state to a flat index
    self.state_indices = {}
    k = 0
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            self.state_indices[(i, j)] = k
            k += 1

    self.num_states = len(self.state_values)
    self.state_values_vec = np.zeros((self.num_states))
    self.rewards = np.zeros((self.num_states))
    self.state_transition_prob = np.zeros((self.num_states, self.num_actions, self.num_states))
    #self.get_observation_by_random(20000)
    self.discount = 0.99
class LevelGrid(Level):
    """ level with grid """

    def __init__(self, screen, screen_size, mic=None):
        from grid_world import GridWorld

        # parent class init
        super().__init__(screen, screen_size)

        # new vars
        self.mic = mic

        # create gridworld
        self.grid_world = GridWorld(self.screen_size, self.color_bag, self.mic)

        # setup
        self.setup_level()

        # append interactable
        #self.interactables.append(self.grid_world)
        self.interactable_dict.update({'grid_world': self.grid_world})

        # sprites
        self.all_sprites.add(self.grid_world.wall_sprites, self.grid_world.move_wall_sprites)

    def setup_level(self):
        """ setup level """

        # set walls
        self.setup_wall_edge()

        # create walls
        self.grid_world.create_walls()

    def setup_wall_edge(self):
        """ limit edges """

        # set walls
        self.grid_world.wall_grid[:, 0] = 1
        self.grid_world.wall_grid[:, -1] = 1
        self.grid_world.wall_grid[0, :] = 1
        self.grid_world.wall_grid[-1, :] = 1
def __init__(self):
    self.states = []  # record position and action taken at the position
    self.actions = ["up", "down", "left", "right"]
    self.grid_world = GridWorld()
    self.isEnd = self.grid_world.isEnd
    self.lr = 0.2
    self.exp_rate = 0.3
    self.decay_gamma = 0.9

    # initial Q values
    self.Q_values = {}
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            self.Q_values[(i, j)] = {}
            for a in self.actions:
                self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict
def run_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=True)
    print('# of states: {}'.format(len(world.all_states)))
    # uncomment this after the transition matrix has been saved
    #transitions = GridWorld.read_transition_matrix_file('simple_grid_t_matrix.csv')
    transitions = world.get_transition_matrix(save_to='simple_grid_t_matrix.csv')
    reward = world.get_reward_matrix()
    run_value_iteration_grid_world(world, transitions, reward)
    run_policy_iteration_grid_world(world, transitions, reward)
    compare_different_gamma_policies(world, transitions, reward)
    get_graphs_and_time_stats_grid_world_mdp(world, transitions)
    find_converged_policy(world, transitions)
    run_q_learning_grid_world()
    get_graph_q_learning()
def objectDist(self, start, obj):
    """ Return cost of going to some object """
    # Generate a grid that only cares about
    # getting to the input 'obj'
    objectGrid = copy.deepcopy(self.grid)
    objValue = objectGrid.objects[obj]
    objectGrid.objects.clear()
    objectGrid.objects[obj] = objValue

    # Simulate GridWorld where only goal is obj
    objectWorld = GridWorld(objectGrid, [10])
    startCoord = self.grid.objects[start]

    # Count num. of steps to get to obj from start, that is the distance
    dist = objectWorld.simulate(objectWorld.coordToScalar(startCoord))
    return dist
def __init__(self, screen, screen_size, mic=None):
    from grid_world import GridWorld

    # parent class init
    super().__init__(screen, screen_size)

    # new vars
    self.mic = mic

    # create gridworld
    self.grid_world = GridWorld(self.screen_size, self.color_bag, self.mic)

    # setup
    self.setup_level()

    # append interactable
    self.interactables.append(self.grid_world)

    # sprites
    self.all_sprites.add(self.grid_world.wall_sprites, self.grid_world.move_wall_sprites)
class IterativePolicyEvaluation(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S
        # each action is equally likely under the uniform random policy being evaluated
        self.dynamic_p = 1.0 / len(self.actions)

    def value_step(self):
        diff = 0
        for s in self.S:
            new_v = 0
            old_v = self.V[s]
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                if self.grid.game_over(s_new):
                    # terminal successor: no future value
                    new_v += self.dynamic_p * r
                else:
                    new_v += self.dynamic_p * (r + gamma * self.V[s_new])
            self.V[s] = new_v
            diff = max(diff, np.abs(old_v - new_v))
        return diff

    def policy_evaluation(self):
        self.initialize_V()
        while True:
            diff = self.value_step()
            if diff < delta:
                self.print_values()
                return None

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")
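# Hypothetical usage sketch (not from the original source). It assumes the module-level
# names the class relies on (GridWorld, rewards, actions, gamma, delta, death, goal, np)
# are already defined as in the snippet above.
if __name__ == "__main__":
    ipe = IterativePolicyEvaluation()
    # iterates value_step() until the largest value update falls below delta,
    # then prints the resulting state-value table
    ipe.policy_evaluation()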
def buildBiasEngine(self):
    """ Simulates the GridWorlds necessary to conduct inference. """
    # Builds/solves gridworld for each objectGrid, generating policies for
    # each object in grid. One for going only to A, another just for B, etc.
    for i in range(len(self.objectGrids)):
        simsBuffer = list()
        for j in range(len(self.objectGrids[0])):
            simsBuffer.append(
                GridWorld(self.objectGrids[i][j], [10], self.discount, self.tau, self.epsilon))
        self.sims.append(simsBuffer)
def test_gridworld_q_learning():
    np.random.seed(0)

    N = 5
    goal_pos = np.array([[N-1, N-1]])
    human_pos = np.array([[N-1, 0]])
    human_radius = 2

    grid = np.ones((N, N), dtype=float) * -1
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=0.8,
        render=True,
    )

    mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo, env.transition)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)
    env.close()
def test_gridworld_value_iteration():
    np.random.seed(0)

    N = 10
    goal_pos = np.array([[N-1, N-1], [N-1, N-2]])
    human_pos = np.array([[N//2, N//2], [N-1, 0]])
    human_radius = 3

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=True,
    )

    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = EpsGreedyPolicy(env.action_space(), mdp_algo)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plot_policy(policy, (N, N), "Policy", values=V, cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)
    env.close()
def __init__(self):
    self.actions = ["up", "down", "left", "right"]
    self.num_actions = len(self.actions)
    self.grid_world = GridWorld()

    # initial state values
    self.state_values = {}
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            self.state_values[(i, j)] = 0  # set initial value to 0

    # map each state to a flat index
    self.state_indices = {}
    k = 0
    for i in range(BOARD_ROWS):
        for j in range(BOARD_COLS):
            self.state_indices[(i, j)] = k
            k += 1

    self.num_states = len(self.state_values)
    self.state_values_vec = np.zeros((self.num_states))
    self.rewards = np.zeros((self.num_states))
    self.state_transition_prob = np.zeros((self.num_states, self.num_actions, self.num_states))

    # build the reward vector and the stochastic transition model:
    # the chosen action succeeds with probability 0.8 and slips to each
    # perpendicular direction with probability 0.1
    for state in self.state_values.keys():
        self.rewards[self.state_indices[state]] = self.giveReward(state)
        for action in self.actions:
            if action == "up":
                action_probs = zip(["up", "left", "right"], [0.8, 0.1, 0.1])
            if action == "down":
                action_probs = zip(["down", "left", "right"], [0.8, 0.1, 0.1])
            if action == "left":
                action_probs = zip(["left", "up", "down"], [0.8, 0.1, 0.1])
            if action == "right":
                action_probs = zip(["right", "up", "down"], [0.8, 0.1, 0.1])
            for a, p in action_probs:
                nxtState = self.nxtPosition(state, a)
                # probability mass is recorded under the chosen action,
                # at the state reached by the actual (possibly slipped) move
                self.state_transition_prob[self.state_indices[state],
                                           self.actions.index(action),
                                           self.state_indices[nxtState]] += p

    self.discount = 0.99
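# A minimal value-iteration sketch (not part of the original snippet) showing one way the
# arrays built in __init__ above could be used. It assumes self.rewards,
# self.state_transition_prob and self.discount exist exactly as constructed there; the
# method name and tolerance parameters are hypothetical.
def value_iteration(self, tol=1e-6, max_iters=10000):
    V = np.zeros(self.num_states)
    for _ in range(max_iters):
        # Q[s, a] = r(s) + gamma * sum_s' T[s, a, s'] * V[s']
        Q = self.rewards[:, None] + self.discount * self.state_transition_prob.dot(V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            V = V_new
            break
        V = V_new
    self.state_values_vec = V
    return V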
def main():
    logging.basicConfig(level=logging.INFO)

    action_probs, special_nodes, grid_dims, start_pos, default_r = configure_world_options(
        option=WORLD_OPTION)
    grid_world = GridWorld(start_pos=start_pos, action_probs=action_probs,
                           grid_dims=grid_dims, default_reward=default_r,
                           special_nodes=special_nodes, heuristic=True)
    # grid_world.print_info()
    grid_world.visualize_heuristic(store_path=STORE_PATH)

    # UNINFORMED SEARCH
    # Breadth-First Search
    breadth_first_agent = BreadthlyCooper(world=grid_world, debug=True)
    breadth_first_agent.solve()
    breadth_first_agent.visualize(store_path=STORE_PATH)
    # Branch-And-Bound Search
    branch_and_bound_agent = BreadthlyCooper(world=grid_world, debug=True, b_bound=True)
    branch_and_bound_agent.solve()
    branch_and_bound_agent.visualize(store_path=STORE_PATH)
    # Depth-First Search
    depth_first_agent = JohnnyDeppth(world=grid_world, debug=True)
    depth_first_agent.solve()
    depth_first_agent.visualize(store_path=STORE_PATH)

    # INFORMED SEARCH
    # Greedy-Best-First Search
    best_first_agent = AStarIsClimbing(world=grid_world, debug=True, alg=1)
    best_first_agent.solve()
    best_first_agent.visualize(store_path=STORE_PATH)
    # Hill-Climbing Search
    hill_climbing_agent = AStarIsClimbing(world=grid_world, debug=True, alg=2)
    hill_climbing_agent.solve()
    hill_climbing_agent.visualize(store_path=STORE_PATH)
    # A* Search
    a_star_agent = AStarIsClimbing(world=grid_world, debug=True)
    a_star_agent.solve()
    a_star_agent.visualize(store_path=STORE_PATH)

    # ITERATIVE PLANNING ALGORITHMS
    q_iter_agent = QIteration(world=grid_world, debug=True, gamma=GAMMA, error=ERROR)
    q_iter_agent.solve()
    q_iter_agent.visualize(store_path=STORE_PATH)
    v_iter_agent = ValueIteration(world=grid_world, debug=True, gamma=GAMMA, error=ERROR)
    v_iter_agent.solve()
    v_iter_agent.visualize(store_path=STORE_PATH)
    p_iter_agent = PolicyIteration(world=grid_world, debug=True, gamma=GAMMA, error=ERROR,
                                   policy_shift_max=10)
    p_iter_agent.solve()
    p_iter_agent.visualize(store_path=STORE_PATH)

    # LEARNING ALGORITHMS
    exp_start_mc = OnMonteCarlo(world=grid_world, debug=False, gamma=0.9, epsilon=0.3,
                                num_episodes=1000, explore_starts=False, max_steps=100)
    exp_start_mc.solve()
    exp_start_mc.visualize(store_path=STORE_PATH)
    exp_start_mc = OffMonteCarlo(world=grid_world, debug=False, gamma=0.9, epsilon=0.3,
                                 num_episodes=1000, max_steps=100)
    exp_start_mc.solve()
    exp_start_mc.visualize(store_path=STORE_PATH)
    qq7 = QLearning(world=grid_world, debug=True, gamma=0.9, epsilon=0.3,
                    num_episodes=1000, step=ALPHA)
    qq7.solve()
    qq7.visualize(store_path=STORE_PATH)
    sarsa = SARSA(world=grid_world, debug=False, gamma=0.9, epsilon=0.3,
                  num_episodes=1000, expected=False, step=ALPHA)
    sarsa.solve()
    sarsa.visualize(store_path=STORE_PATH)
    e_sarsa = SARSA(world=grid_world, debug=False, gamma=0.9, epsilon=0.3,
                    num_episodes=1000, expected=True, step=ALPHA)
    e_sarsa.solve()
    e_sarsa.visualize(store_path=STORE_PATH)
import warnings
warnings.filterwarnings("ignore")

ITER = 30
EPISODES = 2000
REWARD_MATRIX = []
FIDELITY_MATRIX = []
ENV_LIST = []
AGENT_LIST = []

for i in range(ITER):
    reward_list = []
    fidelity_list = []
    env = GridWorld(shape=(3, 5), reward=(100, 0, 200), num_of_w=1, num_of_p=1,
                    num_of_r=1, num_of_steps=10)
    Agent = DoubleQNetworkAgent(
        n_obs=env.state_space - 1,
        n_action=env.action_space,
        units_layer=(8, 8),
        learning_rate=0.0025,
        name='dqn_' + str(i),
        gamma=0.9,
        buffer_size=500,
        batch_size=10,
        target_change_step=10,
        min_buffer_size=100,
        load_path=None,
        save_path=None
    )
    for episode in range(EPISODES):
from grid_world import GridWorld

game = GridWorld(size=4, mode='static')
print(game.display())
game.makeMove('d')
game.makeMove('d')
game.makeMove('l')
print(game.display())
print(game.reward())
print(']')

CORRECT_ACTION_PROB = 1.0  # probability of correctly executing the chosen action
GAMMA = 1.0  # discount factor
#CORRECT_ACTION_PROB = 0.8  # probability of correctly executing the chosen action
#GAMMA = 0.98  # discount factor

np.random.seed(0)

dimensions = (6, 6)
num_obstacles = 6
goal_state = (5, 5)

# Instantiating the grid world
grid_world = GridWorld(dimensions, num_obstacles, goal_state, CORRECT_ACTION_PROB, GAMMA)

# Testing policy evaluation
print(
    'Evaluating random policy, except for the goal state, where policy always executes stop:'
)
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
class PolicyIteration(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S
        # each action is equally likely under the equiprobable policy used in value_step
        self.dynamic_p = 1.0 / len(self.actions)

    def initialize_P(self):
        P = {}
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                if (i, j) not in [death, goal]:
                    P[(i, j)] = np.random.choice(self.actions)
        self.P = P

    def value_step(self):
        diff = 0
        for s in self.S:
            new_v = 0
            old_v = self.V[s]
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                new_v += self.dynamic_p * (r + gamma * self.V[s_new])
            self.V[s] = new_v
            diff = max(diff, np.abs(old_v - new_v))
        return diff

    def policy_evaluation(self):
        while True:
            diff = self.value_step()
            if diff < delta:
                return None

    def improvement_step(self):
        policy_stable = True
        for s in self.S:
            old_action = self.P[s]
            action_values = []
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                new_v = r + gamma * self.V[s_new]
                action_values.append(new_v)
            idx = np.argmax(action_values)
            new_action = self.actions[idx]
            self.P[s] = new_action
            if old_action != new_action:
                policy_stable = False
        return policy_stable

    def policy_iteration(self):
        self.initialize_V()
        self.initialize_P()
        print("Initial Policy:")
        self.print_policy()
        while True:
            self.policy_evaluation()
            if self.improvement_step():
                print("Final Policy:")
                self.print_policy()
                print("Final Values:")
                self.print_values()
                return None

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")

    def print_policy(self):
        for i in range(self.grid.rows):
            print("---------------------------")
            for j in range(self.grid.cols):
                a = self.P.get((i, j), '#')
                print(" %s |" % a, end="")
            print("")
        print("------------------------")
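# Hypothetical usage sketch (not from the original source). It assumes the same module-level
# names as the class above (GridWorld, rewards, actions, gamma, delta, death, goal, np).
if __name__ == "__main__":
    np.random.seed(0)
    pi = PolicyIteration()
    # prints the initial random policy, then the final policy and values once improvement_step
    # reports that the policy is stable
    pi.policy_iteration()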
def test_feature_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 15
    init_pos = np.zeros((N, N), dtype=float)
    for i, j in product(range(N // 2 + 2, N), range(N // 2 + 2, N)):
        init_pos[i, j] = i**2 + j**2
    init_pos /= np.sum(init_pos)
    goal_pos = np.array([[n, 0] for n in range(N)])

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_feature_boundary_reward(
        grid,
        boundary_axis=0,
        boundary_value=N // 2,
        reward=-10,
        exp_constant=0.2,
    )

    plot_grid_map(init_pos.T, "Initial Position Distribution", cmap=plt.cm.Blues)
    plot_grid_map(grid.T, "Reward (Ground Truth)", cmap=plt.cm.Reds)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=init_pos,
        goal_pos=goal_pos,
        reward_grid=grid,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo, env.transition)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy, env=env,
                                   num_trajectories=20, maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # IRL feature map
    feature_map = [env._feature_map(s) for s in range(env.observation_space().n)]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", cmap=plt.cm.Blues)
    plt.show()
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes / 500
    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time, numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        #time.sleep(1)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world, 0.5, 0.05, gamma,
                    on_update=on_update,
                    on_episode=on_episode,
                    initializer=initialize_toward_goal,
                    start_at_0=True,
                    alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
def costly_walk_builder():
    rewards = {(i, j): -0.5 for i in range(4) for j in range(3)}
    rewards[(3, 2)] = 1.
    rewards[(3, 1)] = -1.
    env = GridWorld(rewards)
    return env
from grid_world import GridWorld

gw = GridWorld(num_rows=4, num_columns=4)
gw.current_state = 14
print(gw.step('RIGHT'))
def main():
    # Test world
    WORLD_OPTION = 2
    USE_DYNAMIC_DEFAULT = True
    STORE_PATH = '/Users/djordje/ML/personal/RL/rl_projects/grid_world_playground/experimentation/test/'

    # Planning ground truth params
    ERROR = 0.00001

    # Learning Agent Params
    GAMMA = 0.9
    EPSILON = 0.3
    ALPHA = 0.01
    DECAY = False
    NUM_EP = 10000
    MAX_STEPS = 1000

    # Debug Params
    DEBUG_FREQ = 100
    STATES_OF_INTEREST = {(5, 0), (5, 2), (3, 2), (3, 3), (3, 4), (3, 5), (4, 5), (0, 2), (1, 3)}

    # Agent tags: ['Q', 'SARSA', 'ESARSA', 'MC-ES', 'MC-Soft', 'MC-ES-Soft']
    AGENTS_TO_TEST = ['Q', 'ESARSA', 'MC-ES', 'MC-ES-Soft']

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    action_probs, special_nodes, grid_dims, start_pos, default_r = configure_world_options(
        option=WORLD_OPTION)
    if USE_DYNAMIC_DEFAULT is True:
        default_r = -1 / MAX_STEPS
    grid_world = GridWorld(start_pos=start_pos, action_probs=action_probs,
                           grid_dims=grid_dims, default_reward=default_r,
                           special_nodes=special_nodes, heuristic=True)

    ground_truth_u, ground_truth_ret = obtain_optimal_values(
        world=grid_world, error=ERROR, gamma=GAMMA, store_path=STORE_PATH,
        states_of_interest=STATES_OF_INTEREST)
    ground_truth_u_plt = plot_ready_utils(optimal_utilities=ground_truth_u,
                                          debug_freq=DEBUG_FREQ, num_episodes=NUM_EP)
    ground_truth_ret_plt = plot_ready_returns(optimal_returns=ground_truth_ret,
                                              debug_freq=DEBUG_FREQ, num_episodes=NUM_EP)

    comparison_utilities = {}
    comparison_returns = {}
    for agent_tag in AGENTS_TO_TEST:
        utility_arg = {}
        agent = get_agent(tag=agent_tag, world=grid_world, gamma=GAMMA,
                          max_steps=MAX_STEPS, num_ep=NUM_EP, epsilon=EPSILON,
                          alpha=ALPHA, decay=DECAY, debug_freq=DEBUG_FREQ)
        logger.info('%s: Running.\n', agent_tag)
        agent.solve()
        if STORE_PATH is not None:
            agent.visualize(store_path=STORE_PATH)
        utilities = agent.export_utility_history(state_subset=STATES_OF_INTEREST)
        returns = agent.export_returns_history()
        detailed_tag = agent.get_model_name()
        comparison_utilities[agent_tag] = utilities
        comparison_returns[agent_tag] = returns
        utility_arg[detailed_tag] = utilities
        logger.info('%s Visualizing utility convergence.\n', agent_tag)
        visualize_utilities(ground_truth_dict=ground_truth_u_plt,
                            utility_history_dicts=utility_arg,
                            store_path=STORE_PATH,
                            tag=detailed_tag + '_state_plt')

    logger.info('Visualizing utility convergence comparison.\n')
    visualize_utilities(ground_truth_dict=ground_truth_u_plt,
                        utility_history_dicts=comparison_utilities,
                        store_path=STORE_PATH,
                        tag='agent_utility_comparison')
    logger.info('Visualizing returns convergence comparison.\n')
    visualize_returns(ground_truth=ground_truth_ret_plt,
                      returns_dict=comparison_returns,
                      store_path=STORE_PATH,
                      tag='agent_returns_comparison')
        print()
    print(line)


def show_policy(V):
    line = '-' * 10
    print(line)
    for i in range(3):
        for j in range(4):
            val = V.get((i, j), " ")
            print(val, ' ', end='')
        print()
    print(line)


grid_world = GridWorld.default_game()
states = grid_world.all_states()

# All possible actions
ALL_ACTIONS = ['U', 'D', 'L', 'R']

# creates the initial value function
V = {
    state: random.random() if not grid_world.is_terminal(state) else 0
    for state in states
}

# creates the initial policy
policy = {state: random.choice(ALL_ACTIONS) for state in grid_world._actions}
def test_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 10
    goal_pos = np.array([[N - 1, N - 1]])
    human_pos = np.array([[3, 3]])
    human_radius = 2

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    # mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    # policy = GreedyPolicy(env.action_space(), mdp_algo)
    # policy = EpsGreedyPolicy(env.action_space(), mdp_algo, epsilon=0.1)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo, env.transition)

    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    R = env.reward.reshape((N, N)).T
    plot_grid_map(R, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy, env=env,
                                   num_trajectories=200, maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # feature map
    feature_map = [env._feature_map(s) for s in range(env.observation_space().n)]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()
class TestGridWorld(unittest.TestCase):
    def setUp(self):
        height = 6
        width = 5
        self.height = height
        self.width = width
        self.grid_world = GridWorld(width, height, (0, 0), (width - 1, height - 1))

    def test_grid(self):
        a = [[False for x in range(self.width)] for x in range(self.height)]
        self.assertEqual(self.grid_world.grid, a)

    def test_get_neighbors(self):
        a = [(1, 0), (0, 1)]
        self.assertEqual(self.grid_world.get_neighbors(0, 0), a)
        a = [(1, 0), (2, 1), (1, 2), (0, 1)]
        self.assertEqual(self.grid_world.get_neighbors(1, 1), a)

    def test_build_on_endpoint(self):
        endpoint = self.grid_world.endpoint
        self.assertFalse(self.grid_world.build_tower(endpoint[0], endpoint[1]))

    def test_build_out_of_bounds(self):
        self.assertFalse(self.grid_world.build_tower(0, self.height + 1))
        self.assertFalse(self.grid_world.build_tower(self.width + 1, 0))

    def test_build_on_spawnpoint(self):
        spawnpoint = self.grid_world.spawnpoint
        self.assertFalse(self.grid_world.build_tower(spawnpoint[0], spawnpoint[1]))

    def test_build_on_tower(self):
        self.assertTrue(self.grid_world.build_tower(2, 1))
        self.assertFalse(self.grid_world.build_tower(2, 1))

    def test_block_path1(self):
        # try to build around the endpoint
        endpoint = self.grid_world.endpoint
        self.assertTrue(self.grid_world.build_tower(endpoint[0] - 1, endpoint[1]))
        self.assertTrue(self.grid_world.build_tower(endpoint[0] - 1, endpoint[1] - 1))
        self.assertFalse(self.grid_world.build_tower(endpoint[0], endpoint[1] - 1))

    def test_block_path2(self):
        # try to build through the center
        self.assertTrue(self.grid_world.build_tower(0, 1))
        self.assertTrue(self.grid_world.build_tower(1, 1))
        self.assertTrue(self.grid_world.build_tower(2, 1))
        self.assertTrue(self.grid_world.build_tower(3, 1))
        self.assertFalse(self.grid_world.build_tower(4, 1))

    def test_best_path(self):
        self.grid_world.build_tower(0, 1)
        self.grid_world.build_tower(1, 1)
        self.grid_world.build_tower(2, 1)
        self.grid_world.build_tower(3, 1)
        self.grid_world.build_tower(4, 3)
        self.grid_world.build_tower(3, 3)
        self.grid_world.build_tower(2, 3)
        self.grid_world.build_tower(1, 3)
        self.grid_world.build_tower(0, 5)
        self.grid_world.build_tower(1, 5)
        self.grid_world.build_tower(2, 5)
        self.grid_world.build_tower(3, 5)
        bestPath = {(1, 2): (0, 2), (3, 2): (2, 2), (0, 0): (1, 0), (2, 0): (3, 0),
                    (4, 5): None, (4, 1): (4, 2), (4, 4): (4, 5), (2, 2): (1, 2),
                    (1, 4): (2, 4), (0, 2): (0, 3), (3, 0): (4, 0), (0, 4): (1, 4),
                    (1, 0): (2, 0), (4, 2): (3, 2), (0, 3): (0, 4), (3, 4): (4, 4),
                    (2, 4): (3, 4), (4, 0): (4, 1)}
        self.assertEqual(self.grid_world.get_path(self.grid_world.grid), bestPath)

    def test_diagonal_movement(self):
        self.assertTrue(self.grid_world.build_tower(0, 1))
        self.assertFalse(self.grid_world.build_tower(1, 0))
def main(cfg):
    pygame.init()
    # create the font
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")
    done = False
    clock = pygame.time.Clock()

    # initialize the grid world environment
    grid_env = GridWorld()
    ini_state = grid_env.start_pos  # initial state (the agent's start position)

    # Q-learning agent
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)

    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1
    rewards = []  # stores rewards for evaluation
    is_end_episode = False  # has the agent reached the goal?
    step = 0
    # time.sleep(30)

    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward for one episode
        step = 0
        while (is_end_episode is False and step < max_step):  # keep going until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)

            screen.fill(BLACK)
            # draw the grid world
            draw_grid_world(grid_env.map, screen)
            # create a Surface with the rendered text
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            # draw the text
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1

            # redraw
            pygame.display.flip()

        rewards.append(np.sum(episode_reward))  # record the total reward for this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the initial position
        is_end_episode = False
        print("step:", step)

        agents = [agent]
        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)

    pygame.quit()
def takeAction(self, action):
    position = self.grid_world.nxtPosition(action)
    # update GridWorld
    return GridWorld(state=position)
def __init__(self):
    self.grid = GridWorld()
    self.rewards = rewards
    self.actions = actions
def reset(self):
    self.states = []
    self.grid_world = GridWorld()
    self.isEnd = self.grid_world.isEnd
def basic_env_builder():
    rewards = {(i, j): 0.0 for i in range(4) for j in range(3)}
    rewards[(3, 2)] = 1.
    rewards[(3, 1)] = -1.
    env = GridWorld(rewards)
    return env
def setUp(self):
    height = 6
    width = 5
    self.height = height
    self.width = width
    self.grid_world = GridWorld(width, height, (0, 0), (width - 1, height - 1))
from grid_world import GridWorld
from numpy import Infinity
from utils import *
import random

grid_world = GridWorld.painful_game()
states = grid_world.all_states()

# creates the initial Q function
# state: value q, number of visits.
Q = {}
for state in states:
    if not grid_world.is_terminal(state):
        for action in grid_world._actions[state]:
            Q.update({(state, action): [0, 0]})

policy = {}
for state in grid_world._actions:
    action = random.choice(ALL_ACTIONS)
    while action not in grid_world._actions[state]:
        action = random.choice(ALL_ACTIONS)
    policy.update({state: action})

gamma = 0.9
show_policy(policy)

'''
The main loops of this process; they are stopped later
class Agent:
    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict

    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        # print("current pos: {}, greedy action: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of the game, back propagate the reward
            if self.grid_world.isEnd:
                # back propagate
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append([(self.grid_world.state), action])
                print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, it reaches the next state
                self.grid_world = self.takeAction(action)
                # mark is end
                self.grid_world.isEndFunc()
                print("nxt state", self.grid_world.state)
                print("---------------------")
                self.isEnd = self.grid_world.isEnd
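# Hypothetical usage sketch (not from the original source). It assumes the module-level
# BOARD_ROWS, BOARD_COLS, np, and the GridWorld class this Agent relies on are defined.
if __name__ == "__main__":
    agent = Agent()
    agent.play(rounds=50)   # run 50 episodes of the backup loop above
    print(agent.Q_values)   # inspect the learned Q table (state -> action -> value)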