def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--grid_size', nargs='?', const=1, type=int, default=8)
    parser.add_argument('--macro_block', nargs='?', const=1, type=int, default=2)
    parser.add_argument('--gamma', nargs='?', const=1, type=float, default=0.90)
    args = parser.parse_args()

    start_time = time.time()
    env = GridWorld(args.grid_size, args.macro_block, args.gamma)
    env_time = time.time() - start_time
    print("Time to create environment: {}".format(env_time))

    start_time = time.time()
    policy, v = value_iteration_sparse(env)
    value_iteration_time = time.time() - start_time
    print("Time to find optimal policy: {}".format(value_iteration_time))

    start_time = time.time()
    weights_inv, policy_inv = irl(env, policy)
    irl_time = time.time() - start_time
    print("Time to solve IRL problem: {}".format(irl_time))

    print("Displaying comparison between optimal policy and IRL policy:")
    env.draw(policy, policy_inv, weights_inv)
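# Usage sketch (not in the original file): assuming this main() lives in a
# standalone script next to the project's GridWorld, value_iteration_sparse,
# and irl definitions, it would also need these stdlib imports and an entry point.
import argparse
import time

if __name__ == '__main__':
    main()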
def Q_learning(env: GridWorld, epsilon: float, lr: float, initQ: np.ndarray,
               converge=False) -> (np.ndarray, float):
    """
    Performs Q-learning for a single episode in the environment and returns the
    updated Q table.

    :param env: GridWorld subclass
    :param epsilon: Exploitation rate
    :param lr: Learning rate
    :param initQ: Q table to update
    :param converge: Flag to determine if the delta of Q-values needs to be tracked
        for convergence within a set bound.
    :return: Updated Q table after a single episode of training and the maximum
        change for any Q-value
    """
    # keep track of maximum state-action value change
    delta = 0.0
    state = env.reset()
    done = False
    while not done:
        # explore the action space with probability 1 - epsilon
        if np.random.uniform(0, 1) > epsilon:
            action = env.sample()
        # otherwise exploit the current Q estimates
        else:
            action = np.argmax(initQ[state])
        # take step in env
        obs, r, done = env.step(action)
        # Q-learning update of the state-action value
        prev_value = initQ[state, action]
        new_value = prev_value + lr * (r + env.gamma * np.max(initQ[obs]) - prev_value)
        initQ[state, action] = new_value
        # update state
        state = obs
        if converge:
            delta = max(delta, np.abs(new_value - prev_value))
    return initQ, delta
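# Hypothetical driver loop (not in the original file) showing one way to call
# Q_learning episode by episode until the largest Q-value change falls below a
# tolerance. The Q-table shape, the env.n_states / env.n_actions attributes, and
# the greedy policy extraction are assumptions, not the repo's API.
import numpy as np

def train_until_converged(env, epsilon=0.9, lr=0.1, tol=1e-4, max_episodes=10_000):
    Q = np.zeros((env.n_states, env.n_actions))  # assumed attribute names
    for _ in range(max_episodes):
        Q, delta = Q_learning(env, epsilon, lr, Q, converge=True)
        if delta < tol:
            break
    # Greedy policy: pick the highest-valued action in each state.
    return np.argmax(Q, axis=1), Q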
def control(self):
    map_names = ['map{}'.format(i) for i in range(1, 21)]
    train_combos = list(product(range(10), range(10)))
    test_combos = train_combos
    env = PORGBEnv(
        ComboEnv(
            GridWorld(map_names,
                      num_obj_types=5,
                      train_combos=train_combos,
                      test_combos=test_combos,
                      window=1,
                      seed=0)))
    control(env)
    env.render()
def testNeighbors(self):
    """
    Check that 1, 5, and 9 have the correct neighbors

                B B B B B
    1 2 3       B 1 2 3 B
    4 5 6   =>  B 4 5 6 B
    7 8 9       B 7 8 9 B
                B B B B B
    """
    gridworld = GridWorld()

    # Check 1, index is 0
    grid_1 = gridworld.grids[gridworld.map_states(0)]
    grid_1_neighbors = list(grid_1.neighbors.values())
    grid_1_neighbors_ids = [grid.id for grid in grid_1_neighbors]
    results = list(set(grid_1_neighbors_ids) - set([None]))
    results.sort()
    target = [2, 4]
    self.assertEqual(results, target)

    # Check 5, index is 4
    grid_5 = gridworld.grids[gridworld.map_states(4)]
    grid_5_neighbors = list(grid_5.neighbors.values())
    grid_5_neighbors_ids = [grid.id for grid in grid_5_neighbors]
    results = list(set(grid_5_neighbors_ids) - set([None]))
    results.sort()
    target = [2, 4, 6, 8]
    self.assertEqual(results, target)

    # Check 9, index is 8
    grid_9 = gridworld.grids[gridworld.map_states(8)]
    grid_9_neighbors = list(grid_9.neighbors.values())
    grid_9_neighbors_ids = [grid.id for grid in grid_9_neighbors]
    results = list(set(grid_9_neighbors_ids) - set([None]))
    results.sort()
    target = [6, 8]
    self.assertEqual(results, target)
def main(arguments):
    parser = create_argparser({
        "alpha": {
            "default": 0.1
        },
        "--use_ep_func": {
            "dest": "use_ep_func",
            "action": "store_true",
            "default": True
        }
    })
    args = parser.parse_args(arguments)
    grid_world = GridWorld(default_grid, args.p1, args.p2)

    default_args = {"epsilon": 0.1, "discount_factor": 0.9}
    for arg in default_args:
        if arg not in args:
            setattr(args, arg, default_args[arg])

    run_dict = {}
    num_episodes = args.num_episodes
    globals()['args'] = args
    num_runs = 3 if args.AVERAGE_RUNS else 1

    for i in range(num_runs):
        start_time = time.time()
        q_s_a, q_s_a2 = initialize(grid_world)
        if not args.use_ep_func:
            _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = double_q(
                grid_world, q_s_a, q_s_a2, args.epsilon, num_episodes=num_episodes)
        else:
            _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = double_q(
                grid_world, q_s_a, q_s_a2, epsilon_func, num_episodes=num_episodes)
        total_time = time.time() - start_time
        run_dict[i] = {
            "Episode Length": ep_length_log,
            "Time Per Episode": time_log,
            "Total Time": total_time,
            "Average Time Log": avg_time_log,
            "Average Ep Length": avg_ep_length_log
        }
        print("\nTook {}s to finish {} episodes".format(total_time, num_episodes))

    average_ep_lengths = np.average(np.array(
        [run_dict[key]["Episode Length"] for key in run_dict]), axis=0)
    average_ep_time = np.average(np.array(
        [run_dict[key]["Time Per Episode"] for key in run_dict]), axis=0)
    average_time = np.average(np.array(
        [run_dict[key]["Total Time"] for key in run_dict]), axis=0)
    average_avg_time_log = np.average(np.array(
        [run_dict[key]["Average Time Log"] for key in run_dict]), axis=0)
    average_avg_ep_length = np.average(np.array(
        [run_dict[key]["Average Ep Length"] for key in run_dict]), axis=0)

    output_deterministic_policy(q_s_a, q_s_a2, grid_world)
    return average_ep_lengths, average_ep_time, average_time, average_avg_time_log, average_avg_ep_length
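# Hypothetical epsilon schedule (not in the original file): when --use_ep_func is
# set, double_q is passed epsilon_func, so presumably it accepts a callable. The
# episode-indexed linear decay below is an assumption, not the repo's definition.
def epsilon_func(episode, start=1.0, end=0.05, decay_episodes=1000):
    # Linearly anneal exploration from `start` down to `end` over `decay_episodes`.
    frac = min(episode / decay_episodes, 1.0)
    return start + frac * (end - start)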
def main():
    grid_size = 10
    grid_world = GridWorld(grid_size, num_obstacles=20, stochastic_cell_ratio=0.1)

    params = {}
    params['type'] = 'value_iteration'
    params['grid_size'] = grid_size
    params['rewards'] = grid_world.rewards
    params['transition_matrix'] = grid_world.transition_matrix
    params['step_func'] = GridWorld.deterministic_step
    params['discount'] = 0.9
    agent = AgentFactory.create_agent(params)

    episode_ended = False
    while True:
        grid_world.get_user_input()
        grid_world.draw_with_state_values(
            agent.v, policy=agent.pi if grid_world.render_policy else None)
        if not grid_world.pause:
            if episode_ended:
                grid_world.restart_episode()
                grid_world.draw_black_screen()
                episode_ended = False
            else:
                agent.do_job()
                if agent.ready_to_play():
                    action = agent.get_action(grid_world.pos)
                    episode_ended, _, _ = grid_world.step(action)
        grid_world.tick_tock()
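# Rough sketch (not in the original file) of the Bellman backup a value-iteration
# agent could run inside do_job(). The array shapes assumed here,
# transition_matrix[s, a, s'] and rewards[s'], are illustrative assumptions and
# not necessarily what AgentFactory's agent uses.
import numpy as np

def value_iteration_sweep(v, transition_matrix, rewards, discount):
    # One synchronous sweep: v'(s) = max_a sum_s' P(s'|s,a) * (r(s') + discount * v(s'))
    q = transition_matrix @ (rewards + discount * v)  # (S, A, S') @ (S',) -> (S, A)
    return q.max(axis=1)

# e.g. repeat v = value_iteration_sweep(v, P, R, 0.9) until the change in v is small.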
from helper import create_argparser
from env import GridWorld
import random
import math
from copy import copy
import time

# Creating the command line parser
parser = create_argparser()
args = parser.parse_args()
grid_world = GridWorld(args.p1, args.p2, args.r_up, args.r_down, args.r_left,
                       args.r_right, grid_world_size=4, starting_state=8)

'''
Policy Evaluation
'''


def policy_evaluation(v_s, pi_s, grid_world):
    # Policy evaluation: sweep the state values until the largest update falls
    # below the threshold args.theta.
    delta = math.inf
    while delta > args.theta:
        delta = 0
        for i, s in enumerate(v_s):
            if i == grid_world.terminal_state:
print(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--gamma", type=float, default=1.0)
    parser.add_argument("--alpha", type=float, default=0.0001)
    parser.add_argument("--map_size", type=int, default=4)
    parser.add_argument("--num_ep", type=int, default=50000)
    parser.add_argument("--method", type=to_str, default='mc')
    args = parser.parse_args()

    # Set hyper-parameters
    gamma = args.gamma
    alpha = args.alpha
    map_size = args.map_size
    num_ep = args.num_ep  # number of episodes to run
    method = args.method

    env = GridWorld()
    agent = Agent()
    data = np.zeros((map_size, map_size))

    if method == 'mc':
        mc(data, gamma, alpha, num_ep)
    elif method == 'td':
        print(data, gamma, alpha, num_ep, method)
        td(data, gamma, alpha, num_ep)
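# Hypothetical TD(0) loop (not in the original file) illustrating what a
# td(data, gamma, alpha, num_ep) routine could compute over this grid of state
# values. The env/agent interface used below (reset() returning coordinates,
# step() returning ((x, y), reward, done), select_action()) is an assumption
# about the surrounding GridWorld/Agent classes, not their actual API.
def td_sketch(data, gamma, alpha, num_ep, env, agent):
    for _ in range(num_ep):
        x, y = env.reset()  # assumed to return the start coordinates
        done = False
        while not done:
            action = agent.select_action()  # assumed random behaviour policy
            (nx, ny), reward, done = env.step(action)
            # One-step TD target r + gamma * V(s'), moved toward with step size alpha
            data[x][y] = data[x][y] + alpha * (reward + gamma * data[nx][ny] - data[x][y])
            x, y = nx, ny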
def main(arguments):
    parser = create_argparser()
    args = parser.parse_args(arguments)
    grid_world = GridWorld(default_grid, args.p1, args.p2)

    default_args = {"epsilon": 0.1, "discount_factor": 0.9}
    # For nice syntax
    for arg in default_args:
        if arg not in args:
            setattr(args, arg, default_args[arg])

    num_episodes = args.num_episodes
    run_dict = {}
    # injecting into global scope
    globals()['args'] = args
    num_runs = 3 if args.AVERAGE_RUNS else 1

    for i in range(num_runs):
        start_time = time.time()
        pi, q_s_a, returns = initialize(grid_world)
        _, _, ep_length_log, time_log, avg_ep_length_log, avg_time_log = gpi(
            grid_world, pi, q_s_a, returns, num_episodes=num_episodes)
        total_time = time.time() - start_time
        run_dict[i] = {
            "Episode Length": ep_length_log,
            "Time Per Episode": time_log,
            "Total Time": total_time,
            "Average Time Log": avg_time_log,
            "Average Ep Length": avg_ep_length_log
        }
        print("\nTook {}s to finish {} episodes".format(total_time, num_episodes))

    average_ep_lengths = np.average(np.array(
        [run_dict[key]["Episode Length"] for key in run_dict]), axis=0)
    average_ep_time = np.average(np.array(
        [run_dict[key]["Time Per Episode"] for key in run_dict]), axis=0)
    average_time = np.average(np.array(
        [run_dict[key]["Total Time"] for key in run_dict]), axis=0)
    average_avg_time_log = np.average(np.array(
        [run_dict[key]["Average Time Log"] for key in run_dict]), axis=0)
    average_avg_ep_length = np.average(np.array(
        [run_dict[key]["Average Ep Length"] for key in run_dict]), axis=0)

    res = [
        average_ep_lengths, average_ep_time, average_time,
        average_avg_time_log, average_avg_ep_length
    ]
    graph_names = [
        "Episode Length", "Time Per Episode", "Total Time in Seconds",
        "Time Per Episode (Moving Average 10 ep)",
        "Episode Length (Moving Average 10 ep)"
    ]
    y_axis_names = [
        "Episode Length in Steps", "Time Per Episode in Seconds",
        "Total Time in Seconds", "Time Per Episode in Seconds",
        "Episode Length in Steps"
    ]

    # outputting policy
    output_deterministic_policy(pi, grid_world)

    for i in [0, 1]:
        t = np.linspace(1, num_episodes, num=num_episodes)[0::10]
        plt.plot(t, res[i][0::10], label="mc")
        plt.title(graph_names[i])
        plt.xlabel("Episode Number")
        plt.ylabel(y_axis_names[i])
        plt.legend()
        plt.savefig(graph_names[i] + "_mc" + ".jpg")
        plt.close()

    for i in [-2, -1]:
        # np.linspace requires an integer sample count, so use floor division
        t = np.linspace(1, num_episodes, num=num_episodes // 10)
        plt.plot(t, res[i], label="mc")
        plt.title(graph_names[i])
        plt.xlabel("Episode Number")
        plt.ylabel(y_axis_names[i])
        plt.legend()
        plt.savefig(graph_names[i] + "_mc" + ".jpg")
        plt.close()

    return average_ep_lengths, average_ep_time, average_time, average_avg_time_log, average_avg_ep_length
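# Hypothetical helper (not in the original file) for the 10-episode moving-average
# logs plotted above; the repo's gpi() presumably computes avg_ep_length_log and
# avg_time_log itself, so this only illustrates one way such a log could be built.
import numpy as np

def moving_average(values, window=10):
    values = np.asarray(values, dtype=float)
    # Average each consecutive block of `window` episodes, yielding len(values) // window points.
    return values[: len(values) // window * window].reshape(-1, window).mean(axis=1)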
def testCreate(self):
    gridworld = GridWorld()
    results = gridworld.get_pretty()
    target = "bbbbb\nbaaab\nbaaab\nbaaab\nbbbbb\n"
    self.assertEqual(results, target)
def testMap_states(self):
    gridworld = GridWorld(size=(3, 4))
    start = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    target = [6, 7, 8, 11, 12, 13, 16, 17, 18, 21, 22, 23]
    results = list(map(gridworld.map_states, start))
    self.assertEqual(results, target)