def main(problem_id, map_name_base):
    # Random agent derived from lochlomond_demo.py provided by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow.
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    reward_hole = 0.0
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    max_episodes = 10000
    max_iter_per_episode = 1000

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        map_name_base=map_name_base, reward_hole=reward_hole)
    env.action_space.sample()
    print(env.desc)
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
    np.random.seed(12)
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for e in range(max_episodes):
        observation = env.reset()
        for iter in range(max_iter_per_episode):
            action = env.action_space.sample()  # the random agent: sample an action uniformly
            observation, reward, done, info = env.step(action)
            # Collect useful stats for comparison and plotting
            stats.episode_rewards[e] += reward
            stats.episode_lengths[e] = iter
            if done and reward == reward_hole:
                print("We have reached a hole :-( [we can't move, so stop trying; just give up... and perhaps restart]")
                break
            if done and reward == +1.0:
                # env.render()
                print("We have reached the goal :-) [stop trying to move; we can't. That's ok, we have achieved the goal... perhaps try again?]")
                break
    return stats
def main(problemID, mapID):
    problem = int(problemID)
    rewardHole = -0.02
    stochastic = True
    trainingEpisodes = 35000
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    np.random.seed(12)
    successes = 0  # records the number of successes
    totalReward = 0
    stats = {"episodes": {}}

    # Set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=rewardHole)
    qTable = generate_q(env, trainingEpisodes, iterPerEpisode)
    print("___________________________________")
    print("Training Finished")
    print("Attempting to find solution...")

    for episode in range(episodes):
        # Initial parameters
        state = env.reset()
        step = 0
        done = False
        reward = 0
        for step in range(iterPerEpisode):
            action = np.argmax(qTable[state, :])  # take the best action
            nextState, reward, done, info = env.step(action)
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1
                totalReward += reward
                break
            state = nextState

    successRate = (successes / episodes) * 100
    print("___________________________________")
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))

    # Log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    stats["qtable"] = qTable
    return stats, qTable
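# Note: generate_q() is imported from elsewhere in this project and is not shown
# in this section. Below is a minimal illustrative sketch of such a trainer,
# assuming standard epsilon-greedy Q-learning; the hyperparameter defaults and
# the linear decay schedule are assumptions, not the project's actual values.
import numpy as np

def generate_q(env, training_episodes, iters_per_episode,
               alpha=0.1, gamma=0.96, epsilon=1.0, epsilon_min=0.01):
    """Hypothetical trainer: builds a Q-table via epsilon-greedy Q-learning."""
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    decay = (epsilon - epsilon_min) / training_episodes
    for _ in range(training_episodes):
        state = env.reset()
        for _ in range(iters_per_episode):
            # Explore with probability epsilon, otherwise exploit the Q-table
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(q_table[state, :]))
            next_state, reward, done, _ = env.step(action)
            # Standard one-step temporal-difference update
            q_table[state, action] += alpha * (
                reward + gamma * np.max(q_table[next_state, :]) - q_table[state, action])
            state = next_state
            if done:
                break
        epsilon = max(epsilon_min, epsilon - decay)
    return q_table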
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=-1.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        reward_hole=reward_hole)
    epsilon = 0.9
    lr_rate = 0.81
    gamma = 0.96
    epsilon_reduce = 1 / max_episodes
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        state = env.reset()
        print('-' * 50)
        print_headers()
        for iter in range(max_iters_per):
            action = choose_action(state, epsilon, Q, env)
            state2, reward, done, info = env.step(action)
            print(",".join([str(episode), str(iter), str(reward),
                            str(done), str(info), str(action)]))
            learn(state, state2, reward, action, Q, gamma, lr_rate)
            state = state2
            if done and reward == reward_hole:
                print('Found a hole in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': False})
                break
            if done:
                print('Found frisbee in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': True})
                break
        epsilon -= epsilon_reduce
    return results
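# choose_action() and learn() are helpers referenced above but not shown in this
# section. A minimal sketch of compatible implementations, assuming an
# epsilon-greedy policy and a one-step Q-learning update (an illustration, not
# the project's actual code):
import numpy as np

def choose_action(state, epsilon, Q, env):
    """Epsilon-greedy action selection matching the call in run() above."""
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()  # explore: random action
    return int(np.argmax(Q[state, :]))    # exploit: best known action

def learn(state, state2, reward, action, Q, gamma, lr_rate):
    """One-step temporal-difference (Q-learning) update, applied in place."""
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] += lr_rate * (target - Q[state, action])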
def generate_grids(cols):
    grids = []
    for i in range(cols):
        map_name_base = '{}x{}-base'.format(cols, cols)
        env = LochLomondEnv(problem_id=i, is_stochastic=True,
                            reward_hole=-0.02, map_name_base=map_name_base)
        env.render()
        grid = EnvMDP.to_decoded(env).reshape(env.nrow * env.ncol)
        grids.append(np.hstack(([i], grid)))
    return grids
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = False
    episodes = 100
    mapBase = mapID
    stats = {}

    # Start from a known seed
    np.random.seed(12)

    # Set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=reward_hole)
    state_space_locations, state_space_actions, state_initial_id, \
        state_goal_id = env2statespace(env)

    # Find and output the solution using A-star:
    # define the states and actions in a graph
    maze_map = search.UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations
    maze_problem = search.GraphProblem(state_initial_id, state_goal_id, maze_map)

    for episode in range(episodes):  # iterate over episodes
        env.reset()  # reset the state of the env to the starting state
        iterations, node = my_astar_search_graph(problem=maze_problem, h=None)

        # -- Trace the solution -- #
        solution_path = [node]
        cnode = node.parent
        solution_path.append(cnode)
        while cnode.state != state_initial_id:
            cnode = cnode.parent
            solution_path.append(cnode)

        print("----------------------------------------")
        print("Identified goal state:" + str(solution_path[0]))
        print("Solution trace:" + str(solution_path))
        print("Iterations:" + str(iterations))
        print("----------------------------------------")

        # Log stats
        stats["solutiontrace"] = str(solution_path)
        stats["numberofiterations"] = str(iterations)
    return stats
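# my_astar_search_graph() is not shown in this section; note the snippets
# disagree on its signature (the call above unpacks two values, later callers
# unpack three). A sketch in the AIMA best-first style, assuming it returns
# (iterations, all_node_colors, node); adjust the unpacking to your variant.
from search import Node
from utils import memoize, PriorityQueue  # AIMA utilities

def my_astar_search_graph(problem, h=None):
    """Illustrative A* over an AIMA GraphProblem; counts frontier pops."""
    h = memoize(h or problem.h, 'h')
    f = memoize(lambda n: n.path_cost + h(n), 'f')  # f(n) = g(n) + h(n)
    node = Node(problem.initial)
    frontier = PriorityQueue('min', f)
    frontier.append(node)
    explored = set()
    all_node_colors = []  # kept only for notebook-style visualisation hooks
    iterations = 0
    while frontier:
        node = frontier.pop()
        iterations += 1
        if problem.goal_test(node.state):
            return iterations, all_node_colors, node
        explored.add(node.state)
        for child in node.expand(problem):
            if child.state not in explored and child not in frontier:
                frontier.append(child)
            elif child in frontier and f(child) < frontier[child]:
                del frontier[child]  # found a cheaper path to this state
                frontier.append(child)
    return iterations, all_node_colors, None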
def test_env_2_transitions(self):
    env = LochLomondEnv(problem_id=1, is_stochastic=True,
                        reward_hole=-0.2, map_name_base="4x4-base")
    mdp = EnvMDP(env)
    transitions = EnvMDP.to_transitions(env)
    # transitions[current_pos][action] = [(prob, newstate)]

    # Moving left should...
    ## move to the bottom with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][0][2][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][0][2][1], (0, 1))
    ## stay with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][0][1][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][0][1][1], (0, 0))
    ## stay with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][0][0][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][0][0][1], (0, 0))

    # Moving down should...
    ## stay with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][1][0][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][1][0][1], (0, 0))
    ## move to the bottom with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][1][1][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][1][1][1], (0, 1))
    ## move to the right with 0.333 prob
    self.assertAlmostEqual(transitions[(0, 0)][1][2][0], 0.333, places=3)
    self.assertEqual(transitions[(0, 0)][1][2][1], (1, 0))
def test_env_2_init(self):
    env = LochLomondEnv(problem_id=1, is_stochastic=True,
                        reward_hole=-0.2, map_name_base="4x4-base")
    mdp = EnvMDP(env)
    initial = EnvMDP.to_position(env, letter=b'S')
    self.assertEqual((1, 0), initial[0])
def environment():
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(sys.argv[1])  # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0              # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True           # should be False for A-star (deterministic search) and True for the RL agent
    max_episodes = 2000            # you can rerun the problem many times, generating many episodes... you can learn from them all!
    max_iter_per_episode = 500     # you decide how many iterations/actions can be executed per episode
    observation_list = list()
    reward_list = list()

    # Generate the specific problem (pass the flag rather than hard-coding it)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    # Let's visualize the problem/env
    print('env', env.desc)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    return max_episodes, env, max_iter_per_episode, observation_list, reward_list
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = True
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    successes = 0  # records the number of successes
    stats = {"episodes": {}}
    totalReward = 0  # reward per episode

    # Set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=reward_hole)
    np.random.seed(12)

    for episode in range(episodes):  # iterate over episodes
        print("___________________________________")
        print("EPISODE: " + str(episode))
        observation = env.reset()  # reset the state of the env to the starting state
        reward = 0
        for step in range(iterPerEpisode):
            action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # Check if we are done and monitor rewards etc...
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1  # count goal-reaching episodes so the success rate is meaningful
                totalReward += reward
                break

    successRate = (successes / episodes) * 100
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))

    # Log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    return stats
def __init__(self, problem_id, map_name_base="8x8-base"): # map_name_base="4x4-base" if not (0 <= problem_id <= 7): raise ValueError("Problem ID must be 0 <= problem_id <= 7") self.map_name_base = map_name_base self.env = LochLomondEnv(problem_id=problem_id, is_stochastic=self.is_stochastic(), reward_hole=self.reward_hole(), map_name_base=map_name_base) self.problem_id = problem_id self.reset() self.out = 'out/' self.policy = {} self._train = [] self.graphs = {}
def test_env(self):
    env = LochLomondEnv(problem_id=0, is_stochastic=True,
                        reward_hole=-0.02, map_name_base="4x4-base")
    self.assertEqual(b'S', env.desc[0, 0])
    self.assertEqual(b'F', env.desc[0, 1])
    self.assertEqual(b'H', env.desc[1, 1])
    self.assertEqual(b'G', env.desc[3, 0])
def test_env_mdp(self):
    env = LochLomondEnv(problem_id=1, is_stochastic=True,
                        reward_hole=-0.2, map_name_base="4x4-base")
    mdp = EnvMDP(env)
    self.assertEqual(4, mdp.rows)
    self.assertEqual(4, mdp.cols)
    self.assertAlmostEqual(-0.2, mdp.grid[3][0])
    self.assertTrue((0, 1) in mdp.states)
    self.assertEqual((1, 1), mdp.terminals[0])
def test_env_2_grid(self):
    env = LochLomondEnv(problem_id=0, is_stochastic=True,
                        reward_hole=-0.2, map_name_base="4x4-base")
    mdp = EnvMDP(env)
    grid = EnvMDP.to_grid_matrix(env)
    self.assertEqual(0, grid[0, 0])
    self.assertEqual(0, grid[0, 1])
    self.assertEqual(-0.2, grid[1, 1])
    self.assertEqual(env.reward, grid[3, 0])
def train_for_one_problem(problem_id, map_name):
    # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)
    env.reset()
    done = False
    total_test_num = 60000
    restart_times = 0
    succeed_times = 0
    shortest_path = 100

    for i in range(total_test_num):
        restart_times += 1
        done = False
        n_actions_for_episode = 0
        while not done:
            n_actions_for_episode += 1
            action = env.action_space.sample()  # take a random action from the available actions
            observation, reward, done, info = env.step(action)
            if done:
                print("\rProblem:%s Episodes #%s / %s" % (problem_id, restart_times, total_test_num), end='')
                if reward == 1.0:
                    if shortest_path > n_actions_for_episode:
                        shortest_path = n_actions_for_episode
                    succeed_times += 1
                else:
                    env.reset()

    print("\nSucceed Times:", succeed_times)
    print("Total Times:", total_test_num)
    print("Shortest path:", shortest_path)
    one_map_succeed_percentage = succeed_times / total_test_num
    env.close()
    return one_map_succeed_percentage
def test_env_2_terminals(self):
    env = LochLomondEnv(problem_id=1, is_stochastic=True,
                        reward_hole=-0.2, map_name_base="4x4-base")
    mdp = EnvMDP(env)
    terminals = EnvMDP.to_position(env, letter=b'GH')
    self.assertEqual((1, 1), terminals[0])
    self.assertEqual((3, 1), terminals[1])
    self.assertEqual((3, 2), terminals[2])
    self.assertEqual((0, 3), terminals[3])
    self.assertEqual((1, 3), terminals[4])
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False,
                        reward_hole=reward_hole)
    statespace_locs, statespace_actions, statespace_init, statespace_goal = env2statespace(env)
    maze_problem = GraphProblem(statespace_init, statespace_goal,
                                UndirectedGraph(statespace_actions))
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        print('-' * 50)
        env.reset()
        func = memoize(maze_problem.h, 'func')  # priority uses the heuristic only
        frontier = PriorityQueue('min', func)
        node = Node(maze_problem.initial)
        frontier.append(node)
        seen = set()
        for iter in range(max_iters_per):
            node = frontier.pop()
            print(",".join([str(episode), str(iter), node.state]))
            if maze_problem.goal_test(node.state):
                print('done')
                results.append({'iters': iter, 'success': True})
                break
            seen.add(node.state)
            for possible in node.expand(maze_problem):
                if possible.state not in seen and possible not in frontier:
                    frontier.append(possible)
                elif possible in frontier:
                    if func(possible) < frontier[possible]:
                        del frontier[possible]
                        frontier.append(possible)
    return results
def search_for_one_solution(problem_id, map_name, plot_or_not):
    reward_hole = 0.0
    is_stochastic = False
    if map_name == '4x4-base':
        n_dim = 4
    else:
        n_dim = 8

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)
    env.reset()

    # Create a dict representation of the state space
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    # --------------SOLUTION-------------- #
    maze_map = UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations
    maze_problem = GraphProblem(state_initial_id, state_goal_id, maze_map)
    iterations, _, node = my_astar_search_graph(problem=maze_problem, h=None)

    # -------------Trace the solution----------------- #
    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    i = 0
    while cnode.state != state_initial_id:
        i += 1
        cnode = cnode.parent
        solution_path.append(cnode)

    # Parse the x/y coordinates out of the state ids in the solution trace
    solution = []
    solution_x = []
    solution_y = []
    for s in str(solution_path).split('_', -1):
        for s_s in str(s).split('>', -1):
            if s_s.isdigit():
                solution.append(s_s)
    for i in range(int(len(solution) / 2)):
        solution_y.append(int(solution[i * 2]))
        solution_x.append(int(solution[i * 2 + 1]))

    print("Steps:", i)
    print("Goal state:" + str(solution_path[0]))
    print("Final Solution:", solution_path[::-1])
    print("----------------------------------------")
    env.close()

    plt.cla()
    plt.plot(solution_x[::-1], solution_y[::-1])
    plt.scatter(solution_x[::-1], solution_y[::-1], s=120)
    plt.xlim(0, n_dim - 1)
    plt.ylim(n_dim - 1, 0)
    plt.grid(True)
    plt.title("Simple Agent Solution for Problem%s" % problem_id)
    plt.savefig('./Images/%sx%s maps: Simple Agent Solution for Problem%s.jpg'
                % (n_dim, n_dim, problem_id))
    print("Figure Saved in Folder 'Images'")
    if plot_or_not:
        plt.show()
def environment():
    # Set up the environment
    problem_id = int(sys.argv[1])
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False,
                        reward_hole=-0.01)
    total_episodes = 10000
    max_steps = 1000
    lr_rate = 0.80
    gamma = 0.96
    epsilon = 0.9
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    return env, problem_id, epsilon, total_episodes, max_steps, lr_rate, gamma, Q
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        reward_hole=reward_hole)
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()
        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([str(episode), str(iteration), str(reward),
                            str(done), str(info), str(action)]))
            if done and reward == reward_hole:
                env.render()
                print("Hole Found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break
            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break
    return results
""" Runs the LochLomondEnv problem using the random agent. Takes as input a command line argument which specifies the problem ID. """ import sys from uofgsocsai import LochLomondEnv from random_agent import RandomAgent, process_data_random from constants import (REWARD_HOLE_RANDOM, MAX_EPISODES, MAX_ITERS_PER_EPISODE, IS_STOCHASTIC_RANDOM) if len(sys.argv) == 2: PROBLEM_ID = int(sys.argv[1]) else: PROBLEM_ID = 0 env = LochLomondEnv(problem_id=PROBLEM_ID, is_stochastic=IS_STOCHASTIC_RANDOM, reward_hole=REWARD_HOLE_RANDOM) rand_agent = RandomAgent(env) process_data_random(env, rand_agent, MAX_EPISODES, MAX_ITERS_PER_EPISODE, REWARD_HOLE_RANDOM, PROBLEM_ID)
def main(problem_id, map_name_base):
    # Simple agent referenced and adapted from the Lab 4 notebook by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20).
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    reward_hole = -1.0
    is_stochastic = False
    max_episodes = 10000
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        map_name_base=map_name_base, reward_hole=reward_hole)
    env.action_space.sample()
    print(env.desc)

    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
    frozen_lake_map = UndirectedGraph(state_space_actions)
    frozen_lake_map.locations = state_space_locations
    frozen_lake_problem = GraphProblem(state_initial_id, state_goal_id, frozen_lake_map)

    all_node_colors = []
    iterations, all_node_colors, node = my_astar_search_graph(problem=frozen_lake_problem, h=None)

    # Trace the solution back from the goal to the start state
    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    while cnode.state != "S_00_00":
        cnode = cnode.parent
        if cnode is None:
            break
        solution_path.append(cnode)
    steps = solution_path[::-1]

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    observation = env.reset()  # reset the state of the env to the starting state
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.ones(max_episodes))

    for e in range(max_episodes):  # iterate over episodes
        observation = env.reset()  # reset the state of the env to the starting state
        for i in range(len(steps) - 1):
            action = get_action_from_states(steps[i], steps[i + 1])  # follow the precomputed A-star path
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # Update stats
            stats.episode_rewards[e] = reward
            stats.episode_lengths[e] = i
            # Check if we are done and monitor rewards etc...
            if done:
                print("We have reached the goal :-) [stop trying to move; we can't. That's ok, we have achieved the goal... perhaps try again?]")
                break
    return stats
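# get_action_from_states() is referenced above but not shown in this section.
# A sketch assuming the standard FrozenLake action encoding (0=left, 1=down,
# 2=right, 3=up, consistent with the transition tests earlier) and state ids of
# the form "S_row_col"; the id format is inferred and may differ in the project.
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

def get_action_from_states(node, next_node):
    """Derive the action that moves between two adjacent search nodes."""
    _, row, col = node.state.split('_')
    _, next_row, next_col = next_node.state.split('_')
    drow = int(next_row) - int(row)
    dcol = int(next_col) - int(col)
    if dcol == -1:
        return LEFT
    if dcol == 1:
        return RIGHT
    if drow == 1:
        return DOWN
    return UP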
def main(problem_id, map_name_base):
    # RL agent referenced from the Lab 8 and 9 notebooks provided by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow.
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    reward_hole = -0.05  # hole penalty set based on analysis to ensure the reward is maximised
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    np.random.seed(12)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name_base)
    states = env.observation_space.n
    actions = env.action_space.n
    Q = np.zeros((states, actions))
    max_episodes = 10000
    max_iter_per_episode = 1000
    alpha = 0.1    # learning rate
    gamma = 0.999  # discount rate
    epsilon = 1
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for episode in range(max_episodes):
        state = env.reset()
        for step in range(max_iter_per_episode):
            # Take the best action according to the Q-table if a random value is
            # greater than epsilon; otherwise take a random action
            random_value = random.uniform(0, 1)
            if random_value > epsilon:
                action = np.argmax(Q[state, :])  # exploit
            else:
                action = env.action_space.sample()  # explore
            new_state, reward, done, info = env.step(action)
            Q[state, action] = Q[state, action] + alpha * (
                reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
            stats.episode_rewards[episode] += reward
            stats.episode_lengths[episode] = step
            state = new_state
            if done:
                break
        epsilon = 0.01  # after the first episode, keep epsilon low to favour exploitation

    print(Q)
    return stats
def train_for_one_model(problem_id, map_name, train_or_not):
    # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = -0.01   # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent
    if map_name == '4x4-base':
        n_dim = 4
        num_episodes = 100000
    else:
        n_dim = 8
        num_episodes = 300000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)
    restart_times = 0
    n_actions_for_episode = 0
    rewards_all_episodes = []
    rewards_all_episodes_per_2000 = []
    x_axis_rewardsvsepisodes = []
    episode_steps = []
    max_steps_per_episode = 10000
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    learning_rate = 0.3
    discount = 0.5
    if problem_id == 0 and n_dim == 8:
        learning_rate = 0.2
        discount = 0.8
    if problem_id == 0 and n_dim == 4:
        learning_rate = 0.4
        discount = 0.7
    epsilon = 0.3  # initial exploration rate; decays per episode below, so don't reset it inside the step loop
    epsilon_min = 0.005
    epsilon_decay_rate = 0.99995
    shortest_path = 10000
    longest_path = 0
    avg_path = []

    if train_or_not:
        # --------------Training Process----------------- #
        for episode in range(num_episodes):
            restart_times += 1
            state = env.reset()
            done = False
            rewards_current_episode = 0
            path = [state]
            if restart_times % 5000 == 0:
                print("\ntraining in progress: #", restart_times)
            for step in range(max_steps_per_episode):
                n_actions_for_episode += 1
                # Exploration-exploitation trade-off
                exploration_exploitation_rate = random.uniform(0, 1)
                if exploration_exploitation_rate < epsilon or q_table[state, :].all() == 0:
                    action = env.action_space.sample()  # exploration: random action from the available actions
                else:
                    action = np.argmax(q_table[state, :] + np.random.randn(1, 4))  # exploitation: best action plus noise
                new_state, reward, done, info = env.step(action)
                path.append(new_state)
                # Update Q-table
                q_table[state, action] = q_table[state, action] + learning_rate * (
                    reward + discount * np.max(q_table[new_state, :]) - q_table[state, action])
                state = new_state
                # rewards_current_episode += reward
                if done and reward == 1:
                    print("\rEpisode #%s: Finish it within %d steps" % (restart_times, len(path)), end='')
                    break
                if done and reward == -0.01:
                    break
            # Epsilon decay
            if epsilon >= epsilon_min:
                epsilon *= epsilon_decay_rate
            # rewards_all_episodes.append(rewards_current_episode)
            # if restart_times % 2000 == 0:
            #     avg_reward_2000 = np.sum(rewards_all_episodes) / (2000 * (restart_times / 2000))
            #     rewards_all_episodes_per_2000.append(avg_reward_2000)
            #     x_axis_rewardsvsepisodes.append(2000 * (restart_times / 2000))

        # ---------------SAVE THE MODEL-------------------- #
        np.save('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id), q_table)

    # --------FINAL TEST----------- #
    if train_or_not:
        print("\nRunning Test for 50000 times. Please wait...")
    q_table = np.load('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id))  # load the trained Q-table
    env.reset()
    state = env.reset()
    test_total_num = 50000
    test_fail_num = 0
    test_succeed_num = 0
    Avg_rewards_per_1000_episodes = []
    Avg_reward_per_step = []
    Avg_reward_per_episode = []

    for k in range(test_total_num):
        s = env.reset()
        j = 0
        rewards_temp = 0
        while j < 1000:
            j += 1
            action = np.argmax(q_table[s, :])
            new_state, r, done, b = env.step(action)
            rewards_temp += r
            s = new_state
            if done and r == -0.01:
                test_fail_num += 1
                break
            if done and r == 1.0:
                avg_path.append(j)
                if shortest_path > j:
                    shortest_path = j
                if longest_path < j:
                    longest_path = j
                test_succeed_num += 1.0
                break
        if j == 1000:
            test_fail_num += 1
        Avg_reward_per_episode.append(rewards_temp)
        Avg_reward_per_step.append(rewards_temp / j)
        if k % 1000 == 0 and k > 0:  # guard k > 0 to avoid dividing by zero on the first episode
            Avg_rewards_per_1000_episodes.append(np.sum(Avg_reward_per_episode) / k)
            x_axis_rewardsvsepisodes.append(k)

    # --------------OUTPUT FINAL RESULT----------------- #
    if train_or_not:
        print("\n-------------------------------------------")
        print("Average rewards per 1000 episodes:", Avg_rewards_per_1000_episodes[-1])
        print("Average rewards per steps:", Avg_reward_per_step[-1])
        print("Success times:", test_succeed_num)
        print("Failure times:", test_fail_num)
        print("Success rate:", float(test_succeed_num / test_total_num))
        print("Success vs Failure rate:", float(test_succeed_num / test_fail_num))
        print("Steps number (Best case):", shortest_path)
        print("Steps number (Worst case):", longest_path)
        print("Steps number (On average):", np.mean(avg_path))
        print("Learning rate:", learning_rate)
    plt.cla()
    plt.plot(x_axis_rewardsvsepisodes[:], Avg_rewards_per_1000_episodes[:])
    plt.savefig('./Images/%sx%s maps: Average Rewards of Problem%s.jpg' % (n_dim, n_dim, problem_id))
    if train_or_not:
        print("Figure Saved in Folder 'Images'")
        plt.show()

    return (test_succeed_num, test_fail_num, shortest_path, longest_path,
            np.mean(avg_path), learning_rate,
            Avg_rewards_per_1000_episodes[-1], Avg_reward_per_step[-1])
import os
import sys
import time

import networkx as nx
import numpy as np

from uofgsocsai import LochLomondEnv
from helpers import *
from search import *

# Setup the parameters
problem_id = int(sys.argv[1])
reward_hole = 0.0
is_stochastic = False  # deterministic env, since this script builds a graph for search

# Generate the environment
env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                    reward_hole=reward_hole)
print(env.desc)

state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
maze_map = UndirectedGraph(state_space_actions)

# Initialise a graph
G = nx.Graph()
node_labels = dict()
node_colors = dict()
for n, p in state_space_locations.items():
    G.add_node(n)  # add nodes from locations
def main(p_id):
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(p_id)  # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0       # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False   # should be False for A-star (deterministic search) and True for the RL agent
    max_episodes = 2000
    max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    # Let's visualize the problem/env
    # print("grid= \n")
    # print(env.desc)
    # env.render()
    g = Grid(env.desc)

    # Create a representation of the state space for use with AIMA A-star
    # state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
    # print(state_goal_id)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)

    # Set up vars for the logfile
    f = open("out_AStar_{}.txt".format(problem_id), "w+")
    successes = 0
    failures = 0

    for e in range(max_episodes):  # iterate over episodes
        observation = env.reset()  # reset the state of the env to the starting state
        steps = aStar(g)
        for iter in range(max_iter_per_episode):
            # env.render()  # for debugging/development you may want to visualize the individual steps by uncommenting this line
            action = steps[iter]
            # print(action)
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # TODO: collect the rewards for plotting/reporting in a suitable manner
            # Check if we are done and monitor rewards etc...
            if done and reward == reward_hole:
                # env.render()
                # print("Failure")
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " " + str(reward) + " Fail\n")
                break
            if done and reward == +1.0:
                # env.render()
                # print("Success")
                successes += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " " + str(reward) + " Success\n")
                break

    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    f.close()
    successRate = successes / max_episodes * 100
    results = {"Success": successes, "Failures": failures,
               "Episodes": max_episodes, "SuccessRate": successRate}
    # print(results)
    return results
import os
import sys

from helpers import *

print("Working dir:" + os.getcwd())
print("Python version:" + sys.version)

# Setup the parameters for the specific problem (you can change all of these if you want to)
problem_id = 0         # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
reward_hole = 0.0      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent
max_episodes = 2000    # you can rerun the problem many times, generating many episodes... you can learn from them all!
max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                    reward_hole=reward_hole)

# Let's visualize the problem/env
print(env.desc)

# Create a representation of the state space for use with AIMA A-star
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
import os
import sys

from uofgsocsai import LochLomondEnv
from helpers import *

problem_id = 0
reward_hole = 0.0
is_stochastic = True
max_episodes = 2000
max_iter_per_episode = 2000
map_name_base = "8x8-base"

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                    map_name_base=map_name_base, reward_hole=reward_hole)

# Create a representation of the state space for use with AIMA A-star
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
print(state_space_locations)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

# Run a random/senseless agent
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
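# env2statespace() comes from the course helpers module and is used throughout
# these snippets without being shown. A rough reconstruction of the conversion
# it appears to perform; the "S_row_col" id format and zero-padding are inferred
# from the path-tracing code above and are assumptions.
import numpy as np

def env2statespace(env):
    """Convert env.desc into (locations, actions, initial_id, goal_id) for the
    AIMA graph-search classes; hole cells are excluded so search stays on safe tiles."""
    state_space_locations = {}
    for row in range(env.nrow):
        for col in range(env.ncol):
            if env.desc[row, col] != b'H':
                state_space_locations["S_%02d_%02d" % (row, col)] = (col, row)

    # Connect each safe cell to its safe 4-neighbours with unit cost
    state_space_actions = {}
    for state_id, (col, row) in state_space_locations.items():
        neighbours = {}
        for drow, dcol in ((-1, 0), (1, 0), (0, -1), (0, 1)):
            other = "S_%02d_%02d" % (row + drow, col + dcol)
            if other in state_space_locations:
                neighbours[other] = 1
        state_space_actions[state_id] = neighbours

    start = np.where(env.desc == b'S')
    goal = np.where(env.desc == b'G')
    state_initial_id = "S_%02d_%02d" % (start[0][0], start[1][0])
    state_goal_id = "S_%02d_%02d" % (goal[0][0], goal[1][0])
    return state_space_locations, state_space_actions, state_initial_id, state_goal_id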
from uofgsocsai import LochLomondEnv  # load the class defining the custom OpenAI Gym problem

# Setup the parameters for the specific problem (you can change all of these if you want to)
problem_id = 0        # problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
reward_hole = 0.0     # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent
max_episodes = 2000   # you can rerun the problem many times, generating many episodes... you can learn from them all!
max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode
observation_list = list()
reward_list = list()

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                    reward_hole=reward_hole)

# Let's visualize the problem/env
print('env', env.desc)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
    for iter in range(max_iter_per_episode):
        env.render()  # for debugging/development you may want to visualize the individual steps by uncommenting this line
        # action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        random = runRandom()
        action = random.action()
import sys

import numpy as np

from constants import (REWARD_HOLE_RANDOM, REWARD_HOLE_SIMPLE, REWARD_HOLE_Q,
                       IS_STOCHASTIC_RANDOM, IS_STOCHASTIC_SIMPLE,
                       IS_STOCHASTIC_Q)
from q_agent import QLearningAgent, process_data_q
from random_agent import RandomAgent, process_data_random
from simple_agent import SimpleAgent, process_data_simple
from uofgsocsai import LochLomondEnv

# Reads the command line argument and stores it in PROBLEM_ID to specify the
# problem; if this hasn't been provided, just set a default of 0
if len(sys.argv) == 2:
    PROBLEM_ID = int(sys.argv[1])
else:
    PROBLEM_ID = 0

env_random = LochLomondEnv(problem_id=PROBLEM_ID,
                           is_stochastic=IS_STOCHASTIC_RANDOM,
                           reward_hole=REWARD_HOLE_RANDOM)
env_simple = LochLomondEnv(problem_id=PROBLEM_ID,
                           is_stochastic=IS_STOCHASTIC_SIMPLE,
                           reward_hole=REWARD_HOLE_SIMPLE)
env_qlearn = LochLomondEnv(problem_id=PROBLEM_ID,
                           is_stochastic=IS_STOCHASTIC_Q,
                           reward_hole=REWARD_HOLE_Q)

# Compute the flattened (row*8 + col) indices of the start, goal and hole cells
start_index = np.where(env_qlearn.desc == b'S')
row, col = start_index[0][0], start_index[1][0]
start = row * 8 + col
end_index = np.where(env_qlearn.desc == b'G')
row, col = end_index[0][0], end_index[1][0]
goal = row * 8 + col
holes = np.where(env_qlearn.desc == b'H')  # search desc, not the env object itself
terminals = []
for i in range(len(holes[0])):
    terminals.append(holes[0][i] * 8 + holes[1][i])
try:
    temp_id = int(sys.argv[1])
except IndexError as identifier:
    print("There is no input number so the problem id is set to default 0.")
    temp_id = 0

# Setup the parameters for the specific problem (you can change all of these if you want to)
# problem_id in [0:7] generates 8 different problems on which you can train/fine-tune your agent
problem_id = temp_id
# should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
reward_hole = 0.0
# should be False for A-star (deterministic search) and True for the RL agent
is_stochastic = True

# Load the environment and Q-table structure
env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                    reward_hole=reward_hole)

# How many episodes to learn from
max_episodes = 10000
# You decide how many iterations/actions can be executed per episode
max_iter_per_episode = 2000

# Random agent
def random_agent(env, problem_id, max_episodes):
    output_file = f'out_random_{problem_id}.pkl'
    n_states = env.observation_space.n
    random_agent_dict = {}  # a dict to save random actions
    # Assign one random action to each state
    for state in range(n_states):
        random_agent_dict[state] = env.action_space.sample()