def search_for_one_solution(problem_id, map_name, plot_or_not):
    reward_hole = 0.0
    is_stochastic = False
    if map_name == '4x4-base':
        n_dim = 4
    else:
        n_dim = 8

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)
    env.reset()
    # Create a dict representation of the state space
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    # -------------- SOLUTION -------------- #
    maze_map = UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations
    maze_problem = GraphProblem(state_initial_id, state_goal_id, maze_map)
    iterations, _, node = my_astar_search_graph(problem=maze_problem, h=None)

    # ------------- Trace the solution ----------------- #
    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    while cnode.state != state_initial_id:
        cnode = cnode.parent
        solution_path.append(cnode)

    # Parse the grid coordinates out of state ids such as 'S_03_02'
    solution = []
    solution_x = []
    solution_y = []
    for s in str(solution_path).split('_', -1):
        for s_s in str(s).split('>', -1):
            if s_s.isdigit():
                solution.append(s_s)
    for i in range(int(len(solution) / 2)):
        solution_y.append(int(solution[i * 2]))
        solution_x.append(int(solution[i * 2 + 1]))

    print("Steps:", len(solution_path) - 1)
    print("Goal state:" + str(solution_path[0]))
    print("Final Solution:", solution_path[::-1])
    print("----------------------------------------")
    env.close()

    # Plot the path from start to goal
    plt.cla()
    plt.plot(solution_x[::-1], solution_y[::-1])
    plt.scatter(solution_x[::-1], solution_y[::-1], s=120)
    plt.xlim(0, n_dim - 1)
    plt.ylim(n_dim - 1, 0)
    plt.grid(True)
    plt.title("Simple Agent Solution for Problem%s" % problem_id)
    plt.savefig('./Images/%sx%s maps: Simple Agent Solution for Problem%s.jpg' % (n_dim, n_dim, problem_id))
    print("Figure Saved in Folder 'Images'")
    if plot_or_not:
        plt.show()
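# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original submission): one way to
# drive search_for_one_solution over all eight problems on both map
# sizes. Assumes an 'Images/' folder exists in the working directory,
# since the function saves its figures there.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    for map_name in ('4x4-base', '8x8-base'):
        for pid in range(8):
            # plot_or_not=False saves the figure without blocking on plt.show()
            search_for_one_solution(pid, map_name, plot_or_not=False)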
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = False
    episodes = 100
    mapBase = mapID
    stats = {}

    # start from a known seed
    np.random.seed(12)

    # set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=reward_hole)
    state_space_locations, state_space_actions, state_initial_id, \
        state_goal_id = env2statespace(env)

    # Insert the solution here to find and output the solution using A-star
    # define the states and actions in a table
    maze_map = search.UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations
    maze_problem = search.GraphProblem(state_initial_id, state_goal_id, maze_map)

    for episode in range(episodes):  # iterate over episodes
        env.reset()  # reset the state of the env to the starting state
        iterations, node = my_astar_search_graph(problem=maze_problem, h=None)

        # -- Trace the solution -- #
        solution_path = [node]
        cnode = node.parent
        solution_path.append(cnode)
        while cnode.state != state_initial_id:
            cnode = cnode.parent
            solution_path.append(cnode)

        print("----------------------------------------")
        print("Identified goal state:" + str(solution_path[0]))
        print("Solution trace:" + str(solution_path))
        print("Iterations:" + str(iterations))
        print("----------------------------------------")

        # log stats
        stats["solutiontrace"] = str(solution_path)
        stats["numberofiterations"] = str(iterations)

    return stats
def train_for_one_problem(problem_id, map_name):
    # problem_id in [0:7] selects one of 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)

    total_test_num = 60000
    restart_times = 0
    succeed_times = 0
    shortest_path = 100

    for i in range(total_test_num):
        restart_times += 1
        env.reset()  # reset at the start of every episode
        done = False
        n_actions_for_episode = 0
        while not done:
            n_actions_for_episode += 1
            # take a random action from the available actions
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("\rProblem:%s Episodes #%s / %s" % (problem_id, restart_times, total_test_num), end='')
                if reward == 1.0:
                    if shortest_path > n_actions_for_episode:
                        shortest_path = n_actions_for_episode
                    succeed_times += 1

    print("\nSucceed Times:", succeed_times)
    print("Total Times:", total_test_num)
    print("Shortest path:", shortest_path)
    one_map_succeed_percentage = succeed_times / total_test_num
    env.close()
    return one_map_succeed_percentage
def main(problemID, mapID):
    problem = int(problemID)
    rewardHole = -0.02
    stochastic = True
    trainingEpisodes = 35000
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    np.random.seed(12)
    successes = 0  # records the number of successes
    totalReward = 0
    stats = {"episodes": {}}

    # set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=rewardHole)

    qTable = generate_q(env, trainingEpisodes, iterPerEpisode)
    print("___________________________________")
    print("Training Finished")
    print("Attempting to find solution...")

    for episode in range(episodes):
        # initial params
        state = env.reset()
        done = False
        reward = 0
        for step in range(iterPerEpisode):
            action = np.argmax(qTable[state, :])  # take the best action
            nextState, reward, done, info = env.step(action)
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1
                totalReward += reward
                break
            state = nextState

    successRate = (successes / episodes) * 100
    print("___________________________________")
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))

    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    stats["qtable"] = qTable
    return stats, qTable
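# ----------------------------------------------------------------------
# generate_q is called above but not defined in this snippet. Below is a
# minimal sketch of what it plausibly does, assuming a standard tabular
# Q-learning loop with an epsilon-greedy policy and the old gym API
# (reset() returns a state, step() returns a 4-tuple). The
# hyper-parameter values are illustrative, not the author's settings.
# ----------------------------------------------------------------------
import numpy as np

def generate_q(env, episodes, iters_per_episode,
               alpha=0.1, gamma=0.95, epsilon=1.0, epsilon_min=0.01):
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    decay = (epsilon - epsilon_min) / episodes
    for _ in range(episodes):
        state = env.reset()
        for _ in range(iters_per_episode):
            # Epsilon-greedy: explore with probability epsilon
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            next_state, reward, done, _ = env.step(action)
            # One-step Q-learning update towards the bootstrapped target
            q_table[state, action] += alpha * (
                reward + gamma * np.max(q_table[next_state, :]) - q_table[state, action])
            state = next_state
            if done:
                break
        # Anneal epsilon linearly towards epsilon_min
        epsilon = max(epsilon_min, epsilon - decay)
    return q_table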
def main(problem_id, map_name_base):
    # Random agent derived from lochlomond_demo.py provided by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow.
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    reward_hole = 0.0
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    max_episodes = 10000
    max_iter_per_episode = 1000

    # generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        map_name_base=map_name_base, reward_hole=reward_hole)
    env.action_space.sample()
    print(env.desc)
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    np.random.seed(12)
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for e in range(max_episodes):
        observation = env.reset()
        for iter in range(max_iter_per_episode):
            action = env.action_space.sample()  # the agent goes here
            observation, reward, done, info = env.step(action)
            # collect useful stats for comparison and plotting
            stats.episode_rewards[e] += reward
            stats.episode_lengths[e] = iter
            if done and reward == reward_hole:
                print("We have reached a hole :-( [we can't move so stop trying; just give up... and perhaps restart]")
                break
            if done and reward == +1.0:
                # env.render()
                print("We have reached the goal :-) [stop trying to move; we can't. That's ok, we have achieved the goal... perhaps try again?]")
                break

    return stats
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()
        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([str(episode), str(iteration), str(reward),
                            str(done), str(info), str(action)]))
            if done and reward == reward_hole:
                env.render()
                print("Hole Found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break
            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break

    return results
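# ----------------------------------------------------------------------
# print_headers is referenced above but not defined in this snippet. A
# one-line sketch matching the comma-separated columns printed inside
# the loop; the exact original header text is an assumption.
# ----------------------------------------------------------------------
def print_headers():
    print("episode,iteration,reward,done,info,action")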
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False, reward_hole=reward_hole)
    statespace_locs, statespace_actions, statespace_init, statespace_goal = env2statespace(env)
    maze_problem = GraphProblem(statespace_init, statespace_goal,
                                UndirectedGraph(statespace_actions))
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        print('-' * 50)
        env.reset()
        # Best-first graph search: the frontier is ordered by the memoized
        # heuristic h alone (note: A* proper would order by g + h).
        func = memoize(maze_problem.h, 'func')
        frontier = PriorityQueue('min', func)
        node = Node(maze_problem.initial)
        frontier.append(node)
        seen = set()

        for iter in range(max_iters_per):
            node = frontier.pop()
            print(",".join([str(episode), str(iter), node.state]))
            if maze_problem.goal_test(node.state):
                print('done')
                results.append({'iters': iter, 'success': True})
                break
            seen.add(node.state)
            for possible in node.expand(maze_problem):
                if possible.state not in seen and possible not in frontier:
                    frontier.append(possible)
                elif possible in frontier:
                    # keep the cheaper of the two copies of the node
                    if func(possible) < frontier[possible]:
                        del frontier[possible]
                        frontier.append(possible)

    return results
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=-1.0):
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)
    epsilon = 0.9
    lr_rate = 0.81
    gamma = 0.96
    epsilon_reduce = 1 / max_episodes
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    np.random.seed(12)
    results = []

    for episode in range(max_episodes):
        state = env.reset()
        print('-' * 50)
        print_headers()
        for iter in range(max_iters_per):
            action = choose_action(state, epsilon, Q, env)
            state2, reward, done, info = env.step(action)
            print(",".join([str(episode), str(iter), str(reward),
                            str(done), str(info), str(action)]))
            learn(state, state2, reward, action, Q, gamma, lr_rate)
            state = state2
            if done and reward == reward_hole:
                print('Found a hole in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': False})
                break
            if done:
                print('Found frisbee in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': True})
                break
        epsilon -= epsilon_reduce

    return results
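# ----------------------------------------------------------------------
# choose_action and learn are referenced above but not defined in this
# snippet. Module-level sketches follow, mirroring the nested versions
# that appear in the Q-learning main() elsewhere in this collection;
# treat the exact signatures as assumptions.
# ----------------------------------------------------------------------
import numpy as np

def choose_action(state, epsilon, Q, env):
    # Epsilon-greedy selection over the Q-table row for this state
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state, :])

def learn(state, state2, reward, action, Q, gamma, lr_rate):
    # One-step tabular Q-learning update towards the bootstrapped target
    predict = Q[state, action]
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] = Q[state, action] + lr_rate * (target - predict)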
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = True
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    successes = 0  # records the number of successes
    stats = {"episodes": {}}
    totalReward = 0  # reward accumulated across episodes

    # set up the environment
    env = LochLomondEnv(problem_id=problem, is_stochastic=stochastic,
                        map_name_base=mapBase, reward_hole=reward_hole)
    np.random.seed(12)

    for episode in range(episodes):  # iterate over episodes
        print("___________________________________")
        print("EPISODE: " + str(episode))
        observation = env.reset()  # reset the state of the env to the starting state
        reward = 0
        for step in range(iterPerEpisode):
            # your agent goes here (the current agent takes random actions)
            action = env.action_space.sample()
            # observe what happens when you take the action
            observation, reward, done, info = env.step(action)
            # Check if we are done and monitor rewards etc.
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1
                totalReward += reward
                break

    successRate = (successes / episodes) * 100
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))

    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    return stats
def main(problem_id, map_name_base):
    # RL agent referenced from the lab 8 and 9 notebooks provided by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow.
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    # Hole penalty is set based on analysis to ensure that the reward is maximized
    reward_hole = -0.05
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    np.random.seed(12)

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name_base)
    states = env.observation_space.n
    actions = env.action_space.n
    Q = np.zeros((states, actions))

    max_episodes = 10000
    max_iter_per_episode = 1000
    alpha = 0.1    # learning rate
    gamma = 0.999  # discount rate
    epsilon = 1

    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for episode in range(max_episodes):
        state = env.reset()
        for step in range(max_iter_per_episode):
            # take the best action according to the Q-table if a random value
            # is greater than epsilon, otherwise take a random action
            random_value = random.uniform(0, 1)
            if random_value > epsilon:
                action = np.argmax(Q[state, :])  # agent goes here
            else:
                action = env.action_space.sample()
            new_state, reward, done, info = env.step(action)
            Q[state, action] = Q[state, action] + alpha * (
                reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
            stats.episode_rewards[episode] += reward
            stats.episode_lengths[episode] = step
            state = new_state
            if done:
                break
        # epsilon is set to a low value to make sure of the exploitation
        epsilon = 0.01

    print(Q)
    return stats
def main(p_id):
    # Set up the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(p_id)  # problem_id in [0:7] selects one of 8 different problems on which you can train/fine-tune your agent
    reward_hole = -1.0      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True    # should be False for A-star (deterministic search) and True for the RL agent

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    # Q-learning variables
    epsilon = 0.5        # degree of randomness; a lower rate gave better results in the long term
    max_episodes = 2000  # rerunning the problem many times generates many episodes... you can learn from them all!
    max_iter_per_episode = 500  # how many iterations/actions can be executed per episode
    lr_rate = 0.81
    gamma = 0.96
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    def choose_action(state):
        # epsilon-greedy action selection
        if np.random.uniform(0, 1) < epsilon:
            return env.action_space.sample()  # make a random move
        return np.argmax(Q[state, :])

    def learn(state, state2, reward, action):
        predict = Q[state, action]
        target = reward + gamma * np.max(Q[state2, :])
        Q[state, action] = Q[state, action] + lr_rate * (target - predict)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)

    # Set up vars for the logfile
    f = open("out_RL_{}.txt".format(problem_id), "w+")
    successes = 0
    failures = 0

    for e in range(max_episodes):  # iterate over episodes
        state = env.reset()  # reset the state of the env to the starting state
        for iter in range(max_iter_per_episode):
            # env.render()  # for debugging/development you may want to visualize the individual steps
            action = choose_action(state)
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            learn(state, observation, reward, action)
            state = observation

            # Check if we are done and monitor rewards etc.
            if done and reward == reward_hole:
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " "
                        + str(reward) + " " + str(done) + "\n")
                break
            if done and reward == +1.0:
                successes += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " "
                        + str(reward) + " " + str(done) + "\n")
                break

    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    f.close()

    successRate = successes / max_episodes * 100
    return {"Success": successes, "Failures": failures,
            "Episodes": max_episodes, "SuccessRate": successRate}
def run_reinforcement_agent(problem_id, map):
    reward_hole = -0.5
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False,
                        reward_hole=reward_hole, map_name_base=map)
    env.reset()

    action_space = env.action_space.n
    state_space = env.observation_space.n
    q_table = np.zeros((state_space, action_space))

    # parameter set up
    max_episodes = 10000
    iterations = 1000
    learning_rate = 0.1   # alpha
    discount_rate = 0.95  # gamma
    epsilon = 0.05        # exploration-exploitation setup

    rewards = []
    hole_episode_counter = []
    # number of times goal is reached out of max_episodes
    # (performance measures where reward is collected)
    goal_episodes = []
    # average number of iterations taken to reach goal per rewarded episode
    goal_iterations = []
    # number of episodes before goal is first reached
    first_goal = 0

    for episode in range(max_episodes):
        state = env.reset()
        # end learning phase at midpoint
        if episode == max_episodes / 2:
            learning_rate = 0.0
        rewards_current_episode = 0

        for step in range(iterations):
            # choose the highest q_value in table to choose action
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            # if q_table row is empty, argmax returns 0; fall back to a random choice
            if action == 0:
                action = env.action_space.sample()

            new_state, reward, done, info = env.step(action)

            # update q table
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) \
                + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
            state = new_state
            rewards_current_episode += reward

            if done:
                if rewards_current_episode != reward_hole:
                    # set first episode that goal is reached
                    if first_goal == 0:
                        first_goal = episode
                    goal_episodes.append(episode)
                    goal_iterations.append(step + 2)
                else:
                    hole_episode_counter.append(episode)
                break

        rewards.append(rewards_current_episode)

    rewards_per_100_eps = np.split(np.array(rewards), max_episodes // 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]

    return len(goal_episodes), len(hole_episode_counter), mean(goal_iterations), \
        mini(goal_iterations), maxi(goal_iterations), first_goal, rewards_per_100_eps
env = LochLomondEnv(problem_id=problem_id,
                    is_stochastic=False, reward_hole=reward_hole)

# Let's visualize the problem/env
print(env.desc)

# Create a representation of the state space for use with AIMA A-star
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
    for iter in range(max_iter_per_episode):
        # env.render()  # for debugging/development you may want to visualize the individual steps
        action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        observation, reward, done, info = env.step(action)  # observe what happens when you take the action

        # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner
        print("e,iter,reward,done =" + str(e) + " " + str(iter) + " "
              + str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc.
        if done and reward == reward_hole:
def simple_agent(problem_id):
    # since the A-star agent is fully informed, any negative reward for a hole
    # would not make a difference, hence we chose 0
    reward_hole = 0.0
    # generate 10,000 episodes in order to give the agent the chance to reach the goal multiple times
    max_episodes = 10000
    # since the A-star agent always wins, lower the limit of allowed iterations per episode to 100 (time constraint)
    max_iter_per_episode = 100
    actions = []
    results = []

    # set up the frozen lake Loch Lomond environment (deterministic, no uncertainty)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False, reward_hole=reward_hole)
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    # informed search problem
    undirected_graph = UndirectedGraph(state_space_actions)
    undirected_graph.locations = state_space_locations
    graph_problem = GraphProblem(state_initial_id, state_goal_id, undirected_graph)
    node = astar_search(problem=graph_problem, h=None)
    best_path = node.solution()

    print('Running Simple Agent for problem: {}'.format(problem_id))

    # translate the path of graph nodes into a sequence of env actions
    for i in range(len(best_path)):
        if i == 0:
            previous = undirected_graph.locations[state_initial_id]
        else:
            previous = undirected_graph.locations[best_path[i - 1]]
        current = undirected_graph.locations[best_path[i]]
        action = get_action_from_location(previous, current)
        actions.append(action)

    for e in range(max_episodes):  # iterate over total number of possible episodes
        observation = env.reset()  # reset the state of the environment to starting state S
        for iter in range(max_iter_per_episode):
            # select action from the solution
            action = actions[iter]
            # outcome of taking a certain action
            observation, reward, done, info = env.step(action)
            # Test condition to see if agent is done and associated rewards
            if done and reward == reward_hole:
                break
            if done and reward == +1.0:
                break
        results.append([e, iter + 1, int(reward)])

    # Save results to a CSV file
    np.savetxt('out_simple_{}.csv'.format(problem_id), np.array(results),
               header="episode,iterations,reward", delimiter=",", fmt='%s')

    columns = ['episode', 'iterations', 'reward']
    dataframe = pd.DataFrame(data=np.array(results),
                             index=np.array(results)[0:, 0], columns=columns)
    dataframe['cumulative_rewards'] = list(itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x), axis=1)

    # Plotting the results for all task environments ID 0 to 7
    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']
    title = 'Mean Reward vs Episodes'
    subtitle = 'Simple Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']
    add_plot(x, y, 'out_simple_{}_mean_reward.png'.format(problem_id), title, subtitle, labels)

    # Print involved performance measures over all 10,000 episodes
    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ', max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))
    print("\n")
    return dataframe
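# ----------------------------------------------------------------------
# get_action_from_location is used above but not defined in this
# snippet. A sketch under two stated assumptions: locations are (x, y)
# grid coordinates with y increasing downwards, and actions follow the
# standard FrozenLake encoding (0=left, 1=down, 2=right, 3=up). If the
# codebase uses a different convention, the mapping must be adjusted.
# ----------------------------------------------------------------------
def get_action_from_location(previous, current):
    dx = current[0] - previous[0]
    dy = current[1] - previous[1]
    if dx == -1:
        return 0  # left
    if dy == 1:
        return 1  # down
    if dx == 1:
        return 2  # right
    return 3      # up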
def rl_agent(problem_id):
    # select a small negative hole reward for the RL agent to create an incentive to learn
    reward_hole = -0.01
    # generate 10,000 episodes in order to give the agent the chance to reach the goal
    max_episodes = 10000
    # every episode should have 2,000 iterations (the agent can take 2,000 steps in the map)
    max_iter_per_episode = 2000
    results = []

    # set up the frozen lake Loch Lomond environment (uncertainty involved)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)
    all_act = list(range(env.action_space.n))
    q_agent = QLearningAgentUofG(terminals=get_terminals(env), all_act=all_act,
                                 alpha=lambda n: 0.8, gamma=0.8, Rplus=2, Ne=5)

    print('Running Q Learning Agent for problem: {}'.format(problem_id))
    print("(will take a while)")

    for e in range(max_episodes):  # iterate over episodes
        state = env.reset()  # reset the state of the environment to starting state S
        reward = 0
        done = False
        # over the total number of allowed iterations
        for iter in range(max_iter_per_episode):
            action = q_agent(state, reward, e + 1)  # current agent takes actions
            if action is not None:
                state, reward, done, info = env.step(action)
            # Test condition to see if agent is done and associated rewards
            if done:
                q_agent(state, reward, e + 1)
                break
        results.append([e, iter + 1, int(reward)])

    # Compute the policy
    policy = {}
    for state_action, value in list(q_agent.Q.items()):
        state, action = state_action
        policy[state] = argmax(q_agent.actions_in_state(state),
                               key=lambda a: q_agent.Q[state, a])
    print('Policy: ')
    print_table(to_arrows(policy, 8, 8))

    # Save results to a CSV file
    np.savetxt('out_rl_{}.csv'.format(problem_id), np.array(results),
               header="episode,iterations,reward", delimiter=",", fmt='%s')
    np.savetxt('out_rl_{}_policy.txt'.format(problem_id), to_arrows(policy, 8, 8),
               delimiter="\t", fmt='%s')

    # Add a plot over all 10,000 episodes
    columns = ['episode', 'iterations', 'reward']
    dataframe = pd.DataFrame(data=np.array(results),
                             index=np.array(results)[0:, 0], columns=columns)
    dataframe['cumulative_rewards'] = list(itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x), axis=1)

    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']
    title = 'Mean Reward vs Episodes'
    subtitle = 'RL-Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']
    add_plot(x, y, 'out_rl_{}.png'.format(problem_id), title, subtitle, labels)

    # Add a plot for the last 1,000 episodes to detect potential learning
    dataframe_ac = pd.DataFrame(data=np.array(results)[range(max_episodes - 1000, max_episodes), :],
                                columns=columns)
    dataframe_ac['episode'] = range(1000)
    dataframe_ac['cumulative_rewards'] = list(itertools.accumulate(dataframe_ac['reward'], operator.add))
    dataframe_ac['mean_rewards'] = dataframe_ac.apply(lambda x: mean_rewards(x), axis=1)

    x = range(1, len(dataframe_ac) + 1)
    y = dataframe_ac['mean_rewards']
    title = 'RL-Agent: Problem ID {}'.format(problem_id)
    subtitle = 'Last 1000 Episodes'
    labels = ['Last 1000 Episodes', 'Mean Reward']
    add_plot(x, y, 'out_rl_{}_converged.png'.format(problem_id), title, subtitle, labels)

    # Print involved performance measures over all 10,000 episodes
    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ', max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))

    # Print involved performance measures over the last 1,000 episodes
    print("\n\n")
    print('Stats for the last 1000 episodes....')
    print('Max iterations per episode: ', max(dataframe_ac['iterations']))
    print('Mean iterations per episode: ', dataframe_ac['iterations'].mean())
    print('Average success per episode: ', max(dataframe_ac['cumulative_rewards']) / 1000)
    print('Episodes won: ', max(dataframe_ac['cumulative_rewards']))
    return dataframe
class MyAbstractAIAgent():
    """
    Abstract agent that works as a base for all our agents.
    """

    def __init__(self, problem_id, map_name_base="8x8-base"):
        # map_name_base can also be "4x4-base"
        if not (0 <= problem_id <= 7):
            raise ValueError("Problem ID must be 0 <= problem_id <= 7")

        self.map_name_base = map_name_base
        self.env = LochLomondEnv(problem_id=problem_id,
                                 is_stochastic=self.is_stochastic(),
                                 reward_hole=self.reward_hole(),
                                 map_name_base=map_name_base)
        self.problem_id = problem_id
        self.reset()
        self.out = 'out/'
        self.policy = {}
        self._train = []
        self.graphs = {}

    def is_stochastic(self):
        raise NotImplementedError

    def reward_hole(self):
        raise NotImplementedError

    def reset(self):
        self.rewards = 0
        self.failures = 0
        self.eval = []
        self.timeouts = 0

    def solve(self, episodes=10000, iterations=1000, seed=None, gamma=0.95):
        print('Solving with {} Agent'.format(self.name().capitalize()))
        print('Problem: ', self.problem_id)
        print('Grid: ', self.map_name_base)
        print('Episodes that will run...: ', episodes)

        self.train(episodes=episodes, iterations=iterations)
        rewards = self.rewards
        timeouts = self.timeouts
        failures = self.failures

        for e in range(1, episodes + 1):  # iterate over episodes
            state = self.env.reset()
            self.set_episode_seed(e, seed)
            if e % 1000 == 0:
                print("Eval Episode", e)

            for i in range(1, iterations + 1):
                action = self.action(state)
                state, reward, done, info = self.env.step(action)
                if done:
                    if reward == 1.0:
                        rewards += int(reward)
                    else:
                        failures += 1
                    # break the cycle
                    break

            if not done:
                timeouts += 1

            self.eval.append([self.problem_id, e, i, to_human(action),
                              int(reward), rewards, rewards / e, failures, timeouts])

    def action(self, i):
        raise NotImplementedError

    def train(self, episodes, iterations):
        raise NotImplementedError

    def env(self):
        return self.env

    def set_episode_seed(self, episode, seed=None):
        # by default no seed for abstract agent
        return None

    def alias(self):
        return '{}out_{}_{}_{}'.format(self.out, self.name(),
                                       self.problem_id, self.env.ncol)

    def evaluate(self, episodes):
        self.env.reset()
        print("This is the environment: ")
        print(self.env.render())

        if len(self.policy) > 0:
            print("This is the final policy: ")
            print_table(policy_to_arrows(self.policy, self.env.ncol, self.env.ncol))

        print('Saving Evaluation Files...')
        self.write_eval_files()

        # Plotting mean rewards
        print('Saving Plots...')
        labels = ['Episodes', 'Mean Reward']
        title = 'Problem {}. Plot for {} Agent'.format(self.problem_id,
                                                       self.name().capitalize())

        if len(self._train) > 0:
            subtitle = 'Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes), labels, title, subtitle, 'mr')
            subtitle = 'First 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(999), labels, title, subtitle, 'mr_first_1000')
            subtitle = 'Last 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes - 1000, episodes - 1), labels, title,
                            subtitle, 'mr_last_1000')

        if len(self.eval) > 0:
            subtitle = 'Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes), labels, title, subtitle, 'mr')
            subtitle = 'First 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(999), labels, title, subtitle, 'mr_first_1000')
            subtitle = 'Last 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes - 1000, episodes - 1), labels,
                                 title, subtitle, 'mr_last_1000')

        if len(self.graphs) > 0:
            subtitle = 'Utilities plot'
            self.plot_utilities(['Episodes', 'U'], title, subtitle)

    def write_eval_files(self):
        def data_for_file(name):
            if name == 'policy':
                return policy_to_list(self.policy)
            if name == 'u':
                return u_to_list(self.U)
            if name == 'eval':
                return self.eval
            if name == 'q':
                return self.Q
            if name == 'train':
                return self._train
            if name == 'graphs':
                return self.graphs
            return []

        for file in self.files():
            if file == 'graphs':
                filename = '{}_{}.json'.format(self.alias(), file)
                with open(filename, 'w') as outfile:
                    json.dump(data_for_file(file), outfile)
            else:
                filename = '{}_{}.csv'.format(self.alias(), file)
                data = [self.header(file)] + data_for_file(file)
                np.savetxt(filename, data, delimiter=",", fmt='%s')
            print('\tFile saved: {}'.format(filename))

    def header(self, key):
        headers = {
            'eval': ['id', 'episode', 'iteration', 'action', 'reward',
                     'rewards', 'mean_rewards', 'failures', 'timeouts'],
            'policy': ['x', 'y', 'action'],
            'u': ['x', 'y', 'u'],
            'train': ['id', 'episode', 'iteration', 'reward', 'rewards',
                      'mean_rewards', 'failures', 'timeouts'],
            'graphs': ['x', 'y', 'value'],
            'q': ['position', 'x', 'y', 'action', 'action_friendly', 'value']
        }
        if key in headers:
            return headers[key]

    def plot_train(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from training phase """
        train = np.array(self._train)
        x = pd.to_numeric(train[:, 1])
        y = pd.to_numeric(train[:, 5])
        filename = '{}_train_{}.png'.format(self.alias(), suffix)
        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_evaluation(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from evaluation phase """
        evaluation = np.array(self.eval)
        x = pd.to_numeric(evaluation[:, 1])
        y = pd.to_numeric(evaluation[:, 6])
        filename = '{}_eval_{}.png'.format(self.alias(), suffix)
        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_utilities(self, labels, title, subtitle):
        for state, value in self.graphs.items():
            x, y = zip(*value)
            plt.plot(x, y, label=str(state))
        plt.ylim([-0.1, 1.05])
        plt.legend(loc='lower right')
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        filename = '{}_utilities.png'.format(self.alias())
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()
        print('\tPlot saved: {}'.format(filename))

    def plot(self, x, y, rows, labels, filename, title, subtitle):
        plt.plot(x[rows], y[rows])
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()
        print('\tPlot saved: {}'.format(filename))
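# ----------------------------------------------------------------------
# A minimal concrete subclass, sketched here to illustrate the contract
# MyAbstractAIAgent imposes on its children (is_stochastic, reward_hole,
# name, files, action, train). The random behaviour and the zero hole
# reward are illustrative choices, not one of the author's actual agents.
# ----------------------------------------------------------------------
class RandomAgentSketch(MyAbstractAIAgent):
    def is_stochastic(self):
        return True

    def reward_hole(self):
        return 0.0

    def name(self):
        return 'random'

    def files(self):
        # only the evaluation log applies to an agent with no training phase
        return ['eval']

    def action(self, state):
        # ignore the state entirely and sample uniformly at random
        return self.env.action_space.sample()

    def train(self, episodes, iterations):
        # a random agent has nothing to learn
        pass

# Hypothetical usage: agent = RandomAgentSketch(problem_id=0); agent.solve(episodes=1000)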
def main(problem_id, map_name_base):
    # Simple agent referenced and adapted from the lab 4 notebook by tutor
    # Prof. Bjorn Jensen for the AI course (2019-20).
    if not (0 <= problem_id <= 7):
        print("Problem ID should be between 0 and 7")
        return
    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
        return

    reward_hole = -1.0
    is_stochastic = False
    max_episodes = 10000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        map_name_base=map_name_base, reward_hole=reward_hole)
    env.action_space.sample()
    print(env.desc)

    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    frozen_lake_map = UndirectedGraph(state_space_actions)
    frozen_lake_map.locations = state_space_locations
    frozen_lake_problem = GraphProblem(state_initial_id, state_goal_id, frozen_lake_map)

    all_node_colors = []
    iterations, all_node_colors, node = my_astar_search_graph(problem=frozen_lake_problem, h=None)

    # Trace the solution back from the goal to the start state
    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    while cnode.state != "S_00_00":
        cnode = cnode.parent
        if cnode is None:
            break
        solution_path.append(cnode)
    steps = solution_path[::-1]

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    observation = env.reset()  # reset the state of the env to the starting state

    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.ones(max_episodes))

    for e in range(max_episodes):  # iterate over episodes
        observation = env.reset()  # reset the state of the env to the starting state
        for i in range(len(steps) - 1):
            # follow the A-star solution step by step
            action = get_action_from_states(steps[i], steps[i + 1])
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # update stats
            stats.episode_rewards[e] = reward
            stats.episode_lengths[e] = i
            # Check if we are done and monitor rewards etc.
            if done:
                print("We have reached the goal :-) [stop trying to move; we can't. That's ok, we have achieved the goal... perhaps try again?]")
                break

    return stats
def run_senseless_agent(problem_id, map):
    reward_hole = 0.0
    max_episodes = 10000
    max_iter_per_episode = 1000
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        map_name_base=map, reward_hole=reward_hole)
    env.render()
    env.action_space.sample()
    np.random.seed(12)

    # variables for performance evaluation:
    # number of times goal is reached out of max_episodes
    # (performance measures where reward is collected)
    goal_episodes = []
    # number of episodes agent falls in hole
    hole_episodes = []
    # average number of iterations taken to reach goal per rewarded episode
    goal_iterations = []
    rewards = []
    # number of episodes before goal is first reached
    first_goal = 0

    for e in range(max_episodes):
        rewards_current_episode = 0
        state = env.reset()
        for iter in range(max_iter_per_episode):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            rewards_current_episode += reward
            if done and reward == reward_hole:
                hole_episodes.append(e)
                break
            if done and reward == +1.0:
                # env.render()
                goal_episodes.append(e)
                goal_iterations.append(iter + 2)
                # sets first goal to episode
                if first_goal == 0:
                    first_goal = e
                break
        rewards.append(rewards_current_episode)

    # calculating steps to goal
    goal_iteration_average = mean(goal_iterations)
    goal_iteration_bestcase = mini(goal_iterations)
    goal_iteration_worstcase = maxi(goal_iterations)

    # splits collected rewards into per 100 episodes
    rewards_per_100_eps = np.split(np.array(rewards), max_episodes // 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]

    return len(goal_episodes), len(hole_episodes), goal_iteration_average, \
        goal_iteration_bestcase, goal_iteration_worstcase, first_goal, rewards_per_100_eps
def train_for_one_model(problem_id, map_name, train_or_not):
    # problem_id in [0:7] selects one of 8 different problems on which you can train/fine-tune your agent
    reward_hole = -0.01   # should be less than or equal to 0.0 (fine-tuned for this RL agent)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

    if map_name == '4x4-base':
        n_dim = 4
        num_episodes = 100000
    else:
        n_dim = 8
        num_episodes = 300000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole, map_name_base=map_name)

    restart_times = 0
    n_actions_for_episode = 0
    x_axis_rewardsvsepisodes = []
    max_steps_per_episode = 10000

    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    learning_rate = 0.3
    discount = 0.5
    if problem_id == 0 and n_dim == 8:
        learning_rate = 0.2
        discount = 0.8
    if problem_id == 0 and n_dim == 4:
        learning_rate = 0.4
        discount = 0.7
    epsilon = 0.3
    epsilon_min = 0.005
    epsilon_decay_rate = 0.99995

    shortest_path = 10000
    longest_path = 0
    avg_path = []

    if train_or_not:
        # -------------- Training Process ----------------- #
        for episode in range(num_episodes):
            restart_times += 1
            state = env.reset()
            done = False
            path = [state]
            if restart_times % 5000 == 0:
                print("\ntraining in progress: #", restart_times)

            for step in range(max_steps_per_episode):
                n_actions_for_episode += 1
                # Exploration-exploitation trade-off
                exploration_exploitation_rate = random.uniform(0, 1)
                if exploration_exploitation_rate < epsilon or q_table[state, :].all() == 0:
                    # Exploration: take a random action from the available actions
                    action = env.action_space.sample()
                else:
                    # Exploitation: select the action with the max value (plus tie-breaking noise)
                    action = np.argmax(q_table[state, :] + np.random.randn(1, 4))

                new_state, reward, done, info = env.step(action)
                path.append(new_state)
                # Update Q-table
                q_table[state, action] = q_table[state, action] + learning_rate * (
                    reward + discount * np.max(q_table[new_state, :]) - q_table[state, action])
                state = new_state

                if done and reward == 1:
                    print("\rEpisode #%s: Finish it within %d steps" % (restart_times, len(path)), end='')
                    break
                if done and reward == -0.01:
                    break

            # epsilon decay (once per episode)
            if epsilon >= epsilon_min:
                epsilon *= epsilon_decay_rate

        # --------------- SAVE THE MODEL -------------------- #
        np.save('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id), q_table)

    # -------- FINAL TEST ----------- #
    if train_or_not:
        print("\nRunning Test for 50000 times. Please wait...")

    q_table = np.load('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id))  # load the trained Q-table
    state = env.reset()
    test_total_num = 50000
    test_fail_num = 0
    test_succeed_num = 0
    Avg_rewards_per_1000_episodes = []
    Avg_reward_per_step = []
    Avg_reward_per_episode = []

    for k in range(test_total_num):
        s = env.reset()
        j = 0
        rewards_temp = 0
        while j < 1000:
            j += 1
            action = np.argmax(q_table[s, :])
            new_state, r, done, b = env.step(action)
            rewards_temp += r
            s = new_state
            if done and r == -0.01:
                test_fail_num += 1
                break
            if done and r == 1.0:
                avg_path.append(j)
                if shortest_path > j:
                    shortest_path = j
                if longest_path < j:
                    longest_path = j
                test_succeed_num += 1.0
                break
            if j == 1000:
                test_fail_num += 1
        Avg_reward_per_episode.append(rewards_temp)
        Avg_reward_per_step.append(rewards_temp / j)
        if k % 1000 == 0 and k > 0:
            Avg_rewards_per_1000_episodes.append(np.sum(Avg_reward_per_episode) / k)
            x_axis_rewardsvsepisodes.append(k)

    # -------------- OUTPUT FINAL RESULT ----------------- #
    if train_or_not:
        print("\n-------------------------------------------")
        print("Average rewards per 1000 episodes:", Avg_rewards_per_1000_episodes[-1])
        print("Average rewards per steps:", Avg_reward_per_step[-1])
        print("Success times:", test_succeed_num)
        print("Failure times:", test_fail_num)
        print("Success rate:", float(test_succeed_num / test_total_num))
        print("Success vs Failure rate:", float(test_succeed_num / test_fail_num))
        print("Steps number (Best case):", shortest_path)
        print("Steps number (Worst case):", longest_path)
        print("Steps number (On average):", np.mean(avg_path))
        print("Learning rate:", learning_rate)

    plt.cla()
    plt.plot(x_axis_rewardsvsepisodes[:], Avg_rewards_per_1000_episodes[:])
    plt.savefig('./Images/%sx%s maps: Average Rewards of Problem%s.jpg' % (n_dim, n_dim, problem_id))
    if train_or_not:
        print("Figure Saved in Folder 'Images'")
        plt.show()

    return (test_succeed_num, test_fail_num, shortest_path, longest_path,
            np.mean(avg_path), learning_rate,
            Avg_rewards_per_1000_episodes[-1], Avg_reward_per_step[-1])
print("The dict has been saved into file: " + output_file) with open(output_file, 'wb') as f: pickle.dump(random_agent_dict, f) print(random_agent_dict) # return a dict for use return random_agent_dict # a simple run random agent if __name__ == '__main__': agent_dict = random_agent(env, problem_id, max_episodes) reward_random_accumulate = 0 reward_random_total = 0 for episode in range(max_episodes): state = env.reset() step = 0 reward_random = 0 for step in range(max_iter_per_episode): action = env.action_space.sample() state, reward, done, info = env.step(action) reward_random_accumulate += reward if (step == max_iter_per_episode - 1): print("step over") if (done and reward == reward_hole): #print("hole :-( ") break if (done and reward == +1.0): reward_random_total = reward + reward_random_total
def main(p_id):
    # Set up the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(p_id)  # problem_id in [0:7] selects one of 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0       # should be less than or equal to 0.0
    is_stochastic = False   # should be False for A-star (deterministic search) and True for the RL agent
    max_episodes = 2000
    max_iter_per_episode = 500  # how many iterations/actions can be executed per episode

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    # Visualize the problem/env
    # print(env.desc)
    g = Grid(env.desc)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)

    # Set up vars for the logfile
    f = open("out_AStar_{}.txt".format(problem_id), "w+")
    successes = 0
    failures = 0

    for e in range(max_episodes):  # iterate over episodes
        observation = env.reset()  # reset the state of the env to the starting state
        steps = aStar(g)
        for iter in range(max_iter_per_episode):
            # env.render()  # for debugging/development you may want to visualize the individual steps
            action = steps[iter]
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # Check if we are done and monitor rewards etc.
            if done and reward == reward_hole:
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " "
                        + str(reward) + " Fail\n")
                break
            if done and reward == +1.0:
                successes += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) + " "
                        + str(reward) + " Success\n")
                break

    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    f.close()

    successRate = successes / max_episodes * 100
    return {"Success": successes, "Failures": failures,
            "Episodes": max_episodes, "SuccessRate": successRate}
def random_agent(problem_id):
    # should be less than or equal to 0.0; select 0 because reaching the goal state is hard enough for a random agent
    reward_hole = 0.0
    # generate 10,000 episodes in order to give the agent the chance to reach the goal multiple times
    max_episodes = 10000
    # every episode should have 2,000 iterations (the agent can take 2,000 steps in the map)
    max_iter_per_episode = 2000

    # set up the frozen lake Loch Lomond environment (uncertainty involved)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)
    results = []
    print('Running Random Agent for problem: ', problem_id)

    for e in range(max_episodes):  # iterate over total number of possible episodes
        # reset the random generator to a known state (probability needs to adapt)
        np.random.seed(e)
        observation = env.reset()  # reset the state of the environment to starting state S
        for iter in range(max_iter_per_episode):
            # current agent takes random actions
            action = env.action_space.sample()
            # outcome of taking a certain action
            observation, reward, done, info = env.step(action)
            # Test condition to see if agent is done and associated rewards
            if done and reward == reward_hole:
                break
            if done and reward == +1.0:
                break
        results.append([e, iter + 1, int(reward)])

    columns = ['episode', 'iterations', 'reward']
    # Save results to a CSV file
    np.savetxt('out_random_{}.csv'.format(problem_id), np.array(results),
               header="episode,iterations,reward", delimiter=",", fmt='%s')
    dataframe = pd.DataFrame(data=np.array(results),
                             index=np.array(results)[0:, 0], columns=columns)
    dataframe['cumulative_rewards'] = list(itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x), axis=1)

    # Plotting the results for all task environments ID 0 to 7
    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']
    title = 'Mean Reward vs Episodes'
    subtitle = 'Random Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']
    dataframe = dataframe[['episode', 'iterations', 'cumulative_rewards', 'mean_rewards']]
    add_plot(x, y, 'out_random_{}_mean_reward.png'.format(problem_id), title, subtitle, labels)

    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ', max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))
    print("\n")
    return dataframe
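# ----------------------------------------------------------------------
# Hedged driver sketch (not part of the original submission): running
# the random, simple and RL agents over all eight problem IDs. Assumes
# random_agent, simple_agent and rl_agent live in the same module.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    for pid in range(8):
        random_agent(pid)
        simple_agent(pid)
        rl_agent(pid)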