def __init__(self):
    super(GraphicDisplay, self).__init__()
    self.title('Value Iteration')
    self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
    self.texts = []
    self.arrows = []
    self.env = Env()
    self.agent = ValueIteration(self.env)
    self._build_env()
    self.iteration_count = 0
    self.improvement_count = 0
    self.is_moving = 0
def test_5x5_maze_value_iteration(self):
    env = MazeEnvSpecial5x5()
    alg = ValueIteration(env)
    alg.train()
    done_cnt = 0
    current_state = env.reset()
    while True:
        action = alg.predict(current_state)
        current_state, reward, done, _ = env.step(action)
        if done:
            break
        done_cnt += 1
    self.assertEqual(done_cnt, 15)
    self.assertEqual(reward, 1)
def clear(self):
    if self.is_moving == 0:
        self.iteration_count = 0
        self.improvement_count = 0
        for i in self.texts:
            self.canvas.delete(i)
        for i in self.arrows:
            self.canvas.delete(i)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(
            50, 50, image=self.rectangle_image)
        self.agent = ValueIteration(self.env)
def train(gamma, epsilon, n_samples, n_steps, n_epochs, learning_rate):
    env = gym.make('EasyFrozenLakeEnv-v0')
    value_iteration = ValueIteration(env.nS, env.nA, env.P)
    print('Env States: %i' % (env.nS))

    # Preparing an expert: calculate the optimal policy ...
    V, policy = value_iteration(gamma, epsilon)
    # ... and use pi_opt to sample trajectories.
    trajectories = sample_trajectories(env, policy, n_steps, n_samples)
    # Expert feature = average visitation across the sampled trajectories
    # (1 if a state was visited, 0 if not).
    experts_feature = compute_experts_feature(env.nS, trajectories)
    print(experts_feature[:, ])

    # Training
    feature_matrix = np.eye(env.nS)
    reward_function = Reward(env.nS)
    svf = StateVisitationFrequency(env.nS, env.nA, env.P)
    for i in range(n_epochs):
        # Re-solve for the optimal policy under the current reward estimate.
        V, policy = value_iteration(gamma, epsilon, reward_function)
        P = svf(policy, trajectories)
        grad = experts_feature - feature_matrix.T.dot(P)
        reward_function.update(learning_rate * grad)
    return reward_function(feature_matrix)
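# A hedged usage sketch for train() above: the hyperparameter values below are
# assumptions chosen for illustration (loosely guided by the "100 samples of
# 10 steps" behavior described in the comments), not values from the project.
if __name__ == '__main__':
    learned_reward = train(gamma=0.95, epsilon=1e-4, n_samples=100,
                           n_steps=10, n_epochs=200, learning_rate=0.01)
    print(learned_reward)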
def test_3x3_maze_value_iteration(self):
    env = MazeEnvSample3x3()
    alg = ValueIteration(env, max_iter=90)
    alg.train()
    expected_values = np.array([[2.048, 2.56, 3.2],
                                [2.56, 3.2, 4],
                                [3.2, 4, 5]])
    # Expected values solve the Bellman equation x = 1 + 0.8 * x,
    # giving V[2, 2] = 5, etc.
    assert_array_almost_equal(alg.values, expected_values)
    done_cnt = 0
    current_state = env.reset()
    while True:
        action = alg.predict(current_state)
        current_state, reward, done, _ = env.step(action)
        if done:
            break
        done_cnt += 1
    self.assertEqual(done_cnt, 3)
    self.assertEqual(reward, 1)
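# A minimal sketch of the arithmetic behind the comment above, assuming a goal
# reward of 1 and a discount factor of 0.8: the fixed point of x = 1 + 0.8 * x
# is x = 1 / (1 - 0.8) = 5, and each step away from the goal scales the value
# by another factor of 0.8, reproducing the expected_values grid.
goal_value = 1 / (1 - 0.8)                         # 5.0
chain = [goal_value * 0.8 ** k for k in range(5)]  # [5.0, 4.0, 3.2, 2.56, 2.048]
print(goal_value, chain)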
def human_model_goal(self, policy_index, theta, final_value_param):
    valiter = ValueIteration(self.grid_size)
    final_value = theta * final_value_param
    value, q_value, optimal_policies = valiter.value_iteration(
        final_value, self.discount)
    exp_q_vals = np.zeros(len(valiter.policies))
    for i in range(len(valiter.policies)):
        exp_q_vals[i] = np.exp(
            self.beta * q_value[self.robot_state[0], self.robot_state[1], i])
    sum_exp = 0
    for i in range(len(exp_q_vals)):
        if not np.isnan(exp_q_vals[i]):
            sum_exp += exp_q_vals[i]
    exp_q_vals /= sum_exp
    return exp_q_vals[policy_index]
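# The loops above compute a Boltzmann (softmax-over-Q) action distribution while
# excluding NaN entries from the normalizer. A minimal vectorized sketch of the
# same idea; q_values_at_state and beta are stand-ins for the Q-values at the
# robot's cell and self.beta, and are not names from the original code.
import numpy as np

def boltzmann_distribution(q_values_at_state, beta):
    exp_q = np.exp(beta * np.asarray(q_values_at_state, dtype=float))
    normalizer = np.nansum(exp_q)  # NaN entries contribute nothing, as in the loop above
    return exp_q / normalizer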
def main(args):
    # Resolve path to the world map definition.
    if not args.world:
        world_map_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'world_map.txt')
    else:
        world_map_path = args.world
    print("Reading world from %s" % world_map_path)
    if not os.path.exists(world_map_path):
        raise IOError(
            "World map definition not found at its expected path: %s" % world_map_path)

    world = World(world_map_path)
    visualizer = Visualizer(world)

    # Value Iteration
    value_iteration = ValueIteration(world, one_step_cost_v1,
                                     discount_factor=args.gamma, eps=10e-10)
    value_iteration.execute()
    optimal_policy = value_iteration.extract_policy()
    fig_vi = plt.figure()
    visualizer.draw(fig_vi, optimal_policy, value_iteration.value_fn,
                    "Value Iteration (gamma = %.2f)" % args.gamma)

    # Policy Iteration
    policy_iteration = PolicyIteration(world, one_step_cost_v1,
                                       discount_factor=args.gamma)
    value_fn = policy_iteration.execute()
    fig_pi = plt.figure()
    visualizer.draw(fig_pi, policy_iteration.policy, value_fn,
                    "Policy Iteration (gamma = %.2f)" % args.gamma)
    plt.show()
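# A hedged sketch of the argument parser main() above appears to expect; the
# flag names and defaults are assumptions inferred from the attributes accessed
# (args.world, args.gamma), not the project's actual command-line interface.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Grid-world value/policy iteration demo")
    parser.add_argument('--world', default=None,
                        help="Path to a world map definition "
                             "(defaults to world_map.txt next to the script)")
    parser.add_argument('--gamma', type=float, default=0.9,
                        help="Discount factor used by both solvers")
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())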
def execute_value_iteration_test(test, epsilon, output='console'):
    """
    Description
    -----------
    Run the value iteration for the given test.

    Parameters
    ----------
    test : Test
        A Test() instance with the information needed to run the value iteration.
    epsilon : float
        Floating point number used for the value iteration's stopping decision.
    output : str
        Tells the function where its output is expected ('console' or 'file').

    Returns
    -------
    ValueIteration
        If no recognizable output is given, the ValueIteration instance that was
        used is returned.
    """
    value_iteration = ValueIteration(test, epsilon=epsilon)
    value_iteration.run()
    if output in ['console', 'file']:
        output_processing(output, test, value_iteration, 'ValueIteration')
    return value_iteration
def main(algorithm, track, x_start, y_start, discount, learning_rate,
         threshold, max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry. Runs the selected algorithm on the selected track, at the
    given starting coordinates, with the given parameters.

    :param algorithm: String
    :param track: String (path to the track file)
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

    initial_state = (x_start, y_start, 0, 0)
    initial_action = (0, 0)
    agent = Car(initial_action, epsilon)
    environment = RaceTrack(rows, cols, layout, initial_state,
                            reset_on_crash=reset_on_crash)

    if algorithm == 'value_iteration':
        value_iterator = ValueIteration(discount, threshold, max_iterations,
                                        environment, agent)
        value_iterator.run()
        path = value_iterator.extract_policy(initial_state)
        value_iterator.plot_max_diffs()
    elif algorithm == 'q_learning':
        q_learner = QLearning(discount, learning_rate, threshold,
                              max_iterations, environment, agent)
        path = q_learner.run()
        q_learner.plot_avg_cost()
    elif algorithm == 'sarsa':
        sarsa = Sarsa(discount, learning_rate, threshold, max_iterations,
                      environment, agent)
        path = sarsa.run()
        sarsa.plot_avg_cost()
    else:
        print("No algorithm selected")
        return None

    draw_track(path, layout)
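# A possible call into main() above for the value-iteration branch; the track
# file name and the numeric settings are illustrative assumptions only.
main(algorithm='value_iteration', track='L-track.txt', x_start=1, y_start=1,
     discount=0.9, learning_rate=0.2, threshold=0.001, max_iterations=50,
     epsilon=0.1, reset_on_crash=False)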
def train(gamma, epsilon, n_samples, n_steps, n_epochs, learning_rate):
    env = gym.make('EasyFrozenLakeEnv-v0')
    value_iteration = ValueIteration(env.nS, env.nA, env.P)

    # Preparing an expert
    V, policy = value_iteration(gamma, epsilon)
    trajectories = sample_trajectories(env, policy, n_steps, n_samples)
    experts_feature = compute_experts_feature(env.nS, trajectories)

    # Training
    feature_matrix = np.eye(env.nS)
    reward_function = Reward(env.nS)
    svf = StateVisitationFrequency(env.nS, env.nA, env.P)
    for i in range(n_epochs):
        V, policy = value_iteration(gamma, epsilon, reward_function)
        P = svf(policy, trajectories)
        grad = experts_feature - feature_matrix.T.dot(P)
        reward_function.update(learning_rate * grad)
    return reward_function(feature_matrix)
def main():
    aiType = 3
    worldSize = 6
    game = Game(aiType, worldSize)
    agent = Agent()
    pc = None
    policy = None
    if aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 3:
        policy = qLearningAgent()
        pc = PolicyConfiguration(inpRewards=[0, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="QLValues.p",
                                 inpTrainingLimit=1000)
    elif aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="AQLWeights.json",
                                 inpTrainingLimit=500)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()
    policy.config = pc
    agent.policy = policy
    game.agent = agent
    game.mainLoop()
def __init__(self, can, direction, inpAIType):
    self.flag = 1
    self.can = can
    self.direction = direction
    self.aiType = inpAIType
    self.agent = Agent()
    pc = None
    policy = None
    # inpRewards = [food reward, hazard reward, living reward,
    #               good location reward, bad location reward]
    # Good and bad location rewards are only used for Q-learning; they were an
    # attempt to encourage graph searching, are not really used, and can give
    # wonky results.
    # inpDiscounts = [gamma discount, alpha discount, epsilon explore chance]
    # inpStochastic = [forward action [forward chance, left chance, right chance],
    #                  left action    [forward chance, left chance, right chance],
    #                  right action   [forward chance, left chance, right chance]]
    # inpFile: file for weights or Q-values.
    if self.aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 3:
        policy = qLearningAgent()
        # Risk aversion (rarely going off the best path) seems to work best.
        # This one seemed to work:
        # pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
        #                          inpDiscounts=[0.9, .2, .1],
        #                          inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, 0],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None,
                                 inpTrainingLimit=20000)
    elif self.aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None,
                                 inpTrainingLimit=5000)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()
    policy.config = pc
    self.agent.policy = policy
class GraphicDisplay(tk.Tk):
    def __init__(self):
        super(GraphicDisplay, self).__init__()
        self.title('Value Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.util = Util()
        self.agent = ValueIteration(self.util)
        self._build_env()

    def _build_env(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=HEIGHT * UNIT,
                                width=WIDTH * UNIT)
        # Buttons
        iteration_button = tk.Button(self, text="Calculate",
                                     command=self.calculate_value)
        iteration_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
                                  window=iteration_button)
        policy_button = tk.Button(self, text="Print Policy",
                                  command=self.print_optimal_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
                                  window=policy_button)
        policy_button = tk.Button(self, text="Move",
                                  command=self.move_by_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
                                  window=policy_button)
        policy_button = tk.Button(self, text="Clear", command=self.clear)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
                                  window=policy_button)

        # Create grid lines
        for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # Load images
        self.up_image = ImageTk.PhotoImage(
            Image.open("../resources/up.png").resize((13, 13)))
        self.right_image = ImageTk.PhotoImage(
            Image.open("../resources/right.png").resize((13, 13)))
        self.left_image = ImageTk.PhotoImage(
            Image.open("../resources/left.png").resize((13, 13)))
        self.down_image = ImageTk.PhotoImage(
            Image.open("../resources/down.png").resize((13, 13)))
        self.rectangle_image = ImageTk.PhotoImage(
            Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS))
        self.triangle_image = ImageTk.PhotoImage(
            Image.open("../resources/triangle.png").resize((65, 65)))
        self.circle_image = ImageTk.PhotoImage(
            Image.open("../resources/circle.png").resize((65, 65)))

        # Add images to the canvas
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        self.hell1 = self.canvas.create_image(250, 150, image=self.triangle_image)
        self.hell2 = self.canvas.create_image(150, 250, image=self.triangle_image)
        self.circle = self.canvas.create_image(250, 250, image=self.circle_image)

        # Add reward text
        self.text_reward(2, 2, "R : 1.0")
        self.text_reward(1, 2, "R : -1.0")
        self.text_reward(2, 1, "R : -1.0")

        # Pack everything
        self.canvas.pack()

    def clear(self):
        for i in self.texts:
            self.canvas.delete(i)
        for i in self.arrows:
            self.canvas.delete(i)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        self.agent = ValueIteration(self.util)

    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        # Return observation
        return self.canvas.coords(self.rectangle)

    def text_value(self, row, col, contents, font='Helvetica', size=12,
                   style='normal', anchor="nw"):
        origin_x, origin_y = 85, 70
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.texts.append(self.canvas.create_text(x, y, fill="black",
                                                         text=contents, font=font,
                                                         anchor=anchor))

    def text_reward(self, row, col, contents, font='Helvetica', size=12,
                    style='normal', anchor="nw"):
        origin_x, origin_y = 5, 5
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)

    def step(self, action):
        s = self.canvas.coords(self.rectangle)
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (HEIGHT - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (WIDTH - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        self.canvas.move(self.rectangle, base_action[0], base_action[1])  # move agent
        s_ = self.canvas.coords(self.rectangle)  # next state

        # Reward function
        if s_ == self.canvas.coords(self.circle):
            reward = 1
            done = True
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
        else:
            reward = 0
            done = False

        return s_, reward, done

    def rectangle_move(self, action):
        base_action = np.array([0, 0])
        self.render()
        if action[0] == 1:  # down
            base_action[1] += UNIT
        elif action[0] == -1:  # up
            base_action[1] -= UNIT
        elif action[1] == 1:  # right
            base_action[0] += UNIT
        elif action[1] == -1:  # left
            base_action[0] -= UNIT
        self.canvas.move(self.rectangle, base_action[0], base_action[1])  # move agent

    def rectangle_location(self):
        temp = self.canvas.coords(self.rectangle)
        x = (temp[0] / 100) - 0.5
        y = (temp[1] / 100) - 0.5
        return int(y), int(x)

    def move_by_policy(self):
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]]
        while len(self.agent.get_action(agent_state, False)) != 0:
            agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]]
            self.after(100, self.rectangle_move(self.agent.get_action(agent_state, True)))

    def draw_one_arrow(self, col, row, action):
        if action[0] == 1:  # down
            origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.down_image))
        elif action[0] == -1:  # up
            origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.up_image))
        elif action[1] == 1:  # right
            origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.right_image))
        elif action[1] == -1:  # left
            origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y,
                                                        image=self.left_image))

    def draw_from_values(self, state, action_list):
        i = state[0]
        j = state[1]
        for action in action_list:
            self.draw_one_arrow(i, j, action)

    def print_values(self, values):
        for i in range(WIDTH):
            for j in range(HEIGHT):
                self.text_value(i, j, values[i][j])

    def render(self):
        time.sleep(0.1)
        self.canvas.tag_raise(self.rectangle)
        self.update()

    def calculate_value(self):
        for i in self.texts:
            self.canvas.delete(i)
        self.agent.iteration()
        print(self.agent.get_value_table())
        self.print_values(self.agent.get_value_table())

    def print_optimal_policy(self):
        for i in self.arrows:
            self.canvas.delete(i)
        for state in self.util.get_all_states():
            action = self.agent.get_action(state, False)
            self.draw_from_values(state, action)
def main(argv):
    # Get command line arguments
    try:
        opts, args = getopt.getopt(argv, "pi:")
    except getopt.GetoptError:
        print("main.py [-p] [-i=n_iter]")
        sys.exit(1)

    # Plot switch
    plot = False
    # Default number of iterations
    n_iter = 100000

    # Parse command line arguments
    for opt, arg in opts:
        if opt == "-h":  # help
            print("main.py [-p] [-i=n_iter]")
            sys.exit()
        elif opt == "-p":  # plot
            plot = True
        elif opt == "-i":  # number of iterations
            n_iter = arg

    # Construct the grid
    grid = construct_grid()

    # Find a solution with value iteration
    print("Performing value iteration...")
    vi = ValueIteration(grid)
    vi = vi.solve()
    print("----------")
    print("The value for each cell is:")
    print("(x,y): value")
    for cell in grid:
        print(cell.name_x() + "," + cell.name_y() + ": " +
              str(vi[0][cell.get_name()]))
    print("----------")
    print("The policy found by value iteration is:")
    print("(x,y): action")
    for cell in grid:
        print(cell.name_x() + "," + cell.name_y() + ": " +
              str(vi[1][cell.get_name()]))

    # Find a solution with Q-learning
    print("\n")
    print("Performing Q-learning...")
    ql = QLearning(grid, N_iter=int(n_iter))
    ql = ql.solve()
    ql_states = ql[0]
    ql_Q = ql[1]
    print("----------")
    print("The policy found by Q-learning is:")
    print("(x,y): action")
    actions = ["north", "east", "south", "west"]
    for cell in grid:
        ind = ql_states.index(cell.get_name())
        action = actions[np.argmax(ql_Q[ind, ])]
        print(cell.name_x() + "," + cell.name_y() + ": " + str(action))

    if plot:
        # Convergence graph for Q-learning:
        # Q-value of the best action for the START state
        fig, ax = plt.subplots()
        ax.plot(ql[2][0:ql[3]], label="Q-values for best action in START")
        ax.plot((0, ql[3]), (vi[0]["11"], vi[0]["11"]), label="Values of START")
        plt.xlabel("Iterations")
        plt.show()
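# Hypothetical invocation, inferred from the getopt spec "pi:" above:
#   python main.py -p -i 50000
# -p enables the convergence plot and -i overrides the default iteration count;
# the value 50000 is an arbitrary example. Calling main() directly with the same
# arguments is equivalent:
main(["-p", "-i", "50000"])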
import numpy as np

from value_iteration import ValueIteration

grid_size = [5, 5]
final_value = np.zeros((grid_size[0], grid_size[1]))
final_value[0][0] = 1
final_value[2][1] = -1
discount = 0.9

valiter = ValueIteration(grid_size)
value, q_value, optimal_policies = valiter.value_iteration(
    final_value, discount)
print(value)
print(optimal_policies)
soft_Q_policy[i][1] = 0
# Left edge
if (i % X) == 0:
    soft_Q_policy[i][4] = soft_Q_policy[i][4] + soft_Q_policy[i][2]
    soft_Q_policy[i][2] = 0
# Right edge
if (i % X) == X - 1:
    soft_Q_policy[i][4] = soft_Q_policy[i][4] + soft_Q_policy[i][3]
    soft_Q_policy[i][3] = 0

# Save the estimated reward
np.savetxt("R_X5Y5.csv", est_reward.reshape((X, Y)), delimiter=", ")
# print(est_reward)
env_est = gridworld.GridWorld(grid_shape, est_reward)
est_agent = ValueIteration(env_est, gamma)

# Compute state values; this also handles stochastic policies.
V_est = est_agent.get_pi_value(soft_Q_policy)
print(V_est)
# np.savetxt("V_Pro_1.csv", V_est.reshape((5, 5)), delimiter=", ")

gap_sum_dist = []
for q in range(len(traj)):
    pi_check = traj[q]  # pi_check here is a trajectory, not a policy
    # Compute the gap
    q_gap_one_list = Q_seikika_gap(pi_check, X, Y, V_est, soft_Q_policy)
    q_gap_sum_list = Q_gap_sum_list(q_gap_one_list)
    # print(q_gap_sum_list[-1])
    gap_sum_dist.append(q_gap_sum_list[-1])
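# The boundary handling above folds the probability of an infeasible action into
# another action slot (index 4). A minimal generic sketch of that normalization;
# reading indices 2/3/4 as left/right/stay is an assumption based on the
# snippet's comments, and fold_edge_actions is not a name from the original code.
import numpy as np

def fold_edge_actions(policy, X, Y, left_idx=2, right_idx=3, stay_idx=4):
    policy = np.array(policy, dtype=float)
    for i in range(X * Y):
        if i % X == 0:        # left edge: moving left is infeasible
            policy[i, stay_idx] += policy[i, left_idx]
            policy[i, left_idx] = 0.0
        if i % X == X - 1:    # right edge: moving right is infeasible
            policy[i, stay_idx] += policy[i, right_idx]
            policy[i, right_idx] = 0.0
    return policy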
height = 8
width = 8
goal_co_ord = (3, 2)
tile_size = (200, 200)

surface, tiles = make_pygame(height, width, goal_co_ord, tile_size)

grid = GridWorld(height, width, goal_co_ord=goal_co_ord)
state_values = [0] * len(grid.states)
update_tiles(state_values, tiles)

random_policy = {action: 0.25 for action in grid.actions}
solvers = {
    'dynamic-programming': DynamicProgramming(random_policy, grid),
    'value-iteration': ValueIteration(grid)
}
solver = solvers[args.solver]

running = True
done = False
# while not done:
while running:
    running = check_pygame()
    pygame.display.update()
    state_values, done = solver.forward(state_values)
    update_tiles(state_values, tiles)
parser.add_argument('-r', '--renderEvery', type=int, default=0,
                    help="Render every nth episode. 0 to disable.")
args = parser.parse_args()

# Initialize the environment
env = Environment(args.environment, args.numEpisodesPerEval, args.renderEvery)

if args.algorithm in ('SARSA', 'Qlearning', 'MonteCarlo', 'MinVar'):
    from value_iteration import ValueIteration
    policy = policies.DiscreteQfunction(env, args.hiddenLayers)
    algo = ValueIteration(policy,
                          gamma=args.gamma,
                          learnrate=args.learningRate,
                          estimator=args.algorithm)
else:
    # Policy-based methods
    # Initialize the policy
    if env.actionType == 'discrete':
        policy = policies.DiscretePolicy(env, args.hiddenLayers)
    elif env.actionType == 'continuous':
        policy = policies.GaussianPolicy(env, args.hiddenLayers, args.explorationNoise)
    else:
        raise Exception("Unreachable.")
    # Select a training algorithm.
    if args.algorithm == 'Reinforce' or args.algorithm == 'PG':
        from reinforce import Reinforce
        algo = Reinforce(policy,
################################################################################
actions = [
    (-1, -1), (0, -1), (1, -1),
    (-1,  0), (0,  0), (1,  0),
    (-1,  1), (0,  1), (1,  1),
]
vl_opts = [0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1]

# tiny test track
################################################################################
track = dl.load_tinytrack()
simulator = TrackSimulator(track=track,
                           min_velocity=min(vl_opts),
                           max_velocity=max(vl_opts),
                           crash_restart=False)
learner = ValueIteration(env=simulator, vl_opts=vl_opts, actions=actions,
                         gamma=1.0, epsilon=0.001)
learner = Q_SARSA_Learner(env=simulator, vl_opts=vl_opts, actions=actions,
                          alpha=0.25, gamma=0.9)
learner = Q_SARSA_Learner(env=simulator, vl_opts=vl_opts, actions=actions,
                          alpha=0.25, gamma=0.9, sarsa=True)
simulator.pretty_print()
trial_helper(simulator, learner, 50, 10, 'tinytrack', policy=None)
trial_helper(simulator, learner, 100000, 10, 'tinytrack-q', policy=None)
trial_helper(simulator, learner, 100000, 10, 'tinytrack-sarsa', policy=None)

# l-track
################################################################################
track = dl.load_l()