def test_instantiation():
    """ Testing common QLearner initial arguments and support functions. """
    # Set-up:
    STATES = 10
    ACTIONS = 5
    rmatrix_sq = np.random.rand(STATES, STATES)
    rmatrix_rec = np.random.rand(STATES, ACTIONS)
    tmatrix = np.random.randint(0, STATES, size=(STATES, ACTIONS))
    # making sure tmatrix points to goal states; randint's upper bound is
    # exclusive, so use 2 to cover both goal states (0 and 1):
    tmatrix[:, ACTIONS - 1] = np.random.randint(0, 2, size=STATES)
    goal_l = (0, 1)
    goal_f = lambda x: x <= 1
    np.savetxt('test.dat', rmatrix_sq)
    global QLEARNER

    # Test 1: list goal
    temp = QLearner(rmatrix_sq, goal_l)
    assert np.array_equal(temp.rmatrix, rmatrix_sq), "R matrix not equal to arg."
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2) and not temp.goal(3), \
        'List goal not working.'
    QLEARNER = temp

    # Test 2: function goal
    temp = QLearner(rmatrix_sq, goal_f)
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2), \
        'Function goal not working.'
    QLEARNER = temp

    # Test 3: File I/O
    temp = QLearner('test.dat', goal_l)
    assert temp.qmatrix.shape == rmatrix_sq.shape, "Q & R matrix dimension mismatch."
    assert np.array_equal(temp.rmatrix, rmatrix_sq), "R matrix not equal to arg."
    QLEARNER = temp

    # Test 4: rectangular r matrix, no tmatrix (should be rejected)
    try:
        QLearner(rmatrix_rec, goal_l)
    except ValueError:
        pass
    else:
        raise AssertionError('Rectangular R matrix without tmatrix should fail.')

    # Test 5: rectangular r matrix, t matrix of same dimension
    temp = QLearner(rmatrix_rec, goal_f, tmatrix)
    assert temp.next_state(1, 2) == tmatrix[1, 2], 'Next state prediction incorrect.'
    QLEARNER = temp

    # Test 6: episodes
    covered = set(temp.episodes(coverage=1.0, mode='bfs'))
    assert covered == set(range(temp.num_states)), 'Full episode coverage failed.'

    # Finalize
    os.remove('test.dat')
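# The test above relies on module-level state; a minimal sketch of the setup
# it assumes (the import path is an assumption, matching the other snippets
# in this collection):
import os
import numpy as np
from qlearner import QLearner

QLEARNER = None  # populated by test_instantiation, reused by later tests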
def set_up_learner(self, learner, **kwargs):
    """ Attaches the appropriate learner to instance for testing. """
    if learner == FLearner:
        sflags = FlagGenerator(self.size, self.size)
        aflags = FlagGenerator(2, 2)
        self.learner = FLearner(rmatrix=self.rmatrix, goal=self.goals,
                                stateconverter=sflags, actionconverter=aflags,
                                tmatrix=self.tmatrix, seed=self.seed, **kwargs)
    elif learner == QLearner:
        self.learner = QLearner(rmatrix=self.rmatrix, goal=self.goals,
                                tmatrix=self.tmatrix, seed=self.seed, **kwargs)
    elif learner == SLearner:
        sflags = FlagGenerator(self.size, self.size)
        aflags = FlagGenerator(2, 2)
        sim = create_sim_env(self.size, self.random)

        def reward(svec, avec, nstate):
            action = aflags.encode(avec)
            state = sflags.encode((round(svec[0]), round(svec[1])))
            return self.rmatrix[state, action]

        def goal(svec):
            return self.coord2state((round(svec[0]), round(svec[1]))) in self.goals

        self.learner = SLearner(reward=reward, simulator=sim, goal=goal,
                                stateconverter=sflags, actionconverter=aflags,
                                seed=self.seed, **kwargs)
    elif learner is None:
        self.learner = None
    else:
        raise TypeError('Class: ' + learner.__name__ + ' is not supported. '
                        'Assign to .learner manually.')
def branin(discount, learning_rate, buckets_w, buckets_h, buckets_v):
    def run_game():
        # Make a new monkey object.
        swing = SwingyMonkey(visual=False,  # no video
                             sound=False,   # no audio
                             action_callback=learner_class.action_callback,
                             reward_callback=learner_class.reward_callback)
        # Loop until you hit something.
        while swing.game_loop():
            pass
        return swing

    # make a new learner with the given parameters
    learner_class = QLearner(learn_fn=lambda i: learning_rate,
                             discount_fn=lambda i: discount,
                             bucket_height=buckets_h,
                             bucket_width=buckets_w,
                             velocity_bucket=buckets_v)

    # train the learner
    for t in range(TRAIN_ITERS):
        run_game()

    # keep learning; take the average over the test iterations
    scores = []
    for t in range(TEST_ITERS):
        # Make a new monkey object.
        swing = run_game()
        scores.append(swing.score)

    avg_score = float(sum(scores)) / float(TEST_ITERS)
    median_score = np.median(scores)
    # which do we return?
    print("The median is %d and the mean is %f." % (median_score, avg_score))

    # our objective is to minimize the negative of the average score
    return -1 * avg_score
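# The objective above returns a negated score, shaped for a black-box
# minimizer (the name `branin` hints at a Spearmint-style setup). As a
# minimal sketch under that assumption, a coarse grid search can drive it
# directly; the parameter ranges below are illustrative, not tuned.
import itertools

grid = itertools.product([0.5, 0.9],    # discount
                         [0.01, 0.1],   # learning_rate
                         [50, 100],     # buckets_w
                         [50, 100],     # buckets_h
                         [5, 10])       # buckets_v
best = min(grid, key=lambda params: branin(*params))
print("Best (discount, lr, buckets_w, buckets_h, buckets_v): %s" % (best,))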
OUT proximity refers to outside of the quartile of the player
"""
NUM_STATES = 32 * (54 ** args.numTeammates)
# Shoot, Dribble, or Pass to one of N teammates
NUM_ACTIONS = 2 + args.numTeammates

hfo = HFOEnvironment()
hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

if args.inQTableDir:
    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=args.epsilon,
                         learning_rate=args.learningRate,
                         q_table_in=args.inQTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')
else:
    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=args.epsilon,
                         learning_rate=args.learningRate,
                         q_table_in=args.outQTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')

for episode in range(0, args.numEpisodes):
    status = IN_GAME
    action = None
# main
from MapBuilder import MapBuilder
from qlearner import QLearner
from universe import Universe
from Criterions import (get_cost_based_on_fuel, get_cost_based_on_time,
                        get_cost_based_on_mixture)

if __name__ == "__main__":
    universe = Universe(MapBuilder())
    qlearners = [
        QLearner(universe.get_initial_state(), get_cost_based_on_fuel,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(), get_cost_based_on_time,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(), get_cost_based_on_mixture,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state)
    ]

    num_of_epochs = 1000
    for epoch_num in range(num_of_epochs):
        for qlearner in qlearners:
            while qlearner._state != universe.get_terminal_state():
                qlearner.move()
            qlearner.reset(universe.get_initial_state())

    print("Energy:", qlearners[0]._Q, end='\n\n')
    print("Time:", qlearners[1]._Q, end='\n\n')
# L.Braun 2018
# Main program to solve a gridworld maze problem
# Uses qlearner.py, environ.py

from qlearner import QLearner
import pylab as plt

my_learner = QLearner()
my_learner.load_maze('/u/braun/tlab/QLearner/data/reward_4x4.npy',
                     '/u/braun/tlab/QLearner/data/meta_4x4.txt')

#print("testing data load\n\n")
#my_learner.display_Q()
#my_learner.display_R()

print("begin training...")
reward = my_learner.train(0.7)
my_learner.display_Q()
my_learner.display_R()

steps = my_learner.test(7)  # 7 foods in 4x4 maze
print("steps")
print(steps)
print("")

# 'normed' was removed from matplotlib; 'density' is the current equivalent
plt.hist(reward, 50, density=True, facecolor='g', alpha=0.75)
plt.xlabel('Episodes required to reach 200')
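# The excerpt ends without rendering the histogram; with a non-interactive
# backend an explicit call is needed. A minimal sketch:
plt.show()  # or plt.savefig('reward_hist.png') for headless runs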
### SETUP
num_learning_trials = 10000
num_simulation_trials = 1000
num_learning_epochs = 15

### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []

for e, epsilon in enumerate(epsilon_list):
    print("Epsilon: {0}".format(epsilon))
    qlearner = QLearner(mdp1, initial_state1, epsilon=epsilon, alpha=learning_rate)
    epoch_list.append(list(range(num_learning_epochs)))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()
        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq, action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[e].append(avg_reward)
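# A minimal plotting sketch for the curves collected above, assuming
# matplotlib is available; styling and labels are illustrative only.
import matplotlib.pyplot as plt

for e, epsilon in enumerate(epsilon_list):
    plt.plot(epoch_list[e], avg_reward_list[e],
             label="epsilon = {0}".format(epsilon))
plt.xlabel("Learning epoch")
plt.ylabel("Average simulation reward")
plt.legend()
plt.show()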
def __init__(self, config_or_model, load_model=False):
    self.config = None
    self.model_loaded = False

    # load a saved model
    if load_model:
        print("Loading model from: {}".format(config_or_model))
        load_path = Path(config_or_model)
        if (not load_path.exists()) or (not load_path.is_dir()):
            print("Error: directory doesn't exist")
        config_filename = load_path.joinpath("config.json")
        self.config = self.load_config(str(config_filename))
    else:
        self.config = self.load_config(config_or_model)

    # select game
    self.game_name = self.config["game"]
    self.game = None
    if self.game_name == "snake":
        self.game = game.Snake
    elif self.game_name == "box":
        self.game = game.Box
    else:
        print("Error: unknown game {}".format(self.game_name))

    self.nn_config = self.config["nn"]

    # parameters of experience memory
    self.memory_size = self.config["memory_size"]
    self.memory_alpha = self.config["memory_alpha"]
    self.memory_beta_start = self.config["memory_beta_start"]
    self.memory_beta_end = self.config["memory_beta_end"]
    self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
    self.memory_beta_step = ((self.memory_beta_end - self.memory_beta_start)
                             / self.memory_beta_num_steps)
    self.exp_memory_start_size = self.config["memory_start_size"]

    # game parameters: image size, board size, num_goals, ...
    self.width = self.config["width"]
    self.height = self.config["height"]
    self.image_scale_factor = self.config["image_scale_factor"]
    self.num_goals = self.config["num_goals"]
    self.img_width = self.width * self.image_scale_factor
    self.img_height = self.height * self.image_scale_factor
    self.num_img_channels = self.game.num_channels
    self.num_actions = self.game.num_actions

    # random policy parameters
    self.epsilon_start = self.config["epsilon_start"]
    self.epsilon_min = self.config["epsilon_min"]
    self.num_epsilon_steps = self.config["num_epsilon_steps"]
    self.epsilon_step = (self.epsilon_start - self.epsilon_min) / self.num_epsilon_steps

    # scale rewards; training might be more stable if q-values converge to the range [-1, 1]
    self.scale_reward_max = None
    if "scale_reward_max" in self.config:
        self.scale_reward_max = self.config["scale_reward_max"]
        self.game.max_reward *= self.scale_reward_max
        self.game.min_reward *= self.scale_reward_max
        self.game.empty_reward *= self.scale_reward_max
        print("Scaling rewards by {}".format(self.scale_reward_max))

    # frequency parameters for target-network updates, output, saving, tensorboard, evaluation
    self.max_steps = self.config["max_steps"]
    self.output_freq = self.config["output_freq"]
    self.update_freq = self.config["update_freq"]
    self.target_network_update_mode = self.config["target_network_update_mode"]
    self.target_network_update_tau = None
    self.target_network_update_freq = None
    if self.target_network_update_mode == "hard":
        self.target_network_update_freq = self.config["target_network_update_freq"]
    else:
        self.target_network_update_tau = self.config["target_network_update_tau"]
    self.eval_freq = self.config["eval_freq"]
    self.eval_steps = self.config["eval_steps"]
    self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
    self.tensorboard_log_path = self.config["tensorboard_log_path"]
    self.save_freq = self.config["save_freq"]
    self.save_path = self.config["save_path"]
    self.batch_size = self.config["batch_size"]

    # parameters that change while training; these need to be saved and loaded
    self.curr_step = 0
    self.epsilon = self.epsilon_start
    self.memory_beta = self.memory_beta_start
    self.best_average_score = 0

    # create experience memory
    self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                       self.img_height, self.num_img_channels,
                                       self.memory_alpha)

    # create QLearner object, loading a saved neural-network model if necessary
    self.qlearner = None
    if load_model:
        load_path = str(Path(config_or_model).joinpath("nn").joinpath("model"))
        self.qlearner = QLearner(
            self.nn_config, self.num_actions, self.img_width, self.img_height,
            self.num_img_channels, self.memory_size, load_model=load_path,
            target_network_update_tau=self.target_network_update_tau)
        self.curr_step = self.config["curr_step"]
        self.epsilon = self.config["epsilon"]
        self.memory_beta = self.config["memory_beta"]
        self.best_average_score = self.config["best_average_score"]
        print("Model loaded successfully")
        self.model_loaded = True
    else:
        self.qlearner = QLearner(
            self.nn_config, self.num_actions, self.img_width, self.img_height,
            self.num_img_channels, self.memory_size,
            target_network_update_tau=self.target_network_update_tau)

    if self.tensorboard_log_freq > 0:
        self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)
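# For reference, a sketch of the config keys read by __init__ above, written
# as a Python dict; every value is an illustrative placeholder, not a
# recommended setting, and "nn" holds whatever the network builder expects.
example_config = {
    "game": "snake",                       # or "box"
    "nn": {},
    "memory_size": 100000,
    "memory_alpha": 0.6,
    "memory_beta_start": 0.4,
    "memory_beta_end": 1.0,
    "memory_beta_num_steps": 500000,
    "memory_start_size": 10000,
    "width": 10,
    "height": 10,
    "image_scale_factor": 4,
    "num_goals": 1,
    "epsilon_start": 1.0,
    "epsilon_min": 0.1,
    "num_epsilon_steps": 500000,
    "scale_reward_max": 0.1,               # optional key
    "max_steps": 1000000,
    "output_freq": 1000,
    "update_freq": 4,
    "target_network_update_mode": "hard",  # "hard" reads ..._freq, else ..._tau
    "target_network_update_freq": 10000,
    "eval_freq": 50000,
    "eval_steps": 5000,
    "tensorboard_log_freq": 100,
    "tensorboard_log_path": "logs/",
    "save_freq": 50000,
    "save_path": "models/",
    "batch_size": 32,
}
# Saved-model configs additionally carry the mutable training state:
# "curr_step", "epsilon", "memory_beta", "best_average_score".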
Pass opening angle, SMALL or LARGE or INVALID -- 3
Goal scoring angle, SMALL or LARGE or INVALID -- 3
OUT proximity refers to outside of the quartile of the player
"""
NUM_STATES = 32 * (54 ** args.numTeammates)
# Shoot, Pass to one of N teammates, or Dribble
NUM_ACTIONS = 2 + args.numTeammates

hfo = HFOEnvironment()
hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

q_learner = QLearner(NUM_STATES, NUM_ACTIONS, epsilon=0.0,
                     q_table_in=args.qTableDir + str(args.playerIndex) + '.npy',
                     q_table_out=args.qTableDir + str(args.playerIndex) + '.npy')

for episode in range(0, args.numEpisodes):
    status = IN_GAME
    action = None
    state = None
    history = []
    timestep = 0
    while status == IN_GAME:
        timestep += 1
        features = hfo.getState()
        # Print off features in a readable manner
        # feature_printer(features, args.numTeammates, args.numOpponents)
        if int(features[5]) != 1:  # cast moved inside the comparison; the original wrapped the whole test
# Initialise result data structures
rewards_per_run = dict()
runtime_per_run = []

# For each run, train agent until environment is solved, or episode budget
# runs out:
for run in range(num_runs):
    # Initialise result helpers
    end_episode = num_episodes  # indicates in which episode the environment was solved
    start = timer()
    rewards = [0.0] * num_episodes  # reward per episode

    # Initialise environment and agent
    wrapper = CartPoleWrapperDiscrete()
    agent = QLearner(wrapper=wrapper, seed=run)

    style.use('fivethirtyeight')
    fig = plt.figure()
    plt.axis([0, args.episodes, 0, 300])
    plt.xlabel('Episodes')
    plt.ylabel('AVG Reward last 50 episodes')

    # For each episode, train the agent on the environment and record the
    # reward of each episode
    for episode in range(num_episodes):
        rewards[episode] = agent.train()
        if (episode % 50) == 0 and episode != 0:
            avg_last = float(sum(rewards[episode - 50:episode])) / 50
            plt.scatter(episode, avg_last)
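    # The result structures initialised above are never filled in this
    # excerpt; a minimal sketch of how a run might be recorded once its
    # episode loop finishes (the solving criterion itself lives elsewhere):
    rewards_per_run[run] = rewards
    runtime_per_run.append(timer() - start)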
def frozen_ql_experiment(env_name, new_lake):
    np.random.seed(0)
    min_r = -100.0
    max_r = 100.0
    problem = MyWrapper.TransformReward(
        gym.make(env_name, desc=new_lake),
        lambda r: np.clip(r * 100.0, min_r, max_r))
    problem.seed(0)
    problem.reset()

    folder = "q_learning/"
    env = MyWrapper.Monitor(problem, folder, force=True)

    # env.observation_space.n is the number of states
    num_of_states = env.observation_space.n
    num_of_action = env.action_space.n

    alpha = [0.5, 0.9]  # learning rates to compare
    gamma = 0.99        # discount factor
    episodes = 10000
    rar = [0.1, 0.9]    # epsilon (random action rate) values to compare
    radr = 0.99         # randomness decay rate

    def run_experiment(qlearner, max_steps=10000000):
        """Train the given learner for `episodes` episodes; return per-episode
        rewards, final iteration counts, and cumulative computation times."""
        rewards_list = []     # reward per episode
        iterations_list = []  # number of iterations per episode
        time_list = []        # cumulative computation time (milliseconds)
        cumulative_ms = 0
        for episode in range(episodes):
            start_time = time.time()
            qlearner.s = env.reset()  # current state
            done = False
            total_reward = 0
            for i in range(max_steps):
                if done:
                    break
                # either a random action or the best action for the current state
                action = qlearner.choose_best_action(
                    qlearner.num_actions, qlearner.rar, qlearner.s,
                    qlearner.q_table)
                next_state, reward, done, info = env.step(action)
                qlearner.a = action
                total_reward += reward
                # update the q-table entry for (qlearner.s, action) using the
                # observed next state and reward; does not touch self.s/self.a
                temp_action = qlearner.query(next_state, reward, False)
                qlearner.s = next_state
            cumulative_ms += (time.time() - start_time) * 1000
            time_list.append(cumulative_ms)
            rewards_list.append(total_reward)
            iterations_list.append(i)  # iteration at which the episode ended
        return rewards_list, iterations_list, time_list

    def chunk_list(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def plot_rewards(rewards_list, title, filename):
        episode_size = int(episodes / 50)
        segments = list(chunk_list(rewards_list, episode_size))
        average_reward = [sum(segment) / len(segment) for segment in segments]
        plt.title(title)
        plt.plot(range(0, len(rewards_list), episode_size), average_reward)
        plt.xlabel("Iterations")
        plt.ylabel("Average Reward")
        plt.savefig("./plots/frozen_lake_experiment/" + filename)
        plt.close()
        plt.figure()

    def plot_times(time_list, title, filename):
        plt.title(title)
        plt.plot(range(0, episodes, 1), time_list)
        plt.xlabel("episodes")
        plt.ylabel("computation time (milliseconds)")
        plt.savefig("./plots/frozen_lake_experiment/" + filename)
        plt.close()
        plt.figure()

    # Plots 1 & 2: alpha = 0.5, epsilon = 0.1
    qlearner = QLearner(num_actions=num_of_action, num_states=num_of_states,
                        alpha=alpha[0], gamma=gamma, rar=rar[0], radr=radr)
    rewards_list, iterations_list, time_list = run_experiment(qlearner)
    plot_rewards(rewards_list,
                 "Average Rewards vs Iterations (learning rate: 0.5, Epsilon: 0.1)",
                 "frozen_qlearner_reward_vs_iterations.png")
    plot_times(time_list,
               "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.1)",
               "computation_time_vs_episodes.png")

    # Plots 3 & 4: alpha = 0.9, epsilon = 0.1
    qlearner = QLearner(num_actions=num_of_action, num_states=num_of_states,
                        alpha=alpha[1], gamma=gamma, rar=rar[0], radr=radr)
    rewards_list, iterations_list, time_list = run_experiment(qlearner)
    plot_rewards(rewards_list,
                 "Reward vs Iteration (Learning Rate: 0.9, Epsilon: 0.1)",
                 "frozen_qlearner_rewards_vs_iter_alpha0.9.png")
    plot_times(time_list,
               "Computation time vs episodes (learning rate: 0.9, Epsilon: 0.1)",
               "computation_time_vs_episodes_alpha0.9.png")

    # Plots 5 & 6: alpha = 0.5, epsilon = 0.9
    qlearner = QLearner(num_actions=num_of_action, num_states=num_of_states,
                        alpha=alpha[0], gamma=gamma, rar=rar[1], radr=radr)
    rewards_list, iterations_list, time_list = run_experiment(qlearner,
                                                              max_steps=10000)
    plot_rewards(rewards_list,
                 "Reward vs Iteration (Learning Rate: 0.5, Epsilon: 0.9)",
                 "frozen_qlearner_rewards_vs_iter_epsilon0.9.png")
    plot_times(time_list,
               "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.9)",
               "computation_time_vs_episodes_epsilon0.9.png")

    # close the environment only after all three experiments have used it
    env.close()
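# A minimal invocation sketch, assuming the pre-gymnasium gym API used by
# env.step above; generate_random_map ships with gym's frozen lake module.
from gym.envs.toy_text.frozen_lake import generate_random_map

new_lake = generate_random_map(size=8, p=0.8)  # 8x8 lake, 80% frozen tiles
frozen_ql_experiment("FrozenLake-v0", new_lake)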