def learn_policy(self):
    # Initialize Q-learner.
    qlearner = QLearner(
        self.state_space,
        self.actions,
        self.handle_action,
        self.reset_training_world
    )

    # Initialize goal states.
    goal_states = []
    print "Enumerating goal states..."
    print self.state_space_dim
    for state_index in xrange(qlearner.r_table.size):
        state = numpy.unravel_index(state_index, qlearner.r_table.shape)
        if state[FullTransform.StateOffset.Arrows] == World.ArrowState.Arrows_Complete:
            goal_states.append(tuple(state))
    print "Goal states: %d" % len(goal_states)
    for goal_state in goal_states:
        qlearner.set_r_value(goal_state, 100)
    #print qlearner.r_table

    # Run Q-learner.
    print "Total states: %d" % qlearner.r_table.size
    qlearner.execute(goal_states, 500000, 50)

    # Return policy.
    return qlearner.get_policy()
def set_up_learner(self, learner, **kwargs):
    """
    Attaches the appropriate learner to the instance for testing.
    """
    if learner == FLearner:
        sflags = FlagGenerator(self.size, self.size)
        aflags = FlagGenerator(2, 2)
        self.learner = FLearner(rmatrix=self.rmatrix, goal=self.goals,
                                stateconverter=sflags, actionconverter=aflags,
                                tmatrix=self.tmatrix, seed=self.seed, **kwargs)
    elif learner == QLearner:
        self.learner = QLearner(rmatrix=self.rmatrix, goal=self.goals,
                                tmatrix=self.tmatrix, seed=self.seed, **kwargs)
    elif learner == SLearner:
        sflags = FlagGenerator(self.size, self.size)
        aflags = FlagGenerator(2, 2)
        sim = create_sim_env(self.size, self.random)

        def reward(svec, avec, nstate):
            action = aflags.encode(avec)
            state = sflags.encode((round(svec[0]), round(svec[1])))
            return self.rmatrix[state, action]

        def goal(svec):
            return self.coord2state((round(svec[0]), round(svec[1]))) in self.goals

        self.learner = SLearner(reward=reward, simulator=sim, goal=goal,
                                stateconverter=sflags, actionconverter=aflags,
                                seed=self.seed, **kwargs)
    elif learner is None:
        self.learner = None
    else:
        raise TypeError('Class: ' + learner.__name__ + ' is not supported. '
                        'Assign to .learner manually.')
def branin(discount, learning_rate, buckets_w, buckets_h, buckets_v):
    def run_game():
        # Make a new monkey object.
        swing = SwingyMonkey(visual=False,  # no video
                             sound=False,   # no audio
                             action_callback=learner_class.action_callback,
                             reward_callback=learner_class.reward_callback)
        # Loop until you hit something.
        while swing.game_loop():
            pass
        return swing

    # Make a new learner with the given parameters.
    learner_class = QLearner(learn_fn=lambda i: learning_rate,
                             discount_fn=lambda i: discount,
                             bucket_height=buckets_h,
                             bucket_width=buckets_w,
                             velocity_bucket=buckets_v)

    # Train the learner.
    for t in xrange(TRAIN_ITERS):
        run_game()

    # Keep learning, and average the score over the test iterations.
    scores = []
    for t in xrange(TEST_ITERS):
        # Make a new monkey object.
        swing = run_game()
        scores.append(swing.score)
    avg_score = float(sum(scores)) / float(TEST_ITERS)
    median_score = np.median(scores)
    print "The median is %d and the mean is %f." % (median_score, avg_score)

    # Our objective is to minimize the negative of the average score.
    return -1 * avg_score
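# branin() returns the negated average score so an external minimizer can
# treat it as a loss. A trivial random-search driver over the same five
# parameters might look like this (a sketch; the bounds are illustrative
# assumptions, not values from the original experiment):
import random

best_params, best_loss = None, float('inf')
for _ in range(20):
    params = dict(discount=random.uniform(0.5, 1.0),
                  learning_rate=random.uniform(0.01, 1.0),
                  buckets_w=random.randint(5, 50),
                  buckets_h=random.randint(5, 50),
                  buckets_v=random.randint(5, 50))
    loss = branin(**params)
    if loss < best_loss:
        best_params, best_loss = params, loss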
def learn_policy(self):
    # Initialize Q-learner.
    qlearner = QLearner(self.state_space, self.actions,
                        self.handle_action, self.reset_training_world)

    # Initialize reward states.
    goal_states = [(PositionTransform.HorizontalState.At + 1,
                    PositionTransform.VerticleState.At + 1)]
    for goal_state in goal_states:
        qlearner.set_r_value(goal_state, 100)
    # print qlearner.r_table

    # Run Q-learner.
    qlearner.execute(goal_states, 300, 50)

    # Return policy.
    return qlearner.get_policy()
def learn_policy(self):
    # Initialize Q-learner.
    qlearner = QLearner(
        self.state_space,
        self.actions,
        self.handle_action,
        self.reset_training_world
    )

    # Initialize reward states.
    goal_states = [(self.state_space[0].index(World.SiteState.Useless),)]
    for goal_state in goal_states:
        qlearner.set_r_value(goal_state, 100)
    #print qlearner.r_table

    # Run Q-learner.
    qlearner.execute(goal_states, 300, 30)

    # Return policy.
    return qlearner.get_policy()
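# The QLearner used by the learn_policy() variants above is defined
# elsewhere; these snippets only exercise its set_r_value / execute /
# get_policy interface. For reference, a minimal sketch of the tabular
# backup that an execute() loop typically iterates (the names and
# signature here are illustrative, not the actual implementation):
import numpy as np

def q_update(q_table, reward, state, action, next_state, alpha=0.1, gamma=0.9):
    # Standard Q-learning backup: move Q(s, a) toward r + gamma * max_a' Q(s', a').
    best_next = np.max(q_table[next_state])
    q_table[state, action] += alpha * (reward + gamma * best_next - q_table[state, action])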
OUT proximity refers to outside of the quartile of the player
"""
NUM_STATES = 32 * (54 ** args.numTeammates)
# Actions: Shoot, Dribble, or Pass to one of N teammates.
NUM_ACTIONS = 2 + args.numTeammates

hfo = HFOEnvironment()
hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

if args.inQTableDir:
    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=args.epsilon,
                         learning_rate=args.learningRate,
                         q_table_in=args.inQTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')
else:
    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=args.epsilon,
                         learning_rate=args.learningRate,
                         q_table_in=args.outQTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')

for episode in range(0, args.numEpisodes):
    status = IN_GAME
    action = None
def frozen_ql_experiment(env_name, new_lake):
    np.random.seed(0)
    min_r = -100.0
    max_r = 100.0
    problem = MyWrapper.TransformReward(
        gym.make(env_name, desc=new_lake),
        lambda r: np.clip(r * 100.0, min_r, max_r))
    problem.seed(0)
    problem.reset()

    folder = "q_learning/"
    env = MyWrapper.Monitor(problem, folder, force=True)

    # env.observation_space.n is the number of states.
    num_of_states = env.observation_space.n
    num_of_action = env.action_space.n

    alpha = [0.5, 0.9]  # learning rates to compare
    gamma = 0.99        # discount factor
    episodes = 10000
    rar = [0.1, 0.9]    # epsilon (random-action rate) values to compare
    radr = 0.99         # random-action decay rate

    def run_episodes(qlearner, max_steps=10000000):
        """Run all episodes for one learner.

        Returns the reward per episode, the iteration count at which each
        episode ended, and the cumulative wall-clock time (milliseconds)
        after each episode.
        """
        rewards_list = []
        iterations_list = []
        time_list = []
        elapsed = 0.0
        for episode in range(episodes):
            start_time = time.time()
            qlearner.s = env.reset()  # current state
            done = False
            total_reward = 0
            for i in range(max_steps):
                if done:
                    break
                # Either a random action or the best action for the current
                # state, according to the current Q-table.
                action = qlearner.choose_best_action(
                    qlearner.num_actions, qlearner.rar, qlearner.s,
                    qlearner.q_table)
                next_state, reward, done, info = env.step(action)
                qlearner.a = action
                total_reward += reward
                # Update the Q-table entry for (qlearner.s, action) using the
                # observed next state and reward; query() does not update
                # self.s and self.a here.
                qlearner.query(next_state, reward, False)
                qlearner.s = next_state
            elapsed += (time.time() - start_time) * 1000
            time_list.append(elapsed)
            rewards_list.append(total_reward)
            iterations_list.append(i)
        return rewards_list, iterations_list, time_list

    def chunk_list(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    episode_size = int(episodes / 50)

    # Run 1: alpha = 0.5, epsilon = 0.1.
    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=alpha[0],
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    rewards_list, iterations_list, time_list = run_episodes(qlearner)

    # Plot 1: average reward vs. iterations.
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]
    plt.title("Average Rewards vs Iterations (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Reward")
    plt.savefig("./plots/frozen_lake_experiment/frozen_qlearner_reward_vs_iterations.png")
    plt.close()
    plt.figure()

    # Plot 2: computation time vs. episodes.
    plt.title("Computation time vs episodes (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig("./plots/frozen_lake_experiment/computation_time_vs_episodes.png")
    plt.close()
    plt.figure()

    # Run 2: alpha = 0.9, epsilon = 0.1.
    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=alpha[1],
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    rewards_list, iterations_list, time_list = run_episodes(qlearner)

    # Plot 3: average reward vs. iterations.
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]
    plt.title("Reward vs Iteration (Learning Rate: 0.9, Epsilon: 0.1)")
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig("./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_alpha0.9.png")
    plt.close()
    plt.figure()

    # Plot 4: computation time vs. episodes.
    plt.title("Computation time vs episodes (learning rate: 0.9, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig("./plots/frozen_lake_experiment/computation_time_vs_episodes_alpha0.9.png")
    plt.close()
    plt.figure()

    # Run 3: alpha = 0.5, epsilon = 0.9.
    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=alpha[0],
        gamma=gamma,
        rar=rar[1],
        radr=radr,
    )
    rewards_list, iterations_list, time_list = run_episodes(qlearner, max_steps=10000)

    # Plot 5: average reward vs. iterations.
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]
    plt.title("Reward vs Iteration (Learning Rate: 0.5, Epsilon: 0.9)")
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig("./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_epsilon0.9.png")
    plt.close()
    plt.figure()

    # Plot 6: computation time vs. episodes.
    plt.title("Computation time vs episodes (learning rate: 0.5, Epsilon: 0.9)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (milliseconds)")
    plt.savefig("./plots/frozen_lake_experiment/computation_time_vs_episodes_epsilon0.9.png")
    plt.close()
    plt.figure()

    # Close the environment once all runs are finished.
    env.close()
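# The QLearner driven above follows the rar/radr convention: with
# probability rar a random action is chosen, and rar decays by the factor
# radr after each update. A minimal sketch of one query()-style step under
# that convention (illustrative only; the real class is imported elsewhere):
import numpy as np

def query_step(q, s, a, s_next, reward, rar, alpha=0.5, gamma=0.99, radr=0.99):
    # Backup the observed transition into the Q-table.
    q[s, a] += alpha * (reward + gamma * np.max(q[s_next]) - q[s, a])
    # Epsilon-greedy choice of the next action, then decay the random-action rate.
    if np.random.rand() < rar:
        a_next = np.random.randint(q.shape[1])
    else:
        a_next = int(np.argmax(q[s_next]))
    return a_next, rar * radr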
def find_paddle(state):
    line = state[paddle_line, 8:-8, 0]
    indices = np.where(line == 200)
    return np.mean(indices)


def find_ball(a, b):
    diff = b - a
    diff = diff[tim_sux:chris_sux, :, 0]
    indices = np.where(diff == 200)
    y = np.mean(indices[0]) + tim_sux
    x = np.mean(indices[1])  # chris_sux
    return (x, y)


env = gym.make('Breakout-v0')
learner = QLearner(num_states=500, num_actions=env.action_space.n)

for i_episode in range(2000):
    observation = env.reset()
    action = learner.set_initial_state(0)
    prev = observation
    total_reward = 0
    for t in range(10000):
        # env.render()
        prev = observation
        observation, reward, done, info = env.step(action)
        total_reward += reward
        paddle = find_paddle(observation)
        x, y = find_ball(prev, observation)
        try:
            feature = int(paddle - x)
            action = learner.move(feature, reward)
# main
from MapBuilder import MapBuilder
from qlearner import QLearner
from universe import Universe
from Criterions import get_cost_based_on_fuel, get_cost_based_on_time, get_cost_based_on_mixture

if __name__ == "__main__":
    universe = Universe(MapBuilder())
    qlearners = [
        QLearner(universe.get_initial_state(), get_cost_based_on_fuel,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(), get_cost_based_on_time,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(), get_cost_based_on_mixture,
                 universe.move_request, universe.get_terminal_state(),
                 1, 0.9, universe.next_state)
    ]
    num_of_epochs = 1000
    for epoch_num in range(num_of_epochs):
        for qlearner in qlearners:
            while qlearner._state != universe.get_terminal_state():
                qlearner.move()
            qlearner.reset(universe.get_initial_state())
    print("Energy:", qlearners[0]._Q, end='\n\n')
    print("Time:", qlearners[1]._Q, end='\n\n')
def test_instantiation():
    """
    Testing common QLearner initial arguments and support functions.
    """
    # Set-up:
    STATES = 10
    ACTIONS = 5
    rmatrix_sq = np.random.rand(STATES, STATES)
    rmatrix_rec = np.random.rand(STATES, ACTIONS)
    tmatrix = np.random.randint(0, STATES, size=(STATES, ACTIONS))
    # Make sure tmatrix points to goal states:
    tmatrix[:, ACTIONS - 1] = np.random.randint(0, 1, size=STATES)
    goal_l = (0, 1)
    goal_f = lambda x: x <= 1
    np.savetxt('test.dat', rmatrix_sq)
    global QLEARNER

    # Test 1: list goal
    temp = QLearner(rmatrix_sq, goal_l)
    assert np.array_equal(temp.rmatrix, rmatrix_sq), 'R matrix not equal to arg.'
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2) and not temp.goal(3), \
        'List goal not working.'
    QLEARNER = temp

    # Test 2: function goal
    temp = QLearner(rmatrix_sq, goal_f)
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2), 'Function goal not working.'
    QLEARNER = temp

    # Test 3: file I/O
    temp = QLearner('test.dat', goal_l)
    assert temp.qmatrix.shape == rmatrix_sq.shape, 'Q & R matrix dimension mismatch.'
    assert np.array_equal(temp.rmatrix, rmatrix_sq), 'R matrix not equal to arg.'
    QLEARNER = temp

    # Test 4: rectangular R matrix, no tmatrix (should raise ValueError)
    try:
        QLearner(rmatrix_rec, goal_l)
        assert False, 'Rectangular R matrix without tmatrix should raise ValueError.'
    except ValueError:
        pass

    # Test 5: rectangular R matrix, tmatrix of the same dimension
    temp = QLearner(rmatrix_rec, goal_f, tmatrix)
    assert temp.next_state(1, 2) == tmatrix[1, 2], 'Next state prediction incorrect.'
    QLEARNER = temp

    # Test 6: episodes
    l = set(temp.episodes(coverage=1.0, mode='bfs'))
    assert l == set(range(temp.num_states)), 'Full episode coverage failed.'

    # Finalize
    os.remove('test.dat')
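# Tests 1 and 2 above assert that `goal` may be given either as an iterable
# of goal states or as a predicate. A minimal normalization of the kind the
# constructor presumably performs (a sketch; the helper name is hypothetical):
def make_goal(goal):
    if callable(goal):
        return goal
    goal_set = set(goal)
    return lambda state: state in goal_set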
### SETUP
num_learning_trials = 10000
num_simulation_trials = 1000
num_learning_epochs = 15

### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []
for e, epsilon in enumerate(epsilon_list):
    print "Epsilon: {0}".format(epsilon)
    qlearner = QLearner(mdp1, initial_state1, epsilon=epsilon, alpha=learning_rate)
    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()
        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq, action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[e].append(avg_reward)
        print "MDP1 epoch {0}: {1}".format(epoch, avg_reward)
# L.Braun 2018
# Main program to solve a gridworld maze problem
# Uses qlearner.py, environ.py
from qlearner import QLearner
import pylab as plt

my_learner = QLearner()
my_learner.load_maze('/u/braun/tlab/QLearner/data/reward_4x4.npy',
                     '/u/braun/tlab/QLearner/data/meta_4x4.txt')

#print("testing data load\n\n")
#my_learner.display_Q()
#my_learner.display_R()

print("begin training...")
reward = my_learner.train(0.7)
my_learner.display_Q()
my_learner.display_R()

steps = my_learner.test(7)  # 7 foods in 4x4 maze
print("steps")
print(steps)
print("")

plt.hist(reward, 50, normed=1, facecolor='g', alpha=0.75)
plt.xlabel('Episodes required to reach 200')
# Initialise result data structures
rewards_per_run = dict()
runtime_per_run = []

# For each run, train the agent until the environment is solved or the
# episode budget runs out:
for run in range(num_runs):
    # Initialise result helpers
    end_episode = num_episodes  # indicates the episode in which the environment was solved
    start = timer()
    rewards = [0.0] * num_episodes  # reward per episode

    # Initialise environment and agent
    wrapper = CartPoleWrapperDiscrete()
    agent = QLearner(wrapper=wrapper, seed=run)

    style.use('fivethirtyeight')
    fig = plt.figure()
    plt.axis([0, args.episodes, 0, 300])
    plt.xlabel('Episodes')
    plt.ylabel('AVG Reward last 50 episodes')

    # For each episode, train the agent on the environment and record the
    # reward of each episode
    for episode in range(num_episodes):
        rewards[episode] = agent.train()
        if (episode % 50) == 0 and episode != 0:
            avg_last = float(sum(rewards[episode - 50:episode])) / 50
            plt.scatter(episode, avg_last)
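# The scatter plot above shows the trailing 50-episode average. The same
# series can also be computed in one shot after training with a moving
# average, e.g. (a sketch, assuming `rewards` is the list filled in the
# loop above):
import numpy as np

window = 50
avg_last_50 = np.convolve(rewards, np.ones(window) / window, mode='valid')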
#mdp.value_iteration()
#mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')
value_iter_pi = mdp.pi
plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states,
                           sinks=grid.sink_states)

value_iter_data = np.zeros([TRIALS, ITER])
classic_q_data = np.zeros([TRIALS, ITER])

for t in range(TRIALS):
    mdp.load_policy(filename='scen1.p')
    q = QLearner(grid, mdp, moves=40)

    r = 0.0
    for i in range(ITER):
        q.guide()
        r = r + q.get_reward() / ITER
    print "Value iter reward: " + str(r)
    value_iter_data[t, :] = np.zeros(ITER) + r

    r = 0.0
    q.clear_states()
    mdp.pi = QPolicy(q)
    a = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
    for i in range(ITER * SAMP):
        q.rollout()
        r = r + q.get_reward() / (ITER * SAMP)
rewards = scenarios.scenario0['rewards']
sinks = scenarios.scenario0['sinks']
grid.reward_states = rewards
grid.sink_states = sinks

mdp = ClassicMDP(ClassicPolicy(grid), grid)
#mdp.value_iteration()
#mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')
value_iter_pi = mdp.pi
plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states,
                           sinks=grid.sink_states)

q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False
for i in range(20):
    q.guide()
#for key in q.Q.dataset.keys():
#    print key, ",", np.mean(q.Q.dataset[key])

an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
#print q.Q.get(State(2, 12), -1)
#print len(q.states)
def find_paddle(state):
    line = state[paddle_line, 8:-8, 0]
    indices = np.where(line == 200)
    return np.mean(indices)


def find_ball(a, b):
    diff = b - a
    diff = diff[tim_sux:chris_sux, :, 0]
    indices = np.where(diff == 200)
    y = np.mean(indices[0]) + tim_sux
    x = np.mean(indices[1])  # chris_sux
    return (x, y)


env = gym.make('Breakout-v0')
learner = QLearner(num_states=200, num_actions=env.action_space.n)

for i_episode in range(2000):
    observation = env.reset()
    action = learner.set_initial_state(0)
    prev = observation
    for t in range(10000):
        env.render()
        # print(observation)
        paddle = find_paddle(observation)
        x, y = find_ball(prev, observation)
        try:
            feature = int(paddle - x)
            if feature > 15:
                feature = 15
            if feature < -15:
                feature = -15
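# Both Breakout snippets discretize the paddle-to-ball horizontal offset
# into a small integer feature before handing it to learner.move(). With
# the +/-15 clamp above, a non-negative state index can be obtained by
# shifting (a sketch; the actual encoding lives inside the QLearner class):
def feature_to_state(feature, clamp=15):
    feature = max(-clamp, min(clamp, feature))
    return feature + clamp  # maps [-15, 15] onto [0, 30]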
class QTrainer:
    def __init__(self, config_or_model, load_model=False):
        self.config = None
        self.model_loaded = False

        # Load a saved model.
        if load_model:
            print("Loading model from: {}".format(config_or_model))
            load_path = Path(config_or_model)
            if (not load_path.exists()) or (not load_path.is_dir()):
                print("Error: directory doesn't exist")
            config_filename = load_path.joinpath("config.json")
            self.config = self.load_config(str(config_filename))
        else:
            self.config = self.load_config(config_or_model)

        # Select game.
        self.game_name = self.config["game"]
        self.game = None
        if self.game_name == "snake":
            self.game = game.Snake
        elif self.game_name == "box":
            self.game = game.Box
        else:
            print("Error: unknown game {}".format(self.game_name))

        self.nn_config = self.config["nn"]

        # Parameters of the experience memory.
        self.memory_size = self.config["memory_size"]
        self.memory_alpha = self.config["memory_alpha"]
        self.memory_beta_start = self.config["memory_beta_start"]
        self.memory_beta_end = self.config["memory_beta_end"]
        self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
        self.memory_beta_step = (self.memory_beta_end -
                                 self.memory_beta_start) / self.memory_beta_num_steps
        self.exp_memory_start_size = self.config["memory_start_size"]

        # Game parameters: image size, board size, num_goals, ...
        self.width = self.config["width"]
        self.height = self.config["height"]
        self.image_scale_factor = self.config["image_scale_factor"]
        self.num_goals = self.config["num_goals"]
        self.img_width = self.width * self.image_scale_factor
        self.img_height = self.height * self.image_scale_factor
        self.num_img_channels = self.game.num_channels
        self.num_actions = self.game.num_actions

        # Random-policy parameters.
        self.epsilon_start = self.config["epsilon_start"]
        self.epsilon_min = self.config["epsilon_min"]
        self.num_epsilon_steps = self.config["num_epsilon_steps"]
        self.epsilon_step = (self.epsilon_start -
                             self.epsilon_min) / self.num_epsilon_steps

        # Scale rewards; training might be more stable if Q-values converge
        # to the range [-1, 1].
        self.scale_reward_max = None
        if "scale_reward_max" in self.config:
            self.scale_reward_max = self.config["scale_reward_max"]
            self.game.max_reward *= self.scale_reward_max
            self.game.min_reward *= self.scale_reward_max
            self.game.empty_reward *= self.scale_reward_max
            print("Scaling rewards by {}".format(self.scale_reward_max))

        # Frequency parameters for updating the target network, output,
        # saving, tensorboard logging, and evaluation.
        self.max_steps = self.config["max_steps"]
        self.output_freq = self.config["output_freq"]
        self.update_freq = self.config["update_freq"]
        self.target_network_update_mode = self.config["target_network_update_mode"]
        self.target_network_update_tau = None
        self.target_network_update_freq = None
        if self.target_network_update_mode == "hard":
            self.target_network_update_freq = self.config["target_network_update_freq"]
        else:
            self.target_network_update_tau = self.config["target_network_update_tau"]
        self.eval_freq = self.config["eval_freq"]
        self.eval_steps = self.config["eval_steps"]
        self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
        self.tensorboard_log_path = self.config["tensorboard_log_path"]
        self.save_freq = self.config["save_freq"]
        self.save_path = self.config["save_path"]
        self.batch_size = self.config["batch_size"]

        # Parameters that actually change while training; these need to be
        # saved and loaded.
        self.curr_step = 0
        self.epsilon = self.epsilon_start
        self.memory_beta = self.memory_beta_start
        self.best_average_score = 0

        # Create the experience memory.
        self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                           self.img_height,
                                           self.num_img_channels,
                                           self.memory_alpha)

        # Create the QLearner object; load a saved neural-network model if
        # necessary.
        self.qlearner = None
        if load_model:
            load_path = str(Path(config_or_model).joinpath("nn").joinpath("model"))
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                load_model=load_path,
                target_network_update_tau=self.target_network_update_tau)
            self.curr_step = self.config["curr_step"]
            self.epsilon = self.config["epsilon"]
            self.memory_beta = self.config["memory_beta"]
            self.best_average_score = self.config["best_average_score"]
            print("Model loaded successfully")
            self.model_loaded = True
        else:
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                target_network_update_tau=self.target_network_update_tau)

        if self.tensorboard_log_freq > 0:
            self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)

    # Return a new game instance.
    def get_game(self):
        return self.game(self.width, self.height, self.image_scale_factor,
                         self.num_goals)

    # Initialize the experience memory by random play, i.e. at each step the
    # agent chooses an action uniformly at random.
    def init_random_exp_memory(self, size):
        if size > self.memory_size:
            size = self.memory_size
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            random_action = np.random.randint(0, self.num_actions)
            reward, is_terminal = game.execute_action(random_action)
            state = game.get_state()
            self.exp_memory.add(state, random_action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    # Initialize the experience memory with an epsilon-greedy policy.
    def init_exp_memory(self, size):
        if size > self.memory_size:
            size = self.memory_size
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        for i in range(size):
            action = 0
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            state = game.get_state()
            self.exp_memory.add(state, action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)

    def train(self):
        if self.model_loaded:
            self.init_exp_memory(self.exp_memory_start_size)
        else:
            self.init_random_exp_memory(self.exp_memory_start_size)
        total_reward = 0.0
        games_played = 1
        game = self.get_game()
        self.exp_memory.add(game.get_state(), 0, 0, 0)
        while self.curr_step < self.max_steps:
            # Play one game step according to the epsilon-greedy policy.
            action = 0
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.num_actions)
            else:
                action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            self.exp_memory.add(game.get_state(), action, reward, is_terminal)
            if is_terminal:
                game.reset()
                self.exp_memory.add(game.get_state(), 0, 0, 0)
                games_played += 1
            total_reward += self.renormalize_reward(reward)

            # Compute the next epsilon and memory beta.
            self.epsilon = np.maximum(self.epsilon_min,
                                      self.epsilon - self.epsilon_step)
            self.memory_beta = np.minimum(self.memory_beta_end,
                                          self.memory_beta + self.memory_beta_step)

            if self.curr_step % self.update_freq == 0:
                # Sample a batch of transitions from the experience memory.
                s, a, r, s2, t, indices, p_values = self.exp_memory.sample(
                    self.batch_size)
                # Output tensorboard summaries.
                write_summary = False
                if (self.tensorboard_log_freq > 0) and (
                        self.curr_step % self.tensorboard_log_freq == 0):
                    write_summary = True
                # Beta is divided by 2 here because the squared-error loss
                # squares beta.
                _, _, td = self.qlearner.train_step(
                    s, a, r, s2, t, p_values, self.memory_beta / 2.0,
                    write_summary=write_summary)
                self.exp_memory.update_p(indices, td)

            # Update the target network.
            if self.target_network_update_mode == "soft":
                if self.curr_step % self.update_freq == 0:
                    self.qlearner.update_target_network()
            else:
                if self.curr_step % self.target_network_update_freq == 0:
                    self.qlearner.update_target_network()

            # Output the current training status.
            if self.curr_step % self.output_freq == 0:
                average_reward = total_reward / games_played
                total_reward = 0
                games_played = 1
                print("step: {} epsilon: {} average reward per game: {}".format(
                    self.curr_step, self.epsilon, average_reward))

            # Evaluate the current target network and save the model if the
            # average score per game has improved.
            if self.curr_step % self.eval_freq == 0:
                score, num_games, average, max_score = self.eval(self.eval_steps)
                print("Evaluating model with {} steps:".format(self.eval_steps))
                print("Total score: {} Games: {} Average: {} Max: {}".format(
                    score, num_games, average, max_score))
                if average >= self.best_average_score:
                    print("Improved average score")
                    print("Saving model...")
                    self.save()
                    self.best_average_score = average
                # Add the average score to the tensorboard summary.
                summary = tf.Summary()
                summary.value.add(tag='average_score', simple_value=average)
                summary.value.add(tag='max_score', simple_value=max_score)
                self.qlearner.summary_writer.add_summary(summary, self.curr_step)

            self.curr_step += 1

    # Evaluate the model for a given number of steps.
    def eval(self, num_steps):
        game = self.get_game()
        total_score = 0.0
        current_score = 0.0
        num_games = 1.0
        max_score = 0.0
        for i in range(num_steps):
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_score += reward
            total_score += reward
            if is_terminal:
                game.reset()
                if i < (num_steps - 1):
                    num_games += 1
                if current_score > max_score:
                    max_score = current_score
                current_score = 0
        average = total_score / num_games
        return total_score, num_games, average, max_score

    # Compute the original values for scaled rewards.
    def renormalize_reward(self, reward):
        if self.scale_reward_max is not None:
            return reward / self.scale_reward_max
        else:
            return reward

    def load_config(self, filename):
        result = None
        with open(filename, 'r') as fp:
            result = json.load(fp)
        return result

    def save(self):
        base_path = Path(self.save_path)
        if not base_path.exists():
            base_path.mkdir()
        date_str = datetime.datetime.today().strftime("%Y-%m-%d--%H-%M")
        save_path = date_str + "--step" + str(self.curr_step)
        save_path = base_path.joinpath(save_path)
        # Create the path if it doesn't exist.
        if not save_path.exists():
            save_path.mkdir()
        self.config["epsilon"] = self.epsilon
        self.config["curr_step"] = self.curr_step
        self.config["memory_beta"] = self.memory_beta
        self.config["best_average_score"] = self.best_average_score
        # Save the config.
        config_filename = save_path.joinpath("config.json")
        with config_filename.open('w') as fp:
            json.dump(self.config, fp, indent=4)
        # Save the neural network.
        nn_path = save_path.joinpath("nn")
        if not nn_path.exists():
            nn_path.mkdir()
        self.qlearner.save_model(str(nn_path.joinpath("model")))

    # Output game images.
    def eval_with_images(self, num_steps, path):
        image_id = 0
        game = self.get_game()
        self.save_image(game.get_state(), path, image_id, 0, 0, 0, 0.0)
        total_score = 0
        games_finished = 0
        max_game_score = 0
        current_game_score = 0.0
        for i in range(num_steps):
            image_id += 1
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            total_score += reward
            current_game_score += reward
            self.save_image(game.get_state(), path, image_id, action, reward,
                            is_terminal, score=current_game_score)
            if is_terminal:
                game.reset()
                games_finished += 1
                if current_game_score > max_game_score:
                    max_game_score = current_game_score
                current_game_score = 0.0
                self.save_image(game.get_state(), path, image_id, action,
                                reward, is_terminal, score=current_game_score)
        print("Max score: {}".format(max_game_score))

    # Output images for games whose score is above a given threshold.
    def find_max_games(self, num_steps, path, score_threshold):
        image_id = 0
        game = self.get_game()
        frames = []
        frames.append((np.copy(game.get_state()), 0.0))
        max_game_score = 0
        current_game_score = 0.0
        for i in range(num_steps):
            if i % (num_steps // 10) == 0:
                print("At step {}".format(i))
            action = self.qlearner.compute_action(game.get_state())[0]
            reward, is_terminal = game.execute_action(action)
            reward = self.renormalize_reward(reward)
            current_game_score += reward
            frames.append((np.copy(game.get_state()), current_game_score))
            if is_terminal:
                game.reset()
                if current_game_score > max_game_score:
                    max_game_score = current_game_score
                if current_game_score > score_threshold:
                    print("Saving images...")
                    for frame in frames:
                        self.save_image(frame[0], path, image_id, 0, 0, 0,
                                        score=frame[1])
                        image_id += 1
                frames = []
                frames.append((np.copy(game.get_state()), 0.0))
                current_game_score = 0.0
        print("Max score: {}".format(max_game_score))

    # Output transition images.
    def test_experience_memory(self, num_steps, path):
        image_id = 0
        self.init_random_exp_memory(self.exp_memory_start_size)
        s, a, r, s2, t = self.exp_memory.sample(num_steps)
        for i in range(num_steps):
            image_id += 1
            action = a[i]
            reward = r[i]
            is_terminal = t[i]
            self.save_transition(s[i], action, reward, s2[i], is_terminal,
                                 path, image_id)

    def save_transition(self, s, a, r, s2, t, path, image_id):
        self.save_image(self.combine_images(s, s2), path, image_id, a, r, t)

    def combine_images(self, image1, image2, sep_width=10):
        image1 = np.squeeze(image1)
        image2 = np.squeeze(image2)
        shape = image1.shape
        sep = np.ones([shape[0], sep_width, self.num_img_channels], dtype=float)
        frames1 = []
        frames2 = []
        for j in range(self.num_frames):
            start_index = j * self.num_img_channels
            end_index = (j + 1) * self.num_img_channels
            frames1.append(image1[:, :, start_index:end_index])
            frames2.append(image2[:, :, start_index:end_index])
            if j != (self.num_frames - 1):
                frames1.append(sep)
                frames2.append(sep)
        image1 = np.concatenate(frames1, axis=1)
        image2 = np.concatenate(frames2, axis=1)
        shape = image1.shape
        sep = np.ones([sep_width, shape[1], self.num_img_channels], dtype=float)
        return np.concatenate((image2, sep, image1), axis=0)

    def save_image(self, img, path, image_id, action, reward, is_terminal,
                   score=None):
        save_file = Path(path).joinpath("img{}.png".format(image_id))
        with save_file.open('wb') as fp:
            fig = plt.figure()
            plt.imshow(np.squeeze(img), origin="lower")
            plt.axis("off")
            if score is not None:
                plt.title("Score: {}".format(score))
            else:
                plt.title("action: {} reward: {} terminal: {}".format(
                    self.game.action_names[action], reward, is_terminal))
            fig.savefig(fp, bbox_inches='tight', format="png")
            plt.close()
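# A minimal driver for the QTrainer class above, assuming a JSON config file
# containing the keys read in __init__ (both paths below are illustrative):
if __name__ == "__main__":
    trainer = QTrainer("configs/snake.json")
    trainer.train()
    # To resume training from a saved model directory instead:
    # trainer = QTrainer("saves/2019-01-01--12-00--step100000", load_model=True)
    # trainer.train()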
Pass opening angle, SMALL or LARGE or INVALID -- 3
Goal scoring angle, SMALL or LARGE or INVALID -- 3
OUT proximity refers to outside of the quartile of the player
"""
NUM_STATES = 32 * (54 ** args.numTeammates)
# Actions: Shoot, Pass to one of N teammates, or Dribble.
NUM_ACTIONS = 2 + args.numTeammates

hfo = HFOEnvironment()
hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

q_learner = QLearner(NUM_STATES, NUM_ACTIONS, epsilon=0.0,
                     q_table_in=args.qTableDir + str(args.playerIndex) + '.npy',
                     q_table_out=args.qTableDir + str(args.playerIndex) + '.npy')

for episode in range(0, args.numEpisodes):
    status = IN_GAME
    action = None
    state = None
    history = []
    timestep = 0
    while status == IN_GAME:
        timestep += 1
        features = hfo.getState()
        # Print the features in a readable manner:
        # feature_printer(features, args.numTeammates, args.numOpponents)
        if int(features[5] != 1):
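# NUM_STATES = 32 * (54 ** numTeammates) suggests a mixed-radix state
# encoding: one base-32 component for the agent's own features plus one
# base-54 component per teammate. A sketch of such an index computation
# (illustrative; the actual mapping lives in the state-extraction code):
def encode_state(own_component, teammate_components):
    # own_component in [0, 32); each teammate component in [0, 54).
    index = own_component
    for component in teammate_components:
        index = index * 54 + component
    return index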