def main(argv):
    rospy.init_node('jackal_gym', anonymous=True)

    env = gym.make('jackal-v0')
    rospack = rospkg.RosPack()
    training_dir = rospack.get_path('jackal-gym') + '/training_results'
    env = gym.wrappers.Monitor(env, training_dir, force=True)

    # Training hyperparameters loaded from the ROS parameter server
    alpha = rospy.get_param('/alpha')
    epsilon = rospy.get_param('/epsilon')
    gamma = rospy.get_param('/gamma')
    epsilon_discount = rospy.get_param('/epsilon_discount')
    num_episodes = rospy.get_param('/num_episodes')
    num_steps = rospy.get_param('/num_steps')

    last_time_steps = numpy.ndarray(0)
    q_learning = qlearning.QLearning(range(env.action_space.n),
                                     epsilon, alpha, gamma)
    highest_reward = 0

    for episode in range(num_episodes):
        rospy.loginfo('Starting episode %s' % (episode))
        cumulated_reward = 0
        done = False

        # Decay the exploration rate until it reaches the 0.05 floor
        if q_learning.epsilon > 0.05:
            q_learning.epsilon *= epsilon_discount

        observation = env.reset()
        state = ''.join(map(str, observation))

        for step in range(num_steps):
            rospy.loginfo('Step %d' % (step))
            action = q_learning.choose_action(state)
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward
            highest_reward = max(highest_reward, cumulated_reward)

            next_state = ''.join(map(str, observation))
            q_learning.learn(state, action, reward, next_state)

            if done:
                last_time_steps = numpy.append(last_time_steps, [int(step + 1)])
                break
            else:
                state = next_state

        rospy.loginfo('Finished episode ' + str(episode))

    time_steps = last_time_steps.tolist()
    time_steps.sort()
    rospy.loginfo("Overall score: {:0.2f}".format(last_time_steps.mean()))
    rospy.loginfo("Best 10 score: {:0.2f}".format(
        sum(time_steps[-10:]) / len(time_steps[-10:])))

    env.close()
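# The training loop above assumes a project-local tabular Q-learning agent
# exposing choose_action(state) and learn(state, action, reward, next_state).
# The actual `qlearning` module is not shown in this section; the class below is
# only a minimal sketch of such an agent (epsilon-greedy selection plus the
# standard one-step Q-learning update). All names and default values here are
# illustrative assumptions, not the original implementation.
import random


class QLearning(object):
    def __init__(self, actions, epsilon=0.9, alpha=0.2, gamma=0.8):
        self.q = {}                  # Q-table keyed by (state, action)
        self.actions = list(actions)
        self.epsilon = epsilon       # exploration rate
        self.alpha = alpha           # learning rate
        self.gamma = gamma           # discount factor

    def get_q(self, state, action):
        return self.q.get((state, action), 0.0)

    def choose_action(self, state):
        # With probability epsilon take a random action, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = [self.get_q(state, a) for a in self.actions]
        return self.actions[q_values.index(max(q_values))]

    def learn(self, state, action, reward, next_state):
        # One-step Q-learning update:
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = max(self.get_q(next_state, a) for a in self.actions)
        old = self.get_q(state, action)
        self.q[(state, action)] = old + self.alpha * (
            reward + self.gamma * best_next - old)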
def _create_players(self, p1_name, p2_name, is_against_ai):
    player1 = human_player.HumanPlayer(p1_name, "O", self._is_cli)
    if is_against_ai:
        q = qlearning.QLearning(self.qtable_pickle_file)
        player2 = computer_player.ComputerPlayer("Computer", "X", True, q)
    else:
        player2 = human_player.HumanPlayer(p2_name, "X", self._is_cli)
    return player1, player2
def test_train_ql():
    learning_agent = QL.QLearning()
    for i in range(1, MAX_TRAIN_EPISODE):
        game = BJ.BlackJack()
        status = game.deal()
        step = 0
        if status is BJ.Status.BLACKJACK:
            # Game is over right after distribution and this is not useful for training
            continue
        game_history = []

        # Agent turn
        while game.round is None:
            # When the action returns STAND or BUST, the loop should exit
            step += 1
            previous_state = game.get_state()
            action = learning_agent.get_action(previous_state)
            if action == BJ.Action.HIT:
                status = game.hit()
                game_history.append(
                    [previous_state, action, game.get_state(), step])
                if status is BJ.Status.GOOD:  # non-terminal state
                    continue
            else:
                status = game.stand()
                game_history.append(
                    [previous_state, action, game.get_state(), step])
                if status is not BJ.Status.STAND:  # non-terminal state
                    continue

        if game.round == BJ.Round.WIN:
            reward = 1
        elif game.round == BJ.Round.LOSE:
            reward = -1
        elif game.round == BJ.Round.TIE:
            reward = 0
        else:
            raise ValueError('Error in handling the game status')

        # Assign the terminal reward to the last transition only; intermediate
        # transitions get a reward of 0.
        for ele in game_history:
            if step == ele[3]:
                reward_recalculated = reward
            else:
                reward_recalculated = 0
            learning_agent.learn(ele[0], ele[1], ele[2], reward_recalculated)

    print_state_table(learning_agent)
    print(learning_agent._Q)
    report(play(learning_agent, MAX_RUNIN_EPISODE))
def main():
    action_space_name = 'large_action_space'
    action_space = actions.BaseAction(action_space_name)
    agent = qlearning.QLearning(q_file, epsilon=1, action_space=action_space)
    env = environment.Environment(buoys, steps_between_actions, vessel_id,
                                  rudder_id, thruster_id, scenario, goal,
                                  goal_heading_e_ccw, goal_vel_lon, False)
    with open(variables_file, 'wb') as outfile:
        pickle_vars = dict()
        pickle_vars['action_space'] = action_space_name
        # env.set_up()
        # env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
        agent.exploring = True
        pickle.dump(pickle_vars, outfile)
        for episode in range(max_episodes):
            print('###STARTING EPISODE ', episode)
            env.set_up()
            # env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
            episode_dict = dict()
            episode_transitions_list = list()
            final_flag = 0
            env.new_episode()
            for step in range(maximum_training_steps):
                state = env.get_state()
                print('Yaw:', state[2])
                angle, rot = agent.select_action(state)
                state_prime, reward = env.step(angle, rot)
                # state_prime, reward = env.step(0, 0)
                print('Reward:', reward)
                transition = (state, (angle, rot), state_prime, reward)
                final_flag = env.is_final()
                agent.observe_reward(state, angle, rot, state_prime, reward,
                                     final_flag)
                print("***Training step " + str(step + 1) + " Completed")
                episode_transitions_list.append(transition)
                if final_flag != 0:
                    break
            episode_dict['episode_number'] = episode
            episode_dict['transitions_list'] = episode_transitions_list
            episode_dict['final_flag'] = final_flag
            pickle_vars['ep#' + str(episode)] = episode_dict
            pickle.dump(episode_dict, outfile)
    env.finish()

    # Now that the training has finished, the agent can use its policy without updating it
    with open(learner_file, 'wb') as outfile:
        pickle.dump(agent, outfile)
def main():
    action_space_name = 'cte_rotation'
    action_space = actions.BaseAction(action_space_name)
    agent = qlearning.QLearning(q_file, epsilon=0.1,
                                action_space=action_space, gamma=1.0)
    env = environment.Environment(buoys, steps_between_actions, vessel_id,
                                  rudder_id, thruster_id, scenario, goal,
                                  goal_heading_e_ccw, goal_vel_lon, True)
    # with open(variables_file, 'wb') as outfile:
    pickle_vars = dict()
    pickle_vars['action_space'] = action_space_name
    env.set_up()
    env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
    # env.set_single_start_pos_mode([6000, 4000, -103.5, 3, 0, 0])
    agent.exploring = True
    # pickle.dump(pickle_vars, outfile)
    for episode in range(500):
        print('###STARTING EPISODE ', episode)
        transitions_list = list()
        final_flag = 0
        env.move_to_next_start()
        env.reset_to_start()
        for step in range(5000):
            state = env.get_state()
            print('Yaw:', state[2])
            angle, rot = agent.select_action(state)
            state_prime, reward = env.step(angle, rot)
            # state_prime, reward = env.step(0, 0)
            print('Reward:', reward)
            transition = (state, (angle, rot), state_prime, reward)
            final_flag = env.is_final()
            agent.observe_reward(state, angle, rot, state_prime, reward,
                                 final_flag)
            print("***Training step " + str(step + 1) + " Completed")
            transitions_list.append(transition)
            if final_flag != 0:
                break
        # state_prime, reward = env.step(0, 0)
        # state_prime, reward = env.step(0, 0)
        with open('qlearning_' + 'action_' + action_space_name + '_' +
                  str(episode), 'wb') as outfile:
            pickle.dump(transitions_list, outfile)
        transitions_list = list()

    with open(learner_file, 'wb') as outfile:
        pickle.dump(agent, outfile)
def _start(self):
    q = qlearning.QLearning(self.qtable_pickle_file, self.alpha,
                            self.gamma, self.delta, self.epsilon)
    player1, player2 = self.create_computer_players(q)
    self.rounds = int(input('Enter no. of rounds: '))
    self.session = session.Session(player1, player2, self.update_new_move,
                                   self.update_new_game,
                                   self.update_invalid_move)
    self.session.run(self.rounds)
learning_rates = [0.01, 0.05, 0.1]
discount_values = [0.7, 0.8, 0.9]

epsilon = 1
START_EPSILON_DECAYING = 1000
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Grid search over learning rate and discount factor
for learning_rate in learning_rates:
    for discount_value in discount_values:
        np.random.seed(10)
        q = qlearning.QLearning(learning_rate, discount_value, ACTION_SPACE_SIZE)
        for episode in range(EPISODES):
            episode_reward = 0
            if not episode % avg_window_size:
                print(episode)
            env.reset()
            done = False
            player_is_winner = False
            while not done:
                (dice, movable_pieces, player_pieces, enemy_pieces,
                 player_is_winner, _), player_i = env.get_observation()
                if player_i == 0 and movable_pieces.size:
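# The snippet above computes epsilon_decay_value but is truncated before the
# decay is applied. A common pattern, shown here as a self-contained sketch
# (the defaults and the function name decay_schedule are illustrative
# assumptions, not the original code's missing body), is to subtract the decay
# step once per episode inside the decay window so that epsilon falls linearly
# from its initial value toward 0 between the start and end episodes:

def decay_schedule(episodes=10000, start_decay=1000, end_decay=None, epsilon=1.0):
    """Yield the exploration rate used for each episode under linear decay."""
    end_decay = end_decay if end_decay is not None else episodes // 2
    step = epsilon / (end_decay - start_decay)
    for episode in range(episodes):
        yield epsilon
        if end_decay >= episode >= start_decay:
            epsilon -= step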
def build_objects():
    """Builds the agent (Q-learning) object and environment object."""
    return qlearning.QLearning(), environment.Environment()
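# build_objects() only constructs the agent and environment; a driver that
# wires them together is not shown in this section. The loop below is a
# hypothetical usage sketch: the method names reset(), get_state(), step(),
# choose_action() and learn(), and the (next_state, reward, done) return shape,
# are assumptions for illustration and may not match this project's actual
# qlearning / environment modules.

def train(episodes=100, max_steps=1000):
    agent, env = build_objects()
    for _ in range(episodes):
        env.reset()
        state = env.get_state()
        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state)
            state = next_state
            if done:
                break
    return agent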
        self.model = nn.Sequential(nn.Linear(in_size, 40, bias=True),
                                   nn.ReLU(),
                                   nn.Linear(40, out_size, bias=True),
                                   nn.Softmax(dim=1))

    def forward(self, inputs):
        x = self.model(inputs)
        return x


print(1)
states = []
actions = []

qbot = qlearning.QLearning(10, 0.00)
nnbot = keras.models.load_model(
    "C:/Users/Kilby/Code/Waterloo/CS680/Project/NN1.h5")
gabot = GeneticSnake(6, 4)
gabot.load_state_dict(
    torch.load(
        "C:/Users/Kilby/Code/Waterloo/CS680/Project/genetic_champs/winners1/champ29.pt"
    ))
gabot.eval()

# Training data:
# Import test data and training data
play = Play_Snake(qbot)