Example #1
File: train.py  Project: ICGog/jackal-gym
def main(argv):
    rospy.init_node('jackal_gym', anonymous=True)
    env = gym.make('jackal-v0')

    rospack = rospkg.RosPack()
    training_dir = rospack.get_path('jackal-gym') + '/training_results'
    env = gym.wrappers.Monitor(env, training_dir, force=True)

    alpha = rospy.get_param('/alpha')
    epsilon = rospy.get_param('/epsilon')
    gamma = rospy.get_param('/gamma')
    epsilon_discount = rospy.get_param('/epsilon_discount')
    num_episodes = rospy.get_param('/num_episodes')
    num_steps = rospy.get_param('/num_steps')

    last_time_steps = numpy.ndarray(0)

    q_learning = qlearning.QLearning(range(env.action_space.n), epsilon, alpha,
                                     gamma)

    highest_reward = 0

    for episode in range(num_episodes):
        rospy.loginfo('Starting episode %s' % (episode))

        cumulated_reward = 0
        done = False
        if q_learning.epsilon > 0.05:
            q_learning.epsilon *= epsilon_discount

        observation = env.reset()
        state = ''.join(map(str, observation))

        for step in range(num_steps):
            rospy.loginfo('Step %d' % (step))
            action = q_learning.choose_action(state)
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward
            highest_reward = max(highest_reward, cumulated_reward)
            next_state = ''.join(map(str, observation))

            q_learning.learn(state, action, reward, next_state)

            if done:
                last_time_steps = numpy.append(last_time_steps,
                                               [int(step + 1)])
                break
            else:
                state = next_state

        rospy.loginfo('Finished episode ' + str(episode))

    time_steps = last_time_steps.tolist()
    time_steps.sort()
    rospy.loginfo("Overall score: {:0.2f}".format(last_time_steps.mean()))
    rospy.loginfo("Best 10 score: {:0.2f}".format(
        reduce(lambda x, y: x + y, l[-10:]) / len(l[-10:])))

    env.close()
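
Note: the qlearning.QLearning class imported above is not shown on this page. The sketch below is an assumed minimal tabular implementation matching the interface Example #1 relies on (a constructor taking the action list, epsilon, alpha, and gamma, plus choose_action and learn); it is not the project's actual module.

# Assumed stand-in for the qlearning module used in Example #1.
import random


class QLearning(object):
    def __init__(self, actions, epsilon, alpha, gamma):
        self.q = {}                # (state, action) -> value
        self.actions = list(actions)
        self.epsilon = epsilon     # exploration rate
        self.alpha = alpha         # learning rate
        self.gamma = gamma         # discount factor

    def get_q(self, state, action):
        return self.q.get((state, action), 0.0)

    def choose_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        values = [self.get_q(state, a) for a in self.actions]
        return self.actions[values.index(max(values))]

    def learn(self, state, action, reward, next_state):
        # One-step Q-learning update:
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = max(self.get_q(next_state, a) for a in self.actions)
        old = self.get_q(state, action)
        self.q[(state, action)] = old + self.alpha * (
            reward + self.gamma * best_next - old)
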
Example #2
def _create_players(self, p1_name, p2_name, is_against_ai):
    player1 = human_player.HumanPlayer(p1_name, "O", self._is_cli)
    if is_against_ai:
        q = qlearning.QLearning(self.qtable_pickle_file)
        player2 = computer_player.ComputerPlayer("Computer", "X", True, q)
    else:
        player2 = human_player.HumanPlayer(p2_name, "X", self._is_cli)
    return player1, player2
def test_train_ql():
    learning_agent = QL.QLearning()
    for i in range(1, MAX_TRAIN_EPISODE):
        game = BJ.BlackJack()
        status = game.deal()
        step = 0

        # The game is over right after the deal, so this hand is not useful for training
        if status is BJ.Status.BLACKJACK:
            continue

        game_history = []

        # Agent turn
        while game.round is None:
            # The loop exits once the action results in STAND or BUST
            step += 1
            previous_state = game.get_state()
            action = learning_agent.get_action(previous_state)

            if (action == BJ.Action.HIT):
                status = game.hit()
                game_history.append(
                    [previous_state, action,
                     game.get_state(), step])
                if (status is BJ.Status.GOOD):  # non-terminal state
                    continue
            else:
                status = game.stand()
                game_history.append(
                    [previous_state, action,
                     game.get_state(), step])
                if (status is not BJ.Status.STAND):  # non-terminal state
                    continue

            if (game.round == BJ.Round.WIN):
                reward = 1
            elif (game.round == BJ.Round.LOSE):
                reward = -1
            elif (game.round == BJ.Round.TIE):
                reward = 0
            else:
                raise ValueError('Error in handling the game status')

            for ele in game_history:
                if (step == ele[3]):
                    reward_recalculated = reward
                else:
                    reward_recalculated = 0
                learning_agent.learn(ele[0], ele[1], ele[2],
                                     reward_recalculated)

    print_state_table(learning_agent)
    print(learning_agent._Q)
    report(play(learning_agent, MAX_RUNIN_EPISODE))
Example #4
def main():
    action_space_name = 'large_action_space'
    action_space = actions.BaseAction(action_space_name)
    agent = qlearning.QLearning(q_file, epsilon=1, action_space=action_space)
    env = environment.Environment(buoys, steps_between_actions, vessel_id,
                                  rudder_id, thruster_id, scenario, goal,
                                  goal_heading_e_ccw, goal_vel_lon, False)
    with open(variables_file, 'wb') as outfile:
        pickle_vars = dict()
        pickle_vars['action_space'] = action_space_name
        # env.set_up()
        # env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
        agent.exploring = True
        pickle.dump(pickle_vars, outfile)
        for episode in range(max_episodes):
            print('###STARTING EPISODE ', episode)
            env.set_up()
            # env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
            episode_dict = dict()
            episode_transitions_list = list()
            final_flag = 0
            env.new_episode()
            for step in range(maximum_training_steps):
                state = env.get_state()
                print('Yaw:', state[2])
                angle, rot = agent.select_action(state)
                state_prime, reward = env.step(angle, rot)
                # state_rime, reward = env.step(0, 0)
                print('Reward:', reward)
                transition = (state, (angle, rot), state_prime, reward)
                final_flag = env.is_final()
                agent.observe_reward(state, angle, rot, state_prime, reward,
                                     final_flag)
                print("***Training step " + str(step + 1) + " Completed")
                episode_transitions_list.append(transition)
                if final_flag != 0:
                    break
            episode_dict['episode_number'] = episode
            episode_dict['transitions_list'] = episode_transitions_list
            episode_dict['final_flag'] = final_flag
            pickle_vars['ep#' + str(episode)] = episode_dict
            pickle.dump(episode_dict, outfile)
            env.finish()
        # Now that training has finished, the agent can use its policy without updating it
    with open(learner_file, 'wb') as outfile:
        pickle.dump(agent, outfile)
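
Note: Example #4 writes several objects sequentially into the same file (pickle_vars first, then one dict per episode), so reading them back takes repeated pickle.load calls until EOFError is raised. A small reader sketch, assuming the same variables_file path as above (not part of the original example):

import pickle

records = []
with open(variables_file, 'rb') as infile:
    while True:
        try:
            # Each call returns the next object dumped into the file.
            records.append(pickle.load(infile))
        except EOFError:
            break
# records[0] is the pickle_vars dict; records[1:] are the per-episode dicts.
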
Example #5
def main():
    action_space_name = 'cte_rotation'
    action_space = actions.BaseAction(action_space_name)
    agent = qlearning.QLearning(q_file, epsilon=0.1, action_space=action_space, gamma=1.0)
    env = environment.Environment(buoys, steps_between_actions, vessel_id,
                                  rudder_id, thruster_id, scenario, goal, goal_heading_e_ccw, goal_vel_lon, True)
    # with open(variables_file, 'wb') as outfile:
    pickle_vars = dict()
    pickle_vars['action_space'] = action_space_name
    env.set_up()
    env.set_single_start_pos_mode([8000, 4600, -103.5, 3, 0, 0])
    # env.set_single_start_pos_mode([6000, 4000, -103.5, 3, 0, 0])
    agent.exploring = True
    # pickle.dump(pickle_vars, outfile)
    for episode in range(500):
        print('###STARTING EPISODE ', episode)
        transitions_list = list()
        final_flag = 0
        env.move_to_next_start()
        env.reset_to_start()
        for step in range(5000):
            state = env.get_state()
            print('Yaw:', state[2])
            angle, rot = agent.select_action(state)
            state_prime, reward = env.step(angle, rot)
            # state_rime, reward = env.step(0, 0)
            print('Reward:', reward)
            transition = (state, (angle, rot), state_prime, reward)
            final_flag = env.is_final()
            agent.observe_reward(state, angle, rot, state_prime, reward, final_flag)
            print("***Training step "+str(step+1)+" Completed")
            transitions_list.append(transition)
            if final_flag != 0:
                break
        with open('qlearning_' + 'action_' + action_space_name + '_' + str(episode),
                  'wb') as outfile:
            pickle.dump(transitions_list, outfile)
            transitions_list = list()
    with open(learner_file, 'wb') as outfile:
        pickle.dump(agent, outfile)
Example #6
def _start(self):
    q = qlearning.QLearning(self.qtable_pickle_file, self.alpha, self.gamma,
                            self.delta, self.epsilon)
    player1, player2 = self.create_computer_players(q)
    self.rounds = int(input('Enter no. of rounds: '))
    self.session = session.Session(player1, player2, self.update_new_move,
                                   self.update_new_game,
                                   self.update_invalid_move)
    self.session.run(self.rounds)
learning_rates = [0.01, 0.05, 0.1]
discount_values = [0.7, 0.8, 0.9]

epsilon = 1
START_EPSILON_DECAYING = 1000
END_EPSILON_DECAYING = EPISODES // 2

epsilon_decay_value = epsilon/(END_EPSILON_DECAYING-START_EPSILON_DECAYING)

for learning_rate in learning_rates:
    for discount_value in discount_values:

        np.random.seed(10)

        q = qlearning.QLearning(learning_rate, discount_value, ACTION_SPACE_SIZE)

        for episode in range(EPISODES):
            episode_reward = 0

            if not episode % avg_window_size:
                print(episode)

            env.reset()
            done = False
            player_is_winner = False

            while not done:
                (dice, movable_pieces, player_pieces, enemy_pieces, player_is_winner, _), player_i = env.get_observation()

                if player_i == 0 and movable_pieces.size:
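
Note: this last snippet is cut off mid-loop, before the epsilon-decay constants defined above are used. They are typically applied once per episode, as in the assumed continuation below (not part of the original snippet):

# Assumed per-episode decay step using the constants defined above.
if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    epsilon -= epsilon_decay_value
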
Example #8
def build_objects():
    """Builds the agent (Q-learning) object and environment object."""
    return qlearning.QLearning(), environment.Environment()
Example #9
        self.model = nn.Sequential(nn.Linear(in_size, 40, bias=True),
                                   nn.ReLU(),
                                   nn.Linear(40, out_size, bias=True),
                                   nn.Softmax(dim=1))

    def forward(self, inputs):
        x = self.model(inputs)
        return x


print(1)
states = []
actions = []

qbot = qlearning.QLearning(10, 0.00)

nnbot = keras.models.load_model(
    "C:/Users/Kilby/Code/Waterloo/CS680/Project/NN1.h5")

gabot = GeneticSnake(6, 4)
gabot.load_state_dict(
    torch.load(
        "C:/Users/Kilby/Code/Waterloo/CS680/Project/genetic_champs/winners1/champ29.pt"
    ))
gabot.eval()

#Training data:
#Import test data and training data

play = Play_Snake(qbot)