Example #1
def Agent(t, ratio):
    """ Train the DQN to play Cartpole with RNN trained using data generated randomly
        """
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--num_rand_acts',
                        help="Random actions before learning starts",
                        default=100,
                        type=int)
    parser.add_argument('-m',
                        '--mem_size',
                        help="Size of the experience replay memory",
                        default=10**4,
                        type=int)
    args = parser.parse_args()

    # Set up logging:
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Other things to modify
    number_training_steps = t
    print_progress_after = 10**2
    Copy_model_after = 100

    number_random_actions = args.num_rand_acts
    mem_size = args.mem_size

    logger.info(' num_rand_acts = %s, mem_size = %s', number_random_actions,
                mem_size)

    # Make the model
    model = cartpole.make_model()
    model.summary()

    # Make the memories
    mem_states = cartpole.RingBufSimple(mem_size)
    mem_actions = cartpole.RingBufSimple(mem_size)
    mem_rewards = cartpole.RingBufSimple(mem_size)
    mem_terminal = cartpole.RingBufSimple(mem_size)

    print('Setting up Cartpole and pre-filling memory with random actions...')

    # Create and reset the CartPole env:
    env = gym.make('CartPole-v1')
    env.reset()
    steps = 0

    # First make some random actions, and initially fill the memories with these:
    test_input = np.zeros((number_random_actions + 1, 4))
    test_output = np.zeros((number_random_actions + 1, 1))
    for i in range(number_random_actions + 1):
        iteration = i
        # Random action
        action = env.action_space.sample()
        next_state, reward, is_terminal, _ = env.step(action)
        steps += 1
        test_input[i] = next_state
        next_state = np.array([
            next_state
        ])[0, :]  # Process state so that it's a numpy array, shape (4,)

        if is_terminal:
            reward = -100
            env.reset()

        else:
            # Shaped reward: larger when the cart is near x = 0, the pole is near
            # upright (2.4 is the cart-position limit, 12 * 2 * pi / 360 rad the
            # 12-degree pole-angle limit), and the velocities are small
            reward = (2.4 - abs(next_state[0])) + 12 * 2 * math.pi / 360 - abs(next_state[2]) \
                     - abs(next_state[1]) - abs(next_state[3])

        if steps >= 200:
            env.reset()
            steps = 0

        test_output[i] = reward
        cartpole.add_to_memory(iteration, mem_states, mem_actions, mem_rewards,
                               mem_terminal, next_state, action, reward,
                               is_terminal)

    # Now do actions using the DQN, and train as we go...
    print('Finished the {} random actions...'.format(number_random_actions))
    current_state = next_state

    # For recording the score
    score = 0
    scores = []
    true_score = 0
    true_scores = []
    train_number = 0
    test_number = 0
    train_input_class = list()
    train_output_class = list()
    train_input_reg = list()
    train_output_reg = list()

    # Create the RNN ensembles: classification models predict whether a state is
    # terminal, regression models predict the shaped reward
    model_class = []
    model_reg = []
    for i in range(number_of_RNNmodels_class):
        model_class.append(RNN.class_model())
    model_class = RNN.check_model(model_class, number_of_RNNmodels_class,
                                  test_input)
    for i in range(number_of_RNNmodels_reg):
        model_reg.append(RNN.reg_model())
    model_reg = RNN.check_model(model_reg, number_of_RNNmodels_reg, test_input)
    MODELS = [model_class, model_reg]

    plt.ion()
    fig = plt.figure('Agent_r')
    for i in range(number_training_steps):

        iteration = number_random_actions + i

        # Periodically copy the model into a target network; fitting against this
        # frozen copy makes the learning more stable
        if i % Copy_model_after == 0:
            target_model = keras.models.clone_model(model)
            target_model.set_weights(model.get_weights())

        ret = random.random()
        if ret < ratio:
            train_number += 1
            steps, action, reward, is_terminal, epsilon, current_state, true_score, true_scores, train_input_class = cartpole.q_iteration(
                steps, env, model, target_model, iteration, current_state,
                mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
                true_score, true_scores, train_input_class)

            train_output_class.append(is_terminal)
            if not is_terminal:
                train_input_reg.append(current_state)
                train_output_reg.append(reward)
        else:
            test_number += 1
            steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores,true_score, true_scores = \
                Agent_q_iteration(steps, env, model, target_model, iteration, current_state, mem_states, mem_actions,
                                          mem_rewards, mem_terminal, mem_size, score, scores, true_score, true_scores,
                                  number_of_RNNmodels_class, number_of_RNNmodels_reg, MODELS)

        # Print progress and update the live score plot
        if (i + 1) % print_progress_after == 0:
            print('Training steps done: {}, Epsilon: {}'.format(
                i + 1, epsilon))
            print('Mean score = {}'.format(np.mean(scores)))
            print('Average scores for last 100 trials = {}'.format(
                np.mean(true_scores[::-1][0:100])))
            print('Ratio = {}'.format(train_number /
                                      (train_number + test_number)))
            # Test_acc = np.zeros((number_of_RNNmodels, 1))
            # for j in range(number_of_RNNmodels):
            #     test_acc, test_loss = rnnModel.test_RNNmodel(test_input, test_output, MODELS[j])
            #     Test_acc[j] = test_acc
            # print('RNN Test mean accuracy:', np.mean(Test_acc))

            plt.clf()
            plt.plot(true_scores)
            plt.ylabel('scores')
            plt.xlabel('Steps until {}'.format(i + 1))
            plt.pause(0.1)

        # Retrain the RNN ensembles once 100 new labelled samples have been collected
        if len(train_input_class) == 100:
            for j in range(number_of_RNNmodels_class):
                MODELS[0][j].fit(np.array(train_input_class),
                                 np.array(train_output_class),
                                 batch_size=100,
                                 epochs=20,
                                 verbose=0)
            train_input_class = list()
            train_output_class = list()
        if len(train_input_reg) == 100:
            for j in range(number_of_RNNmodels_reg):
                MODELS[1][j] = RNN.train_RNNmodel(np.array(train_input_reg),
                                                  np.array(train_output_reg),
                                                  MODELS[1][j])
            train_input_reg = list()
            train_output_reg = list()
    plt.ioff()
    # if len(training_input) > 0:
    #     for j in range(number_of_RNNmodels):
    #         MODELS[j] = RNN_reg.train_RNNmodel(np.array(training_input), np.array(training_output),MODELS[j])

    #   Save Agent_r
    file_name = os.path.join('Agents_0.06', 'Agent_r_5')
    model.save_weights(file_name)
    print('Agent_r saved')
    return scores, true_scores
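
The functions in these examples rely on module-level imports and globals that are not shown. The sketch below lists what they appear to assume; the helper modules `cartpole` and `RNN` are the project's own, and the ensemble sizes `number_of_RNNmodels_class` / `number_of_RNNmodels_reg` are given illustrative values here, not values taken from the original code.

# Module-level setup assumed by the examples (sketch only; values are illustrative)
import argparse
import logging
import math
import os
import random

import gym
import keras
import matplotlib.pyplot as plt
import numpy as np

import cartpole  # project helpers: make_model, RingBufSimple, add_to_memory, q_iteration, ...
import RNN       # project helpers: class_model, reg_model, check_model, train_RNNmodel

# Sizes of the RNN ensembles, read as globals inside the functions (assumed values)
number_of_RNNmodels_class = 5
number_of_RNNmodels_reg = 5
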
Example #2
def q_iteration(steps, env, model, target_model, iteration, current_state,
                mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
                score, scores, true_score, true_scores, MODELS, Ask_number,
                correct_pred, Ask_input_class, Ask_output_class, Ask_input_reg,
                Ask_output_reg, can_ask, t, ratio):
    """
    Do one iteration of acting then learning
    """
    epsilon = cartpole.get_epsilon_for_iteration(
        iteration)  # Choose epsilon based on the iteration
    start_state = current_state
    # Choose the action:
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = cartpole.choose_best_action(model, start_state)

    # Play one game iteration:
    next_state, _, is_terminal, _ = env.step(action)
    steps += 1
    next_state = np.array([
        next_state
    ])[0, :]  # Process state so that it's a numpy array, shape (4,)

    # Use RNN to predict reward
    IfAsk = False
    predictions = []
    for j in range(number_of_RNNmodels_class):
        prediction = MODELS[0][j].predict(np.array([next_state]))[0][0]
        predictions.append(prediction)
    if np.mean(predictions) > 0.9:
        # The classifier ensemble is confident the state is terminal
        reward_pred = -100
    elif np.mean(predictions) < 0.1:
        # The classifier ensemble is confident the state is non-terminal:
        # ask the regression ensemble for the shaped reward
        predictions = []
        for j in range(number_of_RNNmodels_reg):
            prediction = MODELS[1][j].predict(np.array([next_state]))
            prediction = prediction[0]
            predictions.append(prediction)
        if (t - iteration + 100) <= (round(t * ratio) - number - Ask_number):
            IfAsk = True

        if (np.max(predictions) - np.min(predictions)) < 0.1:
            # The regressors agree: trust their mean prediction
            reward_pred = np.mean(predictions)
        else:
            # The regressors disagree: ask for the true reward instead
            IfAsk = True
            reward_pred = None

    else:
        # The classifier ensemble is uncertain: ask for the true reward
        IfAsk = True
    # If asked, query the true reward and store the labelled sample for later RNN retraining
    if IfAsk:
        if can_ask:
            if is_terminal:
                reward_pred = -100
            else:
                reward_pred = 2.4 - abs(next_state[0]) + 12 * 2 * math.pi / 360 - abs(next_state[2]) \
                              - abs(next_state[1]) - abs(next_state[3])
                Ask_input_reg.append(next_state)
                Ask_output_reg.append(reward_pred)

            if abs(show_max(predictions) - reward_pred) < 0.1:
                correct_pred += 1

            Ask_input_class.append(next_state)
            Ask_output_class.append(is_terminal)
            Ask_number += 1
        else:
            reward_pred = np.mean(predictions)

    if is_terminal:
        reward = -100
    else:
        reward = 2.4 - abs(next_state[0]) + 12 * 2 * math.pi / 360 - abs(next_state[2]) \
                      - abs(next_state[1]) - abs(next_state[3])

    score += reward_pred
    true_score += reward

    # If the episode ended (or hit the 200-step cap), reset the env and record the score
    if is_terminal:
        env.reset()
        scores.append(score)  # Record score
        score = 0  # Reset score to zero
        true_scores.append(true_score)
        true_score = 0
    elif steps >= 200:
        scores.append(score)
        score = 0
        steps = 0
        true_scores.append(true_score)
        true_score = 0
        env.reset()

    cartpole.add_to_memory(iteration + 1, mem_states, mem_actions, mem_rewards,
                           mem_terminal, next_state, action, reward_pred,
                           is_terminal)

    # Make then fit a batch (gamma=0.99, num_in_batch=32)
    number_in_batch = 32
    cartpole.make_n_fit_batch(model, target_model, 0.99, iteration, mem_size,
                              mem_states, mem_actions, mem_rewards,
                              mem_terminal, number_in_batch)

    current_state = next_state

    return steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores, true_score, true_scores, \
           Ask_number, correct_pred, Ask_input_class, Ask_output_class, Ask_input_reg, Ask_output_reg, can_ask
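
Every example uses the same hand-crafted shaped reward: 2.4 is the cart-position limit of CartPole and 12 * 2 * pi / 360 is the 12-degree pole-angle limit in radians, so the reward grows as the cart stays centred, the pole stays upright, and the velocities stay small. A minimal sketch of that computation as a standalone helper (the name `shaped_reward` is hypothetical, not part of the original code):

import math

def shaped_reward(state):
    """Shaped reward used throughout the examples (sketch).

    state = [cart position, cart velocity, pole angle, pole angular velocity].
    2.4 is the cart-position termination limit; 12 * 2 * pi / 360 (~0.209 rad)
    is the 12-degree pole-angle termination limit.
    """
    x, x_dot, theta, theta_dot = state
    return (2.4 - abs(x)) + 12 * 2 * math.pi / 360 - abs(theta) - abs(x_dot) - abs(theta_dot)
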
Example #3
def Agent_q_iteration(steps, env, model, target_model, iteration,
                      current_state, mem_states, mem_actions, mem_rewards,
                      mem_terminal, mem_size, score, scores, true_score,
                      true_scores, number_of_RNNmodels_class,
                      number_of_RNNmodels_reg, MODELS):
    """
    Do one iteration of acting then learning
    """
    epsilon = cartpole.get_epsilon_for_iteration(
        iteration)  # Choose epsilon based on the iteration
    start_state = current_state
    # Choose the action:
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = cartpole.choose_best_action(model, start_state)

    # Play one game iteration:
    next_state, reward, is_terminal, _ = env.step(action)
    steps += 1
    next_state = np.array([
        next_state
    ])[0, :]  # Process state so that it's a numpy array, shape (4,)
    # Use the RNN ensembles to predict the reward: the classifiers vote on whether
    # the state is terminal, the regressors estimate the shaped reward otherwise
    predictions = []
    pre_class = []
    for k in range(number_of_RNNmodels_class):
        pred = MODELS[0][k].predict(np.array([next_state]))[0][0]
        pre_class.append(pred)
    class_pred = np.mean(pre_class)
    if class_pred >= 0.5:
        reward_pred = -100
    else:
        for j in range(number_of_RNNmodels_reg):
            prediction = MODELS[1][j].predict(np.array([next_state]))[0]
            predictions.append(prediction)
        reward_pred = np.mean(predictions)

    # get true reward
    if is_terminal:
        reward = -100
    else:
        reward = 2.4 - abs(next_state[0]) + 12 * 2 * math.pi / 360 - abs(next_state[2]) - abs(next_state[1])\
                 - abs(next_state[3])

    score += reward_pred
    true_score += reward

    # If the episode ended (or hit the 200-step cap), reset the env and record the score
    if is_terminal:
        env.reset()
        scores.append(score)  # Record score
        score = 0  # Reset score to zero
        true_scores.append(true_score)
        true_score = 0
    elif steps >= 200:
        env.reset()
        steps = 0
        scores.append(score)
        score = 0
        true_scores.append(true_score)
        true_score = 0

    cartpole.add_to_memory(iteration + 1, mem_states, mem_actions, mem_rewards,
                           mem_terminal, next_state, action, reward_pred,
                           is_terminal)

    # Make then fit a batch (gamma=0.99, num_in_batch=32)
    number_in_batch = 32
    cartpole.make_n_fit_batch(model, target_model, 0.99, iteration, mem_size,
                              mem_states, mem_actions, mem_rewards,
                              mem_terminal, number_in_batch)

    current_state = next_state

    return steps, action, reward_pred, is_terminal, epsilon, current_state, score, scores, true_score, true_scores
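
The reward-prediction step in `Agent_q_iteration` (the classifier ensemble votes on whether the next state is terminal; otherwise the regression ensemble estimates the shaped reward) can be read as a single helper. A sketch under that reading, where `predict_reward_from_ensembles` is a hypothetical name and `MODELS = [model_class, model_reg]` as in Example #1:

import numpy as np

def predict_reward_from_ensembles(next_state, MODELS, n_class, n_reg):
    # Mean classifier vote >= 0.5 is treated as "terminal" -> reward of -100
    class_preds = [MODELS[0][k].predict(np.array([next_state]))[0][0] for k in range(n_class)]
    if np.mean(class_preds) >= 0.5:
        return -100.0
    # Otherwise average the regression ensemble's shaped-reward estimates
    reg_preds = [MODELS[1][j].predict(np.array([next_state]))[0] for j in range(n_reg)]
    return float(np.mean(reg_preds))
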
Example #4
def Agent(t):
    """ Train the DQN to play Cartpole
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--num_rand_acts',
                        help="Random actions before learning starts",
                        default=100,
                        type=int)
    parser.add_argument('-m',
                        '--mem_size',
                        help="Size of the experience replay memory",
                        default=10**4,
                        type=int)
    args = parser.parse_args()

    # Set up logging:
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Other things to modify
    number_training_steps = t
    print_progress_after = 10**2
    Copy_model_after = 100

    number_random_actions = args.num_rand_acts
    mem_size = args.mem_size

    logger.info(' num_rand_acts = %s, mem_size = %s', number_random_actions,
                mem_size)

    # Make the model
    model = cartpole.make_model()
    model.summary()

    # Make the memories
    mem_states = cartpole.RingBufSimple(mem_size)
    mem_actions = cartpole.RingBufSimple(mem_size)
    mem_rewards = cartpole.RingBufSimple(mem_size)
    mem_terminal = cartpole.RingBufSimple(mem_size)
    train_input_class = []

    print('Setting up Cartpole and pre-filling memory with random actions...')

    # Create and reset the CartPole env:
    env = gym.make('CartPole-v1')
    env.reset()
    steps = 0

    # First make some random actions, and initially fill the memories with these:
    for i in range(number_random_actions + 1):
        iteration = i
        # Random action
        action = env.action_space.sample()
        next_state, reward, is_terminal, _ = env.step(action)
        steps += 1
        next_state = np.array([
            next_state
        ])[0, :]  # Process state so that it's a numpy array, shape (4,)

        if is_terminal:
            reward = -100
            env.reset()

        else:
            # Shaped reward (see Example #1): larger when the cart is near x = 0,
            # the pole is near upright, and the velocities are small
            reward = (2.4 - abs(next_state[0])) + 12 * 2 * math.pi / 360 - abs(
                next_state[2]) - abs(next_state[1]) - abs(next_state[3])

        if steps >= 200:
            env.reset()
            steps = 0

        cartpole.add_to_memory(iteration, mem_states, mem_actions, mem_rewards,
                               mem_terminal, next_state, action, reward,
                               is_terminal)

    # Now do actions using the DQN, and train as we go...
    print('Finished the {} random actions...'.format(number_random_actions))
    tic = 0
    current_state = next_state

    # For recording the score
    score = 0
    scores = []
    plt.ion()
    fig = plt.figure('Agent_f')
    for i in range(number_training_steps):

        iteration = number_random_actions + i

        # Periodically copy the model into a target network; fitting against this
        # frozen copy makes the learning more stable
        if i % Copy_model_after == 0:
            target_model = keras.models.clone_model(model)
            target_model.set_weights(model.get_weights())

        steps, action, reward, is_terminal, epsilon, current_state, score, scores, _ = cartpole.q_iteration(
            steps, env, model, target_model, iteration, current_state,
            mem_states, mem_actions, mem_rewards, mem_terminal, mem_size,
            score, scores, train_input_class)

        # Print progress and update the live score plot
        if (i + 1) % print_progress_after == 0:
            print('Training steps done: {}, Epsilon: {}'.format(
                i + 1, epsilon))
            print('Mean score = {}'.format(np.mean(scores)))
            print('Average scores for last 100 trials = {}'.format(
                np.mean(scores[::-1][0:100])))
            plt.clf()
            plt.plot(scores)
            plt.title('Agent_f')
            plt.ylabel('scores')
            plt.xlabel('Number of Trials (Steps until {})'.format(i + 1))
            plt.pause(0.1)

    plt.ioff()

    # Save Agent_f
    file_name = os.path.join('Agents_0.1', 'Agent_f_5')
    model.save_weights(file_name)
    print('Agent_f saved')

    return scores
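
Both `Agent` definitions (Examples #1 and #4) share a name, so in practice they live in separate scripts. A minimal entry point for the hybrid agent of Example #1 might look like the sketch below; 5000 steps and a 0.06 ask ratio are illustrative values (the ratio matches the 'Agents_0.06' save directory), and `argparse` inside `Agent` reads the command-line flags `-r`/`-m` on its own.

if __name__ == '__main__':
    # Illustrative run: 5000 training steps, true reward requested on ~6% of iterations
    scores, true_scores = Agent(5000, 0.06)
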