Example #1
        ## Set up networks ##

        tf.reset_default_graph()

        mainQN = q_learning.qnetwork(input_dim, output_dim, hidden_units,
                                     layers, learning_rate, clip_value)
        targetQN = q_learning.qnetwork(input_dim, output_dim, hidden_units,
                                       layers, learning_rate, clip_value)

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

        trainables = tf.trainable_variables()

        targetOps = q_learning.updateNetwork(trainables, tau)

        load_model = True
        ## Create replay buffer ##
        exp_buffer = q_learning.replay_buffer(buffer_size)

        ## Randomness of actions ##
        epsilon = eStart
        stepDrop = (eStart - eEnd) / estep

        ## Init environment ##

        #states = []
        #actions = []
        #reward_time = []
        #reward_average = []
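
Both examples build separate main and target Q-networks and delegate the soft target-network update to q_learning.updateNetwork(trainables, tau) and q_learning.updateTarget(targetOps, sess). That module is not shown here; the sketch below is a minimal illustration of the TF1 Polyak-averaging pattern it is assumed to implement (the function names and the "main variables first, target variables second" ordering are assumptions, not taken from the actual q_learning module).

import tensorflow as tf

def make_target_update_ops(trainables, tau):
    # Assumed counterpart of q_learning.updateNetwork: the first half of
    # tf.trainable_variables() belongs to the main network, the second half
    # to the target network (they were constructed in that order above).
    # Each op moves one target variable a fraction tau toward its main twin.
    half = len(trainables) // 2
    ops = []
    for idx, main_var in enumerate(trainables[:half]):
        target_var = trainables[idx + half]
        ops.append(target_var.assign(
            tau * main_var.value() + (1.0 - tau) * target_var.value()))
    return ops

def apply_target_update(ops, sess):
    # Assumed counterpart of q_learning.updateTarget: run every soft-update op.
    for op in ops:
        sess.run(op)
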
Example #2
import csv
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

# q_learning and road_env are project-local modules referenced below;
# vectorize_state is assumed to be a project helper that flattens the
# environment state into a 1-D feature vector of length input_dim.
import q_learning
import road_env


def compute_objective(space):
    id = int(time.time())  # wall-clock timestamp as a run id; time.clock() is deprecated and measures CPU time
    num_of_cars = 50
    num_of_lanes = 5
    track_length = 1000
    speed_limit = 50
    mode = "dyn"
    random_seed = 1
    random.seed(random_seed)

    input_dim = (num_of_cars + 1) * 3
    output_dim = 23
    clip_value = 300
    ## Ego Init ##
    ego_lane_init = 1
    ego_pos_init = 0
    ego_speed_init = 0.25 * speed_limit

    #### Reward Variables
    random_sweep = 3
    max_train_episodes = 20000
    objective_window = 100
    average_window = 100

    reward_average = np.zeros(
        (random_sweep, int(max_train_episodes / average_window)))
    reward_objective = np.zeros((random_sweep, max_train_episodes))
    reward_eval_obj = np.zeros((random_sweep, objective_window))

    #### Assign Hyperparameters
    layers = space['layers']
    hidden_units = space['hidden_units']
    learning_rate = space['learning_rate']
    buffer_size = space['buffer_size']
    pre_train_steps = 0.2 * buffer_size
    batch_size = space['batch_size']
    update_freq = space['update_frequency']
    tau = space['tau']
    # RL params
    gamma = space['gamma']
    eStart = space['eStart']
    eEnd = space['eEnd']
    estep = space['estep']

    for r_seed in range(0, random_sweep):

        random.seed(r_seed)
        #### Start training process ####
        states = []
        actions = []
        reward_time = []

        folder_path = "./bayes_dyn/"

        path = folder_path + "model_random_" + str(id) + "/"
        os.makedirs(path, exist_ok=True)  # create the output directory before writing params, checkpoints and CSVs

        #### Store Results
        if r_seed == 0:  # write the hyperparameters only once, on the first sweep

            file = open(path + 'params' + str(id) + '.txt', 'w')
            file.write('NETWORK PARAMETERS: \n\n')
            file.write('Layers: ' + str(layers) + '\n')
            file.write('Hidden units: ' + str(hidden_units) + '\n')
            file.write('Learning rate: ' + str(learning_rate) + '\n')
            file.write('Buffer size: ' + str(buffer_size) + '\n')
            file.write('Pre_train_steps: ' + str(pre_train_steps) + '\n')
            file.write('Batch_size: ' + str(batch_size) + '\n')
            file.write('Update frequency: ' + str(update_freq) + '\n')
            file.write('Tau: ' + str(tau) + '\n\n')

            file.write('RL PARAMETERS: \n\n')
            file.write('Gamma: ' + str(gamma) + '\n')
            file.write('Epsilon start: ' + str(eStart) + '\n')
            file.write('Epsilon end: ' + str(eEnd) + '\n')
            file.write('Epsilon steps: ' + str(estep) + '\n')

            file.write('SCENARIO PARAMETERS: \n\n')
            file.write('Cars: ' + str(num_of_cars) + '\n')
            file.write('Lanes: ' + str(num_of_lanes) + '\n')
            file.write('Ego speed init: ' + str(ego_speed_init) + '\n')
            file.write('Ego pos init: ' + str(ego_pos_init) + '\n')
            file.write('Ego lane init: ' + str(ego_lane_init) + '\n')

            file.close()

        ## Set up networks ##

        tf.reset_default_graph()

        mainQN = q_learning.qnetwork(input_dim, output_dim, hidden_units,
                                     layers, learning_rate, clip_value)
        targetQN = q_learning.qnetwork(input_dim, output_dim, hidden_units,
                                       layers, learning_rate, clip_value)

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

        trainables = tf.trainable_variables()

        targetOps = q_learning.updateNetwork(trainables, tau)

        load_model = False
        ## Create replay buffer ##
        exp_buffer = q_learning.replay_buffer(buffer_size)

        ## Randomness of actions ##
        epsilon = eStart
        stepDrop = (eStart - eEnd) / estep

        ## Init environment ##
        total_steps = 0
        done = False

        ## Start Session ##

        with tf.Session() as sess:
            sess.run(init)

            for episode in range(max_train_episodes):
                episode_buffer = q_learning.replay_buffer(buffer_size)
                env = road_env.highway(num_of_lanes, num_of_cars, track_length,
                                       speed_limit, ego_lane_init,
                                       ego_pos_init, ego_speed_init, mode,
                                       r_seed)
                state, _, _ = env.get_state()
                state_v = vectorize_state(state)

                done = False
                reward_episode = 0

                while not done:
                    if (np.random.random() < epsilon
                            or total_steps < pre_train_steps):
                        action = random.randint(0, 22)

                    else:
                        action = sess.run(
                            mainQN.action_pred,
                            feed_dict={mainQN.input_state: [state_v]})

                    state1, reward, done, _ = env.step(action)
                    state1_v = vectorize_state(state1)

                    total_steps += 1

                    episode_buffer.add(
                        np.reshape(
                            np.array([state_v, action, reward, state1_v,
                                      done]), [1, 5]))  # [s,a,r,s1,d]

                    ## Decrease randomness ##
                    if total_steps > pre_train_steps:
                        if epsilon > eEnd:
                            epsilon -= stepDrop  # anneal epsilon by the per-step decrement computed above, not by the step count

                        trainBatch = exp_buffer.sample(batch_size)
                        ## Calculate Q-value: Q = r(s,a) + gamma*Q(s1,a_max)
                        # Use main network to predict the action a_max
                        action_max = sess.run(mainQN.action_pred,
                                              feed_dict={
                                                  mainQN.input_state:
                                                  np.vstack(trainBatch[:, 3])
                                              })  # a_max
                        Qt1_vec = sess.run(targetQN.output_q_predict,
                                           feed_dict={
                                               targetQN.input_state:
                                               np.vstack(trainBatch[:, 3])
                                           })  # All Q-values for s1

                        # 0 for terminal transitions, 1 otherwise, so the
                        # bootstrap term vanishes on the last step: Q = r(s,a)
                        end_multiplier = -(trainBatch[:, 4] - 1)
                        Qt1 = Qt1_vec[range(batch_size),
                                      action_max]  # select Q(s1,a_max)

                        # Q = r(s,a) + gamma*Q(s1,a_max)
                        Q_gt = trainBatch[:, 2] + gamma * Qt1 * end_multiplier

                        ## Optimize network parameters
                        _ = sess.run(mainQN.update,
                                     feed_dict={
                                         mainQN.input_state:
                                         np.vstack(trainBatch[:, 0]),
                                         mainQN.q_gt:
                                         Q_gt,
                                         mainQN.actions:
                                         trainBatch[:, 1]
                                     })

                        ## Update target network ##
                        if total_steps % update_freq == 0:
                            print("Update target network")
                            q_learning.updateTarget(targetOps, sess)

                    reward_episode += reward
                    states.append(state)
                    actions.append(action)
                    state_v = state1_v

                exp_buffer.add(episode_buffer.buffer)
                reward_time.append(reward_episode)
                reward_objective[r_seed, episode] = reward_episode

                if episode % 5000 == 0:
                    save_path = saver.save(
                        sess, path + "modelRL_" + str(r_seed) + "_" +
                        str(episode) + ".ckpt")
                    print("Model saved in: %s", save_path)

                if episode % average_window == 0:
                    print("Total steps: ", total_steps,
                          "Average reward over 100 Episodes: ",
                          np.mean(reward_time[-average_window:]), "Episode:",
                          episode)
                    reward_average[r_seed, int(episode / average_window)] = (
                        np.mean(reward_time[-average_window:]))
            final_save_path = saver.save(
                sess, path + "random_" + str(r_seed) + "_" + "Final.ckpt")
            print("Model saved in: %s", final_save_path)

            with open(path + 'reward_time' + str(id) + str(r_seed) + '.csv',
                      'w') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerow(reward_time)

            reward_eval = []

            for t in range(objective_window):
                env = road_env.highway(num_of_lanes, num_of_cars, track_length,
                                       speed_limit, ego_lane_init,
                                       ego_pos_init, ego_speed_init, mode,
                                       r_seed)
                state, _, _ = env.get_state()
                state_v = vectorize_state(state)

                done = False
                reward_episode = 0

                while not done:
                    action = sess.run(
                        mainQN.action_pred,
                        feed_dict={mainQN.input_state: [state_v]})
                    state1, reward, done, _ = env.step(action)
                    state_v = vectorize_state(state1)  # advance the state; otherwise the greedy policy keeps seeing the initial state
                    reward_episode += reward

                reward_eval.append(reward_episode)

        reward_eval_obj[r_seed] = reward_eval

    plt.figure(4)

    ax = plt.subplot(1, 1, 1)
    ax.set_title("Reward over time")
    ax.set_xlabel("episode/100")
    ax.set_ylabel("reward")
    ax.grid()

    sns.tsplot(reward_average)
    manager = plt.get_current_fig_manager()
    manager.resize(*manager.window.maxsize())
    # plt.show(block=False)
    plt.tight_layout()
    plt.savefig(folder_path + 'reward' + str(id) + '.png')
    # plt.show()
    plt.close()

    with open(path + 'reward_average' + str(id) + '.csv', 'w') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(reward_average)

    with open(path + 'reward_time' + str(id) + '.csv', 'w') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(reward_time)

    objective = q_learning.bayes_objective(reward_eval_obj, objective_window)

    return objective
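
compute_objective reads every hyperparameter from the space dict and returns a single scalar via q_learning.bayes_objective, which is the shape expected by Bayesian-optimisation drivers such as hyperopt (hyperopt minimises the returned value, so bayes_objective is assumed to already behave like a loss, e.g. a negative average reward). A minimal, hypothetical driver is sketched below; the dictionary keys mirror the lookups inside compute_objective, but the search ranges and max_evals are illustrative assumptions, not values from the original project.

from hyperopt import fmin, hp, tpe

search_space = {
    # Keys match the space[...] lookups in compute_objective; ranges are illustrative.
    'layers': hp.choice('layers', [2, 3, 4]),
    'hidden_units': hp.choice('hidden_units', [64, 128, 256]),
    'learning_rate': hp.loguniform('learning_rate', -10, -5),  # ~4.5e-5 .. 6.7e-3
    'buffer_size': hp.choice('buffer_size', [10000, 50000, 100000]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'update_frequency': hp.choice('update_frequency', [500, 1000, 5000]),
    'tau': hp.uniform('tau', 0.001, 0.1),
    'gamma': hp.uniform('gamma', 0.9, 0.999),
    'eStart': hp.uniform('eStart', 0.9, 1.0),
    'eEnd': hp.uniform('eEnd', 0.01, 0.1),
    'estep': hp.quniform('estep', 10000, 100000, 10000),
}

best = fmin(fn=compute_objective, space=search_space,
            algo=tpe.suggest, max_evals=20)
print(best)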