Example #1
def run_maze():
    step = 0
    render_time = 0
    max_episodes = 5000
    episode_step_holder = []
    success_holder = []
    base_path = './logs/dqn/model/'

    for i_episode in range(max_episodes):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)
            action = RL.choose_action(s)
            s_, reward, done, info = env.step(action)
            s_ = s_.ravel()
            # print('action:{0} | reward:{1} | done: {2}'.format(action, reward, done))
            RL.store_transition(s, action, reward, s_)

            if step > 200:
                RL.learn(done)

            s = s_
            step += 1
            episode_step += 1

            if episode_step > 299:
                done = True

            if done:
                print('{0} -- {1} -- {2} -- {3}'.format(
                    i_episode, info, episode_step, RL.epsilon))
                if info == 'running':
                    episode_step = 300
                    success_holder.append(0)
                elif info == 'terminal':
                    success_holder.append(1)
                else:
                    raise Exception("Invalid info code.")
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    save_path = RL.saver.save(RL.sess, base_path + 'model_dqn.ckpt')
    print("Model saved in path: {}".format(save_path))
    RL.sess.close()
    env.destroy()

    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path, index=0)
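
The plot_rate helper itself is not shown in these examples, and its signature varies across them (Example #5 passes a full file path, and the plot_rate in Example #4 belongs to the DTP-emulator and reads a packet log). Below is a minimal, hypothetical sketch matching the (success_holder, base_path, index) call used here, assuming it plots a moving-average success rate from the 0/1 entries collected above; it is not the repo's actual implementation.

# Hypothetical sketch of plot_rate -- not the repo's actual implementation.
import os
import numpy as np
import matplotlib.pyplot as plt

def plot_rate(success_holder, base_path, index=0, window=50):
    """Plot a moving-average success rate from a list of 0/1 episode outcomes."""
    outcomes = np.asarray(success_holder, dtype=np.float32)
    # Moving average over the last `window` episodes (shorter near the start).
    rate = [outcomes[max(0, i - window + 1):i + 1].mean() for i in range(len(outcomes))]

    plt.figure()
    plt.plot(rate)
    plt.xlabel('episode')
    plt.ylabel('success rate (window={})'.format(window))
    os.makedirs(base_path, exist_ok=True)
    plt.savefig(os.path.join(base_path, 'success_rate_{}.png'.format(index)))
    plt.close()
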
Example #2
def run_maze():
    step = 0
    render_time = 0
    episode_step_holder = []
    success_holder = []
    base_path = './logs/dqn/'

    for i_episode in range(400):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)
            action = RL.choose_action(s)
            s_, reward, done, info = env.step(action)
            s_ = s_.ravel()
            # print('action:{0} | reward:{1} | done: {2}'.format(action, reward, done))
            RL.store_transition(s, action, reward, s_)

            if step > 200:
                RL.learn()

            s = s_
            step += 1
            episode_step += 1

            if episode_step > 500:
                done = True

            if done:
                print('{0} -- {1} -- {2}'.format(i_episode, info,
                                                 episode_step))
                if info != 'success':
                    episode_step = 500
                    reward = -1
                    success_holder.append(0)
                else:
                    success_holder.append(1)
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    save_path = RL.saver.save(RL.sess, base_path + 'model_dqn.ckpt')
    print("Model saved in path: {}".format(save_path))
    RL.sess.close()
    env.destroy()
    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path, index=15)
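
Both DQN examples assume an agent object RL (a DeepQNetwork-style class created elsewhere); the `if step > 200:` guard simply waits until the replay buffer holds some transitions before learning starts. A rough, hypothetical sketch of the interface these loops rely on, with epsilon-greedy action selection -- the class name, placeholder Q-values, and hyperparameters are assumptions, not the repo's code:

# Hypothetical sketch of the agent interface assumed by RL above.
import numpy as np

class DQNAgentSketch:
    def __init__(self, n_actions, n_features, epsilon=0.9):
        self.n_actions = n_actions
        self.n_features = n_features
        self.epsilon = epsilon   # probability of acting greedily
        self.memory = []         # replay buffer of (s, a, r, s_) tuples

    def q_values(self, s):
        # Placeholder for the Q-network forward pass (network omitted).
        return np.zeros(self.n_actions)

    def choose_action(self, s):
        # Epsilon-greedy: exploit the Q-network with probability epsilon, else explore.
        if np.random.uniform() < self.epsilon:
            return int(np.argmax(self.q_values(s)))
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        self.memory.append((s, a, r, s_))

    def learn(self):
        # Sample a minibatch from self.memory and take one TD update (omitted).
        pass
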
Example #3
def run_maze():
    # expert_counter = 0  # Record how many times we query the expert for an action.
    n_features = env.height * env.width
    n_actions = 4
    restore_path = None
    dagger_itr = 5  # how many times we do dataset aggregation.
    episode_step_holder = []
    success_holder = []

    obser_list = []
    action_list = []
    obsers_all = np.zeros((1, n_features))
    actions_all = np.zeros((1, 1))

    # # Collecting data.
    for j in range(RUN_STEPS):
        s = env.reset()
        while True:
            env.render(RENDER_TIME)
            a = get_expert_action(s)
            obser_list.append(s.ravel())
            action_list.append(a)
            s_, r, done, info = env.step(a)
            if done:
                env.render(RENDER_TIME)
                break
            s = s_

    assert len(obser_list) == len(action_list)
    for obser, act in zip(obser_list, action_list):
        obsers_all = np.concatenate([obsers_all, obser[np.newaxis, :]], axis=0)
        actions_all = np.concatenate([actions_all, np.array([act])[np.newaxis, :]], axis=0)
    obsers_all = obsers_all[1:, :]
    actions_all = actions_all[1:, :].astype(int)
    actions_all = one_hot_encoding_numpy(actions_all.ravel().tolist(), 4)

    # # Training an initial policy on the expert data (behavior cloning).
    net_s = tf.placeholder(tf.float32, [None, n_features], name='net_s')  # observations
    net_a = tf.placeholder(tf.int32, [None, n_actions], name='net_a')  # expert actions

    net_l1 = tf.contrib.layers.fully_connected(net_s, 32, activation_fn=tf.nn.relu)
    # net_l2 = tf.contrib.layers.fully_connected(net_l1, 128, activation_fn=tf.nn.relu)
    net_logits = tf.contrib.layers.fully_connected(net_l1, n_actions, activation_fn=None)
    net_out = tf.nn.softmax(net_logits, name='net_out')  # action probabilities

    # softmax_cross_entropy expects raw logits, so pass net_logits rather than the softmax output.
    net_loss = tf.losses.softmax_cross_entropy(onehot_labels=net_a, logits=net_logits)
    train_op = tf.train.AdamOptimizer(0.001).minimize(net_loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        if restore_path is not None:
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))
        else:
            print("No pretrained model found.")

        for step in range(TRAIN_STEPS):
            b_s, b_a = get_batch(obsers_all, actions_all, batch_size=32)
            _, loss_ = sess.run([train_op, net_loss], {net_s: b_s, net_a: b_a})

            if step % 5 == 0:
                output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
                item_a = np.argmax(output, axis=1)        # predicted actions
                item_b = np.argmax(actions_all, axis=1)   # expert actions
                n_accuracy = np.where(np.equal(item_a, item_b))[0]
                accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
                print('Step:', step, '| train loss: %.4f' % loss_, '| test accuracy: %.2f' % accuracy_)

        output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
        n_accuracy = np.where(np.equal(np.argmax(output, axis=1), np.argmax(actions_all, axis=1)))[0]
        accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
        print('Step:', step, '| train loss: %.4f' % loss_, '| test accuracy: %.2f' % accuracy_)

        save_path = saver.save(sess, BASE_LOGS + 'model_init.ckpt')
        print("Model saved in path: {}".format(save_path))

    # # Dataset Aggregation and Retraining the policy.
    for i in range(dagger_itr):
        restore_path = save_path
        # restore_path = BASE_LOGS + 'model_init.ckpt'

        obser_list = []
        action_list = []
        obsers_new = np.zeros((1, n_features))
        actions_new = np.zeros((1, 1))

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            if restore_path is not None:
                saver.restore(sess, restore_path)
                print("Model restored from path: {}".format(restore_path))
            else:
                raise Exception("No pretrained model found.")

            for j in range(RUN_STEPS):
                episode_step = 0
                s = env.reset()
                while True:
                    env.render(RENDER_TIME)
                    a = sess.run(net_out, {net_s: s.ravel()[np.newaxis, :]})
                    a = np.argmax(a, axis=1)[0]
                    obser_list.append(s.ravel())
                    action_list.append(get_expert_action(s))  # record the action the expert would take.
                    s_, r, done, info = env.step(a)  # step the env with the action the model chose.

                    s = s_
                    episode_step += 1

                    if episode_step > 299:
                        done = True

                    if done:
                        if info == 'running':
                            episode_step = 300
                            success_holder.append(0)
                        elif info == 'terminal':
                            success_holder.append(1)
                        else:
                            raise Exception("Invalid info code.")
                        env.render(RENDER_TIME)
                        episode_step_holder.append(episode_step)
                        break

            assert len(obser_list) == len(action_list)
            for obser, act in zip(obser_list, action_list):
                obsers_new = np.concatenate([obsers_new, obser[np.newaxis, :]], axis=0)
                actions_new = np.concatenate([actions_new, np.array([act])[np.newaxis, :]], axis=0)
            obsers_new = obsers_new[1:, :]
            actions_new = actions_new[1:, :].astype(int)
            actions_new = one_hot_encoding_numpy(actions_new.ravel().tolist(), 4)
            # Dataset Aggregation
            obsers_all = np.concatenate([obsers_all, obsers_new], axis=0)
            actions_all = np.concatenate([actions_all, actions_new], axis=0)
            # Retraining the policy
            saver = tf.train.Saver()
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))

            for step in range(TRAIN_STEPS):
                b_s, b_a = get_batch(obsers_all, actions_all, batch_size=32)
                _, loss_ = sess.run([train_op, net_loss], {net_s: b_s, net_a: b_a})

                if step % 5 == 0:
                    output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
                    n_accuracy = np.where(np.equal(np.argmax(output, axis=1), np.argmax(actions_all, axis=1)))[0]
                    accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
                    print('Step:', step, '| train loss: %.4f' % loss_, '| test accuracy: %.2f' % accuracy_)

            output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
            n_accuracy = np.where(np.equal(np.argmax(output, axis=1), np.argmax(actions_all, axis=1)))[0]
            accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
            print('Step:', step, '| train loss: %.4f' % loss_, '| test accuracy: %.2f' % accuracy_)
            print("Size of the dataset {0}".format(obsers_all.shape[0]))

            save_path = saver.save(sess, BASE_LOGS + 'model_{}.ckpt'.format(i))
            print("Model saved in path: {}".format(save_path))

    plot_rate(success_holder, './logs/dagger/model/', index=0)

    # # Testing the final policy.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        if restore_path is not None:
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))
        else:
            raise Exception("No pretrained model found.")

        for epi in range(100):
            s = env.reset()
            while True:
                env.render(0)
                a = sess.run(net_out, {net_s: s.ravel()[np.newaxis, :]})
                a = np.argmax(a, axis=1)[0]
                s_, r, done, info = env.step(a)  # step the env with the action the model chose.
                if done:
                    env.render(0)
                    break
                s = s_

    # # destroy the env.
    env.destroy()
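
Example #3 relies on three helpers that are not shown: get_expert_action (environment-specific, so not sketched here), one_hot_encoding_numpy, and get_batch. A plausible sketch of the latter two, assuming the signatures used above (a list of integer class ids plus a depth, and a uniformly sampled minibatch):

# Hypothetical sketches of helpers assumed by Example #3.
import numpy as np

def one_hot_encoding_numpy(labels, depth):
    """Turn a list of integer class ids into an (N, depth) one-hot matrix."""
    labels = np.asarray(labels, dtype=int)
    one_hot = np.zeros((labels.shape[0], depth), dtype=np.float32)
    one_hot[np.arange(labels.shape[0]), labels] = 1.0
    return one_hot

def get_batch(observations, actions, batch_size=32):
    """Uniformly sample a minibatch of matching observation/action rows."""
    idx = np.random.randint(0, observations.shape[0], size=batch_size)
    return observations[idx], actions[idx]
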
Example #4
        """
        return super().append_input(data)


# The file path of packets' log
log_packet_file = "output/packet_log/packet-0.log"

# Use the object you created above
my_solution = MySolution()

# Create the emulator using your solution
# Specify USE_CWND to decide whether or not to use the congestion window. USE_CWND=True by default.
# Specify ENABLE_LOG to decide whether or not to output the packet log. ENABLE_LOG=True by default.
# You can get more information about parameters at https://github.com/Azson/DTP-emulator/tree/pcc-emulator#constant
emulator = PccEmulator(solution=my_solution, USE_CWND=False, ENABLE_LOG=True)

# Run the emulator; you can specify how long the emulator should run.
# By default it runs until there are no packets left to send.
emulator.run_for_dur()

# print the debug information of links and senders
emulator.print_debug()

# Output the pcc_emulator-analysis.png figure.
# You can get more information from https://github.com/Azson/DTP-emulator/tree/pcc-emulator#pcc_emulator-analysispng.
analyze_pcc_emulator(log_packet_file, file_range="all")

# Output the cwnd_changing.png figure.
# You can get more information from https://github.com/Azson/DTP-emulator/tree/pcc-emulator#cwnd_changingpng
plot_rate(log_packet_file, file_range="all")
Example #5
def run_maze():
    step = 0
    render_time = 0
    episode_step_holder = []
    success_holder = []
    base_path = './logs/a2c/'

    sess = tf.Session()

    actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
    critic = Critic(sess, n_features=N_F, lr=LR_C)

    sess.run(tf.global_variables_initializer())
    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    if RESTORE:
        saver.restore(sess, base_path + 'model_a2c.ckpt')
        print("Model restored from path: {}".format(base_path + 'model_a2c.ckpt'))

    if OUTPUT_GRAPH:
        tf.summary.FileWriter('./logs/', sess.graph)

    for i_episode in range(MAX_EPISODE):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)

            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            s_ = s_.ravel()

            # TD error: r + gamma * V(s_) - V(s)
            td_error = critic.learn(s, r, s_)
            # Policy gradient: grad[log(pi(a|s)) * td_error]
            actor.learn(s, a, td_error)

            s = s_
            step += 1
            episode_step += 1

            if episode_step > 100:
                done = True

            if done:
                print('{0} -- {1} -- {2}'.format(i_episode, info,
                                                 episode_step))
                if info != 'success':
                    episode_step = 500
                    reward = -1
                    success_holder.append(0)
                else:
                    success_holder.append(1)
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    if RESTORE:
        save_path = saver.save(sess, base_path + 'model_a2c.ckpt')
        print("Model saved in path: {}".format(save_path))
    sess.close()
    env.destroy()
    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path + 'success_rate.png')
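
The Actor and Critic classes used here are defined elsewhere in the repo and are not shown. Below is a hedged sketch of the critic update that the inline comments describe: it regresses V(s) toward r + gamma * V(s_) and returns the TD error, which the actor then uses to weight log pi(a|s). The class name, layer size, and learning rate are assumptions, not the repo's code.

# Hypothetical sketch of the Critic the example assumes; not the repo's class.
import numpy as np
import tensorflow as tf  # TF1-style API, as in the examples above

class CriticSketch:
    def __init__(self, sess, n_features, lr=0.01, gamma=0.9):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], 's')
        self.v_next = tf.placeholder(tf.float32, [1, 1], 'v_next')
        self.r = tf.placeholder(tf.float32, None, 'r')

        l1 = tf.layers.dense(self.s, 32, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)                    # state value V(s)
        self.td_error = self.r + gamma * self.v_next - self.v
        loss = tf.square(self.td_error)                    # drive V(s) toward r + gamma * V(s_)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_next = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_next: v_next, self.r: r})
        return td_error

An Actor built on the same pattern would take the returned td_error as a weight on the log-probability of the chosen action when running its own train op.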