def run_maze():
    step = 0
    render_time = 0
    max_episodes = 5000
    episode_step_holder = []
    success_holder = []
    base_path = './logs/dqn/model/'

    for i_episode in range(max_episodes):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)
            action = RL.choose_action(s)
            s_, reward, done, info = env.step(action)
            s_ = s_.ravel()
            # print('action:{0} | reward:{1} | done: {2}'.format(action, reward, done))
            RL.store_transition(s, action, reward, s_)

            # Start learning once enough transitions have been collected.
            if step > 200:
                RL.learn(done)

            s = s_
            step += 1
            episode_step += 1

            # Truncate episodes that run too long.
            if episode_step > 299:
                done = True

            if done:
                print('{0} -- {1} -- {2} -- {3}'.format(
                    i_episode, info, episode_step, RL.epsilon))
                if info == 'running':
                    episode_step = 300
                    success_holder.append(0)
                elif info == 'terminal':
                    success_holder.append(1)
                else:
                    raise Exception("Invalid info code.")
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    save_path = RL.saver.save(RL.sess, base_path + 'model_dqn.ckpt')
    print("Model saved in path: {}".format(save_path))
    RL.sess.close()
    env.destroy()

    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path, index=0)
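# A minimal sketch of how run_maze() might be driven. The Maze environment
# and DeepQNetwork agent classes, their module names, and the constructor
# arguments below are assumptions, not taken from this file; only the members
# used above (choose_action, store_transition, learn, epsilon, saver, sess)
# are implied by the loop itself.
if __name__ == '__main__':
    from maze_env import Maze           # hypothetical environment module
    from dqn import DeepQNetwork        # hypothetical agent module

    env = Maze()
    RL = DeepQNetwork(
        n_actions=4,                        # assumed: up/down/left/right
        n_features=env.height * env.width,  # flattened grid observation
        learning_rate=0.01,                 # assumed hyperparameters
        reward_decay=0.9,
        e_greedy=0.9,
    )
    run_maze()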
def run_maze():
    step = 0
    render_time = 0
    episode_step_holder = []
    success_holder = []
    base_path = './logs/dqn/'

    for i_episode in range(400):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)
            action = RL.choose_action(s)
            s_, reward, done, info = env.step(action)
            s_ = s_.ravel()
            # print('action:{0} | reward:{1} | done: {2}'.format(action, reward, done))
            RL.store_transition(s, action, reward, s_)

            if step > 200:
                RL.learn()

            s = s_
            step += 1
            episode_step += 1

            if episode_step > 500:
                done = True

            if done:
                print('{0} -- {1} -- {2}'.format(i_episode, info, episode_step))
                if info != 'success':
                    episode_step = 500
                    reward = -1
                    success_holder.append(0)
                else:
                    success_holder.append(1)
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    save_path = RL.saver.save(RL.sess, base_path + 'model_dqn.ckpt')
    print("Model saved in path: {}".format(save_path))
    RL.sess.close()
    env.destroy()

    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path, index=15)
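# The plot_rate helper used above is not defined in this file. The sketch
# below is a hypothetical implementation that matches the call sites here
# (plot_rate(success_holder, base_path, index=N)): it plots a moving-average
# success rate over episodes and saves a PNG under base_path. Treat the body
# as an illustration only.
import os
import numpy as np
import matplotlib.pyplot as plt

def plot_rate(success_holder, base_path, index=0, window=50):
    """Plot the moving-average success rate and save it as a PNG."""
    rates = [np.mean(success_holder[max(0, i - window + 1):i + 1])
             for i in range(len(success_holder))]
    plt.figure()
    plt.plot(rates)
    plt.xlabel('episode')
    plt.ylabel('success rate (last {} episodes)'.format(window))
    os.makedirs(base_path, exist_ok=True)
    plt.savefig(os.path.join(base_path, 'success_rate_{}.png'.format(index)))
    plt.close()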
def run_maze():
    # expert_counter = 0  # Record how many times we query the expert for an action.
    n_features = env.height * env.width
    n_actions = 4
    restore_path = None
    dagger_itr = 5  # how many times we do dataset aggregation.
    episode_step_holder = []
    success_holder = []

    obser_list = []
    action_list = []
    obsers_all = np.zeros((1, n_features))
    actions_all = np.zeros((1, 1))

    # # Collecting data with the expert policy.
    for j in range(RUN_STEPS):
        s = env.reset()
        while True:
            env.render(RENDER_TIME)
            a = get_expert_action(s)
            obser_list.append(s.ravel())
            action_list.append(a)
            s_, r, done, info = env.step(a)
            if done:
                env.render(RENDER_TIME)
                break
            s = s_

    assert len(obser_list) == len(action_list)
    for obser, act in zip(obser_list, action_list):
        obsers_all = np.concatenate([obsers_all, obser[np.newaxis, :]], axis=0)
        actions_all = np.concatenate([actions_all, np.array([act])[np.newaxis, :]], axis=0)
    obsers_all = obsers_all[1:, :]
    actions_all = actions_all[1:, :].astype(int)
    actions_all = one_hot_encoding_numpy(actions_all.ravel().tolist(), 4)

    # # Training an initial policy.
    net_s = tf.placeholder(tf.float32, [None, n_features], name='net_s')  # observations
    net_a = tf.placeholder(tf.int32, [None, n_actions], name='net_a')     # expert actions (one-hot)
    net_l1 = tf.contrib.layers.fully_connected(net_s, 32, activation_fn=tf.nn.relu)
    # net_l2 = tf.contrib.layers.fully_connected(net_l1, 128, activation_fn=tf.nn.relu)
    net_out = tf.contrib.layers.fully_connected(net_l1, n_actions, activation_fn=tf.nn.softmax)
    # Note: net_out already applies a softmax, while softmax_cross_entropy expects raw logits,
    # so the softmax is effectively applied twice here.
    net_loss = tf.losses.softmax_cross_entropy(onehot_labels=net_a, logits=net_out)
    train_op = tf.train.AdamOptimizer(0.001).minimize(net_loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        if restore_path is not None:
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))
        else:
            print("No pretrained model found.")

        for step in range(TRAIN_STEPS):
            b_s, b_a = get_batch(obsers_all, actions_all, batch_size=32)
            _, loss_ = sess.run([train_op, net_loss], {net_s: b_s, net_a: b_a})
            if step % 5 == 0:
                output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
                n_accuracy = np.where(np.equal(np.argmax(output, axis=1),
                                               np.argmax(actions_all, axis=1)))[0]
                accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
                print('Step:', step, '| train loss: %.4f' % loss_, '| train accuracy: %.2f' % accuracy_)

        output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
        n_accuracy = np.where(np.equal(np.argmax(output, axis=1),
                                       np.argmax(actions_all, axis=1)))[0]
        accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
        print('Step:', step, '| train loss: %.4f' % loss_, '| train accuracy: %.2f' % accuracy_)

        save_path = saver.save(sess, BASE_LOGS + 'model_init.ckpt')
        print("Model saved in path: {}".format(save_path))

    # # Dataset aggregation and retraining the policy.
    for i in range(dagger_itr):
        restore_path = save_path
        # restore_path = BASE_LOGS + 'model_init.ckpt'
        obser_list = []
        action_list = []
        obsers_new = np.zeros((1, n_features))
        actions_new = np.zeros((1, 1))

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            if restore_path is not None:
                saver.restore(sess, restore_path)
                print("Model restored from path: {}".format(restore_path))
            else:
                raise Exception("No pretrained model found.")

            # Run the current policy and label every visited state with the expert action.
            for j in range(RUN_STEPS):
                episode_step = 0
                s = env.reset()
                while True:
                    env.render(RENDER_TIME)
                    a = sess.run(net_out, {net_s: s.ravel()[np.newaxis, :]})
                    a = np.argmax(a, axis=1)[0]
                    obser_list.append(s.ravel())
                    action_list.append(get_expert_action(s))  # store the action the expert teaches,
                    s_, r, done, info = env.step(a)           # but act according to the model's output.
                    s = s_
                    episode_step += 1
                    if episode_step > 299:
                        done = True
                    if done:
                        if info == 'running':
                            episode_step = 300
                            success_holder.append(0)
                        elif info == 'terminal':
                            success_holder.append(1)
                        else:
                            raise Exception("Invalid info code.")
                        env.render(RENDER_TIME)
                        episode_step_holder.append(episode_step)
                        break

            assert len(obser_list) == len(action_list)
            for obser, act in zip(obser_list, action_list):
                obsers_new = np.concatenate([obsers_new, obser[np.newaxis, :]], axis=0)
                actions_new = np.concatenate([actions_new, np.array([act])[np.newaxis, :]], axis=0)
            obsers_new = obsers_new[1:, :]
            actions_new = actions_new[1:, :].astype(int)
            actions_new = one_hot_encoding_numpy(actions_new.ravel().tolist(), 4)

            # Dataset aggregation.
            obsers_all = np.concatenate([obsers_all, obsers_new], axis=0)
            actions_all = np.concatenate([actions_all, actions_new], axis=0)

            # Retraining the policy.
            saver = tf.train.Saver()
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))
            for step in range(TRAIN_STEPS):
                b_s, b_a = get_batch(obsers_all, actions_all, batch_size=32)
                _, loss_ = sess.run([train_op, net_loss], {net_s: b_s, net_a: b_a})
                if step % 5 == 0:
                    output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
                    n_accuracy = np.where(np.equal(np.argmax(output, axis=1),
                                                   np.argmax(actions_all, axis=1)))[0]
                    accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
                    print('Step:', step, '| train loss: %.4f' % loss_, '| train accuracy: %.2f' % accuracy_)

            output = sess.run(net_out, {net_s: obsers_all, net_a: actions_all})
            n_accuracy = np.where(np.equal(np.argmax(output, axis=1),
                                           np.argmax(actions_all, axis=1)))[0]
            accuracy_ = n_accuracy.shape[0] / actions_all.shape[0]
            print('Step:', step, '| train loss: %.4f' % loss_, '| train accuracy: %.2f' % accuracy_)
            print("Size of the dataset: {0}".format(obsers_all.shape[0]))

            save_path = saver.save(sess, BASE_LOGS + 'model_{}.ckpt'.format(i))
            print("Model saved in path: {}".format(save_path))

    plot_rate(success_holder, './logs/dagger/model/', index=0)

    # # Testing the final policy.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        if restore_path is not None:
            saver.restore(sess, restore_path)
            print("Model restored from path: {}".format(restore_path))
        else:
            raise Exception("No pretrained model found.")

        for epi in range(100):
            s = env.reset()
            while True:
                env.render(0)
                a = sess.run(net_out, {net_s: s.ravel()[np.newaxis, :]})
                a = np.argmax(a, axis=1)[0]
                s_, r, done, info = env.step(a)  # act according to the model's output.
                if done:
                    env.render(0)
                    break
                s = s_

    # # Destroy the env.
    env.destroy()
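# The DAgger script above relies on several names that are not defined in this
# file: the constants RUN_STEPS, TRAIN_STEPS, RENDER_TIME and BASE_LOGS, plus
# the helpers one_hot_encoding_numpy, get_batch and get_expert_action. The
# sketch below shows plausible definitions that match the call sites; the
# values and bodies are assumptions. get_expert_action (an oracle policy for
# the maze, e.g. a shortest-path solver) is environment-specific and is not
# sketched here.
import numpy as np

RUN_STEPS = 10        # assumed: episodes of data collected per iteration
TRAIN_STEPS = 500     # assumed: gradient steps per (re)training phase
RENDER_TIME = 0.05    # assumed: delay passed to env.render()
BASE_LOGS = './logs/dagger/model/'

def one_hot_encoding_numpy(labels, n_classes):
    """Convert a list of integer action ids into an (N, n_classes) one-hot matrix."""
    labels = np.asarray(labels, dtype=int)
    one_hot = np.zeros((labels.shape[0], n_classes), dtype=np.float32)
    one_hot[np.arange(labels.shape[0]), labels] = 1.0
    return one_hot

def get_batch(observations, actions, batch_size=32):
    """Sample a random mini-batch of (observation, action) pairs."""
    idx = np.random.choice(observations.shape[0], size=batch_size, replace=True)
    return observations[idx], actions[idx]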
""" return super().append_input(data) # The file path of packets' log log_packet_file = "output/packet_log/packet-0.log" # Use the object you created above my_solution = MySolution() # Create the emulator using your solution # Specify USE_CWND to decide whether or not use crowded windows. USE_CWND=True by default. # Specify ENABLE_LOG to decide whether or not output the log of packets. ENABLE_LOG=True by default. # You can get more information about parameters at https://github.com/Azson/DTP-emulator/tree/pcc-emulator#constant emulator = PccEmulator(solution=my_solution, USE_CWND=False, ENABLE_LOG=True) # Run the emulator and you can specify the time for the emualtor's running. # It will run until there is no packet can sent by default. emulator.run_for_dur() # print the debug information of links and senders emulator.print_debug() # Output the picture of pcc_emulator-analysis.png # You can get more information from https://github.com/Azson/DTP-emulator/tree/pcc-emulator#pcc_emulator-analysispng. analyze_pcc_emulator(log_packet_file, file_range="all") # Output the picture of cwnd_changing.png # You can get more information from https://github.com/Azson/DTP-emulator/tree/pcc-emulator#cwnd_changingpng plot_rate(log_packet_file, file_range="all")
def run_maze():
    step = 0
    render_time = 0
    episode_step_holder = []
    success_holder = []
    base_path = './logs/a2c/'

    sess = tf.Session()
    actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
    critic = Critic(sess, n_features=N_F, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()
    if RESTORE:
        saver.restore(sess, base_path + 'model_a2c.ckpt')
        print("Model restored from path: {}".format(base_path + 'model_a2c.ckpt'))
    if OUTPUT_GRAPH:
        tf.summary.FileWriter('./logs/', sess.graph)

    for i_episode in range(MAX_EPISODE):
        episode_step = 0
        s = env.reset().ravel()

        while True:
            env.render(render_time)
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            s_ = s_.ravel()

            td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(s, a, td_error)        # true_gradient = grad[log(Pi(s,a)) * td_error]

            s = s_
            step += 1
            episode_step += 1

            if episode_step > 100:
                done = True

            if done:
                print('{0} -- {1} -- {2}'.format(i_episode, info, episode_step))
                if info != 'success':
                    episode_step = 500
                    success_holder.append(0)
                else:
                    success_holder.append(1)
                env.render(render_time)
                episode_step_holder.append(episode_step)
                break

    # end of game
    print('game over')
    if RESTORE:
        save_path = saver.save(sess, base_path + 'model_a2c.ckpt')
        print("Model saved in path: {}".format(save_path))
    sess.close()
    env.destroy()

    # plot_cost(episode_step_holder, base_path + 'episode_steps.png')
    plot_rate(success_holder, base_path + 'success_rate.png')
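# A minimal sketch of the module-level globals this actor-critic loop expects,
# normally defined near the top of the script. The Actor and Critic classes are
# defined elsewhere; the constant values below are assumptions chosen to match
# how they are used above, not taken from this file.
OUTPUT_GRAPH = False                # write the graph for TensorBoard
RESTORE = False                     # restore (and later save) a checkpoint
MAX_EPISODE = 3000                  # assumed episode budget
LR_A = 0.001                        # assumed actor learning rate
LR_C = 0.01                         # assumed critic learning rate
N_F = env.height * env.width        # flattened grid observation size
N_A = 4                             # up/down/left/right

if __name__ == '__main__':
    run_maze()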