def Initial(self):
    with tf.Session().as_default() as sess:
        saver = tf.train.import_meta_graph('log/nn_model_ep_60.ckpt.meta')
        saver.restore(sess, tf.train.latest_checkpoint("log/"))
        print("Model restored.")
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
        print("init success\n")
        # note: re-initializing here overwrites the parameters restored above
        sess.run(tf.global_variables_initializer())
        # saver = tf.train.Saver()  # save neural net parameters
        print("saver created")
        # restore neural net parameters
        # if NN_MODEL is not None:  # NN_MODEL is the path to file
        #     saver.restore(sess, NN_MODEL)
        #     print("Testing model restored.")
        # print("Nnmodel restored")
        self.actor = actor
        self.critic = critic
        self.sess = sess
        self.TP_buf = [0.25] * 125
def __init__(self, sess):
    self.sess = sess
    self.actor = a3c.ActorNetwork(self.sess, state_dim=S_INFO, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=S_INFO, learning_rate=CRITIC_LR_RATE)
    self.summary_ops, self.summary_vars = a3c.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    self.saver = tf.train.Saver()

    # restore neural network
    if NN_MODEL is not None:
        self.saver.restore(self.sess, NN_MODEL)
        print("load model success!")

    self.epoch = 0
    self.i_episode = 0
    self.total_reward = 0.0
    self.s = env.reset()
def __init__(self):
    self.sess = tf.Session()
    self.actor = a3c_hotdash.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM_prefetch, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c_hotdash.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()  # restore neural net parameters
    if NN_MODEL is not None:  # NN_MODEL is the path to file
        saver.restore(self.sess, NN_MODEL)
        print("Testing model 1 restored.")

    # reuse = True
    tf.reset_default_graph()
    self.sess_bitr = tf.Session()
    self.actor_bitr = a3c.ActorNetwork(self.sess_bitr, state_dim=[S_INFO_PENSIEVE, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic_bitr = a3c.CriticNetwork(self.sess_bitr, state_dim=[S_INFO_PENSIEVE, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess_bitr.run(tf.global_variables_initializer())
    saver_bitr = tf.train.Saver()  # restore neural net parameters
    if NN_MODEL_bitr is not None:  # NN_MODEL_bitr is the path to file
        # use the second saver for the second session/graph
        saver_bitr.restore(self.sess_bitr, NN_MODEL_bitr)
        print("Testing model 2 restored.")
def main():
    env = gym.make("CartPole-v0")
    # env.force_mag = 100.0

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        saver = tf.train.Saver()
        saver.restore(sess, NN_MODEL)

        for eps in xrange(100):
            obs = env.reset()
            reward = 0
            for _ in range(300):
                env.render()
                action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                obs, rew, done, info = env.step(a)
                reward += rew
                if done:
                    break
            print eps, reward, done
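# The `(action_cumsum > rand).argmax()` idiom used above (and throughout these
# agents) samples an action index in proportion to the predicted probabilities.
# A minimal, self-contained sketch of that trick; the helper name
# `sample_from_cumsum` is illustrative and not part of the original code:
import numpy as np

def sample_from_cumsum(action_prob, rand_range=1000):
    """Sample an index i with probability action_prob[i] using a cumulative sum."""
    action_cumsum = np.cumsum(action_prob)
    # draw a threshold in (0, 1), discretized into 1/rand_range steps, then take
    # the first index whose cumulative probability exceeds it
    return (action_cumsum > np.random.randint(1, rand_range) / float(rand_range)).argmax()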
def __init__(self, scope):
    # self.gp = config['gp']
    # self.buffer_size = config['buffer_size']
    # self.abr_osc = config['abr_osc']
    # self.abr_basic = config['abr_basic']
    self.quality = 0
    # self.last_quality = 0
    self.state = np.zeros((Zero.S_INFO, Zero.S_LEN))
    self.quality_len = Zero.A_DIM
    self.sess = tf.Session()
    # with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
    self.dual = a3c.DualNetwork(self.sess, scope)
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[Zero.S_INFO, Zero.S_LEN], action_dim=self.quality_len, learning_rate=Zero.ACTOR_LR_RATE, scope=scope, dual=self.dual)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[Zero.S_INFO, Zero.S_LEN], learning_rate=Zero.CRITIC_LR_RATE, scope=scope, dual=self.dual)
    self.sess.run(tf.global_variables_initializer())
    self.history = []
    self.s_batch = [np.zeros((Zero.S_INFO, Zero.S_LEN))]
    action_vec = np.zeros(Zero.A_DIM)
    self.a_batch = [action_vec]
    self.r_batch = []
    self.actor_gradient_batch = []
    self.critic_gradient_batch = []
def __init__(self):
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    tf.train.Saver().restore(self.sess, NN_MODEL)
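# The wrapper above only restores the model and exposes no inference method.
# A hedged sketch of how the other snippets in this file query such an object;
# the function name `select_bitrate` is hypothetical, and the state layout and
# RAND_RANGE sampling follow the Pensieve-style code elsewhere here:
def select_bitrate(agent, state):
    # state has shape (S_INFO, S_LEN); the actor expects a leading batch dimension
    action_prob = agent.actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
    action_cumsum = np.cumsum(action_prob)
    # sample a bitrate index in proportion to the predicted probabilities
    return (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()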
def agent(agent_id, net_params_queue, exp_queue):
    env = gym.make("CartPole-v0")
    env.force_mag = 100.0

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        time_stamp = 0
        for ep in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch = []
            a_batch = []
            r_batch = []

            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

                action_vec = np.zeros(A_DIM)
                action_vec[a] = 1
                a_batch.append(action_vec)

                obs, rew, done, info = env.step(a)
                r_batch.append(rew)

                if done:
                    break

            exp_queue.put([s_batch, a_batch, r_batch, done])

            actor_net_params, critic_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
            critic.set_network_params(critic_net_params)

            # log the episode return (sum of rewards), not just the last reward
            log_file.write('epoch ' + str(ep) + ' reward ' + str(np.sum(r_batch)) + ' step ' + str(len(r_batch)) + '\n')
            log_file.flush()
def run(server_class=HTTPServer, port=8333, log_file_path=LOG_FILE):

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    with tf.Session() as sess, open(log_file_path, 'wb') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        init_action = np.zeros(A_DIM)
        init_action[DEFAULT_QUALITY] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [init_action]
        r_batch = []

        train_counter = 0

        last_bit_rate = DEFAULT_QUALITY
        last_total_rebuf = 0
        # need this storage, because the observation only contains the total rebuffering
        # time; we compute the difference to get the per-chunk rebuffering
        video_chunk_count = 0

        input_dict = {'sess': sess, 'log_file': log_file,
                      'actor': actor, 'critic': critic,
                      'saver': saver, 'train_counter': train_counter,
                      'last_bit_rate': last_bit_rate,
                      'last_total_rebuf': last_total_rebuf,
                      'video_chunk_coount': video_chunk_count,
                      's_batch': s_batch, 'a_batch': a_batch, 'r_batch': r_batch}

        # interface to abr_rl server
        handler_class = make_request_handler(input_dict=input_dict)

        server_address = ('192.168.0.101', port)
        httpd = server_class(server_address, handler_class)
        print 'Listening on port ' + str(port)
        httpd.serve_forever()
def __init__(self, mpd, base_url, base_dst, options):
    self.config = Config(mpd, base_url)
    self.quality_rep_map = {}
    self.file_writer = common.FileWriter(base_dst)
    for rep in self.config.reps:
        self.quality_rep_map[rep['bandwidth']] = rep
    self.bitrates = self.quality_rep_map.keys()
    self.bitrates.sort()
    utility_offset = -math.log(self.bitrates[0])  # so utilities[0] = 0
    self.utilities = [math.log(b) + utility_offset for b in self.bitrates]
    self.buffer_size = options.buffer_size * 1000
    self.verbose = options.verbose
    self.segment_time = self.config.reps[0]['dur_s'] * 1000
    self.bandwidth_changerscript_path = options.bandwidth_changerscript_path
    self.player = videoplayer.VideoPlayer(self.segment_time, self.utilities, self.bitrates)

    self.sess = tf.Session()
    self.quality_switch = 0
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.initialize_all_variables())
    self.saver = tf.train.Saver()

    # restore neural net parameters
    self.nn_model = NN_MODEL
    if self.nn_model is not None:  # nn_model is the path to file
        self.saver.restore(self.sess, self.nn_model)
        print("Model restored.")

    self.init_action = np.zeros(A_DIM)
    self.init_action[DEFAULT_QUALITY] = 1

    self.s_batch = [np.zeros((S_INFO, S_LEN))]
    self.a_batch = [self.init_action]
    self.r_batch = []

    self.last_quality = DEFAULT_QUALITY
    self.last_bit_rate = DEFAULT_QUALITY
    # need this storage, because the observation only contains the total rebuffering
    # time; we compute the difference to get the per-chunk rebuffering
    self.last_total_rebuf = 0
    self.video_chunk_count = 0
    self.chunk_fetch_time = 0
    self.chunk_size = 0
    self.ptime = 0
def __init__(self):
    # fill your init vars
    n = 0
    self.BITRATE = [0, 1, 2, 3]
    self.TARGET_BUFFER = [0, 1, 2, 3]
    self.LATENCY_LIMIT = [1, 2, 3, 4]
    self.ACTION_SAPCE = []
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
def Initial(self):
    # Initialize your session or something
    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        initial_vars = []
        initial_vars.append(actor)
        initial_vars.append(critic)
        return initial_vars
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_central', 'w') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        # while True:  # assemble experiences from agents, compute the gradients
        for ep in range(TRAIN_EPOCH):
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        # s_batch=np.vstack(s_batch),
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)
            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len

            log_file.write('Epoch: ' + str(ep) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + '\n')
            log_file.flush()

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward
            })

            writer.add_summary(summary_str, ep)
            writer.flush()

            if ep % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "/nn_model_ep_" + str(ep) + ".ckpt")
def agent(agent_id, net_params_queue, exp_queue):

    net_env = env.Environment(random_seed=agent_id, fixed_env=False, trace_folder=TRAIN_TRACES)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        mask = net_env.video_masks[net_env.video_idx]

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action = bitrate_to_action(bit_rate, mask)
        last_action = action

        action_vec = np.zeros(np.sum(mask))
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever

            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, \
                rebuf, video_chunk_size, end_of_video, \
                video_chunk_remain, video_num_chunks, \
                next_video_chunk_size, mask = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] - VIDEO_BIT_RATE[last_action]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate
            last_action = action

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)
            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert (nxt_chnk_cnt) == np.sum(mask)
            state[6, -A_DIM:] = mask

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            action = bitrate_to_action(bit_rate, mask)

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[action]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action = bitrate_to_action(bit_rate, mask)
                last_action = action

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
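# a3c.compute_entropy(...) is called in these agents but not defined in this file.
# A minimal sketch consistent with how it is used (entropy of a single
# action-probability vector); the actual a3c module may differ:
def compute_entropy(x):
    """Shannon entropy of a 1-D probability vector."""
    H = 0.0
    for i in range(len(x)):
        if 0 < x[i] < 1:
            H -= x[i] * np.log(x[i])
    return H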
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=10000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model == "None":
            epoch = 0
            nn_model = None
        if nn_model is not None:  # nn_model is the path to file
            epoch = int(nn_model.replace("nn_model_ep_", "").split(".ckpt")[0])
            saver.restore(sess, MODEL_DIR + nn_model)
            print("Model restored.")

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                for i in xrange(len(actor_gradient)):
                    assert np.any(np.isnan(actor_gradient[i])) == False

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch, MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue, epoch_queue):

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # 1. initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        epoch_num = epoch_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)  # initialize the action vector (A_DIM actions)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever

            # interact with the environment: the action is from the last decision
            # this is to make the framework similar to the real
            # delay, sleep_time, buffer_size, rebuf, \
            #     video_chunk_size, next_video_chunk_sizes, \
            #     end_of_video, video_chunk_remain = \
            #     net_env.get_video_chunk(bit_rate)

            assert bit_rate >= 0
            assert bit_rate < A_DIM

            bitrate_send_last, lossrate_recv_last, bitrate_real_recovery, \
                bitrate_send_last_probe, lossrate_recv_last_probe, bitrate_real_recovery_probe, \
                end_of_video \
                = net_env.action_dispatch_and_report_svr(VIDEO_BIT_RATE[bit_rate])

            time_stamp += 2

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # print '1', net_env.netbw
            # print '2', bitrate_send_last_probe * (1 - lossrate_recv_last_probe)
            x_function_top = (bitrate_send_last_probe * (1 - lossrate_recv_last_probe) - VIDEO_BIT_RATE[bit_rate]) / M_IN_K
            reward = -x_function_top * x_function_top  # 0.1 0.2 ... 1.1 1.2

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            # state = np.roll(state, -1, axis=1)
            # this should be S_INFO number of terms
            # state[0, -1] = bitrate_send_last / 1000.0  # last quality
            # state[1, -1] = lossrate_recv_last  # loss rate, e.g. 0.1 0.2 0.3 0.4
            # state[2, -1] = bitrate_real_recovery / 1000.0  # kilo byte / ms

            state = np.roll(state, -1, axis=1)
            state[0, -1] = bitrate_send_last_probe / 1000.0  # last quality
            state[1, -1] = lossrate_recv_last_probe  # loss rate, e.g. 0.1 0.2 0.3 0.4
            state[2, -1] = bitrate_real_recovery_probe / 1000.0  # kilo byte / ms
            state[3, :A_DIM] = np.array(VIDEO_BIT_RATE[:]) / 1000.0  # kilo byte / ms
            state[4, -1] = bitrate_send_last / 1000.0  # kilo byte / ms
            # print state[3, :A_DIM]

            # ================== Predict BandWidth =========================
            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(bitrate_send_last) + '\t' + str(lossrate_recv_last) + '\t' + str(bitrate_real_recovery) + '\t' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)
                epoch_num = epoch_queue.get()

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def central_agent(net_params_queues, exp_queues):
    # the arguments are two lists of 16 (process?) queues
    # open a Session {
    #     build the neural networks
    #     build the tf.summary ops (apparently used to monitor data for visualization)
    #     initialize the network parameters, or restore a saved model
    #     loop {
    #         put the network parameters into the queue, once per child agent
    #         reset the accumulators and batches
    #         get the batch data sent by the child agents from the queues,
    #         aggregate it and run the gradient-descent optimizer
    #         write the statistics to file
    #         every fixed number of epochs, save the model
    #     }
    # }

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)  # set up logging(?)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        # create the actor network: arguments are the TF session,
        # [number of state inputs, bandwidth history length], number of actions (bitrate levels) and the learning rate
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        # create the critic network: arguments are the TF session,
        # [number of state inputs, bandwidth history length] and the learning rate
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()  # build summaries (of what?)

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):  # 0-15
                # put the parameters into the queue of each worker process
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):  # 0-15
                # take the batch data out of each worker's queue(?)
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(  # compute the gradients(?)
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])  # accumulate the entropy values from the info dict

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            # write to the log
            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                # run the test on the saved model(?)
                testing(epoch, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):

    net_env = env.Environment(time=all_cooked_time, bandwidth=all_cooked_bw, random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        # need to initialize, and get before simulation step
        track_index = []
        hm = head_movement.move_prediction()

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            # xgw 20180918: need to modify here
            estimate_track_index = hm.get_head_movement_prediction()
            # actual_track_index = hm.get_head_movement_current()
            actual_track_index = [2, 3, 5, 6]

            delay, rebuf, buffer_size, sleep_time, video_chunk_size, end_of_video = \
                net_env.get_video_chunk(bit_rate, estimate_track_index)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # xgw 20180918: need to modify the reward to add the quality consistency
            # in the viewport, and the buffer; the consistency of quality in the
            # viewport is really the head-movement prediction error, so it is not
            # clear whether the "quality consistency" term should be added here,
            # and it is also unclear how to model the QP as the first input
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # bit_rate_log_reward = np.log((bit_rate + 1) / A_DIM) * BIT_RATE_REWARD_PARAMETER
            # smooth_p = np.exp(np.abs(last_bit_rate - bit_rate) / A_DIM) * SMOOTH_PENALTY
            # reward = bit_rate - REBUF_PENALTY * rebuf - smooth_p

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            # state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            # state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            # state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            # state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            # state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            state[0, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            state[2, :4] = np.array(actual_track_index)
            state[3, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last chunk's bitrate

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write('time_stamp: ' + str(time_stamp) + '\t' +
                           'VIDEO_BIT_RATE: ' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           'buffer_size: ' + str(buffer_size) + '\t' +
                           'rebuf: ' + str(rebuf) + '\t' +
                           'video_chunk_size: ' + str(video_chunk_size) + '\t' +
                           'delay: ' + str(delay) + '\t' +
                           'avg throughput: ' + str(video_chunk_size / delay) + '\t' +
                           'reward: ' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=50000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while epoch <= num_epochs:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def main():

    # utility_offset = -math.log(VIDEO_BIT_RATE[0])  # so utilities[0] = 0
    # utilities = [math.log(b) + utility_offset for b in VIDEO_BIT_RATE]

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    load_trace.plot_bandwidth(all_cooked_time, all_cooked_bw, _)

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw)

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0
        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_counter, throughput, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)
            # print(net_env.get_video_chunk(bit_rate))

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smooth penalty
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)
                # print(state)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            # print('state', state)

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            rand = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
            print(action_cumsum, action_cumsum > rand, (action_cumsum > rand).argmax())
            # print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE))
            # print((action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax())

            # compute Vp and map it to a bitrate
            # bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            Vp_index = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            Vp = BUFFER_PARAMETER[Vp_index]
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            config = {
                'buffer_size': env.BUFFER_THRESH,
                'gp': GP,
                'Vp': Vp,
                'abr_osc': False,
                'abr_basic': False,
                'no_ibr': False
            }
            bola = get_bitrate.Bola(config=config)
            bit_rate = bola.get_quality(Vp, buffer_size * env.MILLISECONDS_IN_SECOND, last_bit_rate, throughput)

            # information before the decision (arguments ordered to match the labels)
            print('[%d]: download time %.2fms, thrput=%.2f, chunk size %d, buffer=%.2fs, bitrate=%d' %
                  (video_chunk_counter, delay, throughput, video_chunk_size, buffer_size, last_bit_rate))

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n')
            log_file.flush()

            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:  # do training once

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chuck
                                          a_batch=np.vstack(a_batch[1:]),  # since we don't have the
                                          r_batch=np.vstack(r_batch[1:]),  # control over it
                                          terminal=end_of_video, actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                print("====")
                print("Epoch", epoch)
                print("TD_loss", td_loss, "Avg_reward", np.mean(r_batch), "Avg_entropy", np.mean(entropy_record))
                print("====")

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: td_loss,
                    summary_vars[1]: np.mean(r_batch),
                    summary_vars[2]: np.mean(entropy_record)
                })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                entropy_record = []

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:

                    assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # assembled_actor_gradient = actor_gradient_batch[0]
                    # assembled_critic_gradient = critic_gradient_batch[0]
                    # assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # for i in xrange(len(actor_gradient_batch) - 1):
                    #     for j in xrange(len(actor_gradient)):
                    #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
                    #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
                    # actor.apply_gradients(assembled_actor_gradient)
                    # critic.apply_gradients(assembled_critic_gradient)

                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # Save the neural net parameters to disk.
                        save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                        print("Model saved in file: %s" % save_path)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                # print(bit_rate)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def main():

    env = gym.make("RLALIGN")

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)

        saver = tf.train.Saver()
        saver.restore(sess, saved_MODEL)

        # stepsize = [10, 15, 25, 45, 75, 85, 105, 125, 200, 400, 800]
        # interval = [10, 15, 25, 45, 75, 85, 105, 125, 200, 400, 800]
        stepsize = [10, 20, 50, 100, 200]
        interval = [10, 20, 50, 100, 200]
        FinalAccPlot = []
        GlobalAcc = {}
        for inter in PROB:
            GlobalAcc[str(inter)] = []

        for probability in PROB:
            Acc = {}
            for inter in interval:
                Acc[str(inter)] = []
            print "probability", probability
            AccuracyPlot = []
            for v in stepsize:
                print "current step ", v
                MeanAccuracy = 0
                for eps in xrange(1000):
                    obs = env.reset()
                    testSequence = env.reset()
                    seq = {}
                    for i in range(len(testSequence)):
                        seq[i] = testSequence[i].replace("-", "")
                    GoalAlignment = 0
                    # temp = mafft.mafft_Score(seq, 100)
                    # GoalAlignment = temp[0]
                    temp = NW(seq[0], seq[1])
                    GoalAlignment = temp
                    # print "GoalAlignment Score:", GoalAlignment
                    # print "alignment", temp[1]
                    # print "Test Sequence", testSequence
                    listOfStates = []
                    listOfStates.append(testSequence)
                    # print "Test Sequence", testSequence
                    # print actionSpace
                    for i in listOfStates:
                        count = 1
                        StateToGetAction = env.de_one_hot_encode(env.GetStateVector(i))  # 1*12 shape
                        # print StateToGetAction
                        # StateToGetAction = np.reshape(StateToGetAction, (2, 6))
                        testState = np.reshape(env.GetStateVector(i), [env.noOfRows, env.noOfCols * 5, 1])
                        # print legalStateIndices
                        rew = []
                        while True:
                            # print "Move ", count
                            legalStates = env.getLegalActions(StateToGetAction, env.ActionSpace)
                            # print legalStates
                            legalStateIndices = []
                            for j in legalStates:
                                legalStateIndices.append(env.ActionSpace.index(j))
                            # print legalStateIndices
                            # print "Input State", env.get_sequences_from_state(env.de_one_hot_encode(testState))
                            # print testState.shape
                            testState = np.reshape(testState, [1, testState.shape[0], testState.shape[1], testState.shape[2]])
                            prediction = actor.predict(testState)
                            # print prediction
                            # print "Prediction", prediction
                            # print prediction.shape
                            predictionToUse = []
                            actionPredicted1 = np.argmax(prediction)
                            # print "action Predicted over all actions", env.ActionSpace[actionPredicted1]
                            for k in legalStateIndices:
                                predictionToUse.append(prediction[0][k])
                            prob, actions = env.getProbForActionEpsilonGreedy(predictionToUse, probability)
                            # print actions
                            actionPredicted = legalStateIndices[predictionToUse.index(np.random.choice(actions, p=prob))]
                            # actionPredicted = legalStateIndices[np.argmax(predictionToUse)]
                            # print "best action from legal states " + str(env.ActionSpace[legalStateIndices[np.argmax(predictionToUse)]])
                            # print "action Predicted over legal action", env.ActionSpace[actionPredicted]
                            next_sequence = env.step(testState, actionPredicted, GoalAlignment)
                            # print "Next State", env.get_sequences_from_state(env.de_one_hot_encode(next_sequence[1]))
                            # print "reward", next_sequence[0]
                            rew.append(next_sequence[0])
                            StateToGetAction = env.de_one_hot_encode(next_sequence[1])
                            testState = np.reshape(next_sequence[1], [env.noOfRows, env.noOfCols * 5, 1])
                            count += 1
                            if (next_sequence[0] >= GoalAlignment):
                                Acc[str(v)].append(next_sequence[0] - GoalAlignment)
                                MeanAccuracy += 1
                                break
                            if count == v:
                                Acc[str(v)].append(max(rew) - GoalAlignment)
                                # if (max(rew) > GoalAlignment):
                                #     # print "Wohoo Prediction better than Mafft!"
                                #     MeanAccuracy += 1
                                # elif (max(rew) == GoalAlignment):
                                #     # print "Yaay prediction correct"
                                #     MeanAccuracy += 1
                                # else:
                                #     # print "Reward Abs Diff", abs(GoalAlignment - max(rew))
                                #     res.append(max(rew) - GoalAlignment)
                                break
                print "Average percentage " + str((MeanAccuracy / 1000.0) * 100.0)
                AccuracyPlot.append((MeanAccuracy / 1000.0) * 100.0)
            print AccuracyPlot
            FinalAccPlot.append(AccuracyPlot)
            for inter in interval:
                GlobalAcc[str(probability)].append(Acc[str(inter)])

        '''
        plt.subplot(141)
        plt.title('Random Factor 0%')
        sns.boxplot(data=GlobalAcc[str(PROB[3])])
        plt.ylabel("Alignment Score Difference")
        plt.xlabel("No of Steps")
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(142)
        plt.title('Random Factor 10%')
        sns.boxplot(data=GlobalAcc[str(PROB[2])])
        plt.xlabel("No of Steps")
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(143)
        plt.title('Random Factor 20%')
        plt.xlabel("No of Steps")
        sns.boxplot(data=GlobalAcc[str(PROB[1])])
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(144)
        plt.title('Random Factor 30%')
        '''
        plt.ylabel("Alignment Score Difference", fontsize=25)
        plt.xlabel("Number of Steps", fontsize=25)
        sns.boxplot(data=GlobalAcc[str(PROB[0])])
        plt.xticks(range(0, len(interval), 1), interval, fontsize=20)
        plt.yticks(fontsize=20)
        plt.savefig('MSA_2x8x4.png')
        # plt.clf()
def agent(agent_id, net_params_queue, exp_queue):

    env = gym.make("RLALIGN")

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        time_stamp = 0
        for ep in xrange(TRAIN_EPOCH):
            obs = env.reset()
            seq = {}
            for i in range(len(obs)):
                seq[i] = obs[i].replace("-", "")
            GoalAlignment = 0
            # temp = mafft.mafft_Score(seq, agent_id)
            temp = NW(seq[0], seq[1])
            GoalAlignment = temp
            # print seq
            # print temp[1]
            # print GoalAlignment
            stateOriginal = env.GetStateVector(obs)
            stateOriginal = np.reshape(stateOriginal, [env.noOfRows, env.noOfCols * 5, 1])
            seq = env.get_sequences_from_state(env.de_one_hot_encode(stateOriginal))

            s_batch = []
            a_batch = []
            r_batch = []

            for step in xrange(TRAIN_SEQ_LEN):
                s_batch.append(stateOriginal)
                # action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                # print env.get_sequences_from_state(env.de_one_hot_encode(stateOriginal))
                stateBuffer = copy.deepcopy(stateOriginal)
                stateBuffer = np.reshape(stateOriginal, [1, stateOriginal.shape[0], stateOriginal.shape[1], stateOriginal.shape[2]])
                StateToGetAction = env.de_one_hot_encode(stateOriginal)
                action_prob = actor.predict(stateBuffer)
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                # a = np.random.choice(A_DIM, p=action_prob[0])
                # action_prob = actor.predict(stateBuffer)
                # print action_prob
                # action_prob[0] = action_prob[0] - np.finfo(np.float32).epsneg
                # histogram = np.random.multinomial(1, action_prob[0])
                # a = int(np.nonzero(histogram)[0])
                # if random.random() < EPSILON:
                #     a = random.randint(0, A_DIM - 1)
                # else:
                #     a = np.random.choice(A_DIM, p=action_prob[0])

                action_vec = np.zeros(A_DIM)
                action_vec[a] = 1
                a_batch.append(action_vec)

                rew, s_, done, info = env.step(stateOriginal, a, GoalAlignment)
                # print rew
                s_ = np.reshape(s_, [s_.shape[0], s_.shape[1], 1])
                stateOriginal = s_
                r_batch.append(rew)

            ind = r_batch.index(max(r_batch))
            rewardDiff = max(r_batch) - GoalAlignment
            reachedGoal = "No"
            if (max(r_batch) >= GoalAlignment):
                reachedGoal = "Yes"
            if not done:
                # ind = r_batch.index(max(r_batch))
                done = True

            exp_queue.put([s_batch[0:ind + 1], a_batch[0:ind + 1], r_batch[0:ind + 1], done])
            log_file.write('seq ' + str(seq) + ' epoch ' + str(ep) + ' reward ' + str(np.sum(r_batch[0:ind + 1])) + ' step ' + str(len(r_batch[0:ind + 1])) + ' AlignmentDiff ' + str(rewardDiff) + ' reachedGoal ' + reachedGoal)
            log_file.write("\n")
            log_file.flush()
            # exp_queue.put([s_batch, a_batch, r_batch, done])

            actor_net_params, critic_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
            critic.set_network_params(critic_net_params)
def main():

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    # all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)
    cooked_files = os.listdir(TEST_TRACES)

    g1 = tf.Graph()
    with tf.Session(graph=g1) as sess:
        # sess = tf.Session()
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        if not os.path.exists(LOG_FILE):
            os.makedirs(LOG_FILE)

        cooked_files = os.listdir(TEST_TRACES)
        for cooked_file in cooked_files:
            net_env = env.Environment(RANDOM_SEED, cooked_file)

            log_path = LOG_FILE + 'log_sim_rl_' + cooked_file + '_log.txt'
            log_file = open(log_path, 'wb')

            time_stamp = 0

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY

            action_vec = np.zeros(A_DIM)
            action_vec[bit_rate] = 1

            s_batch = [np.zeros((S_INFO, S_LEN))]
            a_batch = [action_vec]
            r_batch = []
            # entropy_record = []

            last_rtt = -1
            last_vmaf = -1

            while True:
                _norm_bitrate = VIDEO_BIT_RATE[bit_rate]
                delay, loss, recv_bitrate, rtt, throughput, limbo_bytes_len = \
                    net_env.get_video_chunk(bit_rate)
                print delay

                if delay is None:
                    log_file.write('\n')
                    print 'Test done', cooked_file
                    log_file.close()

                    last_bit_rate = DEFAULT_QUALITY
                    bit_rate = DEFAULT_QUALITY  # use the default action here

                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]

                    action_vec = np.zeros(A_DIM)
                    action_vec[bit_rate] = 1

                    s_batch.append(np.zeros((S_INFO, S_LEN)))
                    a_batch.append(action_vec)
                    # entropy_record = []
                    break

                rtt = float(rtt) / float(1000)
                if last_rtt < 0:
                    last_rtt = rtt
                _norm_send_bitrate = bit_rate / A_DIM
                _queuing_delay = abs(rtt - last_rtt)
                _norm_recv_bitrate = min(float(recv_bitrate) / delay / BUFFER_NORM_FACTOR, 1.0)

                time_stamp += delay  # in ms

                vmaf = net_env.get_vmaf(bit_rate)
                if last_vmaf < 0:
                    last_vmaf = vmaf

                # _normalized_bitrate = (_norm_bitrate - BITRATE_MIN) / (BITRATE_MAX - BITRATE_MIN)
                _vmaf_reward = (vmaf / _norm_bitrate) * BITRATE_MIN
                reward = \
                    1.0 * vmaf - \
                    0.2 * _norm_send_bitrate - \
                    1.0 / DELAY_GRADIENT_MAX * min(_queuing_delay, DELAY_GRADIENT_MAX) - \
                    1.0 * abs(last_vmaf - vmaf)

                r_batch.append(reward)
                last_vmaf = vmaf
                last_rtt = rtt

                log_file.write(str(time_stamp) + '\t' + str(_norm_bitrate) + '\t' + str(recv_bitrate) + '\t' + str(limbo_bytes_len) + '\t' + str(rtt) + '\t' + str(vmaf) + '\t' + str(reward) + '\n')
                log_file.flush()

                if len(s_batch) == 0:
                    state = [np.zeros((S_INFO, S_LEN))]
                else:
                    state = np.array(s_batch[-1], copy=True)

                # dequeue history record
                state = np.roll(state, -1, axis=1)

                state[0, -1] = _norm_send_bitrate  # last quality
                state[1, -1] = _norm_recv_bitrate  # kilo byte / ms
                state[2, -1] = _queuing_delay  # max: 500ms
                state[3, -1] = float(loss)  # changed loss
                # test: add fft feature
                _fft = np.fft.fft(state[1])
                state[4] = _fft.real
                state[5] = _fft.imag
                state[6] = net_env.predict_vmaf()

                action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
                # print 'state', state[6]
                # print 'action', action_prob[0]
                action_cumsum = np.cumsum(action_prob)
                bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
def main(): np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K r_batch.append(reward) last_bit_rate = bit_rate # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states DECISIONS.append(bit_rate) s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] video_count += 1 if video_count >= len(all_file_names): break log_path = LOG_FILE + '_' + 
all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') print "Decisions: {}".format(Counter(DECISIONS))
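# All of these agents keep the observation as an (S_INFO, S_LEN) matrix and, on
# every step, shift it one column to the left before writing the newest values
# into the right-most column. A minimal sketch of that rolling-history update,
# with illustrative dimensions:
import numpy as np

S_INFO, S_LEN = 6, 8  # assumed state dimensions, as in the Pensieve-style code above

def push_observation(state, observation):
    # Drop the oldest column, shift the rest left, and append the newest
    # per-feature observation as the last column.
    state = np.roll(state, -1, axis=1)
    state[:, -1] = observation
    return state

# state = np.zeros((S_INFO, S_LEN))
# state = push_observation(state, np.arange(S_INFO))  # newest column is now [0, 1, ..., 5]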
def main(): summary_dir = SUMMARY_DIR if not os.path.exists(summary_dir): os.makedirs(summary_dir) log_file_dir = TEST_LOG_FOLDER if not os.path.exists(log_file_dir): os.makedirs(log_file_dir) TOTAL_REWARD_BITRATE = 0.0 TOTAL_REWARD_HD_BITRATE = 0.0 TOTAL_REWARD_REBUF = 0.0 TOTAL_REWARD_SMOOTHNESS = 0.0 TOTAL_REWARD = 0.0 TOTAL_HOTSPOT_CHUNKS = 0.0 np.random.seed(RANDOM_SEED) all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace( TEST_TRACES) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print "Testing model restored." time_stamp = 0 prefetch_decision = DEFAULT_PREFETCH last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[prefetch_decision] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real state_data_for_action = net_env.execute_action(prefetch_decision) # normal chunk state information delay = state_data_for_action['delay'] sleep_time = state_data_for_action['sleep_time'] last_bit_rate = state_data_for_action['last_bit_rate'] play_buffer_size = state_data_for_action['play_buffer_size'] rebuf = state_data_for_action['rebuf'] video_chunk_size = state_data_for_action['video_chunk_size'] next_video_chunk_sizes = state_data_for_action[ 'next_video_chunk_sizes'] end_of_video = state_data_for_action['end_of_video'] video_chunk_remain = state_data_for_action['video_chunk_remain'] current_seq_no = state_data_for_action['current_seq_no'] log_prefetch_decision = state_data_for_action[ 'log_prefetch_decision'] # hotspot chunk state information was_hotspot_chunk = 1.0 * state_data_for_action['was_hotspot_chunk'] TOTAL_HOTSPOT_CHUNKS += was_hotspot_chunk hotspot_chunks_remain = state_data_for_action[ 'hotspot_chunks_remain'] chunks_till_played = state_data_for_action['chunks_till_played'] total_buffer_size = state_data_for_action['total_buffer_size'] last_hotspot_bit_rate = state_data_for_action[ 'last_hotspot_bit_rate'] next_hotspot_chunk_sizes = state_data_for_action[ 'next_hotspot_chunk_sizes'] dist_from_hotspot_chunks = state_data_for_action[ 'dist_from_hotspot_chunks'] smoothness_eval_bitrates = state_data_for_action[ 'smoothness_eval_bitrates'] # abr decision state information normal_bitrate_pensieve = state_data_for_action[ 'normal_bitrate_pensieve'] hotspot_bitrate_pensieve = state_data_for_action[ 'hotspot_bitrate_pensieve'] # print len(next_video_chunk_sizes) # print len(next_hotspot_chunk_sizes) last_overall_bitrate = last_bit_rate if prefetch_decision == 1: last_overall_bitrate = last_hotspot_bit_rate time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward_normal_br = (1.0 - was_hotspot_chunk) * ( VIDEO_BIT_RATE[last_bit_rate] / M_IN_K) * 1.0 reward_hotspot_br = was_hotspot_chunk * HD_REWARD[ 
last_hotspot_bit_rate] * 1.0 reward_rebuffering = REBUF_PENALTY * rebuf * 1.0 reward_smoothness = 0.0 if len(smoothness_eval_bitrates) > 1: for i in xrange(len(smoothness_eval_bitrates) - 1): reward_smoothness += 1.0 * SMOOTH_PENALTY * (1.0 * np.abs( VIDEO_BIT_RATE[smoothness_eval_bitrates[i + 1]] - VIDEO_BIT_RATE[smoothness_eval_bitrates[i]]) / M_IN_K) reward = (1.0 * reward_normal_br) + (1.0 * reward_hotspot_br) - ( 1.0 * reward_rebuffering) - (1.0 * reward_smoothness) TOTAL_REWARD_BITRATE += reward_normal_br TOTAL_REWARD_HD_BITRATE += reward_hotspot_br TOTAL_REWARD_REBUF += reward_rebuffering TOTAL_REWARD_SMOOTHNESS += reward_smoothness TOTAL_REWARD += reward # print "reward before: {}".format(reward) r_batch.append(reward) # print "reward after: {}".format(reward) # log time_stamp, bit_rate, buffer_size, reward if not end_of_video: log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[last_overall_bitrate]) + '\t' + str(play_buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\t' + str(log_prefetch_decision) + '\t' + str(int(was_hotspot_chunk)) + '\t' + str(current_seq_no) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms ## Normal state S_ABR_INFO state[0, -1] = VIDEO_BIT_RATE[last_overall_bitrate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = play_buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :BITRATE_LEVELS] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum( video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) ## Hotspot state S_HOT_INFO state[6, -1] = np.minimum( hotspot_chunks_remain, NUM_HOTSPOT_CHUNKS) / float(NUM_HOTSPOT_CHUNKS) state[7, -1] = np.minimum( chunks_till_played, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) state[8, -1] = total_buffer_size / BUFFER_NORM_FACTOR state[9, -1] = last_hotspot_bit_rate / float(np.max(VIDEO_BIT_RATE)) state[10, :BITRATE_LEVELS] = np.array( next_hotspot_chunk_sizes) / M_IN_K / M_IN_K state[11, :NUM_HOTSPOT_CHUNKS] = ( np.array(dist_from_hotspot_chunks) + CHUNK_TIL_VIDEO_END_CAP) / float(2 * CHUNK_TIL_VIDEO_END_CAP) ## Bitrate actions state S_BRT_INFO state[12, -1] = normal_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE)) state[13, -1] = hotspot_bitrate_pensieve / float( np.max(VIDEO_BIT_RATE)) # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) prefetch_decision = ( action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() # break prefetch_decision = DEFAULT_PREFETCH del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[prefetch_decision] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] video_count += 1 if video_count >= len(all_file_names): break # print "log 
file: {}".format(log_file) # print "Hot chunks: {}".format(TOTAL_HOTSPOT_CHUNKS) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') print "Normal bitrate reward: {}".format(TOTAL_REWARD_BITRATE) print "Hotspot bitrate reward: {}".format(TOTAL_REWARD_HD_BITRATE) print "Rebuffering reward: {}".format(TOTAL_REWARD_REBUF) print "Smoothness reward: {}".format(TOTAL_REWARD_SMOOTHNESS) print "Total reward: {}".format(TOTAL_REWARD) print "Total hotspot chunks: {}".format(int(TOTAL_HOTSPOT_CHUNKS))
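# The testers above all score a chunk with the same linear QoE shape: bitrate
# utility minus a rebuffering penalty minus a smoothness penalty on bitrate
# switches. A compact sketch with illustrative constants (the actual bitrate
# ladder and penalty weights are defined elsewhere in each script):
VIDEO_BIT_RATE = [300, 750, 1200, 1850, 2850, 4300]  # kbps, illustrative ladder
M_IN_K = 1000.0
REBUF_PENALTY = 4.3   # assumed rebuffering weight
SMOOTH_PENALTY = 1.0  # assumed smoothness weight

def linear_qoe(bit_rate, last_bit_rate, rebuf):
    # bit_rate / last_bit_rate index into VIDEO_BIT_RATE; rebuf is seconds stalled.
    return (VIDEO_BIT_RATE[bit_rate] / M_IN_K
            - REBUF_PENALTY * rebuf
            - SMOOTH_PENALTY * abs(VIDEO_BIT_RATE[bit_rate]
                                   - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K)

# print(linear_qoe(bit_rate=3, last_bit_rate=1, rebuf=0.5))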
def agent(agent_id, all_cooked_time, all_cooked_bw, all_file_names, video_size_file, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id, VIDEO_SIZE_FILE=video_size_file, Debug=False) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) bit_rate = DEFAULT_QUALITY target_buffer = DEFAULT_QUALITY latency_limit = 4 index = 1 action_vec = np.zeros(A_DIM) action_vec[index] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 reward_all_sum = 0 reward_all = 0 reward = 0 switch_num = 0 SMOOTH_PENALTY = 0.0 REBUF_PENALTY = 3 LANTENCY_PENALTY = 0.0 BITRATE_REWARD = 0.001 SKIP_PENALTY = 0.0 epoch = 0 n = 0 state = np.array(s_batch[-1], copy=True) frame_time_len = 0.04 last_bit_rate = DEFAULT_QUALITY while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real time, time_interval, send_data_size, chunk_len, \ rebuf, buffer_size, play_time_len, end_delay, \ cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \ buffer_flag, cdn_flag, skip_flag, end_of_video = net_env.get_video_frame(bit_rate, target_buffer, latency_limit) # # QOE setting # if end_delay <= 1.0: # LANTENCY_PENALTY = 0.005 # else: # LANTENCY_PENALTY = 0.01 reward_frame = 0 epoch += 1 if not cdn_flag: reward_frame = frame_time_len * float( BIT_RATE[bit_rate] ) * BITRATE_REWARD - REBUF_PENALTY * rebuf - LANTENCY_PENALTY * end_delay - SKIP_PENALTY * skip_frame_time_len else: reward_frame = -(REBUF_PENALTY * rebuf) reward += reward_frame # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = buffer_size * 0.1 state[1, -1] = send_data_size * 0.00001 state[2, -1] = time_interval * 10 # kilo byte / ms state[3, -1] = end_delay * 0.1 # 10 sec state[4, -1] = rebuf # mega byte if decision_flag and not end_of_video: reward_frame = -1 * SMOOTH_PENALTY * ( abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]) / 1000) reward += reward_frame last_bit_rate = bit_rate r_batch.append(reward) reward = 0 # compute action probability vector action_prob = actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) temp = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE) index = (action_cumsum > temp).argmax() bit_rate = ACTION_SAPCE[index][0] target_buffer = ACTION_SAPCE[index][1] latency_limit = ACTION_SAPCE[index][2] # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN: exp_queue.put([ s_batch[1:], # ignore the first chuck a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = 
net_params_queue.get( ) actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[index] = 1 a_batch.append(action_vec) reward_all += reward_frame # store the state and action into batches if end_of_video: r_batch.append(reward) reward_all_sum += reward_all / 20 video_count += 1 if video_count >= len(all_file_names): n += 1 video_count = 0 print(n, "agent_id ", agent_id, "reward_all_sum:", reward_all_sum) w.writerow([n, reward_all_sum]) out.flush() reward_all_sum = 0 net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=epoch, VIDEO_SIZE_FILE=video_size_file, Debug=False) if n == NUM_EPOCH: break reward_all = 0 reward = 0 switch_num = 0 bit_rate = DEFAULT_QUALITY # use the default action here target_buffer = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec)
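# The training agents above talk to a central coordinator through two
# multiprocessing queues: parameters flow down via net_params_queue and
# experience batches flow back up via exp_queue. A minimal, TensorFlow-free
# skeleton of that handshake (all names and payloads are illustrative):
from multiprocessing import Process, Queue

def worker(agent_id, net_params_queue, exp_queue):
    # Block for the initial parameters, then alternate: collect a batch,
    # ship it up, wait for refreshed parameters -- as the agents above do.
    params = net_params_queue.get()
    for step in range(3):  # a few dummy rounds
        exp_queue.put({'agent': agent_id, 'step': step, 'params_seen': params})
        params = net_params_queue.get()

if __name__ == '__main__':
    net_params_queue, exp_queue = Queue(), Queue()
    p = Process(target=worker, args=(0, net_params_queue, exp_queue))
    p.start()
    params = 'v0'
    for round_idx in range(3):
        net_params_queue.put(params)      # coordinator -> worker
        print(exp_queue.get())            # worker -> coordinator
        params = 'v%d' % (round_idx + 1)  # pretend we trained on the batch
    net_params_queue.put(params)          # release the worker's final get()
    p.join()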
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): # agent id, trace data, and the two queues shared with the coordinator # Summary: first build the environment, then open a Session() { # build the neural networks # (fetch parameters from the central agent to initialize them) # pick the default action, initialize batch[] and entropy[] # loop: { # get the new state from the environment, append it to batch[], pick a new action, log the data to file # once a full batch has accumulated, put it on the multiprocessing Queue (for the central agent to take) # fetch fresh parameters from the central agent and clear the old batch[] data # } # } net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) # per-agent environment parameters with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: # create the actor network: TensorFlow Session, [number of input features, history length], number of outputs (bitrate levels), learning rate actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) # create the critic network: TensorFlow Session, [number of input features, history length], learning rate critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) # [0,0,0,0,0,0] action_vec[bit_rate] = 1 # set the chosen bitrate entry to 1 (one-hot) s_batch = [np.zeros((S_INFO, S_LEN))] # [a 6x8 zero matrix, ...], list of past states a_batch = [action_vec] # [[0,0,0,0,0,0],] r_batch = [] # reward? entropy_record = [] time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) # simulate downloading the next chunk at the chosen bitrate time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # -- log scale reward -- # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1])) # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1])) # reward = log_bit_rate \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # -- HD reward -- # reward = HD_REWARD[bit_rate] \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate]) r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # shift the history window one column to the left # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality (bitrate) state[ 1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec, current buffer size state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms, throughput measurement state[3, -1] = float( delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec, delay / download time state[4, :A_DIM] = np.array( next_video_chunk_sizes ) / M_IN_K / M_IN_K # mega byte, sizes of the next chunk at each bitrate, stored in the first A_DIM columns state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float( CHUNK_TIL_VIDEO_END_CAP) # remaining chunks # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # RAND_RANGE = 1000, defined above # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: exp_queue.put([ s_batch[1:], # ignore the first chunk a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write( '\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
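# Each decision also records a3c.compute_entropy(action_prob[0]) so training can
# monitor how exploratory the policy still is. The a3c helper itself is not shown
# in this file; a plain-NumPy sketch of what such an entropy computation looks like:
import numpy as np

def compute_entropy(action_prob, eps=1e-12):
    # Shannon entropy (in nats) of one action distribution: high while the policy
    # is still exploring, near zero once it commits to a single action.
    p = np.asarray(action_prob, dtype=np.float64)
    return float(-np.sum(p * np.log(p + eps)))

# print(compute_entropy([0.1, 0.6, 0.3]))  # ~0.90 nats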
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'w') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness # reward = \ # VIDEO_BIT_RATE[bit_rate] / M_IN_K \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - # VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # -- log scale reward -- # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1])) #log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1])) #reward = log_bit_rate \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # -- HD reward -- reward = HD_REWARD[bit_rate] \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate]) r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: 
exp_queue.put([s_batch[1:], # ignore the first chuck a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, {'entropy': entropy_record}]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write('\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
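# The coordinator side that consumes exp_queue is not shown in this file. As a
# generic sketch of what typically happens to the pushed tuple, here is how the
# per-chunk rewards can be turned into discounted returns (GAMMA and the
# bootstrap value are assumptions, not values from the code above):
import numpy as np

GAMMA = 0.99  # assumed discount factor

def discounted_returns(r_batch, terminal, gamma=GAMMA, bootstrap_value=0.0):
    # Walk the rewards backwards; when the episode did not terminate, start from
    # a critic-style bootstrap estimate instead of zero.
    R = 0.0 if terminal else bootstrap_value
    returns = np.zeros(len(r_batch))
    for t in reversed(range(len(r_batch))):
        R = r_batch[t] + gamma * R
        returns[t] = R
    return returns

# s_batch, a_batch, r_batch, end_of_video, info = exp_queue.get()
# returns = discounted_returns(r_batch, end_of_video)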
def main(): os.system('rm -r ' + TEST_LOG_FOLDER) os.system('mkdir ' + TEST_LOG_FOLDER) np.random.seed(RANDOM_SEED) all_user_pos, all_file_names = load_trace.load_trace(TEST_TRACES) net_env = fixed_env.Environment(all_user_pos=all_user_pos) log_path = TEST_LOG_FOLDER + 'log_sim_rl_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") # initializing association = one_hot().T num_shared = 50 trace_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real channel_gain, num_user_bs, rate, end_of_trace = \ net_env.scheduling_and_association(association, num_shared) reward = np.mean(np.log(rate)) # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(reward) + '\n') log_file.flush() state_p1 = (channel_gain-np.mean(channel_gain.reshape((-1))))/(np.std(channel_gain.reshape((-1)))+1e-6) state_p2 = ((num_user_bs-np.mean(num_user_bs))/(np.std(num_user_bs)+1e-6)).reshape((7,1)) #state = np.concatenate([state_p1,state_p2],axis = 1) # state shape (7, 91) state = state_p1 # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action = epsilon_greedy(action_prob, 0) # set epsilon to zero when testing association, num_shared = rl_scheduling(channel_gain, action) if end_of_trace: print all_file_names[net_env.trace_idx-1],net_env.scheduling_ptr,'number of shared subchannels:', num_shared, 'SINR threshold:', BETA_SET[np.argmax(action[K_DIM:A_DIM])] #plot_cellular_network(net_env.macrocell, net_env.picocells, net_env.current_user_pos, association) log_file.write('\n') log_file.close() association = one_hot().T num_shared = 50 trace_count += 1 if trace_count >= len(all_file_names): break log_path = TEST_LOG_FOLDER + 'log_sim_rl_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') # append test performance to the log with open(LOG_FILE + '_rl_test', 'ab') as log_file: rewards = [] test_log_files = os.listdir(TEST_LOG_FOLDER) for test_log_file in test_log_files: reward = [] with open(TEST_LOG_FOLDER + test_log_file, 'rb') as f: for line in f: parse = line.split() try: reward.append(float(parse[0])) except IndexError: break rewards.append(np.sum(reward[1:])) rewards = np.array(rewards) rewards_min = np.min(rewards) rewards_5per = np.percentile(rewards, 5) rewards_mean = np.mean(rewards) rewards_median = np.percentile(rewards, 50) rewards_95per = np.percentile(rewards, 95) rewards_max = np.max(rewards) log_file.write(str(rewards_min) + '\t' + str(rewards_5per) + '\t' + str(rewards_mean) + '\t' + str(rewards_median) + '\t' + str(rewards_95per) + '\t' + str(rewards_max) + '\n') log_file.flush() print 'testing results' + '\t average rewards: ' + str(rewards_mean)
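# The wireless-scheduling tester calls epsilon_greedy(action_prob, 0), i.e. pure
# greedy selection at test time, but that helper is not included in this file. A
# hypothetical sketch consistent with how `action` is used above (it is later
# sliced and arg-maxed, so a one-hot vector is returned):
import numpy as np

def epsilon_greedy(action_prob, epsilon):
    # With probability epsilon pick a uniformly random action, otherwise the
    # most probable one; return it one-hot encoded.
    action_prob = np.asarray(action_prob).ravel()
    n_actions = action_prob.shape[0]
    if np.random.rand() < epsilon:
        idx = np.random.randint(n_actions)
    else:
        idx = int(np.argmax(action_prob))
    one_hot = np.zeros(n_actions)
    one_hot[idx] = 1.0
    return one_hot

# action = epsilon_greedy(action_prob, 0)  # epsilon = 0 => always the argmax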
buffer -= rtt return buffer if __name__ == '__main__': argv = sys.argv if len(argv) != 3: print 'Usage: ./dashClient.py [mpdURL] [clientIP]' else: np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() nn_model = NN_MODEL if nn_model is not None: saver.restore(sess, nn_model) # print "Model restored" s_batch = [np.zeros((S_INFO, S_LEN))] url = argv[1] clientIP = argv[2] run_time = time.time() start_time = datetime.datetime.fromtimestamp(run_time) # epoch timestamp => datetime
def main(): np.random.seed(RANDOM_SEED) assert len(BITRATE) == A_DIM all_cooked_time, all_cooked_bw, all_file_names = load.loadBandwidth( TEST_TRACES) player = live_player.Live_Player(time_traces=all_cooked_time, throughput_traces=all_cooked_bw, seg_duration=SEG_DURATION, frag_duration=FRAG_DURATION, chunk_duration=CHUNK_DURATION, start_up_th=USER_START_UP_TH, freezing_tol=USER_FREEZING_TOL, latency_tol=USER_LATENCY_TOL, randomSeed=RANDOM_SEED) server = live_server.Live_Server(seg_duration=SEG_DURATION, frag_duration=FRAG_DURATION, chunk_duration=CHUNK_DURATION, start_up_th=SERVER_START_UP_TH) log_path = LOG_FILE + '_' + all_file_names[player.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") action_num = DEFAULT_ACTION # 0 last_bit_rate = DEFAULT_ACTION % len(BITRATE) bit_rate = DEFAULT_ACTION % len(BITRATE) playing_speed = NORMAL_PLAYING action_vec = np.zeros(A_DIM) action_vec[action_num] = 1 take_action = 1 latency = 0.0 s_batch = [np.zeros((S_INFO, S_LEN))] state = np.array(s_batch[-1], copy=True) a_batch = [action_vec] r_batch = [] action_reward = 0.0 # Total reward is for all chunks within on segment video_count = 0 starting_time = server.time starting_time_idx = player.time_idx while True: # serve video forever assert len(server.chunks) >= 1 download_chunk_info = server.chunks[0] download_chunk_size = download_chunk_info[2] download_chunk_idx = download_chunk_info[1] download_seg_idx = download_chunk_info[0] server_wait_time = 0.0 sync = 0 real_chunk_size, download_duration, freezing, time_out, player_state = player.fetch( bit_rate, download_chunk_size, download_seg_idx, download_chunk_idx, take_action, playing_speed) # print(freezing, time_out) take_action = 0 past_time = download_duration buffer_length = player.buffer server_time = server.update(past_time) if not time_out: server.chunks.pop(0) sync = player.check_resync(server_time) else: assert player.state == 0 assert np.round(player.buffer, 3) == 0.0 # Pay attention here, how time out influence next reward, the smoothness # Bit_rate will recalculated later, this is for reward calculation bit_rate = 0 sync = 1 if sync: # To sync player, enter start up phase, buffer becomes zero sync_time, missing_count = server.sync_encoding_buffer() player.sync_playing(sync_time) buffer_length = player.buffer latency = server.time - player.playing_time player_state = player.state log_bit_rate = np.log(BITRATE[bit_rate] / BITRATE[0]) log_last_bit_rate = np.log(BITRATE[last_bit_rate] / BITRATE[0]) last_bit_rate = bit_rate # print(log_bit_rate, log_last_bit_rate) reward = ACTION_REWARD * log_bit_rate \ - REBUF_PENALTY * freezing / MS_IN_S \ - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) \ - LONG_DELAY_PENALTY*(LONG_DELAY_PENALTY_BASE**(ReLU(latency-TARGET_LATENCY)/ MS_IN_S)-1) \ - UNNORMAL_PLAYING_PENALTY*(playing_speed-NORMAL_PLAYING)*download_duration/MS_IN_S # - MISSING_PENALTY * missing_count # print(reward) action_reward += reward # chech whether need to wait, using number of available segs if len(server.chunks) == 0: server_wait_time = server.wait() assert server_wait_time > 
0.0 assert server_wait_time < CHUNK_DURATION player.wait(server_wait_time) buffer_length = player.buffer # Establish state for next iteration state = np.roll(state, -1, axis=1) state[0, -1] = BITRATE[bit_rate] / BITRATE[0] # video bitrate state[1, -1] = real_chunk_size / KB_IN_MB # chunk size state[2, -1] = download_duration / MS_IN_S # downloading time state[3, -1] = freezing / MS_IN_S # current freezing time state[4, -1] = latency / MS_IN_S # accu latency from start up state[5, -1] = sync # whether there is resync state[6, -1] = player_state # state of player state[ 7, -1] = server_wait_time / MS_IN_S # time of waiting for server state[8, -1] = buffer_length / MS_IN_S # buffer length # generate next set of seg size # if add this, this will return to environment # next_chunk_size_info = server.chunks[0][2] # not useful # state[7, :A_DIM] = next_chunk_size_info # not useful # print(state) next_chunk_idx = server.chunks[0][1] if next_chunk_idx == 0 or sync: take_action = 1 # print(action_reward) r_batch.append(action_reward) action_reward = 0.0 # If sync, might go to medium of segment, and there is no estimated chunk size next_seg_size_info = [] if sync and not next_chunk_idx == 0: next_seg_size_info = [ 2 * np.sum(x) / KB_IN_MB for x in server.chunks[0][2] ] else: next_seg_size_info = [ x / KB_IN_MB for x in server.chunks[0][3] ] state[9, :A_DIM] = next_seg_size_info action_prob = actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) # print(action_prob) action_num = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() bit_rate = action_num % len(BITRATE) if action_num >= len(BITRATE): playing_speed = FAST_PLAYING else: playing_speed = NORMAL_PLAYING log_file.write( str(server.time) + '\t' + str(BITRATE[last_bit_rate]) + '\t' + str(buffer_length) + '\t' + str(freezing) + '\t' + str(time_out) + '\t' + str(server_wait_time) + '\t' + str(sync) + '\t' + str(missing_count) + '\t' + str(player.state) + '\t' + str(int(action_num / len(BITRATE))) + '\t' + str(reward) + '\n') log_file.flush() if len(r_batch) >= MAX_LIVE_LEN: # need to modify time_duration = server.time - starting_time tp_record = record_tp(player.throughput_trace, starting_time_idx, time_duration) print(starting_time_idx, all_file_names[player.trace_idx], len(player.throughput_trace), player.time_idx, len(tp_record), np.sum(r_batch)) log_file.write('\t'.join(str(tp) for tp in tp_record)) log_file.write('\n' + str(starting_time)) log_file.write('\n') log_file.close() action_num = DEFAULT_ACTION # 0 last_bit_rate = DEFAULT_ACTION % len(BITRATE) bit_rate = DEFAULT_ACTION % len(BITRATE) playing_speed = NORMAL_PLAYING del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[action_num] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) video_count += 1 if video_count >= TEST_TRACE_NUM: break player.test_reset(start_up_th=USER_START_UP_TH) server.test_reset(start_up_th=SERVER_START_UP_TH) # Do not need to append state to s_batch as there is no iteration starting_time = server.time starting_time_idx = player.time_idx log_path = LOG_FILE + '_' + all_file_names[player.trace_idx] log_file = open(log_path, 'wb') take_action = 1 else: if next_chunk_idx == 0 or sync: s_batch.append(state) state = np.array(s_batch[-1], copy=True) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
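# The live-streaming reward above adds an exponential penalty once playback
# latency exceeds a target. A small sketch of just that term, with illustrative
# constants standing in for ReLU, TARGET_LATENCY, LONG_DELAY_PENALTY and
# LONG_DELAY_PENALTY_BASE defined elsewhere in the script:
MS_IN_S = 1000.0
TARGET_LATENCY = 4.0 * MS_IN_S    # assumed target latency, in ms
LONG_DELAY_PENALTY = 1.0          # assumed weight
LONG_DELAY_PENALTY_BASE = 1.2     # assumed base of the exponential

def relu(x):
    return max(x, 0.0)

def long_delay_penalty(latency_ms):
    # Zero at or below the target; grows exponentially with the excess latency
    # (in seconds), matching the LONG_DELAY_PENALTY term in the reward above.
    excess_s = relu(latency_ms - TARGET_LATENCY) / MS_IN_S
    return LONG_DELAY_PENALTY * (LONG_DELAY_PENALTY_BASE ** excess_s - 1.0)

# print(long_delay_penalty(6.5 * MS_IN_S))  # penalty for running 2.5 s behind target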