def Initial(self):
    with tf.Session().as_default() as sess:
        saver = tf.train.import_meta_graph('log/nn_model_ep_60.ckpt.meta')
        saver.restore(sess, tf.train.latest_checkpoint("log/"))
        print("Model restored.")
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
        print("init success\n")
        # note: re-initializing here overwrites the parameters restored above
        sess.run(tf.global_variables_initializer())
        # saver = tf.train.Saver()  # save neural net parameters
        print("saver created")
        # restore neural net parameters
        # if NN_MODEL is not None:  # NN_MODEL is the path to file
        #     saver.restore(sess, NN_MODEL)
        #     print("Testing model restored.")
        # print("Nnmodel restored")
        self.actor = actor
        self.critic = critic
        self.sess = sess
        self.TP_buf = [0.25] * 125
def __init__(self, sess):
    self.sess = sess
    self.actor = a3c.ActorNetwork(self.sess, state_dim=S_INFO, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=S_INFO, learning_rate=CRITIC_LR_RATE)
    self.summary_ops, self.summary_vars = a3c.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    self.saver = tf.train.Saver()

    # restore neural network
    if NN_MODEL is not None:
        self.saver.restore(self.sess, NN_MODEL)
        print("load model success!")

    self.epoch = 0
    self.i_episode = 0
    self.total_reward = 0.0
    self.s = env.reset()
def __init__(self):
    self.sess = tf.Session()
    self.actor = a3c_hotdash.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM_prefetch, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c_hotdash.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()  # restore neural net parameters
    if NN_MODEL is not None:  # NN_MODEL is the path to file
        saver.restore(self.sess, NN_MODEL)
        print("Testing model 1 restored.")

    # reuse = True
    tf.reset_default_graph()
    self.sess_bitr = tf.Session()
    self.actor_bitr = a3c.ActorNetwork(self.sess_bitr, state_dim=[S_INFO_PENSIEVE, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic_bitr = a3c.CriticNetwork(self.sess_bitr, state_dim=[S_INFO_PENSIEVE, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess_bitr.run(tf.global_variables_initializer())
    saver_bitr = tf.train.Saver()  # restore neural net parameters
    if NN_MODEL_bitr is not None:  # NN_MODEL_bitr is the path to file
        # use the second saver for the second session/graph
        saver_bitr.restore(self.sess_bitr, NN_MODEL_bitr)
        print("Testing model 2 restored.")
def main():
    env = gym.make("CartPole-v0")
    # env.force_mag = 100.0

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        saver = tf.train.Saver()
        saver.restore(sess, NN_MODEL)

        for eps in xrange(100):
            obs = env.reset()
            reward = 0
            for _ in range(300):
                env.render()
                action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                obs, rew, done, info = env.step(a)
                reward += rew
                if done:
                    break
            print eps, reward, done
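# The `(action_cumsum > rand).argmax()` idiom used above (and throughout these
# agents) samples an action index in proportion to the predicted probabilities.
# A minimal, self-contained sketch of that trick; the helper name
# `sample_from_cumsum` is illustrative and not part of the original code:
import numpy as np

def sample_from_cumsum(action_prob, rand_range=1000):
    """Sample an index i with probability action_prob[i] using a cumulative sum."""
    action_cumsum = np.cumsum(action_prob)
    # draw a threshold in (0, 1), discretized into 1/rand_range steps, then take
    # the first index whose cumulative probability exceeds it
    return (action_cumsum > np.random.randint(1, rand_range) / float(rand_range)).argmax()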
def __init__(self, scope):
    # self.gp = config['gp']
    # self.buffer_size = config['buffer_size']
    # self.abr_osc = config['abr_osc']
    # self.abr_basic = config['abr_basic']
    self.quality = 0
    # self.last_quality = 0
    self.state = np.zeros((Zero.S_INFO, Zero.S_LEN))
    self.quality_len = Zero.A_DIM
    self.sess = tf.Session()
    # with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
    self.dual = a3c.DualNetwork(self.sess, scope)
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[Zero.S_INFO, Zero.S_LEN], action_dim=self.quality_len, learning_rate=Zero.ACTOR_LR_RATE, scope=scope, dual=self.dual)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[Zero.S_INFO, Zero.S_LEN], learning_rate=Zero.CRITIC_LR_RATE, scope=scope, dual=self.dual)
    self.sess.run(tf.global_variables_initializer())
    self.history = []
    self.s_batch = [np.zeros((Zero.S_INFO, Zero.S_LEN))]
    action_vec = np.zeros(Zero.A_DIM)
    self.a_batch = [action_vec]
    self.r_batch = []
    self.actor_gradient_batch = []
    self.critic_gradient_batch = []
def __init__(self):
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    tf.train.Saver().restore(self.sess, NN_MODEL)
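# The wrapper above only restores the model and exposes no inference method.
# A hedged sketch of how the other snippets in this file query such an object;
# the function name `select_bitrate` is hypothetical, and the state layout and
# RAND_RANGE sampling follow the Pensieve-style code elsewhere here:
def select_bitrate(agent, state):
    # state has shape (S_INFO, S_LEN); the actor expects a leading batch dimension
    action_prob = agent.actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
    action_cumsum = np.cumsum(action_prob)
    # sample a bitrate index in proportion to the predicted probabilities
    return (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()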
def agent(agent_id, net_params_queue, exp_queue):
    env = gym.make("CartPole-v0")
    env.force_mag = 100.0

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_agent_' + str(agent_id), 'w') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        time_stamp = 0
        for ep in range(TRAIN_EPOCH):
            obs = env.reset()
            s_batch = []
            a_batch = []
            r_batch = []

            for step in range(TRAIN_SEQ_LEN):
                s_batch.append(obs)

                action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()

                action_vec = np.zeros(A_DIM)
                action_vec[a] = 1
                a_batch.append(action_vec)

                obs, rew, done, info = env.step(a)
                r_batch.append(rew)

                if done:
                    break

            exp_queue.put([s_batch, a_batch, r_batch, done])

            actor_net_params, critic_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
            critic.set_network_params(critic_net_params)

            # log the episode return (sum of rewards), not just the last reward
            log_file.write('epoch ' + str(ep) + ' reward ' + str(np.sum(r_batch)) + ' step ' + str(len(r_batch)) + '\n')
            log_file.flush()
def run(server_class=HTTPServer, port=8333, log_file_path=LOG_FILE):

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    with tf.Session() as sess, open(log_file_path, 'wb') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        init_action = np.zeros(A_DIM)
        init_action[DEFAULT_QUALITY] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [init_action]
        r_batch = []

        train_counter = 0

        last_bit_rate = DEFAULT_QUALITY
        last_total_rebuf = 0
        # need this storage, because the observation only contains the total rebuffering
        # time; we compute the difference to get the per-chunk rebuffering
        video_chunk_count = 0

        input_dict = {'sess': sess, 'log_file': log_file,
                      'actor': actor, 'critic': critic,
                      'saver': saver, 'train_counter': train_counter,
                      'last_bit_rate': last_bit_rate,
                      'last_total_rebuf': last_total_rebuf,
                      'video_chunk_coount': video_chunk_count,
                      's_batch': s_batch, 'a_batch': a_batch, 'r_batch': r_batch}

        # interface to abr_rl server
        handler_class = make_request_handler(input_dict=input_dict)

        server_address = ('192.168.0.101', port)
        httpd = server_class(server_address, handler_class)
        print 'Listening on port ' + str(port)
        httpd.serve_forever()
def __init__(self, mpd, base_url, base_dst, options):
    self.config = Config(mpd, base_url)
    self.quality_rep_map = {}
    self.file_writer = common.FileWriter(base_dst)
    for rep in self.config.reps:
        self.quality_rep_map[rep['bandwidth']] = rep
    self.bitrates = self.quality_rep_map.keys()
    self.bitrates.sort()
    utility_offset = -math.log(self.bitrates[0])  # so utilities[0] = 0
    self.utilities = [math.log(b) + utility_offset for b in self.bitrates]
    self.buffer_size = options.buffer_size * 1000
    self.verbose = options.verbose
    self.segment_time = self.config.reps[0]['dur_s'] * 1000
    self.bandwidth_changerscript_path = options.bandwidth_changerscript_path
    self.player = videoplayer.VideoPlayer(self.segment_time, self.utilities, self.bitrates)

    self.sess = tf.Session()
    self.quality_switch = 0
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.initialize_all_variables())
    self.saver = tf.train.Saver()

    # restore neural net parameters
    self.nn_model = NN_MODEL
    if self.nn_model is not None:  # nn_model is the path to file
        self.saver.restore(self.sess, self.nn_model)
        print("Model restored.")

    self.init_action = np.zeros(A_DIM)
    self.init_action[DEFAULT_QUALITY] = 1

    self.s_batch = [np.zeros((S_INFO, S_LEN))]
    self.a_batch = [self.init_action]
    self.r_batch = []

    self.last_quality = DEFAULT_QUALITY
    self.last_bit_rate = DEFAULT_QUALITY
    # need this storage, because the observation only contains the total rebuffering
    # time; we compute the difference to get the per-chunk rebuffering
    self.last_total_rebuf = 0
    self.video_chunk_count = 0
    self.chunk_fetch_time = 0
    self.chunk_size = 0
    self.ptime = 0
def __init__(self):
    # fill your init vars
    n = 0
    self.BITRATE = [0, 1, 2, 3]
    self.TARGET_BUFFER = [0, 1, 2, 3]
    self.LATENCY_LIMIT = [1, 2, 3, 4]
    self.ACTION_SAPCE = []
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
def Initial(self):
    # Initialize your session or something
    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        initial_vars = []
        initial_vars.append(actor)
        initial_vars.append(critic)
        return initial_vars
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_central', 'w') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        # while True:  # assemble experiences from agents, compute the gradients
        for ep in range(TRAIN_EPOCH):
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        # s_batch=np.vstack(s_batch),
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)
            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len

            log_file.write('Epoch: ' + str(ep) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + '\n')
            log_file.flush()

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward
            })

            writer.add_summary(summary_str, ep)
            writer.flush()

            if ep % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "/nn_model_ep_" + str(ep) + ".ckpt")
def agent(agent_id, net_params_queue, exp_queue):

    net_env = env.Environment(random_seed=agent_id, fixed_env=False, trace_folder=TRAIN_TRACES)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        mask = net_env.video_masks[net_env.video_idx]

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action = bitrate_to_action(bit_rate, mask)
        last_action = action

        action_vec = np.zeros(np.sum(mask))
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever

            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, \
                rebuf, video_chunk_size, end_of_video, \
                video_chunk_remain, video_num_chunks, \
                next_video_chunk_size, mask = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] - VIDEO_BIT_RATE[last_action]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate
            last_action = action

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)
            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert (nxt_chnk_cnt) == np.sum(mask)
            state[6, -A_DIM:] = mask

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            action = bitrate_to_action(bit_rate, mask)

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[action]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action = bitrate_to_action(bit_rate, mask)
                last_action = action

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
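# a3c.compute_entropy(...) is called in these agents but not defined in this file.
# A minimal sketch consistent with how it is used (entropy of a single
# action-probability vector); the actual a3c module may differ:
def compute_entropy(x):
    """Shannon entropy of a 1-D probability vector."""
    H = 0.0
    for i in range(len(x)):
        if 0 < x[i] < 1:
            H -= x[i] * np.log(x[i])
    return H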
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=10000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model == "None":
            epoch = 0
            nn_model = None
        if nn_model is not None:  # nn_model is the path to file
            epoch = int(nn_model.replace("nn_model_ep_", "").split(".ckpt")[0])
            saver.restore(sess, MODEL_DIR + nn_model)
            print("Model restored.")

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                for i in xrange(len(actor_gradient)):
                    assert np.any(np.isnan(actor_gradient[i])) == False

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch, MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue, epoch_queue):

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # 1. initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        epoch_num = epoch_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)  # initialize the action vector (A_DIM actions)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever

            # interact with the environment: the action is from the last decision
            # this is to make the framework similar to the real
            # delay, sleep_time, buffer_size, rebuf, \
            #     video_chunk_size, next_video_chunk_sizes, \
            #     end_of_video, video_chunk_remain = \
            #     net_env.get_video_chunk(bit_rate)

            assert bit_rate >= 0
            assert bit_rate < A_DIM

            bitrate_send_last, lossrate_recv_last, bitrate_real_recovery, \
                bitrate_send_last_probe, lossrate_recv_last_probe, bitrate_real_recovery_probe, \
                end_of_video \
                = net_env.action_dispatch_and_report_svr(VIDEO_BIT_RATE[bit_rate])

            time_stamp += 2

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # print '1', net_env.netbw
            # print '2', bitrate_send_last_probe * (1 - lossrate_recv_last_probe)
            x_function_top = (bitrate_send_last_probe * (1 - lossrate_recv_last_probe) - VIDEO_BIT_RATE[bit_rate]) / M_IN_K
            reward = -x_function_top * x_function_top  # 0.1 0.2 ... 1.1 1.2

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            # state = np.roll(state, -1, axis=1)
            # this should be S_INFO number of terms
            # state[0, -1] = bitrate_send_last / 1000.0  # last quality
            # state[1, -1] = lossrate_recv_last  # loss rate, e.g. 0.1 0.2 0.3 0.4
            # state[2, -1] = bitrate_real_recovery / 1000.0  # kilo byte / ms

            state = np.roll(state, -1, axis=1)
            state[0, -1] = bitrate_send_last_probe / 1000.0  # last quality
            state[1, -1] = lossrate_recv_last_probe  # loss rate, e.g. 0.1 0.2 0.3 0.4
            state[2, -1] = bitrate_real_recovery_probe / 1000.0  # kilo byte / ms
            state[3, :A_DIM] = np.array(VIDEO_BIT_RATE[:]) / 1000.0  # kilo byte / ms
            state[4, -1] = bitrate_send_last / 1000.0  # kilo byte / ms
            # print state[3, :A_DIM]

            # ================== Predict BandWidth =========================
            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(bitrate_send_last) + '\t' + str(lossrate_recv_last) + '\t' + str(bitrate_real_recovery) + '\t' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)
                epoch_num = epoch_queue.get()

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def central_agent(net_params_queues, exp_queues):
    # the arguments are two lists of 16 (process?) queues
    # open a Session {
    #     build the neural networks
    #     build the tf.summary ops (apparently used to monitor data for visualization)
    #     initialize the network parameters, or restore a saved model
    #     loop {
    #         put the network parameters into the queue, once per child agent
    #         reset the accumulators and batches
    #         get the batch data sent by the child agents from the queues,
    #         aggregate it and run the gradient-descent optimizer
    #         write the statistics to file
    #         every fixed number of epochs, save the model
    #     }
    # }

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)  # set up logging(?)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'wb') as test_log_file:

        # create the actor network: arguments are the TF session,
        # [number of state inputs, bandwidth history length], number of actions (bitrate levels) and the learning rate
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        # create the critic network: arguments are the TF session,
        # [number of state inputs, bandwidth history length] and the learning rate
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()  # build summaries (of what?)

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):  # 0-15
                # put the parameters into the queue of each worker process
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):  # 0-15
                # take the batch data out of each worker's queue(?)
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(  # compute the gradients(?)
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])  # accumulate the entropy values from the info dict

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            # write to the log
            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                # run the test on the saved model(?)
                testing(epoch, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):

    net_env = env.Environment(time=all_cooked_time, bandwidth=all_cooked_bw, random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        # need to initialize, and get before simulation step
        track_index = []
        hm = head_movement.move_prediction()

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            # xgw 20180918: need to modify here
            estimate_track_index = hm.get_head_movement_prediction()
            # actual_track_index = hm.get_head_movement_current()
            actual_track_index = [2, 3, 5, 6]

            delay, rebuf, buffer_size, sleep_time, video_chunk_size, end_of_video = \
                net_env.get_video_chunk(bit_rate, estimate_track_index)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # xgw 20180918: need to modify the reward to add the quality consistency
            # in the viewport, and the buffer; the consistency of quality in the
            # viewport is really the head-movement prediction error, so it is not
            # clear whether the "quality consistency" term should be added here,
            # and it is also unclear how to model the QP as the first input
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # bit_rate_log_reward = np.log((bit_rate + 1) / A_DIM) * BIT_RATE_REWARD_PARAMETER
            # smooth_p = np.exp(np.abs(last_bit_rate - bit_rate) / A_DIM) * SMOOTH_PENALTY
            # reward = bit_rate - REBUF_PENALTY * rebuf - smooth_p

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            # state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            # state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            # state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            # state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            # state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            state[0, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 6 sec
            state[2, :4] = np.array(actual_track_index)
            state[3, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last chunk's bitrate

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write('time_stamp: ' + str(time_stamp) + '\t' +
                           'VIDEO_BIT_RATE: ' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           'buffer_size: ' + str(buffer_size) + '\t' +
                           'rebuf: ' + str(rebuf) + '\t' +
                           'video_chunk_size: ' + str(video_chunk_size) + '\t' +
                           'delay: ' + str(delay) + '\t' +
                           'avg throughput: ' + str(video_chunk_size / delay) + '\t' +
                           'reward: ' + str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chuck
                               a_batch[1:],  # since we don't have the
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central', filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=50000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while epoch <= num_epochs:
            # synchronize the network parameters of work agent
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) + ' TD_loss: ' + str(avg_td_loss) + ' Avg_reward: ' + str(avg_reward) + ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt", test_log_file)
def main():

    # utility_offset = -math.log(VIDEO_BIT_RATE[0])  # so utilities[0] = 0
    # utilities = [math.log(b) + utility_offset for b in VIDEO_BIT_RATE]

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    load_trace.plot_bandwidth(all_cooked_time, all_cooked_bw, _)

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw)

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:

        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0
        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_counter, throughput, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)
            # print(net_env.get_video_chunk(bit_rate))

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smooth penalty
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)
                # print(state)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            # print('state', state)

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            rand = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
            print(action_cumsum, action_cumsum > rand, (action_cumsum > rand).argmax())
            # print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE))
            # print((action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax())

            # compute Vp and map it to a bitrate
            # bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            Vp_index = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            Vp = BUFFER_PARAMETER[Vp_index]
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            config = {
                'buffer_size': env.BUFFER_THRESH,
                'gp': GP,
                'Vp': Vp,
                'abr_osc': False,
                'abr_basic': False,
                'no_ibr': False
            }
            bola = get_bitrate.Bola(config=config)
            bit_rate = bola.get_quality(Vp, buffer_size * env.MILLISECONDS_IN_SECOND, last_bit_rate, throughput)

            # information before the decision (arguments ordered to match the labels)
            print('[%d]: download time %.2fms, thrput=%.2f, chunk size %d, buffer=%.2fs, bitrate=%d' %
                  (video_chunk_counter, delay, throughput, video_chunk_size, buffer_size, last_bit_rate))

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n')
            log_file.flush()

            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:  # do training once

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chuck
                                          a_batch=np.vstack(a_batch[1:]),  # since we don't have the
                                          r_batch=np.vstack(r_batch[1:]),  # control over it
                                          terminal=end_of_video, actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                print("====")
                print("Epoch", epoch)
                print("TD_loss", td_loss, "Avg_reward", np.mean(r_batch), "Avg_entropy", np.mean(entropy_record))
                print("====")

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: td_loss,
                    summary_vars[1]: np.mean(r_batch),
                    summary_vars[2]: np.mean(entropy_record)
                })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                entropy_record = []

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:

                    assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # assembled_actor_gradient = actor_gradient_batch[0]
                    # assembled_critic_gradient = critic_gradient_batch[0]
                    # assert len(actor_gradient_batch) == len(critic_gradient_batch)
                    # for i in xrange(len(actor_gradient_batch) - 1):
                    #     for j in xrange(len(actor_gradient)):
                    #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
                    #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
                    # actor.apply_gradients(assembled_actor_gradient)
                    # critic.apply_gradients(assembled_critic_gradient)

                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # Save the neural net parameters to disk.
                        save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                        print("Model saved in file: %s" % save_path)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                # print(bit_rate)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def main():

    env = gym.make("RLALIGN")

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)

        saver = tf.train.Saver()
        saver.restore(sess, saved_MODEL)

        # stepsize = [10, 15, 25, 45, 75, 85, 105, 125, 200, 400, 800]
        # interval = [10, 15, 25, 45, 75, 85, 105, 125, 200, 400, 800]
        stepsize = [10, 20, 50, 100, 200]
        interval = [10, 20, 50, 100, 200]
        FinalAccPlot = []
        GlobalAcc = {}
        for inter in PROB:
            GlobalAcc[str(inter)] = []

        for probability in PROB:
            Acc = {}
            for inter in interval:
                Acc[str(inter)] = []
            print "probability", probability
            AccuracyPlot = []
            for v in stepsize:
                print "current step ", v
                MeanAccuracy = 0
                for eps in xrange(1000):
                    obs = env.reset()
                    testSequence = env.reset()
                    seq = {}
                    for i in range(len(testSequence)):
                        seq[i] = testSequence[i].replace("-", "")
                    GoalAlignment = 0
                    # temp = mafft.mafft_Score(seq, 100)
                    # GoalAlignment = temp[0]
                    temp = NW(seq[0], seq[1])
                    GoalAlignment = temp
                    # print "GoalAlignment Score:", GoalAlignment
                    # print "alignment", temp[1]
                    # print "Test Sequence", testSequence
                    listOfStates = []
                    listOfStates.append(testSequence)
                    # print "Test Sequence", testSequence
                    # print actionSpace
                    for i in listOfStates:
                        count = 1
                        StateToGetAction = env.de_one_hot_encode(env.GetStateVector(i))  # 1*12 shape
                        # print StateToGetAction
                        # StateToGetAction = np.reshape(StateToGetAction, (2, 6))
                        testState = np.reshape(env.GetStateVector(i), [env.noOfRows, env.noOfCols * 5, 1])
                        # print legalStateIndices
                        rew = []
                        while True:
                            # print "Move ", count
                            legalStates = env.getLegalActions(StateToGetAction, env.ActionSpace)
                            # print legalStates
                            legalStateIndices = []
                            for j in legalStates:
                                legalStateIndices.append(env.ActionSpace.index(j))
                            # print legalStateIndices
                            # print "Input State", env.get_sequences_from_state(env.de_one_hot_encode(testState))
                            # print testState.shape
                            testState = np.reshape(testState, [1, testState.shape[0], testState.shape[1], testState.shape[2]])
                            prediction = actor.predict(testState)
                            # print prediction
                            # print "Prediction", prediction
                            # print prediction.shape
                            predictionToUse = []
                            actionPredicted1 = np.argmax(prediction)
                            # print "action Predicted over all actions", env.ActionSpace[actionPredicted1]
                            for k in legalStateIndices:
                                predictionToUse.append(prediction[0][k])
                            prob, actions = env.getProbForActionEpsilonGreedy(predictionToUse, probability)
                            # print actions
                            actionPredicted = legalStateIndices[predictionToUse.index(np.random.choice(actions, p=prob))]
                            # actionPredicted = legalStateIndices[np.argmax(predictionToUse)]
                            # print "best action from legal states " + str(env.ActionSpace[legalStateIndices[np.argmax(predictionToUse)]])
                            # print "action Predicted over legal action", env.ActionSpace[actionPredicted]
                            next_sequence = env.step(testState, actionPredicted, GoalAlignment)
                            # print "Next State", env.get_sequences_from_state(env.de_one_hot_encode(next_sequence[1]))
                            # print "reward", next_sequence[0]
                            rew.append(next_sequence[0])
                            StateToGetAction = env.de_one_hot_encode(next_sequence[1])
                            testState = np.reshape(next_sequence[1], [env.noOfRows, env.noOfCols * 5, 1])
                            count += 1
                            if (next_sequence[0] >= GoalAlignment):
                                Acc[str(v)].append(next_sequence[0] - GoalAlignment)
                                MeanAccuracy += 1
                                break
                            if count == v:
                                Acc[str(v)].append(max(rew) - GoalAlignment)
                                # if (max(rew) > GoalAlignment):
                                #     # print "Wohoo Prediction better than Mafft!"
                                #     MeanAccuracy += 1
                                # elif (max(rew) == GoalAlignment):
                                #     # print "Yaay prediction correct"
                                #     MeanAccuracy += 1
                                # else:
                                #     # print "Reward Abs Diff", abs(GoalAlignment - max(rew))
                                #     res.append(max(rew) - GoalAlignment)
                                break
                print "Average percentage " + str((MeanAccuracy / 1000.0) * 100.0)
                AccuracyPlot.append((MeanAccuracy / 1000.0) * 100.0)
            print AccuracyPlot
            FinalAccPlot.append(AccuracyPlot)
            for inter in interval:
                GlobalAcc[str(probability)].append(Acc[str(inter)])

        '''
        plt.subplot(141)
        plt.title('Random Factor 0%')
        sns.boxplot(data=GlobalAcc[str(PROB[3])])
        plt.ylabel("Alignment Score Difference")
        plt.xlabel("No of Steps")
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(142)
        plt.title('Random Factor 10%')
        sns.boxplot(data=GlobalAcc[str(PROB[2])])
        plt.xlabel("No of Steps")
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(143)
        plt.title('Random Factor 20%')
        plt.xlabel("No of Steps")
        sns.boxplot(data=GlobalAcc[str(PROB[1])])
        plt.xticks(range(0, len(interval), 1), interval)

        plt.subplot(144)
        plt.title('Random Factor 30%')
        '''
        plt.ylabel("Alignment Score Difference", fontsize=25)
        plt.xlabel("Number of Steps", fontsize=25)
        sns.boxplot(data=GlobalAcc[str(PROB[0])])
        plt.xticks(range(0, len(interval), 1), interval, fontsize=20)
        plt.yticks(fontsize=20)
        plt.savefig('MSA_2x8x4.png')
        # plt.clf()
def agent(agent_id, net_params_queue, exp_queue):

    env = gym.make("RLALIGN")

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)
        critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE, Rows=env.noOfRows, Cols=env.noOfCols)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        time_stamp = 0
        for ep in xrange(TRAIN_EPOCH):
            obs = env.reset()
            seq = {}
            for i in range(len(obs)):
                seq[i] = obs[i].replace("-", "")
            GoalAlignment = 0
            # temp = mafft.mafft_Score(seq, agent_id)
            temp = NW(seq[0], seq[1])
            GoalAlignment = temp
            # print seq
            # print temp[1]
            # print GoalAlignment
            stateOriginal = env.GetStateVector(obs)
            stateOriginal = np.reshape(stateOriginal, [env.noOfRows, env.noOfCols * 5, 1])
            seq = env.get_sequences_from_state(env.de_one_hot_encode(stateOriginal))

            s_batch = []
            a_batch = []
            r_batch = []

            for step in xrange(TRAIN_SEQ_LEN):
                s_batch.append(stateOriginal)
                # action_prob = actor.predict(np.reshape(obs, (1, S_DIM)))
                # print env.get_sequences_from_state(env.de_one_hot_encode(stateOriginal))
                stateBuffer = copy.deepcopy(stateOriginal)
                stateBuffer = np.reshape(stateOriginal, [1, stateOriginal.shape[0], stateOriginal.shape[1], stateOriginal.shape[2]])
                StateToGetAction = env.de_one_hot_encode(stateOriginal)
                action_prob = actor.predict(stateBuffer)
                action_cumsum = np.cumsum(action_prob)
                a = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                # a = np.random.choice(A_DIM, p=action_prob[0])
                # action_prob = actor.predict(stateBuffer)
                # print action_prob
                # action_prob[0] = action_prob[0] - np.finfo(np.float32).epsneg
                # histogram = np.random.multinomial(1, action_prob[0])
                # a = int(np.nonzero(histogram)[0])
                # if random.random() < EPSILON:
                #     a = random.randint(0, A_DIM - 1)
                # else:
                #     a = np.random.choice(A_DIM, p=action_prob[0])

                action_vec = np.zeros(A_DIM)
                action_vec[a] = 1
                a_batch.append(action_vec)

                rew, s_, done, info = env.step(stateOriginal, a, GoalAlignment)
                # print rew
                s_ = np.reshape(s_, [s_.shape[0], s_.shape[1], 1])
                stateOriginal = s_
                r_batch.append(rew)

            ind = r_batch.index(max(r_batch))
            rewardDiff = max(r_batch) - GoalAlignment
            reachedGoal = "No"
            if (max(r_batch) >= GoalAlignment):
                reachedGoal = "Yes"
            if not done:
                # ind = r_batch.index(max(r_batch))
                done = True

            exp_queue.put([s_batch[0:ind + 1], a_batch[0:ind + 1], r_batch[0:ind + 1], done])
            log_file.write('seq ' + str(seq) + ' epoch ' + str(ep) + ' reward ' + str(np.sum(r_batch[0:ind + 1])) + ' step ' + str(len(r_batch[0:ind + 1])) + ' AlignmentDiff ' + str(rewardDiff) + ' reachedGoal ' + reachedGoal)
            log_file.write("\n")
            log_file.flush()
            # exp_queue.put([s_batch, a_batch, r_batch, done])

            actor_net_params, critic_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
            critic.set_network_params(critic_net_params)
def main():

    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    # all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)
    cooked_files = os.listdir(TEST_TRACES)

    g1 = tf.Graph()
    with tf.Session(graph=g1) as sess:
        # sess = tf.Session()
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        if not os.path.exists(LOG_FILE):
            os.makedirs(LOG_FILE)

        cooked_files = os.listdir(TEST_TRACES)
        for cooked_file in cooked_files:
            net_env = env.Environment(RANDOM_SEED, cooked_file)

            log_path = LOG_FILE + 'log_sim_rl_' + cooked_file + '_log.txt'
            log_file = open(log_path, 'wb')

            time_stamp = 0

            last_bit_rate = DEFAULT_QUALITY
            bit_rate = DEFAULT_QUALITY

            action_vec = np.zeros(A_DIM)
            action_vec[bit_rate] = 1

            s_batch = [np.zeros((S_INFO, S_LEN))]
            a_batch = [action_vec]
            r_batch = []
            # entropy_record = []

            last_rtt = -1
            last_vmaf = -1

            while True:
                _norm_bitrate = VIDEO_BIT_RATE[bit_rate]
                delay, loss, recv_bitrate, rtt, throughput, limbo_bytes_len = \
                    net_env.get_video_chunk(bit_rate)
                print delay

                if delay is None:
                    log_file.write('\n')
                    print 'Test done', cooked_file
                    log_file.close()

                    last_bit_rate = DEFAULT_QUALITY
                    bit_rate = DEFAULT_QUALITY  # use the default action here

                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]

                    action_vec = np.zeros(A_DIM)
                    action_vec[bit_rate] = 1

                    s_batch.append(np.zeros((S_INFO, S_LEN)))
                    a_batch.append(action_vec)
                    # entropy_record = []
                    break

                rtt = float(rtt) / float(1000)
                if last_rtt < 0:
                    last_rtt = rtt
                _norm_send_bitrate = bit_rate / A_DIM
                _queuing_delay = abs(rtt - last_rtt)
                _norm_recv_bitrate = min(float(recv_bitrate) / delay / BUFFER_NORM_FACTOR, 1.0)

                time_stamp += delay  # in ms

                vmaf = net_env.get_vmaf(bit_rate)
                if last_vmaf < 0:
                    last_vmaf = vmaf

                # _normalized_bitrate = (_norm_bitrate - BITRATE_MIN) / (BITRATE_MAX - BITRATE_MIN)
                _vmaf_reward = (vmaf / _norm_bitrate) * BITRATE_MIN
                reward = \
                    1.0 * vmaf - \
                    0.2 * _norm_send_bitrate - \
                    1.0 / DELAY_GRADIENT_MAX * min(_queuing_delay, DELAY_GRADIENT_MAX) - \
                    1.0 * abs(last_vmaf - vmaf)

                r_batch.append(reward)
                last_vmaf = vmaf
                last_rtt = rtt

                log_file.write(str(time_stamp) + '\t' + str(_norm_bitrate) + '\t' + str(recv_bitrate) + '\t' + str(limbo_bytes_len) + '\t' + str(rtt) + '\t' + str(vmaf) + '\t' + str(reward) + '\n')
                log_file.flush()

                if len(s_batch) == 0:
                    state = [np.zeros((S_INFO, S_LEN))]
                else:
                    state = np.array(s_batch[-1], copy=True)

                # dequeue history record
                state = np.roll(state, -1, axis=1)

                state[0, -1] = _norm_send_bitrate  # last quality
                state[1, -1] = _norm_recv_bitrate  # kilo byte / ms
                state[2, -1] = _queuing_delay  # max: 500ms
                state[3, -1] = float(loss)  # changed loss
                # test: add fft feature
                _fft = np.fft.fft(state[1])
                state[4] = _fft.real
                state[5] = _fft.imag
                state[6] = net_env.predict_vmaf()

                action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
                # print 'state', state[6]
                # print 'action', action_prob[0]
                action_cumsum = np.cumsum(action_prob)
                bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
def main(): np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K r_batch.append(reward) last_bit_rate = bit_rate # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states DECISIONS.append(bit_rate) s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] video_count += 1 if video_count >= len(all_file_names): break log_path = LOG_FILE + '_' + 
all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') print "Decisions: {}".format(Counter(DECISIONS))
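# All of these agents keep the observation as an (S_INFO, S_LEN) matrix and, on
# every step, shift it one column to the left before writing the newest values
# into the right-most column. A minimal sketch of that rolling-history update,
# with illustrative dimensions:
import numpy as np

S_INFO, S_LEN = 6, 8  # assumed state dimensions, as in the Pensieve-style code above

def push_observation(state, observation):
    # Drop the oldest column, shift the rest left, and append the newest
    # per-feature observation as the last column.
    state = np.roll(state, -1, axis=1)
    state[:, -1] = observation
    return state

# state = np.zeros((S_INFO, S_LEN))
# state = push_observation(state, np.arange(S_INFO))  # newest column is now [0, 1, ..., 5]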
def main(): summary_dir = SUMMARY_DIR if not os.path.exists(summary_dir): os.makedirs(summary_dir) log_file_dir = TEST_LOG_FOLDER if not os.path.exists(log_file_dir): os.makedirs(log_file_dir) TOTAL_REWARD_BITRATE = 0.0 TOTAL_REWARD_HD_BITRATE = 0.0 TOTAL_REWARD_REBUF = 0.0 TOTAL_REWARD_SMOOTHNESS = 0.0 TOTAL_REWARD = 0.0 TOTAL_HOTSPOT_CHUNKS = 0.0 np.random.seed(RANDOM_SEED) all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace( TEST_TRACES) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print "Testing model restored." time_stamp = 0 prefetch_decision = DEFAULT_PREFETCH last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[prefetch_decision] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real state_data_for_action = net_env.execute_action(prefetch_decision) # normal chunk state information delay = state_data_for_action['delay'] sleep_time = state_data_for_action['sleep_time'] last_bit_rate = state_data_for_action['last_bit_rate'] play_buffer_size = state_data_for_action['play_buffer_size'] rebuf = state_data_for_action['rebuf'] video_chunk_size = state_data_for_action['video_chunk_size'] next_video_chunk_sizes = state_data_for_action[ 'next_video_chunk_sizes'] end_of_video = state_data_for_action['end_of_video'] video_chunk_remain = state_data_for_action['video_chunk_remain'] current_seq_no = state_data_for_action['current_seq_no'] log_prefetch_decision = state_data_for_action[ 'log_prefetch_decision'] # hotspot chunk state information was_hotspot_chunk = 1.0 * state_data_for_action['was_hotspot_chunk'] TOTAL_HOTSPOT_CHUNKS += was_hotspot_chunk hotspot_chunks_remain = state_data_for_action[ 'hotspot_chunks_remain'] chunks_till_played = state_data_for_action['chunks_till_played'] total_buffer_size = state_data_for_action['total_buffer_size'] last_hotspot_bit_rate = state_data_for_action[ 'last_hotspot_bit_rate'] next_hotspot_chunk_sizes = state_data_for_action[ 'next_hotspot_chunk_sizes'] dist_from_hotspot_chunks = state_data_for_action[ 'dist_from_hotspot_chunks'] smoothness_eval_bitrates = state_data_for_action[ 'smoothness_eval_bitrates'] # abr decision state information normal_bitrate_pensieve = state_data_for_action[ 'normal_bitrate_pensieve'] hotspot_bitrate_pensieve = state_data_for_action[ 'hotspot_bitrate_pensieve'] # print len(next_video_chunk_sizes) # print len(next_hotspot_chunk_sizes) last_overall_bitrate = last_bit_rate if prefetch_decision == 1: last_overall_bitrate = last_hotspot_bit_rate time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward_normal_br = (1.0 - was_hotspot_chunk) * ( VIDEO_BIT_RATE[last_bit_rate] / M_IN_K) * 1.0 reward_hotspot_br = was_hotspot_chunk * HD_REWARD[ 
last_hotspot_bit_rate] * 1.0 reward_rebuffering = REBUF_PENALTY * rebuf * 1.0 reward_smoothness = 0.0 if len(smoothness_eval_bitrates) > 1: for i in xrange(len(smoothness_eval_bitrates) - 1): reward_smoothness += 1.0 * SMOOTH_PENALTY * (1.0 * np.abs( VIDEO_BIT_RATE[smoothness_eval_bitrates[i + 1]] - VIDEO_BIT_RATE[smoothness_eval_bitrates[i]]) / M_IN_K) reward = (1.0 * reward_normal_br) + (1.0 * reward_hotspot_br) - ( 1.0 * reward_rebuffering) - (1.0 * reward_smoothness) TOTAL_REWARD_BITRATE += reward_normal_br TOTAL_REWARD_HD_BITRATE += reward_hotspot_br TOTAL_REWARD_REBUF += reward_rebuffering TOTAL_REWARD_SMOOTHNESS += reward_smoothness TOTAL_REWARD += reward # print "reward before: {}".format(reward) r_batch.append(reward) # print "reward after: {}".format(reward) # log time_stamp, bit_rate, buffer_size, reward if not end_of_video: log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[last_overall_bitrate]) + '\t' + str(play_buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\t' + str(log_prefetch_decision) + '\t' + str(int(was_hotspot_chunk)) + '\t' + str(current_seq_no) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms ## Normal state S_ABR_INFO state[0, -1] = VIDEO_BIT_RATE[last_overall_bitrate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = play_buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :BITRATE_LEVELS] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum( video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) ## Hotspot state S_HOT_INFO state[6, -1] = np.minimum( hotspot_chunks_remain, NUM_HOTSPOT_CHUNKS) / float(NUM_HOTSPOT_CHUNKS) state[7, -1] = np.minimum( chunks_till_played, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) state[8, -1] = total_buffer_size / BUFFER_NORM_FACTOR state[9, -1] = last_hotspot_bit_rate / float(np.max(VIDEO_BIT_RATE)) state[10, :BITRATE_LEVELS] = np.array( next_hotspot_chunk_sizes) / M_IN_K / M_IN_K state[11, :NUM_HOTSPOT_CHUNKS] = ( np.array(dist_from_hotspot_chunks) + CHUNK_TIL_VIDEO_END_CAP) / float(2 * CHUNK_TIL_VIDEO_END_CAP) ## Bitrate actions state S_BRT_INFO state[12, -1] = normal_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE)) state[13, -1] = hotspot_bitrate_pensieve / float( np.max(VIDEO_BIT_RATE)) # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) prefetch_decision = ( action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() # break prefetch_decision = DEFAULT_PREFETCH del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[prefetch_decision] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] video_count += 1 if video_count >= len(all_file_names): break # print "log 
file: {}".format(log_file) # print "Hot chunks: {}".format(TOTAL_HOTSPOT_CHUNKS) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') print "Normal bitrate reward: {}".format(TOTAL_REWARD_BITRATE) print "Hotspot bitrate reward: {}".format(TOTAL_REWARD_HD_BITRATE) print "Rebuffering reward: {}".format(TOTAL_REWARD_REBUF) print "Smoothness reward: {}".format(TOTAL_REWARD_SMOOTHNESS) print "Total reward: {}".format(TOTAL_REWARD) print "Total hotspot chunks: {}".format(int(TOTAL_HOTSPOT_CHUNKS))
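# The testers above all score a chunk with the same linear QoE shape: bitrate
# utility minus a rebuffering penalty minus a smoothness penalty on bitrate
# switches. A compact sketch with illustrative constants (the actual bitrate
# ladder and penalty weights are defined elsewhere in each script):
VIDEO_BIT_RATE = [300, 750, 1200, 1850, 2850, 4300]  # kbps, illustrative ladder
M_IN_K = 1000.0
REBUF_PENALTY = 4.3   # assumed rebuffering weight
SMOOTH_PENALTY = 1.0  # assumed smoothness weight

def linear_qoe(bit_rate, last_bit_rate, rebuf):
    # bit_rate / last_bit_rate index into VIDEO_BIT_RATE; rebuf is seconds stalled.
    return (VIDEO_BIT_RATE[bit_rate] / M_IN_K
            - REBUF_PENALTY * rebuf
            - SMOOTH_PENALTY * abs(VIDEO_BIT_RATE[bit_rate]
                                   - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K)

# print(linear_qoe(bit_rate=3, last_bit_rate=1, rebuf=0.5))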
def agent(agent_id, all_cooked_time, all_cooked_bw, all_file_names, video_size_file, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id, VIDEO_SIZE_FILE=video_size_file, Debug=False) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) bit_rate = DEFAULT_QUALITY target_buffer = DEFAULT_QUALITY latency_limit = 4 index = 1 action_vec = np.zeros(A_DIM) action_vec[index] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 reward_all_sum = 0 reward_all = 0 reward = 0 switch_num = 0 SMOOTH_PENALTY = 0.0 REBUF_PENALTY = 3 LANTENCY_PENALTY = 0.0 BITRATE_REWARD = 0.001 SKIP_PENALTY = 0.0 epoch = 0 n = 0 state = np.array(s_batch[-1], copy=True) frame_time_len = 0.04 last_bit_rate = DEFAULT_QUALITY while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real time, time_interval, send_data_size, chunk_len, \ rebuf, buffer_size, play_time_len, end_delay, \ cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \ buffer_flag, cdn_flag, skip_flag, end_of_video = net_env.get_video_frame(bit_rate, target_buffer, latency_limit) # # QOE setting # if end_delay <= 1.0: # LANTENCY_PENALTY = 0.005 # else: # LANTENCY_PENALTY = 0.01 reward_frame = 0 epoch += 1 if not cdn_flag: reward_frame = frame_time_len * float( BIT_RATE[bit_rate] ) * BITRATE_REWARD - REBUF_PENALTY * rebuf - LANTENCY_PENALTY * end_delay - SKIP_PENALTY * skip_frame_time_len else: reward_frame = -(REBUF_PENALTY * rebuf) reward += reward_frame # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = buffer_size * 0.1 state[1, -1] = send_data_size * 0.00001 state[2, -1] = time_interval * 10 # kilo byte / ms state[3, -1] = end_delay * 0.1 # 10 sec state[4, -1] = rebuf # mega byte if decision_flag and not end_of_video: reward_frame = -1 * SMOOTH_PENALTY * ( abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]) / 1000) reward += reward_frame last_bit_rate = bit_rate r_batch.append(reward) reward = 0 # compute action probability vector action_prob = actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) temp = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE) index = (action_cumsum > temp).argmax() bit_rate = ACTION_SAPCE[index][0] target_buffer = ACTION_SAPCE[index][1] latency_limit = ACTION_SAPCE[index][2] # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN: exp_queue.put([ s_batch[1:], # ignore the first chuck a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = 
net_params_queue.get( ) actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[index] = 1 a_batch.append(action_vec) reward_all += reward_frame # store the state and action into batches if end_of_video: r_batch.append(reward) reward_all_sum += reward_all / 20 video_count += 1 if video_count >= len(all_file_names): n += 1 video_count = 0 print(n, "agent_id ", agent_id, "reward_all_sum:", reward_all_sum) w.writerow([n, reward_all_sum]) out.flush() reward_all_sum = 0 net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=epoch, VIDEO_SIZE_FILE=video_size_file, Debug=False) if n == NUM_EPOCH: break reward_all = 0 reward = 0 switch_num = 0 bit_rate = DEFAULT_QUALITY # use the default action here target_buffer = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec)
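# The training agents above talk to a central coordinator through two
# multiprocessing queues: parameters flow down via net_params_queue and
# experience batches flow back up via exp_queue. A minimal, TensorFlow-free
# skeleton of that handshake (all names and payloads are illustrative):
from multiprocessing import Process, Queue

def worker(agent_id, net_params_queue, exp_queue):
    # Block for the initial parameters, then alternate: collect a batch,
    # ship it up, wait for refreshed parameters -- as the agents above do.
    params = net_params_queue.get()
    for step in range(3):  # a few dummy rounds
        exp_queue.put({'agent': agent_id, 'step': step, 'params_seen': params})
        params = net_params_queue.get()

if __name__ == '__main__':
    net_params_queue, exp_queue = Queue(), Queue()
    p = Process(target=worker, args=(0, net_params_queue, exp_queue))
    p.start()
    params = 'v0'
    for round_idx in range(3):
        net_params_queue.put(params)      # coordinator -> worker
        print(exp_queue.get())            # worker -> coordinator
        params = 'v%d' % (round_idx + 1)  # pretend we trained on the batch
    net_params_queue.put(params)          # release the worker's final get()
    p.join()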
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): # agent id, trace data, and the two queues shared with the coordinator # Summary: first build the environment, then open a Session() { # build the neural networks # (fetch parameters from the central agent to initialize them) # pick the default action, initialize batch[] and entropy[] # loop: { # get the new state from the environment, append it to batch[], pick a new action, log the data to file # once a full batch has accumulated, put it on the multiprocessing Queue (for the central agent to take) # fetch fresh parameters from the central agent and clear the old batch[] data # } # } net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) # per-agent environment parameters with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: # create the actor network: TensorFlow Session, [number of input features, history length], number of outputs (bitrate levels), learning rate actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) # create the critic network: TensorFlow Session, [number of input features, history length], learning rate critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) # [0,0,0,0,0,0] action_vec[bit_rate] = 1 # set the chosen bitrate entry to 1 (one-hot) s_batch = [np.zeros((S_INFO, S_LEN))] # [a 6x8 zero matrix, ...], list of past states a_batch = [action_vec] # [[0,0,0,0,0,0],] r_batch = [] # reward? entropy_record = [] time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) # simulate downloading the next chunk at the chosen bitrate time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # -- log scale reward -- # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1])) # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1])) # reward = log_bit_rate \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # -- HD reward -- # reward = HD_REWARD[bit_rate] \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate]) r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # shift the history window one column to the left # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality (bitrate) state[ 1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec, current buffer size state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms, throughput measurement state[3, -1] = float( delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec, delay / download time state[4, :A_DIM] = np.array( next_video_chunk_sizes ) / M_IN_K / M_IN_K # mega byte, sizes of the next chunk at each bitrate, stored in the first A_DIM columns state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float( CHUNK_TIL_VIDEO_END_CAP) # remaining chunks # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # RAND_RANGE = 1000, defined above # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: exp_queue.put([ s_batch[1:], # ignore the first chunk a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write( '\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
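# Each decision also records a3c.compute_entropy(action_prob[0]) so training can
# monitor how exploratory the policy still is. The a3c helper itself is not shown
# in this file; a plain-NumPy sketch of what such an entropy computation looks like:
import numpy as np

def compute_entropy(action_prob, eps=1e-12):
    # Shannon entropy (in nats) of one action distribution: high while the policy
    # is still exploring, near zero once it commits to a single action.
    p = np.asarray(action_prob, dtype=np.float64)
    return float(-np.sum(p * np.log(p + eps)))

# print(compute_entropy([0.1, 0.6, 0.3]))  # ~0.90 nats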
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'w') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness # reward = \ # VIDEO_BIT_RATE[bit_rate] / M_IN_K \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - # VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # -- log scale reward -- # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1])) #log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1])) #reward = log_bit_rate \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # -- HD reward -- reward = HD_REWARD[bit_rate] \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate]) r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: 
exp_queue.put([s_batch[1:], # ignore the first chuck a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, {'entropy': entropy_record}]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write('\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
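# The coordinator side that consumes exp_queue is not shown in this file. As a
# generic sketch of what typically happens to the pushed tuple, here is how the
# per-chunk rewards can be turned into discounted returns (GAMMA and the
# bootstrap value are assumptions, not values from the code above):
import numpy as np

GAMMA = 0.99  # assumed discount factor

def discounted_returns(r_batch, terminal, gamma=GAMMA, bootstrap_value=0.0):
    # Walk the rewards backwards; when the episode did not terminate, start from
    # a critic-style bootstrap estimate instead of zero.
    R = 0.0 if terminal else bootstrap_value
    returns = np.zeros(len(r_batch))
    for t in reversed(range(len(r_batch))):
        R = r_batch[t] + gamma * R
        returns[t] = R
    return returns

# s_batch, a_batch, r_batch, end_of_video, info = exp_queue.get()
# returns = discounted_returns(r_batch, end_of_video)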
def main(): os.system('rm -r ' + TEST_LOG_FOLDER) os.system('mkdir ' + TEST_LOG_FOLDER) np.random.seed(RANDOM_SEED) all_user_pos, all_file_names = load_trace.load_trace(TEST_TRACES) net_env = fixed_env.Environment(all_user_pos=all_user_pos) log_path = TEST_LOG_FOLDER + 'log_sim_rl_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") # initializing association = one_hot().T num_shared = 50 trace_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real channel_gain, num_user_bs, rate, end_of_trace = \ net_env.scheduling_and_association(association, num_shared) reward = np.mean(np.log(rate)) # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(reward) + '\n') log_file.flush() state_p1 = (channel_gain-np.mean(channel_gain.reshape((-1))))/(np.std(channel_gain.reshape((-1)))+1e-6) state_p2 = ((num_user_bs-np.mean(num_user_bs))/(np.std(num_user_bs)+1e-6)).reshape((7,1)) #state = np.concatenate([state_p1,state_p2],axis = 1) # state shape (7, 91) state = state_p1 # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action = epsilon_greedy(action_prob, 0) # set epsilon to zero when testing association, num_shared = rl_scheduling(channel_gain, action) if end_of_trace: print all_file_names[net_env.trace_idx-1],net_env.scheduling_ptr,'number of shared subchannels:', num_shared, 'SINR threshold:', BETA_SET[np.argmax(action[K_DIM:A_DIM])] #plot_cellular_network(net_env.macrocell, net_env.picocells, net_env.current_user_pos, association) log_file.write('\n') log_file.close() association = one_hot().T num_shared = 50 trace_count += 1 if trace_count >= len(all_file_names): break log_path = TEST_LOG_FOLDER + 'log_sim_rl_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') # append test performance to the log with open(LOG_FILE + '_rl_test', 'ab') as log_file: rewards = [] test_log_files = os.listdir(TEST_LOG_FOLDER) for test_log_file in test_log_files: reward = [] with open(TEST_LOG_FOLDER + test_log_file, 'rb') as f: for line in f: parse = line.split() try: reward.append(float(parse[0])) except IndexError: break rewards.append(np.sum(reward[1:])) rewards = np.array(rewards) rewards_min = np.min(rewards) rewards_5per = np.percentile(rewards, 5) rewards_mean = np.mean(rewards) rewards_median = np.percentile(rewards, 50) rewards_95per = np.percentile(rewards, 95) rewards_max = np.max(rewards) log_file.write(str(rewards_min) + '\t' + str(rewards_5per) + '\t' + str(rewards_mean) + '\t' + str(rewards_median) + '\t' + str(rewards_95per) + '\t' + str(rewards_max) + '\n') log_file.flush() print 'testing results' + '\t average rewards: ' + str(rewards_mean)
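# The wireless-scheduling tester calls epsilon_greedy(action_prob, 0), i.e. pure
# greedy selection at test time, but that helper is not included in this file. A
# hypothetical sketch consistent with how `action` is used above (it is later
# sliced and arg-maxed, so a one-hot vector is returned):
import numpy as np

def epsilon_greedy(action_prob, epsilon):
    # With probability epsilon pick a uniformly random action, otherwise the
    # most probable one; return it one-hot encoded.
    action_prob = np.asarray(action_prob).ravel()
    n_actions = action_prob.shape[0]
    if np.random.rand() < epsilon:
        idx = np.random.randint(n_actions)
    else:
        idx = int(np.argmax(action_prob))
    one_hot = np.zeros(n_actions)
    one_hot[idx] = 1.0
    return one_hot

# action = epsilon_greedy(action_prob, 0)  # epsilon = 0 => always the argmax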
buffer -= rtt return buffer if __name__ == '__main__': argv = sys.argv if len(argv) != 3: print 'Usage: ./dashClient.py [mpdURL] [clientIP]' else: np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() nn_model = NN_MODEL if nn_model is not None: saver.restore(sess, nn_model) # print "Model restored" s_batch = [np.zeros((S_INFO, S_LEN))] url = argv[1] clientIP = argv[2] run_time = time.time() start_time = datetime.datetime.fromtimestamp(run_time) # epoch timestamp => datetime
def main(): np.random.seed(RANDOM_SEED) assert len(BITRATE) == A_DIM all_cooked_time, all_cooked_bw, all_file_names = load.loadBandwidth( TEST_TRACES) player = live_player.Live_Player(time_traces=all_cooked_time, throughput_traces=all_cooked_bw, seg_duration=SEG_DURATION, frag_duration=FRAG_DURATION, chunk_duration=CHUNK_DURATION, start_up_th=USER_START_UP_TH, freezing_tol=USER_FREEZING_TOL, latency_tol=USER_LATENCY_TOL, randomSeed=RANDOM_SEED) server = live_server.Live_Server(seg_duration=SEG_DURATION, frag_duration=FRAG_DURATION, chunk_duration=CHUNK_DURATION, start_up_th=SERVER_START_UP_TH) log_path = LOG_FILE + '_' + all_file_names[player.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters if NN_MODEL is not None: # NN_MODEL is the path to file saver.restore(sess, NN_MODEL) print("Testing model restored.") action_num = DEFAULT_ACTION # 0 last_bit_rate = DEFAULT_ACTION % len(BITRATE) bit_rate = DEFAULT_ACTION % len(BITRATE) playing_speed = NORMAL_PLAYING action_vec = np.zeros(A_DIM) action_vec[action_num] = 1 take_action = 1 latency = 0.0 s_batch = [np.zeros((S_INFO, S_LEN))] state = np.array(s_batch[-1], copy=True) a_batch = [action_vec] r_batch = [] action_reward = 0.0 # Total reward is for all chunks within on segment video_count = 0 starting_time = server.time starting_time_idx = player.time_idx while True: # serve video forever assert len(server.chunks) >= 1 download_chunk_info = server.chunks[0] download_chunk_size = download_chunk_info[2] download_chunk_idx = download_chunk_info[1] download_seg_idx = download_chunk_info[0] server_wait_time = 0.0 sync = 0 real_chunk_size, download_duration, freezing, time_out, player_state = player.fetch( bit_rate, download_chunk_size, download_seg_idx, download_chunk_idx, take_action, playing_speed) # print(freezing, time_out) take_action = 0 past_time = download_duration buffer_length = player.buffer server_time = server.update(past_time) if not time_out: server.chunks.pop(0) sync = player.check_resync(server_time) else: assert player.state == 0 assert np.round(player.buffer, 3) == 0.0 # Pay attention here, how time out influence next reward, the smoothness # Bit_rate will recalculated later, this is for reward calculation bit_rate = 0 sync = 1 if sync: # To sync player, enter start up phase, buffer becomes zero sync_time, missing_count = server.sync_encoding_buffer() player.sync_playing(sync_time) buffer_length = player.buffer latency = server.time - player.playing_time player_state = player.state log_bit_rate = np.log(BITRATE[bit_rate] / BITRATE[0]) log_last_bit_rate = np.log(BITRATE[last_bit_rate] / BITRATE[0]) last_bit_rate = bit_rate # print(log_bit_rate, log_last_bit_rate) reward = ACTION_REWARD * log_bit_rate \ - REBUF_PENALTY * freezing / MS_IN_S \ - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) \ - LONG_DELAY_PENALTY*(LONG_DELAY_PENALTY_BASE**(ReLU(latency-TARGET_LATENCY)/ MS_IN_S)-1) \ - UNNORMAL_PLAYING_PENALTY*(playing_speed-NORMAL_PLAYING)*download_duration/MS_IN_S # - MISSING_PENALTY * missing_count # print(reward) action_reward += reward # chech whether need to wait, using number of available segs if len(server.chunks) == 0: server_wait_time = server.wait() assert server_wait_time > 
0.0 assert server_wait_time < CHUNK_DURATION player.wait(server_wait_time) buffer_length = player.buffer # Establish state for next iteration state = np.roll(state, -1, axis=1) state[0, -1] = BITRATE[bit_rate] / BITRATE[0] # video bitrate state[1, -1] = real_chunk_size / KB_IN_MB # chunk size state[2, -1] = download_duration / MS_IN_S # downloading time state[3, -1] = freezing / MS_IN_S # current freezing time state[4, -1] = latency / MS_IN_S # accu latency from start up state[5, -1] = sync # whether there is resync state[6, -1] = player_state # state of player state[ 7, -1] = server_wait_time / MS_IN_S # time of waiting for server state[8, -1] = buffer_length / MS_IN_S # buffer length # generate next set of seg size # if add this, this will return to environment # next_chunk_size_info = server.chunks[0][2] # not useful # state[7, :A_DIM] = next_chunk_size_info # not useful # print(state) next_chunk_idx = server.chunks[0][1] if next_chunk_idx == 0 or sync: take_action = 1 # print(action_reward) r_batch.append(action_reward) action_reward = 0.0 # If sync, might go to medium of segment, and there is no estimated chunk size next_seg_size_info = [] if sync and not next_chunk_idx == 0: next_seg_size_info = [ 2 * np.sum(x) / KB_IN_MB for x in server.chunks[0][2] ] else: next_seg_size_info = [ x / KB_IN_MB for x in server.chunks[0][3] ] state[9, :A_DIM] = next_seg_size_info action_prob = actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) # print(action_prob) action_num = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() bit_rate = action_num % len(BITRATE) if action_num >= len(BITRATE): playing_speed = FAST_PLAYING else: playing_speed = NORMAL_PLAYING log_file.write( str(server.time) + '\t' + str(BITRATE[last_bit_rate]) + '\t' + str(buffer_length) + '\t' + str(freezing) + '\t' + str(time_out) + '\t' + str(server_wait_time) + '\t' + str(sync) + '\t' + str(missing_count) + '\t' + str(player.state) + '\t' + str(int(action_num / len(BITRATE))) + '\t' + str(reward) + '\n') log_file.flush() if len(r_batch) >= MAX_LIVE_LEN: # need to modify time_duration = server.time - starting_time tp_record = record_tp(player.throughput_trace, starting_time_idx, time_duration) print(starting_time_idx, all_file_names[player.trace_idx], len(player.throughput_trace), player.time_idx, len(tp_record), np.sum(r_batch)) log_file.write('\t'.join(str(tp) for tp in tp_record)) log_file.write('\n' + str(starting_time)) log_file.write('\n') log_file.close() action_num = DEFAULT_ACTION # 0 last_bit_rate = DEFAULT_ACTION % len(BITRATE) bit_rate = DEFAULT_ACTION % len(BITRATE) playing_speed = NORMAL_PLAYING del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[action_num] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) video_count += 1 if video_count >= TEST_TRACE_NUM: break player.test_reset(start_up_th=USER_START_UP_TH) server.test_reset(start_up_th=SERVER_START_UP_TH) # Do not need to append state to s_batch as there is no iteration starting_time = server.time starting_time_idx = player.time_idx log_path = LOG_FILE + '_' + all_file_names[player.trace_idx] log_file = open(log_path, 'wb') take_action = 1 else: if next_chunk_idx == 0 or sync: s_batch.append(state) state = np.array(s_batch[-1], copy=True) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
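# The live-streaming reward above adds an exponential penalty once playback
# latency exceeds a target. A small sketch of just that term, with illustrative
# constants standing in for ReLU, TARGET_LATENCY, LONG_DELAY_PENALTY and
# LONG_DELAY_PENALTY_BASE defined elsewhere in the script:
MS_IN_S = 1000.0
TARGET_LATENCY = 4.0 * MS_IN_S    # assumed target latency, in ms
LONG_DELAY_PENALTY = 1.0          # assumed weight
LONG_DELAY_PENALTY_BASE = 1.2     # assumed base of the exponential

def relu(x):
    return max(x, 0.0)

def long_delay_penalty(latency_ms):
    # Zero at or below the target; grows exponentially with the excess latency
    # (in seconds), matching the LONG_DELAY_PENALTY term in the reward above.
    excess_s = relu(latency_ms - TARGET_LATENCY) / MS_IN_S
    return LONG_DELAY_PENALTY * (LONG_DELAY_PENALTY_BASE ** excess_s - 1.0)

# print(long_delay_penalty(6.5 * MS_IN_S))  # penalty for running 2.5 s behind target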