def work(self):
    global GLOBAL_RUNNING_R, GLOBAL_EP  # GLOBAL_RUNNING_R is the running reward of all workers, GLOBAL_EP is the total iterations of all workers
    total_step = 1  # iterations of this worker
    # Run one simulation step first
    self.clientsExecResult = self.net.updateClientVideo()
    allClientSNR = utils1.get_snr(self.clientsExecResult)
    buffer_s, buffer1_s, buffer2_s, buffer3_s, buffer4_s, \
        buffer_CR_a, buffer1_CR_a, buffer2_CR_a, buffer3_CR_a, buffer4_CR_a, \
        buffer_CR1_r, buffer_CR2_r, buffer_CR3_r, buffer_CR4_r, buffer_CR_r = \
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
    windowInfo = []
    rewardCRList = [[] for _ in range(options.HostNum)]
    # while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
    ep_r_CR = 0  # the total reward of this episode
    while GLOBAL_EP < MAX_GLOBAL_EP:
        while True:
            allClientsAction = {}
            c1_action = {}
            c2_action = {}
            c3_action = {}
            c4_action = {}
            # get the env info
            env, *s_env = utils1.env_state8(self.clientsExecResult)
            feed_dict1 = {self.AC.s1_CR: np.array(env[0]).reshape((-1, ENV_DIMS_new))}
            feed_dict2 = {self.AC.s2_CR: np.array(env[1]).reshape((-1, ENV_DIMS_new))}
            feed_dict3 = {self.AC.s3_CR: np.array(env[2]).reshape((-1, ENV_DIMS_new))}
            feed_dict4 = {self.AC.s4_CR: np.array(env[3]).reshape((-1, ENV_DIMS_new))}
            CR1_prob = SESS.run(self.AC.CR1_prob, feed_dict1)
            CR2_prob = SESS.run(self.AC.CR2_prob, feed_dict2)
            CR3_prob = SESS.run(self.AC.CR3_prob, feed_dict3)
            CR4_prob = SESS.run(self.AC.CR4_prob, feed_dict4)
            c1_CRList, c1_CRList_d = self.AC.choose_CR_p(CR1_prob)
            c2_CRList, c2_CRList_d = self.AC.choose_CR_p(CR2_prob)
            c3_CRList, c3_CRList_d = self.AC.choose_CR_p(CR3_prob)
            c4_CRList, c4_CRList_d = self.AC.choose_CR_p(CR4_prob)
            # CC (link capacity) allocated by the neural network:
            c1_CC = lib.CR_mapping[c1_CRList_d][0] * options.serverCC
            c2_CC = lib.CR_mapping[c2_CRList_d][0] * options.serverCC
            c3_CC = lib.CR_mapping[c3_CRList_d][0] * options.serverCC
            c4_CC = lib.CR_mapping[c4_CRList_d][0] * options.serverCC
            print("CC allocated by the neural network:", "c1:", c1_CC, "\tc2:", c2_CC, "\tc3:", c3_CC, "\tc4:", c4_CC)
            # add buffer info
            capa1_prob = lib.CR_mapping[c1_CRList_d][0]
            env[0][-1] = capa1_prob
            capa2_prob = lib.CR_mapping[c2_CRList_d][0]
            env[1][-1] = capa2_prob
            capa3_prob = lib.CR_mapping[c3_CRList_d][0]
            env[2][-1] = capa3_prob
            capa4_prob = lib.CR_mapping[c4_CRList_d][0]
            env[3][-1] = capa4_prob
            allenv = np.concatenate([env[0], env[1], env[2], env[3]], axis=0)
            buffer_s.append(np.array(allenv))
            buffer1_s.append(np.array(env[0]))
            buffer2_s.append(np.array(env[1]))
            buffer3_s.append(np.array(env[2]))
            buffer4_s.append(np.array(env[3]))
            buffer1_CR_a.append(c1_CRList_d)
            buffer2_CR_a.append(c2_CRList_d)
            buffer3_CR_a.append(c3_CRList_d)
            buffer4_CR_a.append(c4_CRList_d)
            # buffer_CR_a.append(all_CRList_d)
            # Map the CC allocated by the neural network to the real transmission
            # rates according to the router rules (CC_real, type: list)
            disCC = [c1_CC, c2_CC, c3_CC, c4_CC]
            CC_real = utils1.adjust_CC(disCC, allClientSNR)
            c1_action["CC"] = CC_real[0]
            c2_action["CC"] = CC_real[1]
            c3_action["CC"] = CC_real[2]
            c4_action["CC"] = CC_real[3]
            c1_action["RR"] = c1_CRList[1]
            c2_action["RR"] = c2_CRList[1]
            c3_action["RR"] = c3_CRList[1]
            c4_action["RR"] = c4_CRList[1]
            allClientsAction['c1'] = c1_action
            allClientsAction['c2'] = c2_action
            allClientsAction['c3'] = c3_action
            allClientsAction['c4'] = c4_action
            # update env_state according to the real CC and bitrate choices
            self.clientsExecResult = self.net.updateClientVideo(allClientsAction)
            # fetch the SNR dict for the next time step
            allClientSNR = utils1.get_snr(self.clientsExecResult)
            # Use a sliding window to record the info
            windowInfo.append(copy.deepcopy(self.clientsExecResult))
            if len(windowInfo) > 5:
                del windowInfo[0]
            qoe_list, reward_list = utils1.reward_joint2(self.clientsExecResult)
            buffer_CR1_r.append(reward_list[0])
            buffer_CR2_r.append(reward_list[1])
            buffer_CR3_r.append(reward_list[2])
            buffer_CR4_r.append(reward_list[3])
            buffer_CR_r.append(reward_list[-1])  # overall reward
            rewardCRList[0].append(copy.deepcopy(qoe_list[0]))  # reward of client 1
            rewardCRList[1].append(copy.deepcopy(qoe_list[1]))
            rewardCRList[2].append(copy.deepcopy(qoe_list[2]))
            rewardCRList[3].append(copy.deepcopy(qoe_list[3]))
            # print the env info
            if self.isPrint:
                self.printMidInfo(qoe_list, reward_list)
            total_step += 1
            if total_step % UPDATE_GLOBAL_ITER < 0:  # update global and assign to local net
                GLOBAL_EP += 1
                break
            if total_step % UPDATE_GLOBAL_ITER == 0:  # update global and assign to local net
                env, *s_env = utils1.env_state8(self.clientsExecResult)
                feed_dict = {self.AC.s_CR: np.array(env).reshape((-1, 4 * ENV_DIMS_new))}
                CR_v_ = SESS.run(self.AC.CR_v, feed_dict)
                v_.append(CR_v_[0][0])
                # Propagate the discounted value of the next state backwards
                # to obtain the target value of every step
                value = CR_v_[0][0]
                CR_v_target = []
                for r in buffer_CR_r[::-1]:
                    value = r + GAMMA * value
                    CR_v_target.append(value)  # store the target value of every step
                CR_v_target.reverse()
                # print("CR_v_target: ", CR_v_target)
                v_target.append(CR_v_target[0])
                # *****************************************************************************************
                ENV1 = buffer1_s
                ENV2 = buffer2_s
                ENV3 = buffer3_s
                ENV4 = buffer4_s
                ALLENV = buffer_s
                allCR1 = buffer1_CR_a
                allCR2 = buffer2_CR_a
                allCR3 = buffer3_CR_a
                allCR4 = buffer4_CR_a
                # *****************************************************************************************
                feed_dict_A1 = {
                    self.AC.s_CR: ALLENV,   # (?, 32)
                    self.AC.s1_CR: ENV1,    # (?, 8)
                    self.AC.cr1_a: allCR1,  # (?,)  used to compute the actor loss
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 1)),  # (?, 1)
                    self.AC.C2C_var: np.reshape(np.var([CC_real[0], c1_CC]), [-1, 1])
                }
                feed_dict_A2 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s2_CR: ENV2,
                    self.AC.cr2_a: allCR2,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 1)),
                    self.AC.C2C_var: np.reshape(np.var([CC_real[1], c2_CC]), [-1, 1])
                }
                feed_dict_A3 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s3_CR: ENV3,
                    self.AC.cr3_a: allCR3,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 1)),
                    self.AC.C2C_var: np.reshape(np.var([CC_real[2], c3_CC]), [-1, 1])
                }
                feed_dict_A4 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s4_CR: ENV4,
                    self.AC.cr4_a: allCR4,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 1)),
                    self.AC.C2C_var: np.reshape(np.var([CC_real[3], c4_CC]), [-1, 1])
                }
                feed_dict_C = {
                    self.AC.s_CR: ALLENV,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 1))
                }
                # *********************************** Debug ***********************************
                # CR1_A_loss = SESS.run(self.AC.CR1_A_loss, feed_dict_A1)
                # print("-" * 30)
                # print("CR1_A_loss:", CR1_A_loss)
                # CR2_A_loss = SESS.run(self.AC.CR2_A_loss, feed_dict_A2)
                # print("-" * 30)
                # print("CR2_A_loss:", CR2_A_loss)
                # CR3_A_loss = SESS.run(self.AC.CR3_A_loss, feed_dict_A3)
                # print("-" * 30)
                # print("CR3_A_loss:", CR3_A_loss)
                # CR4_A_loss = SESS.run(self.AC.CR4_A_loss, feed_dict_A4)
                # print("-" * 30)
                # print("CR4_A_loss:", CR4_A_loss)
                # CR_C_loss = SESS.run(self.AC.CR_C_loss, feed_dict_C)
                # print("CR_C_loss", CR_C_loss)
                # critic_loss.append(CR_C_loss[0])
                # *********************************** Train ***********************************
                self.AC.update_A1(feed_dict_A1)
                self.AC.update_A2(feed_dict_A2)
                self.AC.update_A3(feed_dict_A3)
                self.AC.update_A4(feed_dict_A4)
                self.AC.update_C(feed_dict_C)
                self.AC.pull_CR()
                rewardCRList = [[] for _ in range(options.HostNum)]
                buffer_s, buffer1_s, buffer2_s, buffer3_s, buffer4_s, \
                    buffer_CR_a, buffer1_CR_a, buffer2_CR_a, buffer3_CR_a, buffer4_CR_a, \
                    buffer_CR1_r, buffer_CR2_r, buffer_CR3_r, buffer_CR4_r, buffer_CR_r = \
                    [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
                if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                    GLOBAL_RUNNING_R.append(reward_list[-1])
                else:
                    GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * reward_list[-1])
                print(
                    self.name,
                    "Ep:", GLOBAL_EP,
                    "| Ep_CR_r: %i" % GLOBAL_RUNNING_R[-1],
                )
                GLOBAL_EP += 1
                break
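# The update step above seeds the return with the critic's value of the latest
# state and propagates it backwards through the reward buffer (the standard
# bootstrapped n-step target used in A3C). A minimal, standalone sketch of that
# computation follows; the helper name `discounted_targets` is illustrative and
# not part of this project.
def discounted_targets(rewards, bootstrap_value, gamma):
    """Return the discounted targets for a list of step rewards, seeded with
    the critic's estimate of the state reached after the last step."""
    targets = []
    running = bootstrap_value
    for r in reversed(rewards):        # walk the buffer backwards in time
        running = r + gamma * running  # v_t = r_t + gamma * v_{t+1}
        targets.append(running)
    targets.reverse()                  # restore chronological order
    return targets

# Example: rewards [0.5, 0.2, 0.1], bootstrap value 1.0 and gamma 0.9 give
# targets [1.31..., 0.91, 1.0] when computed back to front.
# print(discounted_targets([0.5, 0.2, 0.1], 1.0, 0.9))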
def work(self):
    global GLOBAL_RUNNING_R, GLOBAL_EP  # GLOBAL_RUNNING_R is the running reward of all workers, GLOBAL_EP is the total iterations of all workers
    total_step = 1  # iterations of this worker
    # Store the train_data and train_label
    allSNR = [[] for h_index in range(options.HostNum)]
    # Start to simulate the video downloading
    self.clientsExecResult = self.net.updateClientVideo()
    allClientSNR = utils1.unitEnv_uni(self.clientsExecResult)
    for h_index in range(options.HostNum):
        allSNR[h_index] += allClientSNR[h_index].tolist()
    buffer_s, buffer1_s, buffer2_s, buffer3_s, buffer4_s, \
        buffer_CR_a, buffer1_CR_a, buffer2_CR_a, buffer3_CR_a, buffer4_CR_a, \
        buffer_CR1_r, buffer_CR2_r, buffer_CR3_r, buffer_CR4_r, buffer_CR_r = \
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
    windowInfo = []
    rewardCRList = [[] for _ in range(options.HostNum)]
    # while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
    while GLOBAL_EP < MAX_GLOBAL_EP:
        # ep_r = 0  # the total reward of this episode
        while True:
            allClientsAction = {}
            c1_action = {}
            c2_action = {}
            c3_action = {}
            c4_action = {}
            # get the env info
            env, *s_env = utils1.env_state8(self.clientsExecResult)
            feed_dict1 = {self.AC.s1_CR: np.array(env[0]).reshape((-1, ENV_DIMS_new))}
            feed_dict2 = {self.AC.s2_CR: np.array(env[1]).reshape((-1, ENV_DIMS_new))}
            feed_dict3 = {self.AC.s3_CR: np.array(env[2]).reshape((-1, ENV_DIMS_new))}
            feed_dict4 = {self.AC.s4_CR: np.array(env[3]).reshape((-1, ENV_DIMS_new))}
            CR1_prob = SESS.run(self.AC.CR1_prob, feed_dict1)
            CR2_prob = SESS.run(self.AC.CR2_prob, feed_dict2)
            CR3_prob = SESS.run(self.AC.CR3_prob, feed_dict3)
            CR4_prob = SESS.run(self.AC.CR4_prob, feed_dict4)
            c1_CRList, c1_CRList_d = self.AC.choose_CR_p(CR1_prob)
            c2_CRList, c2_CRList_d = self.AC.choose_CR_p(CR2_prob)
            c3_CRList, c3_CRList_d = self.AC.choose_CR_p(CR3_prob)
            c4_CRList, c4_CRList_d = self.AC.choose_CR_p(CR4_prob)
            # Remaining capacity after each client takes its share (sequential allocation)
            capa2_all = options.serverCC - lib.CR_mapping[c1_CRList_d][0] * options.serverCC
            capa3_all = capa2_all - lib.CR_mapping[c2_CRList_d][0] * capa2_all
            capa4_all = capa3_all - lib.CR_mapping[c3_CRList_d][0] * capa3_all
            # add buffer info
            capa1_prob = lib.CR_mapping[c1_CRList_d][0]
            env[0][-1] = capa1_prob
            capa2_prob = (lib.CR_mapping[c2_CRList_d][0] * capa2_all) / options.serverCC
            env[1][-1] = capa2_prob
            capa3_prob = (lib.CR_mapping[c3_CRList_d][0] * capa3_all) / options.serverCC
            env[2][-1] = capa3_prob
            capa4_prob = (lib.CR_mapping[c4_CRList_d][0] * capa4_all) / options.serverCC
            env[3][-1] = capa4_prob
            allenv = np.concatenate([env[0], env[1], env[2], env[3]], axis=0)
            buffer_s.append(np.array(allenv))
            buffer1_s.append(np.array(env[0]))
            buffer2_s.append(np.array(env[1]))
            buffer3_s.append(np.array(env[2]))
            buffer4_s.append(np.array(env[3]))
            # all_CRList_d = np.concatenate([c1_CRList_d, c2_CRList_d, c3_CRList_d, c4_CRList_d], 0)
            buffer1_CR_a.append(c1_CRList_d)
            buffer2_CR_a.append(c2_CRList_d)
            buffer3_CR_a.append(c3_CRList_d)
            buffer4_CR_a.append(c4_CRList_d)
            # buffer_CR_a.append(all_CRList_d)
            # todo
            c1_action["CC"] = lib.CR_mapping[c1_CRList_d][0] * options.serverCC
            c2_action["CC"] = lib.CR_mapping[c2_CRList_d][0] * capa2_all
            c3_action["CC"] = lib.CR_mapping[c3_CRList_d][0] * capa3_all
            c4_action["CC"] = lib.CR_mapping[c4_CRList_d][0] * capa4_all
            c1_action["RR"] = c1_CRList[1]
            c2_action["RR"] = c2_CRList[1]
            c3_action["RR"] = c3_CRList[1]
            c4_action["RR"] = c4_CRList[1]
            allClientsAction['c1'] = c1_action
            allClientsAction['c2'] = c2_action
            allClientsAction['c3'] = c3_action
            allClientsAction['c4'] = c4_action
            print("allAction:", allClientsAction)
            # update env_state according to the CC and resolution choices
            self.clientsExecResult = self.net.updateClientVideo(allClientsAction)
            # Use a sliding window to record the info
            windowInfo.append(copy.deepcopy(self.clientsExecResult))
            if len(windowInfo) > 5:
                del windowInfo[0]
            # compute reward
            # r = utils.reward_window(windowInfo)
            ep_r_CR1, ep_r_CR2, ep_r_CR3, ep_r_CR4 = utils1.reward_joint2(self.clientsExecResult)  # todo: reward_joint3
            ep_r_CR = ep_r_CR1 + ep_r_CR2 + ep_r_CR3 + ep_r_CR4
            buffer_CR1_r.append(ep_r_CR1)
            buffer_CR2_r.append(ep_r_CR2)
            buffer_CR3_r.append(ep_r_CR3)
            buffer_CR4_r.append(ep_r_CR4)
            buffer_CR_r.append(ep_r_CR)
            rewardCRList[0].append(copy.deepcopy(ep_r_CR1))
            rewardCRList[1].append(copy.deepcopy(ep_r_CR2))
            rewardCRList[2].append(copy.deepcopy(ep_r_CR3))
            rewardCRList[3].append(copy.deepcopy(ep_r_CR4))
            capa2_all = options.serverCC - c1_CRList[0] * options.serverCC
            capa3_all = capa2_all - c2_CRList[0] * capa2_all
            capa4_all = capa3_all - c3_CRList[0] * capa3_all
            # print the env info
            if self.isPrint:
                self.printMidInfo()
            if total_step % 1 == 0:
                print("CC_client1: ", lib.CR_mapping[c1_CRList_d][0], lib.CR_mapping[c1_CRList_d][0] * options.serverCC)
                print("CC_client2: ", lib.CR_mapping[c2_CRList_d][0], lib.CR_mapping[c2_CRList_d][0] * capa2_all)
                print("CC_client3: ", lib.CR_mapping[c3_CRList_d][0], lib.CR_mapping[c3_CRList_d][0] * capa3_all)
                print("CC_client4: ", lib.CR_mapping[c4_CRList_d][0], lib.CR_mapping[c4_CRList_d][0] * capa4_all)
                print("-" * 30)
                # print("Reso_client1: ", c1_CRList[1])
                print("Reso_client2: ", c2_CRList[1])
                print("Reso_client3: ", c3_CRList[1])
                print("Reso_client4: ", c4_CRList[1])
            total_step += 1
            if total_step % UPDATE_GLOBAL_ITER < 0:  # update global and assign to local net
                GLOBAL_EP += 1
                break
            if total_step % UPDATE_GLOBAL_ITER == 0:  # update global and assign to local net
                print("GLOBAL_EP:", GLOBAL_EP)
                if self.isPrint:
                    self.printMidInfo()
                env, *s_env = utils1.env_state8(self.clientsExecResult)
                feed_dict = {self.AC.s_CR: np.array(env).reshape((-1, 4 * ENV_DIMS_new))}
                CR_v_ = SESS.run(self.AC.CR_v, feed_dict)
                print("CR_v_: ", CR_v_)
                print("buffer_CR1_r:", buffer_CR1_r)
                print("buffer_CR2_r:", buffer_CR2_r)
                print("buffer_CR3_r:", buffer_CR3_r)
                print("buffer_CR4_r:", buffer_CR4_r)
                # Build a per-client discounted target by propagating each client's
                # bootstrapped value backwards through its reward list
                CR_v_target = [[] for _ in range(options.HostNum)]
                for h_index in range(options.HostNum):
                    reward_CR_client = rewardCRList[h_index][::-1]
                    value = CR_v_[0, h_index]
                    for r in reward_CR_client:
                        value = r + GAMMA * value
                        CR_v_target[h_index].append(value)
                    CR_v_target[h_index].reverse()
                CR_v_target = np.array(CR_v_target).T
                # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                # batch_num = len(buffer_s)
                # allENV = np.array(buffer_s).reshape((batch_num, 4 * ENV_DIMS_new))
                # allCR = np.array(buffer_CR_a).reshape((batch_num, 4))
                # ENV1 = np.array(buffer_s[:0:]).reshape((batch_num, ENV_DIMS_new))
                # ENV2 = np.array(buffer_s[1]).reshape((batch_num, ENV_DIMS_new))
                # ENV3 = np.array(buffer_s[2]).reshape((batch_num, ENV_DIMS_new))
                # ENV4 = np.array(buffer_s[3]).reshape((batch_num, ENV_DIMS_new))
                # *****************************************************************************************
                ENV1 = buffer1_s
                ENV2 = buffer2_s
                ENV3 = buffer3_s
                ENV4 = buffer4_s
                ALLENV = buffer_s
                allCR1 = buffer1_CR_a
                allCR2 = buffer2_CR_a
                allCR3 = buffer3_CR_a
                allCR4 = buffer4_CR_a
                # *****************************************************************************************
                feed_dict_A1 = {
                    self.AC.s_CR: ALLENV,   # (?, 32)
                    self.AC.s1_CR: ENV1,    # (?, 8)
                    self.AC.cr1_a: allCR1,  # (?,)
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 4))  # (?, 4)
                }
                feed_dict_A2 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s2_CR: ENV2,
                    self.AC.cr2_a: allCR2,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 4))
                }
                feed_dict_A3 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s3_CR: ENV3,
                    self.AC.cr3_a: allCR3,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 4))
                }
                feed_dict_A4 = {
                    self.AC.s_CR: ALLENV,
                    self.AC.s4_CR: ENV4,
                    self.AC.cr4_a: allCR4,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 4))
                }
                feed_dict_C = {
                    self.AC.s_CR: ALLENV,
                    self.AC.CR_v_target: np.reshape(CR_v_target, (-1, 4))
                }
                # *********************************** Debug ***********************************
                CR1_A_loss = SESS.run(self.AC.CR1_A_loss, feed_dict_A1)
                print("-" * 30)
                print("CR1_A_loss:", CR1_A_loss)
                CR2_A_loss = SESS.run(self.AC.CR2_A_loss, feed_dict_A2)
                print("-" * 30)
                print("CR2_A_loss:", CR2_A_loss)
                CR3_A_loss = SESS.run(self.AC.CR3_A_loss, feed_dict_A3)
                print("-" * 30)
                print("CR3_A_loss:", CR3_A_loss)
                CR4_A_loss = SESS.run(self.AC.CR4_A_loss, feed_dict_A4)
                print("-" * 30)
                print("CR4_A_loss:", CR4_A_loss)
                CR_C_loss = SESS.run(self.AC.CR_C_loss, feed_dict_C)
                critic_loss.append(CR_C_loss)
                print("CR_C_loss", CR_C_loss)
                # *********************************** Train ***********************************
                time = 3  # todo: number of training passes over this batch
                for _ in range(time):
                    self.AC.train_CR1(feed_dict_A1)
                    self.AC.train_CR2(feed_dict_A2)
                    self.AC.train_CR3(feed_dict_A3)
                    self.AC.train_CR4(feed_dict_A4)
                rewardCRList = [[] for _ in range(options.HostNum)]
                buffer_s, buffer1_s, buffer2_s, buffer3_s, buffer4_s, \
                    buffer_CR_a, buffer1_CR_a, buffer2_CR_a, buffer3_CR_a, buffer4_CR_a, \
                    buffer_CR1_r, buffer_CR2_r, buffer_CR3_r, buffer_CR4_r, buffer_CR_r = \
                    [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
                GLOBAL_EP += 1
                if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                    GLOBAL_RUNNING_R.append(ep_r_CR)
                else:
                    GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r_CR)
                print(
                    self.name,
                    "Ep:", GLOBAL_EP,
                    "| Ep_CR_r: %i" % GLOBAL_RUNNING_R[-1],
                )
                break
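# The second worker splits the server capacity sequentially: client 1 takes a
# fraction of options.serverCC, client 2 a fraction of what remains, and so on
# (the capa2_all / capa3_all / capa4_all chain above). A minimal sketch of that
# allocation rule, assuming `shares` holds the per-client fractions read from
# lib.CR_mapping; the helper name `sequential_allocation` is illustrative only.
def sequential_allocation(server_cc, shares):
    """Return each client's absolute capacity when every client takes its
    fraction of whatever capacity is left after the previous clients."""
    remaining = server_cc
    allocations = []
    for share in shares:
        cc = share * remaining    # this client's absolute capacity
        allocations.append(cc)
        remaining -= cc           # the next client divides what is left
    return allocations

# Example: serverCC = 100 and shares [0.5, 0.5, 0.5, 1.0] yield
# [50.0, 25.0, 12.5, 12.5]; capacity is fully used only if the last share is 1.0.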