reward = reward_step
if stepIdx > 100:
    s, a, r = PG.store_transition(observation_step, action, reward)
if stepIdx % 6 == 0 and stepIdx > 100:
    PG.learn()
for k in range(len(observation)):
    ss = observation[k].copy()
    ss.extend(matrixOfChanAlloc.copy().reshape(
        1, nOfenb * nOfchannel).tolist()[0])  # request features + channel occupancy
    # print(ss)
    observation_step = np.array(ss).reshape(
        nOfenb * nOfchannel + 4, 1).ravel()
    print("observation_step: ", observation_step)
    if observation_step[1] > 0:  # RNTI > 0, i.e. a valid request
        action = PG.choose_action1(observation_step, matrixOfChanAlloc, stepIdx)
        if action < 12:
            action_list.append(observation_step[0])
            action_list.append(observation_step[1])
            action_list.append(action)
        else:
            action_list.append(0)
            action_list.append(0)
            action_list.append(0)
    reward = 0
    if k == len(observation) - 1 or observation[k + 1][1] == 0:  # end of the big step
        d = ()
        for b in range(len(action_list)):
            d += (spaces.Discrete(int(action_list[b])), )
        action_ = spaces.Tuple(d)
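The listing treats PG as a policy-gradient agent exposing choose_action1, store_transition, and learn, but the agent class itself is not shown. The sketch below is only a minimal stand-in for that interface, assuming a REINFORCE-style update with a linear softmax policy; the class name PolicyGradientSketch, the hyper-parameters, and all internals are illustrative assumptions, not the original implementation.

# Minimal sketch of the agent interface used above (assumed, not the original class).
import numpy as np

class PolicyGradientSketch:
    def __init__(self, n_features, n_actions, lr=0.01, gamma=0.95):
        self.n_features, self.n_actions = n_features, n_actions
        self.lr, self.gamma = lr, gamma
        self.W = np.zeros((n_features, n_actions))     # linear softmax policy weights
        self.ep_obs, self.ep_acts, self.ep_rewards = [], [], []

    def _softmax(self, logits):
        z = logits - logits.max()
        e = np.exp(z)
        return e / e.sum()

    def choose_action1(self, observation, chan_alloc, extra):
        # Sample an action from the softmax policy; chan_alloc and the third
        # argument (stepIdx or the eNB id in the listings) could be used to
        # mask occupied channels, which is omitted in this sketch.
        probs = self._softmax(np.asarray(observation, dtype=float) @ self.W)
        return int(np.random.choice(self.n_actions, p=probs))

    def store_transition(self, s, a, r):
        # Buffer one transition; the listing unpacks the stored triple.
        self.ep_obs.append(np.asarray(s, dtype=float))
        self.ep_acts.append(int(a))
        self.ep_rewards.append(float(r))
        return s, a, r

    def learn(self):
        # Discounted, normalized returns, then one REINFORCE gradient step.
        returns = np.zeros(len(self.ep_rewards))
        running = 0.0
        for t in reversed(range(len(self.ep_rewards))):
            running = self.ep_rewards[t] + self.gamma * running
            returns[t] = running
        if returns.std() > 0:
            returns = (returns - returns.mean()) / returns.std()
        for s, a, g in zip(self.ep_obs, self.ep_acts, returns):
            probs = self._softmax(s @ self.W)
            grad_log = -np.outer(s, probs)   # -s_i * pi(j|s) for every action j
            grad_log[:, a] += s              # +s_i for the taken action
            self.W += self.lr * g * grad_log
        self.ep_obs, self.ep_acts, self.ep_rewards = [], [], []

Under these assumptions, the agent in the listing would be created as, e.g., PolicyGradientSketch(n_features=nOfenb * nOfchannel + 4, n_actions=nOfenb * nOfchannel); the real network architecture and dimensions are defined elsewhere in the original code.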
ss = []
for a in observation:
    for b in a:
        ss.append(b)
ss.extend(matrixOfChanAlloc.copy().reshape(
    1, nOfenb * nOfchannel).tolist()[0])  # requests + channel occupancy
observation_step = np.array(ss).reshape(
    nOfenb * nOfchannel + sizeperq * len(observation), 1).ravel()  # reshape to the dimension expected by the network input
print("observation_step: ", observation_step)
if observation_step[k * sizeperq + 1] > 0:  # RNTI > 0, i.e. a valid request
    action = PG.choose_action1(observation_step, matrixOfChanAlloc,
                               observation[k][0])  # select an action
    if action < nOfchannel:  # valid action
        observation[k][4] = action  # update the state
        addaction(observation[k][0], observation[k][1], action,
                  action_list)  # store the allocation decision in action_list
    else:
        addaction(0, 0, 0, action_list)  # empty action
reward = 0  # reward stays 0 while the episode has not ended
if stepIdx > 100 and k < numue - 1:  # after stepIdx exceeds 100, start learning and storing transitions
    s, a, r = PG.store_transition(
        observation_step, action + observation[k][0] * nOfchannel, reward)
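addaction is not defined in the excerpt. Judging from the earlier version, which appends observation_step[0], observation_step[1], and action directly to action_list, it most likely just records one (eNB id, RNTI, allocated channel) triple. The helper below is a guess at that behaviour, together with the flat action index action + enb_id * nOfchannel used when the transition is stored; both decode_action and the exact meaning of the first two fields are assumptions, not the original code.

# Assumed helper: append one (eNB id, RNTI, allocated channel) triple to the
# flat action_list, mirroring the three appends in the earlier version.
def addaction(enb_id, rnti, channel, action_list):
    action_list.append(enb_id)
    action_list.append(rnti)
    action_list.append(channel)

# The transition is stored with a single flat index per (eNB, channel) pair,
#   flat = channel + enb_id * nOfchannel,
# which can be decoded back with divmod (hypothetical helper for illustration):
def decode_action(flat, nOfchannel):
    enb_id, channel = divmod(flat, nOfchannel)
    return enb_id, channel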