def __init__(self, id, config):
    """ Prioritized experience replay buffer initialization. """
    self.id = id
    self.config = config
    self.tree = SumTree(self.config.buffer_size)
    self.buffer_size = self.config.buffer_size
    self.batch_size = self.config.batch_size
    self.alpha = self.config.alpha
    self.experience = namedtuple(
        "Experience",
        field_names=["state", "action", "reward", "next_state", "done"])
    self.seed = random.seed(self.config.seed)
    self.step_lock = Lock()
    self.sample_lock = Lock()
    self.sample_lock.acquire()  # allow save_step first
    self.eps = EPSILON
    self.beta = 0.6
    self.beta_increment_per_sampling = 2. / float(self.config.total_step)
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "PER_REPLAY_MEMORY_CONFIG", self.config)
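# Added sketch (not part of the original source): how the alpha, beta, and eps
# fields above are typically combined in prioritized experience replay.
# Priorities are raised to the power alpha to form sampling probabilities, and
# importance-sampling weights use the exponent beta. The function name and
# arguments below are illustrative only.
import numpy as np

def per_probs_and_weights(priorities, alpha, beta, eps):
    """Return PER sampling probabilities and max-normalized IS weights."""
    p = (np.abs(np.asarray(priorities, dtype=float)) + eps) ** alpha
    probs = p / p.sum()                    # P(i) = p_i^alpha / sum_k p_k^alpha
    weights = (len(p) * probs) ** (-beta)  # w_i = (N * P(i))^(-beta)
    return probs, weights / weights.max()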
def step(self, action):
    total_reward = 0.0
    rewards = []
    total_rb = 0.0
    for u in range(self.config.n_ue):
        a_n_rb = self.tmp_state[u].n_rb
        GLOBAL_LOGGER.get_tb_logger().add_scalar('required_n_rb.' + str(u), a_n_rb, self.n_step)
        if action[u] == 1 and self.tmp_state[u].q_length:
            total_rb += a_n_rb
    if total_rb <= self.config.ue_config.channel.total_n_rb:
        total_rb = self.config.ue_config.channel.total_n_rb
    for u in range(self.config.n_ue):
        if action[u] == 1 and self.tmp_state[u].q_length > 0:
            n_rb = round(float(self.tmp_state[u].n_rb) / total_rb * self.config.ue_config.channel.total_n_rb)
            GLOBAL_LOGGER.get_tb_logger().add_scalar('A_NRB_' + str(u), n_rb, self.n_step)
            r = self.ue_list[u].step(UE_RB_ACTION(n_rb))
            total_reward += r
            GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD_' + str(u), r, self.n_step)
        else:
            self.ue_list[u].step(UE_RB_ACTION(0))
            r = 0
            GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD_' + str(u), 0, self.n_step)
        rewards.append(r)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('ENV_REWARD', total_reward, self.n_step)
    return np.array(rewards, dtype=float)
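# Added worked example (not part of the original source) of the proportional RB
# split performed above: if scheduled UEs request 20, 30, and 50 RBs but only
# 50 RBs are available, total_rb = 100 and each UE receives
# round(request / 100 * 50) RBs. The helper below is an illustrative standalone
# version, not a function from this repository.
def split_rbs_example(requests, total_n_rb):
    total = float(sum(requests))
    total = max(total, total_n_rb)  # when demand fits, every UE gets its full request
    return [round(r / total * total_n_rb) for r in requests]

# split_rbs_example([20, 30, 50], 50) -> [10, 15, 25]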
def run(self):
    rewards_his = np.zeros(self.config.n_ue)
    total_reward_his = 0
    for e in range(self.config.n_episode):
        self.init_env()
        for t in range(self.config.n_step):
            state = self.get_state()
            action = self.agent.get_action(state)
            action_ = np.copy(action)
            if self.config.action_conversion_f is None:
                for u in range(self.config.n_ue):
                    if action_[u] > 1.:
                        action_[u] = 1.
            else:
                action_ = self.config.action_conversion_f(action_)
            rewards = self.step(action_)
            rewards_his = 0.99 * rewards_his + 0.01 * rewards
            for i in range(self.config.n_ue):
                GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD.moving_avg.' + str(i), rewards_his[i], self.n_step)
            total_reward = np.sum(rewards)
            total_reward_his = 0.99 * total_reward_his + 0.01 * total_reward
            GLOBAL_LOGGER.get_tb_logger().add_scalar('ENV_REWARD.moving_avg', total_reward_his, self.n_step)
            next_state = self.get_state()
            done = 0
            if t == self.config.n_step - 1:
                done = 1
            self.agent.save_step(state, action, rewards, next_state, done)
def step(self, action):
    '''
    :param action: 0 or 1 for binary Tx, 0~1 for the percentage of RBs
    :return: per-UE rewards as a numpy array
    '''
    total_reward = 0.0
    rewards = []
    total_rb_pct = 0.0
    for u in range(self.config.n_ue):
        a_n_rb = action[u] * self.config.ue_config.channel.total_n_rb
        GLOBAL_LOGGER.get_tb_logger().add_scalar('required_n_rb.' + str(u), a_n_rb, self.n_step)
        total_rb_pct += action[u]
    if total_rb_pct < 1.:
        total_rb_pct = 1.
    for u in range(self.config.n_ue):
        if int(action[u] * self.config.ue_config.channel.total_n_rb) > 0 and self.tmp_state[u].q_length > 0:
            n_rb = round(float(action[u]) / total_rb_pct * self.config.ue_config.channel.total_n_rb)
            GLOBAL_LOGGER.get_tb_logger().add_scalar('A_NRB_' + str(u), n_rb, self.n_step)
            r = self.ue_list[u].step(UE_RB_ACTION(n_rb))
            total_reward += r
            GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD_' + str(u), r, self.n_step)
        else:
            self.ue_list[u].step(UE_RB_ACTION(0))
            r = 0
            GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD_' + str(u), 0, self.n_step)
        rewards.append(r)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('ENV_REWARD', total_reward, self.n_step)
    return np.array(rewards, dtype=float)
def __init__(self, id, config):
    LearningModel.__init__(self)
    self.id = id
    self.config = config
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "DDPG_CONFIG", self.config)
    self.actor = None
    self.actor_optim = None
    self.critic = None
    self.critic_optim = None
    self.actor_target = None
    self.critic_target = None
    self.init_model()
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "actor_target_arch", self.actor_target)
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "critic_target_arch", self.critic_target)
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "actor_arch", self.actor)
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "critic_arch", self.critic)
    self.step_counter = 0
    if USE_CUDA:
        self.move_nn_to_gpu()
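# Added sketch (not part of the original source): init_model() above builds the
# actor, critic, their optimizers, and the target networks; its body is not
# shown in this excerpt. A common initialization step, given here only as an
# assumption, is to hard-copy the online weights into the freshly created
# target networks:
def hard_copy_sketch(target_net, source_net):
    """Copy every parameter of source_net into target_net (target init)."""
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(s_param.data)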
def get_action(self, state):
    """
    Get the action from the actor.
    :param state: env state in np
    :return: action in np
    """
    # if logic is present, the agent will use logic as the actor
    with self.actor_lock:
        if self.scheduler_function is None:
            state = to_tensor(state)
            ret = self.actor.forward(state)
            ret = to_numpy(ret)
        else:
            ret = self.scheduler_function.forward(state)
    noise_factor = math.exp(-self.n_step * self.config.noise_attenuation)
    if self.action_noise:
        GLOBAL_LOGGER.get_tb_logger().add_scalar(
            "NOISE_FACTOR", noise_factor, self.n_step)
        ret = self.action_noise.add_noise(ret, noise_factor)
    for a in range(len(ret)):
        GLOBAL_LOGGER.get_tb_logger().add_scalar(
            "ACTION_" + str(a), ret[a], self.n_step)
    return ret
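# Added sketch (not part of the original source): how the exponentially decaying
# noise_factor above is typically applied to the action for exploration. The
# original agent uses OUActionNoise; plain Gaussian noise is used here only as a
# stand-in, and sigma is a hypothetical parameter.
import math
import numpy as np

def noisy_action_sketch(action, n_step, noise_attenuation, sigma=0.2):
    """Scale exploration noise by exp(-n_step * noise_attenuation) and add it."""
    factor = math.exp(-n_step * noise_attenuation)
    return np.asarray(action) + factor * sigma * np.random.randn(*np.shape(action))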
def __init__(self, id, config, agent, learning_model, replay_memory):
    Thread.__init__(self)
    self.id = id
    self.config = config
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "CONTROLLER_CONFIG", config)
    self.agent = agent
    self.model = learning_model
    self.replay_memory = replay_memory
    self.agent.update_actor(self.model.get_actor())
    if isinstance(self.agent, Thread):
        print("agent is a thread; starting it")
        self.agent.start()
    self.step = 0
def run(self):
    for x in range(self.config.total_step):
        batch = self.replay_memory.sample()
        if batch is not None:
            training_info = self.model.step(batch)
            if training_info is not None:
                self.replay_memory.training_info(batch, training_info)
            self.agent.update_actor(self.model.get_actor())
        if x % 5000 == 0:
            output_file_path = GLOBAL_LOGGER.get_log_path()
            self.model.save(output_file_path, str(x))
            GLOBAL_LOGGER.reset_event_file()
    output_file_path = GLOBAL_LOGGER.get_log_path()
    self.model.save(output_file_path, 'final')
def __init__(self, id, config, replay_memory, scheduler_function=None):
    StatusObject.__init__(self)
    self.id = id
    self.env_id = None
    self.replay_memory = replay_memory
    self.actor = None
    self.actor_lock = Lock()
    self.config = config
    self.scheduler_function = scheduler_function
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "AGENT_CONFIG", self.config)
    if isinstance(self.config.noise_config, OU_ACTION_NOISE_CONFIG):
        self.action_noise = OUActionNoise(self.id, self.config.noise_config)
    else:
        self.action_noise = None
def __init__(self, id, config):
    """Initialize a ReplayBuffer object.

    Params
    ======
        buffer_size (int): maximum size of buffer
        batch_size (int): size of each training batch
    """
    self.id = id
    self.config = config
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object(
        "REPLAY_MEMORY_CONFIG", self.config)
    self.memory = deque(
        maxlen=self.config.buffer_size)  # internal memory (deque)
    self.experience = namedtuple(
        "Experience",
        field_names=["state", "action", "reward", "next_state", "done"])
    self.seed = random.seed(self.config.seed)
    self.step_lock = Lock()
    self.sample_lock = Lock()
    self.sample_lock.acquire()  # allow save_step first
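# Added sketch (not part of the original source): one way sample() could draw a
# uniform minibatch from the deque above and stack it into arrays. The actual
# method of this class is not shown in this excerpt; the helper below is
# illustrative only.
import random
import numpy as np

def sample_sketch(memory, batch_size):
    """Draw a uniform random minibatch of Experience tuples and stack the fields."""
    experiences = random.sample(memory, k=batch_size)
    states = np.vstack([e.state for e in experiences])
    actions = np.vstack([e.action for e in experiences])
    rewards = np.vstack([e.reward for e in experiences])
    next_states = np.vstack([e.next_state for e in experiences])
    dones = np.vstack([e.done for e in experiences]).astype(np.uint8)
    return states, actions, rewards, next_states, dones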
def run(self):
    rewards_his = np.zeros(self.config.n_ue)
    total_reward_his = 0
    for e in range(self.config.n_episode):
        self.init_env()
        for t in range(self.config.n_step):
            state = self.get_state()
            action = self.agent.get_action(state)
            action_ = np.copy(action)
            phi = self.get_phi()
            if self.config.action_conversion_f is None:
                action_[action_ > 0.] = 1
                action_[action_ <= 0.] = 0
            else:
                action_ = self.config.action_conversion_f(action_)
            rewards = self.step(action_)
            rewards_his = 0.99 * rewards_his + 0.01 * rewards
            for i in range(self.config.n_ue):
                GLOBAL_LOGGER.get_tb_logger().add_scalar('UE_REWARD.moving_avg.' + str(i), rewards_his[i], self.n_step)
            total_reward = np.sum(rewards)
            total_reward_his = 0.99 * total_reward_his + 0.01 * total_reward
            GLOBAL_LOGGER.get_tb_logger().add_scalar('ENV_REWARD.moving_avg', total_reward_his, self.n_step)
            next_state = self.get_state()
            phi_next = self.get_phi()
            shaper = -1. * (phi - self.gamma * phi_next)
            for i in range(self.config.n_ue):
                GLOBAL_LOGGER.get_tb_logger().add_scalar('shaper.' + str(i), shaper[i], self.n_step)
            done = 0
            if t == self.config.n_step - 1:
                done = 1
            self.agent.save_step(state, action, rewards + shaper, next_state, done)
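# Added note (not part of the original source): the shaper above is the
# potential-based shaping term
#     F(s, s') = gamma * phi(s') - phi(s),
# which modifies only the reward signal, not the optimal policy. A standalone
# version of the same computation:
def shaping_term(phi, gamma, phi_next):
    """Potential-based shaping, computed elementwise per UE."""
    return -1. * (phi - gamma * phi_next)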
def run(self):
    for x in range(self.config.total_step):
        t1 = time.time()
        batch = self.replay_memory.sample(asynchronization=True)
        if batch is not None:
            training_info = self.model.step(batch)
            if training_info is not None:
                self.replay_memory.training_info(batch, training_info)
            self.agent.update_actor(self.model.get_actor())
            GLOBAL_LOGGER.get_tb_logger().add_scalar(
                "training_time", time.time() - t1, x)
        else:
            time.sleep(0.01)
        if x % 5000 == 0:
            output_file_path = GLOBAL_LOGGER.get_log_path()
            self.model.save(output_file_path, str(x))
            GLOBAL_LOGGER.reset_event_file()
    output_file_path = GLOBAL_LOGGER.get_log_path()
    self.model.save(output_file_path, 'final')
def step(self, batch):
    self._print("learn")
    states = to_tensor(batch[0])
    actions = to_tensor(batch[1])
    rewards = self._reward(to_tensor(batch[2]), states)
    next_states = to_tensor(batch[3])
    done = to_tensor(batch[4])
    self._print("states", batch[0])
    self._print("actions", batch[1])

    # critic target: y = r + gamma * Q'(s', mu'(s'))
    a = self.actor_target.forward(next_states)
    a = self._action_match(a)
    s_a = torch.cat((next_states, a), dim=1)
    q = self.critic_target.forward(s_a)
    y = torch.mul(q, self.config.rl_config.gamma)
    self._print("gamma", self.config.rl_config.gamma)
    self._print("rewards", rewards)
    self._print("q", q)
    y = torch.add(rewards, y).detach()
    self._print("y", y)

    # critic loss
    actions = self._action_match(actions)
    s_a = torch.cat((states, actions), dim=1)
    q = self.critic.forward(s_a)
    l_critic = F.smooth_l1_loss(q, y, reduction='none')
    self._print("loss", l_critic)
    l_critic_per_batch = torch.sum(l_critic, dim=1, keepdim=True)
    self._print('l_critic_per_batch', l_critic_per_batch)

    # per-sample errors returned for prioritized replay
    ret_per_e = to_numpy(l_critic)
    ret_per_e = ret_per_e * self._per_w_multiplier(batch)
    self._print('ret_per_e_full', ret_per_e)
    ret_per_e = np.sum(ret_per_e, axis=1, keepdims=True)
    self._print('ret_per_e', ret_per_e)

    # importance-sampling weights, if supplied with the batch
    if len(batch) > 5:
        weights = to_tensor(batch[5])
        self._print("weights", weights)
        l_critic = torch.mul(l_critic_per_batch, weights)
        self._print("w_l_critic", l_critic)
    l_critic = torch.mean(l_critic)
    self.critic_optim.zero_grad()
    l_critic.backward()
    self.critic_optim.step()

    # actor loss: maximize Q(s, mu(s))
    a = self.actor.forward(states)
    s_a = torch.cat((states, a), dim=1)
    l_actor = self.critic.forward(s_a)
    l_actor_per_batch = torch.sum(l_actor, dim=1, keepdim=True)
    if len(batch) > 5:
        weights = to_tensor(batch[5])
        self._print("weights", weights)
        l_actor = torch.mul(l_actor_per_batch, weights)
        self._print("w_l_actor", l_actor)
    l_actor = torch.mean(torch.neg(l_actor))
    self.actor_optim.zero_grad()
    l_actor.backward()
    self.actor_optim.step()

    GLOBAL_LOGGER.get_tb_logger().add_scalar('DDPG.loss_actor', to_numpy(l_actor), self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('DDPG.loss_critic', to_numpy(l_critic), self.n_step)

    self.update_nn()
    self.step_counter += 1
    return ret_per_e
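# Added sketch (not part of the original source): update_nn() above refreshes
# the target networks, but its body is not shown in this excerpt. A common
# choice is a soft (Polyak) update with a small mixing factor; tau here is a
# hypothetical parameter name.
def soft_update_sketch(target_net, source_net, tau=0.001):
    """target <- tau * source + (1 - tau) * target, parameter by parameter."""
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)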
drl_c = exp_drl_config(2, int(1e9))
drl_c.actor_lr = 1e-4
drl_c.critic_lr = 1e-4

dir = os.path.dirname(os.path.abspath(__file__))
dir = os.path.join(dir, 'example_nn')
drl_c.actor_load_path = os.path.join(dir, 'actor_0.pt')
drl_c.critic_load_path = os.path.join(dir, 'critic_0.pt')
drl_c.reload_config()
assert isinstance(drl_c.actor_config.af_config[-1], nn.modules.Tanh)

log_path = os.path.dirname(os.path.realpath(__file__))
folder_name = "online/"
experiment_name = "online_example"
GLOBAL_LOGGER.set_log_path(log_path, folder_name, experiment_name)

rm = SimReplayMemory(0, drl_c.replay_memory_config)

nf = None
# def noise_f(p, t_now, t_start):
#     return p + 0.1 * p * np.random.randn() * math.exp(- (t_now - t_start) / 60)
#
# nf = noise_f

rt_agent = RTAgent(0, None, '0.0.0.0', 4000, drl_c.actor_config, GLOBAL_LOGGER, nf)
rt_agent.set_replay_memory(rm)

ddpg = MultiHeadCriticDDPG(0, drl_c.ddpg_config)
def __init__(self, id, config, agent):
    Thread.__init__(self)
    SimEnv.__init__(self, id, config, agent)
    GLOBAL_LOGGER.get_tb_logger().add_text_of_object("ENV_CONFIG", self.config)
from sim_src.sim_env.sim_agent import SimAgent
from sim_src.sim_env.sim_env import SimEnvTxBinary_RewardShaping
from sim_src.tb_logger import GLOBAL_LOGGER

env_c = env_config_helper()
drl_c = ddpg_config_helper(env_c.N_UE, env_c.N_STEP * env_c.N_EPISODE)
env_c.reload_config()
drl_c.reload_config()

import os

log_path = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
folder_name = "tb-data"
experiment_name = "ka"
GLOBAL_LOGGER.set_log_path(log_path, folder_name, experiment_name)

scalar_list = []
scalar = 'TX_DELAY_'
scalar_list.append(scalar)
scalar = 'N_RLCTX_'
scalar_list.append(scalar)
scalar = 'N_DISCARD_'
scalar_list.append(scalar)
scalar = 'RLC_REWARD_'
scalar_list.append(scalar)
scalar = 'N_CH_TX_OK_'
scalar_list.append(scalar)
class TBServer(SctpServer):
    def __init__(self, server_bind_ip, server_bind_port, logger):
        SctpServer.__init__(self, 'TBServer', server_bind_ip, server_bind_port, 100)
        self.logger = logger
        self.tb_client_listener_thread_list = []

    def connection_handler(self, conn):
        print('TBServer get conn from', conn)
        c = TBClientListener(conn, self.logger)
        c.start()
        self.tb_client_listener_thread_list.append(c)


if __name__ == '__main__':
    from sim_src.tb_logger import GLOBAL_LOGGER

    GLOBAL_LOGGER.set_log_path('/tmp/aaaaa/', 'test_tensor_board_server', 'test_tensor_board_server')
    t = scalar()
    t.tti = 1213
    t.name = 'hello'
    ts = Timestamp()
    t.timestamp.seconds = 10
    t.timestamp.nanos = 112310
    print(t.timestamp.ToMicroseconds())
    print(t.name)
    print(t.timestamp)
    print(t.tti)
    print(t.value)
    server = TBServer(server_bind_ip='127.0.1.100',
                      server_bind_port=TENSORBOARD_SERVER_PORT,
                      logger=GLOBAL_LOGGER.get_tb_logger())
def step(self, action):
    ret = 0.
    n_txed = 0
    if action.tx:
        ret = self.get_hol_reward()
        if self.queue:
            GLOBAL_LOGGER.get_tb_logger().add_scalar(
                'TX_DELAY_' + str(self.id), self.get_hol(), self.n_step)
            n_txed = 1
            self.pop()
    GLOBAL_LOGGER.get_tb_logger().add_scalar('N_RLCTX_' + str(self.id), n_txed, self.n_step)
    n_discard = self.push()
    GLOBAL_LOGGER.get_tb_logger().add_scalar('N_PACKET_' + str(self.id), self.n_packet, self.n_step)
    # assuming the packet arrives at the end of the last TTI
    self.time_step += 1
    n_discard += self.discard()
    GLOBAL_LOGGER.get_tb_logger().add_scalar('N_DISCARD_' + str(self.id), n_discard, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('RLC_REWARD_' + str(self.id), ret, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('HOL_' + str(self.id), self.get_hol(), self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('Qsize_' + str(self.id), self.get_n_byte_total(), self.n_step)
    return ret
def step(self, action):
    err = 0.
    if action.n_rb > 0:
        err = tx_error_rate_for_n_bytes(action.n_byte, action.n_rb,
                                        db_to_dec(self.get_snr_db()),
                                        self.config.T_f, self.config.rb_bw)
        if action.n_rb >= self.config.total_n_rb and err < 1e-5:
            err = 1e-5
        if err < 1e-5:
            ret = 5.
        else:
            ret = -math.log10(err)
    else:
        ret = 0.
    n_successful_tx = 1
    if p_true(err):
        n_successful_tx = 0
    GLOBAL_LOGGER.get_tb_logger().add_scalar('NRB_' + str(self.id), action.n_rb, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('SNR_' + str(self.id), self.get_snr_db(), self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('E_' + str(self.id), err, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('DIS_' + str(self.id), self.dis, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('CH_REWARD_' + str(self.id), ret, self.n_step)
    GLOBAL_LOGGER.get_tb_logger().add_scalar('N_CH_TX_OK_' + str(self.id), n_successful_tx, self.n_step)
    self.change_position()
    return float(n_successful_tx)
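# Added sketches (not part of the original source): db_to_dec() and p_true() are
# helpers defined elsewhere in the repository. The versions below show the
# behaviour assumed in the code above (dB-to-linear conversion and a Bernoulli
# draw); they are illustrative, not copied from the original source.
import random

def db_to_dec_sketch(x_db):
    """Convert a dB value to linear scale: 10 ** (x / 10)."""
    return 10.0 ** (x_db / 10.0)

def p_true_sketch(p):
    """Return True with probability p (clamped to [0, 1])."""
    return random.random() < min(max(p, 0.0), 1.0)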
# DRL-5G-Scheduler; Author: Zhouyou Gu ([email protected]);
# Supervisors: Wibowo Hardjawana; Branka Vucetic;
# This project is developed at the Centre for IoT and Telecommunications at The University of Sydney,
# under a project directly funded by Telstra Corporation Ltd., titled
# "Development of an Open Programmable Scheduler for LTE Networks", from 2018 to 2019.
# Reference: Z. Gu, C. She, W. Hardjawana, S. Lumb, D. McKechnie, T. Essery, and B. Vucetic,
# "Knowledge-assisted deep reinforcement learning in 5G scheduler design:
# From theoretical framework to implementation," IEEE JSAC, to appear, 2021.

from exp_src.tensorboard_server import *
from sim_src.tb_logger import GLOBAL_LOGGER

log_path = "./"
folder_name = "tb_server_log/"
experiment_name = "tb_server_log"
GLOBAL_LOGGER.set_log_path(log_path, folder_name, experiment_name)

server = TBServer(server_bind_ip='0.0.0.0',
                  server_bind_port=TENSORBOARD_SERVER_PORT,
                  logger=GLOBAL_LOGGER.get_tb_logger())
server.start()
server.join()