from copy import deepcopy

import numpy.random as rd
import torch


class AgentDQN:
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        self.explore_rate = 0.1  # the probability of choosing an action randomly (epsilon-greedy)
        self.action_dim = action_dim
        self.state = None  # set for self.update_buffer(); initialize self.state before training
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.act = QNet(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam(self.act.parameters(), lr=learning_rate)

    def select_actions(self, states):  # for a discrete action space
        if rd.rand() < self.explore_rate:  # epsilon-greedy
            a_int = rd.randint(self.action_dim, size=(len(states),))  # choose actions randomly
        else:
            states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
            actions = self.act(states)
            a_int = actions.argmax(dim=1).detach().cpu().numpy()
        return a_int

    def update_buffer(self, env, buffer, max_step, reward_scale, gamma):
        for _ in range(max_step):
            action = self.select_actions((self.state,))[0]
            next_s, reward, done, _ = env.step(action)

            other = (reward * reward_scale, 0.0 if done else gamma, action)  # action is an int
            buffer.append_memo(self.state, other)

            self.state = env.reset() if done else next_s
        return max_step

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        next_q = obj_critic = None
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size)  # next_s: next state
                next_q = self.act_target(next_s).max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q
            q_eval = self.act(state).gather(1, action.type(torch.long))
            obj_critic = self.criterion(q_eval, q_label)

            self.optimizer.zero_grad()
            obj_critic.backward()
            self.optimizer.step()

            soft_target_update(self.act_target, self.act, tau=5e-3)
        return next_q.mean().item(), obj_critic.item()
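# --- Sketch of the two helpers the class above relies on but this snippet does not define. ---
# QNet and soft_target_update live elsewhere in the code base, so the versions below are only
# a minimal, plausible reconstruction (an assumption, not the original definitions): a plain
# MLP that maps a state to one Q-value per discrete action, and a Polyak (soft) target update.
import torch.nn as nn


class QNet(nn.Module):  # assumed architecture: state -> one Q-value per discrete action
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, action_dim))

    def forward(self, state):
        return self.net(state)  # shape: (batch_size, action_dim)


def soft_target_update(target_net, current_net, tau=5e-3):
    # Polyak averaging: target <- tau * current + (1 - tau) * target
    for tar, cur in zip(target_net.parameters(), current_net.parameters()):
        tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))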
class AgentDQN(AgentBase):
    def __init__(self):
        super().__init__()
        self.explore_rate = 0.1  # the probability of choosing an action randomly (epsilon-greedy)
        self.action_dim = None  # set in init(); discrete actions are drawn randomly in epsilon-greedy

    def init(self, net_dim, state_dim, action_dim):  # explicitly call self.init() for multiprocessing
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNet(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)  # start the target network as an exact copy
        self.act = self.cri  # to keep the same interface as the Actor-Critic framework
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

    def select_action(self, state) -> int:  # for a discrete action space
        if rd.rand() < self.explore_rate:  # epsilon-greedy
            a_int = rd.randint(self.action_dim)
        else:
            states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
            action = self.act(states)[0]
            a_int = action.argmax().cpu().numpy()
        return a_int

    def explore_env(self, env, buffer, target_step, reward_scale, gamma) -> int:
        for _ in range(target_step):
            action = self.select_action(self.state)
            next_s, reward, done, _ = env.step(action)

            other = (reward * reward_scale, 0.0 if done else gamma, action)  # action is an int
            buffer.append_buffer(self.state, other)
            self.state = env.reset() if done else next_s
        return target_step

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        q_value = obj_critic = None
        for _ in range(int(target_step * repeat_times)):
            obj_critic, q_value = self.get_obj_critic(buffer, batch_size)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)
        return q_value.mean().item(), obj_critic.item()

    def get_obj_critic(self, buffer, batch_size) -> (torch.Tensor, torch.Tensor):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_q = self.cri_target(next_s).max(dim=1, keepdim=True)[0]  # max Q-value of the next state
            q_label = reward + mask * next_q
        q_value = self.cri(state).gather(1, action.type(torch.long))
        obj_critic = self.criterion(q_value, q_label)
        return obj_critic, q_value
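# --- Sketch of the AgentBase interface the subclass above assumes. ---
# AgentBase is not shown in this snippet; the attributes below are only the ones the DQN
# subclass actually touches (learning_rate, soft_update_tau, state, device, criterion,
# soft_update, and the network/optimizer slots), so treat this as an assumption about its
# interface rather than the original base class.
import torch


class AgentBase:
    def __init__(self):
        self.learning_rate = 1e-4
        self.soft_update_tau = 2 ** -8  # ~5e-3
        self.state = None   # the training loop initializes this before explore_env()
        self.device = None  # set in the subclass's init()

        self.act = self.cri = self.cri_target = None
        self.cri_optimizer = None
        self.criterion = torch.nn.SmoothL1Loss()

    @staticmethod
    def soft_update(target_net, current_net, tau):
        # Polyak averaging: target <- tau * current + (1 - tau) * target
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))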
class AgentDQN:
    def __init__(self):
        super().__init__()
        self.explore_rate = 0.1  # the probability of choosing an action randomly (epsilon-greedy)
        self.action_dim = None  # set in init(); discrete actions are drawn randomly in epsilon-greedy
        self.learning_rate = 1e-4
        self.soft_update_tau = 2 ** -8  # 5e-3 ~= 2 ** -8
        self.state = None  # set for self.store_transition(); initialize before training
        self.device = None

        self.cri = self.cri_target = None
        self.act = self.cri  # to keep the same interface as the Actor-Critic framework
        self.criterion = None
        self.optimizer = None

    def init(self, net_dim, state_dim, action_dim):  # explicitly call self.init() for multiprocessing
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNet(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.act = self.cri  # to keep the same interface as the Actor-Critic framework

        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

    def select_actions(self, states):  # for a discrete action space
        if rd.rand() < self.explore_rate:  # epsilon-greedy
            a_int = rd.randint(self.action_dim, size=(len(states),))  # choose actions randomly
        else:
            states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
            actions = self.act(states)
            a_int = actions.argmax(dim=1).detach().cpu().numpy()
        return a_int

    def store_transition(self, env, buffer, target_step, reward_scale, gamma):
        for _ in range(target_step):
            action = self.select_actions((self.state,))[0]
            next_s, reward, done, _ = env.step(action)

            other = (reward * reward_scale, 0.0 if done else gamma, action)  # action is an int
            buffer.append_buffer(self.state, other)

            self.state = env.reset() if done else next_s
        return target_step

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        next_q = obj_critic = None
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(batch_size)  # next_s: next state
                next_q = self.cri_target(next_s).max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q
            q_eval = self.cri(state).gather(1, action.type(torch.long))
            obj_critic = self.criterion(q_eval, q_label)

            self.optimizer.zero_grad()
            obj_critic.backward()
            self.optimizer.step()

            self.soft_update(self.cri_target, self.cri)
        return next_q.mean().item(), obj_critic.item()

    def soft_update(self, target_net, current_net):
        # Polyak averaging: target <- tau * current + (1 - tau) * target
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * self.soft_update_tau + tar.data * (1 - self.soft_update_tau))
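# --- Minimal usage sketch for the class above. ---
# The replay buffer below and the CartPole loop are assumptions for illustration only:
# SimpleReplayBuffer is a hypothetical stand-in that matches the calls made by
# store_transition()/update_net() (append_buffer, update__now_len__before_sample,
# sample_batch), and the loop assumes the classic 4-tuple gym step API.
import numpy as np
import torch


class SimpleReplayBuffer:
    def __init__(self, max_len, state_dim, device):
        self.device = device
        self.max_len = max_len
        self.now_len = 0
        self.next_idx = 0
        self.buf_state = np.empty((max_len, state_dim), dtype=np.float32)
        self.buf_other = np.empty((max_len, 3), dtype=np.float32)  # (reward, mask, action)

    def append_buffer(self, state, other):
        self.buf_state[self.next_idx] = state
        self.buf_other[self.next_idx] = other
        self.next_idx = (self.next_idx + 1) % self.max_len
        self.now_len = min(self.now_len + 1, self.max_len)

    def update__now_len__before_sample(self):
        pass  # now_len is already kept up to date in append_buffer()

    def sample_batch(self, batch_size):
        # Simplified: the next state is the consecutively stored state[idx + 1]; at episode
        # boundaries mask == 0.0, so that stale "next state" does not affect the Q-target.
        idx = np.random.randint(self.now_len - 1, size=batch_size)
        other = torch.as_tensor(self.buf_other[idx], device=self.device)
        return (other[:, 0:1],  # reward
                other[:, 1:2],  # mask: 0.0 at episode end, else gamma
                other[:, 2:3],  # action (stored as float, cast to long by the agent)
                torch.as_tensor(self.buf_state[idx], device=self.device),       # state
                torch.as_tensor(self.buf_state[idx + 1], device=self.device))   # next state


if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v0')
    state_dim = env.observation_space.shape[0]

    agent = AgentDQN()
    agent.init(net_dim=2 ** 7, state_dim=state_dim, action_dim=env.action_space.n)
    agent.state = env.reset()

    buffer = SimpleReplayBuffer(max_len=2 ** 12, state_dim=state_dim, device=agent.device)
    for _ in range(2 ** 6):  # alternate data collection and learning
        agent.store_transition(env, buffer, target_step=2 ** 7, reward_scale=1.0, gamma=0.99)
        q_avg, critic_loss = agent.update_net(buffer, max_step=2 ** 7, batch_size=2 ** 6, repeat_times=1)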