class SACAgent: def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.action_range = [env.action_space.low, env.action_space.high] self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] # hyperparameters self.gamma = gamma self.tau = tau self.update_step = 0 self.delay_step = 2 # initialize networks self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device) self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device) self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) # copy params to target param for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(param) # initialize optimizers self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr) self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) self.replay_buffer = BasicBuffer(buffer_maxlen) def get_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) mean, log_std = self.policy_net.forward(state) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() return self.rescale_action(action) def rescale_action(self, action): return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ (self.action_range[1] + self.action_range[0]) / 2.0 def update(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) next_actions, next_log_pi = self.policy_net.sample(next_states) next_q1 = self.q_net1(next_states, next_actions) next_q2 = self.q_net2(next_states, next_actions) next_v = self.target_value_net(next_states) # value Loss next_v_target = torch.min(next_q1, next_q2) - next_log_pi curr_v = self.value_net.forward(states) v_loss = F.mse_loss(curr_v, next_v_target.detach()) # q loss curr_q1 = self.q_net1.forward(states, actions) curr_q2 = self.q_net2.forward(states, actions) expected_q = rewards + (1 - dones) * self.gamma * next_v q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # update value network and q networks self.value_optimizer.zero_grad() v_loss.backward() self.value_optimizer.step() self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() #delayed update for policy net and target value nets if self.update_step % self.delay_step == 0: new_actions, log_pi = self.policy_net.sample(states) min_q = torch.min(self.q_net1.forward(states, new_actions), self.q_net2.forward(states, new_actions)) policy_loss = (log_pi - min_q).mean() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # target networks for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) self.update_step += 1
class SACAgent(): def __init__(self, env: object, gamma: float, tau: float, buffer_maxlen: int, critic_lr: float, actor_lr: float, reward_scale: int): # Selecting the device to use, wheter CUDA (GPU) if available or CPU self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # Creating the Gym environments for training and evaluation self.env = env # Get max and min values of the action of this environment self.action_range = [ self.env.action_space.low, self.env.action_space.high ] # Get dimension of of the state and the action self.obs_dim = self.env.observation_space.shape[0] self.action_dim = self.env.action_space.shape[0] # hyperparameters self.gamma = gamma self.tau = tau self.critic_lr = critic_lr self.actor_lr = actor_lr self.buffer_maxlen = buffer_maxlen self.reward_scale = reward_scale # Scaling and bias factor for the actions -> We need scaling of the actions because each environment has different min and max values of actions self.scale = (self.action_range[1] - self.action_range[0]) / 2.0 self.bias = (self.action_range[1] + self.action_range[0]) / 2.0 # initialize networks self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) # copy weight parameters to the target Q networks for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(param) # initialize optimizers self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=self.critic_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=self.critic_lr) self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.actor_lr) # Create a replay buffer self.replay_buffer = BasicBuffer(self.buffer_maxlen) def update(self, batch_size: int): # Sampling experiences from the replay buffer states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) # Convert numpy arrays of experience tuples into pytorch tensors states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = self.reward_scale * torch.FloatTensor(rewards).to( self.device) # in SAC we do reward scaling for the sampled rewards next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) # Critic update (computing the loss) # Please refer to equation (6) in the paper for details # Sample actions for the next states (s_t+1) using the current policy next_actions, next_log_pi, _, _ = self.policy.sample( next_states, self.scale) next_actions = self.rescale_action(next_actions) # Compute Q(s_t+1,a_t+1) by giving the states and actions to the Q network and choose the minimum from 2 target Q networks next_q1 = self.target_q_net1(next_states, next_actions) next_q2 = self.target_q_net2(next_states, next_actions) min_q = torch.min(next_q1, next_q2) # find minimum between next_q1 and next_q2 # Compute the next Q_target (Q(s_t,a_t)-alpha(next_log_pi)) next_q_target = (min_q - next_log_pi) # Compute the Q(s_t,a_t) using s_t and a_t from the replay buffer curr_q1 = self.q_net1.forward(states, actions) curr_q2 = self.q_net2.forward(states, actions) # Find expected Q, i.e., r(t) + gamma*next_q_target expected_q = rewards + (1 - dones) * self.gamma * next_q_target # Compute loss between Q network and expected Q q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # Backpropagate the losses and update Q network parameters self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() # Policy update (computing the loss) # Sample new actions for the current states (s_t) using the current policy new_actions, log_pi, _, _ = self.policy.sample(states, self.scale) new_actions = self.rescale_action(new_actions) # Compute Q(s_t,a_t) and choose the minimum from 2 Q networks new_q1 = self.q_net1.forward(states, new_actions) new_q2 = self.q_net2.forward(states, new_actions) min_q = torch.min(new_q1, new_q2) # Compute the next policy loss, i.e., alpha*log_pi - Q(s_t,a_t) eq. (7) policy_loss = (log_pi - min_q).mean() # Backpropagate the losses and update policy network parameters self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # Updating target networks with soft update using update rate tau for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) def get_action( self, state: np.ndarray, stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]: # state: the state input to the pi network # stochastic: boolean (True -> use noisy action, False -> use noiseless (deterministic action)) state = torch.FloatTensor(state).unsqueeze(0).to(self.device) # Get mean and sigma from the policy network mean, log_std = self.policy.forward(state) std = log_std.exp() # Stochastic mode is used for training, non-stochastic mode is used for evaluation if stochastic: normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() else: normal = Normal(mean, 0) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() # return a rescaled action, and also the mean and standar deviation of the action # we use a rescaled action since the output of the policy network is [-1,1] and the mujoco environments could be ranging from [-n,n] where n is an arbitrary real value return self.rescale_action(action), mean, std def rescale_action(self, action: np.ndarray) -> np.ndarray: # we use a rescaled action since the output of the policy network is [-1,1] and the mujoco environments could be ranging from [-n,n] where n is an arbitrary real value # scale -> scalar multiplication # bias -> scalar offset return action * self.scale[0] + self.bias[0] def Actor_save(self, WORKSPACE: str): # save 각 node별 모델 저장 print("Save the torch model") savePath = WORKSPACE + "./policy_model5_Hop_.pth" torch.save(self.policy.state_dict(), savePath) def Actor_load(self, WORKSPACE: str): # save 각 node별 모델 로드 print("load the torch model") savePath = WORKSPACE + "./policy_model5_Hop_.pth" # Best self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy.load_state_dict(torch.load(savePath))
class DecoupledWorker(mp.Process): def __init__(self, id, env, gamma, global_value_network, global_policy_network, global_value_optimizer, global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE): super(DecoupledWorker, self).__init__() self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.name = "w%i" % id self.env = env self.env.seed(id) self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.n self.gamma = gamma self.local_value_network = ValueNetwork(self.obs_dim, 1) self.local_policy_network = PolicyNetwork(self.obs_dim, self.action_dim) self.global_value_network = global_value_network self.global_policy_network = global_policy_network self.global_episode = global_episode self.global_value_optimizer = global_value_optimizer self.global_policy_optimizer = global_policy_optimizer self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE # sync local networks with global networks self.sync_with_global() def get_action(self, state): state = torch.FloatTensor(state).to(self.device) logits = self.local_policy_network.forward(state) dist = F.softmax(logits, dim=0) probs = Categorical(dist) return probs.sample().cpu().detach().item() def compute_loss(self, trajectory): states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) actions = torch.LongTensor([sars[1] for sars in trajectory ]).view(-1, 1).to(self.device) rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) next_states = torch.FloatTensor([sars[3] for sars in trajectory ]).to(self.device) dones = torch.FloatTensor([sars[4] for sars in trajectory ]).view(-1, 1).to(self.device) # compute value target discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. value_targets = rewards.view( -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to( self.device) # compute value loss values = self.local_value_network.forward(states) value_loss = F.mse_loss(values, value_targets.detach()) # compute policy loss with entropy bonus logits = self.local_policy_network.forward(states) dists = F.softmax(logits, dim=1) probs = Categorical(dists) # compute entropy bonus entropy = [] for dist in dists: entropy.append(-torch.sum(dist.mean() * torch.log(dist))) entropy = torch.stack(entropy).sum() advantage = value_targets - values policy_loss = -probs.log_prob(actions.view(actions.size(0))).view( -1, 1) * advantage.detach() policy_loss = policy_loss.mean() - 0.001 * entropy return value_loss, policy_loss def update_global(self, trajectory): value_loss, policy_loss = self.compute_loss(trajectory) self.global_value_optimizer.zero_grad() value_loss.backward() # propagate local gradients to global parameters for local_params, global_params in zip( self.local_value_network.parameters(), self.global_value_network.parameters()): global_params._grad = local_params._grad self.global_value_optimizer.step() self.global_policy_optimizer.zero_grad() policy_loss.backward() # propagate local gradients to global parameters for local_params, global_params in zip( self.local_policy_network.parameters(), self.global_policy_network.parameters()): global_params._grad = local_params._grad #print(global_params._grad) self.global_policy_optimizer.step() def sync_with_global(self): self.local_value_network.load_state_dict( self.global_value_network.state_dict()) self.local_policy_network.load_state_dict( self.global_policy_network.state_dict()) def run(self): state = self.env.reset() trajectory = [] # [[s, a, r, s', done], [], ...] episode_reward = 0 while self.global_episode.value < self.GLOBAL_MAX_EPISODE: action = self.get_action(state) next_state, reward, done, _ = self.env.step(action) trajectory.append([state, action, reward, next_state, done]) episode_reward += reward if done: with self.global_episode.get_lock(): self.global_episode.value += 1 print(self.name + " | episode: " + str(self.global_episode.value) + " " + str(episode_reward)) self.update_global(trajectory) self.sync_with_global() trajectory = [] episode_reward = 0 state = self.env.reset() else: state = next_state
class SACAgent: def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.action_range = [0, 250] self.obs_dim = env.state_dim self.action_dim = env.action_dim # hyperparameters self.gamma = gamma self.tau = tau self.update_step = 0 self.delay_step = 2 # initialize networks self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) # copy params to target param for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(param) # initialize optimizers self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) # entropy temperature self.alpha = alpha self.target_entropy = -torch.prod( torch.Tensor([self.action_dim, 1]).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr) self.replay_buffer = BasicBuffer(buffer_maxlen) def get_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) mean, log_std = self.policy_net.forward(state) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() return self.rescale_action(action) def rescale_action(self, action): return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ (self.action_range[1] + self.action_range[0]) / 2.0 def update(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) states = torch.FloatTensor(states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) next_actions, next_log_pi = self.policy_net.sample(next_states) next_q1 = self.target_q_net1(next_states, next_actions) next_q2 = self.target_q_net2(next_states, next_actions) next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi expected_q = rewards + (1 - dones) * self.gamma * next_q_target # q loss curr_q1 = self.q_net1.forward(states, actions) curr_q2 = self.q_net2.forward(states, actions) q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # update q networks self.q1_optimizer.zero_grad() q1_loss.backward() self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward() self.q2_optimizer.step() # delayed update for policy network and target q networks new_actions, log_pi = self.policy_net.sample(states) if self.update_step % self.delay_step == 0: min_q = torch.min(self.q_net1.forward(states, new_actions), self.q_net2.forward(states, new_actions)) policy_loss = (self.alpha * log_pi - min_q).mean() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() # target networks for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) # update temperature alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() self.update_step += 1
class DRTRPOAgent(): """ DR TRPO """ def __init__(self, env, gamma, lr): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.n self.gamma = gamma self.lr = lr self.value_network = ValueNetwork(self.obs_dim, 1) self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim) self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr) self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr) def get_action(self, state): state = torch.FloatTensor(state).to(self.device) logits = self.policy_network.forward(state) dist = logits probs = Categorical(dist) return probs.sample().cpu().detach().item() def compute_adv_mc(self, trajectory): """ Compute the advantage of all (st,at) in trajectory. The advantage is estimated using MC: i.e. discounted reward sum (from trajectory) - value (from NN) """ states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) actions = torch.LongTensor([sars[1] for sars in trajectory ]).view(-1, 1).to(self.device) rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) next_states = torch.FloatTensor([sars[3] for sars in trajectory ]).to(self.device) dones = torch.FloatTensor([sars[4] for sars in trajectory ]).view(-1, 1).to(self.device) # compute value target discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ * rewards[j:]) for j in range(rewards.size(0))] value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to( self.device) # compute value loss values = self.value_network.forward(states) value_loss = F.mse_loss(values, value_targets.detach()) advantages = value_targets - values return advantages, value_loss def compute_adv_td(self, state, next_state, reward): """ Compute the advantage of a single (s,a) using TD: i.e. r + v(s') - v(s) - depends highly on the accuracy of NN """ state = torch.FloatTensor(state).to(self.device) next_state = torch.FloatTensor(next_state).to(self.device) reward = torch.as_tensor(reward) state_value = self.value_network.forward(state) next_state_value = self.value_network.forward(next_state) value_target = reward + next_state_value advantage = value_target - state_value value_loss = F.mse_loss(state_value, value_target) return advantage, value_loss def compute_policy_loss_kl(self, state, state_adv, beta): """ Policy loss of DR TRPO (KL Constraint). """ state = torch.FloatTensor(state).to(self.device) logits = self.policy_network.forward(state) pi_dist = logits state_adv = torch.FloatTensor(state_adv).to(self.device) denom = torch.sum(torch.exp(state_adv / beta) * pi_dist) new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom return F.mse_loss(pi_dist, new_pi_dist) def compute_policy_loss_wass(self, state, state_adv, beta): """ Policy loss of DR TRPO (Wasserstein Constraint). """ state = torch.FloatTensor(state).to(self.device) logits = self.policy_network.forward(state) pi_dist = logits state_adv = torch.FloatTensor(state_adv).to(self.device) """Find argmax_j {A(s,aj) - β*d(aj,ai)}.""" best_j = [] for i in range(self.action_dim): opt_j = 0 opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i) for j in range(self.action_dim): cur_val = state_adv[j] - beta * self.compute_distance(j, i) if cur_val > opt_val: opt_j = j opt_val = cur_val best_j.append(opt_j) new_pi_dist = torch.zeros(self.action_dim) for j in range(self.action_dim): for i in range(self.action_dim): if j == best_j[i]: new_pi_dist[j] += pi_dist[i] return F.mse_loss(pi_dist, new_pi_dist) def compute_distance(self, a1, a2): if a1 == a2: return 0 else: return 1 def update(self, value_loss, policy_loss): self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step()
class A2CAgent(): def __init__(self, env, gamma, lr): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.n self.gamma = gamma self.lr = lr self.value_network = ValueNetwork(self.obs_dim, 1) self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim) self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr) self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr) def get_action(self, state): state = torch.FloatTensor(state).to(self.device) logits = self.policy_network.forward(state) dist = F.softmax(logits, dim=0) probs = Categorical(dist) return probs.sample().cpu().detach().item() def compute_loss(self, trajectory): states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) actions = torch.LongTensor([sars[1] for sars in trajectory ]).view(-1, 1).to(self.device) rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) next_states = torch.FloatTensor([sars[3] for sars in trajectory ]).to(self.device) dones = torch.FloatTensor([sars[4] for sars in trajectory ]).view(-1, 1).to(self.device) # compute value target discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. value_targets = rewards.view( -1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to( self.device) # compute value loss values = self.value_network.forward(states) value_loss = F.mse_loss(values, value_targets.detach()) # compute policy loss with entropy bonus logits = self.policy_network.forward(states) dists = F.softmax(logits, dim=1) probs = Categorical(dists) # compute entropy bonus entropy = [] for dist in dists: entropy.append(-torch.sum(dist.mean() * torch.log(dist))) entropy = torch.stack(entropy).sum() advantage = value_targets - values policy_loss = -probs.log_prob(actions.view(actions.size(0))).view( -1, 1) * advantage.detach() policy_loss = policy_loss.mean() - 0.001 * entropy return value_loss, policy_loss def update(self, trajectory): value_loss, policy_loss = self.compute_loss(trajectory) self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step()
class A2CAgent(): def __init__(self, env, gamma, lr): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.env = env self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.n self.gamma = gamma self.lr = lr self.value_network = ValueNetwork(self.obs_dim, 1) self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim) self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr) self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr) def get_action(self, state): state = torch.FloatTensor(state).to(self.device) logits = self.policy_network.forward(state) dist = logits probs = Categorical(dist) return probs.sample().cpu().detach().item() def compute_loss(self, trajectory, adv_method): """ When gamma is large, the NN loss does not converge, we should use MC to estimate advantage. When gamma is small (i.e. 0.9), the NN loss decreases after training, we can use TD to estimate advantage. """ states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) actions = torch.LongTensor([sars[1] for sars in trajectory ]).view(-1, 1).to(self.device) rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) next_states = torch.FloatTensor([sars[3] for sars in trajectory ]).to(self.device) dones = torch.FloatTensor([sars[4] for sars in trajectory ]).view(-1, 1).to(self.device) # compute value target discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1) # compute value loss values = self.value_network.forward(states) value_loss = F.mse_loss(values, value_targets.detach()) # compute policy loss with entropy bonus logits = self.policy_network.forward(states) dists = logits probs = Categorical(dists) # compute entropy bonus entropy = [] for dist in dists: entropy.append(-torch.sum(dist.mean() * torch.log(dist))) entropy = torch.stack(entropy).sum() # 0 for MC, 1 for TD if adv_method == 0: advantages = value_targets - values if adv_method == 1: advantages = rewards - values + self.gamma * torch.cat( (values[1:], torch.FloatTensor([[0]])), dim=0) policy_loss = -probs.log_prob(actions.view(actions.size(0))).view( -1, 1) * advantages.detach() policy_loss = policy_loss.sum() - 0.001 * entropy return value_loss, policy_loss def update(self, trajectory, adv_method): value_loss, policy_loss = self.compute_loss(trajectory, adv_method) self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step()
class SACAgent: def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.firsttime = 0 self.env = env self.action_range = [env.action_space.low, env.action_space.high] #self.obs_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] #1 self.conv_channels = 4 self.kernel_size = (3, 3) self.img_size = (500, 500, 3) print("Diagnostics:") print(f"action_range: {self.action_range}") #print(f"obs_dim: {self.obs_dim}") print(f"action_dim: {self.action_dim}") # hyperparameters self.gamma = gamma self.tau = tau self.update_step = 0 self.delay_step = 2 # initialize networks self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels, self.kernel_size).to(self.device) print("Feature net init'd successfully") input_dim = self.feature_net.get_output_size(self.img_size) self.input_size = input_dim[0] * input_dim[1] * input_dim[2] print(f"input_size: {self.input_size}") self.value_net = ValueNetwork(self.input_size, 1).to(self.device) self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device) self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device) self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device) self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device) print("Finished initing all nets") # copy params to target param for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(param) print("Finished copying targets") # initialize optimizers self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr) self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) print("Finished initing optimizers") self.replay_buffer = BasicBuffer(buffer_maxlen) print("End of init") def get_action(self, state): if state.shape != self.img_size: print( f"Invalid size, expected shape {self.img_size}, got {state.shape}" ) return None inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to( self.device) features = self.feature_net(inp) features = features.view(-1, self.input_size) mean, log_std = self.policy_net.forward(features) std = log_std.exp() normal = Normal(mean, std) z = normal.sample() action = torch.tanh(z) action = action.cpu().detach().squeeze(0).numpy() return self.rescale_action(action) def rescale_action(self, action): return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ (self.action_range[1] + self.action_range[0]) / 2.0 def update(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size) # states and next states are lists of ndarrays, np.stack converts them to # ndarrays of shape (batch_size, height, width, num_channels) states = np.stack(states) next_states = np.stack(next_states) states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).to(self.device) next_states = torch.FloatTensor(next_states).permute(0, 3, 1, 2).to(self.device) dones = torch.FloatTensor(dones).to(self.device) dones = dones.view(dones.size(0), -1) # Process images features = self.feature_net( states) #.contiguous() # Properly shaped due to batching next_features = self.feature_net(next_states) #.contiguous() features = torch.reshape(features, (64, self.input_size)) next_features = torch.reshape(next_features, (64, self.input_size)) next_actions, next_log_pi = self.policy_net.sample(next_features) next_q1 = self.q_net1(next_features, next_actions) next_q2 = self.q_net2(next_features, next_actions) next_v = self.target_value_net(next_features) next_v_target = torch.min(next_q1, next_q2) - next_log_pi curr_v = self.value_net.forward(features) v_loss = F.mse_loss(curr_v, next_v_target.detach()) # q loss expected_q = rewards + (1 - dones) * self.gamma * next_v curr_q1 = self.q_net1.forward(features, actions) curr_q2 = self.q_net2.forward(features, actions) q1_loss = F.mse_loss(curr_q1, expected_q.detach()) q2_loss = F.mse_loss(curr_q2, expected_q.detach()) # update value and q networks self.value_optimizer.zero_grad() v_loss.backward(retain_graph=True) self.value_optimizer.step() self.q1_optimizer.zero_grad() q1_loss.backward(retain_graph=True) self.q1_optimizer.step() self.q2_optimizer.zero_grad() q2_loss.backward(retain_graph=True) self.q2_optimizer.step() # delayed update for policy network and target q networks if self.update_step % self.delay_step == 0: new_actions, log_pi = self.policy_net.sample(features) min_q = torch.min(self.q_net1.forward(features, new_actions), self.q_net2.forward(features, new_actions)) policy_loss = (log_pi - min_q).mean() self.policy_optimizer.zero_grad() policy_loss.backward(retain_graph=True) self.policy_optimizer.step() # target networks for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) self.update_step += 1