class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.name = "DDPG" self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, 'actor_local') self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, 'actor_target') # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, 'critic_local') self.critic_target = Critic(self.state_size, self.action_size, 'critic_target') # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0.0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # Reward counter self.total_reward = 0 self.n_steps = 0 def load(self): self.actor_local.load() self.actor_target.load() self.critic_local.load() self.critic_target.load() print("Agent's weights loaded from disk.") def save(self): self.actor_local.save() self.actor_target.save() self.critic_local.save() self.critic_target.save() print("Agent's weights saved to disk.") def reset_episode(self): self.total_reward = 0 self.n_steps = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Add reward to total self.total_reward += reward self.n_steps += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state, add_noise=True): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] # Hack, rescale rotor revs to +-5 range from average # rev_mean = np.mean(action) # action = (action-450)/450 # action *= 50 # action += rev_mean if add_noise: action += self.noise.sample() # additive noise for exploration return list(action) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
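# The DDPG agent above depends on OUNoise and ReplayBuffer, which are not
# defined in this listing. The sketch below is a minimal, assumed implementation
# that matches how the agent uses them (OUNoise.sample()/reset(),
# ReplayBuffer.add()/sample()/__len__(), and experience tuples exposing
# .state/.action/.reward/.next_state/.done); the project's own versions may differ.
import copy
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer of experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)


# Hypothetical interaction loop, assuming a `task` environment whose step()
# returns (next_state, reward, done):
#
#     agent = DDPG(task)
#     for episode in range(1000):
#         state = agent.reset_episode()
#         done = False
#         while not done:
#             action = agent.act(state)
#             next_state, reward, done = task.step(action)
#             agent.step(action, reward, next_state, done)
#             state = next_state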
import os
from collections import deque

import torch
import torch.nn as nn

# Actor, LyapunovCritic, set_seed, SCALE_lambda_MIN_MAX and SCALE_beta_MIN_MAX
# come from the project's own modules; assumed sketches of the network-update
# helpers (hard_update, soft_update, stop_grad, start_grad) follow this class.


class CAC(object):
    def __init__(self, a_dim, s_dim, variant, action_prior='uniform', max_global_steps=100000):
        """
        a_dim   : dimension of the action space
        s_dim   : dimension of the state space
        variant : dictionary containing parameters for the algorithm
        """
        ############################### Model parameters ####################################
        set_seed(variant['seed'])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor = Actor(input_dim=s_dim,
                           output_dim=a_dim,
                           n_layers=3,
                           layer_sizes=[256, 256, 256],
                           hidden_activation="leakyrelu").to(self.device)
        self.actor_target = Actor(input_dim=s_dim,
                                  output_dim=a_dim,
                                  n_layers=3,
                                  layer_sizes=[256, 256, 256],
                                  hidden_activation="leakyrelu").to(self.device).eval()
        self.critic = LyapunovCritic(state_dim=s_dim,
                                     action_dim=a_dim,
                                     output_dim=None,
                                     n_layers=2,
                                     layer_sizes=[256, 256],
                                     hidden_activation="leakyrelu").to(self.device)
        self.critic_target = LyapunovCritic(state_dim=s_dim,
                                            action_dim=a_dim,
                                            output_dim=None,
                                            n_layers=2,
                                            layer_sizes=[256, 256],
                                            hidden_activation="leakyrelu").to(self.device).eval()

        # Copy parameters of the learning networks to the target networks
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)
        # Disable gradient calculations of the target networks
        stop_grad(self.critic_target)
        stop_grad(self.actor_target)

        # self.memory_capacity = variant['memory_capacity']

        ################################ Parameters for training ###############################
        self.batch_size = variant['batch_size']  # batch size for learning the actor
        self.gamma = variant['gamma']  # discount factor
        self.tau = variant['tau']  # smoothing parameter for the target weight updates
        self.approx_value = True if 'approx_value' not in variant.keys() else variant['approx_value']
        self._action_prior = action_prior  # prior over the action space
        s_dim = s_dim * (variant['history_horizon'] + 1)
        self.a_dim, self.s_dim = a_dim, s_dim
        self.history_horizon = variant['history_horizon']  # horizon to consider for the history
        self.working_memory = deque(maxlen=variant['history_horizon'] + 1)  # memory to store the history
        target_entropy = variant['target_entropy']
        if target_entropy is None:
            self.target_entropy = -self.a_dim  # lower bound of the policy entropy
        else:
            self.target_entropy = target_entropy
        self.target_variance = 0.0
        self.finite_horizon = variant['finite_horizon']
        self.soft_predict_horizon = variant['soft_predict_horizon']
        self.use_lyapunov = variant['use_lyapunov']
        self.adaptive_alpha = variant['adaptive_alpha']
        self.adaptive_beta = variant['adaptive_beta'] if 'adaptive_beta' in variant.keys() else False
        self.time_near = variant['Time_near']
        self.max_global_steps = max_global_steps
        self.LR_A = variant['lr_a']
        self.LR_L = variant['lr_l']
        self.LR_lag = self.LR_A / 10
        self.alpha3 = variant['alpha3']

        labda = variant['labda']  # Lagrange multiplier lambda, formula (12) in the paper
        alpha = variant['alpha']  # entropy temperature (beta in the paper)
        beta = variant['beta']    # constraint error weight
        self.log_labda = torch.log(torch.tensor([labda], device=self.device))
        self.log_alpha = torch.log(torch.tensor([alpha], device=self.device))  # entropy temperature
        self.log_beta = torch.log(torch.tensor([beta], device=self.device))
        self.log_alpha.requires_grad = True
        self.log_beta.requires_grad = True
        self.log_labda.requires_grad = True
        # The update is done in log space
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])
        self.alpha = torch.exp(self.log_alpha)
        self.beta = torch.clamp(torch.exp(self.log_beta),
                                min=SCALE_beta_MIN_MAX[0],
                                max=SCALE_beta_MIN_MAX[1])

        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.LR_A)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.LR_L)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A)
        self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag)
        self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01)

        # step_fn = lambda i: 1.0 - (i - 1.) / self.max_global_steps
        # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda=step_fn)
        # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda=step_fn)
        # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda=step_fn)
        # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda=step_fn)
        # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda=step_fn)

        self.actor.float()
        self.critic.float()

    def act(self, s, evaluation=False):
        a, deterministic_a, _, _ = self.actor(s)
        if evaluation is True:
            return deterministic_a
        else:
            return a

    def learn(self, batch):
        bs = torch.tensor(batch['s'], dtype=torch.float).to(self.device)  # state
        ba = torch.tensor(batch['a'], dtype=torch.float).to(self.device)  # action
        br = torch.tensor(batch['r'], dtype=torch.float).to(self.device)  # reward
        bterminal = torch.tensor(batch['terminal'], dtype=torch.float).to(self.device)
        bs_ = torch.tensor(batch['s_'], dtype=torch.float).to(self.device)  # next state
        b_s = torch.tensor(batch['_s'], dtype=torch.float).to(self.device)  # previous state
        bv = None
        b_r_ = None
        alpha_loss = None
        beta_loss = None

        # # beta learning
        # self.beta_optim.zero_grad()
        # beta_loss = self.get_beta_loss(b_s)
        # if self.adaptive_beta:
        #     beta_loss.backward(retain_graph=False)
        #     self.beta_optim.step()
        # else:
        #     self.beta_optim.zero_grad()

        # Lyapunov critic learning
        start_grad(self.critic)
        if self.finite_horizon:
            bv = torch.tensor(batch['value'])
            b_r_ = torch.tensor(batch['r_N_'])
        self.critic_optim.zero_grad()
        critic_loss = self.get_lyapunov_loss(bs, bs_, ba, br, b_r_, bv, bterminal)
        critic_loss.backward()
        self.critic_optim.step()

        # actor learning
        stop_grad(self.critic)
        self.actor_optim.zero_grad()
        actor_loss = self.get_actor_loss(bs, bs_, ba, br)
        actor_loss.backward(retain_graph=False)
        self.actor_optim.step()

        # alpha learning
        if self.adaptive_alpha:
            self.alpha_optim.zero_grad()
            alpha_loss = self.get_alpha_loss(bs, self.target_entropy)
            alpha_loss.backward(retain_graph=False)
            self.alpha_optim.step()
            self.alpha = torch.exp(self.log_alpha)

        # labda learning
        self.labda_optim.zero_grad()
        labda_loss = self.get_labda_loss(br, bs, bs_, ba)
        labda_loss.backward(retain_graph=False)
        self.labda_optim.step()
        self.labda = torch.clamp(torch.exp(self.log_labda),
                                 min=SCALE_lambda_MIN_MAX[0],
                                 max=SCALE_lambda_MIN_MAX[1])

        # update target networks
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.actor_target, self.actor, self.tau)

        return alpha_loss, beta_loss, labda_loss, actor_loss, critic_loss

    def get_alpha_loss(self, s, target_entropy):
        # self.log_pis is set in get_actor_loss, which learn() calls first
        # with torch.no_grad():
        #     _, self.deterministic_a, self.log_pis, _ = self.actor_target(s)
        intermediate = (self.log_pis + target_entropy).detach()
        # self.a, self.deterministic_a, self.log_pis, _ = self.actor(s)
        return -torch.mean(self.log_alpha * intermediate)

    def get_labda_loss(self, r, s, s_, a):
        # self.l is set in get_lyapunov_loss and self.l_ in get_actor_loss,
        # both of which learn() calls before this method
        # with torch.no_grad():
        #     l = self.critic(s, a)
        #     lya_a_, _, _, _ = self.actor_target(s_)
        #     self.l_ = self.critic_target(s_, lya_a_)
        l = self.l.detach()
        lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        return -torch.mean(self.log_labda * lyapunov_loss)

    def get_beta_loss(self, _s):
        with torch.no_grad():
            _, _deterministic_a, _, _ = self.actor_target(_s)
        self.l_action = torch.mean(
            torch.norm(_deterministic_a.detach() - self.deterministic_a, dim=1))
        with torch.no_grad():
            intermediate = (self.l_action - 0.02).detach()
        return -torch.mean(self.log_beta * intermediate)

    def get_actor_loss(self, s, s_, a, r):
        if self._action_prior == 'normal':
            policy_prior = torch.distributions.MultivariateNormal(
                loc=torch.zeros(self.a_dim),
                covariance_matrix=torch.diag(torch.ones(self.a_dim)))
            policy_prior_log_probs = policy_prior.log_prob(self.a)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0

        # only the actor weights are updated!
        _, self.deterministic_a, self.log_pis, _ = self.actor(s)
        # self.l = self.critic(s, a)
        with torch.no_grad():
            # self.l = self.critic(s, a)
            lya_a_, _, _, _ = self.actor(s_)
            self.l_ = self.critic(s_, lya_a_)

        l = self.l.detach()
        self.lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r)
        labda = self.labda.detach()
        alpha = self.alpha.detach()
        a_loss = labda * self.lyapunov_loss + alpha * torch.mean(self.log_pis) - policy_prior_log_probs
        return a_loss

    def get_lyapunov_loss(self, s, s_, a, r, r_n_=None, v=None, terminal=0.):
        with torch.no_grad():
            a_, _, _, _ = self.actor_target(s_)
            l_ = self.critic_target(s_, a_)
        self.l = self.critic(s, a)
        if self.approx_value:
            if self.finite_horizon:
                if self.soft_predict_horizon:
                    l_target = r - r_n_ + l_
                else:
                    l_target = v
            else:
                # Lyapunov critic target  # - self.alpha * next_log_pis
                l_target = r + self.gamma * (1 - terminal) * l_
        else:
            l_target = r
        mse_loss = nn.MSELoss()
        l_loss = mse_loss(self.l, l_target)
        return l_loss

    def save_result(self, path):
        if not os.path.exists(path + "/policy/"):
            os.mkdir(path + "/policy/")
        self.actor_target.save(path + "/policy/actor_target.pth")
        self.critic_target.save(path + "/policy/critic_target.pth")
        self.actor.save(path + "/policy/actor.pth")
        self.critic.save(path + "/policy/critic.pth")
        print("Save to path: ", path + "/policy/")

    def restore(self, path):
        result_path = path
        if not os.path.exists(result_path):
            raise IOError("Results path " + result_path + " does not contain anything to load")
        self.actor_target.load(result_path + "/actor_target.pth")
        self.critic_target.load(result_path + "/critic_target.pth")
        self.actor.load(result_path + "/actor.pth")
        self.critic.load(result_path + "/critic.pth")
        success_load = True
        print("Load successful, model file from ", result_path)
        print("#########################################################")
        return success_load

    def scheduler_step(self):
        # Requires the learning-rate schedulers that are commented out in __init__
        self.alpha_scheduler.step()
        self.beta_scheduler.step()
        self.labda_scheduler.step()
        self.actor_scheduler.step()
        self.critic_scheduler.step()
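# hard_update, soft_update, stop_grad and start_grad are called above but not
# defined in this listing. The sketch below shows assumed implementations that
# match the call signatures used by CAC (target network first, source second);
# the project's actual helpers may differ in detail.
import torch


def hard_update(target, source):
    """Copy the source network's parameters into the target network."""
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.copy_(source_param)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * source_param)


def stop_grad(network):
    """Exclude a network's parameters from gradient computation."""
    for param in network.parameters():
        param.requires_grad = False


def start_grad(network):
    """Re-enable gradient computation for a network's parameters."""
    for param in network.parameters():
        param.requires_grad = True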