class DDPG:
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        # Actor networks (local, noisy copy, and target)
        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=hyperparams.alpha_actor)

        # Critic networks (local and target)
        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size, hyperparams.batch_size, random_seed)
        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) > self.hyperparams.batch_size:
            observations = self.replay_buffer.sample()
            self.update_params(observations)

    def select_action(self, state, train=True, nn_noise=False):
        state = torch.from_numpy(state).to(dtype=torch.float32, device=device)
        self.actor.eval()
        if nn_noise:
            action = self.actor_noise(state).cpu().data.numpy()
        else:
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if train:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_state()

    def update_params(self, observations):
        states, actions, rewards, next_states, dones = observations

        # Critic update: build the TD target from the target networks
        next_actions = self.actor_target(next_states)
        next_Q_values = self.critic_target(next_states, next_actions)
        Q_values = rewards + (self.hyperparams.gamma * next_Q_values * (1 - dones))
        expected_Q = self.critic(states, actions)
        # Detach the TD target so gradients do not flow into the target networks
        Q_values_loss = F.l1_loss(expected_Q, Q_values.detach())
        self.critic_optim.zero_grad()
        Q_values_loss.backward()
        self.critic_optim.step()

        # Actor update: maximize the critic's value of the actor's actions
        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        # Soft-update the target networks toward the local networks
        for qtarget_param, qlocal_param in zip(self.critic_target.parameters(), self.critic.parameters()):
            qtarget_param.data.copy_(self.hyperparams.tau * qlocal_param.data
                                     + (1.0 - self.hyperparams.tau) * qtarget_param.data)
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.hyperparams.tau * local_param.data
                                    + (1.0 - self.hyperparams.tau) * target_param.data)
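
# All of the agent classes in this section construct an OUNoise exploration process,
# but none of them include its definition. The sketch below is a minimal, assumed
# implementation matching the OUNoise(size, seed, mu, theta, sigma) constructor used
# above; several of the later classes pass fewer arguments and rely on defaults, and
# the exact helper in each original project may differ.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    # The class above calls reset_state(); most of the other agents call reset().
    reset_state = reset

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state
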
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Learning rates suggested by the DDPG paper:
        # https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate, self.critic_decay)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # Indicate whether we want to learn (or only predict without learning)
        self.set_train(train)

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0
        self.step_count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.step_count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0

        # Track the best score seen so far
        if self.score >= self.best_score:
            self.best_score = self.score

        # Learn, if enough samples are available in memory
        if self.train and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, done)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add noise for exploration

    def learn(self, experiences, done):
        """Update policy and value parameters using a given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        # Rewards are real-valued, so cast to float32 (not uint8)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, next_actions])

        # Compute Q targets for current states and train the (local) critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the (local) actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def set_train(self, train):
        self.train = train
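
# The Keras agent above relies on two custom helpers that are not shown here:
# critic.get_action_gradients (dQ/da for a batch) and actor.train_fn (a training step
# driven by those gradients). The sketch below shows one common way such helpers are
# wired with the TF1-era Keras backend (K.gradients / K.function). The builder function
# name and exact model structure are assumptions, not code from the original project.
from keras import backend as K
from keras import layers, optimizers


def build_action_gradient_fns(critic_model, actor_model, action_size, actor_lr=1e-4):
    # critic_model: Keras model with inputs [states, actions] and output Q-values.
    # actor_model: Keras model mapping states to actions.
    states_in, actions_in = critic_model.input

    # dQ/da: how the critic's value estimate changes as the action changes.
    action_gradients = K.gradients(critic_model.output, actions_in)
    get_action_gradients = K.function(
        inputs=[states_in, actions_in, K.learning_phase()],
        outputs=action_gradients)

    # Actor training step: ascend the critic's action-gradient (descend its negative).
    action_grads_ph = layers.Input(shape=(action_size,))
    loss = K.mean(-action_grads_ph * actor_model.output)
    optimizer = optimizers.Adam(lr=actor_lr)
    updates_op = optimizer.get_updates(params=actor_model.trainable_weights, loss=loss)
    train_fn = K.function(
        inputs=[actor_model.input, action_grads_ph, K.learning_phase()],
        outputs=[],
        updates=updates_op)

    return get_action_gradients, train_fn
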
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001   # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):
        self.total_reward = None
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        if self.total_reward:
            self.total_reward += reward
        else:
            self.total_reward = reward
        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using a given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train the (local) critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the (local) actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Track the best score
        self.score = self.total_reward / float(self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG():
    def __init__(self, agent_id, model, action_size, random_seed):
        self.id = agent_id

        # Actor Neural Network (Regular and target)
        self.actor_regular = model.actor_regular
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_regular.parameters(), lr=LR_ACTOR)

        # Critic Neural Network (Regular and target)
        self.critic_regular = model.critic_regular
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_regular.parameters(), lr=LR_CRITIC)

        # Exploration noise
        self.noise = OUNoise(action_size, random_seed, OU_MU, OU_THETA, OU_SIGMA)

        # Ensure that both networks have the same weights
        self.deep_copy(self.actor_target, self.actor_regular)
        self.deep_copy(self.critic_target, self.critic_regular)

    def act(self, states, noise_value, add_noise=True):
        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_regular.eval()
        with torch.no_grad():
            action = self.actor_regular(states).cpu().data.numpy()
        self.actor_regular.train()
        if add_noise:
            # Include exploration noise
            action += noise_value * self.noise.sample()
        # Clip action to the right interval
        return np.clip(action, -1, 1)

    def learn(self, memory, agent_id, experiences, all_next_actions, all_actions):
        states, actions, rewards, next_states, dones = experiences

        # Update the critic neural network
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(DEVICE)
        actions_next = torch.cat(all_next_actions, dim=1).to(DEVICE)
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)
        Q_expected = self.critic_regular(states, actions)
        # Compute Q targets for current states filtered by agent id
        Q_targets = rewards.index_select(1, agent_id) + (
            GAMMA * Q_targets_next * (1 - dones.index_select(1, agent_id)))
        # Calculate the critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        critic_loss.backward()
        # Critic gradient clipping to 1
        torch.nn.utils.clip_grad_norm_(self.critic_regular.parameters(), 1)
        self.critic_optimizer.step()

        # Update the actor neural network
        self.actor_optimizer.zero_grad()
        # Detach actions of other agents
        actions_pred = [
            actions if i == self.id else actions.detach()
            for i, actions in enumerate(all_actions)
        ]
        actions_pred = torch.cat(actions_pred, dim=1).to(DEVICE)
        actor_loss = -self.critic_regular(states, actions_pred).mean()
        # Minimize the loss function
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target network using the soft update approach (slowly updating)
        self.soft_update(self.critic_regular, self.critic_target)
        self.soft_update(self.actor_regular, self.actor_target)

    def soft_update(self, local_model, target_model):
        # Update the target network slowly to improve the stability
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)

    def deep_copy(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
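
# The class above references several module-level constants (DEVICE, LR_ACTOR,
# LR_CRITIC, GAMMA, TAU, OU_*) that are not shown. The values below are a hypothetical
# configuration with commonly used defaults; the original repository's values may differ.
import torch

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LR_ACTOR = 1e-4    # actor learning rate
LR_CRITIC = 1e-3   # critic learning rate
GAMMA = 0.99       # discount factor
TAU = 1e-3         # soft-update interpolation factor
OU_MU = 0.0        # OU noise mean
OU_THETA = 0.15    # OU noise mean-reversion rate
OU_SIGMA = 0.2     # OU noise volatility
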
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, state_size_full, action_size_full, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each (per-agent) state
            action_size (int): dimension of each (per-agent) action
            state_size_full (int): dimension of the concatenated state of all agents
            action_size_full (int): dimension of the concatenated action of all agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.state_size_full = state_size_full
        self.action_size_full = action_size_full
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(hyperparameters.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(hyperparameters.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size_full, action_size_full,
                                   random_seed).to(hyperparameters.device)
        self.critic_target = Critic(state_size_full, action_size_full,
                                    random_seed).to(hyperparameters.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters.LR_CRITIC,
            weight_decay=hyperparameters.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(hyperparameters.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += eps * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG:
    def __init__(self, state_size, action_size, tau, lr_actor, lr_critic, num_agents,
                 agent_idx, seed, device, gamma, tensorboard_writer=None):
        self.state_size = state_size
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.num_agents = num_agents
        self.agent_idx = agent_idx
        self.seed = seed
        self.device = device
        self.gamma = gamma
        random.seed(seed)
        self.tensorboard_writer = tensorboard_writer

        self.actor_local = Actor(state_size, action_size, seed)
        self.actor_target = Actor(state_size, action_size, seed)

        critic_state_size = (state_size + action_size) * num_agents
        self.critic_local = Critic(critic_state_size, seed)
        self.critic_target = Critic(critic_state_size, seed)

        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)

        self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        self.noise = OUNoise(action_size, seed)
        self.iteration = 0

    def to(self, device):
        self.actor_local.to(device)
        self.actor_target.to(device)
        self.critic_local.to(device)
        self.critic_target.to(device)
        return self

    def act(self, state, noise_scale, use_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if use_noise:
            action += self.noise.sample() * noise_scale
        return np.clip(action, -1, 1)

    def learn(self, experiences, all_curr_pred_actions, all_next_pred_actions):
        agent_idx_device = torch.tensor(self.agent_idx).to(self.device)
        states, actions, rewards, next_states, dones = experiences
        rewards = rewards.index_select(1, agent_idx_device)
        dones = dones.index_select(1, agent_idx_device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        batch_size = next_states.shape[0]
        actions_next = torch.cat(all_next_pred_actions, dim=1).to(self.device)
        next_states = next_states.reshape(batch_size, -1)
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        states = states.reshape(batch_size, -1)
        actions = actions.reshape(batch_size, -1)
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optim.zero_grad()
        predicted_actions = torch.cat(
            [action if idx == self.agent_idx else action.detach()
             for idx, action in enumerate(all_curr_pred_actions)],
            dim=1).to(self.device)
        actor_loss = -self.critic_local(states, predicted_actions).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optim.step()

        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        if self.tensorboard_writer is not None:
            self.tensorboard_writer.add_scalar("agent{}/actor_loss".format(self.agent_idx),
                                               al, self.iteration)
            self.tensorboard_writer.add_scalar("agent{}/critic_loss".format(self.agent_idx),
                                               cl, self.iteration)
            self.tensorboard_writer.file_writer.flush()

        self.iteration += 1

        # ----------------------- update target networks ----------------------- #
        soft_update(self.critic_target, self.critic_local, self.tau)
        soft_update(self.actor_target, self.actor_local, self.tau)

    def reset(self):
        self.noise.reset()
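
# The class above calls module-level hard_update and soft_update helpers that are not
# shown. The sketch below assumes a (target, source) argument order, inferred from how
# learn() calls soft_update(self.critic_target, self.critic_local, self.tau); the
# original module may define them differently.
def hard_update(target, source):
    """Copy source parameters into target: θ_target = θ_source."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Polyak-average source into target: θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)
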
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def load(self, model_dir, agent_id):
        # Load Actor and Critic network weights
        self.actor_local.load_state_dict(
            torch.load(os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id))))
        self.critic_local.load_state_dict(
            torch.load(os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id))))

    def save(self, model_dir, agent_id):
        # Save Actor and Critic network weights
        torch.save(self.actor_local.state_dict(),
                   os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id)))
        torch.save(self.critic_local.state_dict(),
                   os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))
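
# None of the classes in this section define the ReplayBuffer they sample from. The
# sketch below is a minimal uniform-sampling buffer matching the constructor and
# methods used by the agent above (add, sample, __len__); other classes in this
# section construct their buffers with different signatures, and the original
# projects' buffers may differ in detail.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and convert them to torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)
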
class DDPGAgent:
    def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4, weight_decay=0.0001)

        # exploration noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''Returns actions taken.'''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''Save experience into replay and sample if replay contains enough experiences.'''
        self.replay.add(state, action, reward, next_state, done)
        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
            gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences

        # update critic:
        # get predicted next-state actions and Q values from the target networks
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        # compute current-state Q targets
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        # compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss
        self.critic_opt.zero_grad()
        critic_loss.backward(retain_graph=True)
        self.critic_opt.step()

        # update actor:
        # compute actor loss
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()
        # minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params:
            local: PyTorch model (weights will be copied from)
            target: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, brain_name, seed, params=default_params, device=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        params = self._fill_params(params)

        # implementation and identity
        self.device = device if device is not None else torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.name = params['name']
        self.brain_name = brain_name

        # set environment information
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed,
                                 fc1_units=params['layers_actor'][0],
                                 fc2_units=params['layers_actor'][1]).to(self.device)
        self.actor_target = Actor(state_size, action_size, seed,
                                  fc1_units=params['layers_actor'][0],
                                  fc2_units=params['layers_actor'][1]).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed,
                                   fcs1_units=params['layers_critic'][0],
                                   fc2_units=params['layers_critic'][1]).to(self.device)
        self.critic_target = Critic(state_size, action_size, seed,
                                    fcs1_units=params['layers_critic'][0],
                                    fc2_units=params['layers_critic'][1]).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, params['buffer_size'], params['batch_size'],
                                   seed, device=self.device)

        # save params
        self.params = params

    def _fill_params(self, src_params):
        params = {
            'name': self._get_param_or_default('name', src_params, default_params),
            'buffer_size': self._get_param_or_default('buffer_size', src_params, default_params),
            'batch_size': self._get_param_or_default('batch_size', src_params, default_params),
            'layers_actor': self._get_param_or_default('layers_actor', src_params, default_params),
            'layers_critic': self._get_param_or_default('layers_critic', src_params, default_params),
            'lr_actor': self._get_param_or_default('lr_actor', src_params, default_params),
            'lr_critic': self._get_param_or_default('lr_critic', src_params, default_params),
            'gamma': self._get_param_or_default('gamma', src_params, default_params),
            'tau': self._get_param_or_default('tau', src_params, default_params),
            'weight_decay': self._get_param_or_default('weight_decay', src_params, default_params),
        }
        return params

    def display_params(self, force_print=False):
        if force_print:
            print(self.params)
        return self.params

    def _get_param_or_default(self, key, src_params, default_params):
        if key in src_params:
            return src_params[key]
        else:
            return default_params[key]

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def start_learn(self):
        # Learn, if enough samples are available in memory.
        # Decoupled from the step method to allow multiple steps per learning pass.
        if len(self.memory) > self.params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['gamma'])

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target, self.params['tau'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
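
# A hypothetical training loop for the DDPG_Agent above, illustrating why step() and
# start_learn() are decoupled: experiences are stored on every timestep, while the
# learning pass runs only every `learn_every` steps (and then several times in a row).
# The Unity-style environment interface (env.reset(train_mode=True)[brain_name],
# vector_observations, rewards, local_done) and the loop parameters are assumptions,
# not code from the original project.
def train(agent, env, n_episodes=500, max_t=1000, learn_every=20, learn_passes=10):
    for episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[agent.brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[agent.brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)  # store only
            if t % learn_every == 0:
                for _ in range(learn_passes):
                    agent.start_learn()  # sample and update several times
            state = next_state
            score += reward
            if done:
                break
        print("Episode {}: score {:.2f}".format(episode, score))
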
class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, agent_id):
        """Initialize a DDPGAgent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size)
        # self.noise_amplification = NOISE_AMPLIFICATION
        # self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY

        # self._print_network()

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
            # Noise-amplification decay is disabled along with the commented-out
            # amplification attributes above.
            # self._decay_noise_amplification()
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the OU Noise for this agent."""
        self.noise.reset()

    def learn(self, experiences, next_actions, actions_pred):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(next_state) -> action
            critic_target(next_state, next_action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            next_actions (list): next actions computed from each agent
            actions_pred (list): predicted actions for the current states from each agent
        """
        states, actions, rewards, next_states, dones = experiences
        agent_id_tensor = torch.tensor([self.agent_id - 1]).to(device)

        ### Update critic
        self.critic_optimizer.zero_grad()
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards.index_select(1, agent_id_tensor) + (
            GAMMA * Q_targets_next * (1 - dones.index_select(1, agent_id_tensor)))
        Q_expected = self.critic_local(states, actions)
        # Minimize the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Update actor
        self.actor_optimizer.zero_grad()
        # Minimize the loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        θ_target = θ_local

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ * θ_local + (1 - τ) * θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def _print_network(self):
        """Helper to print network architecture for this agent's actors and critics."""
        print("Agent #{}".format(self.agent_id))
        print("Actor (Local):")
        print(self.actor_local)
        print("Actor (Target):")
        print(self.actor_target)
        print("Critic (Local):")
        print(self.critic_local)
        print("Critic (Target):")
        print(self.critic_target)
        if self.agent_id != NUM_AGENTS:
            print("_______________________________________________________________")
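
# A hypothetical sketch of the MADDPG coordinator that would call learn() on the
# DDPGAgent above: it samples a shared replay buffer, builds the concatenated next
# actions from every agent's target actor and the concatenated re-predicted current
# actions from every agent's local actor, then lets each agent update. The buffer
# layout (full concatenated state/action tensors, per-agent reward columns) and the
# per-agent state slicing are assumptions, not code from the original project.
def maddpg_learn(agents, experiences, state_size):
    states, actions, rewards, next_states, dones = experiences
    for agent in agents:
        # Next actions from every agent's target actor (no gradients needed here)
        with torch.no_grad():
            next_actions = torch.cat(
                [a.actor_target(next_states[:, i * state_size:(i + 1) * state_size])
                 for i, a in enumerate(agents)], dim=1)
        # Current-state actions re-predicted by the local actors; only the updating
        # agent's slice keeps gradients so the actor loss trains that agent alone.
        actions_pred = torch.cat(
            [a.actor_local(states[:, i * state_size:(i + 1) * state_size])
             if a is agent else
             a.actor_local(states[:, i * state_size:(i + 1) * state_size]).detach()
             for i, a in enumerate(agents)], dim=1)
        agent.learn(experiences, next_actions, actions_pred)
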
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.gradient_clipping = True
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.config = Config()

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.config.LR_CRITIC,
                                           weight_decay=self.config.WEIGHT_DECAY)

        self.noise = OUNoise(action_size, seed)
        self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE,
                                   self.config.BATCH_SIZE, seed, device)
        self.step_count = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.step_count += 1
        if len(self.memory) > self.config.BATCH_SIZE and self.step_count % self.config.UPDATE_EVERY == 0:
            experiences = self.memory.sample()
            self.learn(experiences, self.config.GAMMA)

    def act(self, state, eps, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action = action + self.noise.sample() * eps
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.gradient_clipping:
            # use gradient clipping
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.step_count % 10 == 0:
            self.soft_update(self.critic_local, self.critic_target, self.config.TAU)
            self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
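
# None of the snippets in this section include the Actor and Critic network definitions
# they construct. Below is a minimal PyTorch sketch with the common
# Actor(state_size, action_size, seed) / Critic(state_size, action_size, seed)
# signatures used by several of the agents above; the layer sizes, weight
# initialization, and constructor keywords (fc1_units, fcs1_units, ...) vary between
# the original projects, so treat this as an assumed reference implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))  # bound actions to [-1, 1]


class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a scalar value estimate."""

    def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        # Inject the action after the first state-only layer, as in the DDPG paper
        self.fc2 = nn.Linear(fcs1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        xs = F.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)
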
class DDPG(object):
    """
    Interacts with and learns from the environment.

    There are two agents; each agent's observation has 24 dimensions and each agent's
    action has 2 dimensions. Here we use two separate actor networks (one per agent,
    using only that agent's observations and outputting only that agent's action).
    The critic for each agent sees the full observations and full actions of all agents.
    """

    def __init__(self, agent_id, state_size, full_state_size, action_size, full_action_size,
                 actor_hidden_sizes=(256, 128), actor_lr=1e-4, actor_weight_decay=0.,
                 critic_hidden_sizes=(256, 128), critic_lr=1e-3, critic_weight_decay=0.,
                 is_action_continuous=True):
        """
        Initialize an Agent object.

        :param agent_id (int): ID of each agent.
        :param state_size (int): Dimension of each state for each agent.
        :param full_state_size (int): Dimension of the full state for all agents.
        :param action_size (int): Dimension of each action for each agent.
        :param full_action_size: Dimension of the full action for all agents.
        :param actor_hidden_sizes (tuple): Hidden units of the actor network.
        :param actor_lr (float): Learning rate of the actor network.
        :param actor_weight_decay (float): Weight decay (L2 penalty) of the actor network.
        :param critic_hidden_sizes (tuple): Hidden units of the critic network.
        :param critic_lr (float): Learning rate of the critic network.
        :param critic_weight_decay (float): Weight decay (L2 penalty) of the critic network.
        :param is_action_continuous (bool): Whether the action space is continuous or discrete.
        """
        self.id = agent_id
        self.state_size = state_size
        self.full_state_size = full_state_size
        self.action_size = action_size
        self.full_action_size = full_action_size
        self.is_action_continuous = is_action_continuous

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, actor_hidden_sizes, action_size,
                                 out_gate=nn.Tanh if is_action_continuous else None)
        self.actor_target = Actor(state_size, actor_hidden_sizes, action_size,
                                  out_gate=nn.Tanh if is_action_continuous else None)
        self.update(self.actor_local, self.actor_target, 1.)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network (w/ Target Network)
        num_agents = int(full_action_size / action_size)
        self.critic_local = Critic(full_state_size,
                                   full_action_size if is_action_continuous else num_agents,
                                   critic_hidden_sizes)
        self.critic_target = Critic(full_state_size,
                                    full_action_size if is_action_continuous else num_agents,
                                    critic_hidden_sizes)
        # self.critic_local, self.critic_target = get_critic(full_state_size, full_action_size, critic_hidden_sizes)
        self.update(self.critic_local, self.critic_target, 1.)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        self.use_actor = True

        # Noise Process
        self.noise_scale = 0.
        self.noise = OUNoise(action_size)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise_scale=0.0):
        """Returns the action for a given state using the current policy."""
        states = torch.from_numpy(state[np.newaxis]).float()

        # calculate actions
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states)
        self.actor_local.train()
        actions = actions.cpu().numpy().squeeze()

        # add noise
        actions += noise_scale * self.noise.sample()

        return np.clip(actions, -1, 1) if self.is_action_continuous else np.argmax(actions)

    def learn(self, states, actions, rewards, next_states, dones, full_actions_predicted,
              critic_full_next_actions, gamma=0.99):
        """
        Update policy and value parameters.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :param states: Full states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param actions: Full actions for training, of size (BATCHES, NUM_AGENTS, ACTION_SIZE)
        :param rewards: Full rewards for training, of size (BATCHES, NUM_AGENTS)
        :param next_states: Full next states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE)
        :param dones: Full dones for training, of size (BATCHES, NUM_AGENTS)
        :param full_actions_predicted: Actions predicted by every agent's local actor for the current states
        :param critic_full_next_actions: Next actions predicted by every agent's target actor,
            concatenated to size (BATCHES, NUM_AGENTS * ACTION_SIZE)
        :param gamma: discount factor
        """
        full_states = states.view(-1, self.full_state_size)
        full_actions = actions.view(states.shape[0], -1).float()
        full_next_states = next_states.view(-1, self.full_state_size)
        critic_full_next_actions = torch.cat(critic_full_next_actions, dim=1).float().to(DEVICE)
        actor_rewards = rewards[:, self.id].view(-1, 1)
        actor_dones = dones[:, self.id].view(-1, 1)

        # ---------------------------- update critic ---------------------------- #
        q_next = self.critic_target.forward(full_next_states, critic_full_next_actions)
        q_target = actor_rewards + gamma * q_next * (1 - actor_dones)
        q_expected = self.critic_local(full_states, full_actions)
        # Compute critic loss
        critic_loss = F.mse_loss(q_expected, q_target.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        if self.use_actor:
            # detach actions from other agents
            full_actions_predicted = [
                actions if i == self.id else actions.detach()
                for i, actions in enumerate(full_actions_predicted)
            ]
            full_actions_predicted = torch.cat(full_actions_predicted, dim=1).float().to(DEVICE)
            # Compute actor loss
            actor_loss = -self.critic_local.forward(full_states, full_actions_predicted).mean()
            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        else:
            actor_loss = torch.tensor(0)

        return actor_loss.cpu().item(), critic_loss.cpu().item()

    def update(self, source, target, tau=0.01):
        """
        Update target model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target

        :param source: PyTorch model the parameters are copied from
        :param target: PyTorch model the parameters are copied to
        :param tau: interpolation parameter
        """
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) + param.data * tau)