class ActorAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, noise,
                 learning_rate, memory, random_seed):
        """Initialize an ActorAgent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
            noise: exploration noise process (unused; an OUNoise instance is created below)
            learning_rate (float): learning rate for the actor optimizer
            memory: shared replay buffer
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.learning_rate = learning_rate
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        # Noise process
        self.noise = OUNoise(action_size, seed=random_seed)

        # Replay memory (injected so it can be shared across agents)
        self.memory = memory

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
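# --- Hedged helper sketch (not from the original repos) ----------------------
# Every agent in this section samples exploration noise from an OUNoise object
# that is never defined here, and the constructor signature varies between
# snippets. A minimal Ornstein-Uhlenbeck sketch matching the most common
# OUNoise(size, seed) / sample() / reset() usage above; the mu, theta and
# sigma defaults are assumptions.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() for _ in range(len(x))])
        self.state = x + dx
        return self.state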
class Agent():

    def __init__(self, state_size, action_size, state_size_full,
                 action_size_full, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network); the critic sees the full joint
        # state and action of all agents
        self.critic = Critic(state_size_full, action_size_full, random_seed).to(device)
        self.critic_target = Critic(state_size_full, action_size_full, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Initialize targets with the same weights as the original networks
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def hard_update(self, target, source):
        """Copy network parameters from source to target.

        Inputs:
            target (torch.nn.Module): net to copy parameters to
            source (torch.nn.Module): net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, amplitude):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.squeeze().numpy()
        self.actor.train()
        action += amplitude * self.noise.sample()
        return np.clip(action, -1, 1)
class Agent(): '''Interact with and learn from environment.''' def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.t_step = 0 # counter for activating learning every few steps self.running_c_loss = 0 self.running_a_loss = 0 self.training_cnt = 0 # Actor network (w/ target network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network (w/ target network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def act(self, state, mode): '''Returns actions for given state as per current policy. Params ====== state (array): current state mode (string): train or test epsilon (float): for epsilon-greedy action selection ''' state = torch.from_numpy(state).unsqueeze(0).float().to( device) # shape of state (1, state_size) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if mode == 'test': return np.clip(action, -1, 1) elif mode == 'train': # if train, then add OUNoise in action action += self.noise.sample() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for _ in range(10): # update 10 times per learning experiences = self.memory.sample() self.learn(experiences, GAMMA) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.running_c_loss += float(critic_loss.cpu().data.numpy()) self.training_cnt += 1 # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.running_a_loss += float(actor_loss.cpu().data.numpy()) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) # clip gradient to max 1 self.actor_optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
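# --- Hedged helper sketch (not from the original repos) ----------------------
# The agent above builds ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
# and relies on add(), sample() and len(), but the buffer itself is not shown.
# A minimal sketch matching that interface; the namedtuple layout and the
# module-level device are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)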
class TD3MultiAgent:

    def __init__(self):
        self.max_action = 1
        self.policy_freq = 2
        self.policy_freq_it = 0
        self.batch_size = 512
        self.discount = 0.99
        self.buffer_size = int(1e5)
        self.device = 'cuda'
        self.state_dim = 24
        self.action_dim = 2
        self.policy_noise = 0.1
        self.agents = 1
        self.random_period = 1e4
        self.tau = 5e-3

        self.replay_buffer = ReplayBuffer(self.buffer_size)

        self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
        # Optionally resume from a checkpoint:
        # self.actor.load_state_dict(torch.load('actor2.pth'))
        # self.actor_target.load_state_dict(torch.load('actor2.pth'))

        self.noise = OUNoise(2, 32)

        self.critic = Critic(48, self.action_dim).to(self.device)
        self.critic_target = Critic(48, self.action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

    def select_action_with_noise(self, state, i):
        # Act with the policy (plus exploration noise) only once the buffer has
        # collected enough purely random experience; before that, act randomly.
        if len(self.replay_buffer) > self.random_period:
            state = torch.FloatTensor(state[i, :]).to(self.device)
            action = self.actor(state).cpu().data.numpy()
            if self.policy_noise != 0:
                action = action + self.noise.sample()
            return action.clip(-self.max_action, self.max_action)
        else:
            return self.noise.sample()

    def step(self, i):
        # Sample a minibatch once the buffer holds enough experience
        if len(self.replay_buffer) > self.random_period / 2:
            s, a, r, s_, d = self.replay_buffer.sample(self.batch_size)

            state = torch.FloatTensor(s[:, i, :]).to(self.device)
            action = torch.FloatTensor(a[:, i, :]).to(self.device)
            next_state = torch.FloatTensor(s_[:, i, :]).to(self.device)
            a_state = torch.FloatTensor(s).to(self.device).reshape(-1, 48)
            a_next_state = torch.FloatTensor(s_).to(self.device).reshape(-1, 48)
            not_done = torch.FloatTensor(1 - d[:, i]).to(self.device)
            reward = torch.FloatTensor(r[:, i]).to(self.device)

            # Select action with the actor target and apply clipped noise
            # (TD3 target policy smoothing)
            noise = torch.FloatTensor(a[:, i, :]).data.normal_(0, self.policy_noise).to(self.device)
            noise = noise.clamp(-0.1, 0.1)  # noise clip
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Compute the target Q value from the minimum of the twin critics
            target_Q1, target_Q2 = self.critic_target(a_next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward.reshape(-1, 1) + (not_done.reshape(-1, 1) * self.discount * target_Q).detach()

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(a_state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if self.policy_freq_it % self.policy_freq == 0:
                # Compute actor loss
                actor_loss = -self.critic.Q1(a_state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            self.policy_freq_it += 1

        return True

    def reset(self):
        self.policy_freq_it = 0
        self.noise.reset()
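# --- Hedged helper sketch (not from the original repo) -----------------------
# TD3MultiAgent above expects its critic to return a pair of Q estimates from
# forward(state, action) and to expose a separate Q1(state, action) for the
# delayed actor update. A minimal twin-headed sketch under those assumptions,
# named TwinCritic here to avoid clashing with the other Critic networks in
# this section; the hidden size is a placeholder.
import torch
import torch.nn as nn
import torch.nn.functional as F


class TwinCritic(nn.Module):

    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        # Q1 head
        self.l1 = nn.Linear(state_dim + action_dim, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, 1)
        # Q2 head
        self.l4 = nn.Linear(state_dim + action_dim, hidden)
        self.l5 = nn.Linear(hidden, hidden)
        self.l6 = nn.Linear(hidden, 1)

    def forward(self, state, action):
        xu = torch.cat([state, action], dim=1)
        q1 = self.l3(F.relu(self.l2(F.relu(self.l1(xu)))))
        q2 = self.l6(F.relu(self.l5(F.relu(self.l4(xu)))))
        return q1, q2

    def Q1(self, state, action):
        xu = torch.cat([state, action], dim=1)
        return self.l3(F.relu(self.l2(F.relu(self.l1(xu)))))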
class DDPGAgent: '''Class representing the DDPG algorithm''' def __init__(self, state_size, action_size, config): '''Class constructor and parameters initialization''' self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') print(f'Using {self.device}') self.timestep = 0 seed = config['seed'] self.gamma = config['gamma'] self.state_size = state_size self.action_size = action_size self.num_agents = config['number_agents'] # Learns argmax_a[Q(s, a); theta_mu] = mu(s, a; theta_mu) self.learnt_actor = Actor(seed, state_size, action_size).to(self.device) # learnt self.target_actor = Actor(seed, state_size, action_size).to( self.device) # soft-update tracking self.actor_optim = optim.Adam(self.learnt_actor.parameters(), lr=config['actor_lr']) # Learns to evaluate Q(s, mu(s, a); theta_q) self.learnt_critic = Critic(seed, state_size, action_size, 1).to(self.device) # learnt self.target_critic = Critic(seed, state_size, action_size, 1).to(self.device) # soft-update tracking self.critic_optim = optim.Adam(self.learnt_critic.parameters(), lr=config['critic_lr']) print( f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}' ) # Note: Could be replaced by parallel env batching self.batch_size = config['batch_size'] self.memory = Memory(config['memory_size'], self.batch_size, seed) self.memory.to_device(self.device) # Soft-update self.tau = config['tau'] # Noise self.noise = OUNoise(action_size, seed) self.noise_decay = config['noise_decay'] def reset(self): '''Reset the noise state''' self.noise.reset() def act(self, states): '''Sample an action from the policy''' states = torch.tensor(states, dtype=torch.float32, device=self.device) self.learnt_actor.eval() with torch.no_grad(): actions = self.learnt_actor(states).cpu().data.numpy() self.learnt_actor.train() actions += self.noise_decay * self.noise.sample() return np.clip(actions, -1, 1) def remember(self, states, actions, rewards, next_states, dones): '''Populates the replay memory with new batch of data''' n = len(states) assert (n == len(actions)) assert (n == len(rewards)) assert (n == len(next_states)) assert (n == len(dones)) for (state, action, reward, next_state, done) in zip(states, actions, rewards, next_states, dones): self.memory.add(Experience(state, action, reward, next_state, done)) def step(self, timestep): '''Wraps and controls the training of the function approximators using soft-updating''' if len(self.memory ) > self.batch_size and self.timestep % LEARN_EVERY == 0: for _ in range(ITERS): states, actions, rewards, next_states, dones = self.memory.sample( ) self.__learn(states, actions, rewards, next_states, dones) def __learn(self, states, actions, rewards, next_states, dones): '''Optimizes the function apprximators and soft-updates''' self.__optimize_critic(states, actions, rewards, next_states, dones) self.__optimize_actor(states) self.__soft_update(self.learnt_actor, self.target_actor, self.tau) self.__soft_update(self.learnt_critic, self.target_critic, self.tau) self.noise_decay *= self.noise_decay self.reset() def __optimize_critic(self, states, actions, rewards, next_states, dones): '''Optimizes the critic approximator''' best_next_actions = self.target_actor(next_states) q_targets = rewards + self.gamma * self.target_critic( next_states, best_next_actions) * (1 - dones) q_predictions = self.learnt_critic(states, actions) self.critic_optim.zero_grad() critic_loss = F.mse_loss(q_predictions, q_targets) critic_loss.backward() 
torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 1) self.critic_optim.step() def __optimize_actor(self, states): '''Optimizes the actor approximator''' best_current_actions = self.learnt_actor(states) advantage = -self.learnt_critic(states, best_current_actions).mean() self.actor_optim.zero_grad() advantage.backward() self.actor_optim.step() def __soft_update(self, learnt, target, tau): '''Soft-updates the target parameters''' for learnt_param, target_param in zip(learnt.parameters(), target.parameters()): target_param.data.copy_(tau * learnt_param.data + (1.0 - tau) * target_param.data)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.best_score = -np.inf self.score = 0 # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.11 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.score = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state self.score += reward if done: if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
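# --- Hedged usage sketch (not from the original repo) ------------------------
# How the Keras-based DDPG agent above might be driven. `task` is assumed to be
# a Udacity-style task object whose step() returns (next_state, reward, done),
# and the episode budget is arbitrary.
agent = DDPG(task)
for i_episode in range(1, 1001):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d}  score: {:8.3f}  best: {:8.3f}".format(
        i_episode, agent.score, agent.best_score))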
class Agent: """Initeracts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size # Actor Networks both Local and Target. self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Networks both Local and Target. self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Noise process self.noise = OUNoise(action_size, random_seed) self.noise_modulation = 1 self.noise_decay = NOISE_DECAY # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Count number of steps self.n_steps = 0 self.update_every = UPDATE_EVERY def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) # Learn if enough samples are available in memory if len(self.memory) > BATCH_SIZE and self.n_steps % self.update_every == 0: experiences = self.memory.sample() self.learn(experiences, GAMMA) self.noise_modulation *= self.noise_decay self.n_steps += 1 def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise_modulation * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.n_steps = 0 self.noise.reset() def learn(self, experiences, gamma): """Update policy and value paramters given batch of experience tuples. Q_targets = r + gamma * cirtic_target(next_state, actor_state(next)state) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update critic # Get predicted next-state actions and Q-values from target models. 
actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() # Clear gradient critic_loss.backward() # Backpropagation torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # Update parameters # Update actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() # Clear gradient actor_loss.backward() # Backpropagation torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # Update parameters # Now we update the target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. theta_target = tau * theta_local + (1 - tau) * theta_target Params ====== local_model: PyTorch model (weight source) target_model: PyTorch model (weight destination) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
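# --- Hedged usage sketch (not from the original repo) ------------------------
# A loop that could drive the DDPG agent above. A Unity ML-Agents style
# environment (env, brain_name, vector_observations, local_done) is assumed,
# as are the episode and step budgets.
def train(agent, env, brain_name, n_episodes=2000, max_t=1000):
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)                  # policy action + OU noise
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        print("Episode {}\tScore: {:.2f}".format(i_episode, score))
    return scores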
class Actor(): def __init__(self, action_size, state_size,buffer_size, batch_size,actor_lr,critic_lr,device,weight_decay, tau,shared_memory,noise, share_memory_flag, seed=0): self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.actor_lr = actor_lr self.weight_decay = weight_decay self.device = device self.seed= seed self.actor_loss =[] #self.critic_loss =[] torch.manual_seed(seed) np.random.seed(seed) self.tau = tau self.noise= OUNoise(self.action_size,self.seed) #self.noise = noise self.share_memory_flag = share_memory_flag if self.share_memory_flag: self.memory = shared_memory else: self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device) ## Actor self.actor_local = ActorNN(self.state_size,self.action_size).to(self.device) self.actor_target = ActorNN(self.state_size,self.action_size).to(self.device) self.actor_optimizer = Adam(self.actor_local.parameters(), lr = self.actor_lr) ## Critic #self.critic_local = Critic(self.state_size,self.action_size).to(self.device) #self.critic_target = Critic(self.state_size,self.action_size).to(self.device) #self.critic_optimizer = Adam(self.critic_local.parameters(), lr = self.critic_lr, weight_decay=self.weight_decay) # initialize targets same as original networks self.hard_update(self.actor_target, self.actor_local) #self.hard_update(self.critic_target, self.critic_local) def reset(self): self.noise.reset() def act(self, state,noise = True,sd=1e-4): state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): #print(state.shape) action = self.actor_local(state).cpu().data.numpy() ##action.cpu().detach().numpy() self.actor_local.train() if noise: #print(type(action)) #action += np.random.normal(loc=0.0, scale=sd, size=action.size) action += self.noise.sample() action = np.clip(action, -1,1).reshape(1,-1) return action def hard_update(self,target, source): """ Copy network parameters from source to target Inputs: target (torch.nn.Module): Net to copy parameters to source (torch.nn.Module): Net whose parameters to copy """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def step(self, state, action, rewards, next_state, done,GAMMA=1.0): ## As per the description we are not supposed to use discount factor self.memory.add(state, action, rewards, next_state, done)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, engine): self.task = engine self.width = engine.width self.height = engine.height self.state_size = engine.state_size self.action_size = engine.action_size self.action_low = engine.action_low self.action_high = engine.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,self.width,self.height) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size,self.width,self.height) self.critic_target = Critic(self.state_size, self.action_size,self.width,self.height) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory # print(self.last_state) #print(action) #print(reward) #print(next_state) #print(done) #print('----') if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" #print(self.state_size) #state = np.reshape(state, [1, self.state_size]) #print(state.shape) #print('act') action = self.actor_local.model.predict(state.reshape(1,self.state_size))[0] #action = action.squeeze(0).argmax() return list(action + self.noise.sample()) # add some noise for exploration def act1(self, state): """Returns actions for given state(s) as per current policy.""" #print(state) #print('act') state = np.reshape(state, [-1, self.state_size]) #print(state) #print('act') action = self.actor_local.model.predict(state.reshape(1,self.state_size))[0] #my_state.reshape(1, OBSERVATION_SPACE) #print(action) action = np.argmax(action) #print(action) return action #return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]).reshape(-1, self.state_size) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) #actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]).reshape(-1, self.state_size) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) #print(next_states) #print('next_states') actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) file_output = 'data1.txt' with open(file_output, 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(np.array(Q_targets_next) ) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) #print(Q_targets.shape) #print(actions.shape) #print(Q_targets.shape) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model arameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def reset_episode(self): self.noise.reset() #state = self.task.clear() self.task.clear() #self.last_state = state self.last_state =self.task.board return self.task.board
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, agent_id, random_seed): """Initialize a ddpg_agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action agent_id (int): identifier for this agent """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.agent_id = agent_id self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Make sure that the target-local model pairs are initialized to the # same weights self.hard_update(self.actor_local, self.actor_target) self.hard_update(self.critic_local, self.critic_target) self.noise = OUNoise(action_size, random_seed) self.noise_amplification = NOISE_AMPLIFICATION self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY ### self._print_network() def act(self, state, add_noise=False): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() self._decay_noise_amplification() return np.clip(action, -1, 1) def reset(self): """Resets the OU Noise for this agent.""" self.noise.reset() def learn(self, experiences, next_actions, actions_pred): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(next_state) -> action critic_target(next_state, next_action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples next_actions (list): next actions computed from each agent actions_pred (list): prediction for actions for current states from each agent """ states, actions, rewards, next_states, dones = experiences agent_id_tensor = torch.tensor([self.agent_id - 1]).to(device) ### Update critic self.critic_optimizer.zero_grad() Q_targets_next = self.critic_target(next_states, next_actions) Q_targets = rewards.index_select(1, agent_id_tensor) + \ (GAMMA * Q_targets_next * (1 - dones.index_select(1, agent_id_tensor))) Q_expected = self.critic_local(states, actions) # Minimize the loss critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss.backward() self.critic_optimizer.step() ### Update actor self.actor_optimizer.zero_grad() # Minimize the loss actor_loss = -self.critic_local(states, actions_pred).mean() actor_loss.backward() self.actor_optimizer.step() ### Update target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def hard_update(self, local_model, target_model): """Hard update model parameters. θ_target = θ_local Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ * θ_local + (1 - τ) * θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def _decay_noise_amplification(self): """Helper for decaying exploration noise amplification.""" self.noise_amplification *= self.noise_amplification_decay
class Agent(): def __init__(self, state_size, action_size): # Constants self.buffer_size = int(1e6) self.batch_size = 128 self.learning_rate = 1e-4 self.learn_every = 2 self.learning_rounds = 4 self.gamma = 0.99 self.tau = 1e-3 self.t = 0 self.state_size = state_size self.action_size = action_size self.eps = 5.0 self.eps_decay = 1 / (300 * self.learning_rounds) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.learning_rate) self.noise = OUNoise((1, action_size)) self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size) def step(self, state, action, reward, next_state, done, agent_number): self.t += 1 self.memory.add(state, action, reward, next_state, done) if len(self.memory ) > self.batch_size and self.t % self.learn_every == 0: for _ in range(self.learning_rounds): experiences = self.memory.sample() self.learn(experiences, self.gamma, agent_number) def act(self, states, add_noise): states = torch.from_numpy(states).to(device).float() # Get the actions for this agent with torch.no_grad(): actions = self.actor_local( states.squeeze()).unsqueeze(0).cpu().data.numpy() if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): states, actions, rewards, next_states, dones = experiences # Find the best action according to target network actions_next = self.actor_target(next_states) if agent_number == 0: #Get the first two actions actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: #Get the second two action actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) # Compute loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # Clip the gradients to avoid exploding gradients torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # Find the best action according to local network actions_pred = self.actor_local(states) if agent_number == 0: #Get the first two actions actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: #Get the second two actions actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target network ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) # Update noise param eps self.eps -= self.eps_decay self.eps = max(self.eps, 0) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Environment: """ Train & simulate wrapper for Atari-DQN Args: params: dictionary of parameters memory_size : size of replay memory. 100000 needs almost 25GB memory, recommend reduce it if you need exploration_step : pure exploration step gamma : discount rate tau: parameter for soft update lr_actor: learning rate for actor network lr_critic: learning rate for critic network device_name : name of device(normally cpu:0 or gpu:0) """ def __init__(self, params, device_name): self.env = gym.make('Pendulum-v0') self.ddpg = DDPG(input_dim=self.env.observation_space.shape[0], action_dim=self.env.action_space.shape[0], action_scale=(self.env.action_space.low[0], self.env.action_space.high[0]), memory_size=params["memory_size"], gamma=params["gamma"], tau=params["tau"], learning_rate_actor=params["lr_actor"], learning_rate_critic=params["lr_critic"], device_name=device_name) self.ddpg.build() self.ddpg.summary() self.random_process = OUNoise(size=self.env.action_space.shape[0]) # total step operated self.i_step = 0 def load(self, global_step="latest"): """ Load saved weights for ddpg Args: global_step : load specific step, if "latest" load latest one """ self.ddpg.load(global_step) def save(self): """ Save current weight of ddpg layers """ self.ddpg.save() def train(self, episode, max_step, minibatch_size, render=False, verbose=1, val_epi=5, saving=False): """run the game with training network Args: episode : number of train episodes max_step : maximum step for each episode minibatch_size : minibatch size for replay memory training render : whether to show game simulating graphic verbose : for which step it will print the loss and accuracy (and saving) val_epi : number of episode for validation saving: whether to save checkpoint or not """ losses = [] episode_return = [] verbose_return = [] episode_return_val = [] tr = trange(episode, desc="") for i_episode in tr: return_episode = 0 observation = self.env.reset() self.random_process.reset() for t in range(max_step): self.i_step += 1 if render: self.env.render() X = observation.astype(np.float32) action_policy = self.ddpg.get_action(tf.convert_to_tensor(X)) action_policy += self.random_process.sample() action_policy = np.clip(action_policy, self.env.action_space.low[0], self.env.action_space.high[0]) observation, reward, done, info = self.env.step(action_policy) return_episode += reward X_next = observation.astype(np.float32) self.ddpg.replay_memory.append( (X, action_policy, reward, X_next, done)) # training step if len(self.ddpg.replay_memory) > minibatch_size: X_batch, action_batch, reward_batch, X_next_batch, done_batch = self.ddpg.replay_memory.get_batch( minibatch_size) loss_critic, loss_actor = self.ddpg.train( X_batch, action_batch, reward_batch, X_next_batch, done_batch) losses.append((loss_critic, loss_actor)) if done: break episode_return.append(return_episode) verbose_return.append(return_episode) tr.set_description("%.4f" % (sum(episode_return) / len(episode_return))) if i_episode == 0 or ((i_episode + 1) % verbose == 0): if len(self.ddpg.replay_memory) <= minibatch_size: stage_tooltip = "EXPLORATION" print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" % (i_episode + 1, self.i_step, stage_tooltip)) print(Fore.GREEN + "Learned Step : %4d" % (self.ddpg.global_step)) print(Fore.BLUE + "AVG Return : %.4f" % (sum(verbose_return) / len(verbose_return))) print(Fore.BLUE + "MAX Return : %.4f" % (max(verbose_return))) continue else: stage_tooltip = "TRAINING" losses_critic = [l[0] for l in losses] losses_actor = [l[1] for l in losses] # 
validation returns = [] for epi_val in range(val_epi): return_episode_val = 0 observation = self.env.reset() for t in range(max_step): if render: self.env.render() action_policy = self.ddpg.get_action( tf.convert_to_tensor(observation.astype( np.float32))) observation, reward, done, info = self.env.step( action_policy) return_episode_val += reward if done: # print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode)) returns.append(return_episode_val) break print(Fore.RED + "[EPISODE %3d / STEP %5d] - %s" % (i_episode + 1, self.i_step, stage_tooltip)) print(Fore.GREEN + "Learned Step : %4d" % (self.ddpg.global_step)) print(Fore.BLUE + "AVG Return : %.4f" % (sum(verbose_return) / len(verbose_return))) print(Fore.BLUE + "MAX Return : %.4f" % (max(verbose_return))) print(Fore.LIGHTYELLOW_EX + "AVG LOSS Actor : %.4f" % (sum(losses_actor) / len(losses_actor))) print(Fore.LIGHTYELLOW_EX + "AVG LOSS Critic : %.4f" % (sum(losses_critic) / len(losses_critic))) print(Fore.LIGHTRED_EX + "AVG VAL[%2d] Return : %.4f" % (val_epi, sum(returns) / len(returns))) print(Fore.LIGHTRED_EX + "MAX VAL[%2d] Return : %.4f" % (val_epi, max(returns))) verbose_return = [] losses = [] episode_return_val.append(sum(returns) / len(returns)) if saving: self.save() time.sleep(1) return episode_return def simulate(self, episode, max_step=1000, render=False): """Run the game with existing dqn network Args: episode : number of train episodes max_step : maximum step for each episode render : whether to show game simulating graphic """ returns = [] for i_episode in range(episode): return_episode = 0 observation = self.env.reset() for t in range(max_step): if render: self.env.render() action_policy = self.ddpg.get_action( tf.convert_to_tensor(observation.astype(np.float32))) observation, reward, done, info = self.env.step(action_policy) return_episode += reward if done: print(Fore.GREEN + "EPISODE %3d: REWARD: %s" % (i_episode, return_episode)) returns.append(return_episode) break print(Fore.RED + "AVG REWARD : %s" % (sum(returns) / len(returns))) print(Fore.BLUE + "MAX REWARD : %s" % (max(returns)))
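# --- Hedged usage sketch (not from the original repo) ------------------------
# Driving the Environment wrapper above; the parameter values and the device
# string are assumptions, only the keys and call signatures come from the
# class itself.
params = {
    "memory_size": 100000,
    "gamma": 0.99,
    "tau": 1e-3,
    "lr_actor": 1e-4,
    "lr_critic": 1e-3,
}
env = Environment(params, device_name="gpu:0")
returns = env.train(episode=200, max_step=200, minibatch_size=64,
                    render=False, verbose=10, val_epi=5, saving=True)
env.simulate(episode=5, max_step=200, render=True)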
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.eps = 3.0 self.eps_decay = 0.9999 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_target = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=0) # Noise process self.noise = OUNoise((1, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number, learn_iterations=5): """Save experience in replay memory, and use random sample from buffer to learn.""" #self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory ) > BATCH_SIZE: #and self.timestep % LEARN_EVERY == 0: for _ in range(learn_iterations): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) elif agent_number == 1: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) elif agent_number == 1: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update epsilon self.eps *= self.eps_decay self.eps = max(self.eps, 1) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DdpgAgent(object):

    def __init__(self, name, state_size, action_size, joint_state_size,
                 joint_action_size, actor_lr, critic_lr, device):
        self.name = name
        self.device = device
        self.noise = OUNoise(action_size, sigma=0.1)

        self.actor_local = Actor(state_size, action_size, fc1=64, fc2=64).to(device)
        self.actor_target = Actor(state_size, action_size, fc1=64, fc2=64).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr)

        self.critic_local = Critic(joint_state_size, joint_action_size, fc1=64, fc2=64).to(device)
        self.critic_target = Critic(joint_state_size, joint_action_size, fc1=64, fc2=64).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

    def reset(self):
        self.noise.reset()

    def hard_copy_weights(self, target, source):
        """Copy weights from source to target network (part of initialization)."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, states, eps, add_noise=True):
        """Return actions (as a NumPy array) for the given state.

        With probability eps a uniformly random action is returned; otherwise
        the local actor is queried and OU noise is optionally added.

        Params:
            states: 1D tensor/array of states for which to get actions.
            eps (float): probability of taking a uniformly random action.
            add_noise (bool): whether to add OU noise to the policy action.
        """
        s = np.random.random()
        if s < eps:
            return np.random.uniform(-1, 1, 2)
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states.unsqueeze(0)).squeeze(0).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences, joint_predicted_actions_from_state,
              joint_predicted_actions_from_next_state):
        """Update the local actor and critic from a batch of joint experiences."""
        joint_states, joint_actions, rewards, joint_next_states, dones = experiences

        # ---- critic update ----
        q_targets_next = self.critic_target(joint_next_states,
                                            joint_predicted_actions_from_next_state)
        q_targets = rewards + (torch.Tensor(dones.shape).fill_(GAMMA).to(self.device)
                               * q_targets_next * (1 - dones))
        q_expected = self.critic_local(joint_states, joint_actions)
        critic_loss = torch.nn.SmoothL1Loss()(q_expected, q_targets.detach())
        # critic_loss = F.mse_loss(q_expected, q_targets.detach())
        critic_loss_value = critic_loss.item()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---- actor update ----
        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic_local(joint_states,
                                        joint_predicted_actions_from_state).mean()
        actor_loss_value = actor_loss.item()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        return critic_loss_value, actor_loss_value

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent: def __init__(self, state_size, action_size, rand_seed, actor_lr, critic_lr): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.device = device self.mem = ReplayBuffer(device=device, batch_size=MEMORY_BATCH_SIZE, memory_size=MEMORY_SIZE) self.noise = OUNoise(action_size, rand_seed) self.actor_local = Actor2(state_size, action_size, fc1=400, fc2=300, seed=rand_seed) self.actor_target = Actor2(state_size, action_size, fc1=400, fc2=300, seed=rand_seed) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr) self.critic_local = Critic2(state_size, action_size, fc1=400, fc2=300, seed=rand_seed) self.critic_target = Critic2(state_size, action_size, fc1=400, fc2=300, seed=rand_seed) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr, weight_decay=0) def load_state(self, state_name): """ Restore state (=weights). Params: state_name (string): Name of the state to restore agent from. """ self.actor_local.load_state_dict( torch.load(f'states/actor_{state_name}.pth')) self.critic_local.load_state_dict( torch.load(f'states/critic_{state_name}.pth')) def save_state(self, state_name): """ Save current state (=weights). This can later be restored using `load_state`. Params: state_name (string): Name to be used to save the state as. """ torch.save(self.actor_local.state_dict(), f'states/actor_{state_name}.pth') torch.save(self.critic_local.state_dict(), f'states/critic_{state_name}.pth') def act(self, states, add_noise=True): """ Get actions to take from given states. Tensors will be returned on the device on which agent is configured to run on. params: states: 1D tenor of states for which to get actions to return. """ states = torch.from_numpy(states).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def learn(self): """ Start learning process """ if len(self.mem) > MEMORY_BATCH_SIZE: experiences = self.mem.sample() self._do_learn(experiences) def _do_learn(self, experiences): states, actions, rewards, next_states, dones = experiences q_targets = rewards + GAMMA * self.critic_target( next_states, self.actor_target(next_states)) * (1 - dones) q_expectations = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expectations, q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() actions_loss = -self.critic_local(states, self.actor_local(states)).mean() self.actor_optimizer.zero_grad() actions_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def step(self, sarss): """ Add each of (state, action, reward, next_state, done) tuples to memory. params: sarss: an array of (state, action, reward, next, done) tuples to add to the memory. 
""" for state, action, reward, next_state, done in sarss: self.mem.add(state, action, reward, next_state, done)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_shape, action_size, num_agents, buffer_size, batch_size, gamma, tau, learning_rate_actor, learning_rate_critic, device, update_every=1, random_seed=42): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents acting in the environment buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): used for soft update of target parameters learning_rate_actor (float): learning rate for the actor learning_rate_critic (float): learning rate for the critic device (torch.Device): pytorch device update_every (int): how many time steps between network updates seed (int): random seed """ self.state_shape = state_shape self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.device = device self.update_every = update_every self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(action_size, random_seed).to(device) self.actor_target = Actor(action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(action_size, random_seed).to(device) self.critic_target = Critic(action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic, weight_decay=0) # Noise process self.noise = OUNoise(size=action_size, seed=random_seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, device=device, seed=random_seed) # Initialize time step (for updating every self.update_every steps) self.t_step = 0 def add(self, state, action, reward, next_state, done): """Add a new experience to memory.""" next_state_torch = torch.from_numpy(next_state).float().to(self.device) reward_torch = torch.from_numpy(np.array(reward)).float().to( self.device) done_torch = torch.from_numpy(np.array(done).astype( np.uint8)).float().to(self.device) state_torch = torch.from_numpy(state).float().to(self.device) action_torch = torch.from_numpy(action).float().to(self.device) self.actor_target.eval() self.critic_target.eval() self.critic_local.eval() with torch.no_grad(): action_next = self.actor_target(next_state_torch) Q_target_next = self.critic_target(next_state_torch, action_next) Q_target = reward_torch + (self.gamma * Q_target_next * (1 - done_torch)) Q_expected = self.critic_local(state_torch, action_torch) self.actor_local.train() self.critic_target.train() self.critic_local.train() #Error used in prioritized replay buffer error = (Q_expected - Q_target).squeeze().cpu().data.numpy() #Adding experiences to prioritized replay buffer #for i in np.arange(len(reward)): self.memory.add(error, state, action, reward, next_state, done) def step(self, state, action, reward, next_state, done): """Save experience in replay memory.""" # Save experience / reward self.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: if len(self.memory) > self.batch_size: experiences, idxs, is_weights = self.memory.sample() self.learn(experiences, idxs, is_weights) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = 
torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, idxs, is_weights):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            idxs (list): indices of the sampled experiences in the prioritized buffer
            is_weights (np.ndarray): importance-sampling weights for the sampled batch
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss, weighting each sample's squared error by its
        # importance-sampling weight (element-wise loss, then mean)
        Q_expected = self.critic_local(states, actions)
        is_weights = torch.from_numpy(is_weights).float().to(self.device)
        critic_loss = (is_weights * F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping (optional)
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # -------------- update priorities in the prioritized replay buffer -------------- #
        # Recompute the TD errors used as priorities
        errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy()
        for i in range(self.batch_size):
            self.memory.update(idxs[i], errors[i])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
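# The prioritized replay buffer sampled by the agent above returns (experiences, idxs, is_weights).
# For reference, the standard importance-sampling weights from Schaul et al. (2016) are
#     P(i) = p_i**alpha / sum_k p_k**alpha,    w_i = (N * P(i))**(-beta) / max_j w_j.
# A minimal sketch of that computation, applied to the sampled batch only for illustration
# (the real buffer's internals are not shown in this file, so names and defaults are assumptions):
import numpy as np


def importance_sampling_weights(priorities, alpha=0.6, beta=0.4):
    """Return normalized IS weights for a batch of sampled priorities."""
    probs = priorities ** alpha
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()


# Example: importance_sampling_weights(np.array([0.5, 1.0, 2.0]))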
class DDPGAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, memory, device='cpu', params=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action memory (obj): Memory buffer to sample device (str): device string between cuda:0 and cpu params (dict): hyper-parameters """ self.state_size = state_size self.action_size = action_size self.device = device self.step_t = 0 self.update_every = params['update_every'] # Set parameters self.gamma = params['gamma'] self.tau = params['tau'] self.seed = random.seed(params['seed']) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, params['seed'], params['actor_units'][0], params['actor_units'][1]).to(device) self.actor_target = Actor(state_size, action_size, params['seed'], params['actor_units'][0], params['actor_units'][1]).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor']) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, params['seed'], params['critic_units'][0], params['critic_units'][1]).to(device) self.critic_target = Critic(state_size, action_size, params['seed'], params['critic_units'][0], params['critic_units'][1]).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params['lr_critic'], weight_decay=params['weight_decay']) # Noise process self.noise = OUNoise(action_size, params['seed'], theta=params['noise_theta'], sigma=params['noise_sigma']) # Replay memory self.memory = memory def store_weights(self, filenames): """Store weights of Actor/Critic Params ====== filenames (list): string of filename to store weights of actor and critic filenames[0] = actor weights filenames[1] = critic weights """ torch.save(self.actor_local.state_dict(), filenames[0]) torch.save(self.critic_local.state_dict(), filenames[1]) def load_weights(self, filenames): """Load weights of Actor/Critic Params ====== filenames (list): string of filename to load weights of actor and critic filenames[0] = actor weights filenames[1] = critic weights """ self.actor_local.load_state_dict(torch.load(filenames[0])) self.critic_local.load_state_dict(torch.load(filenames[1])) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) self.step_t = (self.step_t + 1) % self.update_every # Learn, if enough samples are available in memory if self.step_t == 0 and len( self.memory) > self.memory.get_batch_size(): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
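# The DDPGAgent above is configured entirely through a `params` dict. The keys below are exactly
# the ones read in its __init__; the values are placeholder, textbook-style DDPG settings rather
# than the original hyperparameters.
example_params = {
    'seed': 0,
    'gamma': 0.99,            # discount factor
    'tau': 1e-3,              # soft-update interpolation factor
    'update_every': 1,        # learn after every environment step
    'actor_units': (400, 300),
    'critic_units': (400, 300),
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'noise_theta': 0.15,      # OU noise parameters
    'noise_sigma': 0.2,
}
# agent = DDPGAgent(state_size, action_size, memory, device='cpu', params=example_params)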
class Agent(object): """DDPG Agent that interacts and learns from the environment.""" def __init__(self, state_size, action_size, device, actor_args={}, critic_args={}): """Initializes the DQN agent. Args: state_size (int): Dimension of each state action_size (int): Dimension of each action device (torch.device): Device to use for calculations actor_args (dict): Arguments describing the actor network critic_args (dict): Arguments describing the critic network """ self.state_size = state_size """Dimension of each state""" self.action_size = action_size """Dimension of each action""" self.device = device """Device to use for calculations""" self.t_step = 0 """Timestep between training updates""" # Parameters # Actor network self.actor_local = Actor(state_size, action_size, **actor_args).to(device) self.actor_target = Actor(state_size, action_size, **actor_args).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network self.critic_local = Critic(state_size, action_size, **critic_args).to(device) self.critic_target = Critic(state_size, action_size, **critic_args).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process for exploration self.noise = OUNoise(action_size, sigma=NOISE_SD) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device) def reset(self): """Reset state of agent.""" self.noise.reset() def save_weights(self, path): """Save local network weights. Args: path (string): File to save to""" torch.save( { 'actor_local': self.actor_local.state_dict(), 'actor_target': self.actor_target.state_dict(), 'critic_local': self.critic_local.state_dict(), 'critic_target': self.critic_target.state_dict() }, path) def load_weights(self, path): """Load local network weights. Args: path (string): File to load weights from""" checkpoint = torch.load(path) self.actor_local.load_state_dict(checkpoint['actor_local']) self.actor_target.load_state_dict(checkpoint['actor_target']) self.critic_local.load_state_dict(checkpoint['critic_local']) self.critic_target.load_state_dict(checkpoint['critic_target']) def act(self, state, add_noise=True): """Returns action for given state according to the current policy Args: state (np.ndarray): Current state Returns: action (np.ndarray): Action tuple """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Temporarily set evaluation mode (no dropout &c) & turn off autograd self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().detach().numpy() # Resume training mode self.actor_local.train() # Add noise if exploring if add_noise: action += self.noise.sample() # The noise might take us out of range action = np.clip(action, -1, 1) return action def step(self, state, action, reward, next_state, done): """Save experience and learn if due. 
Args: state (Tensor): Current state action (int): Chosen action reward (float): Resulting reward next_state (Tensor): State after action done (bool): True if terminal state """ self.memory.add(state, action, reward, next_state, done) # Learn as soon as we have enough stored experiences self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0 and len(self.memory) > BATCH_SIZE: for _ in range(NUM_UPDATES): experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): """Learn from batch of experiences.""" states, actions, rewards, next_states, dones = experiences # region Update Critic actions_next = self.actor_target(next_states) q_targets_next = self.critic_target(next_states, actions_next) q_targets = rewards + (GAMMA * q_targets_next * (1 - dones)) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0) self.critic_optimizer.step() # endregion # region Update Actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # endregion # Update target networks soft_update(self.critic_local, self.critic_target, TAU) soft_update(self.actor_local, self.actor_target, TAU)
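# Checkpointing for the agent above: save_weights writes all four state dicts to a single file
# and load_weights restores them. The path below is a placeholder.
# agent.save_weights('checkpoint.pth')
# ...
# agent.load_weights('checkpoint.pth')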
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, num_agents, state_size, action_size, gamma, tau, learning_rate_actor, learning_rate_critic, weight_decay, device, random_seed=42): """Initialize an Agent object (used my MultiAgent for MADDPG). Params ====== num_agents (list): number of agents acting in the environment state_size (int): dimension of each state action_size (int): dimension of each action gamma (float): discount factor tau (float): used for soft update of target parameters learning_rate_actor (float): learning rate for the actor learning_rate_critic (float): learning rate for the critic weight_decay (float): weight decay for the optimizers device (torch.Device): pytorch device random_seed (int): random seed """ self.gamma = gamma self.tau = tau self.device = device self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor, weight_decay=weight_decay) # Critic Network (w/ Target Network) self.critic_local = Critic(num_agents, state_size, action_size, random_seed).to(device) self.critic_target = Critic(num_agents, state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic, weight_decay=weight_decay) #0.0001 # Noise process self.noise = OUNoise(size=action_size, seed=random_seed) self.timestep = 0 def act(self, state, epsilon=1, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() * epsilon return np.clip(action, -1, 1) def reset(self): """Resets the noise""" self.noise.reset() def learn(self, index, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== index (int): Index of the current agent experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences all_states = torch.cat(states, dim=1).to(self.device) all_next_states = torch.cat(next_states, dim=1).to(self.device) all_actions = torch.cat(actions, dim=1).to(self.device) actions_next = actions.copy() # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next[index] = self.actor_target(next_states[index]) all_actions_next = torch.cat(actions_next, dim=1).to(self.device) Q_targets_next = self.critic_target(all_next_states, all_actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards[index] + (gamma * Q_targets_next * (1 - dones[index])) # Compute critic loss Q_expected = self.critic_local(all_states, all_actions) huber_loss = torch.nn.SmoothL1Loss() critic_loss = huber_loss(Q_expected, Q_targets.detach()) #critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = actions.copy() actions_pred[index] = self.actor_local(states[index]) all_actions_pred = torch.cat(actions_pred, dim=1).to(self.device) actor_loss = -self.critic_local(all_states, all_actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG: def __init__(self, config): self.config = config self.state_size = config.state_size self.action_size = config.action_size self.actor_local = Actor(self.state_size, self.action_size, 2).to(device) self.actor_target = Actor(self.state_size, self.action_size, 2).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.LR_ACTOR) self.critic_local = Critic(self.state_size, self.action_size, 2).to(device) self.critic_target = Critic(self.state_size, self.action_size, 2).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=config.LR_CRITIC, ) self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE) self.noise = OUNoise(self.action_size, config.random_seed) self.t_step = 0 self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) def step(self, states, actions, rewards, next_states, dones): for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0): for i in range(self.config.EPOCH): experiences = self.memory.sample(self.config.BATCH_SIZE) self.learn(experiences) def reset(self): self.noise.reset() def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences Q_targets_next = self.critic_target(next_states, self.actor_target(next_states)) Q_targets = rewards + (self.config.GAMMA * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() actor_loss = -self.critic_local(states, self.actor_local(states)).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.config.TAU) self.soft_update(self.actor_local, self.actor_target, self.config.TAU) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
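# A minimal driver loop for the DDPG class above, showing how act/step/reset are intended to be
# called each episode. The gym-style environment interface (reset/step returning per-agent
# arrays) and the episode/step counts are illustrative assumptions.
import numpy as np


def run_ddpg(env, agent, n_episodes=200, max_t=1000):
    scores = []
    for _ in range(n_episodes):
        states = env.reset()
        agent.reset()                                   # reset exploration noise each episode
        score = 0.0
        for _ in range(max_t):
            actions = agent.act(states)                 # noisy actions from the local actor
            next_states, rewards, dones, _ = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)   # store and (maybe) learn
            states = next_states
            score += np.mean(rewards)
            if np.any(dones):
                break
        scores.append(score)
    return scores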
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, mnoise=True, split_state=True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.mnoise = mnoise self.split_state = split_state # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # initialize targets same as original networks self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) # Noise process if self.mnoise: self.noise = OUNoise((2, action_size), random_seed) else: self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones, step): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if self.split_state: for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) else: self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): """ Copy network parameters from source to target Inputs: target (torch.nn.Module): Net to copy parameters to source (torch.nn.Module): Net whose parameters to copy """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
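# The soft update used throughout this file, θ_target = τ*θ_local + (1 - τ)*θ_target, can be
# sanity-checked in isolation. A standalone sketch on two small layers (τ=1 reproduces the
# hard_update used to initialise the target networks):
import torch.nn as nn


def soft_update(local_model, target_model, tau):
    """Interpolate target parameters towards the local parameters."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


local_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(local_net, target_net, tau=1.0)    # hard copy
soft_update(local_net, target_net, tau=1e-3)   # typical per-step soft update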
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, add_noise = True, PER = False, PSN = True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.add_noise = add_noise # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.PSN = PSN if self.add_noise: if self.PSN: self.noise = PSNoise(state_size, action_size, random_seed) else: self.noise = OUNoise(action_size, random_seed) # Replay memory self.PER = PER if self.PER: self.memory = ReplayBufferPE(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, alpha = ALPHA) self.beta = BETA_INITIAL else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Initialize learning steps self.learn_step = 0 def reset(self): if self.add_noise: if self.PSN: self.noise.reset(self.actor_local) else: self.noise.reset() def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE: # Learn, if enough samples are available in memory for number of timesteps for _ in range(STEPS_UPDATE): experiences = self.memory.sample() self.learn(experiences, GAMMA) # LEARN_EVERY time steps. ''' self.learn_step = (self.learn_step + 1) % LEARN_EVERY if self.learn_step == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for _ in range(STEPS_UPDATE): experiences = self.memory.sample() self.learn(experiences, GAMMA) ''' def act(self, state, epsilon = 1, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() #If add_noise = True: if self.add_noise: #Add AS or PS noise: if self.PSN: # Parameter Space Noise if len(self.memory) > BATCH_SIZE: # PS noise needs to sample from memory to perturbate actor weights self.noise.update_noise(self.actor_local, states_batch = self.memory.sample()[0]) with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # Action Space Noise else: with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() action += self.noise.sample() #If add_noise = False, no noise is added else: with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() #For all cases, return clipped action return np.clip(action, -1, 1) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples.""" states, actions, rewards, next_states, dones = experiences #Clip rewards #rewards_ = torch.clamp(rewards, min=-1., max=1.) 
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q expected
        Q_expected = self.critic_local(states, actions)

        # Compute critic loss
        if self.PER:
            # Update beta (importance-sampling exponent, annealed over training)
            self.beta += BETA_INCREMENT
            # Get importance-sampling weights from the replay buffer
            weights = self.memory.get_weights(self.beta)
            # Clip abs(TD_errors) to [0, 1]
            TD_errors = torch.clamp(torch.abs(Q_targets - Q_expected), min=0., max=1.)
            # Update replay buffer with proportional priorities
            self.memory.update_priorities(TD_errors)
            # Compute weighted MSE loss
            critic_loss = torch.mean(weights * (Q_expected - Q_targets) ** 2)
        else:
            # Compute plain MSE loss
            critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip critic gradient
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()  # negative: gradient ascent
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates for target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, agents=2, every=4, updates=4): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size random.seed(np.random.seed(random_seed)) self.agents = agents self.every = every self.updates = updates self.steps = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noises = OUNoise((agents, action_size)) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device) def load_actor(self, model_file: str): self.actor_local.load_state_dict( torch.load(model_file, map_location=device)) self.actor_local.to(device) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.steps += 1 for i in range(self.agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and self.steps % self.every == 0: self.steps = 0 for _ in range(self.updates): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noises.sample() return np.clip(actions, -1, 1) def reset(self): self.noises.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # Gradient clipping self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
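# None of the ReplayBuffer variants referenced in this file are shown. A minimal sketch of the
# plain uniform buffer that the agents above expect (add / sample / __len__). The constructor
# arguments mirror the calls above, but the field handling and device logic here are
# illustrative assumptions, not the original implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer storing experience tuples and sampling them uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed=0, device='cpu'):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple('Experience',
                                     ['state', 'action', 'reward', 'next_state', 'done'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        stack = lambda field: torch.from_numpy(
            np.vstack([getattr(e, field) for e in batch]).astype(np.float32)).to(self.device)
        return (stack('state'), stack('action'), stack('reward'),
                stack('next_state'), stack('done'))

    def __len__(self):
        return len(self.memory)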
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, number_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed number_agents (int): number of agents """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.number_agents = number_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise processes self.noise = OUNoise((number_agents, action_size), random_seed) #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experiences in replay memory, and use random sample from buffer to learn.""" # We save experience tuples in the memory for each agent. for i in range(self.number_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn, if enough samples are available in memory (threshold value: BATCH_SIZE) and at learning interval settings if len(self.memory) > BATCH_SIZE: for _ in range(UPDATE_RATE): experiences = self.memory.sample() self.learn(experiences, GAMMA) # def act(self, states, add_noise=True): # """Returns actions for given state as per current policy.""" # # The code has been adapted to implement batch normalization. # actions = np.zeros((self.number_agents, self.action_size)) # self.actor_local.eval() # with torch.no_grad(): # for agent_number, state in enumerate(states): # state = torch.from_numpy(state).float().unsqueeze(0).to(device) # The code has been adapted to implement batch normalization. # action = self.actor_local(state).cpu().data.numpy() # actions[agent_number, :] = action # self.actor_local.train() # if add_noise: # actions += self.noise.sample() # return np.clip(actions, -1, 1) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.number_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_number, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_number, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
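# The commented-out alternatives in the class above refer to GaussianNoise and
# GeometricBrownianNoise helpers that are not included in this file. A minimal Gaussian
# action-noise sketch in the same size/seed/sigma style (purely illustrative, not the original
# class):
import numpy as np


class GaussianNoise:
    """Stateless Gaussian exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, sigma=0.2):
        self.size = size
        self.mu = mu
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)

    def reset(self):
        pass  # stateless: nothing to reset

    def sample(self):
        return self.rng.normal(self.mu, self.sigma, self.size)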
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, agent_id): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, 256, 256, random_seed).to(device) self.actor_target = Actor(state_size, action_size, 256, 256, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # critic input critic_state_size = (state_size + action_size) * 2 # Critic Network (w/ Target Network) self.critic_local = Critic(critic_state_size, 256, 256, random_seed).to(device) self.critic_target = Critic(critic_state_size, 256, 256, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # agent id self.id_agent = agent_id # set weights the same for both models self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def act(self, state, noise_counter, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: if noise_counter < NOISE_LEVEL: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, all_actions, all_next_actions, agent_id): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor next_actions (list): list of agents next actions actions (list): list of agents actions agent_id (int): agent_id, needed to distinguish between agents """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # agent_id = torch.tensor([agent_id]).to(device) all_next_actions = torch.cat(all_next_actions, dim=1).to(device) # Get predicted next-state actions and Q values from target models with torch.no_grad(): Q_targets_next = self.critic_target(next_states, all_next_actions) # Compute Q targets for current states (y_i) Q_targets = rewards.index_select( 1, agent_id) + (gamma * Q_targets_next * (1 - dones.index_select(1, agent_id))) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = [ actions if i == self.id_agent else actions.detach() for i, actions in enumerate(all_actions) ] actions_pred = torch.cat(actions_pred, dim=1).to(device) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): """Copy weights from source to target network, modified version of agent.soft_update()""" for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
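# In a two-agent MADDPG setup, each Agent above is trained on the joint experience: a
# coordinating wrapper (not included in this file, so the names below are hypothetical) gathers
# every agent's current and next actions before calling learn(). Roughly:
#
#     for i, agent in enumerate(agents):
#         all_actions      = [a.actor_local(states[j]) for j, a in enumerate(agents)]
#         all_next_actions = [a.actor_target(next_states[j]) for j, a in enumerate(agents)]
#         agent.learn(experiences, GAMMA, all_actions, all_next_actions, agent_id=i)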
class Agent(): '''Interact with and learn from environment.''' def __init__(self, state_size, action_size, seed): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.t_step = 0 # counter for activating learning every few steps self.running_c_loss = 0 self.running_a_loss = 0 self.training_cnt = 0 # Actor network (w/ target network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network (w/ target network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, seed) # Prioritized replay memory self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed) def act(self, state, mode): '''Returns actions for given state as per current policy. Params ====== state (array): current state mode (string): train or test epsilon (float): for epsilon-greedy action selection ''' state = torch.from_numpy(state).unsqueeze(0).float().to( device) # shape of state (1, state_size) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if mode == 'test': return np.clip(action, -1, 1) elif mode == 'train': # if train, then add OUNoise in action action += self.noise.sample() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): # add new experience in memory self.prioritized_memory.add(state, action, reward, next_state, done) # activate learning every few steps self.t_step = self.t_step + 1 if self.t_step % LEARN_EVERY_STEP == 0: # If enough samples are available in memory, get random subset and learn if len(self.prioritized_memory) >= BUFFER_SIZE: for _ in range(10): # update 10 times per learning idxes, experiences, is_weights = self.prioritized_memory.sample( device) self.learn(experiences, GAMMA, is_weights=is_weights, leaf_idxes=idxes) def reset(self): self.noise.reset() def learn(self, experiences, gamma, is_weights, leaf_idxes): """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Policy loss = (1/n)*Q_local(s,a) -> for deterministic policy (no log prob) Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor is_weights (tensor array): importance-sampling weights for prioritized experience replay leaf_idxes (numpy array): indexes for update priorities in SumTree """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # rewards = rewards # TODO: rewards are clipped to be in [-1,1] actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) td_errors = ( Q_targets - Q_expected).tanh() # TD-errors are clipped to be in [-1,1] abs_errors = td_errors.abs().cpu().data.numpy() # pull back to cpu self.prioritized_memory.batch_update( leaf_idxes, abs_errors) # update priorities in SumTree c_loss = (is_weights * (td_errors**2)).mean( ) # adjust squared TD loss by Importance-Sampling Weights self.running_c_loss += float(c_loss.cpu().data.numpy()) self.training_cnt += 1 # Minimize the loss self.critic_optimizer.zero_grad() c_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # clip gradient to max 1 self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) a_loss = self.critic_local(states, actions_pred) a_loss = -a_loss.mean() self.running_a_loss += float(a_loss.cpu().data.numpy()) # Minimize the loss self.actor_optimizer.zero_grad() a_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) # clip gradient to max 1 self.actor_optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
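# Note on the learn() method above: `.tanh()` squashes the TD-errors smoothly into (-1, 1)
# rather than hard-clipping them; a literal clip of the same quantity would be
#     td_errors = torch.clamp(Q_targets - Q_expected, min=-1.0, max=1.0)
# Either form keeps the priorities and the weighted critic loss bounded.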
class DDPGAgent: '''Class representing the DDPG algorithm''' def __init__(self, seed, state_size, action_size, num_agents, device, config): '''Class constructor and parameters initialization''' self.device = device self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.gamma = config['gamma'] # Learns argmax_a[Q(s, a); theta_mu] = mu(s, a; theta_mu) self.learnt_actor = Actor(seed, state_size, action_size).to(self.device) # learnt self.target_actor = Actor(seed, state_size, action_size).to( self.device) # soft-update tracking self.actor_optim = optim.Adam(self.learnt_actor.parameters(), lr=config['actor_lr']) # Learns to evaluate Q(s, mu(s, a); theta_q) self.learnt_critic = Critic(seed, state_size * num_agents, action_size * num_agents, num_agents).to(self.device) # learnt self.target_critic = Critic(seed, state_size * num_agents, action_size * num_agents, num_agents).to( self.device) # soft-update tracking self.critic_optim = optim.Adam(self.learnt_critic.parameters(), lr=config['critic_lr']) print( f'Summary:\nActor network:\n{self.learnt_actor}\nCritic network:\n{self.learnt_critic}' ) # Soft-update self.tau = config['tau'] # Noise self.noise = OUNoise(action_size, seed) self.noise_decay = config['noise_decay'] self.hard_copy_weights(self.learnt_actor, self.target_actor) self.hard_copy_weights(self.learnt_critic, self.target_critic) def reset_noise(self): '''Reset the noise state''' self.noise.reset() # Note: Decentralized actors (execution) def act(self, state): '''Sample an action from the policy''' state = torch.tensor(state, dtype=torch.float32, device=self.device) self.learnt_actor.eval() with torch.no_grad(): actions = self.learnt_actor(state).cpu().data.numpy() self.learnt_actor.train() actions += self.noise_decay * self.noise.sample() return np.clip(actions, -1, 1) # Note: Centralized critic (training) def step(self, best_current_actions, best_next_actions, states, actions, rewards, next_states, dones): '''Optimizes the function apprximators and soft-updates''' self.__optimize_critic(best_next_actions, states, actions, rewards, next_states, dones) self.__optimize_actor(best_current_actions, states) self.__soft_update(self.learnt_actor, self.target_actor, self.tau) self.__soft_update(self.learnt_critic, self.target_critic, self.tau) self.noise_decay *= 0.9999 #self.noise_decay #self.reset_noise() def __optimize_critic(self, best_next_actions, states, actions, rewards, next_states, dones): '''Optimizes the critic approximator''' with torch.no_grad(): q_targets = self.target_critic(next_states, best_next_actions) q_targets = rewards + self.gamma * q_targets * (1 - dones) q_predictions = self.learnt_critic(states, actions) self.critic_optim.zero_grad() critic_loss = F.mse_loss(q_predictions, q_targets.detach()) critic_loss.backward() # Note: Control the magnitude of the gradient torch.nn.utils.clip_grad_norm_(self.learnt_critic.parameters(), 0.5) self.critic_optim.step() def __optimize_actor(self, best_current_actions, states): '''Optimizes the actor approximator''' advantage = -self.learnt_critic(states, best_current_actions).mean() self.actor_optim.zero_grad() advantage.backward() self.actor_optim.step() def hard_copy_weights(self, learnt, target): """ Copy weights from source to target network (part of initialization)""" for learnt_param, target_param in zip(learnt.parameters(), target.parameters()): target_param.data.copy_(learnt_param.data) def __soft_update(self, learnt, target, tau): '''Soft-updates the target parameters''' for 
learnt_param, target_param in zip(learnt.parameters(), target.parameters()): target_param.data.copy_(tau * learnt_param.data + (1.0 - tau) * target_param.data)
class Agent(): def __init__(self, device, state_size, action_size, actor, critic, action_low=-1.0, action_high=1.0, lrate_critic=10e-3, lrate_actor=10e-4, tau=0.001, gamma=0.99, exploration_mu=0.0, exploration_theta=0.15, noise_decay=1., exploration_sigma=0.20, restore_path=None, weight_decay=0., seed=None): self.state_size = state_size self.action_size = action_size self.action_low = action_low self.action_high = action_high self.seed = seed if seed else np.random.randint(100) self.lrate_critic = lrate_critic self.lrate_actor = lrate_actor self.tau = tau self.gamma = gamma self.restore_path = restore_path self.device = device self.weight_decay = weight_decay self.noise_decay = noise_decay # actors networks self.actor = actor(device, state_size, action_size, low=action_low, high=action_high, seed=self.seed) self.actor_target = actor(device, state_size, action_size, low=action_low, high=action_high, seed=self.seed) # critic networks self.critic = critic(device, state_size, action_size, seed=self.seed) self.critic_target = critic(device, state_size, action_size, seed=self.seed) # restore networks if needed if restore_path is not None: self.restore(restore_path, True) # optimizer self.actor_opt = optim.Adam(self.actor.parameters(), lr=lrate_actor, weight_decay=self.weight_decay) self.critic_opt = optim.Adam(self.critic.parameters(), lr=lrate_critic, weight_decay=self.weight_decay) # noise self.noise = OUNoise(action_size, exploration_mu, exploration_theta, exploration_sigma) self.noise_scale = 1.0 # reset agent for training self.reset_episode() self.it = 0 def reset_episode(self): self.noise.reset() def act(self, state, learn=True): if type(state) == 'list': state = np.array(state) if not learn: self.actor.eval() with torch.no_grad(): action = self.actor(self.tensor(state)).cpu().numpy() # Add noise when learning for exploration if learn: action += self.noise.sample() * self.noise_scale self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01) self.actor.train() return np.clip(action, self.action_low, self.action_high) def save(self, path): dirn = os.path.dirname(path) if not os.path.exists(dirn): os.mkdir(dirn) params = {} params['actor'] = self.actor.state_dict() params['critic'] = self.critic.state_dict() torch.save(params, path) def restore(self, path, for_Training=False): # Restore only actor for performance checkpoint = torch.load(path, map_location=self.device) self.actor.load_state_dict(checkpoint['actor']) # Restore also for futhert training if for_Training: self.actor_target.load_state_dict(checkpoint['actor']) self.critic.load_state_dict(checkpoint['critic']) self.critic_target.load_state_dict(checkpoint['critic']) def learn_step(self, replay_buffer): # learn from mini-batch of replay buffer state_b, action_b, reward_b, next_state_b, done_b = replay_buffer.sample( ) # calculate td target with torch.no_grad(): y_b = reward_b.unsqueeze(1) + self.gamma * \ self.critic_target(next_state_b, self.actor_target(next_state_b)) * (1-done_b.unsqueeze(1)) # update critic critic_loss = F.smooth_l1_loss(self.critic(state_b, action_b), y_b) self.critic.zero_grad() critic_loss.backward() self.critic_opt.step() # update actor action = self.actor(state_b) actor_loss = -self.critic(state_b, action).mean() self.actor.zero_grad() actor_loss.backward() self.actor_opt.step() # soft update networks # critic only if trained # actor always self.soft_update() def soft_update(self): """Soft update of target network θ_target = τ*θ_local + (1 - τ)*θ_target """ for target_param, param in 
zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def tensor(self, x): return torch.from_numpy(x).float().to(torch.device(self.device))
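# The agent above pulls mini-batches from a replay_buffer object passed into learn_step(),
# but that buffer is not defined in this file. A minimal sketch of a compatible buffer,
# assuming rewards and dones come back as 1-D tensors (learn_step() unsqueezes them to
# column vectors) and states/actions/next_states as 2-D float tensors. The class name,
# defaults, and Experience namedtuple are illustrative only.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class SimpleReplayBuffer:
    def __init__(self, buffer_size, batch_size, device, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        # 1-D reward/done tensors; learn_step() above unsqueezes them into column vectors
        rewards = torch.tensor([e.reward for e in batch], dtype=torch.float32, device=self.device)
        dones = torch.tensor([float(e.done) for e in batch], dtype=torch.float32, device=self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)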
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed, device, lr_actor, lr_critic, weight_decay_critic, batch_size, buffer_size, gamma, tau, update_every, n_updates, eps_start, eps_end, eps_decay): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.t_step = 0 self.device = device self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay_critic = weight_decay_critic self.batch_size = batch_size self.buffer_size = buffer_size self.gamma = gamma self.tau = tau self.update_every = update_every self.n_updates = n_updates self.eps = eps_start self.eps_end = eps_end self.eps_decay = eps_decay # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.device) self.actor_target = Actor(state_size, action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.device) self.critic_target = Critic(state_size, action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.t_step += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at interval settings if len(self.memory) > self.batch_size: if self.t_step % self.update_every == 0: for _ in range(self.n_updates): experiences = self.memory.sample() self.learn(experiences, self.gamma, agent_number) def act(self, states, add_noise): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(self.device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) # Update epsilon noise value self.eps = max(self.eps_end, self.eps_decay*self.eps) # self.eps = self.eps - (1/self.eps_decay) # if self.eps < self.eps_end: # self.eps = self.eps_end def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
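# A tiny illustration of the action-concatenation step in learn() above. It assumes the
# replay buffer stores the *joint* action of both agents side by side (width 4, i.e. two
# 2-dimensional actions), so each agent replaces only its own slice with freshly predicted
# actions and keeps the other agent's recorded actions fixed. Toy tensors only.
import torch

batch_actions = torch.tensor([[0.1, 0.2, 0.3, 0.4],   # [agent0_x, agent0_y, agent1_x, agent1_y]
                              [0.5, 0.6, 0.7, 0.8]])
actions_next = torch.tensor([[0.9, 0.9],              # this agent's predicted next actions
                             [0.8, 0.8]])

agent_number = 0
if agent_number == 0:
    joint = torch.cat((actions_next, batch_actions[:, 2:]), dim=1)
else:
    joint = torch.cat((batch_actions[:, :2], actions_next), dim=1)

print(joint)
# tensor([[0.9000, 0.9000, 0.3000, 0.4000],
#         [0.8000, 0.8000, 0.7000, 0.8000]])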
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, agent_id, args): self.state_size = state_size self.action_size = action_size self.seed = args['seed'] self.device = args['device'] self.args = args # Q-Network self.actor_network = ActorNetwork(state_size, action_size, args).to(self.device) self.actor_target = ActorNetwork(state_size, action_size, args).to(self.device) self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR']) #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine) if not agent_id: self.actor_network.load_state_dict(torch.load( args['agent_p0_path']), strict=False) self.actor_target.load_state_dict(torch.load( args['agent_p0_path']), strict=False) else: self.actor_network.load_state_dict(torch.load( args['agent_p1_path']), strict=False) self.actor_target.load_state_dict(torch.load( args['agent_p1_path']), strict=False) # Replay memory self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.seed) # Noise process self.noise = OUNoise(action_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) if len(self.memory) > self.args['BATCH_SIZE']: experiences = self.memory.sample() self.train(experiences) def act(self, current_state): with torch.no_grad(): self.actor_network.eval() input_state = torch.from_numpy(current_state).float().to( self.device) with torch.no_grad(): action = self.actor_network(input_state).cpu().data.numpy() self.actor_network.train() action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def train(self, experiences): global states_ global next_states_ global actions_ global max_min_actions_vector global max_min_states_vector states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # with torch.no_grad(): # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = mCritic.target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = mCritic.network(states, actions) mCritic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss mCritic.optimizer.zero_grad() mCritic_loss.backward() mCritic.optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_network(states) actor_loss = -mCritic.network(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(mCritic.network, mCritic.target, TAU) self.soft_update(self.actor_network, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
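# Every agent in this file draws exploration noise from an OUNoise process that is not
# defined here. A minimal Ornstein-Uhlenbeck sketch, assuming the common
# OUNoise(size, seed, mu, theta, sigma) signature used in DDPG examples; the variants
# above that pass (mu, theta, sigma) positionally, or a (num_agents, action_size) tuple
# for `size`, would need the arguments adjusted accordingly.
import copy
import random

import numpy as np


class OUNoise:
    """Time-correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state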