def __init__(self, state_size, action_size, random_seed=1):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    # Store parameters
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPSILON
    self.lr_actor = LR_ACTOR
    self.lr_critic = LR_CRITIC
    self.lr_decay = WEIGHT_DECAY

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)
    self.timestep = 0

    # Replay memory
    self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)
    # Short term memory contains only 1/100 of the complete memory and the most recent samples
    self.memory_success = FifoMemory(int(BUFFER_SIZE), int(BATCH_SIZE))
    self.memory_short = FifoMemory(5, 5)
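# The constructor above (and the TD3Agent further below) assumes a handful of
# module-level hyperparameters, a torch `device`, and network/noise/buffer
# classes (Actor, Critic, OUNoise, ReplayBuffer, FifoMemory) that are not shown
# in this section. A minimal sketch of such a module header follows; the names
# are taken from the code, but the imports and concrete values are assumptions,
# not the project's actual settings.
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

BUFFER_SIZE = int(1e6)        # replay buffer size (assumed value)
BATCH_SIZE = 128              # minibatch size (assumed value)
GAMMA = 0.99                  # discount factor (assumed value)
TAU = 1e-3                    # soft-update interpolation factor (assumed value)
LR_ACTOR = 1e-4               # actor learning rate (assumed value)
LR_CRITIC = 1e-3              # critic learning rate (assumed value)
WEIGHT_DECAY = 0.0            # L2 weight decay for the critic optimizer (assumed value)
EPSILON = 1.0                 # initial scale of the exploration noise (assumed value)
EPSILON_DECAY = 0.999         # per-step decay of the noise scale (assumed value)
LEARN_NUM_MEMORY = 1          # update passes per step from the main memory (assumed value)
LEARN_NUM_MEMORY_SUCCESS = 1  # update passes per step from the success memory (assumed value)
UPDATE_EVERY = 2              # delayed-policy-update interval used by the two-critic Agent (assumed value)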
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed,
                 actor_fc1_units, actor_fc2_units, critic_fcs1_units, critic_fc2_units,
                 buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, weight_decay,
                 ou_mu, ou_theta, ou_sigma, update_every_t_steps, num_of_updates):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents acting in parallel
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            actor_fc1_units (int): units in the actor's first hidden layer
            actor_fc2_units (int): units in the actor's second hidden layer
            critic_fcs1_units (int): units in the critic's first hidden layer
            critic_fc2_units (int): units in the critic's second hidden layer
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
            ou_mu (float): OUNoise mu
            ou_theta (float): OUNoise theta
            ou_sigma (float): OUNoise sigma
            update_every_t_steps (int): timesteps between updates
            num_of_updates (int): number of update passes when updating
        """
        print(
            "[AGENT INFO] DDPG constructor initialized parameters:\n num_agents={} \n state_size={} \n action_size={} \n random_seed={} \n actor_fc1_units={} \n actor_fc2_units={} \n critic_fcs1_units={} \n critic_fc2_units={} \n buffer_size={} \n batch_size={} \n gamma={} \n tau={} \n lr_actor={} \n lr_critic={} \n weight_decay={} \n ou_mu={}\n ou_theta={}\n ou_sigma={}\n update_every_t_steps={}\n num_of_updates={}\n"
            .format(num_agents, state_size, action_size, random_seed,
                    actor_fc1_units, actor_fc2_units, critic_fcs1_units, critic_fc2_units,
                    buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, weight_decay,
                    ou_mu, ou_theta, ou_sigma, update_every_t_steps, num_of_updates))

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.actor_fc1_units = actor_fc1_units
        self.actor_fc2_units = actor_fc2_units
        self.critic_fcs1_units = critic_fcs1_units
        self.critic_fc2_units = critic_fc2_units
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.ou_mu = ou_mu
        self.ou_theta = ou_theta
        self.ou_sigma = ou_sigma
        self.update_every_t_steps = update_every_t_steps
        self.num_of_updates = num_of_updates

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 actor_fc1_units, actor_fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  actor_fc1_units, actor_fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   critic_fcs1_units, critic_fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    critic_fcs1_units, critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed,
                             mu=self.ou_mu, theta=self.ou_theta, sigma=self.ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)

        # Make sure the target networks start with the same weights as the local networks
        # self.hard_copy(self.actor_target, self.actor_local)
        # self.hard_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size and timestep % self.update_every_t_steps == 0:
            for _ in range(self.num_of_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q1_targets_next, Q2_targets_next = self.critic_target(next_states, actions_next)
        Q_targets_next = torch.min(Q1_targets_next, Q2_targets_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q1_expected, Q2_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q1_expected, Q_targets) + F.mse_loss(Q2_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local.Q1(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_copy(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
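# The learn() method above unpacks two Q-values from one critic call and uses
# critic_local.Q1(...) for the actor loss, so the Critic here is assumed to be a
# twin-headed (TD3-style) network even though the constructor prints "DDPG".
# A minimal sketch of a module with that interface follows; the layer widths and
# the lack of custom weight initialization are assumptions, not the project's
# actual model code.
import torch
import torch.nn as nn


class TwinCritic(nn.Module):
    """Two Q-networks in one module: forward() returns (Q1, Q2)."""

    def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        # First Q-head
        self.q1 = nn.Sequential(
            nn.Linear(state_size + action_size, fcs1_units), nn.ReLU(),
            nn.Linear(fcs1_units, fc2_units), nn.ReLU(),
            nn.Linear(fc2_units, 1),
        )
        # Second Q-head with the same architecture
        self.q2 = nn.Sequential(
            nn.Linear(state_size + action_size, fcs1_units), nn.ReLU(),
            nn.Linear(fcs1_units, fc2_units), nn.ReLU(),
            nn.Linear(fc2_units, 1),
        )

    def forward(self, state, action):
        xu = torch.cat([state, action], dim=1)
        return self.q1(xu), self.q2(xu)

    def Q1(self, state, action):
        # Only the first head is used for the actor (policy) loss
        return self.q1(torch.cat([state, action], dim=1))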
class TD3Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)
        # Success memory accumulates the short-term samples that preceded a positive reward
        self.memory_success = FifoMemory(int(BUFFER_SIZE), int(BATCH_SIZE))
        # Rolling sample memory of the last 10 samples
        self.memory_short = FifoMemory(10, 10)

    def update_model(self, state, action, reward, next_state, done):
        self.step(state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        reached = True
        if len(self.memory_success) < BATCH_SIZE:
            reached = False

        self.memory.add(state, action, reward, next_state, done)
        self.memory_short.add(state, action, reward, next_state, done)

        # Fill the success memory in case this agent receives a positive reward
        if reward > 0.0:
            for i in range(len(self.memory_short)):
                self.memory_success.add(self.memory_short.samples[i].state,
                                        self.memory_short.samples[i].action,
                                        self.memory_short.samples[i].reward,
                                        self.memory_short.samples[i].next_state,
                                        self.memory_short.samples[i].done)
            self.memory_short.clear()

        if reached == False and len(self.memory_success) > BATCH_SIZE:
            print("Success memory ready for use!")

        # Train with the complete replay memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(LEARN_NUM_MEMORY):
                experiences = self.memory.sample()
                # Delay the policy update and only update it every 2nd training pass.
                # NOTE: delay is always passed as 0 here, so the policy is in fact
                # updated on every call to learn().
                self.learn(experiences, 0, GAMMA)

        # Train with the success replay memory
        if len(self.memory_success) > self.memory_success.batch_size:
            for i in range(LEARN_NUM_MEMORY_SUCCESS):
                experiences_success = self.memory_success.sample()
                self.learn(experiences_success, 0, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # TD3 --> action noise regularisation
        if add_noise:
            action += self.epsilon * self.noise.sample()
        # The noisy action is clipped in order to keep it close to the original action value.
        clipped_action = np.clip(action, -1, 1)
        self.epsilon *= EPSILON_DECAY
        return clipped_action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, delay, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            delay (int): the actor and target networks are only updated when delay == 0
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TD3 --> Using a pair of critic networks (the "twin" part of the title)
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next1, Q_targets_next2 = self.critic_target(next_states, actions_next)
        # TD3 --> Take the minimum of both critics in order to avoid overestimation
        # Q_targets_next = torch.min(Q_targets_next1, Q_targets_next2)
        Q_targets_next = Q_targets_next1
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss [HOW MUCH OFF?] as the sum of both losses against the target
        Q_expected1, Q_expected2 = self.critic_local(states, actions)
        # critic_loss = F.mse_loss(Q_expected1, Q_targets) + F.mse_loss(Q_expected2, Q_targets)
        critic_loss = F.mse_loss(Q_expected1, Q_targets)
        # Minimize loss [TRAIN]
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # TD3 --> Delayed updates of the actor = policy (the "delayed" part)
        # Compute actor loss
        if delay == 0:
            actions_pred = self.actor_local(states)
            # Compute loss [HOW MUCH OFF?]
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()
            # Minimize loss [TRAIN]
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
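# TD3Agent above relies on a FifoMemory class that is not shown in this section.
# From the calls it makes, the class needs add(), sample(), clear(), len(), a
# .samples sequence of experience tuples, and a .batch_size attribute, and
# sample() has to return batched torch tensors. A minimal deque-based sketch
# under those assumptions follows; it is not the project's actual implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class FifoMemory:
    """Fixed-size FIFO buffer of experiences with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.samples = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.samples.append(Experience(state, action, reward, next_state, done))

    def clear(self):
        self.samples.clear()

    def sample(self):
        # Uniformly sample a batch and stack it into torch tensors
        batch = random.sample(list(self.samples), k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.samples)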
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic One Network (w/ Target Network)
        self.critic_local_one = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target_one = Critic(state_size, action_size, random_seed).to(device)
        self.critic_one_optimizer = optim.Adam(self.critic_local_one.parameters(),
                                               lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Critic Two Network (w/ Target Network)
        self.critic_local_two = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target_two = Critic(state_size, action_size, random_seed).to(device)
        self.critic_two_optimizer = optim.Adam(self.critic_local_two.parameters(),
                                               lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # # Noise process
        # self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Counter
        self.t_step = 0
        # learn_counter
        self.learn_ctr = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        # self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # if self.t_step == 0:

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += np.random.normal(0, 0.2, size=action.shape)
        return np.clip(action, -1, 1)

    # def act(self, state, add_noise=True):
    #     """Returns actions for given state as per current policy."""
    #     if self.t_step < WARMUP:
    #         action = np.random.normal(scale=0.1, size=(self.action_size))
    #     else:
    #         state = torch.from_numpy(state).float().to(device)
    #         self.actor_local.eval()
    #         with torch.no_grad():
    #             action = self.actor_local(state).cpu().data.numpy()
    #         self.actor_local.train()
    #         if add_noise:
    #             action += np.random.normal(0, 0.1, action.shape)
    #     # update counter
    #     self.t_step += 1
    #     return np.clip(action, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models,
        # adding clipped target-policy smoothing noise (TD3)
        actions_next = self.actor_target(next_states)
        noise = torch.randn_like(actions_next).mul(0.2)
        noise = noise.clamp(-0.5, 0.5)
        actions_next = (actions_next + noise).clamp(-1, 1)
        # actions_next = self.actor_target(next_states)
        # actions_next = actions_next + torch.clamp(torch.from_numpy(np.random.normal(loc=0, scale=0.2, size=actions_next.shape)).float().to(device), -0.5, 0.5)
        # actions_next = torch.clamp(actions_next, self.min_size[0], self.max_size[0])

        # Clipped double-Q target: take the minimum of the two target critics
        critic_one_target = self.critic_target_one(next_states, actions_next)
        critic_two_target = self.critic_target_two(next_states, actions_next)
        Q_targets_next = torch.min(critic_one_target, critic_two_target)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Detach so no gradients flow into the target networks
        Q_targets = Q_targets.detach()

        # Compute both critics' losses and minimize them
        critic_one_expected = self.critic_local_one(states, actions)
        critic_two_expected = self.critic_local_two(states, actions)
        critic_one_loss = F.mse_loss(critic_one_expected, Q_targets)
        critic_two_loss = F.mse_loss(critic_two_expected, Q_targets)
        critic_loss = critic_one_loss + critic_two_loss
        self.critic_one_optimizer.zero_grad()
        self.critic_two_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_one_optimizer.step()
        self.critic_two_optimizer.step()

        # Delayed policy update: only update the actor and targets every UPDATE_EVERY calls
        self.learn_ctr = (self.learn_ctr + 1) % UPDATE_EVERY
        if self.learn_ctr != 0:
            return

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local_one(states, self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local_one, self.critic_target_one, TAU)
        self.soft_update(self.critic_local_two, self.critic_target_two, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
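# A short sketch of how the two-critic Agent above might be driven. The
# gymnasium-style environment, the environment name, and the episode budget are
# assumptions made for illustration only; the project's own training script may
# differ (e.g. a Unity ML-Agents environment). Note that Agent.act() clips
# actions to [-1, 1], so an environment with a matching action range is assumed.
import gymnasium as gym
import numpy as np


def train(env_name="MountainCarContinuous-v0", n_episodes=200, max_t=1000, seed=2):
    env = gym.make(env_name)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, random_seed=seed)

    scores = []
    for i_episode in range(1, n_episodes + 1):
        state, _ = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)                            # Gaussian-noise exploration
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.step(state, action, reward, next_state, done)  # store transition and learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        print(f"Episode {i_episode}\tScore: {score:.2f}")
    return scores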