class DDPG_agent(nn.Module):
    """One DDPG agent: local/target actor and critic networks plus exploration noise."""

    def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed):
        """Build the networks, their optimizers and the noise process.

        Params
        ======
            in_actor: first constructor argument of both ``Actor`` networks
            in_critic: first constructor argument of both ``Critic`` networks
            action_size: action dimension handled by this agent's actor
            num_agents: the critics are built for ``num_agents * action_size`` actions
            random_seed: seed forwarded to the networks and the noise process
        """
        super().__init__()
        self.action_size = action_size
        self.seed = random_seed

        # Actor: local network is trained, target network is only evaluated here.
        self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic: receives the concatenated actions of every agent.
        joint_action_size = num_agents * self.action_size
        self.critic_local = Critic(in_critic, joint_action_size, self.seed).to(device)
        self.critic_target = Critic(in_critic, joint_action_size, self.seed).to(device)
        self.critic_optimizer = Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY,
        )

        # Ornstein-Uhlenbeck process used for exploration in ``act``.
        self.noise = OUNoise((action_size), random_seed)

    def act(self, state, add_noise=True):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        ``state`` is a NumPy array; when ``add_noise`` is true a sample of the
        noise process is added in place before clipping.
        """
        state_tensor = torch.from_numpy(state).float().to(device)

        # Evaluate in eval mode without gradients, then restore train mode.
        self.actor_local.eval()
        with torch.no_grad():
            policy_out = self.actor_local(state_tensor)
        self.actor_local.train()

        action = policy_out.cpu().data.numpy()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Return the target actor's output for ``state``.

        ``add_noise`` is accepted for signature parity with ``act`` but is not
        used: no noise is added and no clipping is applied.
        """
        return self.actor_target(state)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()
class Agent():
    """Actor-critic agent with target networks and exploration noise."""

    def __init__(self, actor_size, action_size, critic_size):
        """Pick the compute device, build the networks and sync the targets.

        Params
        ======
            actor_size: first constructor argument of the ``Actor`` networks
            action_size: second ``Actor`` argument and size of the noise process
            critic_size: constructor argument of the ``Critic`` networks
        """
        super().__init__()

        if torch.cuda.is_available():
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            device_name = "cuda"
        else:
            print('training on cpu...')
            device_name = "cpu"
        self.device = torch.device(device_name)

        # Actor and its target copy.
        self.actor = Actor(actor_size, action_size).to(self.device)
        self.actor_target = Actor(actor_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)

        # Critic and its target copy.
        self.critic = Critic(critic_size).to(self.device)
        self.critic_target = Critic(critic_size).to(self.device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(), lr=0.001, weight_decay=0
        )

        self.gamma = 0.95
        self.tau = 0.001
        self.noise = OUNoise((action_size), 2)

        # tau = 1.0 makes the update a plain copy: targets start equal to the
        # freshly initialised networks.
        for target_net, online_net in (
            (self.actor_target, self.actor),
            (self.critic_target, self.critic),
        ):
            self.target_network_update(target_net, online_net, 1.0)

    def select_actions(self, state):
        """Return the actor's noisy action for ``state``, clipped to [-1, 1].

        ``state`` is a NumPy array that is flattened into a single-row batch
        before being fed to the actor.
        """
        batch = torch.from_numpy(state).float().to(self.device).view(1, -1)

        self.actor.eval()
        with torch.no_grad():
            policy_out = self.actor(batch)
        self.actor.train()

        actions = policy_out.cpu().data.squeeze(0)
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def target_network_update(self, target_network, network, tau):
        """Blend ``network`` into ``target_network``.

        Each target parameter becomes ``tau * source + (1 - tau) * target``.
        """
        pairs = zip(target_network.parameters(), network.parameters())
        for target_param, network_param in pairs:
            blended = tau * network_param.data + (1.0 - tau) * target_param.data
            target_param.data.copy_(blended)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY,
        )

        # One noise process per agent
        self.noise = [
            OUNoise(action_size, random_seed, sigma=0.1)
            for _ in range(self.num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the targets start with the same weights as the sources
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done, self.num_agents)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        The result is clipped to [-1, 1]. When ``add_noise`` is true, agent
        ``i``'s action row receives a sample from ``self.noise[i]``.
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            # Add into the array row in place. Iterating over the row's
            # scalars and using "+=" on the loop variable would only rebind
            # that variable and leave `action` untouched.
            # assumes action has one row per agent — TODO confirm
            for i in range(self.num_agents):
                action[i] += self.noise[i].sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset every agent's noise process."""
        for noise in self.noise:
            noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss (gradient norm clipped to 1)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
class DyNODESacAgent(object):
    """DyNODE-SAC: soft actor-critic combined with a learned dynamics/reward model."""

    def __init__(self,
                 obs_shape,
                 action_shape,
                 device,
                 model_kind,
                 kind='D',
                 step_MVE=5,
                 hidden_dim=256,
                 discount=0.99,
                 init_temperature=0.01,
                 alpha_lr=1e-3,
                 alpha_beta=0.9,
                 actor_lr=1e-3,
                 actor_beta=0.9,
                 actor_log_std_min=-10,
                 actor_log_std_max=2,
                 critic_lr=1e-3,
                 critic_beta=0.9,
                 critic_tau=0.005,
                 critic_target_update_freq=2,
                 model_lr=1e-3,
                 log_interval=100):
        """Build actor, critics, temperature, the learned model and all optimizers.

        ``model_kind`` selects the learned model: ``'dynode_model'`` builds a
        ``DyNODE``, ``'nn_model'`` builds an ``NN_Model`` (which also receives
        ``kind``).

        Raises
        ======
            ValueError: if ``model_kind`` is neither of the supported values
        """
        self.device = device
        self.discount = discount
        self.critic_tau = critic_tau
        self.critic_target_update_freq = critic_target_update_freq
        self.log_interval = log_interval
        self.step_MVE = step_MVE
        self.model_kind = model_kind

        self.actor = Actor(obs_shape, action_shape, hidden_dim,
                           actor_log_std_min, actor_log_std_max).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr,
                                                betas=(actor_beta, 0.999))

        self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device)
        self.critic_target = Critic(obs_shape, action_shape,
                                    hidden_dim).to(device)
        # The target critic starts as an exact copy of the critic.
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr,
                                                 betas=(critic_beta, 0.999))

        # The temperature is optimized in log space; `alpha` exponentiates it.
        self.log_alpha = torch.tensor(np.log(init_temperature)).to(device)
        self.log_alpha.requires_grad = True
        # set target entropy to -|A|
        self.target_entropy = -np.prod(action_shape)
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr,
                                                    betas=(alpha_beta, 0.999))

        if self.model_kind == 'dynode_model':
            self.model = DyNODE(obs_shape,
                                action_shape,
                                hidden_dim_p=200,
                                hidden_dim_r=200).to(device)
        elif self.model_kind == 'nn_model':
            self.model = NN_Model(obs_shape,
                                  action_shape,
                                  hidden_dim_p=200,
                                  hidden_dim_r=200,
                                  kind=kind).to(device)
        else:
            # `assert '<text>'` is always true; raise so the failure is explicit
            # instead of surfacing later as a missing `self.model`.
            raise ValueError('model is not supported: %r' % (self.model_kind,))
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=model_lr)

        self.train()
        self.critic_target.train()

    def train(self, training=True):
        """Switch actor, critic and model between training and evaluation mode."""
        self.training = training
        self.actor.train(training)
        self.critic.train(training)
        self.model.train(training)

    @property
    def alpha(self):
        """Current temperature, i.e. ``exp(log_alpha)``."""
        return self.log_alpha.exp()

    def select_action(self, obs):
        """Return the actor's first output (``mu``) for one observation as a flat array."""
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            mu, _, _, _ = self.actor(obs,
                                     compute_pi=False,
                                     compute_log_pi=False)
            return mu.cpu().data.numpy().flatten()

    def sample_action(self, obs):
        """Return the actor's second output (``pi``) for one observation as a flat array."""
        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            _, pi, _, _ = self.actor(obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()

    def update_model(self, replay_buffer, L, step):
        """Take one optimizer step on the learned model.

        The batch comes from ``replay_buffer.sample_dynode()`` for the DyNODE
        model and from ``replay_buffer.sample()`` for the NN model. The loss is
        the sum of the model's transition and reward losses.

        Raises
        ======
            ValueError: if ``self.model_kind`` is not supported
        """
        if self.model_kind == 'dynode_model':
            obs, action, reward, next_obs, _ = replay_buffer.sample_dynode()
        elif self.model_kind == 'nn_model':
            obs, action, reward, next_obs, _ = replay_buffer.sample()
        else:
            raise ValueError('model is not supported: %r' % (self.model_kind,))

        transition_loss, reward_loss = self.model.loss(obs, action, reward,
                                                       next_obs)
        model_loss = transition_loss + reward_loss

        # Optimize the Model
        self.model_optimizer.zero_grad()
        model_loss.backward()
        self.model_optimizer.step()

        if step % self.log_interval == 0:
            L.log('train/model_loss', model_loss, step)

    def MVE_prediction(self, replay_buffer, L, step):
        """Update critic, actor and temperature using model rollouts.

        Starting from the sampled ``next_obs`` the learned model is rolled
        forward ``step_MVE`` steps with actions from the actor. The return is
        bootstrapped from the target critic at the end of the rollout and then
        accumulated backwards; the critic loss averages the MSE against those
        returns over the ``step_MVE`` imagined transitions plus the real one.

        ``L`` and ``step`` are accepted for signature parity with the other
        update methods; nothing is logged here.
        """
        # NOTE(review): the sampled `not_done` mask is not applied to the
        # bootstrapped return — confirm this is intended.
        obs, action, reward, next_obs, _ = replay_buffer.sample()

        trajectory = []
        next_ob = next_obs
        with torch.no_grad():
            while len(trajectory) < self.step_MVE:
                ob = next_ob
                _, act, _, _ = self.actor(ob)
                rew, next_ob = self.model(ob, act)
                trajectory.append([ob, act, rew, next_ob])

            _, next_action, log_pi, _ = self.actor(next_ob)
            target_Q1, target_Q2 = self.critic_target(next_ob, next_action)
            ret = torch.min(target_Q1,
                            target_Q2) - self.alpha.detach() * log_pi

        critic_loss = 0
        # Walk the imagined rollout backwards, discounting the return each step.
        for ob, act, rew, _ in reversed(trajectory):
            current_Q1, current_Q2 = self.critic(ob, act)
            ret = rew + self.discount * ret
            critic_loss = critic_loss + F.mse_loss(
                current_Q1, ret) + F.mse_loss(current_Q2, ret)

        # Finally the real transition sampled from the buffer.
        current_Q1, current_Q2 = self.critic(obs, action)
        ret = reward + self.discount * ret
        critic_loss = critic_loss + F.mse_loss(current_Q1, ret) + F.mse_loss(
            current_Q2, ret)
        critic_loss = critic_loss / (self.step_MVE + 1)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor
        _, pi, log_pi, _ = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs.detach(), pi)
        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update_critic(self, obs, action, reward, next_obs, not_done, L, step):
        """Take one critic step against the entropy-regularized one-step target."""
        with torch.no_grad():
            _, policy_action, log_pi, _ = self.actor(next_obs)
            target_Q1, target_Q2 = self.critic_target(next_obs, policy_action)
            target_V = torch.min(target_Q1,
                                 target_Q2) - self.alpha.detach() * log_pi
            target_Q = reward + (not_done * self.discount * target_V)

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)
        if step % self.log_interval == 0:
            L.log('train_critic/loss', critic_loss, step)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.critic.log(L, step)

    def update_actor_and_alpha(self, obs, L, step):
        """Take one actor step followed by one temperature step."""
        should_log = step % self.log_interval == 0

        _, pi, log_pi, log_std = self.actor(obs)
        actor_Q1, actor_Q2 = self.critic(obs, pi)
        actor_Q = torch.min(actor_Q1, actor_Q2)
        actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean()

        if should_log:
            L.log('train_actor/loss', actor_loss, step)
            L.log('train_actor/target_entropy', self.target_entropy, step)
            # Entropy computed from log_std with the diagonal-Gaussian formula;
            # only needed for logging.
            entropy = 0.5 * log_std.shape[1] * (
                1.0 + np.log(2 * np.pi)) + log_std.sum(dim=-1)
            L.log('train_actor/entropy', entropy.mean(), step)

        # optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.actor.log(L, step)

        self.log_alpha_optimizer.zero_grad()
        alpha_loss = (self.alpha *
                      (-log_pi - self.target_entropy).detach()).mean()
        if should_log:
            L.log('train_alpha/loss', alpha_loss, step)
            L.log('train_alpha/value', self.alpha, step)
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

    def update(self, replay_buffer, L, step):
        """Run one training iteration.

        For ``step < 2000`` two plain critic/actor updates are performed on
        fresh batches. From then on each iteration runs one ``MVE_prediction``
        update followed by one plain critic/actor update. The target critic is
        soft-updated every ``critic_target_update_freq`` steps.
        """
        if step < 2000:
            for _ in range(2):
                obs, action, reward, next_obs, not_done = replay_buffer.sample()
                self.update_critic(obs, action, reward, next_obs, not_done, L,
                                   step)
                self.update_actor_and_alpha(obs, L, step)
            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)
        else:
            obs, action, reward, next_obs, not_done = replay_buffer.sample()
            if step % self.log_interval == 0:
                L.log('train/batch_reward', reward.mean(), step)
            self.MVE_prediction(replay_buffer, L, step)
            self.update_critic(obs, action, reward, next_obs, not_done, L,
                               step)
            self.update_actor_and_alpha(obs, L, step)

        if step % self.critic_target_update_freq == 0:
            utils.soft_update_params(self.critic.Q1, self.critic_target.Q1,
                                     self.critic_tau)
            utils.soft_update_params(self.critic.Q2, self.critic_target.Q2,
                                     self.critic_tau)

    def save(self, model_dir, step):
        """Write actor and critic weights to ``model_dir``, tagged with ``step``."""
        torch.save(self.actor.state_dict(),
                   '%s/actor_%s.pt' % (model_dir, step))
        torch.save(self.critic.state_dict(),
                   '%s/critic_%s.pt' % (model_dir, step))

    def save_model(self, model_dir, step):
        """Write the learned model's weights to ``model_dir``, tagged with ``step``."""
        torch.save(self.model.state_dict(),
                   '%s/model_%s.pt' % (model_dir, step))

    def load(self, model_dir, step):
        """Load actor and critic weights previously written by ``save``."""
        # map_location lets a checkpoint saved on another device be restored
        # onto this agent's device.
        self.actor.load_state_dict(
            torch.load('%s/actor_%s.pt' % (model_dir, step),
                       map_location=self.device))
        self.critic.load_state_dict(
            torch.load('%s/critic_%s.pt' % (model_dir, step),
                       map_location=self.device))
class DDPG:
    """Single-agent DDPG learner with replay memory and exploration noise."""

    def __init__(self, state_size, action_size,
                 memory_size=int(1e5),  # replay buffer size
                 batch_size=128,  # minibatch size
                 gamma=0.99,  # discount factor
                 tau=1e-3,  # for soft update of target parameters
                 update_every=10,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 random_seed=2):
        """Build networks, optimizers, replay memory and the noise process.

        ``update_every`` is the number of ``learn`` calls between soft updates
        of the target networks.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself.
        random.seed(random_seed)
        self.seed = random_seed
        self.params = {"lr_actor": lr_actor,
                       "lr_critic": lr_critic,
                       "gamma": gamma,
                       "tau": tau,
                       "memory_size": memory_size,
                       "batch_size": batch_size,
                       "optimizer": "adam"}

        self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        self.learn_steps = 0
        self.update_every = update_every

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Return the clipped action for one state (single agent only)."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # must set to eval mode, since BatchNorm used
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action.squeeze(), -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn once the memory holds more than a batch."""
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.params["batch_size"]:
            experiences = self.memory.sample()
            self.learn(experiences, self.params["gamma"])

    def learn(self, experiences, gamma):
        """Run one critic update and one actor update on a sampled batch.

        The target networks are soft-updated on every ``update_every``-th call.
        """
        states, actions, rewards, next_states, dones = experiences

        # ------------------------------------------
        # update critic
        # ------------------------------------------
        # recall DQN
        #   Q[s][a] = Q[s][a] + alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a])
        # thus, here
        #   Q_local  = Q[s][a]
        #            = critic_local(s, a)
        #   Q_target = r + gamma * np.max(Q[s_next])
        #            = r + gamma * critic_target(s_next, actor_target(s_next))
        #
        # np.max(Q[s_next]) is approximated with
        # critic_target(s_next, actor_target(s_next)) because the actor is
        # supposed to output the action that maximizes Q(s).
        #
        # loss = mse(Q_local - Q_target)
        best_actions = self.actor_target(next_states)
        Q_next_max = self.critic_target(next_states, best_actions)

        Q_target = rewards + gamma * Q_next_max * (1 - dones)

        Q_local = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_local, Q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ------------------------------------------
        # update actor
        # ------------------------------------------
        # Treat critic(s, a) as guidance: we want actor(s) to output the
        # action a that maximizes Q_critic(s, a). The action is a function of
        # the actor parameters θ, so the gradient is dQ/da * da/dθ.
        actions_pred = self.actor_local(states)
        Q_baseline = self.critic_local(states, actions_pred)

        # The mean reduces the batch of Q-values to a scalar loss.
        actor_loss = -Q_baseline.mean()

        # note: gradients for both actor_local and critic_local are computed,
        # but only actor_local is stepped
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.learn_steps % self.update_every == 0:
            self.soft_update(self.critic_local, self.critic_target, self.params["tau"])
            self.soft_update(self.actor_local, self.actor_target, self.params["tau"])

        self.learn_steps += 1

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """ Interacts with and learns from the environment. """

    def __init__(self, state_size, action_size, fc1_units, fc2_units):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc1_units: forwarded to the ``Actor`` and ``Critic`` constructors
            fc2_units: forwarded to the ``Actor`` and ``Critic`` constructors
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(SEED)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, fc1_units,
                                 fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, fc1_units,
                                  fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, fc1_units,
                                   fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, fc1_units,
                                    fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OrnsteinUhlenbeck(action_size, SEED)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED,
                                   device)

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn only every N_TIME_STEPS
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn if enough samples are available in replay buffer
        if len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """ Returns actions for given state as per current policy, clipped to [-1, 1]. """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss (gradient norm clipped to 1)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def store(self):
        """Write the local actor and critic weights to the working directory."""
        torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth')

    def load(self):
        """Restore the local actor and critic from the files written by ``store``.

        Does nothing except print a message when either checkpoint file is
        missing.
        """
        actor_path = 'checkpoint_actor.pth'
        critic_path = 'checkpoint_critic.pth'
        if os.path.isfile(actor_path) and os.path.isfile(critic_path):
            print("=> loading checkpoints for Actor and Critic... ")
            # load_state_dict expects the deserialized state dict, not a file
            # name. Loading onto CPU first is always possible;
            # load_state_dict copies the values into the existing parameters.
            self.actor_local.load_state_dict(
                torch.load(actor_path, map_location="cpu"))
            self.critic_local.load_state_dict(
                torch.load(critic_path, map_location="cpu"))
            print("done !")
        else:
            print("no checkpoints found for Actor and Critic...")
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents whose actions receive noise in ``act``
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself.
        random.seed(random_seed)
        self.seed = random_seed
        # NOTE(review): the networks below are placed on the module-level
        # `device`, not on `self.device` — confirm both always agree.
        self.device = torch.device('cuda' if cuda else 'cpu')

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory.

        Learning is not triggered here; call ``sampleandlearn`` for that.
        """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def sampleandlearn(self):
        """Learn from stored experiences once the memory holds more than a batch."""
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy, clipped to [-1, 1]."""
        state = torch.from_numpy(state).float().to(device)

        # Deactivate gradients and perform forward pass
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            # Every agent draws its own sample from the shared noise process.
            for a in range(self.num_agents):
                action[a] += self.noise.sample()

        # Clip action
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    ''' Interacts with and learns from the environment '''

    def __init__(self, num_agents, state_size, action_size, random_seed=2018):
        ''' Build networks, optimizers, noise process and replay memory.

        num_agents (int): number of agents sharing this actor/critic pair
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        '''
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG first and store the
        # seed value itself.
        random.seed(random_seed)
        self.seed = random_seed
        # NOTE(review): the networks below are placed on the module-level
        # `device`, not on `self.device` — confirm both always agree.
        self.device = torch.device('cuda' if cuda else 'cpu')
        self.update = UPDATE_EVERY
        self.updates = NUMBER_OF_UPDATES

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

    def step(self, state, action, reward, next_state, done, timestep):
        ''' Save experience in replay memory, and use random samples from the buffer to learn '''
        # Save experience into memory __for each agent__
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Only learn on timesteps that are a multiple of self.update
        if timestep % self.update != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            # Do learning "updates" times
            for _ in range(self.updates):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        ''' Returns actions for given states as per current policy, clipped to [-1, 1] '''
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))

        # Deactivate gradients and perform one forward pass per agent state
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()

        if add_noise:
            # Every agent draws its own sample from the shared noise process.
            for a in range(self.num_agents):
                actions[a, :] += self.noise.sample()

        # Clip action
        return np.clip(actions, -1, 1)

    def reset(self):
        ''' Reset the exploration noise process '''
        self.noise.reset()

    def learn(self, experiences, gamma):
        ''' Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss, averaged over the minibatch
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        ''' Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target '''
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG():
    """DDPG agent whose actor output steers an LQR controller.

    The actor emits a value bounded by ``env.action_space2``. :meth:`LQR`
    combines that value with the current observation to produce the thrust
    command that is actually passed to ``env.step``. The replay buffer stores
    the actor-level action, not the thrust command.
    """

    def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2,
                 batch_norm=True, merge_layer=2, buffer_size=int(1e6),
                 buffer_min=int(1e4), tau=1e-3, Q_wd=1e-2, num_episodes=1000):
        """Build networks, optimizers and replay buffer, then pre-fill the buffer.

        :param env: environment exposing ``reset``, ``step``, ``action_space2``
            and ``continuous``
        :param log_dir: directory under which models and result arrays are saved
        :param gamma: discount factor
        :param batch_size: minibatch size drawn from the buffer per update
        :param sigma: scale of the Ornstein-Uhlenbeck exploration noise
        :param batch_norm: forwarded to ``Actor`` and ``Critic``
        :param merge_layer: forwarded to ``Critic``
        :param buffer_size: replay buffer capacity
        :param buffer_min: number of random transitions collected before training
        :param tau: interpolation factor used by the target-network tracking
        :param Q_wd: weight decay of the critic optimizer
        :param num_episodes: number of episodes run by the ``train*`` methods
        """
        self.s_dim = env.reset().shape[0]
        self.a_dim = env.action_space2.shape[0]
        self.env = env

        self.mu = Actor(self.s_dim, self.a_dim, env.action_space2,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()

        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min

        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    def track_network(self, target, main):
        """Move ``target`` parameters a fraction ``self.tau`` towards ``main``."""
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    def track_networks(self):
        """Apply :meth:`track_network` to both the actor and the critic targets."""
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def _learn_from_batch(self):
        """Run one critic step, one actor step and one target-tracking step."""
        s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
            batch_size=self.batch_size)

        # Critic: regress Q(s, a) onto the bootstrapped one-step target.
        with torch.no_grad():
            q_p_pred = torch.squeeze(
                self.targ_Q(s_p_batch, self.targ_mu(s_p_batch)))
            y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
        self.Q_optimizer.zero_grad()
        q_pred = torch.squeeze(self.Q(s_batch, a_batch))
        Q_loss = self.mse_fn(q_pred, y)
        Q_loss.backward(retain_graph=False)
        self.Q_optimizer.step()

        # Actor: ascend the critic's estimate of the actor's own actions.
        self.mu_optimizer.zero_grad()
        q_pred_mu = torch.squeeze(self.Q(s_batch, self.mu(s_batch)))
        mu_loss = -torch.mean(q_pred_mu)
        mu_loss.backward(retain_graph=False)
        self.mu_optimizer.step()

        self.track_networks()

    def run_episode(self):
        """Run one training episode, learning after every environment step.

        :return: ``(total_reward, number_of_steps)``
        """
        low = self.env.action_space2.low
        high = self.env.action_space2.high

        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:
            # Inference in eval mode, then switch back for the update below.
            self.mu = self.mu.eval()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            self.mu = self.mu.train()

            a = a + self.noise().detach().numpy()
            # Keep the noisy action inside the actor's action space.
            a = np.clip(a, low, high)

            s = s.detach().numpy()
            a_updated = self.LQR(s, a)
            s_p, r, done, _ = self.env.step(a_updated)
            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            self._learn_from_batch()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def _run_training(self, model_subdir, results_file):
        """Shared training loop behind ``train`` / ``train1`` / ``train2`` / ``train3``.

        The actor is saved every 10 episodes under
        ``<log_dir>/<model_subdir>/model_<i>``. The per-episode results are
        rewritten after every episode so an interrupted run keeps its history.
        """
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])
            if i % 10 == 0:
                torch.save(self.mu, self.log_dir + '/' + model_subdir +
                           '/model_' + str(i))
            np.save(self.log_dir + '/' + results_file, np.array(results))

    def train(self):
        """Train, writing to ``models/`` and ``results_train.npy``."""
        self._run_training('models', 'results_train.npy')

    def train1(self):
        """Train, writing to ``models1/`` and ``results_train1.npy``."""
        self._run_training('models1', 'results_train1.npy')

    def train2(self):
        """Train, writing to ``models2/`` and ``results_train2.npy``."""
        self._run_training('models2', 'results_train2.npy')

    def train3(self):
        """Train, writing to ``models3/`` and ``results_train3.npy``."""
        self._run_training('models3', 'results_train3.npy')

    def _run_eval_all(self, model_dir, num_eps, results_file):
        """Shared loop behind the ``eval_all*`` methods.

        File names in ``model_dir`` must look like ``<prefix>_<int>``; they are
        evaluated in increasing order of that integer.
        """
        results = []
        fnames = sorted(os.listdir(model_dir),
                        key=lambda x: int(x.split('_')[1]))
        for model_fname in fnames:
            print(model_fname)
            # NOTE: torch.load unpickles whole modules here; only point this at
            # checkpoints written by the train* methods of this class.
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
            np.save(self.log_dir + '/' + results_file, np.array(results))

    def eval_all(self, model_dir, num_eps=5):
        """Evaluate every saved actor in ``model_dir`` into ``results_eval.npy``."""
        self._run_eval_all(model_dir, num_eps, 'results_eval.npy')

    def eval_all1(self, model_dir, num_eps=5):
        """Evaluate every saved actor in ``model_dir`` into ``results_eval1.npy``."""
        self._run_eval_all(model_dir, num_eps, 'results_eval1.npy')

    def eval_all2(self, model_dir, num_eps=5):
        """Evaluate every saved actor in ``model_dir`` into ``results_eval2.npy``."""
        self._run_eval_all(model_dir, num_eps, 'results_eval2.npy')

    def eval_all3(self, model_dir, num_eps=5):
        """Evaluate every saved actor in ``model_dir`` into ``results_eval3.npy``."""
        self._run_eval_all(model_dir, num_eps, 'results_eval3.npy')

    def eval(self, num_eps=10, mu=None):
        """Average reward and length of ``num_eps`` noise-free episodes.

        :param num_eps: number of evaluation episodes
        :param mu: actor to evaluate; defaults to ``self.mu``
        :return: ``np.mean`` over episodes of ``[reward, length]``
        """
        if mu is None:
            mu = self.mu
        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        """Run one episode without exploration noise or learning.

        :param mu: actor to use; defaults to ``self.mu``
        :return: ``(total_reward, number_of_steps)``
        """
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()
            a_updated = self.LQR(s, a)
            s_p, r, done, _ = self.env.step(a_updated)
            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def _lqr_gain(self):
        """Return ``(K, u_sigma)`` for the fixed linear model used by :meth:`LQR`.

        The model matrices are constants, so the Riccati equation is solved
        once per instance and the result is cached.
        """
        cached = getattr(self, '_lqr_cache', None)
        if cached is not None:
            return cached

        FPS = 50
        gravity = 9.8 / FPS / FPS  # per-frame units
        thrust_main_max = gravity / 0.56
        thrust_side_max = thrust_main_max * 0.095 / 0.7  # determined by test
        m_main_inv = thrust_main_max
        m_side_inv = thrust_side_max
        a_i_inv = 0.198 / 100  # rad/frame^2, determined by test
        align = 0.87  # approximately cos(30 deg)

        # State: [x, y, vx, vy, theta, omega]; input: [main, side].
        A = np.array([
            [0, 0, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, -1 * gravity, 0],
            [0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0, 0]])
        B = np.array([
            [0, 0],
            [0, 0],
            [0, m_side_inv * align],
            [1 * m_main_inv, 0],
            [0, 0],
            [0, -1 * a_i_inv]])
        sigma = np.array([[0], [0], [0], [-1 * gravity], [0], [0]])

        # Constant input that cancels gravity (least-squares via B's pseudo-inverse).
        BTB = np.dot(B.T, B)
        u_sigma = -1 * np.linalg.inv(BTB).dot(B.T).dot(sigma)

        R = np.array([[1, 0], [0, 1]])
        Q = np.diag([1, 1, 1, 1, 100, 100])
        P = sp.linalg.solve_continuous_are(A, B, Q, R)
        # Optimal state feedback u = -K X with K = R^-1 B^T P.
        K = np.linalg.inv(R).dot(B.T).dot(P)

        self._lqr_cache = (K, u_sigma)
        return self._lqr_cache

    def LQR(self, s, a):
        """Convert an actor-level action into a thrust command via LQR feedback.

        ``a`` is clipped to ``env.action_space2`` and then used as the divisor
        that sets the altitude target from the current height ``s[1]``.

        :param s: observation (array-like or CPU tensor); indices 0-5 are read
        :param a: single-element action
        :return: array of shape ``(2, 1)`` clipped to ``[-1, 1]``
        :raises ValueError: if ``env.continuous`` is false
        """
        if not self.env.continuous:
            # Previously this path printed the message and then failed with an
            # UnboundLocalError on return.
            raise ValueError("please change to cts mode")

        SCALE = 30.0  # affects how fast-paced the game is
        VIEWPORT_W = 600
        VIEWPORT_H = 400
        half_w = VIEWPORT_W / SCALE / 2
        half_h = VIEWPORT_H / SCALE / 2

        s = np.asarray(s)
        a = np.clip(a, self.env.action_space2.low, self.env.action_space2.high)
        a_float = float(np.asarray(a).item())

        # All targets are zero except the altitude target derived from `a`.
        y_target = s[1] * half_h / a_float
        X = np.array([
            [s[0] * half_w],
            [s[1] * half_h - y_target],
            [s[2] / half_w],
            [s[3] / half_h],
            [s[4]],
            [s[5] / 20.0]])

        K, u_sigma = self._lqr_gain()
        thrust = -1 * np.dot(K, X) + u_sigma

        # Cut both thrusters once the height reading is essentially zero.
        if s[1] < 0.3 / SCALE:
            thrust[0] = 0
            thrust[1] = 0

        # conversion to compensate main thruster's tricky thrusting
        thrust[0] = thrust[0] / 0.5 - 1.0

        a_updated = np.array([thrust[0], thrust[1]])
        return np.clip(a_updated, -1, +1)

    def fill_buffer(self):
        """Collect ``buffer_min`` transitions using uniformly random actions.

        When an episode ends the environment is reset and collection continues
        from the reset observation, so stored transitions never start from a
        terminal state.
        """
        print('Filling buffer')
        low = self.env.action_space2.low
        high = self.env.action_space2.high
        s = self.env.reset().astype(np.float32)
        printed = 0
        while self.buffer.size < self.buffer_min:
            a = np.random.uniform(low, high, size=(self.a_dim))
            a_updated = self.LQR(s, a)
            if printed < 3:
                print("a {}\n".format(a), "actions:",
                      "{} {}".format(a_updated[0], a_updated[1]))
                printed += 1
            s_p, r, done, _ = self.env.step(a_updated)
            next_s = self.env.reset().astype(np.float32) if done else s_p
            self.buffer.add_tuple(s, a, r, s_p, done)
            s = next_s
class Agent():
    """DDPG agent with an in-memory replay buffer shared by parallel actors."""

    def __init__(self, state_size, action_size, num_agents=20):
        """Create actor/critic networks, their targets, optimizers and noise.

        :param state_size: forwarded to ``Actor`` and ``Critic``
        :param action_size: forwarded to ``Actor`` and ``Critic``
        :param num_agents: number of rows of the exploration-noise sample
            (previously hard-coded to 20)
        """
        super().__init__()
        gpu = torch.cuda.is_available()
        if gpu:
            print('GPU/CUDA works! Happy fast training :)')
            torch.cuda.current_device()
            torch.cuda.empty_cache()
            self.device = torch.device("cuda")
        else:
            print('training on cpu...')
            self.device = torch.device("cpu")

        self.actor = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001)

        self.critic = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001,
                                       weight_decay=0)

        self.replay_buffer = deque(maxlen=1000000)  # 1m
        self.gamma = 0.95
        self.batch_size = 128
        self.tau = 0.001

        # random.seed() returns None, so keep the seed value itself.
        self.seed = 2
        random.seed(self.seed)
        self.noise = OUNoise((num_agents, action_size), 2)

        # tau=1.0 makes the targets exact copies of the freshly built networks.
        self.target_network_update(self.actor_target, self.actor, 1.0)
        self.target_network_update(self.critic_target, self.critic, 1.0)

    def select_actions(self, state):
        """Return noisy actions for ``state``, clipped to ``[-1, 1]``.

        :param state: numpy array accepted by ``torch.from_numpy``
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy()
        self.actor.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def add(self, sars):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        self.replay_buffer.append(sars)

    def train(self):
        """Run one critic and one actor update if the buffer is large enough.

        Does nothing until the buffer holds more than ``batch_size`` entries.
        """
        if len(self.replay_buffer) <= self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.sample()

        # The TD target is a constant for the critic update, so build it
        # without autograd instead of back-propagating into the targets.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_state_q_v = self.critic_target(next_states, next_actions)
            q_targets = rewards + (self.gamma * next_state_q_v * (1 - dones))

        current_q_v = self.critic(states, actions)
        critic_loss = F.mse_loss(current_q_v, q_targets)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        # clip_grad_norm_ is the in-place replacement of the deprecated
        # clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optim.step()

        predicted_actions = self.actor(states)
        actor_loss = -self.critic(states, predicted_actions).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        self.target_network_update(self.actor_target, self.actor, self.tau)
        self.target_network_update(self.critic_target, self.critic, self.tau)

    def target_network_update(self, target_network, network, tau):
        """Blend ``network`` into ``target_network``: ``t = tau*n + (1-tau)*t``."""
        for network_param, target_param in zip(network.parameters(),
                                               target_network.parameters()):
            target_param.data.copy_(tau * network_param.data +
                                    (1.0 - tau) * target_param.data)

    def sample(self):
        """Draw ``batch_size`` transitions without replacement as float tensors.

        :return: ``(states, actions, rewards, next_states, dones)``; rewards and
            dones carry a trailing dimension of size 1
        """
        samples = random.sample(self.replay_buffer, k=self.batch_size)

        def column(index):
            # Stack through numpy first: building a tensor from a Python list
            # of ndarrays converts element by element and is much slower.
            stacked = np.array([row[index] for row in samples])
            return torch.as_tensor(stacked).float()

        states = column(0).to(self.device)
        actions = column(1).to(self.device)
        rewards = column(2).unsqueeze(1).to(self.device)
        next_states = column(3).to(self.device)
        dones = column(4).unsqueeze(1).to(self.device)
        return states, actions, rewards, next_states, dones
class DDPG():
    """A single DDPG learner: one actor, one critic, and their target copies."""

    def __init__(self, state_size, action_size, seed):
        """
        Build the networks, optimizers, noise process and replay memory.
        :param state_size: dimension of state (input) for this actor
        :param action_size: dimension of action (output) for this actor
        :param seed: random seed handed to the networks and the noise process
        """
        self.state_size, self.action_size, self.seed = state_size, action_size, seed
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        # Hyperparameters
        self.buffer_size = 100000
        self.batch_size = 256
        self.gamma = 0.99
        self.tau = 0.01
        self.lr_actor = 0.0001
        self.lr_critic = 0.001

        # Actor maps state -> action; local and target are built in this order.
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic maps (state, action) -> value.
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # With tau=1 the soft update is a plain copy: targets start equal to locals.
        for local_net, target_net in ((self.actor_local, self.actor_target),
                                      (self.critic_local, self.critic_target)):
            self.soft_update(local_net, target_net, tau=1)

        # Exploration noise and replay memory
        self.noise = OUNoise(self.action_size, self.seed)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def reset_noise(self):
        """ restart the exploration-noise process """
        self.noise.reset()

    def act(self, state, epsilon, add_noise=True):
        """
        Query the local actor for actions and optionally perturb them.
        :param state: observations for this individual agent (numpy array)
        :param epsilon: probability of adding exploration noise
        :param add_noise: when False, noise is never added and no random
                          number is drawn
        :return: actions clipped to [-1, 1]
        """
        policy = self.actor_local
        observation = torch.from_numpy(state).float().to(self.device)

        policy.eval()
        with torch.no_grad():
            actions = policy(observation).cpu().data.numpy()
        policy.train()

        explore = add_noise and epsilon > np.random.random()
        if explore:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def step(self):
        """ learn from one sampled batch once memory exceeds the batch size """
        if len(self.memory) <= self.batch_size:
            return
        self.learn(self.memory.sample())

    def learn(self, experiences):
        """
        One critic update, one actor update, then a soft target update.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        :param experiences: tuple (states, actions, rewards, next_states, dones)
                            sampled from the replay buffer
        """
        states, actions, rewards, next_states, dones = experiences

        # ---- critic: regress onto the bootstrapped target ---- #
        bootstrap_q = self.critic_target(next_states, self.actor_target(next_states))
        q_targets = rewards + (self.gamma * bootstrap_q * (1 - dones))
        critic_loss = F.mse_loss(self.critic_local(states, actions), q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---- actor: maximise Q by minimising its negative mean ---- #
        policy_actions = self.actor_local(states)
        actor_loss = -1 * self.critic_local(states, policy_actions).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---- targets: critic first, then actor ---- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """
        In-place blend of parameters: θ_target = τ*θ_local + (1 - τ)*θ_target
        :param local_network: network whose weights are read
        :param target_network: network whose weights are overwritten
        :param tau: interpolation factor
        """
        paired = zip(target_network.parameters(), local_network.parameters())
        for dst, src in paired:
            blended = tau*src.data + (1.0-tau)*dst.data
            dst.data.copy_(blended)
class A2C():
    """Actor-critic learner with target networks and noisy target actions.

    Observations with a 3-element ``state_dim`` use a single joint
    ``ActorCriticCNN`` model; any other ``state_dim`` uses separate ``Actor``
    and ``Critic`` networks.
    """

    def __init__(self, state_dim, action_dim, action_lim, update_type='soft',
                 lr_actor=1e-4, lr_critic=1e-3, tau=1e-3, mem_size=1e6,
                 batch_size=256, gamma=0.99, other_cars=False, ego_dim=None):
        """Build networks, optimizers, replay memory and noise processes.

        :param state_dim: observation shape; ``len(state_dim) == 3`` selects
            the joint CNN model
        :param action_dim: forwarded to the networks and the replay memory
        :param action_lim: bound used to clamp noisy target actions
        :param update_type: ``'soft'`` for soft target updates, anything else
            for hard updates
        :param lr_actor: learning rate of the actor (or joint model)
        :param lr_critic: learning rate of the critic
        :param tau: interpolation factor for soft updates
        :param mem_size: replay capacity (divided by 100 for the joint model)
        :param batch_size: minibatch size used by :meth:`update`
        :param gamma: discount factor
        :param other_cars: forwarded to ``Actor`` and ``Critic``
        :param ego_dim: forwarded to ``Actor`` and ``Critic``
        """
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.joint_model = False

        if len(state_dim) == 3:
            self.model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor)
            self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim)
            self.target_model.load_state_dict(self.model.state_dict())
            self.model.to(self.device)
            self.target_model.to(self.device)
            self.joint_model = True
        else:
            self.actor = Actor(state_dim, action_dim, action_lim,
                               other_cars=other_cars, ego_dim=ego_dim)
            self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor)
            self.target_actor = Actor(state_dim, action_dim, action_lim,
                                      other_cars=other_cars, ego_dim=ego_dim)
            self.target_actor.load_state_dict(self.actor.state_dict())
            self.target_actor.eval()

            self.critic = Critic(state_dim, action_dim,
                                 other_cars=other_cars, ego_dim=ego_dim)
            self.critic_optim = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic, weight_decay=1e-2)
            self.target_critic = Critic(state_dim, action_dim,
                                        other_cars=other_cars, ego_dim=ego_dim)
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_critic.eval()

            self.actor.to(self.device)
            self.target_actor.to(self.device)
            self.critic.to(self.device)
            self.target_critic.to(self.device)

        self.action_lim = action_lim
        self.tau = tau  # hard update if tau is None
        self.update_type = update_type
        self.batch_size = batch_size
        self.gamma = gamma

        if self.joint_model:
            mem_size = mem_size // 100
        self.memory = Memory(int(mem_size), action_dim, state_dim)

        mu = np.zeros(action_dim)
        sigma = np.array([0.5, 0.05])
        self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma)
        self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma)

        self.initialised = True
        self.training = False

    def _named_networks(self):
        """Return ``(checkpoint_name, module)`` pairs for the active networks."""
        if self.joint_model:
            return [('model', self.model),
                    ('target_model', self.target_model)]
        return [('actor', self.actor),
                ('target_actor', self.target_actor),
                ('critic', self.critic),
                ('target_critic', self.target_critic)]

    def select_action(self, obs):
        """Return the flattened policy action for one observation.

        Exploration noise is added only while the agent is in training mode.
        """
        with torch.no_grad():
            obs = torch.FloatTensor(np.expand_dims(obs, axis=0)).to(self.device)
            if self.joint_model:
                action, _ = self.model(obs)
                action = action.data.cpu().numpy().flatten()
            else:
                action = self.actor(obs).data.cpu().numpy().flatten()
        if self.training:
            action += self.noise()
        return action

    def append(self, obs0, action, reward, obs1, terminal1):
        """Store one transition in the replay memory."""
        self.memory.append(obs0, action, reward, obs1, terminal1)

    def reset_noise(self):
        """Reset both the exploration and the target noise processes."""
        self.noise.reset()
        self.target_noise.reset()

    def train(self):
        """Put every network in training mode and enable exploration noise."""
        for _, network in self._named_networks():
            network.train()
        self.training = True

    def eval(self):
        """Put every network in eval mode and disable exploration noise."""
        for _, network in self._named_networks():
            network.eval()
        self.training = False

    def save(self, folder, episode, previous=None, solved=False):
        """Write a checkpoint of every network and drop the previous one.

        Files are named ``<folder><kind>_ep<episode>.pth``, or
        ``<folder><kind>_solved.pth`` when ``solved`` is true. ``folder`` is
        used as a plain string prefix, so it must end with a path separator.

        :param previous: episode number of an older checkpoint to delete
            (ignored when ``None`` or not positive)
        :raises FileNotFoundError: if the ``previous`` checkpoint is missing
        """
        def episode_path(kind, ep):
            return folder + '%s' % kind + ('_ep%d' % ep) + '.pth'

        def current_path(kind):
            if solved:
                return folder + '%s' % kind + '_solved.pth'
            return episode_path(kind, episode)

        networks = self._named_networks()
        for kind, network in networks:
            torch.save(network.state_dict(), current_path(kind))

        if previous is not None and previous > 0:
            for kind, _ in networks:
                # Always address the old file by its episode name: with
                # solved=True the shared name formatter used to resolve to the
                # file that had just been written and deleted it.
                stale = episode_path(kind, previous)
                if stale != current_path(kind):
                    os.remove(stale)

    @staticmethod
    def _sibling_checkpoint(actor_filepath, kind):
        """Path of the ``kind`` checkpoint stored next to ``actor_filepath``.

        The qualifier is everything after the last underscore of the given
        path; the folder is everything up to and including its last ``/``.
        """
        qualifier = '_' + actor_filepath.split("_")[-1]
        folder = actor_filepath[:actor_filepath.rfind("/") + 1]
        return folder + '%s' % kind + qualifier

    def _load_into(self, network, actor_filepath, kind):
        """Load the ``kind`` sibling checkpoint into ``network``."""
        path = self._sibling_checkpoint(actor_filepath, kind)
        network.load_state_dict(torch.load(path, map_location=self.device))

    def load_actor(self, actor_filepath):
        """Load the policy networks (joint model, or actor and target actor)."""
        if self.joint_model:
            self._load_into(self.model, actor_filepath, 'model')
            self._load_into(self.target_model, actor_filepath, 'target_model')
        else:
            self._load_into(self.actor, actor_filepath, 'actor')
            self._load_into(self.target_actor, actor_filepath, 'target_actor')

    def load_all(self, actor_filepath):
        """Load the policy networks and, for the split model, the critics too."""
        self.load_actor(actor_filepath)
        if not self.joint_model:
            self._load_into(self.critic, actor_filepath, 'critic')
            self._load_into(self.target_critic, actor_filepath, 'target_critic')

    def _perturb_target_actions(self, target_actions):
        """Add one target-noise sample per row and clamp to ``±action_lim``.

        The earlier code called the out-of-place ``clamp`` and discarded its
        result, so noisy target actions were never bounded.
        """
        batch = target_actions.shape[0]
        if batch == 0:
            return target_actions
        noise = np.stack([np.asarray(self.target_noise()) for _ in range(batch)])
        noise = torch.as_tensor(noise, dtype=target_actions.dtype,
                                device=target_actions.device)
        return (target_actions + noise).clamp(-self.action_lim, self.action_lim)

    def update(self, target_noise=True):
        """Run one optimisation step on a sampled minibatch.

        :param target_noise: perturb the target policy's actions before
            evaluating the target Q-values
        :return: ``(action_loss, value_loss)`` as floats, or ``(None, None)``
            when the memory cannot supply a batch yet
        """
        try:
            minibatch = self.memory.sample(self.batch_size)  # dict of ndarrays
        except ValueError:
            print('Replay memory not big enough. Continue.')
            return None, None

        states = torch.FloatTensor(minibatch['obs0']).to(self.device)
        actions = torch.FloatTensor(minibatch['actions']).to(self.device)
        rewards = torch.FloatTensor(minibatch['rewards']).to(self.device)
        next_states = torch.FloatTensor(minibatch['obs1']).to(self.device)
        terminals = torch.FloatTensor(minibatch['terminals1']).to(self.device)

        if self.joint_model:
            # Targets are constants for the regression, so no autograd graph
            # is built through the target network.
            with torch.no_grad():
                target_actions, _ = self.target_model(next_states)
                if target_noise:
                    target_actions = self._perturb_target_actions(target_actions)
                _, target_qvals = self.target_model(
                    next_states, target_actions=target_actions)
                y = rewards + self.gamma * (1 - terminals) * target_qvals

            _, model_qvals = self.model(states, target_actions=actions)
            value_loss = F.mse_loss(model_qvals, y)

            model_actions, _ = self.model(states)
            _, model_qvals = self.model(states, target_actions=model_actions)
            action_loss = -model_qvals.mean()

            self.model_optim.zero_grad()
            (value_loss + action_loss).backward()
            self.model_optim.step()
        else:
            with torch.no_grad():
                target_actions = self.target_actor(next_states)
                if target_noise:
                    target_actions = self._perturb_target_actions(target_actions)
                target_critic_qvals = self.target_critic(next_states,
                                                         target_actions)
                y = rewards + self.gamma * (1 - terminals) * target_critic_qvals

            # optimise critic
            critic_qvals = self.critic(states, actions)
            value_loss = F.mse_loss(critic_qvals, y)
            self.critic_optim.zero_grad()
            value_loss.backward()
            self.critic_optim.step()

            # optimise actor
            action_loss = -self.critic(states, self.actor(states)).mean()
            self.actor_optim.zero_grad()
            action_loss.backward()
            self.actor_optim.step()

        # optimise target networks
        if self.update_type == 'soft':
            if self.joint_model:
                soft_update(self.target_model, self.model, self.tau)
            else:
                soft_update(self.target_actor, self.actor, self.tau)
                soft_update(self.target_critic, self.critic, self.tau)
        else:
            if self.joint_model:
                hard_update(self.target_model, self.model)
            else:
                hard_update(self.target_actor, self.actor)
                hard_update(self.target_critic, self.critic)

        return action_loss.item(), value_loss.item()
class DDPG():
    """DDPG agent that trains an actor/critic pair directly on ``env``."""

    def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2,
                 batch_norm=True, merge_layer=2, buffer_size=int(1e6),
                 buffer_min=int(1e4), tau=1e-3, Q_wd=1e-2, num_episodes=1000):
        """Build networks, optimizers and replay buffer, then pre-fill the buffer.

        :param env: environment exposing ``reset``, ``step`` and ``action_space``
        :param log_dir: directory under which models and result arrays are saved
        :param gamma: discount factor
        :param batch_size: minibatch size drawn from the buffer per update
        :param sigma: scale of the Ornstein-Uhlenbeck exploration noise
        :param batch_norm: forwarded to ``Actor`` and ``Critic``
        :param merge_layer: forwarded to ``Critic``
        :param buffer_size: replay buffer capacity
        :param buffer_min: number of random transitions collected before training
        :param tau: interpolation factor used by the target-network tracking
        :param Q_wd: weight decay of the critic optimizer
        :param num_episodes: number of episodes run by the ``train*`` methods
        """
        self.s_dim = env.reset().shape[0]
        self.a_dim = env.action_space.shape[0]
        self.env = env

        self.mu = Actor(self.s_dim, self.a_dim, env.action_space,
                        batch_norm=batch_norm)
        self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm,
                        merge_layer=merge_layer)
        self.targ_mu = copy.deepcopy(self.mu).eval()
        self.targ_Q = copy.deepcopy(self.Q).eval()

        self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim),
                                       sigma=sigma * torch.ones(self.a_dim))
        self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim)
        self.buffer_min = buffer_min

        self.mse_fn = torch.nn.MSELoss()
        self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3,
                                            weight_decay=Q_wd)

        self.gamma = gamma
        self.batch_size = batch_size
        self.num_episodes = num_episodes
        self.tau = tau
        self.log_dir = log_dir

        self.fill_buffer()

    def track_network(self, target, main):
        """Move ``target`` parameters a fraction ``self.tau`` towards ``main``."""
        with torch.no_grad():
            for pt, pm in zip(target.parameters(), main.parameters()):
                pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data)

    def track_networks(self):
        """Apply :meth:`track_network` to both the actor and the critic targets."""
        self.track_network(self.targ_mu, self.mu)
        self.track_network(self.targ_Q, self.Q)

    def _learn_from_batch(self):
        """Run one critic step, one actor step and one target-tracking step."""
        s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample(
            batch_size=self.batch_size)

        # Critic: regress Q(s, a) onto the bootstrapped one-step target.
        with torch.no_grad():
            q_p_pred = torch.squeeze(
                self.targ_Q(s_p_batch, self.targ_mu(s_p_batch)))
            y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred
        self.Q_optimizer.zero_grad()
        q_pred = torch.squeeze(self.Q(s_batch, a_batch))
        Q_loss = self.mse_fn(q_pred, y)
        Q_loss.backward(retain_graph=False)
        self.Q_optimizer.step()

        # Actor: ascend the critic's estimate of the actor's own actions.
        self.mu_optimizer.zero_grad()
        q_pred_mu = torch.squeeze(self.Q(s_batch, self.mu(s_batch)))
        mu_loss = -torch.mean(q_pred_mu)
        mu_loss.backward(retain_graph=False)
        self.mu_optimizer.step()

        self.track_networks()

    def run_episode(self):
        """Run one training episode, learning after every environment step.

        :return: ``(total_reward, number_of_steps)``
        """
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        t = 0
        tot_r = 0
        while not done:
            # Inference in eval mode, then switch back for the update below.
            self.mu = self.mu.eval()
            a = torch.squeeze(self.mu(s)).detach().numpy()
            self.mu = self.mu.train()
            a = a + self.noise().detach().numpy()

            s = s.detach().numpy()
            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            self.buffer.add_tuple(s, a, r, s_p, done)

            self._learn_from_batch()

            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
            t += 1
        return tot_r, t

    def _run_training(self, model_subdir, results_file):
        """Shared training loop behind ``train`` / ``train1`` / ``train2`` / ``train3``.

        The actor is saved every 20 episodes under
        ``<log_dir>/<model_subdir>/model_<i>``. The per-episode results are
        rewritten after every episode so an interrupted run keeps its history.
        """
        results = []
        for i in range(self.num_episodes):
            r, t = self.run_episode()
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
            results.append([r, t])
            if i % 20 == 0:
                torch.save(self.mu, self.log_dir + '/' + model_subdir +
                           '/model_' + str(i))
            np.save(self.log_dir + '/' + results_file, np.array(results))

    def train(self):
        """Train, writing to ``models/`` and ``results_train.npy``."""
        self._run_training('models', 'results_train.npy')

    def train1(self):
        """Train, writing to ``models1/`` and ``results_train1.npy``."""
        self._run_training('models1', 'results_train1.npy')

    def train2(self):
        """Train, writing to ``models2/`` and ``results_train2.npy``."""
        self._run_training('models2', 'results_train2.npy')

    def train3(self):
        """Train, writing to ``models3/`` and ``results_train3.npy``."""
        self._run_training('models3', 'results_train3.npy')

    def eval_all(self, model_dir, num_eps=5):
        """Evaluate every saved actor in ``model_dir`` into ``results_eval.npy``.

        File names must look like ``<prefix>_<int>``; they are evaluated in
        increasing order of that integer.
        """
        results = []
        fnames = sorted(os.listdir(model_dir),
                        key=lambda x: int(x.split('_')[1]))
        for model_fname in fnames:
            print(model_fname)
            # NOTE: torch.load unpickles whole modules here; only point this at
            # checkpoints written by the train* methods of this class.
            mu = torch.load(os.path.join(model_dir, model_fname))
            r, t = self.eval(num_eps=num_eps, mu=mu)
            results.append([r, t])
            np.save(self.log_dir + '/results_eval.npy', np.array(results))

    def eval(self, num_eps=10, mu=None):
        """Average reward and length of ``num_eps`` noise-free episodes.

        :param num_eps: number of evaluation episodes
        :param mu: actor to evaluate; defaults to ``self.mu``
        :return: ``np.mean`` over episodes of ``[reward, length]``
        """
        if mu is None:
            mu = self.mu
        results = []
        mu = mu.eval()
        for i in range(num_eps):
            r, t = self.run_eval_episode(mu=mu)
            results.append([r, t])
            print('{} reward: {:.2f}, length: {}'.format(i, r, t))
        return np.mean(results, axis=0)

    def run_eval_episode(self, mu=None):
        """Run one episode without exploration noise or learning.

        :param mu: actor to use; defaults to ``self.mu``
        :return: ``(total_reward, number_of_steps)``
        """
        if mu is None:
            mu = self.mu
        done = False
        s = torch.tensor(self.env.reset().astype(np.float32),
                         requires_grad=False)
        tot_r = t = 0
        while not done:
            a = mu(s).view(-1).detach().numpy()
            s_p, r, done, _ = self.env.step(a)
            tot_r += r
            t += 1
            s = torch.tensor(s_p.astype(np.float32), requires_grad=False)
        return tot_r, t

    def fill_buffer(self):
        """Collect ``buffer_min`` transitions using uniformly random actions.

        When an episode ends the environment is reset and collection continues
        from the reset observation, so stored transitions never start from a
        terminal state.
        """
        print('Filling buffer')
        low = self.env.action_space.low
        high = self.env.action_space.high
        s = self.env.reset().astype(np.float32)
        while self.buffer.size < self.buffer_min:
            a = np.random.uniform(low, high, size=(self.a_dim))
            s_p, r, done, _ = self.env.step(a)
            next_s = self.env.reset().astype(np.float32) if done else s_p
            self.buffer.add_tuple(s, a, r, s_p, done)
            s = next_s
score = 0 steps = 0 noise_std = args.noise_std_start for i in range(args.episodes): env_info = env.reset(train_mode=True)[brain_name] state = torch.from_numpy( env_info.vector_observations).view(-1).float().to(device) for t in range(args.max_t): with torch.no_grad(): actor.eval() action = torch.clamp( actor_target(state.unsqueeze(0)) + torch.zeros( (1, action_size * 2)).normal_(0, noise_std).to(device), -1, 1).squeeze().float() #+ ou_process.sample() actor.train() env_info = env.step( torch.stack( (action[:action_size], action[action_size:])).to('cpu').numpy())[brain_name] next_state = torch.from_numpy( env_info.vector_observations).view(-1).float() reward = torch.tensor(env_info.rewards).sum().float() score += reward.item() done = torch.tensor(env_info.local_done[0] or env_info.local_done[1]).float() replay_buffer.push(state.to('cpu'), action.to('cpu'), next_state, reward, done) if ((steps + 1) % args.n_steps == 0 and len(replay_buffer) >= args.batch_size): for iteration in range(args.iterations):
class DDPG():
    """
    Deep Deterministic Policy Gradients Agent used to interact with and learn
    from an environment.

    Holds local/target actor and critic networks, an Ornstein-Uhlenbeck noise
    process for exploration, and a replay buffer shared by all agents.
    """

    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon: float, random_seed: int):
        """
        Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Start the target networks with the same parameters as the locals
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def __str__(self):
        return "DDPG_Agent"

    def _output_path(self, subdir, suffix):
        """Return ``PATH/<subdir>/<agent name><suffix>``, creating the folder.

        Built with os.path.join instead of hard-coded backslashes so the path
        is valid on every platform and contains no invalid escape sequences.
        """
        import os  # local import: the file's import block is not visible here

        directory = os.path.join(PATH, subdir)
        os.makedirs(directory, exist_ok=True)
        return os.path.join(directory, f'{self}{suffix}')

    def _save_checkpoints(self):
        """Save the local actor and critic state dicts to the checkpoint folder."""
        torch.save(self.actor_local.state_dict(),
                   self._output_path('checkpoints', '_Actor_Multiple.pth'))
        torch.save(self.critic_local.state_dict(),
                   self._output_path('checkpoints', '_Critic_Multiple.pth'))

    def _write_scores(self, suffix, values):
        """Write one value per line to a text file in the scores folder."""
        with open(self._output_path('scores', suffix), 'w') as handle:
            handle.write("\n".join(str(value) for value in values))

    def train(self, env, brain_name, num_episodes=200, max_time=1000,
              print_every=10):
        """
        Interacts with and learns from a given Unity Environment

        :param env: Unity Environment the agents are trying to learn
        :param brain_name: Brain for Environment
        :param num_episodes: Number of episodes to train
        :param max_time: Maximum number of time steps per episode
        :param print_every: How often (in episodes) to print the running
            average and save checkpoints; also the running-average window
        :return: per-episode mean scores and their running averages over the
            last ``print_every`` episodes, as two lists
        """
        # --------- Set Everything up --------#
        scores = []
        avg_scores = []
        scores_deque = deque(maxlen=print_every)

        # -------- Simulation Loop --------#
        for episode_num in range(1, num_episodes + 1):
            # Reset everything
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            episode_scores = np.zeros(self.num_agents)
            self.reset_noise()

            # Run the episode
            for _ in range(max_time):
                actions = self.act(states, self.epsilon)
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done
                self.step(states, actions, rewards, next_states, dones)
                episode_scores += rewards
                states = next_states
                # The episode ends as soon as any agent reports done
                if np.any(dones):
                    break

            # -------- Episode Finished ---------#
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)
            episode_mean = np.mean(episode_scores)
            scores.append(episode_mean)
            scores_deque.append(episode_mean)
            avg_scores.append(np.mean(scores_deque))
            if episode_num % print_every == 0:
                print(
                    f'Episode: {episode_num} \tAverage Score: {round(np.mean(scores_deque), 2)}'
                )
                self._save_checkpoints()

        # -------- All Episodes finished: save parameters and scores --------#
        self._save_checkpoints()
        # Mean score per episode (averaged over the agents)
        self._write_scores('_Multiple_Scores.txt', scores)
        # Running average over the last ``print_every`` episodes
        self._write_scores('_Multiple_AvgScores.txt', avg_scores)
        return scores, avg_scores

    def step(self, states, actions, rewards, next_states, dones):
        """
        What the agent needs to do for every time step that occurs in the
        environment. Takes in a (s,a,r,s',d) tuple per agent, saves it to
        memory and periodically learns from sampled experiences.
        Note: this is not the same as a step in the environment.
        Step is only called once per environment time step.

        :param states: array of states agents used to select actions
        :param actions: array of actions taken by agents
        :param rewards: array of rewards for last action taken in environment
        :param next_states: array of next states after actions were taken
        :param dones: array of bools representing if environment is finished or not
        """
        # Save each agent's experience in the shared replay memory
        for agent_num in range(self.num_agents):
            self.memory.add(states[agent_num], actions[agent_num],
                            rewards[agent_num], next_states[agent_num],
                            dones[agent_num])

        # Learn "num_updates" times every "update_every" time steps, once the
        # memory holds more than one batch
        self.t_step += 1
        if (len(self.memory) > self.batch_size
                and self.t_step % self.update_every == 0):
            self.t_step = 0
            for _ in range(self.num_updates):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, states, epsilon, add_noise=True):
        """
        Returns actions for given states as per current policy. Policy comes
        from the actor network.

        :param states: array of states from the environment
        :param epsilon: probability of exploration
        :param add_noise: bool on whether or not to potentially have
            exploration for action
        :return: actions clipped to [-1, 1]
        """
        states = torch.from_numpy(states).float().to(self.device)
        # eval()/train() only switch the module's mode flag; gradient
        # tracking is disabled by the no_grad block.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        # With probability epsilon, add one noise sample per agent
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.num_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ Resets the noise process """
        self.noise.reset()

    def learn(self, experiences):
        """
        Update actor and critic networks using a given batch of experiences
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> actions
            critic_target(states, actions) -> Q-value

        :param experiences: tuple of arrays (states, actions, rewards,
            next_states, dones) sampled from the replay buffer
        """
        states, actions, rewards, next_states, dones = experiences

        # -------------------- Update Critic -------------------- #
        # The targets are constants for the critic update, so compute them
        # without autograd: no backward pass through the target networks and
        # no stray gradients accumulating on their parameters.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_q_targets = self.critic_target(next_states, next_actions)
            q_targets = rewards + (self.gamma * next_q_targets * (1 - dones))

        # Compute critic loss (Same as DQN Loss)
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # -------------------- Update Actor --------------------- #
        # Compute actor loss (maximize mean of Q(states, actions))
        action_preds = self.actor_local(states)
        # Optimizer minimizes and we want to maximize so multiply by -1
        actor_loss = -1 * self.critic_local(states, action_preds).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------- Update Target Networks ---------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_network, target_network, tau):
        """
        Soft update of network parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_network: PyTorch Network that is always up to date
        :param target_network: PyTorch Network that is not up to date
        :param tau: update (interpolation) parameter
        """
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)