class MADDPGAgent:
    """ Defines a Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent """

    def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99, tau=1e-3,
                 lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4, clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Store parameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size, act_size).to(device)

        # Copy initial network parameters to the target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                     weight_decay=weight_decay_critic)

    def act(self, obs, noise=0.0):
        """ Act using the online actor network """
        obs = obs.to(device)
        action = self.actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        """ Act using the target actor network (used for training) """
        obs = obs.to(device)
        action = self.target_actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def update_targets(self):
        """ Perform a soft update of the target network parameters based on the latest
        actor/critic parameters """
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

    def train(self, samples):
        """ Perform a training step for the critic and actor networks with a soft update """
        # Unpack data from the replay buffer and convert to tensors
        obs = torch.tensor([exp[0] for exp in samples], dtype=torch.float, device=device)
        act = torch.tensor([exp[1] for exp in samples], dtype=torch.float, device=device)
        reward = torch.tensor([exp[2] for exp in samples], dtype=torch.float, device=device)
        next_obs = torch.tensor([exp[3] for exp in samples], dtype=torch.float, device=device)
        done = torch.tensor([exp[4] for exp in samples], dtype=torch.float, device=device)
        obs_full = torch.tensor([exp[5] for exp in samples], dtype=torch.float, device=device)
        next_obs_full = torch.tensor([exp[6] for exp in samples], dtype=torch.float, device=device)
        act_full = torch.tensor([exp[7] for exp in samples], dtype=torch.float, device=device)

        # Critic update
        self.critic_optimizer.zero_grad()

        target_critic_obs = [next_obs_full[:, i, :].squeeze()
                             for i in range(self.num_agents)]
        target_critic_obs = torch.cat(target_critic_obs, dim=1)
        target_act = [self.target_act(next_obs_full[:, i, :].squeeze())
                      for i in range(self.num_agents)]
        target_act = torch.cat(target_act, dim=1)

        with torch.no_grad():
            q_next = self.target_critic(target_critic_obs, target_act)
        q_target = reward + self.gamma * q_next * (1 - done)

        critic_obs = [obs_full[:, i, :].squeeze()
                      for i in range(self.num_agents)]
        critic_obs = torch.cat(critic_obs, dim=1)
        critic_act = [act_full[:, i, :].squeeze()
                      for i in range(self.num_agents)]
        critic_act = torch.cat(critic_act, dim=1)
        q = self.critic(critic_obs, critic_act)

        critic_loss = torch.nn.functional.mse_loss(q, q_target.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.clip_grad)
        self.critic_optimizer.step()

        # Actor update using the policy gradient
        self.actor_optimizer.zero_grad()
        actor_act = [self.act(obs_full[:, i, :].squeeze())
                     for i in range(self.num_agents)]
        actor_act = torch.cat(actor_act, dim=1)
        actor_loss = -self.critic(critic_obs, actor_act).mean()
        actor_loss.backward()
        # Clip gradients only after backprop, then step the optimizer
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
        self.actor_optimizer.step()

        # Update target networks
        self.update_targets()
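# The class above calls hard_update(target, source), soft_update(target, source, tau) and
# OUNoise(act_size, scale=...) without defining them in this section. The following is a
# minimal sketch of what those helpers could look like, matching only the call signatures
# used above; the OU hyperparameters (mu, theta, sigma) are illustrative assumptions, and
# the actual project may implement these helpers differently.
import torch


def hard_update(target, source):
    """Copy every parameter of the source network into the target network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Blend target parameters toward source parameters: θ_target ← τ·θ_source + (1 − τ)·θ_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process exposing the noise() method used by MADDPGAgent.act()."""

    def __init__(self, size, scale=1.0, mu=0.0, theta=0.15, sigma=0.2):
        self.size, self.scale = size, scale
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.reset()

    def reset(self):
        self.state = torch.ones(self.size) * self.mu

    def noise(self):
        # dx_t = theta * (mu - x_t) + sigma * N(0, 1); returns the scaled internal state
        dx = self.theta * (self.mu - self.state) + self.sigma * torch.randn(self.size)
        self.state = self.state + dx
        return self.state * self.scale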
class DdpgAgent:
    """ A Deep Deterministic Policy Gradient Agent.
    Interacts with and learns from the environment. """

    def __init__(self, num_agents, state_size, action_size, random_seed):
        """ Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents observed at the same time.
                Multiple agents are handled within the class.
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        self.t_step = 0  # A counter that increases each time the "step" function is executed
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = ActorNetwork(state_size, action_size, USE_BATCH_NORM, random_seed,
                                        fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
                                        fc3_units=FC3_UNITS).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, USE_BATCH_NORM, random_seed,
                                         fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
                                         fc3_units=FC3_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_ACTOR)
        # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR,
        #                                      weight_decay=WEIGHT_DECAY_ACTOR)  # Also solves it, but Adam is quicker

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNetwork(state_size, action_size, USE_BATCH_NORM, random_seed,
                                          fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
                                          fc3_units=FC3_UNITS).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, USE_BATCH_NORM, random_seed,
                                           fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
                                           fc3_units=FC3_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CRITIC)
        # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                       weight_decay=WEIGHT_DECAY_CRITIC)  # Also solves it, but Adam is quicker

        # Make sure the targets are initialized with the same weights as the local networks
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        # Set default modes for the networks:
        # target networks never need to train, so they stay in eval();
        # local networks stay in training mode, unless altered in code - e.g. when acting.
        self.actor_local.train()
        self.actor_target.eval()
        self.critic_local.train()
        self.critic_target.eval()

        # Action noise process (encourages exploration during training).
        # Parameter noise could be a better alternative / addition in the future.
        if ACTION_NOISE_METHOD == 'initial':
            self.noise = InitialOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size), random_seed=random_seed,
                x0=0, mu=0, theta=NOISE_THETA, sigma=NOISE_SIGMA)
        elif ACTION_NOISE_METHOD == 'adjusted':
            self.noise = AdjustedOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size), random_seed=random_seed,
                x0=0, mu=0, sigma=NOISE_SIGMA, theta=NOISE_THETA,
                dt=NOISE_DT, sigma_delta=NOISE_SIGMA_DELTA)
        else:
            raise ValueError('Unknown action noise method: ' + ACTION_NOISE_METHOD)

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=REPLAY_BUFFER_SIZE, batch_size=BATCH_SIZE,
            sampling_method=REPLAY_BUFFER_SAMPLING_METHOD, random_seed=random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        self.t_step += 1

        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn every UPDATE_EVERY steps, if enough samples are available in memory
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_action_noise=False):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()  # train mode is set right before actual training
        with torch.no_grad():  # All calcs here with no_grad, but many examples don't do this. Weirdly, this is slower.
            return np.clip(
                self.actor_local(states).cpu().data.numpy() +
                (self.noise.sample() if add_action_noise else 0), -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): reward discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        self.actor_local.train()  # critic_local is always in train mode, but actor_local goes into eval mode while acting

        # Critic
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_GRADIENT_CRITIC:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if CLIP_GRADIENT_ACTOR:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # Soft update of target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update target model parameters from local model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
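# DdpgAgent relies on a ReplayBuffer that is not shown in this section. Below is a minimal
# uniform-sampling sketch matching the interface used above (add, sample, __len__). The real
# buffer also takes a sampling_method argument (e.g. for a prioritized variant); this sketch
# ignores it and always samples uniformly, so treat it as an illustration only.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class UniformReplayBuffer:
    """Fixed-size buffer storing experience tuples and returning uniform random mini-batches."""

    def __init__(self, buffer_size, batch_size, sampling_method=None, random_seed=None):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        if random_seed is not None:
            random.seed(random_seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)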
class Agent:

    def __init__(self, n_actions, n_states, obs_shape, gamma=0.99, lr=0.0003,
                 gae_lambda=0.95, entropy_coeff=0.0005, ppo_clip=0.2,
                 mini_batch_size=64, n_epochs=10, clip_value_loss=True,
                 normalize_observation=False, stop_normalize_obs_after_timesteps=50000,
                 fc1=64, fc2=64, environment='None', run=0):
        self.entropy_coeff = entropy_coeff
        self.clip_value_loss = clip_value_loss
        self.gamma = gamma
        self.ppo_clip = ppo_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.normalize_observation = normalize_observation
        self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
        self.timestep = 0

        self.actor = ActorNetwork(n_states=n_states, n_actions=n_actions, lr=lr,
                                  fc1_dims=fc1, fc2_dims=fc2, chkpt_dir=environment, run=run)
        self.critic = CriticNetwork(n_states=n_states, lr=lr, fc1_dims=fc1, fc2_dims=fc2,
                                    chkpt_dir=environment, run=run)
        self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
        self.running_stats = RunningStats(shape_states=obs_shape, chkpt_dir=environment, run=run)
        # self.optimizer = optim.Adam(list(self.actor.parameters()) + list(self.critic.parameters()),
        #                             lr=lr, eps=1e-5)

    def remember(self, state, action, log_probs, value, reward, done):
        self.memory.store_memory(state, action, log_probs, value, reward, done)

    def remember_adv(self, advantage_list):
        self.memory.store_advantage(advantage_list)

    def save_networks(self):
        print('--saving networks--')
        self.actor.save_actor()
        self.critic.save_critic()
        if self.normalize_observation:
            self.running_stats.save_stats()

    def load_networks(self):
        print('--loading networks--')
        self.actor.load_actor()
        self.critic.load_critic()
        if self.normalize_observation:
            self.running_stats.load_stats()

    def normalize_obs(self, obs):
        mean, std = self.running_stats()
        obs_norm = (obs - mean) / (std + 1e-6)
        return obs_norm

    def choose_action(self, observation):
        if self.normalize_observation:
            if self.timestep < self.stop_obs_timesteps:
                self.running_stats.online_update(observation)
            elif self.timestep == self.stop_obs_timesteps:
                print('No online update for obs Normalization anymore')
            observation = self.normalize_obs(observation)  # Normalize observations

        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        dist, _ = self.actor(state)
        value = self.critic(state)
        action = dist.sample()

        log_probs = dist.log_prob(action)
        log_probs = T.sum(log_probs, dim=1, keepdim=True).squeeze().detach().cpu().numpy()
        value = T.squeeze(value).item()
        # action = T.squeeze(action).detach().numpy()
        if action.shape[0] == 1 and action.shape[1] == 1:
            action = action.detach().cpu().numpy()[0].reshape(1, )
        else:
            action = T.squeeze(action).detach().cpu().numpy()

        self.timestep += 1
        return action, log_probs, value

    def choose_deterministic_action(self, observation):
        if self.normalize_observation:
            observation = self.normalize_obs(observation)  # Normalize observations
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        _, mean = self.actor(state)
        action = T.squeeze(mean).detach().cpu().numpy()  # .reshape(1, )
        return action

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr, \
                reward_arr, dones_arr, advantage_arr, batches = \
                self.memory.generate_batches()

            if self.normalize_observation:
                # print(state_arr[0:5, :])
                state_arr = self.normalize_obs(state_arr)
                # print(state_arr[0:5, :])

            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
                old_log_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device).detach()
                actions = T.tensor(action_arr[batch]).to(self.actor.device).detach()
                critic_value_old = T.tensor(vals_arr[batch]).to(self.actor.device).detach()
                advantage = T.tensor(advantage_arr[batch]).to(self.actor.device)
                # returns = T.tensor(reward_arr[batch]).to(self.actor.device)
                # advantage = returns - critic_value_old

                # Advantage normalization per mini-batch
                advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
                advantage = advantage.detach()

                # Actor loss
                dist, _ = self.actor(states)
                critic_value_new = self.critic(states)
                critic_value_new = T.squeeze(critic_value_new)

                new_log_probs = dist.log_prob(actions)
                new_log_probs = T.sum(new_log_probs, dim=1, keepdim=True).squeeze()
                prob_ratio = (new_log_probs - old_log_probs).exp()

                weighted_probs = advantage * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1 - self.ppo_clip,
                                                 1 + self.ppo_clip) * advantage
                ppo_surr_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()
                entropy_loss = -self.entropy_coeff * dist.entropy().mean()
                actor_loss = ppo_surr_loss + entropy_loss

                # Critic loss
                returns = advantage + critic_value_old
                if self.clip_value_loss:
                    # Clipped value loss
                    v_loss_unclipped = (critic_value_new - returns) ** 2
                    v_clipped = critic_value_old + T.clamp(
                        critic_value_new - critic_value_old, -self.ppo_clip, self.ppo_clip)
                    v_loss_clipped = (v_clipped - returns) ** 2
                    v_loss_max = T.max(v_loss_unclipped, v_loss_clipped)
                    critic_loss = 0.5 * v_loss_max.mean()
                else:
                    critic_loss = 0.5 * ((critic_value_new - returns) ** 2).mean()

                # Backprop actor
                self.actor.optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(parameters=self.actor.parameters(),
                                         max_norm=0.5, norm_type=2)
                self.actor.optimizer.step()

                # Backprop critic
                self.critic.optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(parameters=self.critic.parameters(),
                                         max_norm=0.5, norm_type=2)
                self.critic.optimizer.step()

                # loss = critic_loss + actor_loss
                # self.optimizer.zero_grad()
                # loss.backward()
                # nn.utils.clip_grad_norm_(parameters=list(self.actor.parameters()) + list(self.critic.parameters()),
                #                          max_norm=0.8,
                #                          norm_type=2)
                # self.optimizer.step()

        self.memory.clear_memory()  # Clear memory to store new samples for the next iteration
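# The PPO Agent above consumes advantages that were computed outside learn() and stored via
# remember_adv() / PPOMemory.store_advantage(); how the project computes them is not shown in
# this section. The sketch below is one common choice, Generalized Advantage Estimation (GAE),
# using the same gamma / gae_lambda hyperparameters the agent is constructed with. The function
# name and the last_value argument are illustrative, not taken from the original code.
import numpy as np


def compute_gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95, last_value=0.0):
    """Return one GAE advantage per stored transition.

    delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    A_t     = sum_l (gamma * lambda)^l * delta_{t+l}
    """
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_value * (1 - dones[t]) - values[t]
        gae = delta + gamma * gae_lambda * (1 - dones[t]) * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages.tolist()

# Example usage with the agent above:
# agent.remember_adv(compute_gae_advantages(rewards, values, dones, agent.gamma, agent.gae_lambda))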
def main(args):
    args = parse_arguments()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    env = gym.make(args.env_name)
    os.environ['OMP_NUM_THREADS'] = '1'
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)

    writer = SummaryWriter(log_dir=args.save_dir)

    actor = ActorNetwork(env.observation_space.shape[0], env.action_space.n)
    critic = CriticNetwork(env.observation_space.shape[0])

    if args.continue_training:
        try:
            actorState = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
            actor.load_state_dict(actorState)
        except:
            assert False, "Unable to find a model to load"

    if args.cuda:
        actor.cuda()
        critic.cuda()

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.lr)

    N = args.nsteps
    eps = 1.0
    obsarr = []
    rewardarr = []
    actionlossarr = []
    actionarr = []
    valuearr = []
    ep_len = 0

    for ep in range(args.num_episodes):
        done = False
        obs = env.reset()

        # Roll out one episode with the current policy
        while not done:
            ep_len += 1
            obs_var = torch.from_numpy(obs).float()
            with torch.no_grad():
                action = actor.get_action(obs_var)
                value = critic(obs_var)
            action = action.item()
            next_obs, reward, done, _ = env.step(action)
            if args.render:
                env.render()
            obsarr.append(obs)
            actionarr.append(action)
            rewardarr.append(reward)
            valuearr.append(value)
            obs = next_obs

        # Compute N-step returns and the actor/critic losses for this episode
        T = len(obsarr)
        G = [0] * T
        batch_obs = torch.from_numpy(np.stack(obsarr)).float()
        batch_act = torch.from_numpy(np.array(actionarr))
        logprobvar = actor.evaluate_actions(batch_obs, batch_act)
        valvar = critic(batch_obs)
        logprobvar = logprobvar.squeeze(1)
        valvar = valvar.squeeze(1)

        for t in reversed(range(T)):
            V = 0
            if t + N < T:
                V = valvar[t + N].item()
            G[t] = pow(args.gamma, N) * V
            u = min(N, T - t)
            for k in range(u):
                G[t] += pow(args.gamma, k) * rewardarr[t + k]

        Gtensor = torch.FloatTensor(G)
        adv = 0.01 * Gtensor - valvar.detach()
        action_loss = -(adv * logprobvar).mean()
        value_loss = (0.01 * Gtensor - valvar).pow(2).mean()
        actionlossarr.append(action_loss)

        # Critic update every episode
        critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 3)
        critic_optimizer.step()

        # Actor update every update_freq episodes
        if ep % args.update_freq == 0:
            actor_optimizer.zero_grad()
            l = torch.stack(actionlossarr).mean()
            l.backward()
            torch.nn.utils.clip_grad_norm_(actor.parameters(), 3)
            actor_optimizer.step()

            r = np.array(rewardarr).sum() / args.update_freq
            print("Episode: {} | Reward: {:.3f} | Length: {}".format(
                ep, r, ep_len / args.update_freq))

            obsarr = []
            rewardarr = []
            actionlossarr = []
            actionarr = []
            ep_len = 0

        if ep % 500 == 0:
            torch.save(actor.state_dict(), args.save_dir + '/' + args.env_name + '.pt')
            rm, rs, em = test(env, actor, False)
            writer.add_scalar('test/reward_mean', rm, ep)
            writer.add_scalar('test/reward_std', rs, ep)
            writer.add_scalar('test/ep_len_mean', em, ep)
            writer.export_scalars_to_json(args.save_dir + '/' + args.env_name + '_scalars.json')

        writer.add_scalar('train/reward', r, ep)
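# main() expects a parse_arguments() helper that is not part of this section. The sketch below
# only declares the flags that main() actually reads (env_name, seed, lr, gamma, nsteps,
# num_episodes, update_freq, render, no_cuda, continue_training, save_dir, load_dir); every
# default value is an assumption chosen for illustration, not taken from the original script.
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(description="N-step advantage actor-critic training")
    parser.add_argument('--env-name', dest='env_name', default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--nsteps', type=int, default=5)
    parser.add_argument('--num-episodes', dest='num_episodes', type=int, default=10000)
    parser.add_argument('--update-freq', dest='update_freq', type=int, default=10)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--no-cuda', dest='no_cuda', action='store_true')
    parser.add_argument('--continue-training', dest='continue_training', action='store_true')
    parser.add_argument('--save-dir', dest='save_dir', default='./logs')
    parser.add_argument('--load-dir', dest='load_dir', default='./logs/model.pt')
    return parser.parse_args()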
class Agent:
    """ This class represents the reinforcement learning agent """

    def __init__(self, state_size: int, action_size: int, gamma: float = 0.99,
                 lr_actor: float = 0.001, lr_critic: float = 0.003,
                 weight_decay: float = 0.0001, tau: float = 0.001,
                 buffer_size: int = 100000, batch_size: int = 64):
        """
        :param state_size: how many states the agent gets as input (input size of the neural networks)
        :param action_size: from how many actions the agent can choose
        :param gamma: discount factor
        :param lr_actor: learning rate of the actor network
        :param lr_critic: learning rate of the critic network
        :param weight_decay: L2 weight decay used by the critic optimizer
        :param tau: soft update parameter
        :param buffer_size: size of the replay buffer
        :param batch_size: size of the learning batch (mini-batch)
        """
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size

        self.actor_local = ActorNetwork(state_size, action_size).to(device)
        self.actor_target = ActorNetwork(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        print(self.actor_local)

        self.critic_local = CriticNetwork(state_size, action_size).to(device)
        self.critic_target = CriticNetwork(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                           weight_decay=weight_decay)
        print(self.critic_local)

        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
        # This would probably also work with Gaussian noise instead of an Ornstein-Uhlenbeck process
        self.noise = OUNoise(action_size)

    def step(self, experience: tuple):
        """
        :param experience: tuple consisting of (state, action, reward, next_state, done)
        """
        self.memory.add(*experience)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state, add_noise: bool = True):
        """ Actor uses the policy to act given a state """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local.forward(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        # Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        # The actor_target returns the next action; this next action is then used (with the state)
        # to estimate the Q-value with the critic_target network.
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target.forward(next_states)
        q_expected = self.critic_local.forward(states, actions)
        q_targets_next = self.critic_target.forward(next_states, actions_next)
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        # minimize the loss
        critic_loss = F.mse_loss(q_expected, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        # endregion Update Critic

        # region Update actor
        # Compute actor loss
        actions_predictions = self.actor_local.forward(states)
        actor_loss = -self.critic_local.forward(states, actions_predictions).mean()
        # Minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion Update actor

        # region update target network
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)
        # endregion update target network

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def hard_update(self, local_model, target_model):
        """Copy the weights and biases from the local to the target network"""
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(param.data)

    def reset(self):
        self.noise.reset()
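# A minimal sketch of how this Agent could be driven in a gym-style continuous-control
# environment. The environment name, episode count, and step limit are placeholders; the only
# thing taken from the class above is its interface: reset(), act(state), and
# step((state, action, reward, next_state, done)). The old gym API (4-tuple step) is assumed,
# matching the rest of this section.
import gym

if __name__ == "__main__":
    env = gym.make('Pendulum-v1')  # placeholder environment
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0])

    for episode in range(200):
        state = env.reset()
        agent.reset()  # reset the OU noise process at the start of each episode
        episode_return = 0.0
        for _ in range(1000):
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            agent.step((state, action, reward, next_state, done))
            state = next_state
            episode_return += reward
            if done:
                break
        print(f'Episode {episode}: return {episode_return:.2f}')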