class MultiAgent(object): def __init__(self, config: DefaultMunch): self.config = config self.memory = self.config.memory self.n_agents = self.config.n_agents self.action_size = self.config.action_size self.state_size = self.config.state_size self.critic_local = Critic(self.state_size, self.config.action_size, self.config.n_agents).to(self.config.device) self.critic_target = Critic(self.state_size, self.config.action_size, self.config.n_agents).to( self.config.device) self.critic_target.load_state_dict(self.critic_local.state_dict()) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.config.lr_critic) self.agents = [Agent(self.config, self) for i in range(self.n_agents)] def step(self, states, actions, rewards, next_states, dones): self.memory.add((states[0], actions[0], rewards[0], next_states[0], dones[0], states[1], actions[1], next_states[1])) self.agents[0].step() self.memory.add((states[1], actions[1], rewards[1], next_states[1], dones[1], states[0], actions[0], next_states[0])) self.agents[1].step() def act(self, states, add_noise=True): actions1: torch.Tensor = self.agents[0].act(states[0], add_noise) actions2: torch.Tensor = self.agents[1].act(states[1], add_noise) actions = torch.stack([actions1, actions2], dim=0) return actions def reset(self): for agent in self.agents: agent.reset() def save(self, path, episode): for i, agent in enumerate(self.agents): agent.save(path + str(i), episode) def load(self, path): for i, agent in enumerate(self.agents): agent.load(path + str(i))
class DDPGAgent(): def __init__(self, state_size, action_size, seed, actor_file=None, critic_file=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed actor_file: path of file containing trained weights of actor network critic_file: path of file containing trained weights of critic network """ self.state_size = state_size self.action_size = action_size self.seed = seed #actor network: self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optim = optim.Adam(self.actor_local.parameters(), LR) #critic network self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optim = optim.Adam(self.critic_local.parameters(), LR) #load trained weights if needed if actor_file: weights = torch.load(actor_file) self.actor_local.load_state_dict(weights) self.actor_target.load_state_dict(weights) if critic_file: weights = torch.load(critic_file) self.critic_local.load_state_dict(weights) self.critic_target.load_state_dict(weights) def act(self, state): """Returns actions for given state as per current Actor network. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() return np.clip(action, -1, 1) def step(self): if len(shared_memory) > BATCH_SIZE: experiences = shared_memory.sample() self.learn(GAMMA, experiences) def learn(self, GAMMA, experiences): """Update value parameters using batch of experience tuples. 
Params ====== gamma (float): discount factor """ states_list, actions_list, rewards, next_states_list, dones = experiences next_states_tensor = torch.cat(next_states_list, dim=1).to(device) states_tensor = torch.cat(states_list, dim=1).to(device) actions_tensor = torch.cat(actions_list, dim=1).to(device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_actions = [ self.actor_target(next_states) for next_states in next_states_list ] next_actions_tensor = torch.cat(next_actions, dim=1).to(device) Q_targets_next = self.critic_target(next_states_tensor, next_actions_tensor) # Compute Q targets for current states (y_i) Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states_tensor, actions_tensor) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optim.zero_grad() critic_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optim.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss # take the current states and predict actions actions_pred = [self.actor_local(states) for states in states_list] actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device) # -1 * (maximize) Q value for the current prediction actor_loss = -self.critic_local(states_tensor, actions_pred_tensor).mean() # Minimize the loss self.actor_optim.zero_grad() actor_loss.backward() #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optim.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau=TAU): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
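# The DDPGAgent above relies on a handful of module-level names (device, LR, BATCH_SIZE,
# GAMMA, TAU, shared_memory) that are not shown in this excerpt. A minimal sketch of
# plausible definitions follows; the concrete values and the shared replay buffer
# construction are assumptions for illustration, not taken from the original source.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LR = 1e-3          # learning rate for both actor and critic optimizers
BATCH_SIZE = 128   # minibatch size drawn from the shared replay buffer
GAMMA = 0.99       # discount factor
TAU = 1e-3         # soft-update interpolation factor
# shared_memory would be a replay buffer shared by all agents, e.g.:
# shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)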
class ActorCriticAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, gamma=0.99, tau=1e-3, batch_size=128, hidden_layer_size=(512, 256), lr_actor_critic=(1e-3, 1e-4), noise=(0.6, 0.995)): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed gamma (float): discount factor update_every (int): how often to update the network tau (float): for soft update of target parameters batch_size (int): minibatch size hidden_layer_size (tuple(int, int)): tuple of hidden layer size for the actor and critic network lr_actor_critic (tuple(float, float)): tuple of learning rates of the actor and of the critic noise (tuple(float, float)): tuple containing the noise factor and the rate to apply to the factor after each episode """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.gamma = gamma self.tau = tau self.batch_size = batch_size self.name = f'agent' lr_actor, lr_critic = lr_actor_critic # Actor Networks (local one and target one) fc1_units, fc2_units = hidden_layer_size self.actor_local = Actor(state_size, action_size, seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_target = Actor(state_size, action_size, seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (local one and target one) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY) # Initialize the target model weights with the local ones (same values) self.actor_target.load_state_dict(self.actor_local.state_dict()) self.critic_target.load_state_dict(self.critic_local.state_dict()) # Noise process factor, decay_rate = noise self.noise = GaussianNoise(action_size, factor, decay_rate=decay_rate) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for i in range(len(states)): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, states, add_noise=True): """Returns actions for given state as per current policy. Params ====== states (array_like): current state add_noise: indicates if noise should be added """ states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def end(self): """ Method applied at the end of each episode """ self.noise.end() def reset(self): self.noise.reset() def learn(self, experiences): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets.detach()) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # as suggested in the "Benchmak implementation" section of the course" self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states.detach(), actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
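# ActorCriticAgent above instantiates GaussianNoise(action_size, factor, decay_rate=...)
# with sample(), end() and reset() methods, but its definition is not included in this
# excerpt. A minimal sketch of what such a helper could look like; the parameter names
# and the decay behaviour are assumptions.
import numpy as np

class GaussianNoise:
    def __init__(self, size, factor, decay_rate=0.995, min_factor=0.01):
        self.size = size
        self.factor = factor            # current noise scale
        self.decay_rate = decay_rate    # multiplicative decay applied after each episode
        self.min_factor = min_factor

    def sample(self):
        # zero-mean Gaussian noise scaled by the current factor
        return self.factor * np.random.standard_normal(self.size)

    def end(self):
        # called at the end of an episode: decay the noise scale
        self.factor = max(self.factor * self.decay_rate, self.min_factor)

    def reset(self):
        # uncorrelated Gaussian noise carries no state between steps
        pass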
class DDPG: """Implementation of DDPG. This implementation is adapted to this particular environment running several agent. At each time step, the same actor is controlling each agent sequentially. """ def __init__(self, state_size, action_size, config): """Initialize algorithm.""" if config.PER: self.memory = PrioritizeReplayBuffer( config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED ) else: self.memory = ReplayBuffer( config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED ) # Randomly initialize critic netowrk and actor self.actor = Actor(state_size, action_size, config.SEED).to(device) self.critic = Critic(state_size, action_size, config.SEED).to(device) # Initialize target networks with weights from actor critic # Actor self.actor_target = Actor(state_size, action_size, config.SEED).to(device) self.actor_target.load_state_dict(self.actor.state_dict()) # Critic self.critic_target = Critic(state_size, action_size, config.SEED).to(device) self.critic_target.load_state_dict(self.critic.state_dict()) # Actor optimizer self.actor_optimizer = torch.optim.Adam( self.actor.parameters(), lr=config.LR_ACTOR ) # Critic optimizer self.critic_optimizer = torch.optim.Adam( self.critic.parameters(), lr=config.LR_CRITIC ) self.config = config self.t_step = 0 self.expl_noise = config.EXPL_NOISE def step(self, target_sample=None, **kwargs): """Run a step of algorithm update.""" # Sample a random minibatch of transitions states, actions, rewards, next_states, dones = self._draw_minibatch() # Compute the target Q value target_Q = self.critic_target( next_states, self.actor_target(next_states) ).detach() y = rewards + (1 - dones) * self.config.GAMMA * target_Q # Update critic by minimizing the loss current_Q = self.critic(states, actions) # Compute TD error td_error = y - current_Q if self.config.PER: # Get importance_sampling_weights weights = torch.Tensor(self.memory.importance_sampling()).unsqueeze(1) # Update priorities self.memory.update_priorities(td_error.detach().cpu().numpy()) # Compute critic loss critic_loss = torch.mean(weights * td_error ** 2) else: # Compute critic loss critic_loss = torch.mean(td_error ** 2) # Optimize critic self.critic_optimizer.zero_grad() critic_loss.backward() # Clip gradient nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_optimizer.step() # Update the actor policy using the sampled policy gradient: actor_loss = -self.critic(states, self.actor(states)).mean() self.actor_optimizer.zero_grad() actor_loss.backward() # CLip gradient nn.utils.clip_grad_norm_(self.actor.parameters(), 1) self.actor_optimizer.step() # Update target networks self.soft_update() def train(self, env, num_episode): """Train a DDPG agent.""" scores = [] scores_window = deque(maxlen=100) for episode in range(num_episode): # Init state and episode score states = env.reset(train_mode=True) score = np.zeros(states.shape[0]) done = False # Run episode while not done: # Select and run action actions = self.predict_actions(states) # TODO: dynamic low and high selection actions = self.add_gaussian_noise(actions, -1, 1) next_states, rewards, dones = env.step(actions) # Store all n_agent episodes in replay buffer for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones ): self.memory.add(state, action, reward, next_state, done) # Update time step self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY # Optimisation step if UPDATE_EVERY and enough examples in memory if self.t_step == 0 and len(self.memory) > self.config.BATCH_SIZE: for _ in 
range(self.config.UPDATE_STEPS): self.step() # Update state and scores states = next_states score += rewards # End episode if any of the agents is done, to avoid storing too many # done transitions in the replay buffer done = any(dones) # Keep track of running mean scores_window.append(max(score)) # Append current mean to scores list scores.append(np.mean(scores_window)) # Logging print( "\rEpisode {}\tAverage Score: {:.2f}, Last Score: {:.2f}".format( episode, np.mean(scores_window), max(score) ), end="", ) if (episode + 1) % 100 == 0: print( "\rEpisode {}\tAverage Score: {:.2f}".format( episode, np.mean(scores_window) ) ) return scores def soft_update(self): """Update the frozen target models.""" tau = self.config.TAU # Critic for param, target_param in zip( self.critic.parameters(), self.critic_target.parameters() ): target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) # Actor for param, target_param in zip( self.actor.parameters(), self.actor_target.parameters() ): target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) def predict_actions(self, states, **kwargs): """Predict next actions based on current policy.""" states = torch.from_numpy(states).float().unsqueeze(0).to(device) # Set actor to eval mode self.actor.eval() actions = [] with torch.no_grad(): for state in states: action = self.actor(state) actions.append(action.detach().cpu().numpy()) # Set actor to train mode self.actor.train() return np.array(actions).squeeze() def add_gaussian_noise(self, action, low, high): """Add Gaussian noise to action, and clip between low and high.""" return (action + np.random.normal(0, self.expl_noise, size=action.shape)).clip( low, high ) def _draw_minibatch(self): """Draw a minibatch in the replay buffer.""" states, actions, rewards, next_states, done = zip(*self.memory.sample()) states = torch.Tensor(states).to(device) actions = torch.Tensor(actions).to(device) rewards = torch.Tensor(rewards).unsqueeze(1).to(device) next_states = torch.Tensor(next_states).to(device) done = torch.Tensor(done).unsqueeze(1).to(device) return states, actions, rewards, next_states, done def save_model(self, path, **kwargs): """Save actor model weights.""" torch.save(self.actor.state_dict(), path)
class Agent: def __init__(self, state_size, action_size, seed, actor_file=None, critic_file=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed actor_file: path of file containing trained weights of actor network critic_file: path of file containing trained weights of critic network """ self.state_size = state_size self.action_size = action_size self.seed = seed #actor network: self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optim = optim.Adam(self.actor_local.parameters(), LR) #critic network self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optim = optim.Adam(self.critic_local.parameters(), LR) #load trained weights if needed if actor_file: weights = torch.load(actor_file) self.actor_local.load_state_dict(weights) self.actor_target.load_state_dict(weights) if critic_file: weights = torch.load(critic_file) self.critic_local.load_state_dict(weights) self.critic_target.load_state_dict(weights) #init replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def act(self, state): """Returns actions for given state as per current Actor network. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() return np.clip(action, -1, 1) def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: self.learn(GAMMA) def learn(self, GAMMA): """Update value parameters using batch of experience tuples. Params ====== gamma (float): discount factor """ states, actions, rewards, next_states, dones = self.memory.sample() #update critic target_next_actions = self.actor_target(next_states) target_next_q = self.critic_target(next_states, target_next_actions) target_q = rewards + (GAMMA * target_next_q * (1 - dones)) local_q = self.critic_local(states, actions) critic_loss = F.mse_loss(local_q, target_q) self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() #update actor local_actions = self.actor_local(states) actor_loss = -self.critic_local(states, local_actions).mean() self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model, tau=TAU): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
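# Several of the agents above construct ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
# with add(), sample() and __len__(), but the class itself is not shown. A common minimal
# implementation looks roughly like this (a sketch under that assumed interface, not the
# original code):
import random
from collections import deque, namedtuple
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, stacked into float tensors on the training device
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)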
generator = generator.apply(weights_init) critic = critic.apply(weights_init) opt_critic = optim.Adam(critic.parameters(), lr=learning_rate, betas=(0.0, 0.9)) opt_gen = optim.Adam(generator.parameters(), lr=learning_rate, betas=(0.0, 0.9)) fixed_sample = torch.randn(batch_size, n_dimension, 1, 1).to(device) if Load == "True": print("Load Weights...") critic.load_state_dict(torch.load("critic_weights.pt")) generator.load_state_dict(torch.load("gen_weights.pt")) for epoch in range(epochs_nums): for batch_idx, (real, _) in enumerate(loader): real = real.to(device) for itr in range(5): noise = torch.randn(len(real), n_dimension, 1, 1).to(device) fake = generator(noise) critic_real = critic(real) critic_fake = critic(fake) loss_critic = ( -(torch.mean(critic_real) - torch.mean(critic_fake)) + Lambda * Gradient_penality(critic, real, fake, device)) critic.zero_grad() loss_critic.backward(retain_graph=True)
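# The critic loss above calls Gradient_penality(critic, real, fake, device), which is not
# shown in this excerpt. A sketch of the standard WGAN-GP gradient penalty it presumably
# computes: interpolate between real and fake images and penalise the critic's gradient
# norm for deviating from 1. The function name below is illustrative, not the original.
import torch

def gradient_penalty_sketch(critic, real, fake, device):
    batch_size = real.size(0)
    # random interpolation coefficient per sample, broadcast over channels and spatial dims
    eps = torch.rand(batch_size, 1, 1, 1, device=device)
    interpolated = (eps * real + (1 - eps) * fake).detach().requires_grad_(True)
    scores = critic(interpolated)
    # gradients of the critic scores w.r.t. the interpolated images
    grads = torch.autograd.grad(outputs=scores, inputs=interpolated,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True, retain_graph=True)[0]
    grads = grads.view(batch_size, -1)
    return ((grads.norm(2, dim=1) - 1) ** 2).mean()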
class TD31v1(object): """ TD3 plus an ensemble of critics as an agent object to act and update the network weights, save and load the weights """ def __init__(self, state_dim, action_dim, max_action, args): self.actor = Actor(state_dim, action_dim, max_action).to(args.device) self.actor_target = Actor(state_dim, action_dim, max_action).to(args.device) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = torch.optim.Adam(self.actor.parameters()) self.critic = Critic(state_dim, action_dim).to(args.device) self.critic_optimizer = torch.optim.Adam(self.critic.parameters()) self.list_target_critic = [] # create the different target critics for the ensemble for c in range(args.num_q_target): critic_target = Critic(state_dim, action_dim).to(args.device) critic_target.load_state_dict(self.critic.state_dict()) self.list_target_critic.append(critic_target) self.target_critic = Critic(state_dim, action_dim).to(args.device) self.target_critic.load_state_dict(self.critic.state_dict()) self.max_action = max_action self.num_q_target = args.num_q_target self.batch_size = args.batch_size self.discount = args.discount self.tau = args.tau self.policy_noise = args.policy_noise self.noise_clip = args.noise_clip self.policy_freq = args.policy_freq self.device = args.device self.update_counter = 0 self.step = 0 self.currentQNet = 0 def select_action(self, state): state = torch.Tensor(state.reshape(1, -1)).to(self.device) return self.actor(state).cpu().data.numpy().flatten() def train(self, replay_buffer, writer, iterations): """ Update function for the network weights of the current and target Actor and Critic, using the 3 features the TD3 paper adds to the DDPG implementation: 1. Delayed policy updates 2. Two critic networks, taking the min of the two Q values 3. Target policy smoothing On top of that we use an ensemble of delayed, periodically updated target critics """ self.step += 1 for it in range(iterations): # Step 1: Sample a batch of transitions (s, s’, a, r) from the memory batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(self.batch_size) # convert the numpy arrays to tensor objects # if cuda is available send data to gpu state = torch.Tensor(batch_states).to(self.device) next_state = torch.Tensor(batch_next_states).to(self.device) action = torch.Tensor(batch_actions).to(self.device) reward = torch.Tensor(batch_rewards).to(self.device) done = torch.Tensor(batch_dones).to(self.device) # Step 2: use the Target Actor to create the action of the next # state (part of the TD target) next_action = self.actor_target(next_state) # Step 3: Add clipped Gaussian noise to this next action (target policy smoothing) # and clip the result in case it is outside the action boundaries noise = torch.Tensor(batch_actions).data.normal_(0, self.policy_noise).to(self.device) noise = noise.clamp(-self.noise_clip, self.noise_clip) next_action = (next_action + noise).clamp(-self.max_action, self.max_action) # Step 4: Use the different Target Critics (delayed update) # and the min of the two critic outputs from TD3 to create the different Q Targets, # then compute the average of all Q Targets to get a single value target_Q = 0 for critic in self.list_target_critic: target_Q1, target_Q2 = critic(next_state, next_action) target_Q += torch.min(target_Q1, target_Q2) target_Q *= 1./ self.num_q_target # Step 5: Create the update target based on the Bellman equation target_Q = reward + ((1 - done) * self.discount * target_Q).detach() # Step 6: Use the critic to compute the Q estimates for the current state and action current_Q1, current_Q2 = self.critic(state, action) # Step 7:
Compute the critic loss with the mean squared error loss function critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) writer.add_scalar('critic_loss', critic_loss , self.step) # Step 8: Backpropagate this Critic loss and update the parameters # of the two Critic models using the Adam optimizer self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Step 9: Delayed update of the Actor model if it % self.policy_freq == 0: actor_loss = -self.critic.Q1(state, self.actor(state)).mean() writer.add_scalar('actor_loss', actor_loss , self.step) self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Step 10: update the weights of the Actor and Critic targets by Polyak averaging; # the hyperparameter tau determines the combination of current and # target weights for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def hardupdate(self): """ Hard update for the critic ensemble: every n steps the next ensemble member is overwritten with the weights of the continuously updated target critic """ self.update_counter +=1 self.currentQNet = self.update_counter % self.num_q_target # Step 11: Override every n steps the weights of one ensemble member for target_param, param in zip(self.target_critic.parameters(), self.list_target_critic[self.currentQNet].parameters()): param.data.copy_(target_param.data) # Making a save method to save a trained model def save(self, filename, directory): torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename)) torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename)) # Making a load method to load a pre-trained model def load(self, filename, directory): self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename))) self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
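# A rough sketch of how TD31v1 might be driven from a training loop: act with exploration
# noise, store transitions, call train() after each episode, and rotate the critic ensemble
# with hardupdate() every fixed number of steps. The environment API, replay-buffer add()
# signature, and args fields (max_timesteps, expl_noise, update_interval) below are
# assumptions for illustration only.
import numpy as np

agent = TD31v1(state_dim, action_dim, max_action, args)
state, episode_steps = env.reset(), 0
for t in range(int(args.max_timesteps)):
    action = agent.select_action(np.array(state))
    action = (action + np.random.normal(0, args.expl_noise, size=action_dim)).clip(-max_action, max_action)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.add((state, next_state, action, reward, float(done)))  # (s, s', a, r, d)
    state, episode_steps = next_state, episode_steps + 1
    if done:
        agent.train(replay_buffer, writer, episode_steps)  # one gradient pass per env step taken
        state, episode_steps = env.reset(), 0
    if t % args.update_interval == 0:
        agent.hardupdate()  # refresh one member of the delayed critic ensemble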
class DDPG(object): def __init__(self, gamma, tau,num_inputs, env,device, results_path=None): self.gamma = gamma self.tau = tau self.min_action,self.max_action = env.action_range() self.device = device self.num_actions = env.action_space() self.noise_stddev = 0.3 self.results_path = results_path self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/') os.makedirs(self.checkpoint_path, exist_ok=True) # Define the actor self.actor = Actor(num_inputs, self.num_actions).to(device) self.actor_target = Actor(num_inputs, self.num_actions).to(device) # Define the critic self.critic = Critic(num_inputs, self.num_actions).to(device) self.critic_target = Critic(num_inputs, self.num_actions).to(device) # Define the optimizers for both networks self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4 ) # optimizer for the actor network self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002) # optimizer for the critic network self.hard_swap() self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.num_actions), sigma=float(self.noise_stddev) * np.ones(self.num_actions)) self.ou_noise.reset() def eval_mode(self): self.actor.eval() self.actor_target.eval() self.critic_target.eval() self.critic.eval() def train_mode(self): self.actor.train() self.actor_target.train() self.critic_target.train() self.critic.train() def get_action(self, state, episode, action_noise=True): x = state.to(self.device) # Get the continous action value to perform in the env self.actor.eval() # Sets the actor in evaluation mode mu = self.actor(x) self.actor.train() # Sets the actor in training mode mu = mu.data # During training we add noise for exploration if action_noise: noise = torch.Tensor(self.ou_noise.noise()).to(self.device) * 1.0/(1.0 + 0.1*episode) noise = noise.clamp(0,0.1) mu = mu + noise # Add exploration noise ε ~ p(ε) to the action. 
Do not use OU noise (https://spinningup.openai.com/en/latest/algorithms/ddpg.html) # Clip the output according to the action space of the env mu = mu.clamp(self.min_action,self.max_action) return mu def update_params(self, batch): # Get tensors from the batch state_batch = torch.cat(batch.state).to(self.device) action_batch = torch.cat(batch.action).to(self.device) reward_batch = torch.cat(batch.reward).to(self.device) done_batch = torch.cat(batch.done).to(self.device) next_state_batch = torch.cat(batch.next_state).to(self.device) # Get the actions and the state values to compute the targets next_action_batch = self.actor_target(next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_batch.detach()) # Compute the target reward_batch = reward_batch.unsqueeze(1) done_batch = done_batch.unsqueeze(1) expected_values = reward_batch + (1.0 - done_batch) * self.gamma * next_state_action_values # Update the critic network self.critic_optimizer.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_values.detach()) value_loss.backward() self.critic_optimizer.step() # Update the actor network self.actor_optimizer.zero_grad() policy_loss = -self.critic(state_batch, self.actor(state_batch)) policy_loss = policy_loss.mean() policy_loss.backward() for param in self.actor.parameters(): param.grad.data.clamp_(-1, 1) self.actor_optimizer.step() # Update the target networks soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return value_loss.item(), policy_loss.item() def hard_swap(self): # Make sure both targets are with the same weight hard_update(self.actor_target, self.actor) hard_update(self.critic_target, self.critic) def store_model(self): print("Storing model at: ", self.checkpoint_path) checkpoint = { 'actor': self.actor.state_dict(), 'actor_optim': self.actor_optimizer.state_dict(), 'critic': self.critic.state_dict(), 'criti_optim': self.critic_optimizer.state_dict() } torch.save(checkpoint, os.path.join(self.checkpoint_path, 'checkpoint.pth') ) def load_model(self): files = os.listdir(self.checkpoint_path) if files: print("Loading models checkpoints!") model_dicts = torch.load(os.path.join(self.checkpoint_path, 'checkpoint.pth'),map_location=self.device) self.actor.load_state_dict(model_dicts['actor']) self.actor_optimizer.load_state_dict(model_dicts['actor_optim']) self.critic.load_state_dict(model_dicts['critic']) self.critic_optimizer.load_state_dict(model_dicts['criti_optim']) else: print("Checkpoints not found!")
class DDPG(object): def __init__(self, seed, nA, nS, L2, index): self.seed = seed self.nA = nA self.nS = nS self.nO = 52 # 24 * 2 state space + 2 * 2 action space self.L2 = L2 self.index = index self.noise = OUnoise(nA, seed) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.local_critic = Critic(seed, self.nO, nA).to(self.device) self.target_critic = Critic(seed, self.nO, nA).to(self.device) self.local_actor = Actor(seed, nS, nA).to(self.device) self.target_actor = Actor(seed, nS, nA).to(self.device) # Copy the weights from local to target hard_update(self.local_critic, self.target_critic) hard_update(self.local_actor, self.target_actor) self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=1e-3, weight_decay=self.L2) self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=1e-4) def load_weights(self, critic_path, actor_path): # Load weigths from both self.local_critic.load_state_dict( torch.load(critic_path + 'local_critic_' + str(self.index) + '.ckpt')) self.local_actor.load_state_dict( torch.load(actor_path + 'local_actor_' + str(self.index) + '.ckpt')) self.target_critic.load_state_dict( torch.load(critic_path + 'target_critic_' + str(self.index) + '.ckpt')) self.target_actor.load_state_dict( torch.load(actor_path + 'target_actor_' + str(self.index) + '.ckpt')) self.local_actor.eval() def save_weights(self, critic_path, actor_path): # Save weights for both torch.save(self.local_actor.state_dict(), actor_path + 'local_actor_' + str(self.index) + '.ckpt') torch.save(self.target_actor.state_dict(), actor_path + 'target_actor_' + str(self.index) + '.ckpt') torch.save(self.local_critic.state_dict(), critic_path + 'local_critic_' + str(self.index) + '.ckpt') torch.save(self.target_critic.state_dict(), critic_path + 'target_critic_' + str(self.index) + '.ckpt') def act(self, state): action = self.local_actor( state).detach().cpu().numpy() + self.noise.sample() return action def target_act(self, next_state): action = self.target_actor( next_state).detach().cpu().numpy() + self.noise.sample() return action def step(self): pass def learn(self): pass
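# Several agents in this file (OUnoise, OUNoise, OrnsteinUhlenbeckActionNoise) depend on
# Ornstein-Uhlenbeck exploration noise whose definition is not included here. A minimal
# sketch of the usual implementation; the class name and default parameters are assumptions.
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta          # pull strength back towards the mean
        self.sigma = sigma          # scale of the random perturbation
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        # restart the process at the mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state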
class ddpg_agent: def __init__(self, args, env): self.args = args self.env = env # get the number of inputs... num_inputs = self.env.observation_space.shape[0] num_actions = self.env.action_space.shape[0] self.action_scale = self.env.action_space.high[0] # build up the network self.actor_net = Actor(num_inputs, num_actions) self.critic_net = Critic(num_inputs, num_actions) # get the target network... self.actor_target_net = Actor(num_inputs, num_actions) self.critic_target_net = Critic(num_inputs, num_actions) if self.args.cuda: self.actor_net.cuda() self.critic_net.cuda() self.actor_target_net.cuda() self.critic_target_net.cuda() # copy the parameters.. self.actor_target_net.load_state_dict(self.actor_net.state_dict()) self.critic_target_net.load_state_dict(self.critic_net.state_dict()) # setup the optimizer... self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.args.actor_lr) self.optimizer_critic = torch.optim.Adam( self.critic_net.parameters(), lr=self.args.critic_lr, weight_decay=self.args.critic_l2_reg) # setting up the noise self.ou_noise = OUNoise(num_actions) # check some dir if not os.path.exists(self.args.save_dir): os.mkdir(self.args.save_dir) self.model_path = self.args.save_dir + self.args.env_name + '/' if not os.path.exists(self.model_path): os.mkdir(self.model_path) # start to train the network.. def learn(self): # init the brain memory replay_buffer = [] total_timesteps = 0 running_reward = None for episode_idx in range(self.args.max_episode): state = self.env.reset() # get the scale of the ou noise... self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \ self.args.exploration_length + self.args.final_noise_scale self.ou_noise.reset() # start the training reward_total = 0 while True: state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0) if self.args.cuda: state_tensor = state_tensor.cuda() with torch.no_grad(): policy = self.actor_net(state_tensor) # start to select the actions... actions = self._select_actions(policy) # step state_, reward, done, _ = self.env.step(actions * self.action_scale) total_timesteps += 1 reward_total += reward # start to store the samples... 
replay_buffer.append((state, reward, actions, done, state_)) # check if the buffer size is outof range if len(replay_buffer) > self.args.replay_size: replay_buffer.pop(0) if len(replay_buffer) > self.args.batch_size: mini_batch = random.sample(replay_buffer, self.args.batch_size) # start to update the network _, _ = self._update_network(mini_batch) if done: break state = state_ running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01 if episode_idx % self.args.display_interval == 0: torch.save(self.actor_net.state_dict(), self.model_path + 'model.pt') print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format( datetime.now(), episode_idx, total_timesteps, running_reward)) self.env.close() # select actions def _select_actions(self, policy): actions = policy.detach().cpu().numpy()[0] actions = actions + self.ou_noise.noise() actions = np.clip(actions, -1, 1) return actions # update the network def _update_network(self, mini_batch): state_batch = np.array([element[0] for element in mini_batch]) state_batch = torch.tensor(state_batch, dtype=torch.float32) # reward batch reward_batch = np.array([element[1] for element in mini_batch]) reward_batch = torch.tensor(reward_batch, dtype=torch.float32).unsqueeze(1) # done batch done_batch = np.array([int(element[3]) for element in mini_batch]) done_batch = 1 - done_batch done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1) # action batch actions_batch = np.array([element[2] for element in mini_batch]) actions_batch = torch.tensor(actions_batch, dtype=torch.float32) # next stsate state_next_batch = np.array([element[4] for element in mini_batch]) state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32) # check if use the cuda if self.args.cuda: state_batch = state_batch.cuda() reward_batch = reward_batch.cuda() done_batch = done_batch.cuda() actions_batch = actions_batch.cuda() state_next_batch = state_next_batch.cuda() # update the critic network... with torch.no_grad(): actions_out = self.actor_target_net(state_next_batch) expected_q_value = self.critic_target_net(state_next_batch, actions_out) # get the target value target_value = reward_batch + self.args.gamma * expected_q_value * done_batch target_value = target_value.detach() values = self.critic_net(state_batch, actions_batch) critic_loss = (target_value - values).pow(2).mean() self.optimizer_critic.zero_grad() critic_loss.backward() self.optimizer_critic.step() # start to update the actor network actor_loss = -self.critic_net(state_batch, self.actor_net(state_batch)).mean() self.optimizer_actor.zero_grad() actor_loss.backward() self.optimizer_actor.step() # then, start to softupdate the network... self._soft_update_target_network(self.critic_target_net, self.critic_net) self._soft_update_target_network(self.actor_target_net, self.actor_net) return actor_loss.item(), critic_loss.item() # soft update the network def _soft_update_target_network(self, target, source): # update the critic network firstly... 
for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) # functions to test the network def test_network(self): model_path = self.args.save_dir + self.args.env_name + '/model.pt' self.actor_net.load_state_dict( torch.load(model_path, map_location=lambda storage, loc: storage)) self.actor_net.eval() # start to test for _ in range(5): state = self.env.reset() reward_sum = 0 while True: self.env.render() state = torch.tensor(state, dtype=torch.float32).unsqueeze(0) with torch.no_grad(): actions = self.actor_net(state) actions = actions.detach().numpy()[0] state_, reward, done, _ = self.env.step(self.action_scale * actions) reward_sum += reward if done: break state = state_ print('The reward of this episode is {}.'.format(reward_sum)) self.env.close()
class MADDPGAgent(object): """Multi Agent DDPG Implementation Paper: https://arxiv.org/abs/1706.02275 I used their code to understand how the agents were implemented https://github.com/openai/maddpg """ def __init__(self, state_size, action_size, num_agents, agent_index, writer, random_seed, dirname, print_every=1000, model_path=None, saved_config=None, eval_mode=False): """Initialize an Agent object. Parameters: state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents agent_index (int): index (id) of current agent writer (object): visdom visualiser for realtime visualisations random_seed (int): random seed dirname (string): output directory to store config, losses print_every (int): how often to print progress model_path (string): if defined, load saved model to resume training eval_mode (bool): whether to use eval mode """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.agent_index = agent_index self.writer = writer self.dirname = dirname self.print_every = print_every # save config params if not saved_config: self.config = CONFIG save_to_json(self.config, '{}/hyperparams.json'.format(self.dirname)) else: self.config = json.load(open(saved_config, 'r')) logger.info( 'Loading config from saved location {}'.format(saved_config)) # Create Critic network self.local_critic = Critic(self.state_size * num_agents, self.action_size * num_agents, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.target_critic = Critic(self.state_size * num_agents, self.action_size * num_agents, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) # Optimizer self.critic_optimizer = optim.Adam( self.local_critic.parameters(), lr=self.config['LR_CRITIC'], weight_decay=self.config['WEIGHT_DECAY']) # Create Actor network self.local_actor = Actor(self.state_size, self.action_size, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.target_actor = Actor(self.state_size, self.action_size, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=self.config['LR_ACTOR']) # Load saved model (if available) if model_path: logger.info('Loading model from {}'.format(model_path)) self.local_actor.load_state_dict( torch.load('{}/checkpoint_actor_{}.pth'.format( model_path, self.agent_index))) self.target_actor.load_state_dict( torch.load('{}/checkpoint_actor_{}.pth'.format( model_path, self.agent_index))) self.local_critic.load_state_dict( torch.load('{}/checkpoint_critic_{}.pth'.format( model_path, self.agent_index))) self.target_critic.load_state_dict( torch.load('{}/checkpoint_critic_{}.pth'.format( model_path, self.agent_index))) if eval_mode: logger.info('agent {} set to eval mode'.format(self.agent_index)) self.local_actor.eval() self.noise = OUNoise(self.action_size, random_seed, sigma=self.config['SIGMA']) self.learn_step = 0 def act(self, state, add_noise=True, noise_weight=1): """Get the actions to take under the supplied states Parameters: state (array_like): Game state provided by the environment add_noise (bool): Whether we should apply the noise noise_weight (int): How much weight should be applied to the noise """ state = torch.from_numpy(state).float().to(device) # Run inference in eval mode self.local_actor.eval() with torch.no_grad(): action = self.local_actor(state).cpu().data.numpy() self.local_actor.train() # add noise
if true if add_noise: action += self.noise.sample() * noise_weight return np.clip(action, -1, 1) def reset(self): """Resets the noise""" self.noise.reset() def learn(self, agents, experience, gamma): """Use the experience to allow agents to learn. The critic of each agent can see the actions taken by all agents and incorporate that in the learning. Parameters: agents (MADDPGAgent): instance of all the agents experience (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ num_agents = len(agents) states, actions, rewards, next_states, dones = experience # ---------------central critic------------------- # use target actor to get action, here we get target actors from # all agents to predict the action for the next state next_actions = torch.zeros( (len(states), num_agents, self.action_size)).to(device) for i, agent in enumerate(agents): next_actions[:, i] = agent.target_actor(next_states[:, i, :]) # Flatten state and action # e.g. from state (100,2,24) --> (100, 48) critic_states = flatten(next_states) next_actions = flatten(next_actions) # calculate target and expected Q_targets_next = self.target_critic(critic_states, next_actions) Q_targets = rewards[:, self.agent_index, :] + ( gamma * Q_targets_next * (1 - dones[:, self.agent_index, :])) Q_expected = self.local_critic(flatten(states), flatten(actions)) # use mse loss critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss_value = critic_loss.item() self.critic_optimizer.zero_grad() critic_loss.backward() if self.config['CLIP_GRADS']: for param in self.local_critic.parameters(): param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'], self.config['CLAMP_VALUE']) self.critic_optimizer.step() # ---------------actor--------------------- # Only update the predicted action of current agent predicted_actions = torch.zeros( (len(states), num_agents, self.action_size)).to(device) predicted_actions.data.copy_(actions.data) predicted_actions[:, self.agent_index] = self.local_actor( states[:, self.agent_index]) actor_loss = -self.local_critic(flatten(states), flatten(predicted_actions)).mean() # Kept to remind myself about the mistake that took several hours of investigation # and was only found when I looked at grads from self.local_actor.parameters() # actor_loss = -self.local_critic(flatten(states), flatten(actions)).mean() actor_loss_value = actor_loss.item() self.actor_optimizer.zero_grad() actor_loss.backward() if self.config['CLIP_GRADS']: for param in self.local_actor.parameters(): # import pdb; pdb.set_trace() param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'], self.config['CLAMP_VALUE']) self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # if self.learn_step == 0: # One time only, start local and target with same parameters self._copy_weights(self.local_critic, self.target_critic) self._copy_weights(self.local_actor, self.target_actor) else: self.soft_update(self.local_critic, self.target_critic, self.config["TAU"]) self.soft_update(self.local_actor, self.target_actor, self.config["TAU"]) self.learn_step += 1 return actor_loss_value, critic_loss_value def _copy_weights(self, source_network, target_network): """Copy source network weights to target""" for target_param, source_param in zip(target_network.parameters(), source_network.parameters()): target_param.data.copy_(source_param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def checkpoint(self): """Checkpoint actor and critic models""" if not os.path.exists('{}/multi'.format(self.dirname)): os.makedirs('{}/multi'.format(self.dirname)) torch.save( self.local_critic.state_dict(), '{}/multi/checkpoint_critic_{}.pth'.format(self.dirname, self.agent_index)) torch.save( self.local_actor.state_dict(), '{}/multi/checkpoint_actor_{}.pth'.format(self.dirname, self.agent_index))
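# MADDPGAgent.learn above calls a flatten() helper that merges the per-agent dimension
# into the feature dimension for the centralised critic, e.g. a state batch of shape
# (100, 2, 24) becomes (100, 48). The helper is not shown; a one-line sketch of it:
def flatten(tensor):
    # (batch, num_agents, dim) -> (batch, num_agents * dim)
    return tensor.reshape(tensor.shape[0], -1)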
actor_optimizers = [Adam(i.parameters(), lr=0.0005) for i in policy] critic_optimizers = Adam(value_func.parameters(), lr=0.0005) #------------------LOAD TRAINING CHECKPOINT--------------------------- model_dir = 'weights/model_exp_1.pt' save_dir = 'weights/model_tmp.pt' policy_name = "policy_agent_X" value_name = "value_func" actr_op_name = 'actor_opt_agent_X' crtc_op_name = 'critic_opt' if os.path.isfile(model_dir): print("Loaded params!") checkpoint = torch.load(model_dir, map_location=device) trainend_steps = checkpoint['steps'] value_func.load_state_dict(checkpoint[value_name]) critic_optimizers.load_state_dict(checkpoint[crtc_op_name]) for agent_id in range(N_VEHICLES): p_name = policy_name.replace("X", str(agent_id)) a_opt = actr_op_name.replace("X", str(agent_id)) policy[agent_id].load_state_dict(checkpoint[p_name]) actor_optimizers[agent_id].load_state_dict(checkpoint[a_opt]) else: trainend_steps = 0 max_steps = 260000 total_steps = max_steps - trainend_steps #INFO STATE: #----- o: (position, nodes_locations, demand, load, mask) --- observation
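# The checkpoint loader above expects a dict keyed by 'steps', 'value_func', 'critic_opt',
# and per-agent 'policy_agent_<id>' / 'actor_opt_agent_<id>' entries. A sketch of the
# matching save call, reusing the names defined above; when and where it is called
# (and the use of save_dir) are assumptions.
def save_checkpoint(steps):
    checkpoint = {
        'steps': steps,
        value_name: value_func.state_dict(),
        crtc_op_name: critic_optimizers.state_dict(),
    }
    for agent_id in range(N_VEHICLES):
        checkpoint[policy_name.replace("X", str(agent_id))] = policy[agent_id].state_dict()
        checkpoint[actr_op_name.replace("X", str(agent_id))] = actor_optimizers[agent_id].state_dict()
    torch.save(checkpoint, save_dir)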
class DDPGAgent(Agent): """Interacts with and learns from the environment.""" def __init__(self, idx, params): """Initialize an Agent object. Params ====== params (dict-like): dictionary of parameters for the agent """ super().__init__(params) self.idx = idx self.params = params self.update_every = params['update_every'] self.gamma = params['gamma'] self.num_agents = params['num_agents'] self.name = "BATCH D4PG" # self.her = params['her'] # Actor Network (w/ Target Network) self.actor_local = Actor(params['actor_params']).to(device) self.actor_target = Actor(params['actor_params']).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['actor_params']['lr']) # Critic Network (w/ Target Network) self.critic_local = Critic(params['critic_params']).to(device) self.critic_target = Critic(params['critic_params']).to(device) print("\n################ ACTOR ################\n") print(self.actor_local) print("\n################ CRITIC ################\n") print(self.critic_local) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=params['critic_params']['lr'], weight_decay=params['critic_params']['weight_decay']) # Noise process self.noise = OUNoise(self.params['noise_params']) # Replay memory self.memory = params['experience_replay'] def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # next_state = torch.from_numpy(next_states[self.idx]).float().unsqueeze(0).to(device) # state = torch.from_numpy(states[self.idx]).float().unsqueeze(0).to(device) # # print("\nSTATE\n", state, "\nACTION\n", actions[self.idx], "\nREWARD\n", rewards[self.idx], "\nNEXT STATE\n", next_state, "\nDONE\n", dones[self.idx]) # # Save experience / reward # self.memory.add(state.cpu(), actions[self.idx], rewards[self.idx], next_state.cpu(), dones[self.idx]) next_state = torch.from_numpy(next_state).float().unsqueeze(0).to( device) state = torch.from_numpy(state).float().unsqueeze(0).to(device) # print("\nSTATE\n", state, "\nACTION\n", action, "\nREWARD\n", reward, "\nNEXT STATE\n", next_state, "\nDONE\n", done) # Save experience / reward self.memory.add(state.cpu(), action, reward, next_state.cpu(), done) def step_her(self, agent_idx, timestep, state, action, reward, next_state, done, goal): """Save experience in replay memory, and use random sample from buffer to learn.""" next_state = torch.from_numpy(next_state).float().unsqueeze(0).to( device) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.memory.add_to_episode(agent_idx, timestep, state.cpu(), action, reward, next_state.cpu(), done, goal) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) # self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() # self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1., 1.) def reset(self): self.noise.reset() def learn(self): # Learn every UPDATE_EVERY time steps. 
self.t_step += 1 # print(self.t_step) # # self.t_step = (self.t_step + 1) % self.update_every # if self.t_step % self.update_every == 0: # print("LEARNING", self.t_step) # # If enough samples are available in memory, get random subset and learn # if self.memory.ready(): # experiences = self.memory.sample() # # print("################################## LEARN XP LENGTH",len(experiences)) # self.learn_(experiences) # If enough samples are available in memory, get random subset and learn if self.memory.ready(): experiences = self.memory.sample() # print("################################## LEARN XP LENGTH",len(experiences)) self.learn_(experiences) def learn_(self, experiences): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def add_param_noise(self, noise): """Adds noise to the weights of the agent""" with torch.no_grad(): for param in self.actor_local.parameters(): param.add_(torch.randn(param.size()).to(device) * noise) for param in self.critic_local.parameters(): param.add_(torch.randn(param.size()).to(device) * noise) def save_agent(self, average_reward, episode, save_history=False): """Save the checkpoint""" checkpoint = { 'actor_state_dict': self.actor_target.state_dict(), 'critic_state_dict': self.critic_target.state_dict(), 'average_reward': average_reward, 'episode': episode } if not os.path.exists("checkpoints"): os.makedirs("checkpoints") filePath = 'checkpoints\\' + self.name + '.pth' # print("\nSaving checkpoint\n") torch.save(checkpoint, filePath) if save_history: filePath = 'checkpoints\\' + self.name + '_' + str( episode) + '.pth' torch.save(checkpoint, filePath) def load_agent(self): """Load the checkpoint""" # print("\nLoading checkpoint\n") filePath = 'checkpoints\\' + self.name + '.pth' if os.path.exists(filePath): checkpoint = torch.load(filePath, map_location=lambda storage, loc: storage) self.actor_local.load_state_dict(checkpoint['actor_state_dict']) self.actor_target.load_state_dict(checkpoint['actor_state_dict']) self.critic_local.load_state_dict(checkpoint['critic_state_dict']) 
self.critic_target.load_state_dict(checkpoint['critic_state_dict']) average_reward = checkpoint['average_reward'] episode = checkpoint['episode'] print( "Loading checkpoint - Average Reward {} at Episode {}".format( average_reward, episode)) else: print( "\nCannot find {} checkpoint... Proceeding to create fresh neural network\n" .format(self.name))
class Actor_Crtic_Agent(): def __init__(self, name, id, device, state_size, action_size, load_agent=False): self.device = device self.state_size = state_size self.action_size = action_size self.seed = random.seed(RANDOM_SEED) self.name = name self.id = id # Hyperparameters self.gamma = GAMMA self.tau = TAU self.lr_actor = LR_ACTOR self.lr_critic = LR_CRITIC self.weight_decay = LEARNING_RATE_DECAY # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, RANDOM_SEED).to(self.device) self.actor_target = Actor(state_size, action_size, RANDOM_SEED).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, RANDOM_SEED).to(self.device) self.critic_target = Critic(state_size, action_size, RANDOM_SEED).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) if load_agent: self.load_agent(self.name) # Noise process self.noise = OUNoise(action_size, RANDOM_SEED) def step(self, state, action, reward, next_state, done, shared_memory): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward shared_memory.add(state, action, reward, next_state, done) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: noise = self.noise.sample() action += noise return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, shared_memory): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ indices, weights, experiences = shared_memory.sample() # if shared_memory.priority: # states, actions, rewards, next_states, dones, indices = experiences # else: critic_losses = [] actor_losses = [] # print(weights) for experience, index, weight in zip(experiences, indices, weights): print(index, weight) state, action, reward, next_state, done = experience state = torch.from_numpy(state).float().to(self.device) action = torch.from_numpy(action).float().to(self.device) # reward = torch.from_numpy(reward).float().to(self.device) next_state = torch.from_numpy(next_state).float().to(self.device) # done = torch.from_numpy(done).astype(np.uint8).float().to(self.device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models action_next = self.actor_target(next_state) Q_target_next = self.critic_target(next_state, action_next) # Compute Q target for current state (y_i) Q_target = reward + (self.gamma * Q_target_next * (1 - done)) # Compute critic loss Q_expected = self.critic_local(state, action) critic_loss = F.mse_loss(Q_expected, Q_target) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(state) actor_loss = -self.critic_local(state, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() critic_losses.append(critic_loss.detach().cpu().numpy()) actor_losses.append(actor_loss.detach().cpu().numpy()) # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) if shared_memory.priority: # prios = actor_loss.detach().cpu().numpy() * weights + 1e-5 shared_memory.update(indices, np.ndarray.flatten(np.array(actor_losses))) def soft_update(self, local_model, target_model): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        tau = self.tau
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_agent(self, fileName):
        """Save the checkpoint"""
        checkpoint = {
            'actor_state_dict': self.actor_target.state_dict(),
            'critic_state_dict': self.critic_target.state_dict(),
            'best_reward': self.best_reward
        }
        if not os.path.exists("checkpoints"):
            os.makedirs("checkpoints")
        # os.path.join keeps the checkpoint path portable across platforms
        filePath = os.path.join('checkpoints', fileName + '.pth')
        # print("\nSaving checkpoint\n")
        torch.save(checkpoint, filePath)

    def load_agent(self, fileName):
        """Load the checkpoint"""
        # print("\nLoading checkpoint\n")
        filePath = os.path.join('checkpoints', fileName + '.pth')
        if os.path.exists(filePath):
            checkpoint = torch.load(filePath,
                                    map_location=lambda storage, loc: storage)
            self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
            self.actor_target.load_state_dict(checkpoint['actor_state_dict'])
            self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_target.load_state_dict(checkpoint['critic_state_dict'])
            self.best_reward = checkpoint['best_reward']
            # Report only values that are actually stored in the checkpoint
            print("Loading checkpoint - Last Best Reward {} (%)".format(
                (np.exp(self.best_reward) - 1) * 100))
        else:
            print("\nCannot find {} checkpoint... "
                  "Proceeding to create fresh neural network\n".format(fileName))
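# --- Illustration (not part of the original code) ----------------------------
# soft_update() above applies Polyak averaging,
#     θ_target ← τ·θ_local + (1 − τ)·θ_target,
# so with τ = 1e-3 the target network drifts only 0.1% toward the local network
# per call. Minimal self-contained check on two small layers:
import torch
import torch.nn as nn

local, target = nn.Linear(2, 2), nn.Linear(2, 2)
tau = 1e-3
before = target.weight.detach().clone()
for t_param, l_param in zip(target.parameters(), local.parameters()):
    t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
assert torch.allclose(target.weight,
                      tau * local.weight.detach() + (1 - tau) * before)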
class Agent(): def __init__(self, nS, nA, indicies, config): self.nS = nS self.nA = nA self.indicies = indicies self.vector_size = self.indicies[-1][1] self.grade_mask = config.grade_technique_keys self.terrain_mask = config.terrain_technique_keys self.action_low = config.action_low self.action_high = config.action_high self.seed = config.seed self.clip_norm = config.clip_norm self.tau = config.tau self.gamma = config.gamma self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.L2 = config.L2 self.SGD_epoch = config.SGD_epoch # noise self.noise = OUnoise(nA, config.seed) self.noise_scale = 1.0 self.noise_decay = config.noise_decay # Priority Replay Buffer self.batch_size = config.batch_size self.buffer_size = config.buffer_size self.alpha = config.ALPHA self.beta = self.start_beta = config.START_BETA self.end_beta = config.END_BETA # actors networks self.actor = Actor(self.seed, nS, nA, self.grade_mask, self.terrain_mask, indicies).to(self.device) self.actor_target = Actor(self.seed, nS, nA, self.grade_mask, self.terrain_mask, indicies).to(self.device) # Param noise self.param_noise = AdaptiveParamNoise() self.actor_perturbed = Actor(self.seed, nS, nA, self.grade_mask, self.terrain_mask, indicies).to(self.device) # critic networks self.critic = Critic(self.seed, nS, nA).to(self.device) self.critic_target = Critic(self.seed, nS, nA).to(self.device) # Copy the weights from local to target hard_update(self.critic, self.critic_target) hard_update(self.actor, self.actor_target) # optimizer self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4, weight_decay=self.L2) self.critic_opt = optim.Adam(self.critic.parameters(), lr=1e-3, weight_decay=self.L2) # replay buffer self.PER = PriorityReplayBuffer(self.buffer_size, self.batch_size, self.seed, alpha=self.alpha, device=self.device) # reset agent for training self.reset_episode() self.it = 0 def save_weights(self, path): params = {} params['actor'] = self.actor.state_dict() params['critic'] = self.critic.state_dict() torch.save(params, path) def load_weights(self, path): checkpoint = torch.load(path, map_location=self.device) self.actor.load_state_dict(checkpoint['actor']) self.actor_target.load_state_dict(checkpoint['actor']) self.critic.load_state_dict(checkpoint['critic']) self.critic_target.load_state_dict(checkpoint['critic']) def reset_episode(self): self.noise.reset() def ddpg_distance_metric(self, actions1, actions2): """ TODO Necessary for param noise Computes distance between actions taken by two different policies Expects numpy arrays """ diff = actions1 - actions2 mean_diff = np.mean(np.square(diff), axis=0) dist = np.sqrt(np.mean(mean_diff)) return dist def norm_action(self, action): for index in self.indicies: action[index[0]:index[1]] = action[index[0]:index[1]] / np.sum( action[index[0]:index[1]]) return action def act(self, state): with torch.no_grad(): action = self.actor(self.tensor(state)).cpu().numpy() action += np.random.rand(self.indicies[-1][1]) * self.noise_scale self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01) self.actor.train() action = self.norm_action(action) return action def act_perturbed(self, state): """ TODO """ with torch.no_grad(): action = self.actor_perturbed(self.tensor(state)).cpu().numpy() return action def perturbed_update(self): """ TODO """ hard_update(self.actor, self.actor_perturbed) params = self.actor_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] random = torch.randn(param.shape).to(self.device) param += random * 
self.param_noise.current_stddev def evaluate(self, state): self.actor.eval() with torch.no_grad(): action = self.actor(self.tensor(state)).cpu().numpy() return action def step(self, obs, actions, rewards, next_obs): # cast as torch tensors next_obs = torch.from_numpy(next_obs.reshape( self.vector_size)).float().to(self.device) obs = torch.from_numpy(obs.reshape(self.vector_size)).float().to( self.device) actions = torch.from_numpy(actions.reshape( self.vector_size)).float().to(self.device) # Calc TD error next_action = self.actor(next_obs) next_value = self.critic_target(next_obs, next_action) target = rewards + self.gamma * next_value local = self.critic(obs, actions) TD_error = (target - local).squeeze(0) self.PER.add(obs, actions, rewards, next_obs, TD_error) for _ in range(self.SGD_epoch): samples, indicies, importances = self.PER.sample() self.learn(samples, indicies, importances) def add_replay_warmup(self, obs, actions, rewards, next_obs): next_obs = torch.from_numpy(next_obs.reshape( self.vector_size)).float().to(self.device) obs = torch.from_numpy(obs.reshape(self.vector_size)).float().to( self.device) actions = torch.from_numpy(actions.reshape( self.vector_size)).float().to(self.device) # Calculate TD_error next_action = self.actor(next_obs) next_value = self.critic_target(next_obs, next_action) target = np.max(rewards) + self.gamma * next_value local = self.critic(obs, actions) TD_error = (target - local).squeeze(0) self.PER.add(obs, actions, np.max(rewards), next_obs, TD_error) def learn(self, samples, indicies, importances): states, actions, rewards, next_states = samples with torch.no_grad(): target_actions = self.actor_target(next_states) next_values = self.critic_target(next_states, target_actions) y_target = rewards + self.gamma * next_values y_current = self.critic(states, actions) TD_error = y_current - y_target # update critic critic_loss = ((torch.tensor(importances).to(self.device) * TD_error)**2).mean() self.critic.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic.parameters(),self.clip_norm) self.critic_opt.step() # update actor local_actions = self.actor(states) actor_loss = -self.critic(states, local_actions).mean() self.actor.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_norm) self.actor_opt.step() # Update PER TD_errors = TD_error.squeeze(1).detach().cpu().numpy() self.PER.sum_tree.update_priorities(TD_errors, indicies) # soft update networks self.soft_update() def soft_update(self): """Soft update of target network θ_target = τ*θ_local + (1 - τ)*θ_target """ for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def tensor(self, x): return torch.from_numpy(x).float().to(self.device)
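# --- Illustration (not part of the original code) ----------------------------
# The Agent above stores TD errors in a proportional prioritized replay buffer
# with alpha/beta hyper-parameters. A minimal sketch of that bookkeeping
# (Schaul et al., 2015); the numeric values here are illustrative, not the
# project's configuration.
import numpy as np

td_errors = np.array([0.5, 0.1, 2.0])            # |TD error| per stored transition
alpha, beta, eps = 0.6, 0.4, 1e-5

priorities = (np.abs(td_errors) + eps) ** alpha
probs = priorities / priorities.sum()            # sampling distribution P(i)
weights = (len(td_errors) * probs) ** (-beta)    # importance-sampling correction
weights /= weights.max()                         # normalise so the largest weight is 1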
class SingleDDPGAgent: """ Single agent DDPG. Interacts with and learns from the environment. """ def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action cfg (config object): main configuration with other passed settings num_agents (int): optional (default: 1). If >1 will multiply state and action space sizes for critic. Used for usage with MADDPG. agent_id (int): optional (default: 0). Set agent id for MADDPG. """ print("Initializing single DDPG agent!") self.state_size = state_size self.action_size = action_size self.seed = random.seed(cfg.random_seed) self.n_agents = num_agents self.agent_id = agent_id self.cfg = cfg # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, cfg.random_seed, cfg.dense_layers_actor).to(device) self.actor_target = Actor(state_size, action_size, cfg.random_seed, cfg.dense_layers_actor).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=cfg.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * num_agents, action_size * num_agents, cfg.random_seed, cfg.dense_layers_critic).to(device) self.critic_target = Critic(state_size * num_agents, action_size * num_agents, cfg.random_seed, cfg.dense_layers_critic).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=cfg.lr_critic, weight_decay=cfg.weight_decay) self.hard_copy_weights(self.critic_local, self.critic_target) self.hard_copy_weights(self.actor_local, self.actor_target) self.t_step = 0 # Noise process self.noise = OUNoise(action_size, cfg.random_seed, theta=cfg.theta_ou, sigma=cfg.sigma_ou) # Replay memory self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size, cfg.random_seed, cfg) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward max_prio = self.memory.get_max_priority() self.memory.add(state, action, reward, next_state, max_prio, done) # Learn, if enough samples are available in memory # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.cfg.update_every if self.t_step == 0: if len(self.memory) > self.cfg.batch_size: experiences = self.memory.sample() self.learn(experiences, self.cfg.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state.view( 1, -1)).squeeze().cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def target_act(self, state): """ Let target network return action.""" self.actor_target.eval() with torch.no_grad(): action_target = self.actor_target(state) return np.clip(action_target, -1, 1) def reset(self): self.t_step = 0 self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', prio, done, indices) tuples gamma (float): discount factor """ states, actions, rewards, next_states, priorities, dones, indices = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) if self.cfg.prioritized_replay: weights = 1. / ( (self.cfg.batch_size * priorities)**self.cfg.priority_beta) weights /= max(weights) # calculating new transition priorities based on residuals # between target and local network predictions diffs = Q_targets - Q_expected # TD-error diffs = np.abs(np.squeeze(diffs.tolist())) self.memory.update_prios(indices, diffs) # bias-annealing weights Q_expected *= weights Q_targets *= weights critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.cfg.tau) self.soft_update(self.actor_local, self.actor_target, self.cfg.tau) @staticmethod def hard_copy_weights(local_model, target_model): """Update model parameters. Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_weights(self, model_save_path, suffix=""): """ Simple method to save network weights. """ # actors torch.save( self.actor_local.state_dict(), os.path.join(model_save_path, "weights_actor_local{:s}.pth".format(suffix))) torch.save( self.actor_target.state_dict(), os.path.join(model_save_path, "weights_actor_target{:s}.pth".format(suffix))) # critics torch.save( self.critic_local.state_dict(), os.path.join(model_save_path, "weights_critic_local{:s}.pth".format(suffix))) torch.save( self.critic_target.state_dict(), os.path.join(model_save_path, "weights_critic_target{:s}.pth".format(suffix))) def load_weights(self, model_save_path, suffix=""): """ Method to load network weights from saved files. 
""" self.actor_local.load_state_dict( torch.load( os.path.join(model_save_path, "weights_actor_local{:s}.pth".format(suffix)))) self.actor_target.load_state_dict( torch.load( os.path.join(model_save_path, "weights_actor_target{:s}.pth".format(suffix)))) self.critic_local.load_state_dict( torch.load( os.path.join(model_save_path, "weights_critic_local{:s}.pth".format(suffix)))) self.critic_target.load_state_dict( torch.load( os.path.join(model_save_path, "weights_critic_target{:s}.pth".format(suffix))))
class TD3Agent: """ Encapsulates the functioning of the TD3 agent """ def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000, discount=0.99, update_freq=2, tau=0.005, policy_noise_std=0.2, policy_noise_clip=0.5, actor_lr=1e-3, critic_lr=1e-3, train_mode=True): self.train_mode = train_mode # whether the agent is in training or testing mode self.state_dim = state_dim # dimension of the state space self.action_dim = action_dim # dimension of the action space self.device = device # defines which cuda or cpu device is to be used to run the networks self.discount = discount # denoted a gamma in the equation for computation of the Q-value self.update_freq = update_freq # defines how frequently should the actor and target be updated self.tau = tau # defines the factor used for Polyak averaging (i.e., soft updating of the target networks) self.max_action = max_action # the max value of the range in the action space (assumes a symmetric range in the action space) self.policy_noise_clip = policy_noise_clip # max range within which the noise for the target policy smoothing must be contained self.policy_noise_std = policy_noise_std # standard deviation, i.e. sigma, of the Gaussian noise applied for target policy smoothing # create an instance of the replay buffer self.memory = ReplayMemory(memory_capacity) # instances of the networks for the actor and the two critics self.actor = Actor(state_dim, action_dim, max_action, actor_lr) self.critic = Critic( state_dim, action_dim, critic_lr ) # the critic class encapsulates two copies of the neural network for the two critics used in TD3 # instance of the target networks for the actor and the two critics self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr) self.target_critic = Critic(state_dim, action_dim, critic_lr) # initialise the targets to the same weight as their corresponding current networks self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict()) # since we do not learn/train on the target networks self.target_actor.eval() self.target_critic.eval() # for test mode if not self.train_mode: self.actor.eval() self.critic.eval() self.actor.to(self.device) self.critic.to(self.device) self.target_actor.to(self.device) self.target_critic.to(self.device) def select_action(self, state, exploration_noise=0.1): """ Function to returns the appropriate action for the given state. During training, it returns adds a zero-mean gaussian noise with std=exploration_noise to the action to encourage exploration. No noise is added to the action decision during testing mode. Parameters --- state: vector or tensor The current state of the environment as observed by the agent exploration_noise: float, optional Standard deviation, i.e. 
sigma, of the Gaussian noise to be added to the agent's action to encourage exploration Returns --- A numpy array representing the noisy action to be performed by the agent in the current state """ if not torch.is_tensor(state): state = torch.tensor([state], dtype=torch.float32).to(self.device) act = self.actor(state).cpu().data.numpy().flatten( ) # performs inference using the actor based on the current state as the input and returns the corresponding np array if not self.train_mode: exploration_noise = 0.0 # since we do not need noise to be added to the action during testing noise = np.random.normal( 0.0, exploration_noise, size=act.shape ) # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise noisy_action = act + noise noisy_action = noisy_action.clip( min=-self.max_action, max=self.max_action ) # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric return noisy_action def learn(self, current_iteration, batchsize): """ Function to perform the updates on the 6 neural networks that run the TD3 algorithm. Parameters --- current_iteration: int Total number of steps that have been performed by the agent batchsize: int Number of experiences to be randomly sampled from the memory for the agent to learn from Returns --- none """ if len(self.memory) < batchsize: return states, actions, next_states, rewards, dones = self.memory.sample( batchsize, self.device ) # a batch of experiences randomly sampled form the memory # ensure that the actions and rewards tensors have the appropriate shapes actions = actions.view(-1, self.action_dim) rewards = rewards.view(-1, 1) # generate noisy target actions for target policy smoothing pred_action = self.target_actor(next_states) noise = torch.zeros_like(pred_action).normal_( 0, self.policy_noise_std).to(self.device) noise = torch.clamp(noise, min=-self.policy_noise_clip, max=self.policy_noise_clip) noisy_pred_action = torch.clamp(pred_action + noise, min=-self.max_action, max=self.max_action) # calculate TD-Target using Clipped Double Q-learning target_q1, target_q2 = self.target_critic(next_states, noisy_pred_action) target_q = torch.min(target_q1, target_q2) target_q[ dones] = 0.0 # being in a terminal state implies there are no more future states that the agent would encounter in the given episode and so set the associated Q-value to 0 y = rewards + self.discount * target_q current_q1, current_q2 = self.critic( states, actions ) # the critic class encapsulates two copies of the neural network thereby returning two Q values with each forward pass critic_loss = F.mse_loss(current_q1, y) + F.mse_loss( current_q2, y ) # the losses of the two critics need to be added as there is only one optimiser shared between the two networks critic_loss = critic_loss.mean() self.critic.optimizer.zero_grad() critic_loss.backward() self.critic.optimizer.step() # delayed policy and target updates if current_iteration % self.update_freq == 0: # actor loss is calculated by a gradient ascent along crtic 1, thus need to apply the negative sign to convert to a gradient descent pred_current_actions = self.actor(states) pred_current_q1, _ = self.critic( states, pred_current_actions ) # since we only need the Q-value from critic 1, we can ignore the second value obtained through the forward pass actor_loss = -pred_current_q1.mean() self.actor.optimizer.zero_grad() actor_loss.backward() self.actor.optimizer.step() # apply slow-update to all three target 
networks
        self.soft_update_targets()

    def soft_update_net(self, source_net_params, target_net_params):
        """ Function to perform Polyak averaging to update the parameters of the
        provided network

        Parameters
        ---
        source_net_params: list
            trainable parameters of the source, i.e. current version of the network
        target_net_params: list
            trainable parameters of the corresponding target network

        Returns
        ---
        none
        """
        for source_param, target_param in zip(source_net_params, target_net_params):
            target_param.data.copy_(self.tau * source_param.data +
                                    (1 - self.tau) * target_param.data)

    def soft_update_targets(self):
        """ Function that calls Polyak averaging on all three target networks

        Parameters
        ---
        none

        Returns
        ---
        none
        """
        self.soft_update_net(self.actor.parameters(), self.target_actor.parameters())
        self.soft_update_net(self.critic.parameters(), self.target_critic.parameters())

    def save(self, path, model_name):
        """ Function to save the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is to be saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """
        self.actor.save_model('{}/{}_actor'.format(path, model_name))
        self.critic.save_model('{}/{}_critic'.format(path, model_name))

    def load(self, path, model_name):
        """ Function to load the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """
        self.actor.load_model('{}/{}_actor'.format(path, model_name))
        self.critic.load_model('{}/{}_critic'.format(path, model_name))
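# --- Illustration (not part of the original code) ----------------------------
# Stand-alone sketch of the TD3 target used in learn() above: target policy
# smoothing (clipped Gaussian noise on the target action) followed by clipped
# double-Q (the minimum of the two target critics). Dummy tensors only; the
# terminal mask is applied here as (1 - done), which is equivalent to the
# in-place target_q[dones] = 0.0 used above.
import torch

gamma, max_action = 0.99, 1.0
rewards = torch.rand(4, 1)
dones = torch.tensor([[0.0], [0.0], [1.0], [0.0]])
pred_action = torch.empty(4, 2).uniform_(-1, 1)     # stand-in for target_actor(next_states)
noise = torch.randn_like(pred_action).mul_(0.2).clamp_(-0.5, 0.5)
smoothed_action = (pred_action + noise).clamp(-max_action, max_action)
q1, q2 = torch.rand(4, 1), torch.rand(4, 1)         # stand-ins for the two target critics
y = rewards + gamma * (1 - dones) * torch.min(q1, q2)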
class DDPGAgent: """ Encapsulates the functioning of the DDPG agent """ def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000, discount=0.99, tau=0.005, sigma=0.2, theta=0.15, actor_lr=1e-4, critic_lr=1e-3, train_mode=True): self.train_mode = train_mode # whether the agent is in training or testing mode self.state_dim = state_dim # dimension of the state space self.action_dim = action_dim # dimension of the action space self.device = device # defines which cuda or cpu device is to be used to run the networks self.discount = discount # denoted a gamma in the equation for computation of the Q-value self.tau = tau # defines the factor used for Polyak averaging (i.e., soft updating of the target networks) self.max_action = max_action # the max value of the range in the action space (assumes a symmetric range in the action space) # create an instance of the replay buffer self.memory = ReplayMemory(memory_capacity) # create an instance of the noise generating process self.ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(self.action_dim), sigma=sigma, theta=theta) # instances of the networks for the actor and the critic self.actor = Actor(state_dim, action_dim, max_action, actor_lr) self.critic = Critic(state_dim, action_dim, critic_lr) # instance of the target networks for the actor and the critic self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr) self.target_critic = Critic(state_dim, action_dim, critic_lr) # initialise the targets to the same weight as their corresponding current networks self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict()) # since we do not learn/train on the target networks self.target_actor.eval() self.target_critic.eval() # for test mode if not self.train_mode: self.actor.eval() self.critic.eval() self.ounoise = None self.actor.to(self.device) self.critic.to(self.device) self.target_actor.to(self.device) self.target_critic.to(self.device) def select_action(self, state): """ Function to return the appropriate action for the given state. During training, it adds a zero-mean OU noise to the action to encourage exploration. During testing, no noise is added to the action decision. Parameters --- state: vector or tensor The current state of the environment as observed by the agent Returns --- A numpy array representing the noisy action to be performed by the agent in the current state """ if not torch.is_tensor(state): state = torch.tensor([state], dtype=torch.float32).to(self.device) self.actor.eval() act = self.actor(state).cpu().data.numpy().flatten() # performs inference using the actor based on the current state as the input and returns the corresponding np array self.actor.train() noise = 0.0 ## for adding Gaussian noise (to use, update the code pass the exploration noise as input) #if self.train_mode: # noise = np.random.normal(0.0, exploration_noise, size=act.shape) # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise # for adding OU noise if self.train_mode: noise = self.ou_noise.generate_noise() noisy_action = act + noise noisy_action = noisy_action.clip(min=-self.max_action, max=self.max_action) # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric return noisy_action def learn(self, batchsize): """ Function to perform the updates on the 4 neural networks that run the DDPG algorithm. 
Parameters --- batchsize: int Number of experiences to be randomly sampled from the memory for the agent to learn from Returns --- none """ if len(self.memory) < batchsize: return states, actions, next_states, rewards, dones = self.memory.sample(batchsize, self.device) # a batch of experiences randomly sampled form the memory # ensure that the actions and rewards tensors have the appropriate shapes actions = actions.view(-1, self.action_dim) rewards = rewards.view(-1, 1) with torch.no_grad(): # generate target actions target_action = self.target_actor(next_states) # calculate TD-Target target_q = self.target_critic(next_states, target_action) target_q[dones] = 0.0 # being in a terminal state implies there are no more future states that the agent would encounter in the given episode and so set the associated Q-value to 0 y = rewards + self.discount * target_q current_q = self.critic(states, actions) critic_loss = F.mse_loss(current_q, y).mean() self.critic.optimizer.zero_grad() critic_loss.backward() self.critic.optimizer.step() # actor loss is calculated by a gradient ascent along the crtic, thus need to apply the negative sign to convert to a gradient descent pred_current_actions = self.actor(states) pred_current_q = self.critic(states, pred_current_actions) actor_loss = - pred_current_q.mean() self.actor.optimizer.zero_grad() actor_loss.backward() self.actor.optimizer.step() # apply slow-update to the target networks self.soft_update_targets() def soft_update_net(self, source_net_params, target_net_params): """ Function to perform Polyak averaging to update the parameters of the provided network Parameters --- source_net_params: list trainable parameters of the source, ie. current version of the network target_net_params: list trainable parameters of the corresponding target network Returns --- none """ for source_param, target_param in zip(source_net_params, target_net_params): target_param.data.copy_(self.tau * source_param.data + (1 - self.tau) * target_param.data) def soft_update_targets(self): """ Function that calls Polyak averaging on all three target networks Parameters --- none Returns --- none """ self.soft_update_net(self.actor.parameters(), self.target_actor.parameters()) self.soft_update_net(self.critic.parameters(), self.target_critic.parameters()) def save(self, path, model_name): """ Function to save the actor and critic networks Parameters --- path: str Location where the model is to be saved model_name: str Name of the model Returns --- none """ self.actor.save_model('{}/{}_actor'.format(path, model_name)) self.critic.save_model('{}/{}_critic'.format(path, model_name)) def load(self, path, model_name): """ Function to load the actor and critic networks Parameters --- path: str Location where the model is saved model_name: str Name of the model Returns --- none """ self.actor.load_model('{}/{}_actor'.format(path, model_name)) self.critic.load_model('{}/{}_critic'.format(path, model_name))
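# --- Illustration (not part of the original code) ----------------------------
# The OrnsteinUhlenbeckNoise class used by DDPGAgent above is not shown in this
# excerpt. A minimal sketch of such a mean-reverting noise process, matching the
# (mu, sigma, theta) constructor arguments and the generate_noise() call used
# above; the project's own implementation may differ in detail.
import numpy as np

class OrnsteinUhlenbeckNoiseSketch:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x_prev = np.zeros_like(mu)

    def generate_noise(self):
        # dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1): temporally correlated exploration noise
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x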
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, a_check=None, c_check=None, gamma=0.99, tau=1e-3, add_noise=False, mu=0., theta=0.15, sigma=0.1, lr_actor=2e-4, lr_critic=2e-4, buffer_size=1e5, batch_size=128): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.random_process = OUNoise(action_size, seed, mu=mu, theta=theta, sigma=sigma) self.gamma = gamma self.tau = tau # Actor and Critic approximators self.targetActor = Actor(state_size, action_size, seed, (128, 128)).to(device) self.targetCritic = Critic(state_size, action_size, seed, (128, 128)).to(device) self.actor = Actor(state_size, action_size, seed, (128, 128)).to(device) self.critic = Critic(state_size, action_size, seed, (128, 128)).to(device) for target, local in zip(self.targetCritic.parameters(), self.critic.parameters()): target.data.copy_(local.data) for target, local in zip(self.targetActor.parameters(), self.actor.parameters()): target.data.copy_(local.data) if a_check is not None: self.actor.load_state_dict(a_check) if c_check is not None: self.critic.load_state_dict(c_check) # self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.actor_opt = optim.Adam(self.actor.parameters(), lr=lr_actor) self.critic_opt = optim.Adam(self.critic.parameters(), lr=lr_critic) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.t = 0 self.warm_up = WARM_UP self.add_noise = add_noise def reset(self): self.random_process.reset() def act(self, state, random=False): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection eval (boolean) : Turns off mean and std deviation from evaluation batches if set to true """ if random is True or self.t < self.warm_up: action = np.random.randn(NUM_AGENTS, self.action_size) else: self.actor.eval() with torch.no_grad(): action = self.actor( torch.from_numpy(state).float().to( device)).cpu().data.numpy() if self.add_noise: noise = self.random_process.sample() action += noise self.actor.train() return np.clip(action, LOW_ACTION, HIGH_ACTION) def step(self, state, action, reward, next_state, done): # Save experience in replay memory NUMPY self.t += 1 self.memory.add(state, action, reward, next_state, done) if self.t > self.warm_up: # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if len(self.memory) > self.memory.batch_size: for i in range(0, CONSECUTIVE_LEARNS): # If enough samples are available in memory, get random subset and learn experiences = self.memory.sample() self.learn(experiences, self.gamma) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.targetActor(next_states) Q_targets_next = self.targetCritic(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss (using gradient clipping) self.critic_opt.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_opt.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor(states) actor_loss = -self.critic(states, actions_pred).mean() # Minimize the loss self.actor_opt.zero_grad() actor_loss.backward() self.actor_opt.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic, self.targetCritic, self.tau) self.soft_update(self.actor, self.targetActor, self.tau) ''' print(next_action_values_local.shape) print(next_action_values_local[0][:]) print(next_action_values_local.gather(1, actions).shape) print(actions[0][0]) print(next_action_values_local.gather(1, actions)[0][0]) ''' def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def adjust_learning_rate(self, episode, val): print("adjusting learning rate!") for param_group in self.optimizer.param_groups: param_group['lr'] = val
class Agent: def __init__(self,env, env_params, args, models=None, record_episodes=[0,.1,.25,.5,.75,1.]): self.env= env self.env_params = env_params self.args = args # networks if models == None: self.actor = Actor(self.env_params).double() self.critic = Critic(self.env_params).double() else: self.actor , self.critic = self.LoadModels() # target networks used to predict env actions with self.actor_target = Actor(self.env_params,).double() self.critic_target = Critic(self.env_params).double() self.actor_target.load_state_dict(self.actor.state_dict()) self.critic_target.load_state_dict(self.critic.state_dict()) if self.args.cuda: self.actor.cuda() self.critic.cuda() self.actor_target.cuda() self.critic_target.cuda() self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001) self.normalize = Normalizer(env_params,self.args.gamma) self.buffer = ReplayBuffer(1_000_000, self.env_params) self.tensorboard = ModifiedTensorBoard(log_dir = f"logs") self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes] def ModelsEval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def ModelsTrain(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def GreedyAction(self, state): self.ModelsEval() with torch.no_grad(): state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0) if self.args.cuda: state = state.cuda() action = self.actor.forward(state).detach().cpu().numpy().squeeze() return action def NoiseAction(self, state): self.ModelsEval() with torch.no_grad(): state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0) if self.args.cuda: state = state.cuda() action = self.actor.forward(state).detach().cpu().numpy() action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape) action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action']) return action.squeeze() def Update(self): self.ModelsTrain() for i in range(self.args.n_batch): state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size) a_batch = torch.tensor(a_batch,dtype=torch.double) r_batch = torch.tensor(r_batch,dtype=torch.double) # d_batch = torch.tensor(d_batch,dtype=torch.double) state = torch.tensor(state,dtype=torch.double) nextstate = torch.tensor(nextstate,dtype=torch.double) # d_batch = 1 - d_batch if self.args.cuda: a_batch = a_batch.cuda() r_batch = r_batch.cuda() # d_batch = d_batch.cuda() state = state.cuda() nextstate = nextstate.cuda() with torch.no_grad(): action_next = self.actor_target.forward(nextstate) q_next = self.critic_target.forward(nextstate,action_next) q_next = q_next.detach().squeeze() q_target = r_batch + self.args.gamma * q_next q_target = q_target.detach().squeeze() q_prime = self.critic.forward(state, a_batch).squeeze() critic_loss = F.mse_loss(q_target, q_prime) action = self.actor.forward(state) actor_loss = -self.critic.forward(state, action).mean() # params = torch.cat([x.view(-1) for x in self.actor.parameters()]) # l2_reg = self.args.l2_norm *torch.norm(params,2) # actor_loss += l2_reg self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() self.SoftUpdateTarget(self.critic, self.critic_target) self.SoftUpdateTarget(self.actor, self.actor_target) def Explore(self): for epoch in range(self.args.n_epochs +1): start_time = 
time.process_time() for cycle in range(self.args.n_cycles): for _ in range(self.args.num_rollouts_per_mpi): state = self.env.reset() for t in range(self.env_params['max_timesteps']): action = self.NoiseAction(state) nextstate, reward, done, info = self.env.step([action]) nextstate = nextstate.squeeze() reward = self.normalize.normalize_reward(reward) self.buffer.StoreTransition(state, action, reward, nextstate, done) state = nextstate self.Update() avg_reward = self.Evaluate() self.tensorboard.step = epoch elapsed_time = time.process_time() - start_time print(f"Epoch {epoch} of total of {self.args.n_epochs +1} epochs, average reward is: {avg_reward}.\ Elapsedtime: {int(elapsed_time /60)} minutes {int(elapsed_time %60)} seconds") if epoch % 5 or epoch + 1 == self.args.n_epochs: self.SaveModels(epoch) self.record(epoch) def Evaluate(self): self.ModelsEval() total_reward = [] episode_reward = 0 succes_rate = [] for episode in range(self.args.n_evaluate): state = self.env.reset() episode_reward = 0 for t in range(self.env_params['max_timesteps']): action = self.GreedyAction(state) nextstate, reward, done, info = self.env.step([action]) episode_reward += reward state = nextstate if done or t + 1 == self.env_params['max_timesteps']: total_reward.append(episode_reward) episode_reward = 0 average_reward = sum(total_reward)/len(total_reward) min_reward = min(total_reward) max_reward = max(total_reward) self.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward) return average_reward def record(self, epoch): self.ModelsEval() try: if not os.path.exists("videos"): os.mkdir('videos') recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4') for _ in range(self.args.n_record): done =False state = self.env.reset() while not done: recorder.capture_frame() action = self.GreedyAction(state) nextstate,reward,done,info = self.env.step([action]) state = nextstate recorder.close() except Exception as e: print(e) def SaveModels(self, ep): if not os.path.exists("models"): os.mkdir('models') torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt')) torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt')) def LoadModels(self, actorpath, criticpath): actor = Actor(self.env_params, self.hidden_neurons) critic = Critic(self.env_params, self.hidden_neurons) actor.load_state_dict(torch.load(actorpath)) critic.load_state_dict(torch.load(criticpath)) return actor, critic def SoftUpdateTarget(self, source, target): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
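# --- Note (not part of the original code) -------------------------------------
# SoftUpdateTarget above uses the opposite convention from the tau-based soft
# updates elsewhere in this file: args.polyak is the fraction of the *old*
# target that is retained, i.e. polyak == 1 - tau. Quick equivalence check:
tau = 0.005
polyak = 1 - tau          # target <- (1 - polyak)*local + polyak*target
assert abs((1 - polyak) - tau) < 1e-12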
class AgentDDPG: """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks""" def __init__(self, state_size, hidden_size, action_size, actor_learning_rate=1e-4, critic_learning_rate=1e-3, gamma=0.99, tau=1e-2, use_cuda=False, actor_path=None, critic_path=None): # Params self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size self.gamma, self.tau = gamma, tau self.use_cuda = use_cuda # Networks self.actor = Actor(state_size, hidden_size, action_size) self.actor_target = Actor(state_size, hidden_size, action_size) self.critic = Critic(state_size + action_size, hidden_size, action_size) self.critic_target = Critic(state_size + action_size, hidden_size, action_size) # Load model state_dicts from saved file if actor_path and path.exists(actor_path): self.actor.load_state_dict(torch.load(actor_path)) if critic_path and path.exists(critic_path): self.critic.load_state_dict(torch.load(critic_path)) # Hard copy params from original networks to target networks copy_params(self.actor, self.actor_target) copy_params(self.critic, self.critic_target) if self.use_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() # Create replay buffer for storing experience self.replay_buffer = ReplayBuffer(cache_size=int(1e6)) # Training self.critic_criterion = nn.MSELoss() self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate) def save_to_file(self, actor_file, critic_file): # Save the state_dict's of the Actor and Critic networks torch.save(self.actor.state_dict(), actor_file) torch.save(self.critic.state_dict(), critic_file) def get_action(self, state): """Select action with respect to state according to current policy and exploration noise""" state = Variable(torch.from_numpy(state).float()) if self.use_cuda: state = state.cuda() a = self.actor.forward(state) if self.use_cuda: return a.detach().cpu().numpy() return a.detach().numpy() def save_experience(self, state_t, action_t, reward_t, state_t1): self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1) def update(self, batch_size): states, actions, rewards, next_states = self.replay_buffer.get_samples( batch_size) states = torch.FloatTensor(states) actions = torch.FloatTensor(actions) rewards = torch.FloatTensor(rewards) next_states = torch.FloatTensor(next_states) if self.use_cuda: states = states.cuda() next_states = next_states.cuda() actions = actions.cuda() rewards = rewards.cuda() # Critic loss Qvals = self.critic.forward(states, actions) next_actions = self.actor_target.forward(next_states) next_Q = self.critic_target.forward(next_states, next_actions.detach()) Qprime = rewards + self.gamma * next_Q critic_loss = self.critic_criterion(Qvals, Qprime) # Update critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Actor loss policy_loss = -self.critic.forward(states, self.actor.forward(states)).mean() # Update actor self.actor_optimizer.zero_grad() policy_loss.backward() self.actor_optimizer.step() # update target networks soft_copy_params(self.actor, self.actor_target, self.tau) soft_copy_params(self.critic, self.critic_target, self.tau) def add_noise_to_weights(self, amount=0.1): self.actor.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda)) self.critic.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda)) self.actor_target.apply( lambda x: 
_add_noise_to_weights(x, amount, self.use_cuda)) self.critic_target.apply( lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
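# --- Illustration (not part of the original code) ----------------------------
# The module-level helper _add_noise_to_weights is not shown in this excerpt.
# A minimal sketch consistent with how it is applied via nn.Module.apply above
# (hypothetical implementation, for illustration only):
import torch
import torch.nn as nn

def _add_noise_to_weights(module, amount, use_cuda):
    # Perturb only layers that actually carry weights (e.g. nn.Linear).
    if isinstance(module, nn.Linear):
        with torch.no_grad():
            noise = torch.randn(module.weight.size()) * amount
            if use_cuda:
                noise = noise.cuda()
            module.weight.add_(noise)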
class D3PG(object): def __init__(self, state_dim, action_dim, max_action, memory, args): # misc self.criterion = nn.MSELoss() self.state_dim = state_dim self.action_dim = action_dim self.max_action = max_action self.memory = memory self.n = args.n_actor # actors self.actors = [ Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm) for i in range(self.n) ] self.actors_target = [ Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm) for i in range(self.n) ] self.actors_optimizer = [ torch.optim.Adam(self.actors[i].parameters(), lr=args.actor_lr) for i in range(self.n) ] for i in range(self.n): self.actors_target[i].load_state_dict(self.actors[i].state_dict()) # crtic self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm) self.critic_target = Critic(state_dim, action_dim, layer_norm=args.layer_norm) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args.critic_lr) # cuda if torch.cuda.is_available(): for i in range(self.n): self.actors[i] = self.actors[i].cuda() self.actors_target[i] = self.actors_target[i].cuda() self.critic = self.critic.cuda() self.critic_target = self.critic_target.cuda() # shared memory for i in range(self.n): self.actors[i].share_memory() self.actors_target[i].share_memory() self.critic.share_memory() self.critic_target.share_memory() # hyper-parameters self.tau = args.tau self.discount = args.discount self.batch_size = args.batch_size self.reward_scale = args.reward_scale def train(self, iterations, actor_index): for _ in tqdm(range(iterations)): # Sample replay buffer states, n_states, actions, rewards, dones = self.memory.sample( self.batch_size) # Q target = reward + discount * Q(next_state, pi(next_state)) with torch.no_grad(): target_Q = self.critic_target( n_states, self.actors_target[actor_index](n_states)) target_Q = self.reward_scale * rewards + \ (1 - dones) * self.discount * target_Q # Get current Q estimate current_Q = self.critic(states, actions) # Compute critic loss critic_loss = self.criterion(current_Q, target_Q) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Compute actor loss actor_loss = - \ self.critic(states, self.actors[actor_index](states)).mean() # Optimize the actor self.actors_optimizer[actor_index].zero_grad() actor_loss.backward() self.actors_optimizer[actor_index].step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip( self.actors[actor_index].parameters(), self.actors_target[actor_index].parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def load(self, filename): for i in range(self.n): self.actors[i].load_model(filename, "actor_" + str(i)) self.critic.load_model(filename, "critic") def save(self, output): for i in range(self.n): self.actors[i].save_model(output, "actor_" + str(i)) self.critic.save_model(output, "critic")
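# --- Illustration (not part of the original code) ----------------------------
# Because the actors and critic above call share_memory(), they can in principle
# be trained by several worker processes, one per actor. A minimal launch
# sketch, assuming `agent` is a constructed D3PG instance whose replay memory is
# process-safe; the surrounding training script is not shown in this excerpt.
import torch.multiprocessing as mp

def launch_workers(agent, iterations=1000):
    processes = []
    for actor_index in range(agent.n):
        p = mp.Process(target=agent.train, args=(iterations, actor_index))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()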
class TD3: def __init__(self, env, state_dim, action_dim, max_action, gamma=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2): self.actor = Actor(state_dim, action_dim) self.actor_target = Actor(state_dim, action_dim) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3) self.critic = Critic(state_dim, action_dim) self.critic_target = Critic(state_dim, action_dim) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3) self.max_action = max_action self.gamma = gamma self.tau = tau self.policy_noise = policy_noise self.noise_clip = noise_clip self.policy_freq = policy_freq self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.actor.to(self.device) self.actor_target.to(self.device) self.critic.to(self.device) self.critic_target.to(self.device) self.env = env self.total_it = 0 def select_action(self, state, noise=0.1): action = self.actor(state.to(self.device)).data.cpu().numpy().flatten() if noise != 0: action = (action + np.random.normal( 0, noise, size=self.env.action_space.shape[0])) return action.clip(self.env.action_space.low, self.env.action_space.high) def train(self, replay_buffer, batch_size=128): self.total_it += 1 states, states_, actions, rewards, terminal = replay_buffer.sample_buffer( batch_size) with torch.no_grad(): noise = (torch.randn_like(actions.to(self.device)) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) next_action = (self.actor_target(states_.to(self.device)) + noise).clamp(-self.max_action, self.max_action) # compute the target Q value target_q1, target_q2 = self.critic_target( states_.to(self.device), next_action.to(self.device)) target_q = torch.min(target_q1, target_q2) # target_q = rewards + terminal * self.gamma + target_q.cpu() # target_q = rewards + (terminal.reshape(256, 1) * self.gamma * target_q).detach() target_q = rewards + terminal * self.gamma * target_q[:, 0].cpu() # Get current Q value current_q1, current_q2 = self.critic(states.to(self.device), actions.to(self.device)) # Compute critic loss critic_loss = F.mse_loss(current_q1[:, 0], target_q.to( self.device)) + F.mse_loss(current_q2[:, 0], target_q.to(self.device)) # optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Delayed policy updates if self.total_it % self.policy_freq == 0: # Compote actor loss actor_loss = -self.critic.q1(states.to( self.device), self.actor(states.to(self.device))).mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def save(self, filename): torch.save(self.critic.state_dict(), filename + "_critic") torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer") torch.save(self.actor.state_dict(), filename + "_actor") torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer") def load(self, filename): self.critic.load_state_dict(torch.load(filename + "_critic")) self.critic_optimizer.load_state_dict( torch.load(filename + "_critic_optimizer")) 
self.actor.load_state_dict(torch.load(filename + "_actor")) self.actor_optimizer.load_state_dict( torch.load(filename + "_actor_optimizer"))
class DDPG(object): def __init__(self, state_dim, action_dim, max_action, memory, args): # actor self.actor = Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm) self.actor_target = Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=args.actor_lr) # crtic self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm) self.critic_target = Critic(state_dim, action_dim, layer_norm=args.layer_norm) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args.critic_lr) # cuda if torch.cuda.is_available(): self.actor = self.actor.cuda() self.actor_target = self.actor_target.cuda() self.critic = self.critic.cuda() self.critic_target = self.critic_target.cuda() # misc self.criterion = nn.MSELoss() self.state_dim = state_dim self.action_dim = action_dim self.max_action = max_action self.memory = memory # hyper-parameters self.tau = args.tau self.discount = args.discount self.batch_size = args.batch_size def show_lr(self): print(self.actor_optimizer.state_dict()) def select_action(self, state, noise=None): state = FloatTensor(state.reshape(-1, self.state_dim)) action = self.actor(state).cpu().data.numpy().flatten() if noise is not None: action += noise.sample() return np.clip(action, -self.max_action, self.max_action) def train(self, iterations): for _ in tqdm(range(iterations)): # Sample replay buffer x, y, u, r, d = self.memory.sample(self.batch_size) state = FloatTensor(x) action = FloatTensor(u) next_state = FloatTensor(y) done = FloatTensor(1 - d) reward = FloatTensor(r) # Q target = reward + discount * Q(next_state, pi(next_state)) with torch.no_grad(): target_Q = self.critic_target(next_state, self.actor_target(next_state)) target_Q = reward + (done * self.discount * target_Q) # Get current Q estimate current_Q = self.critic(state, action) # Compute critic loss critic_loss = self.criterion(current_Q, target_Q) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Compute actor loss actor_loss = -self.critic(state, self.actor(state)).mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def train_critic(self, iterations): for _ in tqdm(range(iterations)): # Sample replay buffer states, n_states, actions, rewards, dones = self.memory.sample( self.batch_size) sys.stdout.flush() # Q target = reward + discount * Q(next_state, pi(next_state)) with torch.no_grad(): target_Q = self.critic_target(n_states, self.actor_target(n_states)) target_Q = rewards + (1 - dones) * self.discount * target_Q # Get current Q estimate current_Q = self.critic(states, actions) # Compute critic loss critic_loss = self.criterion(current_Q, target_Q) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Compute actor loss actor_loss = - \ self.critic(states, self.actor(states)).mean() # Optimize the actor self.actor_optimizer.zero_grad() 
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def load(self, filename):
        self.actor.load_model(filename, "actor")
        self.critic.load_model(filename, "critic")

    def save(self, output):
        self.actor.save_model(output, "actor")
        self.critic.save_model(output, "critic")
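# The target-update loops in DDPG.train and DDPG.train_critic implement Polyak averaging,
# target <- tau * online + (1 - tau) * target. A small helper sketch capturing the same
# update (hypothetical; the original keeps the loops inline in both methods):
def soft_update(online_net, target_net, tau):
    """Blend online parameters into the target network in place (Polyak averaging)."""
    for param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)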
def train(BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS,
          POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START, ENV,
          OBSERVATION_LOW, VALUE_FNC, FLOW_TYPE, FLOWS, DEMONSTRATIONS, PRIORITIZE_REPLAY,
          BEHAVIOR_CLONING, ARM, BASE, RPA, REWARD_DENSE, logdir):
    ALPHA = 0.3
    BETA = 1
    epsilon = 0.0001  # 0.1
    epsilon_d = 0.1  # 0.3
    weights = 1  # 1
    lambda_ac = 0.85  # 0.7
    lambda_bc = 0.3  # 0.4
    setup_logger(logdir, locals())
    ENV = __import__(ENV)

    if ARM and BASE:
        env = ENV.youBotAll('youbot_navig2.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE, boundary=1)
    elif ARM:
        env = ENV.youBotArm('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE)
    elif BASE:
        env = ENV.youBotBase('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE, boundary=1)

    action_space = env.action_space
    obs_space = env.observation_space()
    step_limit = env.step_limit()

    if OBSERVATION_LOW:
        actor = SoftActorGated(HIDDEN_SIZE, action_space, obs_space, flow_type=FLOW_TYPE, flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
    else:
        actor = ActorImageNet(HIDDEN_SIZE, action_space, obs_space, flow_type=FLOW_TYPE, flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device)

    # NOTE: both critics are initialised from the same 'critic1' checkpoint file here.
    critic_1.load_state_dict(torch.load('data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'))
    critic_2.load_state_dict(torch.load('data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'))
    actor.apply(weights_init)
    # critic_1.apply(weights_init)
    # critic_2.apply(weights_init)

    if VALUE_FNC:
        value_critic = Critic(HIDDEN_SIZE, 1, obs_space, action_space).float().to(device)
        target_value_critic = create_target_network(value_critic).float().to(device)
        value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)
    else:
        target_critic_1 = create_target_network(critic_1)
        target_critic_2 = create_target_network(critic_2)

    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) + list(critic_2.parameters()), lr=LEARNING_RATE)

    # Replay buffer
    if PRIORITIZE_REPLAY:
        # D = PrioritizedReplayBuffer(REPLAY_SIZE, ALPHA)
        D = ReplayMemory(device, 3, DISCOUNT, 1, BETA, ALPHA, REPLAY_SIZE)
    else:
        D = deque(maxlen=REPLAY_SIZE)

    eval_ = evaluation_sac(env, logdir, device)

    # Automatic entropy tuning init
    target_entropy = -np.prod(action_space).item()
    log_alpha = torch.zeros(1, requires_grad=True, device=device)
    alpha_optimizer = optim.Adam([log_alpha], lr=LEARNING_RATE)

    home = os.path.expanduser('~')
    if DEMONSTRATIONS:
        dir_dem = os.path.join(home, 'robotics_drl/data/demonstrations/', DEMONSTRATIONS)
        D, n_demonstrations = load_buffer_demonstrations(D, dir_dem, PRIORITIZE_REPLAY, OBSERVATION_LOW)
    else:
        n_demonstrations = 0

    if not BEHAVIOR_CLONING:
        behavior_loss = 0

    os.mkdir(os.path.join(home, 'robotics_drl', logdir, 'models'))
    dir_models = os.path.join(home, 'robotics_drl', logdir, 'models')

    state, done = env.reset(), False
    if OBSERVATION_LOW:
        state = state.float().to(device)
    else:
        state['low'] = state['low'].float()
        state['high'] = state['high'].float()

    pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0)
    steps = 0
    success = 0
    for step in pbar:
        with torch.no_grad():
            if step < UPDATE_START and not DEMONSTRATIONS:
                # To improve exploration, take actions sampled from a uniform random
                # distribution over actions at the start of training
                action = torch.tensor(env.sample_action(), dtype=torch.float32, device=device).unsqueeze(dim=0)
            else:
                # Observe state s and select action a ~ mu(a|s)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().to(device)
                    state['high'] = state['high'].float().to(device)
                action, _ = actor(state, log_prob=False, deterministic=False)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().cpu()
                    state['high'] = state['high'].float().cpu()
                # if (policy.mean).mean() > 0.4:
                #     print("GOOD VELOCITY")

            # Execute a in the environment and observe next state s', reward r,
            # and done signal d to indicate whether s' is terminal
            next_state, reward, done = env.step(action.squeeze(dim=0).cpu().tolist())
            if OBSERVATION_LOW:
                next_state = next_state.float().to(device)
            else:
                next_state['low'] = next_state['low'].float()
                next_state['high'] = next_state['high'].float()

            # Store (s, a, r, s', d) in replay buffer D
            if PRIORITIZE_REPLAY:
                if OBSERVATION_LOW:
                    D.add(state.cpu().tolist(), action.cpu().squeeze().tolist(), reward, next_state.cpu().tolist(), done)
                else:
                    D.append(state['high'], state['low'], action.cpu().squeeze().tolist(), reward, done)
            else:
                D.append({
                    'state': state.unsqueeze(dim=0) if OBSERVATION_LOW else state,
                    'action': action,
                    'reward': torch.tensor([reward], dtype=torch.float32, device=device),
                    'next_state': next_state.unsqueeze(dim=0) if OBSERVATION_LOW else next_state,
                    'done': torch.tensor([True if reward == 1 else False], dtype=torch.float32, device=device)
                })
            state = next_state

            # If s' is terminal, reset environment state
            steps += 1
            if done or steps > step_limit:  # TODO: incorporate step limit in the environment
                eval_c2 = True  # TODO: multiprocess pyrep with a session for each testing and training
                steps = 0
                if OBSERVATION_LOW:
                    state = env.reset().float().to(device)
                else:
                    state = env.reset()
                    state['low'] = state['low'].float()
                    state['high'] = state['high'].float()
                if reward == 1:
                    success += 1

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            for _ in range(1):
                # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D
                if PRIORITIZE_REPLAY:
                    if OBSERVATION_LOW:
                        state_batch, action_batch, reward_batch, state_next_batch, done_batch, weights_pr, idxes = D.sample(BATCH_SIZE, BETA)
                        state_batch = torch.from_numpy(state_batch).float().to(device)
                        next_state_batch = torch.from_numpy(state_next_batch).float().to(device)
                        action_batch = torch.from_numpy(action_batch).float().to(device)
                        reward_batch = torch.from_numpy(reward_batch).float().to(device)
                        done_batch = torch.from_numpy(done_batch).float().to(device)
                        weights_pr = torch.from_numpy(weights_pr).float().to(device)
                    else:
                        idxes, high_state_batch, low_state_batch, action_batch, reward_batch, high_state_next_batch, low_state_next_batch, done_batch, weights_pr = D.sample(BATCH_SIZE)
                        state_batch = {
                            'low': low_state_batch.float().to(device).view(-1, 32),
                            'high': high_state_batch.float().to(device).view(-1, 12, 128, 128)
                        }
                        next_state_batch = {
                            'low': low_state_next_batch.float().to(device).view(-1, 32),
                            'high': high_state_next_batch.float().to(device).view(-1, 12, 128, 128)
                        }
                        action_batch = action_batch.float().to(device)
                        reward_batch = reward_batch.float().to(device)
                        done_batch = done_batch.float().to(device)
                        weights_pr = weights_pr.float().to(device)
                    # for j in range(BATCH_SIZE):
                    #     new_state_batch['high'] = torch.cat((new_state_batch['high'], state_batch[j].tolist()['high'].view(-1, (3 + 1) * env.frames, 128, 128)), dim=0)
                    #     new_state_batch['low'] = torch.cat((new_state_batch['low'], state_batch[j].tolist()['low'].view(-1, 32)), dim=0)
                    #     new_next_state_batch['high'] = torch.cat((new_next_state_batch['high'], state_next_batch[j].tolist()['high'].view(-1, (3 + 1) * env.frames, 128, 128)), dim=0)
                    #     new_next_state_batch['low'] = torch.cat((new_next_state_batch['low'], state_next_batch[j].tolist()['low'].view(-1, 32)), dim=0)
                    # new_state_batch['high'] = new_state_batch['high'].to(device)
                    # new_state_batch['low'] = new_state_batch['low'].to(device)
                    # new_next_state_batch['high'] = new_next_state_batch['high'].to(device)
                    # new_next_state_batch['low'] = new_next_state_batch['low'].to(device)
                    batch = {
                        'state': state_batch,
                        'action': action_batch,
                        'reward': reward_batch,
                        'next_state': next_state_batch,
                        'done': done_batch
                    }
                    state_batch = []
                    state_next_batch = []
                else:
                    batch = random.sample(D, BATCH_SIZE)
                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    state_next_batch = []
                    done_batch = []
                    for d in batch:
                        state_batch.append(d['state'])
                        action_batch.append(d['action'])
                        reward_batch.append(d['reward'])
                        state_next_batch.append(d['next_state'])
                        done_batch.append(d['done'])
                    batch = {
                        'state': torch.cat(state_batch, dim=0),
                        'action': torch.cat(action_batch, dim=0),
                        'reward': torch.cat(reward_batch, dim=0),
                        'next_state': torch.cat(state_next_batch, dim=0),
                        'done': torch.cat(done_batch, dim=0)
                    }

                action, log_prob = actor(batch['state'], log_prob=True, deterministic=False)

                # Automatic entropy tuning
                alpha_loss = -(log_alpha.float() * (log_prob + target_entropy).float().detach()).mean()
                alpha_optimizer.zero_grad()
                alpha_loss.backward()
                alpha_optimizer.step()
                alpha = log_alpha.exp()
                weighted_sample_entropy = (alpha.float() * log_prob).view(-1, 1)

                # Compute targets for Q and V functions
                if VALUE_FNC:
                    y_q = batch['reward'] + DISCOUNT * (1 - batch['done']) * target_value_critic(batch['next_state'])
                    y_v = torch.min(
                        critic_1(batch['state']['low'], action.detach()),
                        critic_2(batch['state']['low'], action.detach())
                    ) - weighted_sample_entropy.detach()
                else:
                    # No value function network
                    with torch.no_grad():
                        next_actions, next_log_prob = actor(batch['next_state'], log_prob=True, deterministic=False)
                        target_qs = torch.min(
                            target_critic_1(batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions),
                            target_critic_2(batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions)
                        ) - alpha * next_log_prob
                        y_q = batch['reward'] + DISCOUNT * (1 - batch['done']) * target_qs.detach()

                td_error_critic1 = critic_1(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], batch['action']) - y_q
                td_error_critic2 = critic_2(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], batch['action']) - y_q
                q_loss = td_error_critic1.pow(2).mean() + td_error_critic2.pow(2).mean()
                # q_loss = (F.mse_loss(critic_1(batch['state'], batch['action']), y_q) + F.mse_loss(critic_2(batch['state'], batch['action']), y_q)).mean()
                critics_optimiser.zero_grad()
                q_loss.backward()
                critics_optimiser.step()

                # Compute priorities, taking demonstrations into account
                if PRIORITIZE_REPLAY:
                    td_error = weights_pr * (td_error_critic1.detach() + td_error_critic2.detach()).mean()
                    action_dem = torch.tensor([]).to(device)
                    if OBSERVATION_LOW:
                        state_dem = torch.tensor([]).to(device)
                    else:
                        state_dem = {
                            'low': torch.tensor([]).float().to(device),
                            'high': torch.tensor([]).float().to(device)
                        }
                    priorities = torch.abs(td_error).tolist()
                    i = 0
                    count_dem = 0
                    for idx in idxes:
                        priorities[i] += epsilon
                        if idx < n_demonstrations:
                            priorities[i] += epsilon_d
                            count_dem += 1
                            if BEHAVIOR_CLONING:
                                action_dem = torch.cat((action_dem, batch['action'][i].view(1, -1)), dim=0)
                                if OBSERVATION_LOW:
                                    state_dem = torch.cat((state_dem, batch['state'][i].view(1, -1)), dim=0)
                                else:
                                    state_dem['high'] = torch.cat((state_dem['high'], batch['state']['high'][i, ].view(-1, (3 + 1) * env.frames, 128, 128)), dim=0)
                                    state_dem['low'] = torch.cat((state_dem['low'], batch['state']['low'][i, ].view(-1, 32)), dim=0)
                        i += 1

                    if not action_dem.nelement() == 0:
                        actual_action_dem, _ = actor(state_dem, log_prob=False, deterministic=True)
                        # q_value_actor = (critic_1(batch['state'][i], batch['action'][i]) + critic_2(batch['state'][i], batch['action'][i])) / 2
                        # q_value_actual = (critic_1(batch['state'][i], actual_action_dem) + critic_2(batch['state'][i], actual_action_dem)) / 2
                        # if q_value_actor > q_value_actual:  # Q Filter
                        behavior_loss = F.mse_loss(action_dem, actual_action_dem).unsqueeze(dim=0)
                    else:
                        behavior_loss = 0

                    D.update_priorities(idxes, priorities)
                    lambda_bc = (count_dem / BATCH_SIZE) / 5

                # Update V-function by one step of gradient descent
                if VALUE_FNC:
                    v_loss = (value_critic(batch['state']) - y_v).pow(2).mean().to(device)
                    value_critic_optimiser.zero_grad()
                    v_loss.backward()
                    value_critic_optimiser.step()

                # Update policy by one step of gradient ascent
                # NOTE: new_qs is computed under no_grad, so the actor gradient below
                # flows only through the entropy and behavior-cloning terms.
                with torch.no_grad():
                    new_qs = torch.min(
                        critic_1(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], action),
                        critic_2(batch['state']['low'] if not OBSERVATION_LOW else batch['state'], action)
                    )
                policy_loss = lambda_ac * (weighted_sample_entropy.view(-1) - new_qs).mean().to(device) + lambda_bc * behavior_loss
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Update target value network
                if VALUE_FNC:
                    update_target_network(value_critic, target_value_critic, POLYAK_FACTOR)
                else:
                    update_target_network(critic_1, target_critic_1, POLYAK_FACTOR)
                    update_target_network(critic_2, target_critic_2, POLYAK_FACTOR)
                state_dem = []

        # Continue sampling transitions until the episode is done and evaluation is on
        if step > UPDATE_START and step % TEST_INTERVAL == 0:
            eval_c = True
        else:
            eval_c = False

        if eval_c == True and eval_c2 == True:
            eval_c = False
            eval_c2 = False
            actor.eval()
            critic_1.eval()
            critic_2.eval()
            q_value_eval = eval_.get_qvalue(critic_1, critic_2)
            return_ep, steps_ep = eval_.sample_episode(actor)
            logz.log_tabular('Training steps', step)
            logz.log_tabular('Cumulative Success', success)
            logz.log_tabular('Validation return', return_ep.mean())
            logz.log_tabular('Validation steps', steps_ep.mean())
            logz.log_tabular('Validation return std', return_ep.std())
            logz.log_tabular('Validation steps std', steps_ep.std())
            logz.log_tabular('Q-value evaluation', q_value_eval)
            logz.log_tabular('Q-network loss', q_loss.detach().cpu().numpy())
            if VALUE_FNC:
                logz.log_tabular('Value-network loss', v_loss.detach().cpu().numpy())
            logz.log_tabular('Policy-network loss', policy_loss.detach().cpu().squeeze().numpy())
            logz.log_tabular('Alpha loss', alpha_loss.detach().cpu().numpy())
            logz.log_tabular('Alpha', alpha.detach().cpu().squeeze().numpy())
            logz.log_tabular('Demonstrations current batch', count_dem)
            logz.dump_tabular()
            logz.save_pytorch_model(actor.state_dict())
            torch.save(actor.state_dict(), os.path.join(dir_models, 'actor_model_%s.pkl' % (step)))
            torch.save(critic_1.state_dict(), os.path.join(dir_models, 'critic1_model_%s.pkl' % (step)))
            torch.save(critic_2.state_dict(), os.path.join(dir_models, 'critic2_model_%s.pkl' % (step)))
            # pbar.set_description('Step: %i | Reward: %f' % (step, return_ep.mean()))
            actor.train()
            critic_1.train()
            critic_2.train()

    env.terminate()
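# The SAC loop above relies on create_target_network and update_target_network, which are
# defined elsewhere in the project. A minimal sketch of what such helpers usually do;
# the signatures and the Polyak convention (POLYAK_FACTOR close to 1 keeps the target slow)
# are assumptions, not the project's actual implementation:
import copy


def create_target_network(network):
    """Return a detached deep copy of a network to serve as its target."""
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target


def update_target_network(network, target_network, polyak_factor):
    """Move the target network slowly towards the online network (Polyak averaging)."""
    for param, target_param in zip(network.parameters(), target_network.parameters()):
        target_param.data.copy_((1 - polyak_factor) * param.data + polyak_factor * target_param.data)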
LAMBDA = .95
EPSILON = .2
TARGET_DISCOUNT = .4
N_TIMESTEPS_PER_UPDATE = 300

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
env = gym.make('CartPole-v1')
replay_memory = ReplayMemory(memory_capacity)
policy_net = Actor(sum(env.observation_space.shape), 200, env.action_space.n)
value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net.load_state_dict(value_net.state_dict())
target_value_net.eval()
params = list(policy_net.parameters()) + list(value_net.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=.9, weight_decay=1e-6)
writer = SummaryWriter()
reward_normalizer = RewardNormalizer()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

global_t = 0
for ep in range(10000):  # episode loop
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~