class DDPG_Agent:

    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            index (int): Index assigned to the agent
            num_agents (int): Number of agents in the environment
        """
        self.state_size = state_size          # State size
        self.action_size = action_size        # Action size
        self.seed = torch.manual_seed(seed)   # Random seed
        self.index = index                    # Index of this agent, used to order the joint actions
        self.tau = TAU                        # Parameter for the soft weight update
        self.num_updates = N_UPDATES          # Number of updates to perform per learning step
        self.num_agents = num_agents          # Number of agents in the environment
        self.tstep = 0                        # Simulation step (modulo UPDATE_EVERY)
        self.gamma = GAMMA                    # Discount factor for the rewards
        self.alpha = ALPHA                    # PER: degree of prioritization (0 = uniform, 1 = full)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Prioritized replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, self.alpha)

    # act() is similar to the exercises and the MADDPG lab
    def act(self, states, noise=1.0):
        """Returns actions for the given states as per the current policy.

        Params
        ======
            states [n_agents, state_size]: current states
            noise (float): scale of the noise added to the actions (0 disables exploration noise)
        """
        # Convert the numpy state array into a tensor on the correct device
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))

        # Put the actor into evaluation mode
        self.actor_local.eval()

        # Get actions for the current states from the local actor network
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put the actor back into training mode
        self.actor_local.train()

        # Add Ornstein-Uhlenbeck noise for exploration
        actions += noise * self.noise.sample()

        # Clip to the valid action range
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, then sample from the buffer to learn.

        Params
        ======
            states      [n_agents, state_size]: current state
            actions     [n_agents, action_size]: taken action
            rewards     [n_agents]: earned reward
            next_states [n_agents, state_size]: next state
            dones       [n_agents]: whether the episode has finished
            beta        [0..1]: PER: correction for importance weights
                        (0 - no correction, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to
        # prioritization: we need to calculate a priority for the experience tuple,
        # which in our case is (Q_expected - Q_targets)**2.
        # ------------------------------------------------------------------
        # Set all involved networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)

        with torch.no_grad():
            # Next actions from the target actor for the next state (consistent with learn())
            next_actions = self.actor_target(next_state)
            own_action = action[:, self.index * self.action_size:(self.index + 1) * self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions), dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action), dim=1)

            # Predicted Q value from the critic target network
            Q_targets_next = self.critic_target(next_state, next_actions_agent).float()
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            Q_expected = self.critic_local(state, action)

            # Use the squared error between Q_expected and Q_targets as priority in the buffer
            error = (Q_expected - Q_targets)**2

        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # ------------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If it is an update step and enough samples are available in memory,
        # get a random subset and learn (num_updates times)
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise process of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using a given batch of experience tuples.
        Update according to
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where, as in the lessons,
            actor_target(state) gives the action and
            critic_target(state, action) gives the Q-value.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of
                states      states visited
                actions     actions taken by all agents
                rewards     rewards received
                next_states all next states
                dones       whether or not a final state is reached
                weights     importance-sampling weights of the experiences
                indices     indices of the experiences
        """
        # Unpack the sampled experiences
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #
        # Get next actions via the target actor network
        next_actions = self.actor_target(next_states)

        # Stack the predicted actions together with this agent's own actions
        own_actions = actions[:, self.index * self.action_size:(self.index + 1) * self.action_size]
        if self.index:
            # Agent 1
            next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
        else:
            # Agent 0: flipped order
            next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

        # Predicted Q value from the critic target network
        Q_targets_next = self.critic_target(next_states, next_actions_agent)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        # Update priorities in the replay buffer with the weighted squared TD errors
        loss = (Q_expected - Q_targets).pow(2).reshape(weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.data.cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Stack the predicted actions together with this agent's own actions
        own_actions = actions[:, self.index * self.action_size:(self.index + 1) * self.action_size]
        if self.index:
            # Agent 1
            actions_expected_agent = torch.cat((own_actions, actions_expected), dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions), dim=1)

        # Compute actor loss based on the expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft-update model parameters for the actor and critic networks:
            θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Save the agent to the local workspace.

        Params
        ======
            filename (string): where to save the weights
        """
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """Load weights into the agent's actor and critic networks.

        Expects the format produced by self.save().

        Params
        ======
            filename (string): where to load data from
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}")
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}")
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}")
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}")
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
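# ---------------------------------------------------------------------------
# The OUNoise class used by DDPG_Agent.act() is not included in the snippet
# above. The sketch below is a common Ornstein-Uhlenbeck implementation in the
# style of the exercises referenced in the comments; it is an assumption about
# that missing class, not necessarily the author's exact code.
# ---------------------------------------------------------------------------
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (assumed implementation)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-running mean
        self.theta = theta             # speed of mean reversion
        self.sigma = sigma             # scale of the random perturbation
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state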
class DQNAgent():
    def __init__(self, state_size, action_size):
        # If you want to watch CartPole learning, change this to True
        self.render = False
        self.load_model = False

        # Sizes of the state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 20000
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 5000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.batch_size = 64
        self.train_start = 1000

        # Create prioritized replay memory using a SumTree
        self.memory = PrioritizedReplayBuffer(self.memory_size)

        # Create the main model and the target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Initialize the target model with the main model's weights
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/cartpole_dqn')

    # Xavier weight initialization
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # After some time interval, update the target model to match the main model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # Get an action from the model using an epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state).float()
            q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # Save a sample (error, <s, a, r, s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        target = self.model(torch.FloatTensor(state)).data
        old_val = target[0][action]
        target_val = self.target_model(torch.FloatTensor(next_state)).data
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])
        self.memory.add(error, (state, action, reward, next_state, done))

    # Pick batch_size samples from the prioritized replay memory
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4]

        # bool to binary
        dones = dones.astype(int)

        # Q function of the current state
        states = torch.Tensor(states)
        pred = self.model(states)

        # One-hot encode the taken actions and select their Q values
        a = torch.LongTensor(actions).view(-1, 1)
        one_hot_action = torch.FloatTensor(self.batch_size, self.action_size).zero_()
        one_hot_action.scatter_(1, a, 1)
        pred = torch.sum(pred.mul(one_hot_action), dim=1)

        # Q function of the next state
        next_states = torch.Tensor(next_states)
        next_pred = self.target_model(next_states).data

        rewards = torch.FloatTensor(rewards)
        dones = torch.FloatTensor(dones)

        # Q-learning: take the maximum Q value at s' from the target model
        target = rewards + (1 - dones) * self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).data.numpy()

        # Update the priorities of the sampled transitions
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])
        self.optimizer.zero_grad()

        # Weighted MSE loss: per-sample squared errors scaled by the
        # importance-sampling weights, then averaged
        loss = (torch.FloatTensor(is_weights) *
                F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
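# ---------------------------------------------------------------------------
# A driver loop is not part of the DQNAgent snippet above. The sketch below
# shows how its methods are typically wired together on CartPole; the gym
# environment name, the classic reset()/step() API, the episode count and the
# transition counter are assumptions for illustration only.
# ---------------------------------------------------------------------------
import gym
import numpy as np

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

n_stored = 0  # counts stored transitions (avoids assuming a __len__ on the buffer)
for episode in range(300):
    state = env.reset()                                  # classic gym API assumed
    state = np.reshape(state, [1, state_size])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        # Store the transition together with its initial TD-error priority
        agent.append_sample(state, action, reward, next_state, done)
        n_stored += 1

        # Start training once enough transitions have been collected
        if n_stored >= agent.train_start:
            agent.train_model()

        state = next_state

    # Sync the target network at the end of every episode
    agent.update_target_model()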
class Prioritized(DQN):
    def __init__(self, env, model, target_model, config, name_agent="prioritized-dqn"):
        self.name_agent = name_agent
        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n
        self.epsilon = config.epsilon_start
        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay
        self.gamma = config.gamma
        self.update_nb_iter = config.update_nb_iter

        # Change the buffer: use a prioritized buffer
        # instead of a uniform-probability buffer
        self.replay_buffer = PrioritizedReplayBuffer(10000, config.batch_size,
                                                     config.w, config.beta_final,
                                                     config.beta_start, config.beta_decay)
        self.environment = env
        self.batch_size = config.batch_size
        self.model = model
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)
        self.loss_data = []
        self.rewards = []

    def loss(self):
        """The loss is equal to

            (R_{t+1} + γ_{t+1} * qθ_barre(S_{t+1}, argmax_{a'} qθ(S_{t+1}, a')) - qθ(S_t, A_t))^2

        where qθ is the online network and qθ_barre the target network.
        """
        states, actions, rewards, next_states, finish, indices, weight = self.replay_buffer.sample()
        actions = actions.long()

        # qθ(St, At)
        q0 = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # qθ_barre(St+1, argmax_a' qθ(St+1, a')): select a' with the online
        # network, evaluate it with the target network; no gradient flows
        # through the bootstrap target
        with torch.no_grad():
            next_a = self.model(next_states).argmax(dim=1, keepdim=True)
            max_next_q0 = self.target_model(next_states).gather(1, next_a).squeeze(1) * (1 - finish)
        Rt_gamma_max = rewards + self.gamma * max_next_q0

        # Per-sample squared TD error, scaled by the importance-sampling weights
        loss = (q0 - Rt_gamma_max).pow(2) * weight

        # Update the priorities in the buffer
        self.replay_buffer.add_p(indices, loss.detach().numpy())

        loss = loss.sum()
        return loss
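# ---------------------------------------------------------------------------
# The training step that consumes Prioritized.loss() (presumably provided by
# the DQN base class) is not shown. The sketch below illustrates how the loss
# would typically be used; the function name and the target-sync rule are
# assumptions, built only on attributes defined in __init__ above.
# ---------------------------------------------------------------------------
def train_step(agent, iteration):
    """Hypothetical optimization step for the Prioritized agent."""
    loss = agent.loss()            # weighted TD error; priorities already updated inside loss()
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
    agent.loss_data.append(loss.item())

    # Periodically copy the online weights into the target network
    if iteration % agent.update_nb_iter == 0:
        agent.target_model.load_state_dict(agent.model.state_dict())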