class DDQN(nn.Module):
    def __init__(self, obs, ac, config):
        super().__init__()
        self.q = QNetwork(obs, ac)
        self.target = QNetwork(obs, ac)
        self.target.load_state_dict(self.q.state_dict())
        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):
        with torch.no_grad():
            a = self.q(x).max(1)[1]
        return a.item()

    def update_policy(self, adam, memory, params):
        b_states, b_actions, b_rewards, b_next_states, b_masks = memory.sample(params.batch_size)

        states = torch.tensor(b_states).float()
        actions = torch.tensor(b_actions).long().reshape(-1, 1)
        rewards = torch.tensor(b_rewards).float().reshape(-1, 1)
        next_states = torch.tensor(b_next_states).float()
        masks = torch.tensor(b_masks).float().reshape(-1, 1)

        current_q_values = self.q(states).gather(1, actions)
        with torch.no_grad():
            max_next_q_vals = self.target(next_states).max(1)[0].reshape(-1, 1)
        # Note: the discount factor 0.99 is hardcoded here
        expected_q_vals = rewards + max_next_q_vals * 0.99 * masks

        loss = F.mse_loss(expected_q_vals, current_q_values)

        adam.zero_grad()
        loss.backward()
        # Clamp gradients element-wise to stabilize training
        for p in self.q.parameters():
            p.grad.data.clamp_(-1., 1.)
        adam.step()

        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())
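# The QNetwork used by DDQN above is not shown in this section. A minimal
# sketch, assuming a flat observation vector and a plain two-layer MLP; the
# hidden size of 128 is an assumed placeholder, not taken from the source.
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Minimal MLP Q-network sketch: maps an observation vector to one
    Q-value per discrete action."""
    def __init__(self, obs_dim, n_actions, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)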
class DQNAgent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.policy_network = QNetwork(state_size, action_size).to(device)
        self.target_network = QNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=LR)
        self.eps = EPS_START
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        self.learn_count = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.store_transition(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample(BATCH_SIZE, device)
            self.learn(experiences)

    def act(self, state):
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_size)
        else:
            # .float() guards against float64 observations from numpy
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            action_values = self.policy_network(state)
            return torch.argmax(action_values).item()

    def update_eps(self):
        self.eps = max(EPS_END, EPS_DECAY * self.eps)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        Q_current = self.policy_network(states).gather(1, actions)
        # Bootstrap from the target network; detach so no gradients flow into it
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)
        loss = F.mse_loss(Q_current, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.learn_count += 1
        if self.learn_count % SYNC_TARGET_EVERY == 0:
            self.target_network.load_state_dict(self.policy_network.state_dict())
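# DQNAgent references module-level constants (LR, GAMMA, BUFFER_SIZE, ...)
# and a `device` handle that are defined elsewhere. A hedged sketch with
# purely illustrative placeholder values; the source does not show the real ones.
import torch

BUFFER_SIZE = int(1e5)     # replay buffer capacity (assumed)
BATCH_SIZE = 64            # minibatch size (assumed)
GAMMA = 0.99               # discount factor (assumed)
LR = 5e-4                  # Adam learning rate (assumed)
UPDATE_EVERY = 4           # env steps between learning updates (assumed)
SYNC_TARGET_EVERY = 1000   # learning steps between target-network syncs (assumed)
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.01, 0.995  # epsilon schedule (assumed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")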
class SAC(object):
    def __init__(self, num_inputs, action_space, config):
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.alpha = config['alpha']
        self.policy_type = config['policy']
        self.target_update_interval = config['target_update_interval']
        self.automatic_entropy_tuning = config['automatic_entropy_tuning']

        self.device = torch.device('cuda:' + str(config['cuda'])) \
            if torch.cuda.is_available() and config['cuda'] >= 0 else torch.device('cpu')

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               config['hidden_size']).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      config['hidden_size']).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])
            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         config['hidden_size'], action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            action, _, _ = self.policy.sample(state)   # stochastic action for training
        else:
            _, _, action = self.policy.sample(state)   # deterministic mean action for evaluation
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
            memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch,
                                                                  next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) \
                - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q(st,at) - (r(st,at) + γ 𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # The two Q-heads are assumed to share no parameters, so their losses
        # can be backpropagated and stepped one after the other.
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), \
            alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, save_path=None, env_name=None, suffix=None):
        if save_path is None:
            save_path = './models/'
        actor_path = '{}actor_{}_{}'.format(save_path, env_name, suffix)
        critic_path = '{}critic_{}_{}'.format(save_path, env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
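# All the SAC variants in this section call hard_update and soft_update
# helpers that are defined elsewhere. A minimal sketch of the usual
# implementations; soft_update matches the Polyak-averaging formula the
# agents' own soft_update methods use, so only the function form is assumed.
def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    """Polyak averaging: θ_target ← τ·θ_source + (1 − τ)·θ_target."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)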
class AgentPriority():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, hidden_layers,
                 lr=5e-4, alpha=0.5, beta=0.4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list[int, int, ...]): size of hidden layers
            lr (float): learning rate
            alpha (float, 0 <= alpha <= 1): priority exponent
            beta (float, 0 <= beta <= 1): importance-sampling weight exponent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.lr = lr
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed,
                                        hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   self.alpha, self.beta)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # discount
        self.gamma = GAMMA

        self.checkpoint = {
            "input_size": self.state_size,
            "output_size": self.action_size,
            "hidden_layers": [each.out_features
                              for each in self.qnetwork_local.hidden_layers],
            "state_dict": self.qnetwork_local.state_dict()
        }
        self.checkpointfile = 'priority_ddqn.pth'

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory, with its initial TD error as priority
        delta = self.comp_delta(state, action, reward, next_state, done)
        self.memory.add(state, action, reward, next_state, done, delta)

        # Learn NUM_LEARNS times per every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE:
            self.memory.set_priority_params(self.alpha, self.beta)
            for i in range(NUM_LEARNS):
                if i % SORT_EVERY == 0:
                    # Sort memory based on delta every SORT_EVERY learning steps
                    self.memory.argsort_deltas()
                    # Update q_target with q_local
                    self.update_qtarget()
                    # If PARAMETER_ANNEALING is set to True, anneal alpha & beta.
                    if PARAMETER_ANNEALING:
                        self.parameter_anneal()
                experiences, weights, mem_idxs = self.memory.sample()
                self.learn(experiences, weights, mem_idxs)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences, weights, mem_idxs):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            weights (torch.Tensor): importance-sampling weights for the batch
            mem_idxs (list of ints): indices in the replay buffer corresponding
                to the given experiences (used to update delta)
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: choose greedy next actions with the local network...
        Q_local_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        # ...and evaluate those actions with the target network
        Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_local_actions).detach()
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Update deltas in self.memory
        deltas = (Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_deltas(deltas, mem_idxs)

        # Compute importance-sampling-weighted loss
        loss = F.mse_loss(weights * Q_expected, weights * Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_qtarget(self):
        # Hard update: copy the local network weights into the target network
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

    def comp_delta(self, state, action, reward, next_state, done):
        """Compute delta for a single experience:
        delta = reward + gamma * max_a(Q_target(next_state, a)) - Q_local(state, action)
        """
        state_ts = torch.from_numpy(np.expand_dims(state, 0)).float().to(device)
        action_ts = torch.from_numpy(np.array([[action]])).long().to(device)
        reward_ts = torch.from_numpy(np.array([[reward]])).float().to(device)
        next_state_ts = torch.from_numpy(np.expand_dims(next_state, 0)).float().to(device)
        done_ts = torch.from_numpy(np.array([[int(done)]])).float().to(device)

        Q_targets_next = self.qnetwork_target(next_state_ts).detach().max(1)[0].unsqueeze(1)
        Q_targets = reward_ts + (self.gamma * Q_targets_next * (1 - done_ts))
        Q_expected = self.qnetwork_local(state_ts).gather(1, action_ts)
        delta = (Q_targets - Q_expected).detach().cpu().numpy()[0, 0]
        return delta

    def get_gamma(self):
        return self.gamma

    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)

    def set_lr(self, lr):
        self.lr = lr

    def load_model(self, filepath):
        checkpoint = torch.load(filepath)
        self.qnetwork_local = QNetwork(checkpoint["input_size"],
                                       checkpoint["output_size"],
                                       self.seed,
                                       checkpoint["hidden_layers"])
        self.qnetwork_local.load_state_dict(checkpoint["state_dict"])

    def set_uniform_sampling(self):
        """Set alpha to 0.0 and beta to 1.0 so that the agent becomes
        equivalent to uniform sampling."""
        self.alpha = 0.0
        self.beta = 1.0
        self.memory.set_priority_params(self.alpha, self.beta)

    def parameter_anneal(self):
        self.alpha = max(0.0, self.alpha - ALPHA_ANNEALING)
        self.beta = min(1.0, self.beta + BETA_ANNEALING)
        self.memory.set_priority_params(self.alpha, self.beta)
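# The prioritized ReplayBuffer used by AgentPriority is assumed to implement
# the proportional-prioritization scheme of Schaul et al. (2016). A small
# illustrative sketch of the sampling probabilities and importance-sampling
# weights it would compute; the function name is hypothetical.
import numpy as np

def per_probs_and_weights(deltas, alpha, beta, eps=1e-5):
    """Proportional PER: P(i) = p_i^alpha / sum_j p_j^alpha with
    p_i = |delta_i| + eps, and importance-sampling weights
    w_i = (N * P(i))^-beta, normalized by the maximum weight."""
    p = (np.abs(deltas) + eps) ** alpha
    probs = p / p.sum()
    w = (len(deltas) * probs) ** (-beta)
    return probs, w / w.max()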
class Agent():
    """
    Creates an agent that interacts with a Unity-ML Environment
    using a Deep Q-learning model (in pytorch).
    """

    def __init__(self, n_state, n_actions, n_hidden=64, n_layers=2,
                 seed=333, snapshotfile="snapshot.pth"):
        """
        Initialize the agent.

        Args:
            n_state (int): Number of features that represent the state
            n_actions (int): Number of actions available to agent
            n_hidden (int): Number of units in hidden neural net layers
            n_layers (int): Number of layers for neural network
            seed (int): Set the random seed (for reproducibility)
            snapshotfile (str): Filepath to use for saving weights
        """
        self.n_state = n_state
        self.n_actions = n_actions
        self.seed = random.seed(seed)
        self.snapshotfile = snapshotfile

        # Deep Q-Network (n_hidden is passed through so the constructor
        # argument actually takes effect; default matches the previous
        # hardcoded value of 64)
        self.qnetwork_local = QNetwork(n_state, n_actions, seed,
                                       n_hidden=n_hidden).to(device)
        self.qnetwork_target = QNetwork(n_state, n_actions, seed,
                                        n_hidden=n_hidden).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss_func = torch.nn.MSELoss(reduction='mean')

        # Experience Replay Memory
        self.memory = ReplayBuffer(n_actions, EXPERIENCE_MEMORY_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # TODO: have the is_training attribute control eval and train
        # mode in pytorch network
        self.is_training = True

    def memorize_and_learn_step(self, state, action, reward, next_state, done):
        """
        Given S,A,R',S' and whether the episode is finished, saves the
        experience to memory, and occasionally samples from memorized
        experiences and learns from those memories.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Once every UPDATE_EVERY steps, randomly sample memories to learn from
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def choose_action(self, state, epsilon=0.0):
        """
        Given an environment state, returns an action using an epsilon-greedy policy.

        Args:
            state (array_like): current state
            epsilon (float): probability of choosing a random action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():  # temporarily disable gradient tracking
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.n_actions))

    def learn(self, experiences, gamma):
        """
        Update the weights of the neural network representing the Q values,
        given a batch of experience tuples.

        Args:
            experiences (tuple of torch.Tensor): tuple with the following
                torch tensors (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q_TARGET
        next_logits = self.qnetwork_target(next_states).detach()  # no gradients needed
        q_next = torch.max(next_logits, dim=1, keepdim=True)[0]
        # where dones=1, ignore q_next and just use the current reward
        q_target = rewards + ((1 - dones) * (gamma * q_next))

        # Q_CURRENT - based on action taken in experience
        current_logits = self.qnetwork_local(states)
        q_pred = torch.gather(current_logits, 1, actions)

        # LOSS
        loss = self.loss_func(q_pred, q_target)

        # OPTIMIZE WEIGHTS
        self.optimizer.zero_grad()  # zero the parameter gradients
        loss.backward()
        self.optimizer.step()

        # UPDATE TARGET NETWORK
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Performs a soft update on the target Q network weights, shifting them
        slightly towards the local Q network by a factor of `tau`.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def snapshot(self, file=None):
        """Takes a snapshot file of the neural network weights."""
        file = self.snapshotfile if file is None else file
        torch.save(self.qnetwork_local.state_dict(), file)

    def load_snapshot(self, file=None):
        """Loads the neural network weights from a file."""
        file = self.snapshotfile if file is None else file
        self.qnetwork_local.load_state_dict(torch.load(file))
        self.qnetwork_target.load_state_dict(torch.load(file))
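# The ReplayBuffer this agent constructs is not shown. A compact sketch,
# assuming uniform sampling and the (states, actions, rewards, next_states,
# dones) tensor tuple that learn() unpacks; it reuses the module-level
# `device`. Field names and internals are assumptions.
import random
from collections import deque, namedtuple
import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Uniform experience replay sketch matching the interface used above."""
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        def to_t(xs, dtype):
            # vstack turns a list of scalars/vectors into a [batch, k] array
            return torch.tensor(np.vstack(xs), dtype=dtype).to(device)
        return (to_t([e.state for e in batch], torch.float32),
                to_t([e.action for e in batch], torch.int64),
                to_t([e.reward for e in batch], torch.float32),
                to_t([e.next_state for e in batch], torch.float32),
                to_t([int(e.done) for e in batch], torch.float32))

    def __len__(self):
        return len(self.memory)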
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            # Separate soft value function with its own target, as in the
            # original SAC paper
            self.value = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            # A separate function approximator for the soft value can stabilize training.
            vf = self.value(state_batch)
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * vf_next_target
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            with torch.no_grad():
                next_state_action, _, _, _, _ = self.policy.sample(next_state_batch)
                # Use a target critic network for the deterministic policy and
                # drop the value network entirely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # JQ = 𝔼(st,at)~D[0.5(Q(st,at) - (r(st,at) + γ 𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            vf_target = min_qf_pi - (self.alpha * log_pi)
            value_loss = F.mse_loss(vf, vf_target.detach())

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Regularization Loss (disabled)
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()
        # policy_loss += mean_loss + std_loss

        # The two Q-heads are assumed to share no parameters, so their losses
        # can be backpropagated and stepped one after the other.
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # Periodically update the target network weights to match the current
        # value-function weights (every args.target_update_interval updates).
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), qf1_loss.item(), qf2_loss.item(), \
            policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None,
                   critic_path=None, value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA,
                 target_update_every=T_UPDATE, tau=TAU, lr=LR,
                 weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY,
                 priority_eps=P_EPS, a=A, initial_beta=INIT_BETA,
                 n_multisteps=N_STEPS, clip=CLIP, initial_sigma=INIT_SIGMA,
                 linear_type=LINEAR, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update (learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            clip (float): gradient norm clipping (`None` to disable)
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
        """
        if kwds != {}:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        if clip:
            assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and \
            linear_type.strip().lower() in ('linear', 'noisy')

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.start_since = start_since
        self.gamma = gamma
        self.target_update_every = target_update_every
        self.tau = tau
        self.lr = lr
        self.weight_decay = weight_decay
        self.update_every = update_every
        self.priority_eps = priority_eps
        self.a = a
        self.beta = initial_beta
        self.n_multisteps = n_multisteps
        self.clip = clip
        self.initial_sigma = initial_sigma
        self.linear_type = linear_type.strip().lower()

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, linear_type,
                                       initial_sigma, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, linear_type,
                                        initial_sigma, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   n_multisteps, gamma, a, seed)
        # Initialize time steps (for updating every update_every steps
        # and target_update_every steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = \
                    self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # Update the target network every target_update_every time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value
                (already raised to the n-step power by the buffer)

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Double-DQN target: local network picks the argmax action,
            # target network evaluates it
            target = rewards + gamma * (1 - dones) * self.qnetwork_target(next_states)\
                .gather(dim=1, index=self.qnetwork_local(next_states)
                        .argmax(dim=1, keepdim=True))

        pred = self.qnetwork_local(states)
        diff = target.sub(pred.gather(dim=1, index=actions))
        # Use the instance-level priority epsilon rather than the module default
        new_priorities = diff.detach().abs().add(self.priority_eps).cpu().numpy().reshape((-1,))
        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            # Clip to the instance-level norm bound rather than the module default
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
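# The multistep ReplayBuffer above is assumed to fold n_multisteps rewards
# into each stored transition and to return the matching bootstrap discount.
# A small sketch of the usual n-step return computation; the function name
# is hypothetical.
def n_step_return(rewards, gamma):
    """Collapse a window of rewards [r_0, ..., r_{n-1}] into the n-step
    return sum_k gamma^k * r_k; the matching bootstrap discount applied to
    the target max-Q value is gamma ** n."""
    ret = 0.0
    for r in reversed(rewards):
        ret = r + gamma * ret
    return ret, gamma ** len(rewards)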
class SAC(object):
    def __init__(self, num_inputs, action_space, variant):
        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)
            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0],
                                              self.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
            memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) \
                - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q(st,at) - (r(st,at) + γ 𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        # Sample a batch of actions and the corresponding log_pi
        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # The two Q-heads are assumed to share no parameters, so their losses
        # can be backpropagated and stepped one after the other.
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.0).to(self.device)

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item()

    def save_model(self, env_name, suffix=".pkl", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print("Saving models to {} and {}".format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, double_dqn=False,
                 dueling_network=False, prioritized_replay=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): use Double DQN method
            dueling_network (bool): use Dueling Network
            prioritized_replay (bool): use Prioritized Replay Buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.dueling_network = dueling_network
        self.double_dqn = double_dqn
        self.prioritized_replay = prioritized_replay
        random.seed(seed)

        # Q-Network
        self.hidden_layers = [128, 32]
        if self.dueling_network:
            self.hidden_state_value_layers = [64, 32]
            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target.eval()
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           self.hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            self.hidden_layers).to(device)
            self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, LR_DECAY)

        # Replay memory
        if prioritized_replay:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                                  seed, device, alpha=0.6, beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load(self, filepath):
        # Load weights from file
        state_dict = torch.load(filepath)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_local.eval()

    def save(self, filepath):
        # Save weights to file
        torch.save(self.qnetwork_local.state_dict(), filepath)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() >= eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        with torch.no_grad():
            # Double DQN: select the greedy actions (maximum Q for next states)
            # with the local model, evaluate them with the target model
            if self.double_dqn:
                greedy_actions = self.qnetwork_local(next_states).max(dim=1, keepdim=True)[1]
                q_targets_next = self.qnetwork_target(next_states).gather(1, greedy_actions)
            # Fixed Q-target: get max predicted Q values (for next states)
            # from the target model
            else:
                q_targets_next = self.qnetwork_target(next_states).max(dim=1, keepdim=True)[0]

            # Compute Q targets for current states
            q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model; shape: [batch_size, 1]
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.prioritized_replay:
            td_error = (q_targets - q_expected).squeeze()
            # New priorities are the absolute TD errors
            with torch.no_grad():
                self.memory.update_priorities(td_error.abs())
            # Importance-sampling-weighted squared TD error
            loss = (w * td_error.pow(2)).mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
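# The DuelingQNetwork referenced above is defined elsewhere, with a richer
# signature than shown here. A minimal sketch of the standard dueling-head
# aggregation only; layer sizes and structure are illustrative assumptions.
import torch
import torch.nn as nn

class DuelingQNetwork(nn.Module):
    """Minimal dueling-head sketch: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)."""
    def __init__(self, state_size, action_size, seed, hidden=128):
        super().__init__()
        torch.manual_seed(seed)
        self.trunk = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                # state-value stream V(s)
        self.advantage = nn.Linear(hidden, action_size)  # advantage stream A(s,a)

    def forward(self, state):
        h = self.trunk(state)
        v, a = self.value(h), self.advantage(h)
        # Subtract the mean advantage so V and A are identifiable
        return v + a - a.mean(dim=1, keepdim=True)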
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=42, hidden_layers=[32, 8]):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # detect GPU device
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayMemory(BUFFER_SIZE, BATCH_SIZE, self.device, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % UPDATE_EVERY == 0:
            if self.memory.length > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, s', r, done) tuples
            gamma (float): discount factor
        """
        states, actions, next_states, rewards, dones = experiences

        self.qnetwork_target.eval()
        with torch.no_grad():
            # Double-DQN style bootstrap: the local network picks the greedy
            # next action, the target network evaluates it
            next_q_local = self.qnetwork_local(next_states)
            action_argmax = torch.max(next_q_local, dim=1, keepdim=True)[1]
            q_targets_next = self.qnetwork_target(next_states).gather(1, action_argmax)
            # Q-target for current state; (1 - dones) considers only non-terminal states
            targets = rewards + gamma * q_targets_next * (1 - dones)
        self.qnetwork_target.train()

        expected = self.qnetwork_local(states).gather(1, actions)
        loss = torch.sum((expected - targets) ** 2)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def train(self, env, brain_name, n_episodes=2000, timesteps=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train the model network applying experience replay.

        Params
        ======
            env (UnityEnvironment): environment the agent interacts with
            brain_name (str): name of the Unity brain to control
            n_episodes (int): number of games played
            timesteps (int): max number of steps to be played in the game
            eps_start (float): initial proportion of random actions on epsilon-greedy action
            eps_end (float): final proportion of random actions on epsilon-greedy action
            eps_decay (float): epsilon decay rate
        """
        scores = []
        last_scores = deque(maxlen=100)
        eps = eps_start
        for i_episode in range(n_episodes):
            env_status = env.reset(train_mode=True)[brain_name]
            state = env_status.vector_observations[0]  # get initial state
            score = 0
            for _ in range(timesteps):
                action = self.act(state, eps).astype(int)
                env_status = env.step(action)[brain_name]
                next_state = env_status.vector_observations[0]
                reward = env_status.rewards[0]
                done = env_status.local_done[0]
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores.append(score)
            last_scores.append(score)
            eps = max(eps_end, eps * eps_decay)  # decrease epsilon
            print('\rEpisode {}\tScores mean: {:.2f}'.format(
                i_episode, np.mean(last_scores)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tLast 100 scores mean: {:.2f}'.format(
                    i_episode, np.mean(last_scores)))
            if np.mean(last_scores) >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tScores mean: {:.2f}'
                      .format(i_episode - 100, np.mean(last_scores)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores
class SAC(object):
    """
    SAC class from Haarnoja et al. (2018).
    We leave the option to use automatic_entropy_tuning to avoid
    hand-selecting the entropy rate alpha.
    """

    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.num_layers = args.num_layers
        self.args = args
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, self.num_layers,
                                     args).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        """
        Select action for a state.
        (Train) Sample an action from NF{N(mu(s), Sigma(s))}
        (Eval)  Pass mu(s) through NF{}
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            action, _, _, _, _ = self.policy.evaluate(state, eval=True)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, memory, batch_size, updates):
        """
        Update parameters of SAC-NF.
        Exactly like SAC, but keeps two separate Adam optimizers for the
        Gaussian policy AND the NF layers, calling .backward() on them sequentially.
        """
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
            memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # for visualization
        info = {}

        # update critic
        with torch.no_grad():
            next_state_action, next_state_log_pi, _, _, _ = \
                self.policy.evaluate(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) \
                - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q(st,at) - (r(st,at) + γ 𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _, _, _ = self.policy.evaluate(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        nf_loss = ((self.alpha * log_pi) - min_qf_pi).mean()  # currently unused

        # update
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        # update target value functions
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), \
            alpha_loss.item(), alpha_tlogs.item(), info

    def save_model(self, info):
        """Save the weights of the network (actor and critic separately)."""
        # policy
        save_checkpoint(
            {
                **info,
                'state_dict': self.policy.state_dict(),
                'optimizer': self.policy_optim.state_dict(),
            },
            self.args, filename='policy-ckpt.pth.tar')
        # critic
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic.state_dict(),
                'optimizer': self.critic_optim.state_dict(),
            },
            self.args, filename='critic-ckpt.pth.tar')
        save_checkpoint(
            {
                **info,
                'state_dict': self.critic_target.state_dict(),
            },
            self.args, filename='critic_target-ckpt.pth.tar')

    def load_model(self, args):
        """Jointly or separately load actor and critic weights."""
        # policy
        load_checkpoint(
            model=self.policy,
            optimizer=self.policy_optim,
            opt=args,
            device=self.device,
            filename='policy-ckpt.pth.tar',
        )
        # critic
        load_checkpoint(
            model=self.critic,
            optimizer=self.critic_optim,
            opt=args,
            device=self.device,
            filename='critic-ckpt.pth.tar',
        )
        load_checkpoint(
            model=self.critic_target,
            opt=args,
            device=self.device,
            filename='critic_target-ckpt.pth.tar',
        )
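# save_checkpoint and load_checkpoint above are project-specific helpers not
# shown in this section. A hedged sketch of what they might look like,
# assuming simple torch.save/torch.load wrappers keyed by filename; the
# opt.save_dir attribute is an assumption, not taken from the source.
import os
import torch

def save_checkpoint(state, opt, filename='ckpt.pth.tar'):
    """Hypothetical helper: persist a checkpoint dict under opt.save_dir."""
    os.makedirs(opt.save_dir, exist_ok=True)
    torch.save(state, os.path.join(opt.save_dir, filename))

def load_checkpoint(model, opt, device, optimizer=None, filename='ckpt.pth.tar'):
    """Hypothetical helper: restore model (and optionally optimizer) state."""
    ckpt = torch.load(os.path.join(opt.save_dir, filename), map_location=device)
    model.load_state_dict(ckpt['state_dict'])
    if optimizer is not None and 'optimizer' in ckpt:
        optimizer.load_state_dict(ckpt['optimizer'])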
class SAC(object):
    def __init__(self):
        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
        self.lr = 0.003
        self.target_update_interval = 1
        self.device = torch.device("cpu")

        # 8 phases
        self.num_inputs = 8
        self.num_actions = 1
        self.hidden_size = 256

        self.critic = QNetwork(self.num_inputs, self.num_actions,
                               self.hidden_size).to(self.device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)
        self.critic_target = QNetwork(self.num_inputs, self.num_actions,
                                      self.hidden_size).to(self.device)
        # Copy the parameters of critic to critic_target
        hard_update(self.critic_target, self.critic)

        self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.num_actions,
                                     self.hidden_size).to(self.device)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        # Deterministic (mean) action; if the action tensor lives on a CUDA
        # device, .detach().cpu().numpy() is needed to obtain a numpy array
        _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = \
            memory.sample(batch_size=batch_size)
        action_batch = np.expand_dims(action_batch, axis=1)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        # unsqueeze adds a trailing dimension so rewards/masks broadcast as columns
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) \
                - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q(st,at) - (r(st,at) + γ 𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optimizer.zero_grad()  # clear accumulated gradients
        qf_loss.backward()                 # compute gradients via backward()
        self.critic_optimizer.step()       # update the parameters from the gradients

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # automatic entropy tuning
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), \
            alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        # Create a models directory in the current location if needed
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        # state_dict() holds the learnable parameters of the network
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def get_alpha(self):
        return self.alpha
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=0, double_dqn=False, dueling=False, per=False, per_args=(0.2, 0.01, 2e-5)): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed double_dqn (bool): whether to implement Double DQN (default=False) dueling (bool): whether to implement Dueling DQN per (bool): whether to implement Prioritized Experience Replay per_args (tuple): a,beta,beta_increment for PER """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.double_dqn = double_dqn self.per = per self.gamma = GAMMA # output name for checkpoint self.output_name = '' self.output_name += '_double' if double_dqn else '' self.output_name += '_dueling' if dueling else '' self.output_name += '_per' if per else '' # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling=dueling).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling=dueling).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory if self.per: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, *per_args) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def train(self, env, n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): """Deep Q-Learning. Params ====== env (UnityEnvironment): Bananas environment n_episodes (int): maximum number of training episodes max_t (int): maximum number of timesteps per episode eps_start (float): starting value of epsilon, for epsilon-greedy action selection eps_end (float): minimum value of epsilon eps_decay (float): multiplicative factor (per episode) for decreasing epsilon """ # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # list containing scores from each episode scores = [] # list containing window averaged scores avg_scores = [] # last 100 scores scores_window = deque(maxlen=100) # initialize epsilon eps = eps_start for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] state = env_info.vector_observations[0] score = 0 for t in range(max_t): action = self.act(state, eps) env_info = env.step(action)[brain_name] # get the next state next_state = env_info.vector_observations[0] # get the reward reward = env_info.rewards[0] # see if episode has finished done = env_info.local_done[0] self.step((state, action, reward, next_state, done)) state = next_state score += reward if done: break # save most recent score scores_window.append(score) scores.append(score) avg_scores.append(np.mean(scores_window)) # decrease epsilon eps = max(eps_end, eps_decay * eps) print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window)), end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) if np.mean(scores_window) >= 13.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(scores_window))) torch.save(self.qnetwork_local.state_dict(), f'./checkpoints/checkpoint{self.output_name}.pth') break return scores, avg_scores def step(self, experience): """Save experience in replay memory and learn. 
        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using given batch of experience tuples."""
        # if using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = self.memory.sample()
        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # if Double DQN
        if self.double_dqn:
            # Select next actions with the local model, evaluate them with the target model
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions)
        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.per:
            # Weight each sample's squared TD error by its importance-sampling
            # weight; reduction='none' keeps the per-sample errors so the
            # weights apply elementwise before averaging
            is_weights = torch.FloatTensor(is_weights).to(device).view(-1, 1)
            loss = (is_weights * F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # if PER, update priorities with the new absolute TD errors
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
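# Standalone check of the PER-weighted loss used in learn() above: the squared
# TD errors must stay per-sample (reduction='none') so the importance-sampling
# weights apply elementwise before the mean. Tensors below are illustrative.
import torch
import torch.nn.functional as F

q_expected = torch.tensor([[1.0], [2.0]])
q_targets = torch.tensor([[1.5], [1.0]])
is_weights = torch.tensor([[0.5], [1.0]])

elementwise = F.mse_loss(q_expected, q_targets, reduction='none')  # [[0.25], [1.0]]
loss = (is_weights * elementwise).mean()
print(loss)  # 0.5625, vs. 0.625 for the unweighted mean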
class Agent(): def __init__(self, state_size, action_size, seed): """ Initialize an agent object Params ====== state_size(int) => dimensions of each state action_size (int) => dimension of each action seed (int) => random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def act(self, state, eps=0.): """Returns action for a given state as per current policy. Params ====== state (array) => current state eps (float) => epsilon for epsilon greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def save_model(self, path): """Save current model. Params ====== path (string) => file path where model will be saved. """ torch.save(self.qnetwork_local.state_dict(), path) def restore_model(self, path): """Restore model from a file. Params ====== path (string) => file path from where to load the model. """ self.qnetwork_local.load_state_dict(torch.load(path)) self.qnetwork_target.load_state_dict(torch.load(path))
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, double_dqn=True): self.state_size = state_size self.action_size = action_size self.double_dqn = double_dqn # Q-Network self.qnetwork_local = QNetwork(state_size, action_size).to(device) self.qnetwork_target = copy.deepcopy(self.qnetwork_local) self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE) self.t_step = 0 def save(self, path, *data): torch.save(self.qnetwork_local.state_dict(), path / "model_checkpoint.local") torch.save(self.qnetwork_target.state_dict(), path / "model_checkpoint.target") torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer') with open(path / 'model_checkpoint.meta', 'wb') as file: pickle.dump(data, file) def load(self, path, *defaults): try: print("Loading model from checkpoint...") self.qnetwork_local.load_state_dict( torch.load(path / 'model_checkpoint.local')) self.qnetwork_target.load_state_dict( torch.load(path / 'model_checkpoint.target')) self.optimizer.load_state_dict( torch.load(path / 'model_checkpoint.optimizer')) with open(path / 'model_checkpoint.meta', 'rb') as file: return pickle.load(file) except: print("No checkpoint file was found") return defaults def step(self, state, action, reward, next_state, done, train=True): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if train and len(self.memory) > BATCH_SIZE and self.t_step == 0: self.learn(self.memory.sample(), GAMMA) def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if self.double_dqn: Q_best_action = self.qnetwork_local(next_states).max(1)[1] Q_targets_next = self.qnetwork_target(next_states).gather( 1, Q_best_action.unsqueeze(-1)) else: Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(-1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute loss and perform a gradient step self.optimizer.zero_grad() loss = F.mse_loss(Q_expected, Q_targets) loss.backward() self.optimizer.step() # Update target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': [each.out_features for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """Load weights to update agent's Q-Network.
        Expected is a format like the one produced by self.save()

        Params
        ======
            filename (string): where to load data from.
        """
        checkpoint = torch.load(filename)
        if not checkpoint['input_size'] == self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if not checkpoint['output_size'] == self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_hidden_layers = [
            each.out_features for each in self.qnetwork_local.hidden_layers
        ]
        if not checkpoint['hidden_layers'] == my_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: hidden layers {checkpoint['hidden_layers']} don't match agent's hidden layers {my_hidden_layers}"
            )
            return None
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        # Load the same weights into the target network rather than aliasing it
        # to the local network, so the two stay separate objects
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
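# Quick numeric check of the soft update θ_target ← τ·θ_local + (1 − τ)·θ_target:
# with a small τ the target network drifts slowly toward the local network,
# which keeps the bootstrapped Q-targets stable.
import torch

local, target, tau = torch.tensor([1.0]), torch.tensor([0.0]), 1e-3
for _ in range(1000):
    target = tau * local + (1 - tau) * target
print(target)  # ≈ 0.632, i.e. 1 - (1 - tau)**1000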
def train_dqn(options):
    max_episode = options.max_episode
    flappyBird = game.GameState()
    print(f'FPS {flappyBird.FPS}')

    rpm = ReplayMemory(options.rpm_size, options)  # experience replay buffer for DQN
    model = QNetwork()
    if options.resume and options.ckpt_path is not None:
        print('Loading previous model weights: {}'.format(options.ckpt_path))
        episode, epsilon = load_checkpoint(options.ckpt_path, model)
    else:
        epsilon = options.init_e
        episode = 0
    if options.cuda:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=options.lr)
    algorithm = DQN(model, optimizer, epsilon, options)

    # Pre-fill the replay buffer so the first training batches are drawn
    # from a reasonably diverse set of samples
    while len(rpm) < options.rpm_size / 4:
        run_episode(algorithm, flappyBird, rpm, options)
    print(f'observation done {len(rpm)}')

    # start training
    logname = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    logger = get_logger(f'log/{logname}.log')
    best_reward = 0
    max_score = 0
    begin = time.time()
    # train for max_episode episodes; evaluation episodes do not count toward the total
    while episode < max_episode:
        # train part
        reward, loss, score = run_episode(algorithm, flappyBird, rpm, options)
        algorithm.epsilon = max(algorithm.final_e, algorithm.epsilon - algorithm.e_decrement)
        episode += 1
        max_score = max(max_score, score)
        if episode % 10 == 0:
            logger.info('episode:[{}/{}]\tscore:{:.3f}\ttrain_reward:{:.5f}\tloss:{:.5f}'.format(
                episode, max_episode, score, reward, loss))

        # test part
        if episode % options.evaluate_freq == 0:
            eval_reward, score = evaluate(flappyBird, algorithm, options)
            mid = time.time()
            elapsed = round(mid - begin)
            logger.info('episode:[{}/{}]\tscore:{:.3f}\tepsilon:{:.5f}\ttest_reward:{:.5f}\t{}:{}'.format(
                episode, max_episode, score, algorithm.epsilon, eval_reward, elapsed // 60, elapsed % 60))
            if eval_reward > best_reward:
                best_reward = eval_reward
                save_path = f'ckpt/best_{score}.ckpt'
                save_checkpoint({
                    'episode': episode,
                    'epsilon': algorithm.epsilon,
                    'state_dict': model.state_dict(),
                }, False, save_path)

        if episode % 1000 == 0:
            save_path = f'ckpt/episode_{episode}.ckpt'
            save_checkpoint({
                'episode': episode,
                'epsilon': algorithm.epsilon,
                'state_dict': model.state_dict(),
            }, False, save_path)

    # training finished, save the final model
    save_path = f'ckpt/final_{episode}_{score}.ckpt'
    save_checkpoint({
        'episode': episode,
        'epsilon': algorithm.epsilon,
        'state_dict': model.state_dict(),
    }, False, save_path)
    mid = time.time()
    elapsed = round(mid - begin)
    logger.info('training completed, {} episodes, {}m {}s'.format(max_episode, elapsed // 60, elapsed % 60))
    print(f'max_score {max_score}')
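# Hedged sketch of the checkpoint round-trip train_dqn() relies on; the real
# save_checkpoint/load_checkpoint helpers are defined elsewhere in the repo,
# so treat this as an assumption about their behavior inferred from the call
# sites above.
import torch

def save_checkpoint(state, is_best, path):
    # is_best mirrors the call sites above; the real helper may use it to
    # keep a separate copy of the best-performing weights
    torch.save(state, path)

def load_checkpoint(path, model):
    ckpt = torch.load(path)
    model.load_state_dict(ckpt['state_dict'])
    return ckpt['episode'], ckpt['epsilon']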
class DQNAgent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, memory=None, device='cpu', weights_filename=None, params=None, train_mode=True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action memory (obj): Memory buffer to sample device (str): device string between cuda:0 and cpu weights_filename (str): file name having weights of local Q network to load params (dict): hyper-parameters train_mode (bool): True if it is train mode, otherwise False """ self.state_size = state_size self.action_size = action_size self.device = device # Set parameters self.gamma = params['gamma'] self.tau = params['tau'] self.lr = params['lr'] self.update_every = params['update_every'] self.seed = random.seed(params['seed']) # Q-Network if train_mode: drop_p = params['drop_p'] else: drop_p = 0 self.qnetwork_local = QNetwork(state_size, action_size, params['seed'], params['hidden_layers'], drop_p).to(device) self.qnetwork_target = QNetwork(state_size, action_size, params['seed'], params['hidden_layers'], drop_p).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory self.memory = memory # Load weight file if weights_filename: self.qnetwork_local.load_state_dict(torch.load(weights_filename)) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def store_weights(self, filename): """Store weights of Q local network Params ====== filename (str): string of filename to store weights of Q local network """ torch.save(self.qnetwork_local.state_dict(), filename) def step(self, state, action, reward, next_state, done): """This defines an agent to do whenever moving. Params ====== state (array_like): current state action (int): current action reward (float): reward on next state next_state (array_like): next state done (bool): flag to indicate whether this episode is done """ # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.memory.get_batch_size(): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # q_targets_next = max_a' Q(next_state, a'; theta')
        # qnetwork_target(next_states): Q values indexed by [state][action]
        # .detach(): detached from the current graph
        # .max(1): returns a (values, indices) pair over the action dimension
        # .max(1)[0]: the max Q value for each next state
        # .unsqueeze(1): from 1d-array to 2d-matrix ([a,b,c] -> [[a], [b], [c]])
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        # If done, q_targets = rewards.
        # Otherwise, q_targets = rewards + gamma * q_targets_next
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_locals = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(q_locals, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()  # Clear gradients
        loss.backward()             # Compute gradients
        self.optimizer.step()       # Take an optimization step using the gradients

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
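# Shape walk-through of the TD target computed in learn() above, with a
# hypothetical batch of three transitions (the third is terminal).
import torch

rewards = torch.tensor([[1.0], [0.0], [2.0]])                # (batch, 1)
dones = torch.tensor([[0.0], [0.0], [1.0]])                  # (batch, 1)
q_next = torch.tensor([[0.5, 2.0], [1.0, 0.1], [3.0, 4.0]])  # (batch, n_actions)

q_targets_next = q_next.max(1)[0].unsqueeze(1)               # [[2.0], [1.0], [4.0]]
q_targets = rewards + 0.99 * q_targets_next * (1 - dones)
print(q_targets)  # [[2.98], [0.99], [2.00]]; the terminal row collapses to its reward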
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def dqn(self, env, brain_name, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): """Deep Q-Learning. 
Params ====== n_episodes (int): maximum number of training episodes max_t (int): maximum number of timesteps per episode eps_start (float): starting value of epsilon, for epsilon-greedy action selection eps_end (float): minimum value of epsilon eps_decay (float): multiplicative factor (per episode) for decreasing epsilon """ scores = [] # list containing scores from each episode scores_window = deque(maxlen=100) # last 100 scores eps = eps_start # initialize epsilon for i_episode in range(1, n_episodes + 1): env_info = env.reset( train_mode=False)[brain_name] # reset the environment state = env_info.vector_observations[0] # get the current state score = 0 # reset the score for t in range(max_t): action = self.act(state, eps).astype( int) # choose action based on epsilon-greedy policy env_info = env.step(action)[ brain_name] # send the action to the environment next_state = env_info.vector_observations[ 0] # get the next state reward = env_info.rewards[0] # get the reward done = env_info.local_done[0] # see if episode has finished self.step(state, action, reward, next_state, done) # make the agent take a step state = next_state # update the state score += reward # add the reward to the score if done: # (if done) break # end episode scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(eps_end, eps_decay * eps) # decrease epsilon print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window)), end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_window))) if np.mean(scores_window) >= 13.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(scores_window))) torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth') break return scores def test(self, env, brain_name): self.qnetwork_local.load_state_dict(torch.load('checkpoint.pth')) # load environment variables # action_size, state_size = info.getInfo() env_info = env.reset( train_mode=False)[brain_name] # reset the environment state = env_info.vector_observations[0] # get the current state score = 0 # initialize the score while True: action = self.act(state).astype(int) # select an action env_info = env.step(action)[ brain_name] # send the action to the environment next_state = env_info.vector_observations[0] # get the next state reward = env_info.rewards[0] # get the reward done = env_info.local_done[0] # see if episode has finished score += reward # update the score state = next_state # roll over the state to next time step if done: # exit loop if episode finished break return score
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, layer_spec, seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.layer_spec = layer_spec self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, layer_spec).to(device) self.qnetwork_target = QNetwork(state_size, action_size, layer_spec).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # (Prioritized) experience replay setup self.buffer_size = BUFFER_SIZE self.batch_size = BATCH_SIZE self.min_prio = MIN_PRIO self.alpha = ALPHA self.beta = INIT_BETA self.beta_increment = BETA_INC if USE_PER: self.memory = PrioritizedReplayBuffer(size=self.buffer_size, alpha=self.alpha) else: self.memory = DequeReplayBuffer(action_size=self.action_size, buffer_size=self.buffer_size, batch_size=self.batch_size, seed=42) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # print info about Agent print('Units in the hidden layers are {}.'.format(str(layer_spec))) print('Using Double-DQN is \"{}\".'.format(str(USE_DDQN))) print('Using prioritized experience replay is \"{}\".'.format( str(USE_PER))) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get subset and learn if len(self.memory) > BATCH_SIZE: self.beta = min(1., self.beta + self.beta_increment) experiences = self.memory.sample(self.batch_size, beta=self.beta) self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        # Get TD step from experiences
        states, actions, rewards, next_states, dones, weights, idxes = experiences

        if USE_DDQN:
            # DOUBLE DQN: select the action with _local, evaluate it with _target
            Q_action_select = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, Q_action_select)
        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute (PER-weighted) MSE loss
        if USE_PER:
            TD_error = Q_targets - Q_expected
            weighted_TD_error = weights * (TD_error**2)
            loss = torch.mean(weighted_TD_error)

            # Update priorities in Replay Buffer
            prio_updates = np.abs(TD_error.detach().squeeze(1).cpu().numpy()) + self.min_prio
            self.memory.update_priorities(idxes, prio_updates.tolist())
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_checkpoint(self):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'layer_spec': self.layer_spec,
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, 'checkpoint.pth')
        print('Checkpoint successfully saved.')

    def load_checkpoint(self, filepath='checkpoint.pth'):
        checkpoint = torch.load(filepath)
        self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                       checkpoint['output_size'],
                                       checkpoint['layer_spec']).to(device)
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        print('Checkpoint successfully loaded.')

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
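# How priorities become sampling probabilities and IS weights in PER
# (standalone, with illustrative values for alpha and beta):
# P(i) = p_i^alpha / sum_k p_k^alpha, and w_i = (N * P(i))^-beta, normalized
# by the largest weight.
import numpy as np

priorities = np.array([0.5, 0.1, 2.0, 0.8])
alpha, beta = 0.6, 0.4

probs = priorities ** alpha
probs /= probs.sum()
is_weights = (len(priorities) * probs) ** (-beta)
is_weights /= is_weights.max()  # scale so the largest weight is 1
print(probs, is_weights)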
def main(args): env = gym.make(args.env) if 'MiniGrid' in args.env: env = ImgObsWrapper(env) path = args.base_path + args.env os.makedirs(path, exist_ok=True) # obs_shape = np.prod(env.observation_space.shape).astype(int) obs_shape = env.observation_space.shape act_shape = env.action_space.n q = QNetwork(obs_shape, act_shape) q_target = QNetwork(obs_shape, act_shape) opt = optim.Adam(lr=args.lr, params=q.parameters()) memory = Memory(capacity=args.memory) scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01) avg_rw = deque(maxlen=40) avg_len = deque(maxlen=40) def get_action(s, t): s = torch.Tensor(s[None,:]) _q = q(s) if np.random.sample() > scheduler.value: best_action = np.argmax(_q.detach(), axis=-1).item() else: best_action = np.random.randint(0, act_shape) scheduler.update(t) return best_action def train(batch): batch = Transition(*zip(*batch)) s = torch.Tensor(batch.state) a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape)) r = torch.Tensor(batch.reward) d = torch.Tensor(batch.done) s1 = torch.Tensor(batch.next_state) value = (q(s) * a).sum(dim=-1) next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0] loss = (.5 * (next_value - value) ** 2).mean() opt.zero_grad() loss.backward() opt.step() state = env.reset() q_target.load_state_dict(q.state_dict()) ep_rw = 0 ep_len = 0 ep = 0 for t in range(args.max_steps): action = get_action(state, t) next_state, reward, done, _ = env.step(action) memory.push(state, action, next_state, reward, done) ep_rw += reward ep_len += 1 state = next_state.copy() if done: ep += 1 avg_rw.append(ep_rw) avg_len.append(ep_len) ep_rw = 0 ep_len = 0 state = env.reset() if t % args.train_every == 0 and len(memory) > args.batch_size: batch = memory.sample(batch_size=args.batch_size) train(batch) if t % args.update_every == 0: q_target.load_state_dict(q.state_dict()) print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}') env = Monitor(env, directory=path) for ep in range(4): s = env.reset() while True: a = get_action(s, t=0) s1, r, d, _ = env.step(a) s = s1.copy() if d: break
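# The (q(s) * a).sum(dim=-1) trick in train() above is equivalent to gather();
# a small standalone check with a hypothetical batch of two states:
import torch
import torch.nn.functional as F

q_values = torch.tensor([[1.0, 5.0, 3.0], [2.0, 0.5, 4.0]])
actions = torch.tensor([1, 2])
a_one_hot = F.one_hot(actions, num_classes=3).float()

via_mask = (q_values * a_one_hot).sum(dim=-1)                    # tensor([5., 4.])
via_gather = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
assert torch.equal(via_mask, via_gather)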
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE, buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE, tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS, a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS, v_min=V_MIN, v_max=V_MAX, clip=CLIP, n_atoms=N_ATOMS, initial_sigma=INIT_SIGMA, linear_type=LINEAR, factorized=FACTORIZED, **kwds): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed batch_size (int): size of each sample batch buffer_size (int): size of the experience memory buffer start_since (int): number of steps to collect before start training gamma (float): discount factor target_update_every (int): how often to update the target network tau (float): target network soft-update parameter lr (float): learning rate weight_decay (float): weight decay for optimizer update_every (int): update(learning and target update) interval priority_eps (float): small base value for priorities a (float): priority exponent parameter initial_beta (float): initial importance-sampling weight n_multisteps (int): number of steps to consider for each experience v_min (float): minimum reward support value v_max (float): maximum reward support value clip (float): gradient norm clipping (`None` to disable) n_atoms (int): number of atoms in the discrete support distribution initial_sigma (float): initial noise parameter weights linear_type (str): one of ('linear', 'noisy'); type of linear layer to use factorized (bool): whether to use factorized gaussian noise in noisy layers """ if kwds != {}: print("Ignored keyword arguments: ", end='') print(*kwds, sep=', ') assert isinstance(state_size, int) assert isinstance(action_size, int) assert isinstance(seed, int) assert isinstance(batch_size, int) and batch_size > 0 assert isinstance(buffer_size, int) and buffer_size >= batch_size assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1 assert isinstance(target_update_every, int) and target_update_every > 0 assert isinstance(tau, (int, float)) and 0 <= tau <= 1 assert isinstance(lr, (int, float)) and lr >= 0 assert isinstance(weight_decay, (int, float)) and weight_decay >= 0 assert isinstance(update_every, int) and update_every > 0 assert isinstance(priority_eps, (int, float)) and priority_eps >= 0 assert isinstance(a, (int, float)) and 0 <= a <= 1 assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1 assert isinstance(n_multisteps, int) and n_multisteps > 0 assert isinstance(v_min, (int, float)) and isinstance(v_max, (int, float)) and v_min < v_max if clip: assert isinstance(clip, (int, float)) and clip >= 0 assert isinstance(n_atoms, int) and n_atoms > 0 assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0 assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy') assert isinstance(factorized, bool) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) self.state_size = state_size self.action_size = action_size self.seed = seed self.batch_size = batch_size self.buffer_size = buffer_size self.start_since = start_since self.gamma = gamma self.target_update_every = target_update_every self.tau = tau self.lr = lr self.weight_decay = weight_decay 
self.update_every = update_every self.priority_eps = priority_eps self.a = a self.beta = initial_beta self.n_multisteps = n_multisteps self.v_min = v_min self.v_max = v_max self.clip = clip self.n_atoms = n_atoms self.initial_sigma = initial_sigma self.linear_type = linear_type.strip().lower() self.factorized = factorized # Distribution self.supports = torch.linspace(v_min, v_max, n_atoms, device=device) self.delta_z = (v_max - v_min) / (n_atoms - 1) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device) self.qnetwork_target = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device) self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict()) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a) # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps) self.u_step = 0 self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.u_step = (self.u_step + 1) % self.update_every if self.u_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) >= self.start_since: experiences, target_discount, is_weights, indices = self.memory.sample(self.beta) new_priorities = self.learn(experiences, is_weights, target_discount) self.memory.update_priorities(indices, new_priorities) # update the target network every TARGET_UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.target_update_every if self.t_step == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) with torch.no_grad(): z_probs = F.softmax(self.qnetwork_local(state), dim=-1) action_values = self.supports.mul(z_probs).sum(dim=-1, keepdim=False) # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) return random.choice(np.arange(self.action_size)) def learn(self, experiences, is_weights, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples is_weights (torch.Tensor): tensor of importance-sampling weights gamma (float): discount factor for the target max-Q value Returns ======= new_priorities (List[float]): list of new priority values for the given sample """ states, actions, rewards, next_states, dones = experiences with torch.no_grad(): rows = tuple(range(next_states.size(0))) a_argmax = F.softmax(self.qnetwork_local(next_states), dim=2)\ .mul(self.supports)\ .sum(dim=2, keepdim=False)\ .argmax(dim=1, keepdim=False) p = F.softmax(self.qnetwork_target(next_states)[rows, a_argmax], dim=1) tz_projected = torch.clamp(rewards + (1 - dones) * gamma * self.supports, min=self.v_min, max=self.v_max) # """ b = (tz_projected - self.v_min) / self.delta_z u = b.ceil() l = b.floor() u_updates = b - l + u.eq(l).type(u.dtype) # fixes the problem when having b == u == l l_updates = u - b indices_flat = torch.cat((u.long(), l.long()), dim=1) indices_flat = indices_flat.add( torch.arange(start=0, end=b.size(0) * b.size(1), step=b.size(1), dtype=indices_flat.dtype, layout=indices_flat.layout, device=indices_flat.device).unsqueeze(1) ).view(-1) updates_flat = torch.cat((u_updates.mul(p), l_updates.mul(p)), dim=1).view(-1) target_distributions = torch.zeros_like(p) target_distributions.view(-1).index_add_(0, indices_flat, updates_flat) """ b = ((tz_projected - V_MIN) / self.delta_z).t() # transpose for later for-loop convenience u = b.ceil() l = b.floor() u_updates = b - l + u.eq(l).type(u.dtype) l_updates = u - b target_distributions = torch.zeros_like(p) for u_indices, l_indices, u_update, l_update, prob in zip(u.long(), l.long(), u_updates, l_updates, p.t()): target_distributions[rows, u_indices] += u_update * prob target_distributions[rows, l_indices] += l_update * prob """ pred_distributions = self.qnetwork_local(states) pred_distributions = pred_distributions.gather(dim=1, index=actions.unsqueeze(1).expand(-1, -1, pred_distributions.size(2))).squeeze(1) """ cross_entropy = target_distributions.mul(pred_distributions.exp().sum(dim=-1, keepdim=True).log() - pred_distributions).sum(dim=-1, keepdim=False) new_priorities = cross_entropy.detach().add(self.priority_eps).cpu().numpy() loss = cross_entropy.mul(is_weights.view(-1)).mean() """ kl_divergence = F.kl_div(F.log_softmax(pred_distributions, dim=-1), target_distributions, reduce=False).sum(dim=-1, keepdim=False) new_priorities = kl_divergence.detach().add(self.priority_eps).cpu().numpy() loss = kl_divergence.mul(is_weights.view(-1)).mean() # """ self.optimizer.zero_grad() loss.backward() if self.clip: torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip) self.optimizer.step() return new_priorities def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
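# Two standalone checks for the distributional machinery above (illustrative
# numbers throughout). First, how atom probabilities become a scalar Q-value,
# as in act(): Q(s, a) = sum_i z_i * p_i(s, a).
import torch
import torch.nn.functional as F

supports = torch.linspace(-10.0, 10.0, 5)           # atoms z_i: [-10, -5, 0, 5, 10]
logits = torch.tensor([[0.0, 0.0, 2.0, 0.0, 0.0]])  # mass peaked at z = 0
z_probs = F.softmax(logits, dim=-1)
print(supports.mul(z_probs).sum(dim=-1))  # ≈ 0, since the distribution is symmetric

# Second, the categorical projection that learn() vectorizes: clamp the
# shifted atoms Tz = r + γz into [v_min, v_max], then split each probability
# between the two nearest atoms (a single transition shown).
v_min, v_max, n_atoms = -1.0, 1.0, 5
delta_z = (v_max - v_min) / (n_atoms - 1)
z = torch.linspace(v_min, v_max, n_atoms)
p = torch.tensor([0.1, 0.2, 0.4, 0.2, 0.1])  # next-state distribution
r, gamma, done = 0.5, 0.9, 0.0

tz = (r + (1 - done) * gamma * z).clamp(v_min, v_max)
b = (tz - v_min) / delta_z
l, u = b.floor().long(), b.ceil().long()

proj = torch.zeros(n_atoms)
proj.index_add_(0, l, p * (u.float() - b + (u == l).float()))  # (u == l) handles b landing exactly on an atom
proj.index_add_(0, u, p * (b - l.float()))
print(proj, proj.sum())  # still a valid distribution: sums to 1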
class SAC(object): def __init__(self, num_inputs, action_space, args): self.num_inputs = num_inputs self.action_space = action_space.shape[0] self.gamma = args.gamma self.tau = args.tau self.policy_type = args.policy self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) if self.policy_type == "Gaussian": self.alpha = args.alpha # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper if self.automatic_entropy_tuning == True: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape)).item() self.log_alpha = torch.zeros(1, requires_grad=True) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) else: pass self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.value = ValueNetwork(self.num_inputs, args.hidden_size) self.value_target = ValueNetwork(self.num_inputs, args.hidden_size) self.value_optim = Adam(self.value.parameters(), lr=args.lr) hard_update(self.value_target, self.value) else: self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size) hard_update(self.critic_target, self.critic) def select_action(self, state, eval=False): state = torch.FloatTensor(state).unsqueeze(0) if eval == False: self.policy.train() action, _, _, _, _ = self.policy.sample(state) else: self.policy.eval() _, _, _, action, _ = self.policy.sample(state) if self.policy_type == "Gaussian": action = torch.tanh(action) else: pass #action = torch.tanh(action) action = action.detach().cpu().numpy() return action[0] def update_parameters(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates): state_batch = torch.FloatTensor(state_batch) next_state_batch = torch.FloatTensor(next_state_batch) action_batch = torch.FloatTensor(action_batch) reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1) mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1) """ Use two Q-functions to mitigate positive bias in the policy improvement step that is known to degrade performance of value based methods. Two Q-functions also significantly speed up training, especially on harder task. """ expected_q1_value, expected_q2_value = self.critic( state_batch, action_batch) new_action, log_prob, _, mean, log_std = self.policy.sample( state_batch) if self.policy_type == "Gaussian": if self.automatic_entropy_tuning: """ Alpha Loss """ alpha_loss = -( self.log_alpha * (log_prob + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_logs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.) alpha_logs = self.alpha # For TensorboardX logs """ Including a separate function approximator for the soft value can stabilize training. """ expected_value = self.value(state_batch) target_value = self.value_target(next_state_batch) next_q_value = reward_batch + mask_batch * self.gamma * ( target_value).detach() else: """ There is no need in principle to include a separate function approximator for the state value. 
            We use a target critic network for the deterministic policy and drop
            the separate value network entirely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _ = self.policy.sample(next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * target_critic.detach()

        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize
            training and is convenient to train simultaneously with the other networks.
            Update the V towards the min of the two Q-functions in order to reduce
            overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass

        """
        The reparameterization trick is used to get a low-variance estimator.
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss += mean_loss + std_loss

        # The two Q-heads are separate sub-networks here, so each loss is
        # backpropagated and stepped on its own
        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically, i.e. after every n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class Agent: def __init__(self, state_size, action_size, num_agents, double_dqn=False): self.action_size = action_size self.double_dqn = double_dqn # Q-Network self.qnetwork_local = QNetwork(state_size, action_size).to(device) self.qnetwork_target = copy.deepcopy(self.qnetwork_local) self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE) self.num_agents = num_agents self.t_step = 0 def reset(self): self.finished = [False] * self.num_agents # Decide on an action to take in the environment def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) # Epsilon-greedy action selection if random.random() > eps: return torch.argmax(action_values).item() else: return torch.randint(self.action_size, ()).item() # Record the results of the agent's action and update the model def step(self, handle, state, action, reward, next_state, agent_done): if not self.finished[handle]: # Save experience in replay memory self.memory.push(state, action, reward, next_state, agent_done) self.finished[handle] = agent_done # Perform a gradient update every UPDATE_EVERY time steps self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1: # 320 self.learn(*self.memory.sample(BATCH_SIZE, device)) def learn(self, states, actions, rewards, next_states, dones): self.qnetwork_local.train() # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if self.double_dqn: Q_best_action = self.qnetwork_local(next_states).argmax(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_best_action.unsqueeze(-1)) else: Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1) # Compute Q targets for current states Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones) # Compute loss and perform a gradient step self.optimizer.zero_grad() loss = F.mse_loss(Q_expected, Q_targets) loss.backward() self.optimizer.step() self.lr_scheduler.step() # Update the target network parameters to `tau * local.parameters() + (1 - tau) * target.parameters()` for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data) # Checkpointing methods def save(self, path, *data): torch.save(self.qnetwork_local.state_dict(), path / 'model_checkpoint.local') torch.save(self.qnetwork_target.state_dict(), path / 'model_checkpoint.target') torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer') with open(path / 'model_checkpoint.meta', 'wb') as file: pickle.dump(data, file) def load(self, path, *defaults): try: print("Loading model from checkpoint...") self.qnetwork_local.load_state_dict(torch.load(path / 'model_checkpoint.local')) self.qnetwork_target.load_state_dict(torch.load(path / 'model_checkpoint.target')) self.optimizer.load_state_dict(torch.load(path / 'model_checkpoint.optimizer')) with open(path / 'model_checkpoint.meta', 'rb') as file: return pickle.load(file) except: print("No checkpoint file was found") return defaults
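# Double DQN in isolation: the online network chooses the argmax action, the
# target network evaluates it, which is what decouples action selection from
# action evaluation in learn() above. Hypothetical tensors:
import torch

q_online_next = torch.tensor([[1.0, 3.0], [4.0, 2.0]])  # local net on s'
q_target_next = torch.tensor([[0.5, 2.5], [3.5, 1.5]])  # target net on s'

best_actions = q_online_next.argmax(1)                  # tensor([1, 0])
q_targets_next = q_target_next.gather(1, best_actions.unsqueeze(-1))
print(q_targets_next)  # [[2.5], [3.5]]: chosen by online, valued by target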
class SAC(object): def __init__(self, num_inputs, action_space, args): self.gamma = args.gamma self.tau = args.tau self.alpha = args.alpha self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.device = torch.device("cuda" if args.cuda else "cpu") self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device) hard_update(self.critic_target, self.critic) # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper if self.automatic_entropy_tuning is True: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) def select_action(self, state, evaluate=False): state = torch.FloatTensor(state).to(self.device).unsqueeze(0) if evaluate is False: action, _, _ = self.policy.sample(state) else: _, _, action = self.policy.sample(state) return action.detach().cpu().numpy()[0] def update_parameters(self, memory, batch_size, updates): # Sample a batch from memory state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample( batch_size=batch_size) state_batch = torch.FloatTensor(state_batch).to(self.device) next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) action_batch = torch.FloatTensor(action_batch).to(self.device) reward_batch = torch.FloatTensor(reward_batch).to( self.device).unsqueeze(1) mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1) with torch.no_grad(): next_state_action, next_state_log_pi, _ = self.policy.sample( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target qf1, qf2 = self.critic( state_batch, action_batch ) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss( qf1, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss( qf2, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] critic_loss = qf1_loss + qf2_loss self.critic_optim.zero_grad() critic_loss.backward() self.critic_optim.step() pi, log_pi, _ = self.policy.sample(state_batch) qf1_pi, qf2_pi = self.critic(state_batch, pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean( ) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.).to(self.device) alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs if updates % self.target_update_interval == 0: 
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), \
            alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, device='cpu'):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(
                torch.load(actor_path, map_location=torch.device(device)))
        if critic_path is not None:
            self.critic.load_state_dict(
                torch.load(critic_path, map_location=torch.device(device)))
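The SAC class above calls hard_update and soft_update without defining them. A sketch of the conventional definitions used with this style of SAC code follows; the signatures are inferred from the call sites (target first, source second).

def hard_update(target, source):
    # Copy the source parameters into the target verbatim
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)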
class DQN_Agent:
    def __init__(self, state_size, action_size, seed=42):
        self.action_size = action_size

        # Q-Network
        self.q_eval = QNetwork(state_size, action_size, seed).to(device)
        self.q_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.RMSprop(self.q_eval.parameters(), lr=LR)

        # Replay buffer
        self.memory = ReplayBuffer(seed=seed)
        self.step_count = 0
        self.seed = random.seed(seed)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_eval.eval()
        with torch.no_grad():
            q_values = self.q_eval(state)
        self.q_eval.train()

        # Exponentially decaying epsilon
        epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.step_count / EPS_DECAY)
        if random.random() > epsilon:
            # greedy
            return np.argmax(q_values.cpu().data.numpy())
        else:
            # explore
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        self.memory.push(state, action, reward, next_state, done)

        loss_value = None
        if len(self.memory) >= BATCH_SIZE:
            # Sample transitions from the replay buffer
            states, actions, rewards, next_states, dones = self.memory.sample()

            # Target: r if done, else r + \gamma max_a Q(s', a; \theta')
            q_next_values = self.q_target(next_states).detach().max(1)[0].unsqueeze(1)
            q_learning_targets = rewards + GAMMA * q_next_values * (1 - dones)

            # Q(s, a; \theta)
            q_values = self.q_eval(states).gather(1, actions)

            # Perform gradient descent on the loss
            loss = F.mse_loss(q_values, q_learning_targets)
            loss_value = loss.data.item()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Update the target Q-network
            self.update_target()

        self.step_count += 1
        return loss_value

    def update_target(self):
        if self.step_count % UPDATE_TARGET_STEPS == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())
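The exponential epsilon schedule in act() can be illustrated in isolation. The EPS_* values below are illustrative stand-ins; the real constants live elsewhere in this file.

import math

EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 2000  # illustrative values only

for step_count in (0, 1000, 2000, 5000, 20000):
    epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * step_count / EPS_DECAY)
    print(f"step {step_count:>6}: epsilon = {epsilon:.3f}")
# epsilon decays smoothly from EPS_START toward the EPS_END floor;
# at step_count == EPS_DECAY it has covered ~63% of the gap.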
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, double_dqn=True): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action """ self.state_size = state_size self.action_size = action_size self.double_dqn = double_dqn # Q-Network self.qnetwork_local = QNetwork(state_size, action_size).to(device) self.qnetwork_target = copy.deepcopy(self.qnetwork_local) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def save(self, filename): torch.save(self.qnetwork_local.state_dict(), filename + ".local") torch.save(self.qnetwork_target.state_dict(), filename + ".target") def load(self, filename): if os.path.exists(filename + ".local"): self.qnetwork_local.load_state_dict(torch.load(filename + ".local")) if os.path.exists(filename + ".target"): self.qnetwork_target.load_state_dict(torch.load(filename + ".target")) def step(self, state, action, reward, next_state, done, train=True): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() if train: self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if self.double_dqn: # Double DQN q_best_action = self.qnetwork_local(next_states).max(1)[1] Q_targets_next = self.qnetwork_target(next_states).gather(1, q_best_action.unsqueeze(-1)) else: # DQN Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
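The Polyak rule θ_target = τ*θ_local + (1 - τ)*θ_target can be checked on toy one-parameter models; this is a self-contained verification sketch, not part of the agent.

import torch
import torch.nn as nn

# Two one-weight models so the update is easy to read off
local, target = nn.Linear(1, 1, bias=False), nn.Linear(1, 1, bias=False)
nn.init.constant_(local.weight, 1.0)
nn.init.constant_(target.weight, 0.0)

tau = 0.1
for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

print(target.weight.item())  # 0.1: the target moved 10% of the way toward the local weight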
class Agent(): """ Initialize Agent, inclduing: DQN Hyperparameters Local and Targat State-Action Policy Networks Replay Memory Buffer from Replay Buffer Class (define below) """ def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5, batch_size=64, gamma=0.99, learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0): """ DQN Agent Parameters ====== state_size (int): dimension of each state action_size (int): dimension of each action dqn_type (string): can be either 'DQN' for vanillia dqn learning (default) or 'DDQN' for double-DQN. replay_memory size (int): size of the replay memory buffer (typically 5e4 to 5e6) batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128) gamma (float): paramete for setting the discoun ted value of future rewards (typically .95 to .995) learning_rate (float): specifies the rate of model learing (typically 1e-4 to 1e-3)) seed (int): random seed for initializing training point. """ self.dqn_type = dqn_type self.state_size = state_size self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) """ # DQN Agent Q-Network # For DQN training, two nerual network models are employed; # (a) A network that is updated every (step % update_rate == 0) # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate. # The slower modulation of the target network weights operates to stablize learning. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate, betas=BETAS) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 ######################################################## # STEP() method # def step(self, state, action, reward, next_state, done, update=True): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_rate if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() if update: self.learn(experiences, self.gamma) ######################################################## # ACT() method # def act(self, state, eps=0.0): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) ######################################################## # LEARN() method # Update value parameters using given batch of experience tuples. 
    def learn(self, experiences, gamma):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values for the current observations (s, a) from the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # Select the greedy next actions with the online network,
            # then evaluate them with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(
                1, Qsa_prime_actions)
        else:
            # Regular (vanilla) DQN
            # ************************
            # Get max Q values for (s', a') from the target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        if not os.path.exists('./save/dqn/'):
            os.makedirs('./save/dqn/')
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        f_path = './save/dqn/dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
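The two target computations in learn() differ only in who picks the next action, which is the whole point of Double DQN. A self-contained toy contrast, with made-up Q tables for a batch of 2 states and 3 actions:

import torch

q_online_next = torch.tensor([[1.0, 5.0, 3.0], [2.0, 0.5, 0.0]])  # online net at s'
q_target_next = torch.tensor([[0.8, 2.0, 4.0], [1.5, 0.7, 0.2]])  # target net at s'

# Vanilla DQN: max over the target network's own estimates
dqn_targets = q_target_next.max(1)[0].unsqueeze(1)       # [[4.0], [1.5]]

# Double DQN: the online net picks the action, the target net evaluates it
best_actions = q_online_next.max(1)[1].unsqueeze(1)      # [[1], [0]]
ddqn_targets = q_target_next.gather(1, best_actions)     # [[2.0], [1.5]]

# In state 0 the nets disagree, so Double DQN avoids taking the target net's
# own (possibly overestimated) maximum of 4.0
print(dqn_targets.squeeze(1), ddqn_targets.squeeze(1))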
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=0, gamma=0.99, learning_rate=5e-4, use_RB=True, RB_size=int(1e5), RB_batch_size=64, use_TM=True, TM_update_every=4, use_DDQN=True, use_PER=False, PER_epsilon=0.01, PER_alpha=0.5, PER_beta=0.4, PER_beta_increment=0.001, use_DUELING=True): """Initialize an Agent object. Params ====== state_size (int) : dimension of each state action_size (int) : dimension of each action seed (int) : random seed gamma (float) : discount factor learning_rate (float) : learning rate of the model use_RB (bool) : Use a replay buffer RB_size (int) : replay buffer size RB_batch_size (int) : minibatch size of the learning use_TM (bool) : Use a target model TM_update_every (int) : update target model every t steps use_DDQN (bool) : Use Double DQN, only valid if use target model use_PER (bool) : Use a prioritized replay buffer PER_epsilon (float) : Small value added to priorities to avoid zero probabilities PER_alpha (float) : Power used to compute the sampling probabilities [0-1] : 0=> Uniform sampling 1=>Fully prioritized PER_beta (float) : Used in importance-sampling - Initial value increased to 1 PER_beta_increment (float) : To increment beta at each sampling use_DUELING (bool) : Use DUELING network """ # Control some parameters assert not use_PER or ( use_PER and use_RB ), "Use replay buffer if use PER" # To make sure we remember to update RB params assert not use_DDQN or (use_DDQN and use_TM), "Use target model if use DDQN" self.state_size = state_size self.action_size = action_size self.gamma = gamma # Q-Network self.qnetwork_policy = QNetwork(state_size, action_size, seed, use_DUELING=use_DUELING).to(device) self.optimizer = optim.Adam(self.qnetwork_policy.parameters(), lr=learning_rate) self.use_DDQN = use_DDQN self.use_TM = use_TM if use_TM: self.qnetwork_target = QNetwork(state_size, action_size, seed, use_DUELING=use_DUELING).to(device) self.TM_update_every = TM_update_every # Initialize time step self.t_step = 0 # Replay memory self.use_RB = use_RB self.RB_batch_size = RB_batch_size self.use_PER = use_PER if use_PER: self.memory = ReplayBufferPER(RB_size, RB_batch_size, seed, epsilon=PER_epsilon, alpha=PER_alpha, beta=PER_beta, beta_increment=PER_beta_increment) elif use_RB: self.memory = ReplayBuffer(RB_size, RB_batch_size, seed) # Init the seed random.seed(seed) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # Epsilon-greedy action selection if random.random() > eps: state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_policy.eval() with torch.no_grad(): action_values = self.qnetwork_policy(state) return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): # Save experience in replay memory if any if self.use_PER: # Need to compute the error of this experience Q_target, Q_expected = self._QValues([(state, action, reward, next_state, done)]) error = (Q_target - Q_expected).cpu().squeeze().data.item() self.memory.add(error, (state, action, reward, next_state, done)) elif self.use_RB: self.memory.add((state, action, reward, next_state, done)) else: self.experiences = [(state, action, reward, next_state, done)] # One more step. 
        self.t_step += 1

        # Learn if there is no replay buffer, or once enough samples are available
        if not self.use_RB or len(self.memory) > self.RB_batch_size:
            self._learn()

    def _QValues(self, batch):
        """Execute a forward pass of the Q-networks to get the Q-values
        (expected and target), so the TD error can be computed or used to learn.

        Params
        ======
            batch : array of tuples <state, action, reward, next_state, done>
        """
        # Split the batch into per-field columns (object dtype: fields are mixed types)
        mini_batch = np.array(batch, dtype=object).transpose()
        states = torch.Tensor(np.vstack(mini_batch[0])).float().to(device)
        actions = torch.Tensor(np.vstack(mini_batch[1])).long().to(device)
        rewards = torch.Tensor(np.vstack(mini_batch[2])).float().to(device)
        next_states = torch.Tensor(np.vstack(mini_batch[3])).float().to(device)
        dones = torch.Tensor(np.vstack(mini_batch[4]).astype(int)).float().to(device)

        # Get predicted Q values for the next states from the relevant networks
        if not self.use_TM or (self.use_TM and self.use_DDQN):
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values_policy = self.qnetwork_policy(next_states)
        if self.use_TM:
            self.qnetwork_target.eval()
            with torch.no_grad():
                action_values_target = self.qnetwork_target(next_states)

        if self.use_TM:
            if self.use_DDQN:
                # Policy network selects the action, target network evaluates it
                Q_targets_next = action_values_target.gather(
                    dim=1, index=action_values_policy.max(dim=1, keepdim=True)[1])
            else:
                Q_targets_next = action_values_target.max(dim=1, keepdim=True)[0]
        else:
            Q_targets_next = action_values_policy.max(dim=1, keepdim=True)[0]

        # Needs to be zero if the episode was done
        Q_targets_next *= torch.ones_like(dones) - dones

        # Compute the Q targets for the current states
        Q_targets = rewards + self.gamma * Q_targets_next

        # Get the Q values from the policy model
        self.qnetwork_policy.train()
        Q_expected = self.qnetwork_policy(states).gather(dim=1, index=actions)

        return Q_targets, Q_expected

    def _learn(self):
        """Update value parameters using a batch of experience tuples."""
        if self.use_PER:
            experiences, indexes, IS_weights = self.memory.sample()
            IS_weights = torch.Tensor(np.vstack(IS_weights)).float().to(device)
        elif self.use_RB:
            experiences = self.memory.sample()
        else:
            experiences = self.experiences

        # Get the Q-values for those experiences
        Q_targets, Q_expected = self._QValues(experiences)

        if self.use_PER:
            # Update the priorities of the replay buffer
            errors = (Q_targets - Q_expected).cpu().squeeze().data.numpy()
            self.memory.update_priorities(indexes, errors)

            # Scale both sides by sqrt(w) so the squared error ends up
            # weighted by the importance-sampling weights
            Q_expected *= IS_weights**0.5
            Q_targets *= IS_weights**0.5

        # Loss computation
        loss = F.mse_loss(Q_expected, Q_targets)
        # loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.use_TM:
            self.t_step %= self.TM_update_every
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(self.qnetwork_policy.state_dict())

    def save_weights(self, file='checkpoint.pth'):
        """Save the agent network weights in a checkpoint file."""
        torch.save(self.qnetwork_policy.state_dict(), file)

    def load_weights(self, file='checkpoint.pth'):
        """Load the agent network weights from a checkpoint file."""
        self.qnetwork_policy.load_state_dict(torch.load(file))
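ReplayBufferPER itself is defined elsewhere, but the sampling math it is assumed to implement can be sketched on its own: priorities from TD errors, sampling probabilities P(i) = p_i^α / Σ_k p_k^α, and importance-sampling weights w_i = (N · P(i))^(−β). The TD errors below are made up for illustration.

import numpy as np

td_errors = np.array([0.5, 2.0, 0.1, 1.0])   # illustrative TD errors
epsilon, alpha, beta = 0.01, 0.5, 0.4        # matches the PER_* defaults above

priorities = (np.abs(td_errors) + epsilon) ** alpha
probs = priorities / priorities.sum()         # P(i) = p_i^alpha / sum_k p_k^alpha

N = len(td_errors)
is_weights = (N * probs) ** (-beta)           # w_i = (N * P(i))^(-beta)
is_weights /= is_weights.max()                # normalize so the largest weight is 1

# High-error transitions are sampled more often but get smaller IS weights,
# which is what keeps the gradient estimate (approximately) unbiased
print(probs, is_weights)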
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, filepath): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.avarage_score = 0 self.start_epoch = 0 self.seed = random.randint(0, seed) random.seed(seed) print("seed ", seed, " self.seed ", self.seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, self.seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, self.seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if filepath: self.load_model(filepath) # Replay memory print("buffer size ", BUFFER_SIZE) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, self.seed) print("memory ", self.memory) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() #print("experiences ",experiences) self.learn_DDQN(experiences, GAMMA) self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: self.update_network(self.qnetwork_local, self.qnetwork_target) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn_DDQN(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze( 0).detach().max(1)[1].unsqueeze(1) #Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach() #Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).squeeze(0).gather( 1, Q_targets_next_argmax) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def save_model(self, filepath, epoch, score, last=False):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': [each.in_features
                              for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'average_score': score
        }
        checkpoint['hidden_layers'].append(
            self.qnetwork_local.hidden_layers[-1].out_features)
        torch.save(checkpoint, filepath)
        if last:
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))

    def load_model(self, filepath):
        print("seed ", self.seed)
        if os.path.isfile(filepath):
            print("=> loading checkpoint '{}'".format(filepath))
            checkpoint = torch.load(filepath)
            print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])
            self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                           checkpoint['output_size'],
                                           self.seed,
                                           checkpoint['hidden_layers']).to(device)
            self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_target = QNetwork(checkpoint['input_size'],
                                            checkpoint['output_size'],
                                            self.seed,
                                            checkpoint['hidden_layers']).to(device)
            self.qnetwork_target.load_state_dict(checkpoint['state_dict'])

            # Re-create the optimizer so it tracks the parameters of the newly
            # built network, then restore its state
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
            if 'optimizer_state_dict' in checkpoint:
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(device)
                print(self.optimizer)
            if 'epoch' in checkpoint:
                self.start_epoch = checkpoint['epoch']
            if 'average_score' in checkpoint:
                self.average_score = checkpoint['average_score']
            print(self.qnetwork_target)
        else:
            print("=> no checkpoint found at '{}'".format(filepath))

    def update_network(self, local_model, target_model):
        # Hard update: copy the local network weights into the target network
        for target, local in zip(target_model.parameters(), local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
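The resume logic in load_model() relies on the checkpoint being a plain dictionary that torch.save/torch.load round-trip intact. A minimal self-contained sketch of that round trip, with illustrative values in place of a real network:

import torch

checkpoint = {
    'input_size': 8,
    'output_size': 4,
    'hidden_layers': [8, 64, 64],   # illustrative layer sizes
    'epoch': 42,
    'average_score': 13.7,
}
torch.save(checkpoint, 'model_checkpoint_demo.pth')
restored = torch.load('model_checkpoint_demo.pth')

# The metadata survives unchanged, which is what lets load_model() rebuild the
# QNetwork with the saved layer sizes and resume from the saved epoch/score
assert restored['epoch'] == 42 and restored['average_score'] == 13.7
print(restored['hidden_layers'])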