num_steps = 0
for num_points, batch_size, epochs in configuration:
    dataset.num_points = num_points
    loader = DataLoader(dataset, batch_size, shuffle=True, num_workers=6)
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for uniform, _ in loader:
            num_steps += 1
            uniform = uniform.to(device)
            u_pos, u_dist = uniform[..., :3], uniform[..., 3:]

            # Critic update: Wasserstein loss on real vs. generated distances
            D_optimizer.zero_grad()
            z = torch.randn(uniform.size(0), LATENT_SIZE, device=device)
            fake = G(u_pos, z)
            out_real = D(u_pos, u_dist)
            out_fake = D(u_pos, fake)
            D_loss = out_fake.mean() - out_real.mean()

            # Gradient penalty on samples interpolated between real and fake
            alpha = torch.rand((uniform.size(0), 1, 1), device=device)
            interpolated = alpha * u_dist + (1 - alpha) * fake
            interpolated.requires_grad_(True)
            out = D(u_pos, interpolated)
            grad = torch.autograd.grad(out, interpolated,
                                       grad_outputs=torch.ones_like(out),
                                       create_graph=True,
                                       retain_graph=True)[0]
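# The snippet above breaks off inside the gradient-penalty computation. Below is a
# minimal, hedged sketch of how a WGAN-GP critic step of this shape is typically
# finished, reusing the snippet's names (D_loss, grad, D_optimizer); the penalty
# weight GP_WEIGHT and the commented generator step are assumptions, not part of
# the original code.
GP_WEIGHT = 10.0  # assumed hyperparameter

# Penalise deviation of the critic's gradient norm from 1 on the interpolated samples.
grad_norm = grad.view(grad.size(0), -1).norm(2, dim=1)
gradient_penalty = GP_WEIGHT * ((grad_norm - 1) ** 2).mean()

D_loss = D_loss + gradient_penalty
D_loss.backward()
D_optimizer.step()

# A matching generator step would typically follow, e.g.:
# G_optimizer.zero_grad()
# z = torch.randn(uniform.size(0), LATENT_SIZE, device=device)
# G_loss = -D(u_pos, G(u_pos, z)).mean()
# G_loss.backward()
# G_optimizer.step()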
def train_wdgrl(model, critic, criterion, n_epochs, loader_s, loader_t, save_name,
                device='cpu', patience=2, n_critic_iters=6, n_discriminator_iters=10,
                ws_scaler=1, gp_scaler=1, lr_critic=5e-5, lr_discriminator=1e-4):
    model.train()
    prev_val_loss = np_inf
    optim_critic = RMSprop(critic.parameters(), lr=lr_critic)
    optim_discriminator = Adam(model.parameters(), lr=lr_discriminator)
    lr_schedule = ReduceLROnPlateau(optim_discriminator, patience=patience,
                                    verbose=True, factor=0.5)

    for epoch in range(1, n_epochs + 1):
        batches = zip(loader_s, loader_t)
        n_batches = min(len(loader_s), len(loader_t))
        total_loss_critic = 0
        total_accuracy = 0
        total_wasserstein_distance = 0
        total_wasserstein_distance_clf = 0
        total_discriminator_loss = 0

        for (source_features, source_labels, _), (target_features, _, _) in tqdm(
                batches, leave=False, total=n_batches):
            set_requires_grad(model.feature_extractor, False)  # Disable training of feature_extractor
            set_requires_grad(critic, True)  # Enable training of critic
            source_features, target_features = source_features.to(device), target_features.to(device)
            source_labels = source_labels.to(device)

            # Get distribution of source and target domain samples in hidden space
            with no_grad():
                h_s = model.feature_extractor(source_features).data.view(source_features.shape[0], -1)
                h_t = model.feature_extractor(target_features).data.view(target_features.shape[0], -1)

            # Critic training loop
            for _ in range(n_critic_iters):
                # Compute Wasserstein distance between source and target domain distributions
                critic_source = critic(h_s)
                critic_target = critic(h_t)
                wasserstein_distance = ws_scaler * (critic_source.mean() - critic_target.mean())

                # Compute gradient penalty (1-Lipschitz constraint)
                gp = gp_scaler * lipschitz_constrain(critic, h_s, h_t, device)
                critic_loss = -t_abs(wasserstein_distance) + gp

                optim_critic.zero_grad()
                critic_loss.backward()
                optim_critic.step()

                total_loss_critic += critic_loss.item()
                total_wasserstein_distance += wasserstein_distance.item()

            # Discriminator training loop
            set_requires_grad(model.feature_extractor, True)  # Enable training of feature_extractor
            set_requires_grad(critic, False)  # Disable training of critic
            for _ in range(n_discriminator_iters):
                h_s = model.feature_extractor(source_features).view(source_features.shape[0], -1)
                h_t = model.feature_extractor(target_features).view(target_features.shape[0], -1)

                predicted_labels = model.discriminator(h_s)
                discriminator_loss = criterion(predicted_labels.view(-1), source_labels)
                wasserstein_distance = ws_scaler * (critic(h_s).mean() - critic(h_t).mean())
                combined_loss = discriminator_loss + t_abs(wasserstein_distance)

                total_discriminator_loss += discriminator_loss.item()
                total_wasserstein_distance_clf += wasserstein_distance.item()

                optim_discriminator.zero_grad()
                combined_loss.backward()
                optim_discriminator.step()

            lr_schedule.step(combined_loss)

        # Compute mean losses
        mean_loss = total_loss_critic / (n_batches * n_critic_iters)
        mean_wasserstein_distance = total_wasserstein_distance / (n_batches * n_critic_iters)
        total_wasserstein_distance_clf = total_wasserstein_distance_clf / (n_batches * n_discriminator_iters)
        mean_clf_loss = total_discriminator_loss / (n_batches * n_discriminator_iters)
        total_loss_discriminator = total_wasserstein_distance_clf + mean_clf_loss

        tqdm.write(
            f'EPOCH {epoch:03d}: total_loss_critic={mean_loss:.4f}, '
            f'mean_ws_dist_critic={mean_wasserstein_distance:.4f}, '
            f'clf_loss={mean_clf_loss:.4f}, '
            f'total_discriminator_loss={total_loss_discriminator:.4f}, '
            f'mean_ws_dist_discriminator={total_wasserstein_distance_clf:.4f}')

        # Check model quality on the target domain
        out, true_l = evaluate.propagte_data_through_network(loader_t, model, device)
        out_prod = out.view(-1).cpu().numpy()
        true_l = true_l.cpu().numpy()
        _ = evaluate.evaluate_model_ova(out_prod, true_l)

        if mean_clf_loss < prev_val_loss:
            prev_val_loss = mean_clf_loss
            print('Saving model ...')
            save(model.state_dict(), save_name + '.pt')

    model.eval()
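# train_wdgrl() relies on two helpers that are not shown in this section. The
# sketch below is a hedged assumption about their behaviour (a parameter-freezing
# switch and a WGAN-style gradient penalty), not the original implementations.
import torch


def set_requires_grad(module, requires_grad):
    # Enable or disable gradient computation for every parameter of a module.
    for p in module.parameters():
        p.requires_grad = requires_grad


def lipschitz_constrain(critic, h_s, h_t, device):
    # Gradient penalty encouraging an (approximately) 1-Lipschitz critic,
    # evaluated on points interpolated between source and target features.
    alpha = torch.rand(h_s.size(0), 1, device=device)
    interpolates = (alpha * h_s + (1 - alpha) * h_t).requires_grad_(True)
    critic_out = critic(interpolates)
    grad = torch.autograd.grad(critic_out, interpolates,
                               grad_outputs=torch.ones_like(critic_out),
                               create_graph=True, retain_graph=True)[0]
    return ((grad.norm(2, dim=1) - 1) ** 2).mean()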
class QLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.params = list(mac.parameters()) self.last_target_update_episode = 0 self.mixer = None if args.mixer == "qtran_base": self.mixer = QTranBase(args) elif args.mixer == "qtran_alt": raise Exception("Not implemented here!") self.params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int, show_demo=False, save_data=None): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values mac_out = [] mac_hidden_states = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): agent_outs = self.mac.forward(batch, t=t) mac_out.append(agent_outs) mac_hidden_states.append(self.mac.hidden_states) mac_out = th.stack(mac_out, dim=1) # Concat over time mac_hidden_states = th.stack(mac_hidden_states, dim=1) mac_hidden_states = mac_hidden_states.reshape(batch.batch_size, self.args.n_agents, batch.max_seq_length, -1).transpose(1, 2) #btav # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim x_mac_out = mac_out.clone().detach() x_mac_out[avail_actions == 0] = -9999999 max_action_qvals, max_action_index = x_mac_out[:, :-1].max(dim=3) max_action_index = max_action_index.detach().unsqueeze(3) is_max_action = (max_action_index == actions).int().float() if show_demo: q_i_data = chosen_action_qvals.detach().cpu().numpy() q_data = (max_action_qvals - chosen_action_qvals).detach().cpu().numpy() # Calculate the Q-Values necessary for the target target_mac_out = [] target_mac_hidden_states = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): target_agent_outs = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) target_mac_hidden_states.append(self.target_mac.hidden_states) # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out[:], dim=1) # Concat across time target_mac_hidden_states = th.stack(target_mac_hidden_states, dim=1) target_mac_hidden_states = target_mac_hidden_states.reshape( batch.batch_size, self.args.n_agents, batch.max_seq_length, -1).transpose(1, 2) #btav # Mask out unavailable actions target_mac_out[avail_actions[:, :] == 0] = -9999999 # From OG deepmarl mac_out_maxs = mac_out.clone() mac_out_maxs[avail_actions == 0] = -9999999 # Best joint action computed by target agents target_max_actions = target_mac_out.max(dim=3, keepdim=True)[1] # Best joint-action computed by regular agents max_actions_qvals, max_actions_current = mac_out_maxs[:, :].max( dim=3, keepdim=True) if self.args.mixer == "qtran_base": # -- TD Loss -- # Joint-action Q-Value estimates joint_qs, vs = self.mixer(batch[:, :-1], mac_hidden_states[:, :-1]) # Need to argmax across the target agents' actions to compute target joint-action Q-Values if 
self.args.double_q: max_actions_current_ = th.zeros( size=(batch.batch_size, batch.max_seq_length, self.args.n_agents, self.args.n_actions), device=batch.device) max_actions_current_onehot = max_actions_current_.scatter( 3, max_actions_current[:, :], 1) max_actions_onehot = max_actions_current_onehot else: max_actions = th.zeros( size=(batch.batch_size, batch.max_seq_length, self.args.n_agents, self.args.n_actions), device=batch.device) max_actions_onehot = max_actions.scatter( 3, target_max_actions[:, :], 1) target_joint_qs, target_vs = self.target_mixer( batch[:, 1:], hidden_states=target_mac_hidden_states[:, 1:], actions=max_actions_onehot[:, 1:]) # Td loss targets td_targets = rewards.reshape(-1, 1) + self.args.gamma * ( 1 - terminated.reshape(-1, 1)) * target_joint_qs td_error = (joint_qs - td_targets.detach()) masked_td_error = td_error * mask.reshape(-1, 1) td_loss = (masked_td_error**2).sum() / mask.sum() # -- TD Loss -- # -- Opt Loss -- # Argmax across the current agents' actions if not self.args.double_q: # Already computed if we're doing double Q-Learning max_actions_current_ = th.zeros( size=(batch.batch_size, batch.max_seq_length, self.args.n_agents, self.args.n_actions), device=batch.device) max_actions_current_onehot = max_actions_current_.scatter( 3, max_actions_current[:, :], 1) max_joint_qs, _ = self.mixer( batch[:, :-1], mac_hidden_states[:, :-1], actions=max_actions_current_onehot[:, :-1] ) # Don't use the target network and target agent max actions as per author's email # max_actions_qvals = th.gather(mac_out[:, :-1], dim=3, index=max_actions_current[:,:-1]) opt_error = max_actions_qvals[:, :-1].sum(dim=2).reshape( -1, 1) - max_joint_qs.detach() + vs masked_opt_error = opt_error * mask.reshape(-1, 1) opt_loss = (masked_opt_error**2).sum() / mask.sum() # -- Opt Loss -- # -- Nopt Loss -- # target_joint_qs, _ = self.target_mixer(batch[:, :-1]) nopt_values = chosen_action_qvals.sum(dim=2).reshape( -1, 1) - joint_qs.detach( ) + vs # Don't use target networks here either nopt_error = nopt_values.clamp(max=0) masked_nopt_error = nopt_error * mask.reshape(-1, 1) nopt_loss = (masked_nopt_error**2).sum() / mask.sum() # -- Nopt loss -- elif self.args.mixer == "qtran_alt": raise Exception("Not supported yet.") if show_demo: tot_q_data = joint_qs.detach().cpu().numpy() tot_target = td_targets.detach().cpu().numpy() bs = q_data.shape[0] tot_q_data = tot_q_data.reshape(bs, -1) tot_target = tot_target.reshape(bs, -1) print('action_pair_%d_%d' % (save_data[0], save_data[1]), np.squeeze(q_data[:, 0]), np.squeeze(q_i_data[:, 0]), np.squeeze(tot_q_data[:, 0]), np.squeeze(tot_target[:, 0])) self.logger.log_stat( 'action_pair_%d_%d' % (save_data[0], save_data[1]), np.squeeze(tot_q_data[:, 0]), t_env) return loss = td_loss + self.args.opt_loss * opt_loss + self.args.nopt_min_loss * nopt_loss masked_hit_prob = th.mean(is_max_action, dim=2) * mask hit_prob = masked_hit_prob.sum() / mask.sum() # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) self.optimiser.step() if (episode_num - self.last_target_update_episode ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("loss", loss.item(), t_env) self.logger.log_stat("hit_prob", hit_prob.item(), t_env) self.logger.log_stat("td_loss", td_loss.item(), t_env) self.logger.log_stat("opt_loss", opt_loss.item(), t_env) 
self.logger.log_stat("nopt_loss", nopt_loss.item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) if self.args.mixer == "qtran_base": mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat( "td_targets", ((masked_td_error).sum().item() / mask_elems), t_env) self.logger.log_stat("td_chosen_qs", (joint_qs.sum().item() / mask_elems), t_env) self.logger.log_stat("v_mean", (vs.sum().item() / mask_elems), t_env) self.logger.log_stat( "agent_indiv_qs", ((chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents)), t_env) self.log_stats_t = t_env def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
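# Hedged, self-contained illustration of the masked TD loss used by the QTran
# learner above: padded timesteps contribute zero error and are excluded from the
# mean. All tensors here are synthetic toy values.
import torch as th

B, T = 2, 4                                   # toy batch of 2 episodes, 4 transitions
joint_qs = th.randn(B * T, 1)                 # Q(s, u) for every (possibly padded) step
td_targets = th.randn(B * T, 1)               # r + gamma * (1 - terminated) * Q_target
mask = th.tensor([[1., 1., 1., 0.],           # episode 1 padded after 3 steps
                  [1., 1., 1., 1.]])          # episode 2 uses all 4 steps

td_error = joint_qs - td_targets.detach()
masked_td_error = td_error * mask.reshape(-1, 1)
td_loss = (masked_td_error ** 2).sum() / mask.sum()   # mean over real steps only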
class A2CAgentPytorch(object): """ A unified agent interface: - interact: interact with the environment to collect experience - _take_one_step: take one step - _take_n_steps: take n steps - _discount_reward: discount roll out rewards - train: train on a sample batch - _soft_update_target: soft update the target network - exploration_action: choose an action based on state with random noise added for exploration in training - action: choose an action based on state for execution - value: evaluate value for a state-action pair - evaluation: evaluation a learned agent """ def __init__( self, state_shape, action_dim, memory_capacity=10000, max_steps=None, reward_gamma=0.95, reward_scale=1.0, done_penalty=None, actor_hidden_size=128, critic_hidden_size=128, actor_output_act=nn.functional.log_softmax, critic_loss="mse", actor_lr=0.001, critic_lr=0.001, optimizer_type="adam", entropy_reg=0.01, max_grad_norm=0.5, batch_size=100, epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.99, use_cuda=True, train_every_n_episodes=1, ): self.use_raw = False self.state_dim = np.prod(state_shape) self.action_dim = action_dim self.n_episodes = 0 self.n_steps = 0 self.max_steps = max_steps self.reward_gamma = reward_gamma self.reward_scale = reward_scale self.done_penalty = done_penalty self.memory = ReplayMemory(memory_capacity) self.actor_hidden_size = actor_hidden_size self.critic_hidden_size = critic_hidden_size self.actor_output_act = actor_output_act self.critic_loss = critic_loss self.actor_lr = actor_lr self.critic_lr = critic_lr self.optimizer_type = optimizer_type self.entropy_reg = entropy_reg self.max_grad_norm = max_grad_norm self.batch_size = batch_size self.train_every_n_episodes = train_every_n_episodes self.target_tau = 0.01 # params for epsilon greedy self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.use_cuda = use_cuda and th.cuda.is_available() self.actor = ActorNetwork( self.state_dim, self.actor_hidden_size, self.action_dim, self.actor_output_act, ) self.critic = CriticNetwork( self.state_dim, self.action_dim, self.critic_hidden_size, 1 ) if self.optimizer_type == "adam": self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr) elif self.optimizer_type == "rmsprop": self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr) self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr) if self.use_cuda: self.actor.cuda() self.critic.cuda() # def resetMemory(self): # self.states = [] # self.actions = [] # self.rewards = [] # def feed(self, ts): # (state, action, reward, next_state, done) = tuple(ts) # obs = np.ravel(state['obs']) # self.memory._push_one(obs, action, reward) # self.n_steps += 1 # if done: # self.n_episodes += 1 # if self.n_steps % self.train_every == 0: # self.train() def feed_game(self, trajectories): t = list(zip(*trajectories)) states = [np.ravel(s["obs"]) for s in t[0]] actions = t[1] rewards = t[2] self.n_episodes += 1 rewards = self._discount_reward(rewards, 0) self.n_steps += len(trajectories) self.memory.push(states, actions, rewards) if self.n_episodes % self.train_every_n_episodes == 0: self.train() # discount roll out rewards def _discount_reward(self, rewards, final_value): discounted_r = np.zeros_like(rewards) running_add = final_value for t in reversed(range(0, len(rewards))): running_add = running_add * self.reward_gamma + rewards[t] discounted_r[t] = running_add return discounted_r # soft 
update the actor target network or critic target network def _soft_update_target(self, target, source): for t, s in zip(target.parameters(), source.parameters()): t.data.copy_((1.0 - self.target_tau) * t.data + self.target_tau * s.data) # train on a roll out batch def train(self): batch = self.memory.sample(self.batch_size) one_hot_actions = index_to_one_hot(batch.actions, self.action_dim) states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim) actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view( -1, self.action_dim ) rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1) # update actor network self.actor_optimizer.zero_grad() action_log_probs = self.actor(states_var) entropy_loss = th.mean(entropy(th.exp(action_log_probs))) action_log_probs = th.sum(action_log_probs * actions_var, 1) values = self.critic(states_var, actions_var) advantages = rewards_var - values.detach() pg_loss = -th.mean(action_log_probs * advantages) actor_loss = pg_loss - entropy_loss * self.entropy_reg actor_loss.backward() if self.max_grad_norm is not None: nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() # update critic network self.critic_optimizer.zero_grad() target_values = rewards_var if self.critic_loss == "huber": critic_loss = nn.functional.smooth_l1_loss(values, target_values) else: critic_loss = nn.MSELoss()(values, target_values) critic_loss.backward() if self.max_grad_norm is not None: nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) self.critic_optimizer.step() # print("actor loss {}, critic loss {}".format(actor_loss, critic_loss)) wandb.log( { "actor_loss": actor_loss, "critic_loss": critic_loss, "total_loss": actor_loss + critic_loss, } ) # predict softmax action based on state def _softmax_action(self, state): state_var = to_tensor_var([state], self.use_cuda) softmax_action_var = th.exp(self.actor(state_var)) if self.use_cuda: softmax_action = softmax_action_var.data.cpu().numpy()[0] else: softmax_action = softmax_action_var.data.numpy()[0] return softmax_action # choose an action based on state with random noise added for exploration in training def exploration_action(self, state, legal_actions): epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * np.exp( -1.0 * self.n_steps * self.epsilon_decay ) if np.random.rand() < epsilon: action = np.random.choice(legal_actions) else: softmax_action = self._softmax_action(state) action = self._pick_action(softmax_action, legal_actions) return action # choose an action based on state for execution def action(self, state, legal_actions): softmax_action = self._softmax_action(state) return self._pick_action(softmax_action, legal_actions) def _pick_action(self, softmax_action, legal_actions): # ie if all probs are 0 legal_probs = remove_illegal(softmax_action, legal_actions) if not np.any(legal_probs): action = np.random.choice(legal_actions) else: action = np.argmax(legal_probs) return action # evaluate value for a state-action pair def value(self, state, action): state_var = to_tensor_var([state], self.use_cuda) action = index_to_one_hot(action, self.action_dim) action_var = to_tensor_var([action], self.use_cuda) value_var = self.critic(state_var, action_var) if self.use_cuda: value = value_var.data.cpu().numpy()[0] else: value = value_var.data.numpy()[0] return value def step(self, state): obs = np.ravel(state["obs"]) return self.exploration_action(obs, state["legal_actions"]) def eval_step(self, state): obs = 
np.ravel(state["obs"]) softmax_action = remove_illegal( self._softmax_action(obs), state["legal_actions"] ) action = np.argmax(softmax_action) return action, softmax_action def save(self, dir): th.save(self.actor.state_dict(), os.path.join(dir, "actor.pth")) th.save(self.critic.state_dict(), os.path.join(dir, "critic.pth")) def load(self, dir): self.actor.load_state_dict(th.load(os.path.join(dir, "actor.pth"))) self.critic.load_state_dict(th.load(os.path.join(dir, "critic.pth"))) self.actor.eval() self.critic.eval()
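# A2CAgentPytorch.train() calls `entropy(...)` and `index_to_one_hot(...)`, which
# are not defined in this section. The sketch below is a hedged assumption about
# what they compute, not the original helpers.
import numpy as np
import torch as th


def entropy(probs):
    # Shannon entropy of each categorical distribution in `probs` (rows sum to 1).
    return -th.sum(probs * th.log(probs.clamp(min=1e-12)), dim=-1)


def index_to_one_hot(indices, dim):
    # Convert integer action indices into one-hot vectors of size `dim`.
    indices = np.asarray(indices).reshape(-1)
    one_hot = np.zeros((indices.shape[0], dim), dtype=np.float32)
    one_hot[np.arange(indices.shape[0]), indices] = 1.0
    return one_hot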
def subject_dependent_CV(session=1):
    # prepare data
    session = session
    balance = True
    shuffle = False
    modal = 'concat'
    nor_method = 1
    label_smooth = 0.3
    fine_tuning = True
    best_acc_list = []
    best_precision_list = []
    best_recall_list = []
    best_f1_list = []
    result_save_path = './seed_cv_results/session{}'.format(session)
    if not os.path.exists(result_save_path):
        os.makedirs(result_save_path)

    # reading the data in the whole dataset
    for idx in range(1, 16):
        print("constructing dataset...")
        eeg = SEED_IV(session=session, individual=idx, modal=modal,
                      shuffle=shuffle, balance=balance, k_fold=3)
        k_fold_data = eeg.get_training_kfold_data()
        for fold, (train_X, train_Y, test_X, test_Y) in enumerate(k_fold_data):
            best_acc = -1
            best_precision = -1
            best_recall = -1
            best_f1 = -1
            print("train_X shape", train_X.shape)
            print("train_Y shape", train_Y.shape)
            print("test_X shape", test_X.shape)
            print("test_Y shape", test_Y.shape)
            train_X, train_Y, test_X, test_Y = seed_normalization(
                train_X, train_Y, test_X, test_Y, nor_method=1, merge=0, column=0)
            train_X = train_X.astype(np.float32)
            test_X = test_X.astype(np.float32)
            train_Y = train_Y.astype(np.int32)
            test_Y = test_Y.astype(np.int32)

            # Hyper-parameters
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            epochs = 500
            batch_size = 1024
            learning_rate = 1e-4
            criterion = LabelSmoothSoftmax(lb_smooth=label_smooth)
            exp_des = "%d_dependent_in_session_%d_fold%d_%s_%s_%s_%d_%d" % (
                idx, session, fold, 'balance' if balance else 'without_balance',
                'shuffle' if shuffle else "without_shuffle", 'seed', epochs, batch_size)
            print("starting subject-dependent training experiments on individual %d in session %d"
                  % (idx, session))
            print("train_X shape", train_X.shape)
            print("train_Y shape", train_Y.shape)
            print("test_X shape", test_X.shape)
            print("test_Y shape", test_Y.shape)

            print("model construction...")
            net = Hierarchical_ATTN_With_Senti_Map()
            # if fine_tuning we continue training the pretrained model
            net = net.to(device)
            save_model_path = '../../saved_models/%s/session_%d/subject_%d/fold_%d' % (
                net.__class__.__name__, session, idx, fold)
            if not os.path.exists(save_model_path):
                os.makedirs(save_model_path)
            optimization = RMSprop(net.parameters(), lr=learning_rate, weight_decay=0.01)

            # save model training state
            running_loss_list = []
            running_acc_list = []
            testing_loss_list = []
            testing_acc_list = []

            print("start training...")
            scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer=optimization, T_max=epochs)
            scheduler_warmup = GradualWarmupScheduler(
                optimizer=optimization, multiplier=10,
                total_epoch=np.ceil(0.1 * epochs), after_scheduler=scheduler_cosine)

            for epoch in range(epochs):
                net.train()
                optimization.zero_grad()
                # print("dirty-data count", torch.sum(torch.isnan(feature), dim=0))
                eeg = train_X[:, :310]
                eye = train_X[:, 310:]
                eeg = eeg.reshape(-1, 62, 5)
                eeg = torch.FloatTensor(eeg).to(device)
                eye = torch.FloatTensor(eye).to(device)
                # print("eeg type {}, eye type {}".format(type(eeg), type(eye)))
                target = torch.LongTensor(train_Y).to(device)
                out = net(eeg, eye)
                # print("batch output", out[0])
                loss = criterion(out, target)
                loss.backward()
                clip_grad_norm_(net.parameters(), max_norm=10)
                optimization.step()
                scheduler_warmup.step()

                running_loss = loss.item()
                # print("batch loss", loss.item())
                _, prediction = torch.max(out.data, dim=-1)
                total = target.size(0)
                correct = prediction.eq(target.data).cpu().sum().item()
                cur_loss = running_loss / len(train_X)
                cur_acc = correct / total
                if isinstance(cur_acc, torch.Tensor):
                    cur_acc = cur_acc.item()
                if isinstance(cur_loss, torch.Tensor):
                    cur_loss = cur_loss.item()
                print('Epoch %d/%d\tTraining Loss: %.10f | Acc: %.3f%% (%d/%d)' %
                      (epoch, epochs, cur_loss, 100 * cur_acc, correct, total))
                running_loss_list.append(cur_loss)
                running_acc_list.append(cur_acc)

                if epoch % 1 == 0:
                    net.eval()
                    print("start evaluating...")
                    eeg = test_X[:, :310]
                    eye = test_X[:, 310:]
                    eeg = eeg.reshape(-1, 62, 5)
                    eeg = torch.FloatTensor(eeg).to(device)
                    eye = torch.FloatTensor(eye).to(device)
                    target = torch.LongTensor(test_Y).to(device)
                    with torch.no_grad():
                        out = net(eeg, eye)
                        loss = criterion(out, target)
                        testing_loss = loss.item()
                        _, prediction = torch.max(out.data, dim=-1)
                        # print(prediction)
                        test_total = target.size(0)
                        test_correct = prediction.eq(target.data).cpu().sum().item()
                        y_pre = prediction.cpu().numpy()
                        y_true = target.cpu().numpy()
                        test_acc = accuracy_score(y_true, y_pre)
                        test_loss = testing_loss / test_total
                        if isinstance(test_acc, torch.Tensor):
                            test_acc = test_acc.item()
                        if isinstance(test_loss, torch.Tensor):
                            test_loss = test_loss.item()
                        print('Testset Loss: %.10f | Acc: %.3f%% (%d/%d)' %
                              (test_loss, 100 * test_acc, test_correct, test_total))
                        testing_acc_list.append(test_acc)
                        testing_loss_list.append(test_loss)

                        if test_acc > best_acc:
                            best_acc = test_acc
                            best_precision = precision_score(y_true, y_pre, average="macro")
                            best_recall = recall_score(y_true, y_pre, average="macro")
                            best_f1 = f1_score(y_true, y_pre, average="macro")
                            print("better model found on the test set, saving new model")
                            model_name = '%s' % (net.__class__.__name__)
                            state = {
                                'net': net.state_dict(),
                                'epoch': epoch,
                                'best_acc': best_acc,
                                'current_loss': test_loss
                            }
                            torch.save(state, os.path.join(save_model_path, model_name))

            best_f1_list.append(best_f1)
            best_acc_list.append(best_acc)
            best_precision_list.append(best_precision)
            best_recall_list.append(best_recall)
            plot_acc_loss_curve(
                {
                    'train_loss': running_loss_list,
                    'train_acc': running_acc_list,
                    'test_loss': testing_loss_list,
                    'test_acc': testing_acc_list
                }, net.__class__.__name__, exp_des)

    df = pd.DataFrame().from_dict({
        "acc": best_acc_list,
        "precision": best_precision_list,
        "recall": best_recall_list,
        "f1": best_f1_list
    })
    df_mean = df.mean()
    df_std = df.std()
    df = df.append(df_mean, ignore_index=True)
    df = df.append(df_std, ignore_index=True)
    df.to_csv(result_save_path + '/results.csv')
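# subject_dependent_CV() trains with a LabelSmoothSoftmax criterion that is not
# shown here. The class below is a hedged sketch of a label-smoothing
# cross-entropy of that kind (assumed behaviour, not the original implementation).
import torch
import torch.nn as nn
import torch.nn.functional as F


class LabelSmoothCE(nn.Module):
    def __init__(self, lb_smooth=0.1):
        super().__init__()
        self.lb_smooth = lb_smooth

    def forward(self, logits, target):
        n_classes = logits.size(-1)
        log_probs = F.log_softmax(logits, dim=-1)
        # Smooth the one-hot targets: 1 - eps on the true class, eps/(C-1) elsewhere.
        with torch.no_grad():
            true_dist = torch.full_like(log_probs, self.lb_smooth / (n_classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.lb_smooth)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))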
class HRAModel(Model):
    """Neural Network with the HRA architecture"""

    def __init__(self, name, network_config, restore=True, learning_rate=0.001):
        logger.info("Building network for %s" % name)
        self.network_config = network_config
        model = _HRAModel(network_config)
        Model.__init__(self, model, name, network_config, restore)
        logger.info("Created network for %s " % self.name)
        self.optimizer = RMSprop(self.model.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()

    def clear_weights(self, reward_type):
        for type in range(self.model.networks):
            if type != reward_type:
                getattr(self.model, 'layer_q_{}'.format(type)).weight.data.fill_(0)
                network = self.network_config.networks[type]
                for i in range(len(network['layers'])):
                    getattr(self.model, 'network_{}_layer_{}'.format(type, i)).apply(self.weights_init)

    def display_weights(self):
        for network_i, network in enumerate(self.network_config.networks):
            out = input
            for i in range(len(network['layers'])):
                print('*****************network_{}_layer_{}'.format(network_i, i))
                l, _ = getattr(self.model, 'network_{}_layer_{}'.format(network_i, i))
                print(l.weight.data)
                print('-----------------network_{}_layer_{}'.format(network_i, i))
            print('*************layer_q_{}'.format(network_i))
            print(getattr(self.model, 'layer_q_{}'.format(network_i)).weight.data)
            print('-------------layer_q_{}'.format(network_i))

    def top_layer(self, reward_type):
        return getattr(self.model, 'layer_q_{}'.format(reward_type))

    def weights_init(self, m):
        classname = m.__class__.__name__
        if type(m) == nn.Linear:
            m.weight.data.fill_(0)
            m.bias.data.fill_(0)
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.0)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(0.0, 0.0)
            m.bias.data.fill_(0)

    def fit(self, states, target, steps):
        self.optimizer.zero_grad()
        predict = self.model(states)
        loss = 0
        for i, p in enumerate(predict):
            loss += self.loss_fn(p, Variable(torch.Tensor(target[i])))
        loss.backward()
        self.optimizer.step()

    def predict(self, input):
        q_values = self.predict_batch(input.unsqueeze(0)).squeeze(axis=1)
        q_actions = np.sum(q_values, axis=0)
        return np.argmax(q_actions), q_values
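# Hedged, self-contained illustration of the HRA action selection performed in
# HRAModel.predict(): per-reward-head Q-values are summed before the argmax.
# The numbers are made up for illustration.
import numpy as np

q_values = np.array([[1.0, 0.2, -0.5],    # head 0 (one reward component)
                     [0.1, 0.9, 0.3]])    # head 1 (another reward component)
q_actions = np.sum(q_values, axis=0)      # combined Q per action: [1.1, 1.1, -0.2]
best_action = int(np.argmax(q_actions))   # ties broken by the first index -> 0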
class LFMACLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.act_params, self.comm_params, self.freq_params = mac.parameters() if args.comm_type != "no": self.act_params += self.comm_params self.last_target_update_episode = 0 self.mixer = None self.freq_mixer = None if args.mixer is not None: if args.mixer == "vdn": self.mixer = VDNMixer() elif args.mixer == "qmix": self.mixer = QMixer(args) else: raise ValueError("Mixer {} not recognised.".format(args.mixer)) self.act_params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) if args.freq_mixer is not None and args.comm_type == "normal": if args.freq_mixer == "vdn": self.freq_mixer = VDNMixer() elif args.freq_mixer == "qmix": self.freq_mixer = QMixer(args) else: raise ValueError("Freq Mixer {} not recognised.".format(args.freq_mixer)) self.freq_params += list(self.freq_mixer.parameters()) self.target_freq_mixer = copy.deepcopy(self.freq_mixer) self.freq_optimiser = RMSprop(params=self.freq_params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) self.ac_optimiser = RMSprop(params=self.act_params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Calculate estimated Q-Values mac_out = [] freq_out = [] comm_freqs = [] freq_rewards = [] freq_terminated = [] freq_mask = [] freq_states = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): agent_outs, comm_act_q, _ = self.mac.forward(batch, t=t) mac_out.append(agent_outs) if self.args.comm_type == "normal" and t % self.args.freq_interval == 0: freq_states.append(batch["state"][:, t]) next_idx = t + self.args.freq_interval if ( t + self.args.freq_interval) < batch.max_seq_length else batch.max_seq_length freq_rewards.append(batch["reward"][:, t:next_idx].sum(dim=1)) comm_freqs.append(batch["is_comm"][:, t]) freq_terminated.append(batch["terminated"][:, t]) freq_mask.append(batch["filled"][:, t]) freq_out.append(comm_act_q) # Calculate the Q-Values necessary for the target target_mac_out = [] target_freq_out = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): target_agent_outs, target_comm_act_q, _ = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) if self.args.comm_type == "normal" and t % self.args.freq_interval == 0: target_freq_out.append(target_comm_act_q) act_loss, act_grad_norm, act_mask, act_masked_td_error, act_chosen_action_qvals, act_targets = self._train_act( batch, mac_out, target_mac_out) if self.args.freq_mixer is not None and self.args.comm_type == "normal": freq_loss, freq_grad_norm, freq_mask, freq_targets = self._train_freq( freq_states, freq_rewards, comm_freqs, freq_terminated, freq_mask, freq_out, target_freq_out) if (episode_num - self.last_target_update_episode) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("act_loss", act_loss.item(), t_env) self.logger.log_stat("act_grad_norm", act_grad_norm, t_env) mask_elems = act_mask.sum().item() self.logger.log_stat("act_td_error_abs", (act_masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("act_q_taken_mean", 
(act_chosen_action_qvals * act_mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("act_target_mean", (act_targets * act_mask).sum().item() / (mask_elems * self.args.n_agents), t_env) if self.args.freq_mixer is not None and self.args.comm_type == "normal": self.logger.log_stat("freq_loss", freq_loss.item(), t_env) self.logger.log_stat("freq_grad_norm", freq_grad_norm, t_env) freq_mask_elems = freq_mask.sum().item() self.logger.log_stat("act_target_mean", (freq_targets * freq_mask).sum().item() / (freq_mask_elems * self.args.n_agents), t_env) self.log_stats_t = t_env def _train_act(self, batch: EpisodeBatch, mac_out, target_mac_out): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] mac_out = th.stack(mac_out, dim=1) # Concat over time # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out[1:], dim=1) # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze(3) # Mask out unavailable actions target_mac_out[avail_actions[:, 1:] == 0] = -9999999 # Max over target Q-Values if self.args.double_q: # Get actions that maximise live Q (for double q-learning) mac_out_detach = mac_out.clone().detach() mac_out_detach[avail_actions == 0] = -9999999 cur_max_actions = mac_out_detach[:, 1:].max(dim=3, keepdim=True)[1] target_max_qvals = th.gather(target_mac_out, 3, cur_max_actions).squeeze(3) else: target_max_qvals = target_mac_out.max(dim=3)[0] # Mix if self.mixer is not None: chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) target_max_qvals = self.target_mixer(target_max_qvals, batch["state"][:, 1:]) # Calculate 1-step Q-Learning targets targets = rewards + self.args.gamma * (1 - terminated) * target_max_qvals # Td-error td_error = (chosen_action_qvals - targets.detach()) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask.sum() # Optimise self.ac_optimiser.zero_grad() loss.backward(retain_graph=True) grad_norm = th.nn.utils.clip_grad_norm_(self.act_params, self.args.grad_norm_clip) self.ac_optimiser.step() return loss, grad_norm, mask, masked_td_error, chosen_action_qvals, targets def _train_freq(self, freq_states, freq_rewards, comm_freqs, freq_terminated, freq_mask, freq_out, target_freq_out): # data for LFMAC training freq_rewards = th.stack(freq_rewards, dim=1)[:, :-1] # comm_freqs = th.stack(comm_freqs, dim=1)[:, :-1] freq_terminated = th.stack(freq_terminated, dim=1)[:, :-1].float() freq_mask = th.stack(freq_mask, dim=1)[:, :-1].float() freq_mask[:, 1:] = freq_mask[:, 1:] * (1 - freq_terminated[:, :-1]) freq_states = th.stack(freq_states, dim=1) freq_out = th.stack(freq_out, dim=1) chosen_freq_qvals = th.gather(freq_out[:, :-1], dim=3, index=comm_freqs).squeeze(3) target_freq_out = th.stack(target_freq_out[1:], dim=1) if self.args.double_q: freq_out_detach = freq_out.clone().detach() cur_max__freq = freq_out_detach[:, 1:].max(dim=3, keepdim=True)[1] target_max_freq_qvals = th.gather(target_freq_out, 3, cur_max__freq).squeeze(3) else: target_max_freq_qvals = target_freq_out.max(dim=3)[0] # Mix if self.mixer is not None: 
chosen_freq_qvals = self.mixer(chosen_freq_qvals, freq_states[:, :-1]) target_max_freq_qvals = self.target_mixer(target_max_freq_qvals, freq_states[:, 1:]) # Calculate 1-step Q-Learning targets freq_targets = freq_rewards + self.args.gamma * (1 - freq_terminated) * target_max_freq_qvals # Td-error td_error = (chosen_freq_qvals - freq_targets.detach()) freq_mask = freq_mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * freq_mask # Normal L2 loss, take mean over actual data freq_loss = (masked_td_error**2).sum() / freq_mask.sum() # Optimise self.freq_optimiser.zero_grad() freq_loss.backward() freq_grad_norm = th.nn.utils.clip_grad_norm_(self.freq_params, self.args.grad_norm_clip) self.freq_optimiser.step() return freq_loss, freq_grad_norm, freq_mask, freq_targets def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) if self.args.freq_mixer is not None and self.args.comm_type == "normal": self.target_freq_mixer.load_state_dict(self.freq_mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() if self.args.comm_type == "normal": self.freq_mixer.cuda() self.target_freq_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.args.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) if self.args.freq_mixer is not None and self.args.comm_type == "normal": th.save(self.freq_mixer.state_dict(), "{}/freq_mixer.th".format(path)) th.save(self.freq_optimiser.state_dict(), "{}/freq_opt.th".format(path)) th.save(self.ac_optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.args.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) if self.args.freq_mixer is not None and self.args.comm_type == "normal": self.freq_mixer.load_state_dict( th.load("{}/freq_mixer.th".format(path), map_location=lambda storage, loc: storage)) self.freq_optimiser.load_state_dict( th.load("{}/freq_opt.th".format(path), map_location=lambda storage, loc: storage)) self.ac_optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
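# Hedged, standalone illustration of how LFMACLearner's communication-frequency
# branch aggregates rewards: one summed reward per `freq_interval` chunk of
# timesteps. Values are synthetic toy data.
import torch as th

freq_interval = 3
reward = th.arange(1., 8.).view(1, 7, 1)          # one episode, 7 timesteps
freq_rewards = []
for t in range(reward.shape[1]):
    if t % freq_interval == 0:
        next_idx = min(t + freq_interval, reward.shape[1])
        freq_rewards.append(reward[:, t:next_idx].sum(dim=1))
freq_rewards = th.stack(freq_rewards, dim=1)       # shape [1, 3, 1]: sums 6, 15, 7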
class QMixTorchPolicy(Policy): """QMix impl. Assumes homogeneous agents for now. You must use MultiAgentEnv.with_agent_groups() to group agents together for QMix. This creates the proper Tuple obs/action spaces and populates the '_group_rewards' info field. Action masking: to specify an action mask for individual agents, use a dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}. The mask space must be `Box(0, 1, (n_actions,))`. """ def __init__(self, obs_space, action_space, config): _validate(obs_space, action_space) config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config) self.framework = "torch" super().__init__(obs_space, action_space, config) self.n_agents = len(obs_space.original_space.spaces) self.n_actions = action_space.spaces[0].n self.h_size = config["model"]["lstm_cell_size"] self.has_env_global_state = False self.has_action_mask = False self.device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) agent_obs_space = obs_space.original_space.spaces[0] if isinstance(agent_obs_space, Dict): space_keys = set(agent_obs_space.spaces.keys()) if "obs" not in space_keys: raise ValueError( "Dict obs space must have subspace labeled `obs`") self.obs_size = _get_size(agent_obs_space.spaces["obs"]) if "action_mask" in space_keys: mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape) if mask_shape != (self.n_actions, ): raise ValueError( "Action mask shape must be {}, got {}".format( (self.n_actions, ), mask_shape)) self.has_action_mask = True if ENV_STATE in space_keys: self.env_global_state_shape = _get_size( agent_obs_space.spaces[ENV_STATE]) self.has_env_global_state = True else: self.env_global_state_shape = (self.obs_size, self.n_agents) # The real agent obs space is nested inside the dict config["model"]["full_obs_space"] = agent_obs_space agent_obs_space = agent_obs_space.spaces["obs"] else: self.obs_size = _get_size(agent_obs_space) self.env_global_state_shape = (self.obs_size, self.n_agents) self.model = ModelCatalog.get_model_v2(agent_obs_space, action_space.spaces[0], self.n_actions, config["model"], framework="torch", name="model", default_model=RNNModel).to( self.device) self.target_model = ModelCatalog.get_model_v2( agent_obs_space, action_space.spaces[0], self.n_actions, config["model"], framework="torch", name="target_model", default_model=RNNModel).to(self.device) self.exploration = self._create_exploration() # Setup the mixer network. 
if config["mixer"] is None: self.mixer = None self.target_mixer = None elif config["mixer"] == "qmix": self.mixer = QMixer(self.n_agents, self.env_global_state_shape, config["mixing_embed_dim"]).to(self.device) self.target_mixer = QMixer( self.n_agents, self.env_global_state_shape, config["mixing_embed_dim"]).to(self.device) elif config["mixer"] == "vdn": self.mixer = VDNMixer().to(self.device) self.target_mixer = VDNMixer().to(self.device) else: raise ValueError("Unknown mixer type {}".format(config["mixer"])) self.cur_epsilon = 1.0 self.update_target() # initial sync # Setup optimizer self.params = list(self.model.parameters()) if self.mixer: self.params += list(self.mixer.parameters()) self.loss = QMixLoss(self.model, self.target_model, self.mixer, self.target_mixer, self.n_agents, self.n_actions, self.config["double_q"], self.config["gamma"]) from torch.optim import RMSprop self.optimiser = RMSprop(params=self.params, lr=config["lr"], alpha=config["optim_alpha"], eps=config["optim_eps"]) @override(Policy) def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None, info_batch=None, episodes=None, explore=None, timestep=None, **kwargs): explore = explore if explore is not None else self.config["explore"] obs_batch, action_mask, _ = self._unpack_observation(obs_batch) # We need to ensure we do not use the env global state # to compute actions # Compute actions with torch.no_grad(): q_values, hiddens = _mac( self.model, torch.as_tensor(obs_batch, dtype=torch.float, device=self.device), [ torch.as_tensor( np.array(s), dtype=torch.float, device=self.device) for s in state_batches ]) avail = torch.as_tensor(action_mask, dtype=torch.float, device=self.device) masked_q_values = q_values.clone() masked_q_values[avail == 0.0] = -float("inf") masked_q_values_folded = torch.reshape( masked_q_values, [-1] + list(masked_q_values.shape)[2:]) actions, _ = self.exploration.get_exploration_action( action_distribution=TorchCategorical(masked_q_values_folded), timestep=timestep, explore=explore) actions = torch.reshape( actions, list(masked_q_values.shape)[:-1]).cpu().numpy() hiddens = [s.cpu().numpy() for s in hiddens] return tuple(actions.transpose([1, 0])), hiddens, {} @override(Policy) def compute_log_likelihoods(self, actions, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None): obs_batch, action_mask, _ = self._unpack_observation(obs_batch) return np.zeros(obs_batch.size()[0]) @override(Policy) def learn_on_batch(self, samples): obs_batch, action_mask, env_global_state = self._unpack_observation( samples[SampleBatch.CUR_OBS]) (next_obs_batch, next_action_mask, next_env_global_state) = self._unpack_observation( samples[SampleBatch.NEXT_OBS]) group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS]) input_list = [ group_rewards, action_mask, next_action_mask, samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES], obs_batch, next_obs_batch ] if self.has_env_global_state: input_list.extend([env_global_state, next_env_global_state]) output_list, _, seq_lens = \ chop_into_sequences( episode_ids=samples[SampleBatch.EPS_ID], unroll_ids=samples[SampleBatch.UNROLL_ID], agent_indices=samples[SampleBatch.AGENT_INDEX], feature_columns=input_list, state_columns=[], # RNN states not used here max_seq_len=self.config["model"]["max_seq_len"], dynamic_max=True) # These will be padded to shape [B * T, ...] 
if self.has_env_global_state: (rew, action_mask, next_action_mask, act, dones, obs, next_obs, env_global_state, next_env_global_state) = output_list else: (rew, action_mask, next_action_mask, act, dones, obs, next_obs) = output_list B, T = len(seq_lens), max(seq_lens) def to_batches(arr, dtype): new_shape = [B, T] + list(arr.shape[1:]) return torch.as_tensor(np.reshape(arr, new_shape), dtype=dtype, device=self.device) rewards = to_batches(rew, torch.float) actions = to_batches(act, torch.long) obs = to_batches(obs, torch.float).reshape( [B, T, self.n_agents, self.obs_size]) action_mask = to_batches(action_mask, torch.float) next_obs = to_batches(next_obs, torch.float).reshape( [B, T, self.n_agents, self.obs_size]) next_action_mask = to_batches(next_action_mask, torch.float) if self.has_env_global_state: env_global_state = to_batches(env_global_state, torch.float) next_env_global_state = to_batches(next_env_global_state, torch.float) # TODO(ekl) this treats group termination as individual termination terminated = to_batches(dones, torch.float).unsqueeze(2).expand( B, T, self.n_agents) # Create mask for where index is < unpadded sequence length filled = np.reshape(np.tile(np.arange(T, dtype=np.float32), B), [B, T]) < np.expand_dims(seq_lens, 1) mask = torch.as_tensor(filled, dtype=torch.float, device=self.device).unsqueeze(2).expand( B, T, self.n_agents) # Compute loss loss_out, mask, masked_td_error, chosen_action_qvals, targets = ( self.loss(rewards, actions, terminated, mask, obs, next_obs, action_mask, next_action_mask, env_global_state, next_env_global_state)) # Optimise self.optimiser.zero_grad() loss_out.backward() grad_norm = torch.nn.utils.clip_grad_norm_( self.params, self.config["grad_norm_clipping"]) self.optimiser.step() mask_elems = mask.sum().item() stats = { "loss": loss_out.item(), "grad_norm": grad_norm if isinstance(grad_norm, float) else grad_norm.item(), "td_error_abs": masked_td_error.abs().sum().item() / mask_elems, "q_taken_mean": (chosen_action_qvals * mask).sum().item() / mask_elems, "target_mean": (targets * mask).sum().item() / mask_elems, } return {LEARNER_STATS_KEY: stats} @override(Policy) def get_initial_state(self): # initial RNN state return [ s.expand([self.n_agents, -1]).cpu().numpy() for s in self.model.get_initial_state() ] @override(Policy) def get_weights(self): return { "model": self._cpu_dict(self.model.state_dict()), "target_model": self._cpu_dict(self.target_model.state_dict()), "mixer": self._cpu_dict(self.mixer.state_dict()) if self.mixer else None, "target_mixer": self._cpu_dict(self.target_mixer.state_dict()) if self.mixer else None, } @override(Policy) def set_weights(self, weights): self.model.load_state_dict(self._device_dict(weights["model"])) self.target_model.load_state_dict( self._device_dict(weights["target_model"])) if weights["mixer"] is not None: self.mixer.load_state_dict(self._device_dict(weights["mixer"])) self.target_mixer.load_state_dict( self._device_dict(weights["target_mixer"])) @override(Policy) def get_state(self): state = self.get_weights() state["cur_epsilon"] = self.cur_epsilon return state @override(Policy) def set_state(self, state): self.set_weights(state) self.set_epsilon(state["cur_epsilon"]) def update_target(self): self.target_model.load_state_dict(self.model.state_dict()) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) logger.debug("Updated target networks") def set_epsilon(self, epsilon): self.cur_epsilon = epsilon def _get_group_rewards(self, info_batch): group_rewards = 
np.array([ info.get(GROUP_REWARDS, [0.0] * self.n_agents) for info in info_batch ]) return group_rewards def _device_dict(self, state_dict): return { k: torch.as_tensor(v, device=self.device) for k, v in state_dict.items() } @staticmethod def _cpu_dict(state_dict): return {k: v.cpu().detach().numpy() for k, v in state_dict.items()} def _unpack_observation(self, obs_batch): """Unpacks the observation, action mask, and state (if present) from agent grouping. Returns: obs (np.ndarray): obs tensor of shape [B, n_agents, obs_size] mask (np.ndarray): action mask, if any state (np.ndarray or None): state tensor of shape [B, state_size] or None if it is not in the batch """ unpacked = _unpack_obs(np.array(obs_batch, dtype=np.float32), self.observation_space.original_space, tensorlib=np) if isinstance(unpacked[0], dict): assert "obs" in unpacked[0] unpacked_obs = [ np.concatenate(tree.flatten(u["obs"]), 1) for u in unpacked ] else: unpacked_obs = unpacked obs = np.concatenate(unpacked_obs, axis=1).reshape( [len(obs_batch), self.n_agents, self.obs_size]) if self.has_action_mask: action_mask = np.concatenate([o["action_mask"] for o in unpacked], axis=1).reshape([ len(obs_batch), self.n_agents, self.n_actions ]) else: action_mask = np.ones( [len(obs_batch), self.n_agents, self.n_actions], dtype=np.float32) if self.has_env_global_state: state = np.concatenate(tree.flatten(unpacked[0][ENV_STATE]), 1) else: state = None return obs, action_mask, state
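# Hedged sketch of the grouped observation/action spaces the QMixTorchPolicy
# docstring asks for: a Tuple over agents, each agent observing a Dict with an
# "obs" entry and an optional "action_mask". Environment names and sizes are
# made up for illustration; the with_agent_groups call is shown only as a comment.
import numpy as np
from gym.spaces import Box, Dict, Discrete, Tuple

n_agents, n_actions, obs_dim = 2, 5, 10

agent_obs_space = Dict({
    "obs": Box(-np.inf, np.inf, (obs_dim,), dtype=np.float32),
    "action_mask": Box(0.0, 1.0, (n_actions,), dtype=np.float32),  # per-agent mask
})

# After MultiAgentEnv.with_agent_groups(), the policy sees one Tuple space per group.
grouped_obs_space = Tuple([agent_obs_space] * n_agents)
grouped_act_space = Tuple([Discrete(n_actions)] * n_agents)

# e.g. env = MyMultiAgentEnv().with_agent_groups(
#          {"group_1": ["agent_0", "agent_1"]},
#          obs_space=grouped_obs_space, act_space=grouped_act_space)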
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} # TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): pass # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: self.save_model('model.pt') if total_steps >= self.max_steps: break def save_model(self, filename): torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): self.model = torch.load(path) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, self.hidden_size).to(self.device) def make_action(self, observation, test=False): # TODO: Use you model to choose an action return action
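# Hedged sketch of the return computation that the TODO in AgentMario._update()
# describes: discounted returns R_t = r_t + gamma * R_{t+1}, bootstrapped from the
# critic's value of the last observation, with masks zeroing the bootstrap across
# episode boundaries. The function name and argument layout are assumptions.
import torch


def compute_returns(rewards, masks, last_value, gamma):
    # rewards, masks: [n_steps, n_processes, 1]; last_value: [n_processes, 1]
    returns = torch.zeros(rewards.size(0) + 1, *rewards.shape[1:],
                          device=rewards.device)
    returns[-1] = last_value
    for t in reversed(range(rewards.size(0))):
        # R_t = r_t + gamma * R_{t+1}; mask[t] == 0 cuts the return at episode ends
        returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t]
    return returns[:-1]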
class QLearner: def __init__(self, mac, args): self.args = args self.method = args.method if "aiqmix" in self.method: self.imaginary_lambda = args.imaginary_lambda self.mac = mac self.mixer = Mixer(args) # target networks self.target_mac = copy.deepcopy(mac) self.target_mixer = copy.deepcopy(self.mixer) self.disable_gradient(self.target_mac) self.disable_gradient(self.target_mixer) self.modules = [ self.mac, self.mixer, self.target_mac, self.target_mixer ] self.params = list(self.mac.parameters()) + list( self.mixer.parameters()) self.optimizer = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) self.n_params = sum(p.numel() for p in self.mac.parameters() if p.requires_grad) + \ sum(p.numel() for p in self.mixer.parameters() if p.requires_grad) if args.has_coach: self.coach = Coach(args) self.target_coach = copy.deepcopy(self.coach) self.disable_gradient(self.target_coach) self.modules.append(self.coach) self.modules.append(self.target_coach) self.n_params += sum(p.numel() for p in self.coach.parameters() if p.requires_grad) coach_params = list(self.coach.parameters()) if "vi1" in self.method: self.vi1 = VI1(args) self.modules.append(self.vi1) coach_params += list(self.vi1.parameters()) if "vi2" in self.method: self.vi2 = VI2(args) self.modules.append(self.vi2) coach_params += list(self.vi2.parameters()) self.coach_params = coach_params self.coach_optimizer = RMSprop(coach_params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) print(f"[info] Total number of params: {self.n_params}") self.buffer = ReplayBuffer(args.buffer_size) self.t = 0 def disable_gradient(self, module): module.eval() for p in module.parameters(): p.requires_grad = False def tensorize(self, args): o, e, c, m, ms, a, r = args device = self.args.device o = torch.Tensor(o).to(device) # [batch, t, n_agents, observation_dim] e = torch.Tensor(e).to(device) # [batch, t, n_others, entity_dim] c = torch.Tensor(c).to(device) # [batch, t, n_agents, attribute_dim] m = torch.Tensor(m).to(device) # [batch, t, n_agents, n_all] ms = torch.Tensor(ms).to( device) # [batch, t, n_agents, n_all] full observation a = torch.LongTensor(a).to(device) # [batch, t, n_agents] r = torch.Tensor(r).to(device) # [batch, t,] mask = ms.sum(-1, keepdims=True).gt(0).float() o = mask * o a = mask.long().squeeze(-1) * a c = mask * (c - 0.5) return o, e, c, m, ms, a, r def update(self, logger, step): if len(self.buffer) < self.args.batch_size: return self.t += 1 o, e, c, m, ms, a, r = self.tensorize( self.buffer.sample(self.args.batch_size)) T = o.shape[1] - 1 # since we have T+1 steps 0, 1, ..., T if self.args.has_coach: # get the z_team_t0 training_team_strategy = self.mac.z_team.clone( ) # save previous team strategy z_t0, mu_t0, logvar_t0 = self.coach(o[:, 0], e[:, 0], c[:, 0], ms[:, 0]) z_t0_target, _, _ = self.target_coach(o[:, 0], e[:, 0], c[:, 0], ms[:, 0]) z_T_target, _, _ = self.target_coach(o[:, T], e[:, T], c[:, T], ms[:, T]) self.mac.set_team_strategy(z_t0) self.target_mac.set_team_strategy(z_t0_target) rnn_hidden = self.mac.init_hidden(o.shape[0], o.shape[2]) # [batch, n_agents, dh] Q = [] H_mixer = [] for t in range(T): prev_a = torch.zeros_like(a[:, 0]) if t == 0 else a[:, t - 1] qa, h, h_full, rnn_hidden = self.mac(o[:, t], e[:, t], c[:, t], m[:, t], ms[:, t], rnn_hidden, prev_a, a[:, t]) if self.args.has_coach: coach_h = self.coach.encode(o[:, t], e[:, t], c[:, t], ms[:, t]) q = self.mixer.coach_forward(coach_h, qa, ms[:, t]) else: q = self.mixer(h_full, qa, ms[:, t]) H_mixer.append(h_full) 
Q.append(q.unsqueeze(-1)) Q = torch.cat(Q, -1) # [batch, T] with torch.no_grad(): NQ = [] NQ_ = [] rnn_hidden = self.mac.init_hidden( o.shape[0], o.shape[2]) # [batch, n_agents, dh] for t in range(T + 1): if t == T and self.args.has_coach: # update strategy for last step self.target_mac.set_team_strategy(z_T_target) prev_a = torch.zeros_like(a[:, 0]) if t == 0 else a[:, t - 1] qa, h, h_full, rnn_hidden = self.target_mac( o[:, t], e[:, t], c[:, t], m[:, t], ms[:, t], rnn_hidden, prev_a) qa = qa.max(-1)[0] if self.args.has_coach: coach_h = self.target_coach.encode(o[:, t], e[:, t], c[:, t], ms[:, t]) nq = self.target_mixer.coach_forward(coach_h, qa, ms[:, t]) else: nq = self.target_mixer(h_full, qa, ms[:, t]) NQ.append(nq.unsqueeze(-1)) NQ = torch.cat(NQ, -1)[:, 1:] # [batch, T] #if self.args.has_coach: # NQ_ = torch.cat(NQ_, -1)[:,1:] # [batch, T] ###################################################################### # 1a. Bellman error ###################################################################### td_target = r[:, :-1] + self.args.gamma * NQ td_error = F.mse_loss(Q, td_target) #if self.args.has_coach: # td_error = td_error * 0.5 + \ # 0.5 * F.mse_loss(Q_, r[:,:-1] + self.args.gamma * NQ_) ###################################################################### # 1b. Imaginary Bellman error ###################################################################### if "aiqmix" in self.method: rnn_hidden = self.mac.init_hidden(o.shape[0] * 2, o.shape[2]) im_Q = [] for t in range(T): prev_a = torch.zeros_like(a[:, 0]) if t == 0 else a[:, t - 1] im_qa, im_h, im_h_full, rnn_hidden = self.mac.im_forward( o[:, t], e[:, t], c[:, t], m[:, t], ms[:, t], rnn_hidden, prev_a, a[:, t]) h_mixer = im_h_full im_qa = self.mixer.im_forward(h_mixer, H_mixer[t], im_qa, ms[:, t]) im_Q.append(im_qa.unsqueeze(-1)) im_Q = torch.cat(im_Q, -1) im_td_error = F.mse_loss(im_Q, td_target) td_error = (1-self.imaginary_lambda) * td_error + \ self.imaginary_lambda * im_td_error ###################################################################### # 2. ELBO ###################################################################### elbo = 0. 
if self.args.has_coach: if "vi1" in self.method: vi1_loss = self.vi1(o[:, 0], c[:, 0], ms[:, 0], z_t0) elbo += vi1_loss * self.args.lambda1 if "vi2" in self.method: vi2_loss = self.vi2(o, e, c, m, ms[:, 0], a, z_t0) p_ = D.normal.Normal(mu_t0, (0.5 * logvar_t0).exp()) entropy = p_.entropy().clamp_(0, 10).mean() elbo += vi2_loss * self.args.lambda2 - entropy * self.args.lambda2 / 10 if "vi3" in self.method: p_ = D.normal.Normal(mu_t0, (0.5 * logvar_t0).exp()) q_ = D.normal.Normal(torch.zeros_like(mu_t0), torch.ones_like(logvar_t0)) vi3_loss = D.kl_divergence(p_, q_).clamp_(0, 10).mean() elbo += vi3_loss * self.args.lambda3 #print(f"td {td_error.item():.4f} l2 {vi2_loss.item():.4f}") #print(f"td {td_error.item():.4f} ent {entropy.item():.4f} l2 {vi2_loss.item():.4f}") self.optimizer.zero_grad() if self.args.has_coach: self.coach_optimizer.zero_grad() (td_error + elbo).backward() grad_norm = torch.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) if self.args.has_coach: coach_grad_norm = torch.nn.utils.clip_grad_norm_( self.coach_params, self.args.grad_norm_clip) self.optimizer.step() if self.args.has_coach: self.coach_optimizer.step() # set back team strategy for rollout self.mac.set_team_strategy(training_team_strategy) # update target once in a while if self.t % self.args.update_target_every == 0: self._update_targets() if "aiqmix" in self.method: logger.add_scalar("im_q_loss", im_td_error.cpu().item(), step) if "vi1" in self.method: logger.add_scalar("vi1", vi1_loss.item(), step) if "vi2" in self.method: logger.add_scalar("vi2", vi2_loss.item(), step) logger.add_scalar("q_loss", td_error.cpu().item(), step) logger.add_scalar("grad_norm", grad_norm.item(), step) def save_models(self, path): torch.save(self.mac.state_dict(), "{}/mac.th".format(path)) torch.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) torch.save(self.optimizer.state_dict(), "{}/opt.th".format(path)) if self.args.has_coach: torch.save(self.coach.state_dict(), "{}/coach.th".format(path)) torch.save(self.coach_optimizer.state_dict(), "{}/coach_opt.th".format(path)) if "vi1" in self.method: torch.save(self.vi1.state_dict(), "{}/vi1.th".format(path)) if "vi2" in self.method: torch.save(self.vi2.state_dict(), "{}/vi2.th".format(path)) def load_models(self, path): self.mac.load_state_dict(torch.load("{}/mac.th".format(path))) self.mixer.load_state_dict(torch.load("{}/mixer.th".format(path))) self.optimizer.load_state_dict(torch.load("{}/opt.th".format(path))) if self.args.has_coach: self.coach.load_state_dict(torch.load("{}/coach.th".format(path))) self.coach_optimizer.load_state_dict( torch.load("{}/coach_opt.th".format(path))) if "vi1" in self.method: self.vi1.load_state_dict(torch.load("{}/vi1.th".format(path))) if "vi2" in self.method: self.vi2.load_state_dict(torch.load("{}/vi2.th".format(path))) self.target_mac = copy.deepcopy(self.mac) self.target_mixer = copy.deepcopy(self.mixer) self.disable_gradient(self.target_mac) self.disable_gradient(self.target_mixer) if self.args.has_coach: self.target_coach = copy.deepcopy(self.coach) self.disable_gradient(self.target_coach) def _update_targets(self): self.target_mac.load_state_dict(self.mac.state_dict()) self.target_mixer.load_state_dict(self.mixer.state_dict()) if self.args.has_coach: self.target_coach.load_state_dict(self.coach.state_dict()) return def cuda(self): for m in self.modules: m.cuda() def cpu(self): for m in self.modules: m.cpu()
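# Hedged sketch (not the repo's code): how QLearner.update() above assembles its final
# objective. Only the weighting scheme mirrors the code; tensor names, shapes and the toy
# usage below are illustrative assumptions.
import torch
import torch.nn.functional as F

def qlearner_objective(Q, im_Q, td_target, imaginary_lambda=0.5, elbo=0.0):
    """(1 - lambda) * TD error + lambda * imaginary-rollout TD error, plus the coach ELBO."""
    td_error = F.mse_loss(Q, td_target)
    im_td_error = F.mse_loss(im_Q, td_target)  # imaginary rollouts regress to the same target
    return (1 - imaginary_lambda) * td_error + imaginary_lambda * im_td_error + elbo

if __name__ == "__main__":
    # toy check with random tensors
    B, T = 4, 10
    Q, im_Q, target = torch.randn(B, T), torch.randn(B, T), torch.randn(B, T)
    print(qlearner_objective(Q, im_Q, target, imaginary_lambda=0.3).item())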
class Learner(object): def __init__(self, args, q_batch, actor_critic): self.args = args self.q_batch = q_batch self.actor_critic = actor_critic self.optimizer = RMSprop(self.actor_critic.parameters(), lr=args.lr) self.actor_critic.share_memory() def learning(self): writer = SummaryWriter(log_dir=self.args.result_dir) torch.manual_seed(self.args.seed) coef_hat = torch.Tensor([[self.args.coef_hat]]).to(device) rho_hat = torch.Tensor([[self.args.rho_hat]]).to(device) i = 0 while True: values, coef, rho, entropies, log_prob = [], [], [], [], [] obs, actions, rewards, log_probs, masks, mu_logits, action_onehot = self.q_batch.get( block=True) #print('Get batch: obs: {}, action: {}, reward: {}, prob: {}'.format(obs.shape, actions.shape, rewards.shape, probs.shape)) obs_shape = obs.shape[3:] recurrent_hidden_states = torch.zeros( (self.args.batch_size, self.actor_critic.recurrent_hidden_state_size), device=device) for step in range(obs.size(1)): if step >= actions.size( 1 ): # noted that s[, n_step+1, ...] but a[, n_step,...] value = self.actor_critic.get_value( obs[:, step], recurrent_hidden_states, masks[:, step]) values.append(value) break value, action_log_prob, logits, recurrent_hidden_states = self.actor_critic.evaluate_actions( obs[:, step], recurrent_hidden_states, masks[:, step], actions[:, step]) values.append(value) #logit_a = action_onehot[:, step] * logits.detach() + (1-action_onehot[:, step]) * (1-logits.detach()) #logit_a = logit_a.detach() #prob_a = action_onehot[:, step] * mu_logits[:, step] + (1-action_onehot[:, step]) * (1-mu_logits[:, step]) #print(torch.exp(action_log_prob.detach()-log_probs[:, step])) #is_rate = torch.cumprod(logit_a/(prob_a+1e-6), dim=1)[:, -1] #is_rate = torch.sum(torch.exp(logit_a - prob_a), dim=1) #print(torch.exp(-action_log_prob.detach()+log_probs[:, step])) is_rate = torch.exp(action_log_prob.detach() - log_probs[:, step]) coef.append(torch.min(coef_hat, is_rate)) rho.append(torch.min(rho_hat, is_rate)) policy = F.softmax(logits, dim=1) log_policy = F.log_softmax(logits, dim=1) entropy = torch.sum(-policy * log_policy) entropies.append(entropy) log_prob.append(action_log_prob) policy_loss = 0 baseline_loss = 0 entropy_loss = 0 vs = torch.zeros((obs.size(1), obs.size(0), 1)).to(device) """ vs: v-trace target """ for rev_step in reversed(range(obs.size(1) - 1)): # r + args * v(s+1) - V(s) #fix_vp = rewards[:, rev_step] + self.args.gamma * (values[rev_step+1]+value_loss) - values[rev_step] delta_s = rho[rev_step] * ( rewards[:, rev_step] + self.args.gamma * values[rev_step + 1] - values[rev_step]) # value_loss = v_{s} - V(x_{s}) advantages = rho[rev_step] * ( rewards[:, rev_step] + self.args.gamma * vs[rev_step + 1] - values[rev_step]) vs[rev_step] = values[ rev_step] + delta_s + self.args.gamma * coef[rev_step] * ( vs[rev_step + 1] - values[rev_step + 1]) policy_loss += log_prob[rev_step] * advantages.detach() baseline_loss = torch.sum( 0.5 * (vs[:-1].detach() - torch.stack(values[:-1]))**2) entropy_loss = self.args.entropy_coef * torch.sum( torch.stack(entropies)) policy_loss = policy_loss.sum() loss = policy_loss + self.args.value_loss_coef * baseline_loss - entropy_loss self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.args.max_grad_norm) print( "v_loss {:.3f} p_loss {:.3f} entropy_loss {:.5f} loss {:.3f}". 
format(baseline_loss.item(), policy_loss.item(), entropy_loss.item(), loss.item())) self.optimizer.step() writer.add_scalar('total_loss', float(loss.item()), i) if (i % self.args.save_interval == 0): torch.save(self.actor_critic, os.path.join(self.args.model_dir, "impala.pt")) i += 1
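# Hedged sketch (standalone, illustrative names) of the V-trace recursion that the
# reversed loop in Learner.learning() implements: is_ratio is pi/mu per step, clipped by
# rho_hat for the TD correction and by coef_hat for the trace coefficient.
import torch

def vtrace_targets(rewards, values, is_ratio, gamma=0.99, rho_hat=1.0, coef_hat=1.0):
    """rewards, is_ratio: [T, B]; values: [T+1, B] (last entry bootstraps).
    Returns v-trace targets vs [T+1, B] and policy-gradient advantages [T, B]."""
    T = rewards.shape[0]
    rho = torch.clamp(is_ratio, max=rho_hat)
    c = torch.clamp(is_ratio, max=coef_hat)
    vs = values.clone()
    for t in reversed(range(T)):
        delta = rho[t] * (rewards[t] + gamma * values[t + 1] - values[t])
        vs[t] = values[t] + delta + gamma * c[t] * (vs[t + 1] - values[t + 1])
    advantages = rho * (rewards + gamma * vs[1:] - values[:-1])
    return vs, advantages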
class NQLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.last_target_update_episode = 0 self.device = th.device('cuda' if args.use_cuda else 'cpu') self.params = list(mac.parameters()) if args.mixer == "qatten": self.mixer = QattenMixer(args) elif args.mixer == "vdn": self.mixer = VDNMixer(args) elif args.mixer == "qmix": self.mixer = Mixer(args) else: raise "mixer error" self.target_mixer = copy.deepcopy(self.mixer) self.params += list(self.mixer.parameters()) print('Mixer Size: ') print(get_parameters_num(self.mixer.parameters())) if self.args.optimizer == 'adam': self.optimiser = Adam(params=self.params, lr=args.lr) else: self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 self.train_t = 0 # th.autograd.set_detect_anomaly(True) def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values mac_out = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): agent_outs = self.mac.forward(batch, t=t) mac_out.append(agent_outs) mac_out = th.stack(mac_out, dim=1) # Concat over time # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim chosen_action_qvals_ = chosen_action_qvals # Calculate the Q-Values necessary for the target with th.no_grad(): target_mac_out = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): target_agent_outs = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out, dim=1) # Concat across time # Max over target Q-Values/ Double q learning mac_out_detach = mac_out.clone().detach() mac_out_detach[avail_actions == 0] = -9999999 cur_max_actions = mac_out_detach.max(dim=3, keepdim=True)[1] target_max_qvals = th.gather(target_mac_out, 3, cur_max_actions).squeeze(3) # Calculate n-step Q-Learning targets target_max_qvals = self.target_mixer(target_max_qvals, batch["state"]) if getattr(self.args, 'q_lambda', False): qvals = th.gather(target_mac_out, 3, batch["actions"]).squeeze(3) qvals = self.target_mixer(qvals, batch["state"]) targets = build_q_lambda_targets(rewards, terminated, mask, target_max_qvals, qvals, self.args.gamma, self.args.td_lambda) else: targets = build_td_lambda_targets(rewards, terminated, mask, target_max_qvals, self.args.n_agents, self.args.gamma, self.args.td_lambda) # Mixer chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) td_error = (chosen_action_qvals - targets.detach()) td_error = 0.5 * td_error.pow(2) mask = mask.expand_as(td_error) masked_td_error = td_error * mask loss = L_td = masked_td_error.sum() / mask.sum() # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) self.optimiser.step() if (episode_num - self.last_target_update_episode ) / 
self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("loss_td", L_td.item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("q_taken_mean", (chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("target_mean", (targets * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.log_stats_t = t_env # print estimated matrix if self.args.env == "one_step_matrix_game": print_matrix_status(batch, self.mixer, mac_out) def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
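# Hedged sketch of the TD(lambda) backward recursion that build_td_lambda_targets is
# expected to perform in NQLearner.train(). Shapes and masking follow the common PyMARL
# convention but are assumptions here, not the helper's exact code.
import torch

def td_lambda_targets(rewards, terminated, mask, target_qs, gamma, td_lambda):
    """rewards/terminated/mask: [B, T-1, 1]; target_qs: [B, T, 1]. Returns targets [B, T-1, 1]."""
    ret = target_qs.new_zeros(*target_qs.shape)
    ret[:, -1] = target_qs[:, -1] * (1 - terminated.sum(dim=1))  # bootstrap only if never terminated
    for t in range(ret.shape[1] - 2, -1, -1):
        ret[:, t] = td_lambda * gamma * ret[:, t + 1] + mask[:, t] * (
            rewards[:, t] + (1 - td_lambda) * gamma * target_qs[:, t + 1] * (1 - terminated[:, t]))
    return ret[:, :-1]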
class DDQN_Agent: def __init__(self, args, exp_model, logging_func): self.args = args # Exploration Model self.exp_model = exp_model self.log = logging_func["log"] # Experience Replay if self.args.set_replay: self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False) else: self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized) # DQN and Target DQN model = get_models(args.model) self.dqn = model(actions=args.actions) self.target_dqn = model(actions=args.actions) dqn_params = 0 for weight in self.dqn.parameters(): weight_params = 1 for s in weight.size(): weight_params *= s dqn_params += weight_params print("DQN has {:,} parameters.".format(dqn_params)) self.target_dqn.eval() if args.gpu: print("Moving models to GPU.") self.dqn.cuda() self.target_dqn.cuda() # Optimizer # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr) self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr) self.T = 0 self.target_sync_T = -self.args.t_max def sync_target_network(self): for target, source in zip(self.target_dqn.parameters(), self.dqn.parameters()): target.data = source.data def act(self, state, epsilon, exp_model, evaluation=False): # self.T += 1 self.dqn.eval() orig_state = state[:, :, -1:] state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0) q_values = self.dqn(Variable(state, volatile=True)).cpu().data[0] q_values_numpy = q_values.numpy() extra_info = {} if self.args.optimistic_init and not evaluation: q_values_pre_bonus = np.copy(q_values_numpy) if not self.args.ucb: for a in range(self.args.actions): _, info = exp_model.bonus(orig_state, a, dont_remember=True) action_pseudo_count = info["Pseudo_Count"] # TODO: Log the optimism bonuses optimism_bonus = self.args.optimistic_scaler / np.power(action_pseudo_count + 0.01, self.args.bandit_p) if self.args.tb and self.T % self.args.tb_interval == 0: self.log("Bandit/Action_{}".format(a), optimism_bonus, step=self.T) q_values[a] += optimism_bonus else: action_counts = [] for a in range(self.args.actions): _, info = exp_model.bonus(orig_state, a, dont_remember=True) action_pseudo_count = info["Pseudo_Count"] action_counts.append(action_pseudo_count) total_count = sum(action_counts) for ai, a in enumerate(action_counts): # TODO: Log the optimism bonuses optimisim_bonus = self.args.optimistic_scaler * np.sqrt(2 * np.log(max(1, total_count)) / (a + 0.01)) self.log("Bandit/UCB/Action_{}".format(ai), optimisim_bonus, step=self.T) q_values[ai] += optimisim_bonus extra_info["Action_Bonus"] = q_values_numpy - q_values_pre_bonus extra_info["Q_Values"] = q_values_numpy if np.random.random() < epsilon: action = np.random.randint(low=0, high=self.args.actions) else: action = q_values.max(0)[1][0] # Torch... 
extra_info["Action"] = action return action, extra_info def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1, exploring=False): if not exploring: self.T += 1 self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward, density) def end_of_trajectory(self): self.replay.end_of_trajectory() def train(self): if self.T - self.target_sync_T > self.args.target: self.sync_target_network() self.target_sync_T = self.T info = {} for _ in range(self.args.iters): self.dqn.eval() # TODO: Use a named tuple for experience replay n_step_sample = 1 if np.random.random() < self.args.n_step_mixing: n_step_sample = self.args.n_step batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma) columns = list(zip(*batch)) states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3)) actions = Variable(torch.LongTensor(columns[1])) terminal_states = Variable(torch.FloatTensor(columns[5])) rewards = Variable(torch.FloatTensor(columns[2])) # Have to clip rewards for DQN rewards = torch.clamp(rewards, -1, 1) steps = Variable(torch.FloatTensor(columns[4])) new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3)) target_dqn_qvals = self.target_dqn(new_states).cpu() # Make a new variable with those values so that these are treated as constants target_dqn_qvals_data = Variable(target_dqn_qvals.data) q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states) inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma) # print(steps) q_value_targets = q_value_targets * torch.pow(inter, steps) if self.args.double: # Double Q Learning new_states_qvals = self.dqn(new_states).cpu() new_states_qvals_data = Variable(new_states_qvals.data) q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1]) else: q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0] q_value_targets = q_value_targets + rewards self.dqn.train() if self.args.gpu: actions = actions.cuda() q_value_targets = q_value_targets.cuda() model_predictions = self.dqn(states).gather(1, actions.view(-1, 1)) # info = {} td_error = model_predictions - q_value_targets info["TD_Error"] = td_error.mean().data[0] # Update the priorities if not self.args.density_priority: self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority) # If using prioritised we need to weight the td_error if self.args.prioritized and self.args.prioritized_is: # print(td_error) weights_tensor = torch.from_numpy(is_weights).float() weights_tensor = Variable(weights_tensor) if self.args.gpu: weights_tensor = weights_tensor.cuda() # print(weights_tensor) td_error = td_error * weights_tensor l2_loss = (td_error).pow(2).mean() info["Loss"] = l2_loss.data[0] # Update self.optimizer.zero_grad() l2_loss.backward() # Taken from pytorch clip_grad_norm # Remove once the pip version it up to date with source gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value) if gradient_norm is not None: info["Norm"] = gradient_norm self.optimizer.step() if "States" in info: states_trained = info["States"] info["States"] = states_trained + columns[0] else: info["States"] = columns[0] # Pad out the states to be of size batch_size if len(info["States"]) < self.args.batch_size: old_states = info["States"] new_states = old_states[0] * (self.args.batch_size - len(old_states)) info["States"] = 
new_states return info
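# Hedged sketch (assumed tensor names) of the double-Q target that DDQN_Agent.train()
# builds when args.double is set: the online network picks the argmax action and the
# target network values it; the agent additionally discounts by gamma**steps for its
# n-step samples.
import torch

def double_q_targets(rewards, terminal, steps, q_online_next, q_target_next, gamma):
    """Returns the regression target for Q(s, a) for a batch of transitions."""
    best_actions = q_online_next.max(dim=1, keepdim=True)[1]      # argmax from the online net
    bootstrap = q_target_next.gather(1, best_actions).squeeze(1)  # value from the target net
    discount = (1 - terminal) * gamma ** steps                    # zero out if the episode ended
    return rewards + discount * bootstrap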
class CateQLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.params = list(mac.parameters()) self.last_target_update_episode = 0 self.mixer = None if args.mixer is not None: if args.mixer == "vdn": self.mixer = VDNMixer() elif args.mixer == "qmix": self.mixer = QMixer(args) else: raise ValueError("Mixer {} not recognised.".format(args.mixer)) self.params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 self.s_mu = th.zeros(1) self.s_sigma = th.ones(1) def get_comm_beta(self, t_env): comm_beta = self.args.comm_beta if self.args.is_comm_beta_decay and t_env > self.args.comm_beta_start_decay: comm_beta += 1. * (self.args.comm_beta_target - self.args.comm_beta) / \ (self.args.comm_beta_end_decay - self.args.comm_beta_start_decay) * \ (t_env - self.args.comm_beta_start_decay) return comm_beta def get_comm_entropy_beta(self, t_env): comm_entropy_beta = self.args.comm_entropy_beta if self.args.is_comm_entropy_beta_decay and t_env > self.args.comm_entropy_beta_start_decay: comm_entropy_beta += 1. * (self.args.comm_entropy_beta_target - self.args.comm_entropy_beta) / \ (self.args.comm_entropy_beta_end_decay - self.args.comm_entropy_beta_start_decay) * \ (t_env - self.args.comm_entropy_beta_start_decay) return comm_entropy_beta def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values # shape = (bs, self.n_agents, -1) mac_out = [] mu_out = [] sigma_out = [] logits_out = [] m_sample_out = [] g_out = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): if self.args.comm and self.args.use_IB: agent_outs, (mu, sigma), logits, m_sample = self.mac.forward(batch, t=t) mu_out.append(mu) sigma_out.append(sigma) logits_out.append(logits) m_sample_out.append(m_sample) else: agent_outs = self.mac.forward(batch, t=t) mac_out.append(agent_outs) mac_out = th.stack(mac_out, dim=1) # Concat over time if self.args.use_IB: mu_out = th.stack(mu_out, dim=1)[:, :-1] # Concat over time sigma_out = th.stack(sigma_out, dim=1)[:, :-1] # Concat over time logits_out = th.stack(logits_out, dim=1)[:, :-1] m_sample_out = th.stack(m_sample_out, dim=1)[:, :-1] # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim # I believe that code up to here is right... # Q values are right, the main issue is to calculate loss for message... 
# Calculate the Q-Values necessary for the target target_mac_out = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): if self.args.comm and self.args.use_IB: target_agent_outs, (target_mu, target_sigma), target_logits, target_m_sample = \ self.target_mac.forward(batch, t=t) else: target_agent_outs = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) # label label_target_max_out = th.stack(target_mac_out[:-1], dim=1) label_target_max_out[avail_actions[:, :-1] == 0] = -9999999 label_target_actions = label_target_max_out.max(dim=3, keepdim=True)[1] # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out[1:], dim=1) # Concat across time # Mask out unavailable actions target_mac_out[avail_actions[:, 1:] == 0] = -9999999 # Max over target Q-Values if self.args.double_q: # Get actions that maximise live Q (for double q-learning) mac_out[avail_actions == 0] = -9999999 cur_max_actions = mac_out[:, 1:].max(dim=3, keepdim=True)[1] target_max_qvals = th.gather(target_mac_out, 3, cur_max_actions).squeeze(3) else: target_max_qvals = target_mac_out.max(dim=3)[0] # Mix if self.mixer is not None: chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) target_max_qvals = self.target_mixer(target_max_qvals, batch["state"][:, 1:]) # Calculate 1-step Q-Learning targets targets = rewards + self.args.gamma * (1 - terminated) * target_max_qvals # Td-error td_error = (chosen_action_qvals - targets.detach()) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask.sum() if self.args.only_downstream or not self.args.use_IB: expressiveness_loss = th.Tensor([0.]) compactness_loss = th.Tensor([0.]) entropy_loss = th.Tensor([0.]) comm_loss = th.Tensor([0.]) comm_beta = th.Tensor([0.]) comm_entropy_beta = th.Tensor([0.]) else: # ### Optimize message # Message are controlled only by expressiveness and compactness loss. 
# Compute cross entropy with target q values of the same time step expressiveness_loss = 0 label_prob = th.gather(logits_out, 3, label_target_actions).squeeze(3) expressiveness_loss += ( -th.log(label_prob + 1e-6)).sum() / mask.sum() # Compute KL divergence compactness_loss = D.kl_divergence(D.Normal(mu_out, sigma_out), D.Normal(self.s_mu, self.s_sigma)).sum() / \ mask.sum() # Entropy loss entropy_loss = -D.Normal(self.s_mu, self.s_sigma).log_prob( m_sample_out).sum() / mask.sum() # Gate loss gate_loss = 0 # Total loss comm_beta = self.get_comm_beta(t_env) comm_entropy_beta = self.get_comm_entropy_beta(t_env) comm_loss = expressiveness_loss + comm_beta * compactness_loss + comm_entropy_beta * entropy_loss comm_loss *= self.args.c_beta loss += comm_loss comm_beta = th.Tensor([comm_beta]) comm_entropy_beta = th.Tensor([comm_entropy_beta]) # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) self.optimiser.step() # Update target if (episode_num - self.last_target_update_episode ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("loss", loss.item(), t_env) self.logger.log_stat("comm_loss", comm_loss.item(), t_env) self.logger.log_stat("exp_loss", expressiveness_loss.item(), t_env) self.logger.log_stat("comp_loss", compactness_loss.item(), t_env) self.logger.log_stat("comm_beta", comm_beta.item(), t_env) self.logger.log_stat("entropy_loss", entropy_loss.item(), t_env) self.logger.log_stat("comm_beta", comm_beta.item(), t_env) self.logger.log_stat("comm_entropy_beta", comm_entropy_beta.item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("q_taken_mean", (chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("target_mean", (targets * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.log_stats_t = t_env def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) # self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() self.s_mu = self.s_mu.cuda() self.s_sigma = self.s_sigma.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
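# Hedged sketch (illustrative shapes/names) of the information-bottleneck message loss
# that CateQLearner adds to its TD loss: a cross-entropy "expressiveness" term towards
# the target network's greedy actions plus a beta-weighted KL "compactness" term. Here
# message_probs stands for the per-action probabilities decoded from the message, which
# is how the gathered logits_out are used above.
import torch
import torch.distributions as D

def message_ib_loss(message_probs, label_actions, mu, sigma, mask_sum, comm_beta):
    """message_probs: [..., n_actions]; label_actions: [..., 1] long; mu/sigma: message posterior."""
    label_prob = torch.gather(message_probs, -1, label_actions).squeeze(-1)
    expressiveness = (-torch.log(label_prob + 1e-6)).sum() / mask_sum
    compactness = D.kl_divergence(
        D.Normal(mu, sigma),
        D.Normal(torch.zeros_like(mu), torch.ones_like(sigma))).sum() / mask_sum
    return expressiveness + comm_beta * compactness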
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() #### def calc_actual_state_values(self, rewards, dones): R = [] rewards.reverse() # If we happen to end the set on a terminal state, set next return to zero if dones[-1] == True: next_return = 0 # If not terminal state, bootstrap v(s) using our critic # TODO: don't need to estimate again, just take from last value of v(s) estimates else: s = torch.from_numpy(self.rollouts.obs[-1]).float().unsqueeze( 0) #states next_return = self.model.get_state_value(Variable(s)).data[0][0] # Backup from last state to calculate "true" returns for each state in the set R.append(next_return) dones.reverse() for r in range(1, len(rewards)): if not dones[r]: this_return = rewards[r] + next_return * self.gamma else: this_return = 0 R.append(this_return) next_return = this_return R.reverse() state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1) return state_values_true #### def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} state_values_true = self.calc_actual_state_values( self.rollouts.rewards, self.rollouts.dones ) #(rewards, dones)#from storage: obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()); obs =state? 
# TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) s = Variable(torch.FloatTensor(self.rollouts.obs)) action_probs, state_values_est, hiddens = self.model( s) #action_probs, state_values_est action_log_probs = action_probs.log() a = Variable(torch.LongTensor(self.rollouts.actions).view(-1, 1)) chosen_action_log_probs = action_log_probs.gather(1, a) # This is also the TD error advantages = state_values_true - state_values_est entropy = (action_probs * action_log_probs).sum(1).mean() action_loss = (chosen_action_log_probs * advantages).mean() value_loss = advantages.pow(2).mean() loss = value_loss + action_loss - 0.0001 * entropy #entropy_weight = 0.0001 # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) RolloutStorage.reset() ## return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): pass # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical actions, values, hiddens = self.make_action(obs, hiddens, masks) #print("##################################*****************",actions.cpu().numpy(),type(actions.cpu().numpy()),actions.cpu().numpy().shape) #print("##################################*****************",actions.max(1)[0].item()) obs, rewards, dones, infos = self.envs.step( actions.max(1)[0]) #.numpy().max(0)[0].item()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) self.rollouts.to(device) masks = 1 - dones self.rollouts.insert(obs, hiddens, actions, values, rewards, masks) self.rollouts.to(device) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) #torch.Size([16, 4, 84, 84]) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): print("# ******************step***********************", step) #print("self.rollouts.actions[step]", self.rollouts.actions[step]) # print("self.rollouts.obs[step]", self.rollouts.hiddens[step]) # print("self.rollouts.obs[step]", self.rollouts.masks[step]) self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: self.save_model('model.pt') if total_steps >= self.max_steps: break def save_model(self, filename): torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): self.model = torch.load(path) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, 
self.hidden_size).to(self.device) def make_action(self, observation, hiddens, masks, test=False): # TODO: Use your model to choose an action # if test == True: # observation = torch.from_numpy(observation).permute(2, 0, 1).unsqueeze(0).to(device) # print("!!!!!!!!!!!!!!",observation.shape) # state = torch.from_numpy(observation).float().unsqueeze(0) values, action_probs, hiddens = self.model(observation, hiddens, masks) # m = Categorical(action_probs) # action = m.sample() # #self.saved_actions.append(m.log_prob(action)) return action_probs, values, hiddens
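# Hedged sketch (standalone, assumed names) of the discounted-return backup that
# calc_actual_state_values() performs: bootstrap from the critic when the rollout does
# not end in a terminal state, then sweep backwards with R_t = r_t + gamma * R_{t+1}.
# This is the standard form; the class above instead zeroes the return at terminal steps.
import torch

def discounted_returns(rewards, dones, bootstrap_value, gamma=0.9):
    """rewards, dones: Python lists over time; bootstrap_value: V(s_T) from the critic."""
    R = 0.0 if dones[-1] else bootstrap_value
    returns = []
    for r, d in zip(reversed(rewards), reversed(dones)):
        R = r + gamma * R * (1.0 - float(d))
        returns.append(R)
    returns.reverse()
    return torch.tensor(returns).unsqueeze(1)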
class QMixPolicyGraph(PolicyGraph): """QMix impl. Assumes homogeneous agents for now. You must use MultiAgentEnv.with_agent_groups() to group agents together for QMix. This creates the proper Tuple obs/action spaces and populates the '_group_rewards' info field. Action masking: to specify an action mask for individual agents, use a dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}. The mask space must be `Box(0, 1, (n_actions,))`. """ def __init__(self, obs_space, action_space, config): _validate(obs_space, action_space) config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config) self.config = config self.observation_space = obs_space self.action_space = action_space self.n_agents = len(obs_space.original_space.spaces) self.n_actions = action_space.spaces[0].n self.h_size = config["model"]["lstm_cell_size"] agent_obs_space = obs_space.original_space.spaces[0] if isinstance(agent_obs_space, Dict): space_keys = set(agent_obs_space.spaces.keys()) if space_keys != {"obs", "action_mask"}: raise ValueError( "Dict obs space for agent must have keyset " "['obs', 'action_mask'], got {}".format(space_keys)) mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape) if mask_shape != (self.n_actions, ): raise ValueError("Action mask shape must be {}, got {}".format( (self.n_actions, ), mask_shape)) self.has_action_mask = True self.obs_size = _get_size(agent_obs_space.spaces["obs"]) # The real agent obs space is nested inside the dict agent_obs_space = agent_obs_space.spaces["obs"] else: self.has_action_mask = False self.obs_size = _get_size(agent_obs_space) self.model = ModelCatalog.get_torch_model( agent_obs_space, self.n_actions, config["model"], default_model_cls=RNNModel) self.target_model = ModelCatalog.get_torch_model( agent_obs_space, self.n_actions, config["model"], default_model_cls=RNNModel) # Setup the mixer network. # The global state is just the stacked agent observations for now. 
self.state_shape = [self.obs_size, self.n_agents] if config["mixer"] is None: self.mixer = None self.target_mixer = None elif config["mixer"] == "qmix": self.mixer = QMixer(self.n_agents, self.state_shape, config["mixing_embed_dim"]) self.target_mixer = QMixer(self.n_agents, self.state_shape, config["mixing_embed_dim"]) elif config["mixer"] == "vdn": self.mixer = VDNMixer() self.target_mixer = VDNMixer() else: raise ValueError("Unknown mixer type {}".format(config["mixer"])) self.cur_epsilon = 1.0 self.update_target() # initial sync # Setup optimizer self.params = list(self.model.parameters()) self.loss = QMixLoss(self.model, self.target_model, self.mixer, self.target_mixer, self.n_agents, self.n_actions, self.config["double_q"], self.config["gamma"]) self.optimiser = RMSprop( params=self.params, lr=config["lr"], alpha=config["optim_alpha"], eps=config["optim_eps"]) @override(PolicyGraph) def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None, info_batch=None, episodes=None, **kwargs): obs_batch, action_mask = self._unpack_observation(obs_batch) # Compute actions with th.no_grad(): q_values, hiddens = _mac( self.model, th.from_numpy(obs_batch), [th.from_numpy(np.array(s)) for s in state_batches]) avail = th.from_numpy(action_mask).float() masked_q_values = q_values.clone() masked_q_values[avail == 0.0] = -float("inf") # epsilon-greedy action selector random_numbers = th.rand_like(q_values[:, :, 0]) pick_random = (random_numbers < self.cur_epsilon).long() random_actions = Categorical(avail).sample().long() actions = (pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1]) actions = actions.numpy() hiddens = [s.numpy() for s in hiddens] return TupleActions(list(actions.transpose([1, 0]))), hiddens, {} @override(PolicyGraph) def learn_on_batch(self, samples): obs_batch, action_mask = self._unpack_observation(samples["obs"]) group_rewards = self._get_group_rewards(samples["infos"]) # These will be padded to shape [B * T, ...] [rew, action_mask, act, dones, obs], initial_states, seq_lens = \ chop_into_sequences( samples["eps_id"], samples["agent_index"], [ group_rewards, action_mask, samples["actions"], samples["dones"], obs_batch ], [samples["state_in_{}".format(k)] for k in range(len(self.get_initial_state()))], max_seq_len=self.config["model"]["max_seq_len"], dynamic_max=True, _extra_padding=1) # TODO(ekl) adding 1 extra unit of padding here, since otherwise we # lose the terminating reward and the Q-values will be unanchored! 
B, T = len(seq_lens), max(seq_lens) + 1 def to_batches(arr): new_shape = [B, T] + list(arr.shape[1:]) return th.from_numpy(np.reshape(arr, new_shape)) rewards = to_batches(rew)[:, :-1].float() actions = to_batches(act)[:, :-1].long() obs = to_batches(obs).reshape([B, T, self.n_agents, self.obs_size]).float() action_mask = to_batches(action_mask) # TODO(ekl) this treats group termination as individual termination terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand( B, T, self.n_agents)[:, :-1] filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) < np.expand_dims(seq_lens, 1)).astype(np.float32) mask = th.from_numpy(filled).unsqueeze(2).expand(B, T, self.n_agents)[:, :-1] mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) # Compute loss loss_out, mask, masked_td_error, chosen_action_qvals, targets = \ self.loss(rewards, actions, terminated, mask, obs, action_mask) # Optimise self.optimiser.zero_grad() loss_out.backward() grad_norm = th.nn.utils.clip_grad_norm_( self.params, self.config["grad_norm_clipping"]) self.optimiser.step() mask_elems = mask.sum().item() stats = { "loss": loss_out.item(), "grad_norm": grad_norm if isinstance(grad_norm, float) else grad_norm.item(), "td_error_abs": masked_td_error.abs().sum().item() / mask_elems, "q_taken_mean": (chosen_action_qvals * mask).sum().item() / mask_elems, "target_mean": (targets * mask).sum().item() / mask_elems, } return {"stats": stats}, {} @override(PolicyGraph) def get_initial_state(self): return [ s.expand([self.n_agents, -1]).numpy() for s in self.model.state_init() ] @override(PolicyGraph) def get_weights(self): return {"model": self.model.state_dict()} @override(PolicyGraph) def set_weights(self, weights): self.model.load_state_dict(weights["model"]) @override(PolicyGraph) def get_state(self): return { "model": self.model.state_dict(), "target_model": self.target_model.state_dict(), "mixer": self.mixer.state_dict() if self.mixer else None, "target_mixer": self.target_mixer.state_dict() if self.mixer else None, "cur_epsilon": self.cur_epsilon, } @override(PolicyGraph) def set_state(self, state): self.model.load_state_dict(state["model"]) self.target_model.load_state_dict(state["target_model"]) if state["mixer"] is not None: self.mixer.load_state_dict(state["mixer"]) self.target_mixer.load_state_dict(state["target_mixer"]) self.set_epsilon(state["cur_epsilon"]) self.update_target() def update_target(self): self.target_model.load_state_dict(self.model.state_dict()) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) logger.debug("Updated target networks") def set_epsilon(self, epsilon): self.cur_epsilon = epsilon def _get_group_rewards(self, info_batch): group_rewards = np.array([ info.get(GROUP_REWARDS, [0.0] * self.n_agents) for info in info_batch ]) return group_rewards def _unpack_observation(self, obs_batch): """Unpacks the action mask / tuple obs from agent grouping. 
Returns: obs (Tensor): flattened obs tensor of shape [B, n_agents, obs_size] mask (Tensor): action mask, if any """ unpacked = _unpack_obs( np.array(obs_batch), self.observation_space.original_space, tensorlib=np) if self.has_action_mask: obs = np.concatenate( [o["obs"] for o in unpacked], axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size]) action_mask = np.concatenate( [o["action_mask"] for o in unpacked], axis=1).reshape( [len(obs_batch), self.n_agents, self.n_actions]) else: obs = np.concatenate( unpacked, axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size]) action_mask = np.ones( [len(obs_batch), self.n_agents, self.n_actions]) return obs, action_mask
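# Hedged sketch of the masked epsilon-greedy selector used in compute_actions() above,
# written against plain tensors (names assumed): unavailable actions get -inf Q-values,
# and random picks are drawn only from available actions via a Categorical over the mask.
import torch
from torch.distributions import Categorical

def masked_epsilon_greedy(q_values, avail_actions, epsilon):
    """q_values, avail_actions: [B, n_agents, n_actions]; avail_actions is a 0/1 mask
    with at least one available action per agent."""
    masked_q = q_values.clone()
    masked_q[avail_actions == 0] = -float("inf")
    greedy = masked_q.max(dim=2)[1]
    random_actions = Categorical(avail_actions.float()).sample().long()
    pick_random = (torch.rand_like(q_values[:, :, 0]) < epsilon).long()
    return pick_random * random_actions + (1 - pick_random) * greedy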
i = 1 while i < 1001: for x, _ in dl: if x.size(0) < args.batch_size: break x = x.view(-1, 1, 28, 28).cuda() f = model(x) f_norm = F.normalize(f, p=2, dim=1) I = f_norm.mm(f_norm.t()) loss = -torch.mean((I.detach() > u).float() * torch.log(torch.clamp(I, 1e-10, 1)) + (I.detach() < l).float() * torch.log(torch.clamp(1 - I, 1e-10, 1))) opti_model.zero_grad() loss.backward() opti_model.step() if i % 20 == 0: print('[Epoch {}]\t[Iteration {}]\t[Loss={:.4f}]'.format( epoch, i, loss.detach().cpu().numpy())) i += 1 if i == 1001: break model.eval() pre_y = [] tru_y = []
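# Hedged sketch of the pairwise loss computed in the loop above, where u and l are the
# upper/lower similarity thresholds (the values below are placeholders, not the script's
# settings): confidently similar pairs are pulled together and confidently dissimilar
# pairs pushed apart, with the thresholding detached so it only selects pairs and does
# not receive gradient.
import torch
import torch.nn.functional as F

def thresholded_pair_loss(features, u=0.95, l=0.455):
    f = F.normalize(features, p=2, dim=1)
    sim = f.mm(f.t())                 # cosine similarity matrix over the batch
    pos = (sim.detach() > u).float()  # treat as same-cluster pairs
    neg = (sim.detach() < l).float()  # treat as different-cluster pairs
    return -torch.mean(pos * torch.log(torch.clamp(sim, 1e-10, 1))
                       + neg * torch.log(torch.clamp(1 - sim, 1e-10, 1)))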
class DQN_Model_Agent: def __init__(self, args, exp_model, logging_func): self.args = args # Exploration Model self.exp_model = exp_model self.log = logging_func["log"] self.log_image = logging_func["image"] os.makedirs("{}/transition_model".format(args.log_path)) # Experience Replay self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized) # DQN and Target DQN model = get_models(args.model) print("\n\nDQN") self.dqn = model(actions=args.actions) print("Target DQN") self.target_dqn = model(actions=args.actions) dqn_params = 0 for weight in self.dqn.parameters(): weight_params = 1 for s in weight.size(): weight_params *= s dqn_params += weight_params print("Model DQN has {:,} parameters.".format(dqn_params)) self.target_dqn.eval() if args.gpu: print("Moving models to GPU.") self.dqn.cuda() self.target_dqn.cuda() # Optimizer # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr) self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr) self.T = 0 self.target_sync_T = -self.args.t_max # Action sequences self.actions_to_take = [] def sync_target_network(self): for target, source in zip(self.target_dqn.parameters(), self.dqn.parameters()): target.data = source.data def get_pc_estimates(self, root_state, depth=0, starts=None): state = root_state bonuses = [] for action in range(self.args.actions): # Current pc estimates if depth == 0 or not self.args.only_leaf: numpy_state = state[0].numpy().swapaxes(0, 2) _, info = self.exp_model.bonus(numpy_state, action, dont_remember=True) action_pseudo_count = info["Pseudo_Count"] action_bonus = self.args.optimistic_scaler / np.power(action_pseudo_count + 0.01, self.args.bandit_p) if starts is not None: action_bonus += starts[action] # If the depth is 0 we don't want to look any further ahead if depth == 0: bonuses.append(action_bonus) continue one_hot_action = torch.zeros(1, self.args.actions) one_hot_action[0, action] = 1 _, next_state_prediction = self.dqn(Variable(state, volatile=True), Variable(one_hot_action, volatile=True)) next_state_prediction = next_state_prediction.cpu().data next_state_pc_estimates = self.get_pc_estimates(next_state_prediction, depth=depth - 1) if self.args.only_leaf: bonuses += next_state_pc_estimates else: ahead_pc_estimates = [action_bonus + self.args.gamma * n for n in next_state_pc_estimates] bonuses += ahead_pc_estimates return bonuses def act(self, state, epsilon, exp_model, evaluation=False): # self.T += 1 if not evaluation: if len(self.actions_to_take) > 0: action_to_take = self.actions_to_take[0] self.actions_to_take = self.actions_to_take[1:] return action_to_take, {"Action": action_to_take, "Q_Values": self.prev_q_vals} self.dqn.eval() # orig_state = state[:, :, -1:] state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0) q_values = self.dqn(Variable(state, volatile=True)).cpu().data[0] q_values_numpy = q_values.numpy() self.prev_q_vals = q_values_numpy extra_info = {} if self.args.optimistic_init and not evaluation and len(self.actions_to_take) == 0: # 2 action lookahead action_bonuses = self.get_pc_estimates(state, depth=self.args.lookahead_depth, starts=q_values_numpy) # Find the maximum sequence max_so_far = -100000 best_index = 0 best_seq = [] for ii, bonus in enumerate(action_bonuses): if bonus > max_so_far: best_index = ii max_so_far = bonus for depth in range(self.args.lookahead_depth): last_action = best_index % self.args.actions best_index = best_index // self.args.actions best_seq = best_seq + [last_action] # print(best_seq) 
self.actions_to_take = best_seq extra_info["Q_Values"] = q_values_numpy if np.random.random() < epsilon: action = np.random.randint(low=0, high=self.args.actions) else: action = q_values.max(0)[1][0] # Torch... extra_info["Action"] = action return action, extra_info def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1, exploring=False): if not exploring: self.T += 1 self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward, density) def end_of_trajectory(self): self.replay.end_of_trajectory() def train(self): if self.T - self.target_sync_T > self.args.target: self.sync_target_network() self.target_sync_T = self.T info = {} for _ in range(self.args.iters): self.dqn.eval() # TODO: Use a named tuple for experience replay n_step_sample = self.args.n_step batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma) columns = list(zip(*batch)) states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3)) actions = Variable(torch.LongTensor(columns[1])) terminal_states = Variable(torch.FloatTensor(columns[5])) rewards = Variable(torch.FloatTensor(columns[2])) # Have to clip rewards for DQN rewards = torch.clamp(rewards, -1, 1) steps = Variable(torch.FloatTensor(columns[4])) new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3)) target_dqn_qvals = self.target_dqn(new_states).cpu() # Make a new variable with those values so that these are treated as constants target_dqn_qvals_data = Variable(target_dqn_qvals.data) q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states) inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma) # print(steps) q_value_targets = q_value_targets * torch.pow(inter, steps) if self.args.double: # Double Q Learning new_states_qvals = self.dqn(new_states).cpu() new_states_qvals_data = Variable(new_states_qvals.data) q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1]) else: q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0] q_value_targets = q_value_targets + rewards self.dqn.train() one_hot_actions = torch.zeros(self.args.batch_size, self.args.actions) for i in range(self.args.batch_size): one_hot_actions[i][actions[i].data] = 1 if self.args.gpu: actions = actions.cuda() one_hot_actions = one_hot_actions.cuda() q_value_targets = q_value_targets.cuda() new_states = new_states.cuda() model_predictions_q_vals, model_predictions_state = self.dqn(states, Variable(one_hot_actions)) model_predictions = model_predictions_q_vals.gather(1, actions.view(-1, 1)) # info = {} td_error = model_predictions - q_value_targets info["TD_Error"] = td_error.mean().data[0] # Update the priorities if not self.args.density_priority: self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority) # If using prioritised we need to weight the td_error if self.args.prioritized and self.args.prioritized_is: # print(td_error) weights_tensor = torch.from_numpy(is_weights).float() weights_tensor = Variable(weights_tensor) if self.args.gpu: weights_tensor = weights_tensor.cuda() # print(weights_tensor) td_error = td_error * weights_tensor # Model 1 step state transition error # Save them every x steps if self.T % self.args.model_save_image == 0: os.makedirs("{}/transition_model/{}".format(self.args.log_path, self.T)) for ii, image, action, next_state, current_state in 
zip(range(self.args.batch_size), model_predictions_state.cpu().data, actions.data, new_states.cpu().data, states.cpu().data): image = image.numpy()[0] image = np.clip(image, 0, 1) # print(next_state) next_state = next_state.numpy()[0] current_state = current_state.numpy()[0] black_bars = np.zeros_like(next_state[:1, :]) # print(black_bars.shape) joined_image = np.concatenate((current_state, black_bars, image, black_bars, next_state), axis=0) joined_image = np.transpose(joined_image) self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), joined_image * 255) # self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), image * 255) # self.log_image("{}/transition_model/{}/{}_____Correct".format(self.args.log_path, self.T, ii + 1), next_state * 255) # print(model_predictions_state) # Cross Entropy Loss # TODO # Regresssion loss state_error = model_predictions_state - new_states # state_error_val = state_error.mean().data[0] info["State_Error"] = state_error.mean().data[0] self.log("DQN/State_Loss", state_error.mean().data[0], step=self.T) self.log("DQN/State_Loss_Squared", state_error.pow(2).mean().data[0], step=self.T) self.log("DQN/State_Loss_Max", state_error.abs().max().data[0], step=self.T) # self.log("DQN/Action_Matrix_Norm", self.dqn.action_matrix.weight.norm().cpu().data[0], step=self.T) combined_loss = (1 - self.args.model_loss) * td_error.pow(2).mean() + (self.args.model_loss) * state_error.pow(2).mean() l2_loss = combined_loss # l2_loss = (combined_loss).pow(2).mean() info["Loss"] = l2_loss.data[0] # Update self.optimizer.zero_grad() l2_loss.backward() # Taken from pytorch clip_grad_norm # Remove once the pip version it up to date with source gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value) if gradient_norm is not None: info["Norm"] = gradient_norm self.optimizer.step() if "States" in info: states_trained = info["States"] info["States"] = states_trained + columns[0] else: info["States"] = columns[0] # Pad out the states to be of size batch_size if len(info["States"]) < self.args.batch_size: old_states = info["States"] new_states = old_states[0] * (self.args.batch_size - len(old_states)) info["States"] = new_states return info
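# Hedged sketch (assumed names) of how DQN_Model_Agent.train() blends the TD regression
# and the one-step state-prediction regression into a single objective, weighted by
# args.model_loss.
import torch

def combined_model_loss(q_pred, q_target, state_pred, state_next, model_loss_weight):
    td = (q_pred - q_target).pow(2).mean()            # Q-learning regression term
    state = (state_pred - state_next).pow(2).mean()   # transition-model regression term
    return (1 - model_loss_weight) * td + model_loss_weight * state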
class Trainer: def __init__(self, e_model, g_model, dataload, epoch, lr, device, writer): self.e_model = e_model self.g_model = g_model self.dataload = dataload self.epoch = epoch self.lr = lr self.device = device self.writer = writer self.step = 0 self.optimizer = RMSprop(itertools.chain(self.e_model.parameters(), self.g_model.parameters()), lr=self.lr) # def _set_requires_grad(self, net, requires_grad=False): # for param in net.parameters(): # param.requires_grad = requires_grad def _l2_normalize(self, x, axis=-1): y = torch.max(torch.sum(x**2, axis, keepdim=True), axis, keepdim=True)[0] return x / torch.sqrt(y) def _correlation(self, x, y): x = x - torch.mean(x, dim=1, keepdim=True) y = y - torch.mean(y, dim=1, keepdim=True) x = self._l2_normalize(x, 1) y = self._l2_normalize(y, 1) return torch.sum(x * y, 1, keepdim=True) # def _forward(self): # self.x_fake = self.g_model(self.z) # self.z_fake = self.e_model(self.x_fake) # self.x_fake_ng = self.x_fake.detach() # self.z_real = self.e_model(self.x_real) # self.z_fake_ng = self.e_model(self.x_fake_ng) # def backward(self): # self.z_real_mean = torch.mean(self.z_real, 1, keepdim=True) # self.z_fake_ng_mean = torch.mean(self.z_fake_ng, 1, keepdim=True) # self.z_fake_mean = torch.mean(self.z_fake, 1, keepdim=True) # self.t1_loss = self.z_real_mean - self.z_fake_ng_mean # self.t2_loss = self.z_fake_mean - self.z_fake_ng_mean # self.z_corr = self._correlation(self.z, self.z_fake) # self.qp_loss = 0.25 * self.t1_loss[:, 0] ** 2 / \ # torch.mean((self.x_real - self.x_fake_ng)**2, dim=[1, 2, 3]) # self.z_corr = self._correlation(self.z, self.z_fake_ng) # self.loss = torch.mean(self.t1_loss + self.t2_loss - 0.5 * self.z_corr) + \ # torch.mean(self.qp_loss) # self.loss.backward() def _epoch(self): progress = tqdm(total=len(self.dataload.dataset)) for _, x in enumerate(self.dataload): z = torch.randn(x.size()[0], 128).to(self.device) x_real = x.to(self.device) self.optimizer.zero_grad() x_fake = self.g_model(z) x_fake_ng = x_fake.detach() z_fake = self.e_model(x_fake) z_real = self.e_model(x_real) z_fake_ng = self.e_model(x_fake_ng) z_real_mean = torch.mean(z_real, 1, keepdim=True) z_fake_ng_mean = torch.mean(z_fake_ng, 1, keepdim=True) z_fake_mean = torch.mean(z_fake, 1, keepdim=True) t1_loss = z_real_mean - z_fake_ng_mean t2_loss = z_fake_mean - z_fake_ng_mean z_corr = self._correlation(z, z_fake) qp_loss = 0.25 * t1_loss[:, 0] ** 2 / \ torch.mean((x_real - x_fake_ng)**2, dim=[1, 2, 3]) loss = torch.mean(t1_loss + t2_loss - 0.5 * z_corr) + \ torch.mean(qp_loss) loss.backward() self.optimizer.step() self.writer.add_scalar('t1_loss', torch.mean(t1_loss), self.step) self.writer.add_scalar('z_corr', torch.mean(z_corr), self.step) self.step += 1 progress.update(self.dataload.batch_size) progress.set_description(f't1_loss: {torch.mean(t1_loss).item()}, \ z_corr: {torch.mean(z_corr).item()}') def train(self): self._epoch()
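# Hedged sketch of the per-sample correlation used in Trainer._correlation(): both inputs
# are mean-centred along the feature dimension, l2-normalised, and their dot product taken
# row by row (a Pearson-style correlation per sample). Written with F.normalize for
# numerical safety; only the maths mirrors the method above.
import torch
import torch.nn.functional as F

def rowwise_correlation(x, y, eps=1e-12):
    x = x - x.mean(dim=1, keepdim=True)
    y = y - y.mean(dim=1, keepdim=True)
    x = F.normalize(x, p=2, dim=1, eps=eps)
    y = F.normalize(y, p=2, dim=1, eps=eps)
    return (x * y).sum(dim=1, keepdim=True)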
def main(): parser = argparse.ArgumentParser( description='Tuning with Multitask bi-directional RNN-CNN-CRF') parser.add_argument('--config', help='Config file (Python file format)', default="config_multitask.py") parser.add_argument('--grid', help='Grid Search Options', default="{}") args = parser.parse_args() logger = get_logger("Multi-Task") use_gpu = torch.cuda.is_available() # Config Tensorboard Writer log_writer = SummaryWriter() # Load from config file spec = importlib.util.spec_from_file_location("config", args.config) config_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(config_module) config = config_module.entries # Load options from grid search options = eval(args.grid) for k, v in options.items(): if isinstance(v, six.string_types): cmd = "%s = \"%s\"" % (k, v) else: cmd = "%s = %s" % (k, v) log_writer.add_scalar(k, v, 1) exec(cmd) # Load embedding dict embedding = config.embedding.embedding_type embedding_path = config.embedding.embedding_dict embedd_dict, embedd_dim = utils.load_embedding_dict( embedding, embedding_path) # Collect data path data_dir = config.data.data_dir data_names = config.data.data_names train_paths = [ os.path.join(data_dir, data_name, "train.tsv") for data_name in data_names ] dev_paths = [ os.path.join(data_dir, data_name, "devel.tsv") for data_name in data_names ] test_paths = [ os.path.join(data_dir, data_name, "test.tsv") for data_name in data_names ] # Create alphabets logger.info("Creating Alphabets") if not os.path.exists('tmp'): os.mkdir('tmp') word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, ner_alphabet_task, label_reflect = \ bionlp_data.create_alphabets(os.path.join(Path(data_dir).abspath(), "alphabets", "_".join(data_names)), train_paths, data_paths=dev_paths + test_paths, use_cache=True, embedd_dict=embedd_dict, max_vocabulary_size=50000) logger.info("Word Alphabet Size: %d" % word_alphabet.size()) logger.info("Character Alphabet Size: %d" % char_alphabet.size()) logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size()) logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) logger.info( "NER Alphabet Size per Task: %s", str([task_alphabet.size() for task_alphabet in ner_alphabet_task])) #task_reflects = torch.LongTensor(reverse_reflect(label_reflect, ner_alphabet.size())) #if use_gpu: # task_reflects = task_reflects.cuda() if embedding == 'elmo': logger.info("Loading ELMo Embedder") ee = ElmoEmbedder(options_file=config.embedding.elmo_option, weight_file=config.embedding.elmo_weight, cuda_device=config.embedding.elmo_cuda) else: ee = None logger.info("Reading Data") # Prepare dataset data_trains = [ bionlp_data.read_data_to_variable(train_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, elmo_ee=ee) for task_id, train_path in enumerate(train_paths) ] num_data = [sum(data_train[1]) for data_train in data_trains] num_labels = ner_alphabet.size() num_labels_task = [task_item.size() for task_item in ner_alphabet_task] data_devs = [ bionlp_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, volatile=True, elmo_ee=ee) for task_id, dev_path in enumerate(dev_paths) ] data_tests = [ bionlp_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, volatile=True, elmo_ee=ee) for task_id, test_path in 
enumerate(test_paths) ] writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) def construct_word_embedding_table(): scale = np.sqrt(3.0 / embedd_dim) table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) table[bionlp_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov = 0 for word, index in word_alphabet.items(): if not embedd_dict == None and word in embedd_dict: embedding = embedd_dict[word] elif not embedd_dict == None and word.lower() in embedd_dict: embedding = embedd_dict[word.lower()] else: embedding = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov += 1 table[index, :] = embedding print('oov: %d' % oov) return torch.from_numpy(table) word_table = construct_word_embedding_table() logger.info("constructing network...") # Construct network window = 3 num_layers = 1 mode = config.rnn.mode hidden_size = config.rnn.hidden_size char_dim = config.rnn.char_dim num_filters = config.rnn.num_filters tag_space = config.rnn.tag_space bigram = config.rnn.bigram attention_mode = config.rnn.attention if config.rnn.dropout == 'std': network = MultiTaskBiRecurrentCRF( len(data_trains), embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), num_filters, window, mode, hidden_size, num_layers, num_labels, num_labels_task=num_labels_task, tag_space=tag_space, embedd_word=word_table, p_in=config.rnn.p, p_rnn=config.rnn.p, bigram=bigram, elmo=(embedding == 'elmo'), attention_mode=attention_mode, adv_loss_coef=config.multitask.adv_loss_coef, diff_loss_coef=config.multitask.diff_loss_coef, char_level_rnn=config.rnn.char_level_rnn) else: raise NotImplementedError if use_gpu: network.cuda() # Prepare training unk_replace = config.embedding.unk_replace num_epochs = config.training.num_epochs batch_size = config.training.batch_size lr = config.training.learning_rate momentum = config.training.momentum alpha = config.training.alpha lr_decay = config.training.lr_decay schedule = config.training.schedule gamma = config.training.gamma # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) optim = RMSprop(network.parameters(), lr=lr, alpha=alpha, momentum=momentum, weight_decay=gamma) logger.info( "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s" % (mode, num_layers, hidden_size, num_filters, tag_space, 'bigram' if bigram else 'unigram')) logger.info( "training: l2: %f, (#training data: %s, batch: %d, dropout: %.2f, unk replace: %.2f)" % (gamma, num_data, batch_size, config.rnn.p, unk_replace)) num_batches = [x // batch_size + 1 for x in num_data] dev_f1 = [0.0 for x in num_data] dev_acc = [0.0 for x in num_data] dev_precision = [0.0 for x in num_data] dev_recall = [0.0 for x in num_data] test_f1 = [0.0 for x in num_data] test_acc = [0.0 for x in num_data] test_precision = [0.0 for x in num_data] test_recall = [0.0 for x in num_data] best_epoch = [0 for x in num_data] # Training procedure for epoch in range(1, num_epochs + 1): print( 'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (epoch, mode, config.rnn.dropout, lr, lr_decay, schedule)) train_err = 0. train_total = 0. 
# Gradient decent on training data start_time = time.time() num_back = 0 network.train() batch_count = 0 for batch in range(1, 2 * num_batches[0] + 1): r = random.random() task_id = 0 if r <= 0.5 else random.randint(1, len(num_data) - 1) #if batch > num_batches[task_id]: # batch = batch % num_batches[task_id] + 1 batch_count += 1 word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable( data_trains[task_id], batch_size, unk_replace=unk_replace) optim.zero_grad() loss, task_loss, adv_loss, diff_loss = network.loss( task_id, word, char, labels, mask=masks, elmo_word=elmo_embedding) #log_writer.add_scalars( # 'train_loss_task' + str(task_id), # {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss}, # (epoch - 1) * (num_batches[task_id] + 1) + batch #) #log_writer.add_scalars( # 'train_loss_overview', # {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss}, # (epoch - 1) * (sum(num_batches) + 1) + batch_count #) loss.backward() clip_grad_norm(network.parameters(), 5.0) optim.step() num_inst = word.size(0) train_err += loss.data[0] * num_inst train_total += num_inst time_ave = (time.time() - start_time) / batch time_left = (2 * num_batches[0] - batch) * time_ave # update log if batch % 100 == 0: sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % ( batch, 2 * num_batches[0], train_err / train_total, time_left) sys.stdout.write(log_info) sys.stdout.flush() num_back = len(log_info) sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) print('train: %d loss: %.4f, time: %.2fs' % (2 * num_batches[0], train_err / train_total, time.time() - start_time)) # Evaluate performance on dev data network.eval() for task_id in range(len(num_batches)): tmp_filename = 'tmp/%s_dev%d%d' % (str(uid), epoch, task_id) writer.start(tmp_filename) for batch in bionlp_data.iterate_batch_variable( data_devs[task_id], batch_size): word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch preds, _ = network.decode( task_id, word, char, target=labels, mask=masks, leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) writer.close() acc, precision, recall, f1 = evaluate(tmp_filename) log_writer.add_scalars( 'dev_task' + str(task_id), { 'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1 }, epoch) print( 'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) if dev_f1[task_id] < f1: dev_f1[task_id] = f1 dev_acc[task_id] = acc dev_precision[task_id] = precision dev_recall[task_id] = recall best_epoch[task_id] = epoch # Evaluate on test data when better performance detected tmp_filename = 'tmp/%s_test%d%d' % (str(uid), epoch, task_id) writer.start(tmp_filename) for batch in bionlp_data.iterate_batch_variable( data_tests[task_id], batch_size): word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch preds, _ = network.decode( task_id, word, char, target=labels, mask=masks, leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) 
writer.close() test_acc[task_id], test_precision[task_id], test_recall[ task_id], test_f1[task_id] = evaluate(tmp_filename) log_writer.add_scalars( 'test_task' + str(task_id), { 'accuracy': test_acc[task_id], 'precision': test_precision[task_id], 'recall': test_recall[task_id], 'f1': test_f1[task_id] }, epoch) print( "================================================================================" ) print("dataset: %s" % data_names[task_id]) print( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (dev_acc[task_id], dev_precision[task_id], dev_recall[task_id], dev_f1[task_id], best_epoch[task_id])) print( "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (test_acc[task_id], test_precision[task_id], test_recall[task_id], test_f1[task_id], best_epoch[task_id])) print( "================================================================================\n" ) if epoch % schedule == 0: # lr = learning_rate / (1.0 + epoch * lr_decay) # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) lr = lr * lr_decay optim.param_groups[0]['lr'] = lr # writer.export_scalars_to_json("./all_scalars.json") writer.close()
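The epoch loop above decays the learning rate by hand every `schedule` epochs by overwriting `optim.param_groups[0]['lr']`. A minimal sketch of that pattern on a throwaway linear model; the names `lr_decay` and `schedule` mirror the config fields above, everything else is a placeholder:

import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch.optim import RMSprop

model = Linear(10, 1)
lr, lr_decay, schedule, num_epochs = 1e-3, 0.95, 3, 10
optim = RMSprop(model.parameters(), lr=lr)

for epoch in range(1, num_epochs + 1):
    x, y = torch.randn(32, 10), torch.randn(32, 1)
    optim.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    optim.step()
    if epoch % schedule == 0:
        lr = lr * lr_decay                    # multiplicative decay, as in the loop above
        for group in optim.param_groups:      # cover every param group, not just group 0
            group['lr'] = lr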
class COMALearner: def __init__(self, mac, scheme, logger, args): self.args = args self.n_agents = args.n_agents self.n_actions = args.n_actions self.mac = mac self.logger = logger self.Mode = str(self.args.running_mode) self.last_target_update_step = 0 self.critic_training_steps = 0 self.log_stats_t = -self.args.learner_log_interval - 1 self.critic = COMACritic(scheme, args) self.target_critic = copy.deepcopy(self.critic) self.agent_params = list(mac.parameters()) #print("self.agent_params=",self.agent_params) self.critic_params = list(self.critic.parameters()) self.params = self.agent_params + self.critic_params self.agent_optimiser = RMSprop(params=self.agent_params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) self.critic_optimiser = RMSprop(params=self.critic_params, lr=args.critic_lr, alpha=args.optim_alpha, eps=args.optim_eps) def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities bs = batch.batch_size #print("episode batch=",EpisodeBatch) #print("batch=",batch,"--------------------------------------------------------------------------") #print("batch[intrinsic_reward]=",batch["intrinsic_reward"],"--------------------------------------------------------------------------") #print("batch[reward]=",batch["reward"],"--------------------------------------------------------------------------") #print("shape of batch[reward]=",batch["actions"].shape,"--------------------------------------------------------------------------") max_t = batch.max_seq_length rewards = batch["reward"][:, :-1] #print("rewards =",rewards.shape) #print("len rewards =",len(rewards)) actions = batch["actions"][:, :] #print("actions =",actions.shape) terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) #print("mask =",mask.shape) #print("len mask =",len(mask)) avail_actions = batch["avail_actions"][:, :-1] critic_mask = mask.clone() mask = mask.repeat(1, 1, self.n_agents).view(-1) #print("mask2 =",mask.shape) q_vals, critic_train_stats = self._train_critic( batch, rewards, terminated, actions, avail_actions, critic_mask, bs, max_t) #print("q_vals =",q_vals.shape) actions = actions[:, :-1] #print("actions2 =",actions.shape) mac_out = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length - 1): agent_outs = self.mac.forward(batch, t=t) #print("t=",t,"agent_outs=",agent_outs) mac_out.append(agent_outs) mac_out = th.stack(mac_out, dim=1) # Concat over time #print("mac_out=",mac_out.shape) #print("mac_out shape =",mac_out.size()) # Mask out unavailable actions, renormalise (as in action selection) mac_out[avail_actions == 0] = 0 mac_out = mac_out / mac_out.sum(dim=-1, keepdim=True) mac_out[avail_actions == 0] = 0 #print("mac_out2=",mac_out.shape) #print("mac_out shape2 =",mac_out.size()) # Calculated baseline q_vals = q_vals.reshape(-1, self.n_actions) pi = mac_out.view(-1, self.n_actions) baseline = (pi * q_vals).sum(-1).detach() #print("baseline=",baseline.shape) # Calculate policy grad with mask q_taken = th.gather(q_vals, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken = th.gather(pi, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken[mask == 0] = 1.0 log_pi_taken = th.log(pi_taken) advantages = th.FloatTensor([0.0]) #torch.clamp(a, min=-0.5, max=0.5) advantages = (q_taken - baseline).detach() #print("advantages",advantages) ##################################################### individual Intrinsic Reward advantages = advantages.reshape(-1) 
if self.Mode == "2": int_adv = batch["intrinsic_reward"][:, :-1, :].reshape(-1) #print("int_adv",int_adv) clip_ratio = 2 for t in range(len(advantages)): #print("adv shape =",advantages[t]) #print("int_adv shape =",int_adv[t]) int_adv_clipped = th.clamp(int_adv[t], min=clip_ratio * -advantages[t], max=clip_ratio * advantages[t]) advantages[t] = advantages[t] + int_adv_clipped #print("advantages after",advantages) ##################################################### Combined Intrinsic Reward #print("batchzzzz = ",batch["intrinsic_reward"][:, :-1, 3]) elif self.Mode == "5": #print("batch all =", th.cat((batch["intrinsic_reward"][:, :-1, :],batch["intrinsic_reward"][:, :-1, :],batch["intrinsic_reward"][:, :-1, :]),0).reshape(-1).shape) #print("batch soze =", batch["intrinsic_reward"][:, :-1, :].shape) #print("advantages =", advantages.shape) #temp = [] int_adv = batch["intrinsic_reward"][:, :-1, :] for p in range(self.n_agents - 1): int_adv = th.cat( (int_adv, batch["intrinsic_reward"][:, :-1, :]), 0) int_adv = int_adv.view(-1) #int_adv = th.cat((batch["intrinsic_reward"][:, :-1, :],batch["intrinsic_reward"][:, :-1, :],batch["intrinsic_reward"][:, :-1, :]),1).reshape(-1) clip_ratio = 2 for t in range(len(advantages)): #print("adv shape =",len(advantages)) #print("int_adv shape =",len(int_adv)) int_adv_clipped = th.clamp(int_adv[t], min=clip_ratio * -advantages[t], max=clip_ratio * advantages[t]) advantages[t] = advantages[t] + int_adv_clipped else: pass #print("advantages after",advantages) ################################################################################### #print("int_adv",int_adv.shape) #print("batch[intrinsic_reward]",batch["intrinsic_reward"].shape) #print("batch[reward]",batch["reward"].shape) print("log_pi_taken", log_pi_taken.shape) print("advantages", advantages.shape) coma_loss = -((advantages * log_pi_taken) * mask).sum() / mask.sum() #print("self.agent_optimiser=",self.agent_optimiser) # Optimise agents #print(self.critic.parameters()) #print(self.agent_optimiser.parameters()) self.agent_optimiser.zero_grad() coma_loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.agent_params, self.args.grad_norm_clip) self.agent_optimiser.step() if (self.critic_training_steps - self.last_target_update_step ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_step = self.critic_training_steps if t_env - self.log_stats_t >= self.args.learner_log_interval: ts_logged = len(critic_train_stats["critic_loss"]) for key in [ "critic_loss", "critic_grad_norm", "td_error_abs", "q_taken_mean", "target_mean" ]: self.logger.log_stat(key, sum(critic_train_stats[key]) / ts_logged, t_env) self.logger.log_stat("advantage_mean", (advantages * mask).sum().item() / mask.sum().item(), t_env) self.logger.log_stat("coma_loss", coma_loss.item(), t_env) self.logger.log_stat("agent_grad_norm", grad_norm, t_env) self.logger.log_stat("pi_max", (pi.max(dim=1)[0] * mask).sum().item() / mask.sum().item(), t_env) self.log_stats_t = t_env def _train_critic(self, batch, rewards, terminated, actions, avail_actions, mask, bs, max_t): # Optimise critic #print("batch obs =",batch["obs"][0][0]) target_q_vals = self.target_critic(batch)[:, :] #print("target_q_vals=",target_q_vals) #print("shape target_q_vals=",target_q_vals.shape) #print("batch obs =",batch["obs"]) #print("size batch obs =",batch["obs"].size()) #print("rewards", rewards) #print("size of rewards", rewards.shape) targets_taken = th.gather(target_q_vals, dim=3, index=actions).squeeze(3) # Calculate td-lambda 
targets targets = build_td_lambda_targets(rewards, terminated, mask, targets_taken, self.n_agents, self.args.gamma, self.args.td_lambda) #print("targets=",targets) q_vals = th.zeros_like(target_q_vals)[:, :-1] running_log = { "critic_loss": [], "critic_grad_norm": [], "td_error_abs": [], "target_mean": [], "q_taken_mean": [], } for t in reversed(range(rewards.size(1))): #print("mask_t before=",mask[:, t]) mask_t = mask[:, t].expand(-1, self.n_agents) #print("mask_t after=",mask_t) if mask_t.sum() == 0: continue q_t = self.critic(batch, t) # may be implement in here #print("batch check what inside =",batch) #print("q_t=",q_t) q_vals[:, t] = q_t.view(bs, self.n_agents, self.n_actions) #print("q_vals=",q_vals) #print("q_vals shpae=",q_vals.shape) q_taken = th.gather(q_t, dim=3, index=actions[:, t:t + 1]).squeeze(3).squeeze(1) #print("q_taken=",q_taken) targets_t = targets[:, t] #print("targets_t=",targets_t) td_error = (q_taken - targets_t.detach()) # 0-out the targets that came from padded data masked_td_error = td_error * mask_t # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask_t.sum() self.critic_optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.critic_params, self.args.grad_norm_clip) self.critic_optimiser.step() self.critic_training_steps += 1 running_log["critic_loss"].append(loss.item()) running_log["critic_grad_norm"].append(grad_norm) mask_elems = mask_t.sum().item() running_log["td_error_abs"].append( (masked_td_error.abs().sum().item() / mask_elems)) running_log["q_taken_mean"].append( (q_taken * mask_t).sum().item() / mask_elems) running_log["target_mean"].append( (targets_t * mask_t).sum().item() / mask_elems) return q_vals, running_log def _update_targets(self): self.target_critic.load_state_dict(self.critic.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.critic.cuda() self.target_critic.cuda() def save_models(self, path): self.mac.save_models(path) th.save(self.critic.state_dict(), "{}/critic.th".format(path)) th.save(self.agent_optimiser.state_dict(), "{}/agent_opt.th".format(path)) th.save(self.critic_optimiser.state_dict(), "{}/critic_opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) self.critic.load_state_dict( th.load("{}/critic.th".format(path), map_location=lambda storage, loc: storage)) # Not quite right but I don't want to save target networks self.target_critic.load_state_dict(self.critic.state_dict()) self.agent_optimiser.load_state_dict( th.load("{}/agent_opt.th".format(path), map_location=lambda storage, loc: storage)) self.critic_optimiser.load_state_dict( th.load("{}/critic_opt.th".format(path), map_location=lambda storage, loc: storage))
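For reference, the advantage computation in `COMALearner.train` above is the counterfactual baseline b(s) = sum_a pi(a) * Q(s, a) subtracted from the Q-value of the taken action. A self-contained sketch with random tensors standing in for the critic outputs and the policy (shapes are illustrative only):

import torch

batch, timesteps, n_agents, n_actions = 2, 5, 3, 4
q_vals = torch.randn(batch * timesteps * n_agents, n_actions)    # per-agent action values
pi = torch.softmax(torch.randn(batch * timesteps * n_agents, n_actions), dim=-1)
actions = torch.randint(n_actions, (batch * timesteps * n_agents, 1))
mask = torch.ones(batch * timesteps * n_agents)                  # 1 where data is real, 0 on padding

baseline = (pi * q_vals).sum(-1).detach()                        # counterfactual baseline
q_taken = q_vals.gather(1, actions).squeeze(1)
pi_taken = pi.gather(1, actions).squeeze(1)
pi_taken[mask == 0] = 1.0                                        # avoid log(0) on padded steps
log_pi_taken = torch.log(pi_taken)
advantages = (q_taken - baseline).detach()

coma_loss = -((advantages * log_pi_taken) * mask).sum() / mask.sum()
print(coma_loss)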
class QMixPolicyGraph(PolicyGraph): """QMix impl. Assumes homogeneous agents for now. You must use MultiAgentEnv.with_agent_groups() to group agents together for QMix. This creates the proper Tuple obs/action spaces and populates the '_group_rewards' info field. Action masking: to specify an action mask for individual agents, use a dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}. The mask space must be `Box(0, 1, (n_actions,))`. """ def __init__(self, obs_space, action_space, config): _validate(obs_space, action_space) config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config) self.config = config self.observation_space = obs_space self.action_space = action_space self.n_agents = len(obs_space.original_space.spaces) self.n_actions = action_space.spaces[0].n self.h_size = config["model"]["lstm_cell_size"] agent_obs_space = obs_space.original_space.spaces[0] if isinstance(agent_obs_space, Dict): space_keys = set(agent_obs_space.spaces.keys()) if space_keys != {"obs", "action_mask"}: raise ValueError( "Dict obs space for agent must have keyset " "['obs', 'action_mask'], got {}".format(space_keys)) mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape) if mask_shape != (self.n_actions, ): raise ValueError("Action mask shape must be {}, got {}".format( (self.n_actions, ), mask_shape)) self.has_action_mask = True self.obs_size = _get_size(agent_obs_space.spaces["obs"]) # The real agent obs space is nested inside the dict agent_obs_space = agent_obs_space.spaces["obs"] else: self.has_action_mask = False self.obs_size = _get_size(agent_obs_space) self.model = ModelCatalog.get_torch_model( agent_obs_space, self.n_actions, config["model"], default_model_cls=RNNModel) self.target_model = ModelCatalog.get_torch_model( agent_obs_space, self.n_actions, config["model"], default_model_cls=RNNModel) # Setup the mixer network. # The global state is just the stacked agent observations for now. 
self.state_shape = [self.obs_size, self.n_agents] if config["mixer"] is None: self.mixer = None self.target_mixer = None elif config["mixer"] == "qmix": self.mixer = QMixer(self.n_agents, self.state_shape, config["mixing_embed_dim"]) self.target_mixer = QMixer(self.n_agents, self.state_shape, config["mixing_embed_dim"]) elif config["mixer"] == "vdn": self.mixer = VDNMixer() self.target_mixer = VDNMixer() else: raise ValueError("Unknown mixer type {}".format(config["mixer"])) self.cur_epsilon = 1.0 self.update_target() # initial sync # Setup optimizer self.params = list(self.model.parameters()) self.loss = QMixLoss(self.model, self.target_model, self.mixer, self.target_mixer, self.n_agents, self.n_actions, self.config["double_q"], self.config["gamma"]) self.optimiser = RMSprop( params=self.params, lr=config["lr"], alpha=config["optim_alpha"], eps=config["optim_eps"]) @override(PolicyGraph) def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None, info_batch=None, episodes=None, **kwargs): obs_batch, action_mask = self._unpack_observation(obs_batch) # Compute actions with th.no_grad(): q_values, hiddens = _mac( self.model, th.from_numpy(obs_batch), [th.from_numpy(np.array(s)) for s in state_batches]) avail = th.from_numpy(action_mask).float() masked_q_values = q_values.clone() masked_q_values[avail == 0.0] = -float("inf") # epsilon-greedy action selector random_numbers = th.rand_like(q_values[:, :, 0]) pick_random = (random_numbers < self.cur_epsilon).long() random_actions = Categorical(avail).sample().long() actions = (pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1]) actions = actions.numpy() hiddens = [s.numpy() for s in hiddens] return TupleActions(list(actions.transpose([1, 0]))), hiddens, {} @override(PolicyGraph) def learn_on_batch(self, samples): obs_batch, action_mask = self._unpack_observation( samples[SampleBatch.CUR_OBS]) group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS]) # These will be padded to shape [B * T, ...] [rew, action_mask, act, dones, obs], initial_states, seq_lens = \ chop_into_sequences( samples[SampleBatch.EPS_ID], samples[SampleBatch.AGENT_INDEX], [ group_rewards, action_mask, samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES], obs_batch ], [samples["state_in_{}".format(k)] for k in range(len(self.get_initial_state()))], max_seq_len=self.config["model"]["max_seq_len"], dynamic_max=True, _extra_padding=1) # TODO(ekl) adding 1 extra unit of padding here, since otherwise we # lose the terminating reward and the Q-values will be unanchored! 
B, T = len(seq_lens), max(seq_lens) + 1 def to_batches(arr): new_shape = [B, T] + list(arr.shape[1:]) return th.from_numpy(np.reshape(arr, new_shape)) rewards = to_batches(rew)[:, :-1].float() actions = to_batches(act)[:, :-1].long() obs = to_batches(obs).reshape([B, T, self.n_agents, self.obs_size]).float() action_mask = to_batches(action_mask) # TODO(ekl) this treats group termination as individual termination terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand( B, T, self.n_agents)[:, :-1] filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) < np.expand_dims(seq_lens, 1)).astype(np.float32) mask = th.from_numpy(filled).unsqueeze(2).expand(B, T, self.n_agents)[:, :-1] mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) # Compute loss loss_out, mask, masked_td_error, chosen_action_qvals, targets = \ self.loss(rewards, actions, terminated, mask, obs, action_mask) # Optimise self.optimiser.zero_grad() loss_out.backward() grad_norm = th.nn.utils.clip_grad_norm_( self.params, self.config["grad_norm_clipping"]) self.optimiser.step() mask_elems = mask.sum().item() stats = { "loss": loss_out.item(), "grad_norm": grad_norm if isinstance(grad_norm, float) else grad_norm.item(), "td_error_abs": masked_td_error.abs().sum().item() / mask_elems, "q_taken_mean": (chosen_action_qvals * mask).sum().item() / mask_elems, "target_mean": (targets * mask).sum().item() / mask_elems, } return {LEARNER_STATS_KEY: stats}, {} @override(PolicyGraph) def get_initial_state(self): return [ s.expand([self.n_agents, -1]).numpy() for s in self.model.state_init() ] @override(PolicyGraph) def get_weights(self): return {"model": self.model.state_dict()} @override(PolicyGraph) def set_weights(self, weights): self.model.load_state_dict(weights["model"]) @override(PolicyGraph) def get_state(self): return { "model": self.model.state_dict(), "target_model": self.target_model.state_dict(), "mixer": self.mixer.state_dict() if self.mixer else None, "target_mixer": self.target_mixer.state_dict() if self.mixer else None, "cur_epsilon": self.cur_epsilon, } @override(PolicyGraph) def set_state(self, state): self.model.load_state_dict(state["model"]) self.target_model.load_state_dict(state["target_model"]) if state["mixer"] is not None: self.mixer.load_state_dict(state["mixer"]) self.target_mixer.load_state_dict(state["target_mixer"]) self.set_epsilon(state["cur_epsilon"]) self.update_target() def update_target(self): self.target_model.load_state_dict(self.model.state_dict()) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) logger.debug("Updated target networks") def set_epsilon(self, epsilon): self.cur_epsilon = epsilon def _get_group_rewards(self, info_batch): group_rewards = np.array([ info.get(GROUP_REWARDS, [0.0] * self.n_agents) for info in info_batch ]) return group_rewards def _unpack_observation(self, obs_batch): """Unpacks the action mask / tuple obs from agent grouping. 
Returns: obs (Tensor): flattened obs tensor of shape [B, n_agents, obs_size] mask (Tensor): action mask, if any """ unpacked = _unpack_obs( np.array(obs_batch), self.observation_space.original_space, tensorlib=np) if self.has_action_mask: obs = np.concatenate( [o["obs"] for o in unpacked], axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size]) action_mask = np.concatenate( [o["action_mask"] for o in unpacked], axis=1).reshape( [len(obs_batch), self.n_agents, self.n_actions]) else: obs = np.concatenate( unpacked, axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size]) action_mask = np.ones( [len(obs_batch), self.n_agents, self.n_actions]) return obs, action_mask
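The `compute_actions` method above implements masked epsilon-greedy selection: unavailable actions receive -inf Q-values, and the random branch samples only from available actions via a Categorical over the mask. A small sketch with dummy tensors of the same layout:

import torch
from torch.distributions import Categorical

batch, n_agents, n_actions, epsilon = 4, 3, 5, 0.1
q_values = torch.randn(batch, n_agents, n_actions)
avail = (torch.rand(batch, n_agents, n_actions) > 0.3).float()
avail[..., 0] = 1.0                                  # ensure at least one legal action per agent

masked_q = q_values.clone()
masked_q[avail == 0.0] = -float("inf")               # never pick an unavailable action greedily

pick_random = (torch.rand(batch, n_agents) < epsilon).long()
random_actions = Categorical(avail).sample().long()  # uniform over available actions
greedy_actions = masked_q.max(dim=2)[1]
actions = pick_random * random_actions + (1 - pick_random) * greedy_actions
print(actions.shape)                                 # torch.Size([4, 3])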
# Fragment: the matching `if` branch and the enclosing loop (implied by `break` and the
# per-iteration `alpha /= 2.0`) are not shown here.
else:
    step_image = F.interpolate(pyramid.reconstruct(content_pyramid),
                               (height, width), mode='bilinear',
                               align_corners=False)
lr = 1e-3
content_pyramid = pyramid(step_image)
content_pyramid = [layer.data.requires_grad_() for layer in content_pyramid]
optim = RMSprop(content_pyramid, lr=lr)
try:
    for i in range(200):
        result_image = pyramid.reconstruct(content_pyramid)
        optim.zero_grad()
        out_features = checkpoint(vgg_encoder, result_image)
        loss = criteria(out_features, content_features, style_features,
                        indices, alpha)
        loss.backward()
        optim.step()
        indices = indices_generator(con_image.shape)
except RuntimeError as e:
    print(f'Error: {e}')
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    break
alpha /= 2.0
result = pyramid.reconstruct(content_pyramid)
result.data.clamp_(0, 1)
save_tensor_to_image(result, args.output, args.max_resolution)
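The fragment above optimises pyramid coefficients directly with RMSprop; `pyramid`, `vgg_encoder`, `criteria`, `indices_generator` and the remaining names are defined elsewhere in that script. As a self-contained stand-in, here is the same optimise-the-input idea applied to raw pixels, with a plain MSE loss in place of the perceptual criterion:

import torch
import torch.nn.functional as F
from torch.optim import RMSprop

target = torch.rand(1, 3, 64, 64)                     # stand-in for the content/style target
image = torch.rand(1, 3, 64, 64, requires_grad=True)  # the tensor being optimised

optim = RMSprop([image], lr=1e-3)
for step in range(200):
    optim.zero_grad()
    loss = F.mse_loss(image, target)                  # placeholder for criteria(...)
    loss.backward()
    optim.step()

with torch.no_grad():
    image.clamp_(0, 1)                                # same post-processing as result.data.clamp_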
class LIIRLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.n_agents = args.n_agents self.n_actions = args.n_actions self.mac = mac self.logger = logger self.last_target_update_step = 0 self.critic_training_steps = 0 self.log_stats_t = -self.args.learner_log_interval - 1 self.critic = LIIRCritic(scheme, args) self.target_critic = copy.deepcopy(self.critic) self.policy_new = copy.deepcopy(self.mac) self.policy_old = copy.deepcopy(self.mac) if self.args.use_cuda: # following two lines should be used when use GPU self.policy_old.agent = self.policy_old.agent.to("cuda") self.policy_new.agent = self.policy_new.agent.to("cuda") else: # following lines should be used when use CPU, self.policy_old.agent = self.policy_old.agent.to("cpu") self.policy_new.agent = self.policy_new.agent.to("cpu") self.agent_params = list(mac.parameters()) self.critic_params = list(self.critic.fc1.parameters()) + list( self.critic.fc2.parameters()) + list( self.critic.fc3_v_mix.parameters()) self.intrinsic_params = list(self.critic.fc3_r_in.parameters()) + list( self.critic.fc4.parameters()) # to do self.params = self.agent_params + self.critic_params + self.intrinsic_params self.agent_optimiser = RMSprop(params=self.agent_params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) self.critic_optimiser = RMSprop(params=self.critic_params, lr=args.critic_lr, alpha=args.optim_alpha, eps=args.optim_eps) self.intrinsic_optimiser = RMSprop( params=self.intrinsic_params, lr=args.critic_lr, alpha=args.optim_alpha, eps=args.optim_eps) # should distinguish them self.update = 0 self.count = 0 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities bs = batch.batch_size max_t = batch.max_seq_length rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"][:, :-1] critic_mask = mask.clone() mask_long = mask.repeat(1, 1, self.n_agents).view(-1, 1) mask = mask.view(-1, 1) avail_actions1 = avail_actions.reshape(-1, self.n_agents, self.n_actions) # [maskxx,:] mask_alive = 1.0 - avail_actions1[:, :, 0] mask_alive = mask_alive.float() q_vals, critic_train_stats, target_mix, target_ex, v_ex, r_in = self._train_critic( batch, rewards, terminated, actions, avail_actions, critic_mask, bs, max_t) actions = actions[:, :-1] mac_out = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length - 1): agent_outs = self.mac.forward(batch, t=t) mac_out.append(agent_outs) mac_out = th.stack(mac_out, dim=1) # Concat over time # Mask out unavailable actions, renormalise (as in action selection) mac_out[avail_actions == 0] = 0 mac_out = mac_out / mac_out.sum(dim=-1, keepdim=True) mac_out[avail_actions == 0] = 0 # Calculated baseline q_vals = q_vals.reshape(-1, 1) pi = mac_out.view(-1, self.n_actions) # Calculate policy grad with mask pi_taken = th.gather(pi, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken[mask_long.squeeze(-1) == 0] = 1.0 log_pi_taken = th.log(pi_taken) advantages = (target_mix.reshape(-1, 1) - q_vals).detach() advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) log_pi_taken = log_pi_taken.reshape(-1, self.n_agents) log_pi_taken = log_pi_taken * mask_alive log_pi_taken = log_pi_taken.reshape(-1, 1) liir_loss = -( (advantages * log_pi_taken) * mask_long).sum() / mask_long.sum() # Optimise agents self.agent_optimiser.zero_grad() 
liir_loss.backward() grad_norm_policy = th.nn.utils.clip_grad_norm_( self.agent_params, self.args.grad_norm_clip) self.agent_optimiser.step() # _________Intrinsic loss optimizer -------------------- # ____value loss v_ex_loss = (((v_ex - target_ex.detach())**2).view(-1, 1) * mask).sum() / mask.sum() # _____pg1____ mac_out_old = [] self.policy_old.init_hidden(batch.batch_size) for t in range(batch.max_seq_length - 1): agent_outs_tmp = self.policy_old.forward(batch, t=t, test_mode=True) mac_out_old.append(agent_outs_tmp) mac_out_old = th.stack(mac_out_old, dim=1) # Concat over time # Mask out unavailable actions, renormalise (as in action selection) mac_out_old[avail_actions == 0] = 0 mac_out_old = mac_out_old / mac_out.sum(dim=-1, keepdim=True) mac_out_old[avail_actions == 0] = 0 pi_old = mac_out_old.view(-1, self.n_actions) # Calculate policy grad with mask pi_taken_old = th.gather(pi_old, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken_old[mask_long.squeeze(-1) == 0] = 1.0 log_pi_taken_old = th.log(pi_taken_old) log_pi_taken_old = log_pi_taken_old.reshape(-1, self.n_agents) log_pi_taken_old = log_pi_taken_old * mask_alive # ______pg2___new pi theta self._update_policy() # update policy_new to new params mac_out_new = [] self.policy_new.init_hidden(batch.batch_size) for t in range(batch.max_seq_length - 1): agent_outs_tmp = self.policy_new.forward(batch, t=t, test_mode=True) mac_out_new.append(agent_outs_tmp) mac_out_new = th.stack(mac_out_new, dim=1) # Concat over time # Mask out unavailable actions, renormalise (as in action selection) mac_out_new[avail_actions == 0] = 0 mac_out_new = mac_out_new / mac_out.sum(dim=-1, keepdim=True) mac_out_new[avail_actions == 0] = 0 pi_new = mac_out_new.view(-1, self.n_actions) # Calculate policy grad with mask pi_taken_new = th.gather(pi_new, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken_new[mask_long.squeeze(-1) == 0] = 1.0 log_pi_taken_new = th.log(pi_taken_new) log_pi_taken_new = log_pi_taken_new.reshape(-1, self.n_agents) log_pi_taken_new = log_pi_taken_new * mask_alive neglogpac_new = -log_pi_taken_new.sum(-1) pi2 = log_pi_taken.reshape(-1, self.n_agents).sum(-1).clone() ratio_new = th.exp(-pi2 - neglogpac_new) adv_ex = (target_ex - v_ex.detach()).detach() adv_ex = (adv_ex - adv_ex.mean()) / (adv_ex.std() + 1e-8) # _______ gadient for pg 1 and 2--- mask_tnagt = critic_mask.repeat(1, 1, self.n_agents) pg_loss1 = (log_pi_taken_old.view(-1, 1) * mask_long).sum() / mask_long.sum() pg_loss2 = ((adv_ex.view(-1) * ratio_new) * mask.squeeze(-1)).sum() / mask.sum() self.policy_old.agent.zero_grad() pg_loss1_grad = th.autograd.grad(pg_loss1, self.policy_old.parameters()) self.policy_new.agent.zero_grad() pg_loss2_grad = th.autograd.grad(pg_loss2, self.policy_new.parameters()) grad_total = 0 for grad1, grad2 in zip(pg_loss1_grad, pg_loss2_grad): grad_total += (grad1 * grad2).sum() target_mix = target_mix.reshape(-1, max_t - 1, self.n_agents) pg_ex_loss = ((grad_total.detach() * target_mix) * mask_tnagt).sum() / mask_tnagt.sum() intrinsic_loss = pg_ex_loss + vf_coef * v_ex_loss self.intrinsic_optimiser.zero_grad() intrinsic_loss.backward() self.intrinsic_optimiser.step() self._update_policy_piold() # ______config tensorboard if (self.critic_training_steps - self.last_target_update_step ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_step = self.critic_training_steps if t_env - self.log_stats_t >= self.args.learner_log_interval: ts_logged = len(critic_train_stats["critic_loss"]) for key in [ 
"critic_loss", "critic_grad_norm", "td_error_abs", "value_mean", "target_mean" ]: self.logger.log_stat(key, sum(critic_train_stats[key]) / ts_logged, t_env) self.logger.log_stat("advantage_mean", (advantages * mask_long).sum().item() / mask_long.sum().item(), t_env) self.logger.log_stat("liir_loss", liir_loss.item(), t_env) self.logger.log_stat("agent_grad_norm", grad_norm_policy, t_env) self.logger.log_stat( "pi_max", (pi.max(dim=1)[0] * mask_long.squeeze(-1)).sum().item() / mask_long.sum().item(), t_env) reward1 = rewards.reshape(-1, 1) self.logger.log_stat('rewards_mean', (reward1 * mask).sum().item() / mask.sum().item(), t_env) self.log_stats_t = t_env def _train_critic(self, batch, rewards, terminated, actions, avail_actions, mask, bs, max_t): # Optimise critic r_in, target_vals, target_val_ex = self.target_critic(batch) r_in, _, target_val_ex_opt = self.critic(batch) r_in_taken = th.gather(r_in, dim=3, index=actions) r_in = r_in_taken.squeeze(-1) target_vals = target_vals.squeeze(-1) targets_mix, targets_ex = build_td_lambda_targets_v2( rewards, terminated, mask, target_vals, self.n_agents, self.args.gamma, self.args.td_lambda, r_in, target_val_ex) vals_mix = th.zeros_like(target_vals)[:, :-1] vals_ex = target_val_ex_opt[:, :-1] running_log = { "critic_loss": [], "critic_grad_norm": [], "td_error_abs": [], "target_mean": [], "value_mean": [], } for t in reversed(range(rewards.size(1))): mask_t = mask[:, t].expand(-1, self.n_agents) if mask_t.sum() == 0: continue _, q_t, _ = self.critic(batch, t) # 8,1,3,1, vals_mix[:, t] = q_t.view(bs, self.n_agents) targets_t = targets_mix[:, t] td_error = (q_t.view(bs, self.n_agents) - targets_t.detach()) # 0-out the targets that came from padded data masked_td_error = td_error * mask_t # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask_t.sum() self.critic_optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.critic_params, self.args.grad_norm_clip) self.critic_optimiser.step() self.critic_training_steps += 1 running_log["critic_loss"].append(loss.item()) running_log["critic_grad_norm"].append(grad_norm) mask_elems = mask_t.sum().item() running_log["td_error_abs"].append( (masked_td_error.abs().sum().item() / mask_elems)) running_log["value_mean"].append( (q_t.view(bs, self.n_agents) * mask_t).sum().item() / mask_elems) running_log["target_mean"].append( (targets_t * mask_t).sum().item() / mask_elems) return vals_mix, running_log, targets_mix, targets_ex, vals_ex, r_in def _update_targets(self): self.target_critic.load_state_dict(self.critic.state_dict()) self.logger.console_logger.info("Updated target network") def _update_policy(self): self.policy_new.load_state(self.mac) def _update_policy_piold(self): self.policy_old.load_state(self.mac) def cuda(self): self.mac.cuda() self.critic.cuda() self.target_critic.cuda() def save_models(self, path): self.mac.save_models(path) th.save(self.critic.state_dict(), "{}/critic.th".format(path)) th.save(self.agent_optimiser.state_dict(), "{}/agent_opt.th".format(path)) th.save(self.critic_optimiser.state_dict(), "{}/critic_opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) self.critic.load_state_dict( th.load("{}/critic.th".format(path), map_location=lambda storage, loc: storage)) self.target_critic.load_state_dict(self.critic.state_dict()) self.agent_optimiser.load_state_dict( th.load("{}/agent_opt.th".format(path), map_location=lambda storage, loc: storage)) self.critic_optimiser.load_state_dict( 
th.load("{}/critic_opt.th".format(path), map_location=lambda storage, loc: storage))
class QLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.params = list(mac.parameters()) self.last_target_update_episode = 0 self.mixer = None if args.mixer is not None: if args.mixer == "vdn": self.mixer = VDNMixer() elif args.mixer == "qmix": self.mixer = QMixer(args) else: raise ValueError("Mixer {} not recognised.".format(args.mixer)) self.params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values mac_out = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): agent_outs = self.mac.forward(batch, t=t) mac_out.append(agent_outs) mac_out = th.stack(mac_out, dim=1) # Concat over time # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim # Calculate the Q-Values necessary for the target target_mac_out = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): target_agent_outs = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out[1:], dim=1) # Concat across time # Mask out unavailable actions target_mac_out[avail_actions[:, 1:] == 0] = -9999999 # From OG deepmarl # Max over target Q-Values if self.args.double_q: # Get actions that maximise live Q (for double q-learning) mac_out_detach = mac_out.clone().detach() mac_out_detach[avail_actions == 0] = -9999999 cur_max_actions = mac_out_detach[:, 1:].max(dim=3, keepdim=True)[1] target_max_qvals = th.gather(target_mac_out, 3, cur_max_actions).squeeze(3) else: target_max_qvals = target_mac_out.max(dim=3)[0] # Mix if self.mixer is not None: chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) target_max_qvals = self.target_mixer(target_max_qvals, batch["state"][:, 1:]) N = getattr(self.args, "n_step", 1) if N == 1: # Calculate 1-step Q-Learning targets targets = rewards + self.args.gamma * ( 1 - terminated) * target_max_qvals else: # N step Q-Learning targets n_rewards = th.zeros_like(rewards) gamma_tensor = th.tensor([self.args.gamma**i for i in range(N)], dtype=th.float, device=n_rewards.device) steps = mask.flip(1).cumsum(dim=1).flip(1).clamp_max(N).long() for i in range(batch.max_seq_length - 1): n_rewards[:, i, 0] = ( (rewards * mask)[:, i:i + N, 0] * gamma_tensor[:(batch.max_seq_length - 1 - i)]).sum(dim=1) indices = th.linspace(0, batch.max_seq_length - 2, steps=batch.max_seq_length - 1, device=steps.device).unsqueeze(1).long() n_targets_terminated = th.gather(target_max_qvals * (1 - terminated), dim=1, index=steps.long() + indices - 1) targets = n_rewards + th.pow(self.args.gamma, steps.float()) * n_targets_terminated # Td-error td_error = (chosen_action_qvals - 
targets.detach()) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask.sum() # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) self.optimiser.step() if (episode_num - self.last_target_update_episode ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("loss", loss.item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("q_taken_mean", (chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("target_mean", (targets * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) agent_utils = ( th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze(3) * mask).sum().item() / (mask_elems * self.args.n_agents) self.logger.log_stat("agent_utils", agent_utils, t_env) self.log_stats_t = t_env def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
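The n-step branch in the QLearner above accumulates discounted rewards over a window of N steps and bootstraps from the target Q-value at the end of the window. A simplified, single-sequence sketch of the same idea, without the termination and padding bookkeeping of the learner:

import torch

def n_step_returns(rewards, bootstrap_values, gamma=0.99, n=3):
    """rewards: [T]; bootstrap_values: [T], max target-Q at each next state.
    Returns n-step targets of shape [T]."""
    T = rewards.shape[0]
    targets = torch.zeros(T)
    for t in range(T):
        horizon = min(n, T - t)                          # shrink the window near the end
        discounts = gamma ** torch.arange(horizon, dtype=torch.float)
        targets[t] = (discounts * rewards[t:t + horizon]).sum()
        targets[t] += (gamma ** horizon) * bootstrap_values[t + horizon - 1]
    return targets

rewards = torch.tensor([1.0, 0.0, 0.0, 1.0, 0.0])
q_next = torch.tensor([0.5, 0.4, 0.3, 0.2, 0.1])
print(n_step_returns(rewards, q_next, gamma=0.9, n=3))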
class QLearner: def __init__(self, mac, scheme, logger, args): self.args = args self.mac = mac self.logger = logger self.params = list(mac.parameters()) self.last_target_update_episode = 0 self.mixer = None if args.mixer is not None: if args.mixer == "vdn": self.mixer = VDNMixer() elif args.mixer == "qmix": self.mixer = QMixer(args) else: raise ValueError("Mixer {} not recognised.".format(args.mixer)) self.params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values mac_out = [] # q_without_comm_arr = [] self.mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): agent_outs = self.mac.forward(batch, t=t) # agent_outs, q_without_comm_ = self.mac.forward(batch, t=t, counterfactual=True) mac_out.append(agent_outs) # q_without_comm_arr.append(q_without_comm_) mac_out = th.stack(mac_out, dim=1) # Concat over time # q without communication # q_without_comm = th.stack(q_without_comm_arr, dim=1) # [batch_size, max_seq_length, n_agents, n_actions] # entropy loss(information gain via communication) # q_entropy = Categorical(F.softmax(mac_out, dim=-1)).entropy() # [batch_size, max_seq_length, n_agents, 1] # q_without_comm_entropy = Categorical(F.softmax(q_without_comm, dim=-1)).entropy().detach() # loss_entropy = (q_entropy - q_without_comm_entropy).mean() # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim # Calculate the Q-Values necessary for the target target_mac_out = [] self.target_mac.init_hidden(batch.batch_size) for t in range(batch.max_seq_length): target_agent_outs = self.target_mac.forward(batch, t=t) target_mac_out.append(target_agent_outs) # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack(target_mac_out[1:], dim=1) # Concat across time # Mask out unavailable actions target_mac_out[avail_actions[:, 1:] == 0] = -9999999 # Max over target Q-Values if self.args.double_q: # Get actions that maximise live Q (for double q-learning) mac_out_detach = mac_out.clone().detach() mac_out_detach[avail_actions == 0] = -9999999 cur_max_actions = mac_out_detach[:, 1:].max(dim=3, keepdim=True)[1] target_max_qvals = th.gather(target_mac_out, 3, cur_max_actions).squeeze(3) else: target_max_qvals = target_mac_out.max(dim=3)[0] # Mix if self.mixer is not None: chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) target_max_qvals = self.target_mixer(target_max_qvals, batch["state"][:, 1:]) # Calculate 1-step Q-Learning targets targets = rewards + self.args.gamma * (1 - terminated) * target_max_qvals # Td-error td_error = (chosen_action_qvals - targets.detach()) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data # loss = 
(masked_td_error ** 2).sum() / mask.sum() + self.args.lambda_entropy * loss_entropy loss = (masked_td_error**2).sum() / mask.sum() # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_(self.params, self.args.grad_norm_clip) self.optimiser.step() if (episode_num - self.last_target_update_episode ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: self.logger.log_stat("loss", loss.item(), t_env) # self.logger.log_stat("loss_entropy", loss_entropy.item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("q_taken_mean", (chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("target_mean", (targets * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.log_stats_t = t_env def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: self.mixer.cuda() self.target_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
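Both QLearner variants form their targets with double Q-learning: the online network chooses the argmax over available actions and the target network evaluates it. A compact sketch with dummy tensors, using a VDN-style sum over agents as a stand-in for the mixer:

import torch

batch, timesteps, n_agents, n_actions, gamma = 2, 4, 3, 5, 0.99
mac_out = torch.randn(batch, timesteps, n_agents, n_actions)              # online Q-values
target_mac_out = torch.randn(batch, timesteps - 1, n_agents, n_actions)   # target Q-values for t >= 1
avail_actions = torch.ones(batch, timesteps, n_agents, n_actions)
rewards = torch.rand(batch, timesteps - 1, 1)
terminated = torch.zeros(batch, timesteps - 1, 1)

mac_out_detach = mac_out.clone().detach()
mac_out_detach[avail_actions == 0] = -9999999                             # never argmax an unavailable action
cur_max_actions = mac_out_detach[:, 1:].max(dim=3, keepdim=True)[1]       # argmax from the online network
target_max_qvals = torch.gather(target_mac_out, 3, cur_max_actions).squeeze(3)

# VDN-style sum over agents stands in here for the mixer used in the learners above
targets = rewards + gamma * (1 - terminated) * target_max_qvals.sum(dim=2, keepdim=True)
print(targets.shape)   # torch.Size([2, 3, 1])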
class LatentQLearner(QLearner): def __init__(self, mac, scheme, logger, args): super(LatentQLearner, self).__init__(mac, scheme, logger, args) self.args = args self.mac = mac self.logger = logger self.params = list(mac.parameters()) self.last_target_update_episode = 0 self.mixer = None if args.mixer is not None: if args.mixer == "vdn": self.mixer = VDNMixer() elif args.mixer == "qmix": self.mixer = QMixer(args) else: raise ValueError("Mixer {} not recognised.".format(args.mixer)) self.params += list(self.mixer.parameters()) self.target_mixer = copy.deepcopy(self.mixer) self.optimiser = RMSprop(params=self.params, lr=args.lr, alpha=args.optim_alpha, eps=args.optim_eps) # a little wasteful to deepcopy (e.g. duplicates action selector), but should work for any MAC self.target_mac = copy.deepcopy(mac) self.log_stats_t = -self.args.learner_log_interval - 1 self.role_save = 0 self.role_save_interval = 10 def train(self, batch: EpisodeBatch, t_env: int, episode_num: int): # Get the relevant quantities rewards = batch["reward"][:, :-1] actions = batch["actions"][:, :-1] terminated = batch["terminated"][:, :-1].float() mask = batch["filled"][:, :-1].float() mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) avail_actions = batch["avail_actions"] # Calculate estimated Q-Values mac_out = [] self.mac.init_hidden(batch.batch_size) indicator, latent, latent_vae = self.mac.init_latent(batch.batch_size) reg_loss = 0 dis_loss = 0 ce_loss = 0 for t in range(batch.max_seq_length): agent_outs, loss_, dis_loss_, ce_loss_ = self.mac.forward( batch, t=t, t_glob=t_env, train_mode=True) # (bs,n,n_actions),(bs,n,latent_dim) reg_loss += loss_ dis_loss += dis_loss_ ce_loss += ce_loss_ # loss_cs=self.args.gamma*loss_cs + _loss mac_out.append(agent_outs) # [t,(bs,n,n_actions)] # mac_out_latent.append((agent_outs_latent)) #[t,(bs,n,latent_dim)] reg_loss /= batch.max_seq_length dis_loss /= batch.max_seq_length ce_loss /= batch.max_seq_length mac_out = th.stack(mac_out, dim=1) # Concat over time # (bs,t,n,n_actions), Q values of n_actions # mac_out_latent=th.stack(mac_out_latent,dim=1) # (bs,t,n,latent_dim) # mac_out_latent=mac_out_latent.reshape(-1,self.args.latent_dim) # Pick the Q-Values for the actions taken by each agent chosen_action_qvals = th.gather(mac_out[:, :-1], dim=3, index=actions).squeeze( 3) # Remove the last dim # (bs,t,n) Q value of an action # Calculate the Q-Values necessary for the target target_mac_out = [] self.target_mac.init_hidden(batch.batch_size) # (bs,n,hidden_size) self.target_mac.init_latent(batch.batch_size) # (bs,n,latent_size) for t in range(batch.max_seq_length): target_agent_outs, loss_cs_target, _, _ = self.target_mac.forward( batch, t=t) # (bs,n,n_actions), (bs,n,latent_dim) target_mac_out.append(target_agent_outs) # [t,(bs,n,n_actions)] # We don't need the first timesteps Q-Value estimate for calculating targets target_mac_out = th.stack( target_mac_out[1:], dim=1) # Concat across time, dim=1 is time index # (bs,t,n,n_actions) # Mask out unavailable actions target_mac_out[avail_actions[:, 1:] == 0] = -9999999 # Q values # Max over target Q-Values if self.args.double_q: # True for QMix # Get actions that maximise live Q (for double q-learning) mac_out_detach = mac_out.clone().detach( ) # return a new Tensor, detached from the current graph mac_out_detach[avail_actions == 0] = -9999999 # (bs,t,n,n_actions), discard t=0 cur_max_actions = mac_out_detach[:, 1:].max( dim=3, keepdim=True)[1] # indices instead of values # (bs,t,n,1) target_max_qvals = th.gather(target_mac_out, 3, 
cur_max_actions).squeeze(3) # (bs,t,n,n_actions) ==> (bs,t,n,1) ==> (bs,t,n) max target-Q else: target_max_qvals = target_mac_out.max(dim=3)[0] # Mix if self.mixer is not None: chosen_action_qvals = self.mixer(chosen_action_qvals, batch["state"][:, :-1]) target_max_qvals = self.target_mixer(target_max_qvals, batch["state"][:, 1:]) # (bs,t,1) # Calculate 1-step Q-Learning targets targets = rewards + self.args.gamma * (1 - terminated) * target_max_qvals # Td-error td_error = (chosen_action_qvals - targets.detach() ) # no gradient through target net # (bs,t,1) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask.sum() # entropy loss # mac_out_latent_norm=th.sqrt(th.sum(mac_out_latent*mac_out_latent,dim=1)) # mac_out_latent=mac_out_latent/mac_out_latent_norm[:,None] # loss+=(th.norm(mac_out_latent)/mac_out_latent.size(0))*self.args.entropy_loss_weight loss += reg_loss # Optimise self.optimiser.zero_grad() loss.backward() grad_norm = th.nn.utils.clip_grad_norm_( self.params, self.args.grad_norm_clip) # max_norm self.optimiser.step() if (episode_num - self.last_target_update_episode ) / self.args.target_update_interval >= 1.0: self._update_targets() self.last_target_update_episode = episode_num if t_env - self.log_stats_t >= self.args.learner_log_interval: # if self.role_save % self.role_save_interval == 0: # self.role_save = 0 # if self.args.latent_dim in [2, 3]: # fig = plt.figure() # ax = fig.add_subplot(111, projection='3d') # print(self.mac.agent.latent[:, :self.args.latent_dim], # self.mac.agent.latent[:, -self.args.latent_dim:]) # self.role_save += 1 self.logger.log_stat("loss", loss.item(), t_env) self.logger.log_stat("loss_reg", reg_loss.item(), t_env) self.logger.log_stat("loss_dis", dis_loss.item(), t_env) self.logger.log_stat("loss_ce", ce_loss.item(), t_env) #indicator=[var_mean,mi.max(),mi.min(),mi.mean(),mi.std(),di.max(),di.min(),di.mean(),di.std()] self.logger.log_stat("var_mean", indicator[0].item(), t_env) self.logger.log_stat("mi_max", indicator[1].item(), t_env) self.logger.log_stat("mi_min", indicator[2].item(), t_env) self.logger.log_stat("mi_mean", indicator[3].item(), t_env) self.logger.log_stat("mi_std", indicator[4].item(), t_env) self.logger.log_stat("di_max", indicator[5].item(), t_env) self.logger.log_stat("di_min", indicator[6].item(), t_env) self.logger.log_stat("di_mean", indicator[7].item(), t_env) self.logger.log_stat("di_std", indicator[8].item(), t_env) self.logger.log_stat("grad_norm", grad_norm, t_env) mask_elems = mask.sum().item() self.logger.log_stat( "td_error_abs", (masked_td_error.abs().sum().item() / mask_elems), t_env) self.logger.log_stat("q_taken_mean", (chosen_action_qvals * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) self.logger.log_stat("target_mean", (targets * mask).sum().item() / (mask_elems * self.args.n_agents), t_env) if self.args.use_tensorboard: # log_vec(self,mat,metadata,label_img,global_step,tag) self.logger.log_vec(latent, list(range(self.args.n_agents)), t_env, "latent") self.logger.log_vec(latent_vae, list(range(self.args.n_agents)), t_env, "latent-VAE") self.log_stats_t = t_env def _update_targets(self): self.target_mac.load_state(self.mac) if self.mixer is not None: self.target_mixer.load_state_dict(self.mixer.state_dict()) self.logger.console_logger.info("Updated target network") def cuda(self): self.mac.cuda() self.target_mac.cuda() if self.mixer is not None: 
self.mixer.cuda() self.target_mixer.cuda() def save_models(self, path): self.mac.save_models(path) if self.mixer is not None: th.save(self.mixer.state_dict(), "{}/mixer.th".format(path)) th.save(self.optimiser.state_dict(), "{}/opt.th".format(path)) def load_models(self, path): self.mac.load_models(path) # Not quite right but I don't want to save target networks self.target_mac.load_models(path) if self.mixer is not None: self.mixer.load_state_dict( th.load("{}/mixer.th".format(path), map_location=lambda storage, loc: storage)) self.optimiser.load_state_dict( th.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage))
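All learners in this section refresh their target networks with a hard parameter copy once `target_update_interval` episodes (or critic steps) have elapsed. A minimal sketch of that pattern on a toy module:

import copy
import torch.nn as nn

online = nn.Linear(8, 4)
target = copy.deepcopy(online)                 # start fully in sync, as in the learners above

last_target_update_episode = 0
target_update_interval = 200

for episode_num in range(1, 1001):
    # ... one training update on `online` would happen here ...
    if (episode_num - last_target_update_episode) / target_update_interval >= 1.0:
        target.load_state_dict(online.state_dict())   # hard copy of the online weights
        last_target_update_episode = episode_num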
class PPO(Agent):
    """
    An agent trained with PPO using the advantage actor-critic framework.
    - The actor takes the state as input.
    - The critic takes both state and action as input.
    - The agent interacts with the environment to collect experience.
    - The agent trains on that experience to update its policy.
    - Adam appears to work better than RMSprop for PPO.
    """

    def __init__(self, env, state_dim, action_dim,
                 memory_capacity=10000, max_steps=None,
                 roll_out_n_steps=1, target_tau=1.,
                 target_update_steps=5, clip_param=0.2,
                 reward_gamma=0.99, reward_scale=1., done_penalty=None,
                 actor_hidden_size=32, critic_hidden_size=32,
                 actor_output_act=nn.functional.log_softmax, critic_loss="mse",
                 actor_lr=0.001, critic_lr=0.001,
                 optimizer_type="adam", entropy_reg=0.01,
                 max_grad_norm=0.5, batch_size=100, episodes_before_train=100,
                 epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=200,
                 use_cuda=True):
        super(PPO, self).__init__(env, state_dim, action_dim,
                                  memory_capacity, max_steps,
                                  reward_gamma, reward_scale, done_penalty,
                                  actor_hidden_size, critic_hidden_size,
                                  actor_output_act, critic_loss,
                                  actor_lr, critic_lr,
                                  optimizer_type, entropy_reg,
                                  max_grad_norm, batch_size, episodes_before_train,
                                  epsilon_start, epsilon_end, epsilon_decay,
                                  use_cuda)

        self.roll_out_n_steps = roll_out_n_steps
        self.target_tau = target_tau
        self.target_update_steps = target_update_steps
        self.clip_param = clip_param

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        # Ensure the target networks and the learning networks start with the same weights
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)

        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr)

        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()

    # interact with the environment to collect experience
    def interact(self):
        super(PPO, self)._take_n_steps()

    # train on a roll-out batch
    def train(self):
        # do not train until enough exploration episodes have been collected
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        values = self.critic_target(states_var, actions_var).detach()
        advantages = rewards_var - values
        # # normalizing the advantages does not seem to work correctly here
        # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        action_log_probs = self.actor(states_var)
        action_log_probs = torch.sum(action_log_probs * actions_var, 1)
        old_action_log_probs = self.actor_target(states_var).detach()
        old_action_log_probs = torch.sum(old_action_log_probs * actions_var, 1)
        ratio = torch.exp(action_log_probs - old_action_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages
        # PPO's pessimistic surrogate (L^CLIP)
        actor_loss = -torch.mean(torch.min(surr1, surr2))
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        values = self.critic(states_var, actions_var)
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()

        # periodically update the actor and critic target networks
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(PPO, self)._soft_update_target(self.actor_target, self.actor)
            super(PPO, self)._soft_update_target(self.critic_target, self.critic)

    # predict softmax action probabilities for a state
    def _softmax_action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        softmax_action_var = torch.exp(self.actor(state_var))
        if self.use_cuda:
            softmax_action = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action = softmax_action_var.data.numpy()[0]
        return softmax_action

    # choose an action, with random noise added for exploration during training
    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * self.n_steps / self.epsilon_decay)
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_dim)
        else:
            action = np.argmax(softmax_action)
        return action

    # choose an action based on state for execution
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    # evaluate the value of a state-action pair
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action = index_to_one_hot(action, self.action_dim)
        action_var = to_tensor_var([action], self.use_cuda)
        value_var = self.critic(state_var, action_var)
        if self.use_cuda:
            value = value_var.data.cpu().numpy()[0]
        else:
            value = value_var.data.numpy()[0]
        return value
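The clipped surrogate term above is interleaved with optimizer and target-network bookkeeping. The following is a minimal, self-contained sketch of the same L^CLIP objective in isolation; the function name and standalone tensors are illustrative only and are not part of the agent's API.

import torch

def ppo_clip_loss(log_probs, old_log_probs, advantages, clip_param=0.2):
    """Sketch of PPO's pessimistic clipped surrogate (L^CLIP).

    log_probs, old_log_probs, advantages: 1-D tensors of equal length.
    """
    ratio = torch.exp(log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # take the elementwise minimum (pessimistic bound) and negate for gradient descent
    return -torch.min(unclipped, clipped).mean()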
class DDPG(Agent):
    """
    An agent trained with Deep Deterministic Policy Gradient using the actor-critic framework.
    - The actor takes the state as input.
    - The critic takes both state and action as input.
    - The critic is trained with temporal-difference targets from the target networks.
    """

    def __init__(self, env, state_dim, action_dim,
                 memory_capacity=10000, max_steps=None,
                 target_tau=0.01, target_update_steps=5,
                 reward_gamma=0.99, reward_scale=1., done_penalty=None,
                 actor_hidden_size=32, critic_hidden_size=32,
                 actor_output_act=nn.functional.tanh, critic_loss="mse",
                 actor_lr=0.001, critic_lr=0.001,
                 optimizer_type="adam", entropy_reg=0.01,
                 max_grad_norm=0.5, batch_size=100, episodes_before_train=100,
                 epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=200,
                 use_cuda=True):
        super(DDPG, self).__init__(env, state_dim, action_dim,
                                   memory_capacity, max_steps,
                                   reward_gamma, reward_scale, done_penalty,
                                   actor_hidden_size, critic_hidden_size,
                                   actor_output_act, critic_loss,
                                   actor_lr, critic_lr,
                                   optimizer_type, entropy_reg,
                                   max_grad_norm, batch_size, episodes_before_train,
                                   epsilon_start, epsilon_end, epsilon_decay,
                                   use_cuda)

        self.target_tau = target_tau
        self.target_update_steps = target_update_steps

        self.actor = ActorNetwork(self.state_dim, self.actor_hidden_size,
                                  self.action_dim, self.actor_output_act)
        self.critic = CriticNetwork(self.state_dim, self.action_dim,
                                    self.critic_hidden_size, 1)
        # Ensure the target networks and the learning networks start with the same weights
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)

        if self.optimizer_type == "adam":
            self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        elif self.optimizer_type == "rmsprop":
            self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)
            self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr)

        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()

    # interact with the environment to collect experience
    def interact(self):
        super(DDPG, self)._take_one_step()

    # train on a sampled batch
    def train(self):
        # do not train until enough exploration episodes have been collected
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        state_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        action_var = to_tensor_var(batch.actions, self.use_cuda).view(-1, self.action_dim)
        reward_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
        next_state_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
        done_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

        # estimate the target Q with the actor_target and critic_target networks
        next_action_var = self.actor_target(next_state_var)
        next_q = self.critic_target(next_state_var, next_action_var).detach()
        target_q = self.reward_scale * reward_var + self.reward_gamma * next_q * (1. - done_var)

        # update critic network
        self.critic_optimizer.zero_grad()
        # current Q values
        current_q = self.critic(state_var, action_var)
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(current_q, target_q)
        else:
            critic_loss = nn.MSELoss()(current_q, target_q)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor network
        self.actor_optimizer.zero_grad()
        # the actor's action prediction for the sampled states
        action = self.actor(state_var)
        # the actor loss maximizes the Q value of the predicted action
        actor_loss = -self.critic(state_var, action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # periodically update the actor and critic target networks
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(DDPG, self)._soft_update_target(self.critic_target, self.critic)
            super(DDPG, self)._soft_update_target(self.actor_target, self.actor)

    # choose an action, with random noise added for exploration during training
    def exploration_action(self, state):
        action = self.action(state)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * self.n_steps / self.epsilon_decay)
        # add exploration noise
        noise = np.random.randn(self.action_dim) * epsilon
        action += noise
        return action

    # choose an action based on state for execution
    def action(self, state):
        action_var = self.actor(to_tensor_var([state], self.use_cuda))
        if self.use_cuda:
            action = action_var.data.cpu().numpy()[0]
        else:
            action = action_var.data.numpy()[0]
        return action
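Both agents rely on _soft_update_target from the Agent base class, which is not shown in this listing. A typical Polyak-averaging implementation, assuming the (target, source) argument order used above and a tau like self.target_tau, looks roughly like this sketch (function name and signature are illustrative, not the base class's actual code):

def soft_update_target(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target.

    Sketch of what a soft target update usually does; the base-class
    implementation used by PPO/DDPG above may differ in detail.
    """
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)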
class PolicyGradientLearner:
    def __init__(self, mac, scheme, logger, args):
        self.args = args
        self.n_agents = args.n_agents
        self.n_actions = args.n_actions
        self.mac = mac
        self.logger = logger
        self.last_target_update_step = 0
        # self.critic_training_steps = 0
        self.log_stats_t = -self.args.learner_log_interval - 1

        # The central-V critic is disabled for this vanilla policy-gradient learner:
        # if args.critic_fact is not None:
        #     self.critic = FactoredCentralVCritic(scheme, args)
        # else:
        #     self.critic = CentralVCritic(scheme, args)
        # self.target_critic = copy.deepcopy(self.critic)

        self.agent_params = list(mac.parameters())
        # self.critic_params = list(self.critic.parameters())
        self.params = self.agent_params  # + self.critic_params

        self.agent_optimiser = RMSprop(params=self.agent_params, lr=args.lr,
                                       alpha=args.optim_alpha, eps=args.optim_eps)
        # self.critic_optimiser = RMSprop(params=self.critic_params, lr=args.critic_lr,
        #                                 alpha=args.optim_alpha, eps=args.optim_eps)

    def train(self, batch: EpisodeBatch, t_env: int, episode_num: int):
        # Get the relevant quantities
        rewards = batch["reward"][:, :-1]
        actions = batch["actions"][:, :-1]
        terminated = batch["terminated"][:, :-1].float()
        mask = batch["filled"][:, :-1].float()
        mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])
        avail_actions = batch["avail_actions"][:, :-1]
        mask = mask.repeat(1, 1, self.n_agents)
        # critic_mask = mask.clone()

        # Get the policy outputs (pi logits) from the controller
        mac_out = []
        self.mac.init_hidden(batch.batch_size)
        for t in range(batch.max_seq_length - 1):
            agent_outs = self.mac.forward(batch, t=t)
            mac_out.append(agent_outs)
        mac_out = th.stack(mac_out, dim=1)  # Concat over time

        # Mask out unavailable actions and renormalise (as in action selection)
        mac_out[avail_actions == 0] = 0
        mac_out = mac_out / mac_out.sum(dim=-1, keepdim=True)
        mac_out[avail_actions == 0] = 0

        pi = mac_out
        pi_taken = th.gather(pi, dim=3, index=actions).squeeze(3)
        pi_taken[mask == 0] = 1.0
        log_pi_taken = th.log(pi_taken)

        # V-values from the central-V critic are disabled; no baseline is used here
        # q_sa, v_vals, critic_train_stats = self._train_critic(batch, rewards, terminated, critic_mask)
        # baseline = v_vals
        q_sa = self.returns(rewards, mask)

        # No baseline: plain vanilla policy gradient with RNN policies
        # advantages = (q_sa - baseline).detach().squeeze()
        advantages = q_sa.detach().squeeze()

        entropy = -(pi * th.log(pi) * mask[:, :, :, None]).sum() / (mask.sum() * pi.shape[-1])
        centralV_loss = - ((advantages * log_pi_taken) * mask).sum() / mask.sum() \
            - self.args.entropy_alpha * entropy

        # Optimise agents
        self.agent_optimiser.zero_grad()
        centralV_loss.backward()
        grad_norm = th.nn.utils.clip_grad_norm_(self.agent_params, self.args.grad_norm_clip)
        self.agent_optimiser.step()

        # if (self.critic_training_steps - self.last_target_update_step) / self.args.target_update_interval >= 1.0:
        #     self._update_targets()
        #     self.last_target_update_step = self.critic_training_steps

        if t_env - self.log_stats_t >= self.args.learner_log_interval:
            # ts_logged = len(critic_train_stats["critic_loss"])
            # for key in ["critic_loss", "critic_grad_norm", "td_error_abs", "q_taken_mean", "target_mean"]:
            #     self.logger.log_stat(key, sum(critic_train_stats[key]) / ts_logged, t_env)
            self.logger.log_stat("advantage_mean",
                                 (advantages * mask).sum().item() / mask.sum().item(), t_env)
            self.logger.log_stat("centralV_loss", centralV_loss.item(), t_env)
            self.logger.log_stat("agent_grad_norm", grad_norm, t_env)
            self.logger.log_stat("pi_entropy", entropy.item(), t_env)
            self.logger.log_stat("pi_max",
                                 (pi.max(dim=-1)[0] * mask).sum().item() / mask.sum().item(), t_env)
            self.log_stats_t = t_env

    def returns(self, rewards, mask):
        # Discounted return-to-go for every start step, masked to ignore padded data
        nstep_values = th.zeros_like(mask)
        for t_start in range(rewards.size(1)):
            nstep_return_t = th.zeros_like(mask[:, 0])
            for step in range(rewards.size(1)):
                t = t_start + step
                if t >= rewards.size(1) - 1:
                    break
                # elif step == nsteps:
                #     nstep_return_t += self.args.gamma ** step * values[:, t] * mask[:, t]
                # elif t == rewards.size(1) - 1:
                #     nstep_return_t += self.args.gamma ** step * values[:, t] * mask[:, t]
                else:
                    nstep_return_t += self.args.gamma ** step * rewards[:, t] * mask[:, t]
            nstep_values[:, t_start, :] = nstep_return_t
        return nstep_values

    def cuda(self):
        self.mac.cuda()
        # The critic is disabled in this learner, so only the controller is moved to the GPU.
        # self.critic.cuda()
        # self.target_critic.cuda()

    def save_models(self, path):
        self.mac.save_models(path)
        # th.save(self.critic.state_dict(), "{}/critic.th".format(path))
        th.save(self.agent_optimiser.state_dict(), "{}/agent_opt.th".format(path))
        # th.save(self.critic_optimiser.state_dict(), "{}/critic_opt.th".format(path))

    def load_models(self, path):
        self.mac.load_models(path)
        # self.critic.load_state_dict(th.load("{}/critic.th".format(path),
        #                                     map_location=lambda storage, loc: storage))
        # Not quite right, but target networks are not saved here:
        # self.target_critic.load_state_dict(self.critic.state_dict())
        self.agent_optimiser.load_state_dict(
            th.load("{}/agent_opt.th".format(path), map_location=lambda storage, loc: storage))
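The double loop in returns() computes a discounted return-to-go for every start step. A minimal standalone sketch of the same quantity with a single backward scan is shown below; the tensor shapes are simplified (batch, time) and the function name is illustrative. Note that the learner above additionally repeats the mask per agent and, because of the break condition, excludes the final reward step.

import torch

def discounted_returns(rewards, mask, gamma):
    """Return-to-go G_t = sum_k gamma^k * r_{t+k}, computed in one backward pass.

    rewards, mask: tensors of shape (batch, time). Sketch only; not the
    learner's exact masking or per-agent layout.
    """
    returns = torch.zeros_like(rewards)
    running = torch.zeros(rewards.size(0))
    for t in reversed(range(rewards.size(1))):
        running = rewards[:, t] * mask[:, t] + gamma * running
        returns[:, t] = running
    return returns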
def train(**kwargs):
    # override default options with keyword arguments
    for k, v in kwargs.items():
        setattr(opt, k, v)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    torch.backends.cudnn.enabled = False

    # dataset
    if opt.chinese:
        mydata = TrainData(opt.chinese_data_path, opt.conversation_path,
                           opt.chinese_results_path, opt.chinese, opt.fb,
                           opt.prev_sent, True)
    else:
        mydata = TrainData(opt.data_path, opt.conversation_path,
                           opt.results_path, opt.chinese, opt.fb,
                           opt.prev_sent, True)

    # models
    if opt.attn:
        seq2seq = NewSeq2seqAttention(num_tokens=mydata.data.num_tokens,
                                      opt=opt,
                                      sos_id=mydata.data.word2id["<START>"])
        if opt.model_attention_path:
            seq2seq.load_state_dict(torch.load(opt.model_attention_path, map_location="cpu"))
            print("Pretrained model has been loaded.\n")
    else:
        seq2seq = NewSeq2seq(num_tokens=mydata.data.num_tokens,
                             opt=opt,
                             sos_id=mydata.data.word2id["<START>"])
        if opt.chinese:
            if opt.chinese_model_path:
                seq2seq.load_state_dict(torch.load(opt.chinese_model_path, map_location="cpu"))
                print("Pretrained model has been loaded.\n")
        else:
            if opt.model_path:
                seq2seq.load_state_dict(torch.load(opt.model_path, map_location="cpu"))
                print("Pretrained model has been loaded.\n")
    seq2seq = seq2seq.to(device)

    # optimizer and loss
    optimizer = RMSprop(seq2seq.parameters(), lr=opt.learning_rate)
    criterion = nn.CrossEntropyLoss().to(device)

    for epoch in range(opt.epochs):
        print("epoch %d:" % epoch)
        mini_batches = mydata._mini_batches(opt.batch_size)
        for ii, (ib, tb) in enumerate(mini_batches):
            ib = ib.to(device)
            tb = tb.to(device)

            optimizer.zero_grad()
            decoder_outputs, decoder_hidden1, decoder_hidden2 = seq2seq(ib, tb)

            # Debug: print the source tokens and the decoder's top-1 predictions
            # for the first sequence in the batch
            src_words = []
            pred_words = []
            for t in range(opt.mxlen):
                _, indices = torch.topk(decoder_outputs[t][0], 1)
                src_words.append(mydata.data.id2word[ib[t][0].item()])
                pred_words.append(mydata.data.id2word[indices[0].item()])
            print(src_words)
            print(pred_words)

            # Reshape the outputs; the targets are the ground-truth sequence shifted by one step
            b = decoder_outputs.size(1)
            t = decoder_outputs.size(0)
            targets = Variable(torch.zeros(t, b)).to(device)   # (time_steps, batch_size)
            targets[:-1, :] = tb[1:, :]
            targets = targets.contiguous().view(-1)            # (time_steps * batch_size)
            decoder_outputs = decoder_outputs.view(b * t, -1)   # (time_steps * batch_size) x V

            loss = criterion(decoder_outputs, targets.long())
            print("Current Loss:", loss.data.item())
            loss.backward()
            optimizer.step()

        if opt.chinese:
            save_path = "checkpoints/chinese-epoch-%s.pth" % epoch
        else:
            save_path = "checkpoints/epoch-%s.pth" % epoch
        torch.save(seq2seq.state_dict(), save_path)
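The target construction above scores the decoder output at position t against the ground-truth token at position t+1 (teacher forcing with a one-step shift). A toy-sized sketch of that shift and the flattened cross-entropy, using made-up dimensions rather than the real TrainData batches, is shown here:

import torch
import torch.nn as nn

# Toy sizes; the real values come from opt and the dataset.
time_steps, batch_size, vocab = 5, 2, 10
decoder_outputs = torch.randn(time_steps, batch_size, vocab)   # unnormalised logits
tb = torch.randint(0, vocab, (time_steps, batch_size))         # ground-truth token ids

targets = torch.zeros(time_steps, batch_size, dtype=torch.long)
targets[:-1, :] = tb[1:, :]   # position t is scored against token t+1; last row stays 0 (padding)

loss = nn.CrossEntropyLoss()(decoder_outputs.view(time_steps * batch_size, -1),
                             targets.view(-1))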
class COMALearner:
    def __init__(self, mac, scheme, logger, args):
        self.args = args
        self.n_agents = args.n_agents
        self.n_actions = args.n_actions
        self.mac = mac
        self.logger = logger

        self.last_target_update_step = 0
        self.critic_training_steps = 0
        self.log_stats_t = -self.args.learner_log_interval - 1

        self.critic = COMACritic(scheme, args)
        self.target_critic = copy.deepcopy(self.critic)

        self.agent_params = list(mac.parameters())
        self.critic_params = list(self.critic.parameters())
        self.params = self.agent_params + self.critic_params

        self.agent_optimiser = RMSprop(params=self.agent_params, lr=args.lr,
                                       alpha=args.optim_alpha, eps=args.optim_eps)
        self.critic_optimiser = RMSprop(params=self.critic_params, lr=args.critic_lr,
                                        alpha=args.optim_alpha, eps=args.optim_eps)

    def train(self, batch: EpisodeBatch, t_env: int, episode_num: int):
        # Get the relevant quantities
        bs = batch.batch_size
        max_t = batch.max_seq_length
        rewards = batch["reward"][:, :-1]
        actions = batch["actions"][:, :]
        terminated = batch["terminated"][:, :-1].float()
        mask = batch["filled"][:, :-1].float()
        mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])
        avail_actions = batch["avail_actions"][:, :-1]

        critic_mask = mask.clone()
        mask = mask.repeat(1, 1, self.n_agents).view(-1)

        q_vals, critic_train_stats = self._train_critic(batch, rewards, terminated, actions,
                                                        avail_actions, critic_mask, bs, max_t)

        actions = actions[:, :-1]

        mac_out = []
        self.mac.init_hidden(batch.batch_size)
        for t in range(batch.max_seq_length - 1):
            agent_outs = self.mac.forward(batch, t=t)
            mac_out.append(agent_outs)
        mac_out = th.stack(mac_out, dim=1)  # Concat over time

        # Mask out unavailable actions if legal-action masking is enabled, then
        # renormalise (as in action selection)
        if self.args.legal_action:
            mac_out[avail_actions == 0] = 0
        mac_out = mac_out / mac_out.sum(dim=-1, keepdim=True)
        if self.args.legal_action:
            mac_out[avail_actions == 0] = 0

        # Calculate the counterfactual baseline
        q_vals = q_vals.reshape(-1, self.n_actions)
        pi = mac_out.view(-1, self.n_actions)
        baseline = (pi * q_vals).sum(-1).detach()

        # Calculate the policy gradient with the mask
        q_taken = th.gather(q_vals, dim=1, index=actions.reshape(-1, 1)).squeeze(1)
        pi_taken = th.gather(pi, dim=1, index=actions.reshape(-1, 1)).squeeze(1)
        pi_taken[mask == 0] = 1.0
        log_pi_taken = th.log(pi_taken)

        advantages = (q_taken - baseline).detach()

        coma_loss = - ((advantages * log_pi_taken) * mask).sum() / mask.sum()

        # Optimise agents
        self.agent_optimiser.zero_grad()
        coma_loss.backward()
        grad_norm = th.nn.utils.clip_grad_norm_(self.agent_params, self.args.grad_norm_clip)
        self.agent_optimiser.step()

        if (self.critic_training_steps - self.last_target_update_step) / self.args.target_update_interval >= 1.0:
            self._update_targets()
            self.last_target_update_step = self.critic_training_steps

        if t_env - self.log_stats_t >= self.args.learner_log_interval:
            ts_logged = len(critic_train_stats["critic_loss"])
            for key in ["critic_loss", "critic_grad_norm", "td_error_abs", "q_taken_mean", "target_mean"]:
                self.logger.log_stat(key, sum(critic_train_stats[key]) / ts_logged, t_env)
            self.logger.log_stat("advantage_mean",
                                 (advantages * mask).sum().item() / mask.sum().item(), t_env)
            self.logger.log_stat("coma_loss", coma_loss.item(), t_env)
            self.logger.log_stat("agent_grad_norm", grad_norm, t_env)
            self.logger.log_stat("pi_max",
                                 (pi.max(dim=1)[0] * mask).sum().item() / mask.sum().item(), t_env)
            self.log_stats_t = t_env

    def _train_critic(self, batch, rewards, terminated, actions, avail_actions, mask, bs, max_t):
        # Optimise the critic
        target_q_vals = self.target_critic(batch)[:, :]
        targets_taken = th.gather(target_q_vals, dim=3, index=actions).squeeze(3)

        # Calculate td-lambda targets
        targets = build_td_lambda_targets(rewards, terminated, mask, targets_taken,
                                          self.n_agents, self.args.gamma, self.args.td_lambda)

        q_vals = th.zeros_like(target_q_vals)[:, :-1]

        running_log = {
            "critic_loss": [],
            "critic_grad_norm": [],
            "td_error_abs": [],
            "target_mean": [],
            "q_taken_mean": [],
        }

        for t in reversed(range(rewards.size(1))):
            mask_t = mask[:, t].expand(-1, self.n_agents)
            if mask_t.sum() == 0:
                continue

            q_t = self.critic(batch, t)
            q_vals[:, t] = q_t.view(bs, self.n_agents, self.n_actions)
            q_taken = th.gather(q_t, dim=3, index=actions[:, t:t + 1]).squeeze(3).squeeze(1)
            targets_t = targets[:, t]

            td_error = (q_taken - targets_t.detach())

            # 0-out the targets that came from padded data
            masked_td_error = td_error * mask_t

            # Normal L2 loss, take the mean over actual (unpadded) data
            loss = (masked_td_error ** 2).sum() / mask_t.sum()
            self.critic_optimiser.zero_grad()
            loss.backward()
            grad_norm = th.nn.utils.clip_grad_norm_(self.critic_params, self.args.grad_norm_clip)
            self.critic_optimiser.step()
            self.critic_training_steps += 1

            running_log["critic_loss"].append(loss.item())
            running_log["critic_grad_norm"].append(grad_norm)
            mask_elems = mask_t.sum().item()
            running_log["td_error_abs"].append(masked_td_error.abs().sum().item() / mask_elems)
            running_log["q_taken_mean"].append((q_taken * mask_t).sum().item() / mask_elems)
            running_log["target_mean"].append((targets_t * mask_t).sum().item() / mask_elems)

        return q_vals, running_log

    def _update_targets(self):
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.logger.console_logger.info("Updated target network")

    def cuda(self):
        self.mac.cuda()
        self.critic.cuda()
        self.target_critic.cuda()

    def save_models(self, path):
        self.mac.save_models(path)
        th.save(self.critic.state_dict(), "{}/critic.th".format(path))
        th.save(self.agent_optimiser.state_dict(), "{}/agent_opt.th".format(path))
        th.save(self.critic_optimiser.state_dict(), "{}/critic_opt.th".format(path))

    def load_models(self, path):
        self.mac.load_models(path)
        self.critic.load_state_dict(th.load("{}/critic.th".format(path),
                                            map_location=lambda storage, loc: storage))
        # Not quite right, but target networks are not saved here, so copy the learned critic:
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.agent_optimiser.load_state_dict(th.load("{}/agent_opt.th".format(path),
                                                     map_location=lambda storage, loc: storage))
        self.critic_optimiser.load_state_dict(th.load("{}/critic_opt.th".format(path),
                                                      map_location=lambda storage, loc: storage))
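The baseline and advantage in the COMA learner above are easy to miss among the batching and masking. The sketch below isolates the counterfactual advantage for a single agent; the function name and flat (batch, n_actions) shapes are illustrative only.

import torch as th

def coma_advantage(q_vals, pi, actions_taken):
    """Sketch of COMA's counterfactual advantage for one agent.

    q_vals:        (batch, n_actions)  critic estimates Q(s, a) for every action
    pi:            (batch, n_actions)  current policy probabilities
    actions_taken: (batch, 1)          indices of the executed actions
    """
    baseline = (pi * q_vals).sum(dim=-1)                                # E_{a ~ pi}[Q(s, a)]
    q_taken = th.gather(q_vals, dim=1, index=actions_taken).squeeze(1)  # Q(s, a_taken)
    return (q_taken - baseline).detach()                                # advantage for the policy loss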
class NEC_Agent:
    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration model
        self.exp_model = exp_model
        self.log = logging_func["log"]

        # Experience replay
        self.replay = ExpReplay(args.exp_replay_size, args)

        # One differentiable neural dictionary (DND) per action
        self.dnds = [DND(kernel=kernel, num_neighbors=args.nec_neighbours,
                         max_memory=args.dnd_size, embedding_size=args.nec_embedding)
                     for _ in range(self.args.actions)]

        # Embedding network
        model = get_models(args.model)
        self.embedding = model(embedding=args.nec_embedding)

        embedding_params = 0
        for weight in self.embedding.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            embedding_params += weight_params
        print("Embedding Network has {:,} parameters.".format(embedding_params))

        if args.gpu:
            print("Moving models to GPU.")
            self.embedding.cuda()

        # Optimizer
        self.optimizer = RMSprop(self.embedding.parameters(), lr=args.lr)
        # self.optimizer = Adam(self.embedding.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

        self.experiences = []
        self.keys = []
        self.q_val_estimates = []
        self.table_updates = 0

    def Q_Value_Estimates(self, state):
        # Get the state embedding (key) and look it up in every action's DND
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        key = self.embedding(Variable(state, volatile=True)).cpu()

        if (key != key).sum().data[0] > 0:
            # NaN in the embedding key; left as a no-op check for debugging
            pass

        estimate_from_dnds = torch.cat([dnd.lookup(key) for dnd in self.dnds])

        self.keys.append(key.data[0].numpy())
        self.q_val_estimates.append(estimate_from_dnds.data.numpy())

        return estimate_from_dnds, key

    def act(self, state, epsilon, exp_model):
        q_values, key = self.Q_Value_Estimates(state)
        q_values_numpy = q_values.data.numpy()

        extra_info = {}
        extra_info["Q_Values"] = q_values_numpy

        # Epsilon-greedy action selection over the DND value estimates
        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = np.argmax(q_values_numpy)

        extra_info["Action"] = action
        return action, extra_info

    def experience(self, state, action, reward, state_next, steps, terminated,
                   pseudo_reward=0, density=1, exploring=False):
        experience = (state, action, reward, pseudo_reward, state_next, terminated)
        self.experiences.append(experience)
        if len(self.experiences) >= self.args.n_step:
            self.add_experience()
        if not exploring:
            self.T += 1

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()
        # Flush the remaining experiences to the replay using a shorter-than-N-step estimate
        while len(self.experiences) > 0:
            self.add_experience()

    def add_experience(self):
        # Match the stored keys and Q-value estimates to the number of pending experiences
        N = len(self.experiences)
        self.keys = self.keys[-N:]
        self.q_val_estimates = self.q_val_estimates[-N:]

        first_state = self.experiences[0][0]
        first_action = self.experiences[0][1]
        last_state = self.experiences[-1][4]
        terminated_last_state = self.experiences[-1][5]

        # Accumulate the discounted (reward + pseudo-reward) over the stored transitions
        accum_reward = 0
        for ex in reversed(self.experiences):
            r = ex[2]
            pr = ex[3]
            accum_reward = (r + pr) + self.args.gamma * accum_reward

        # Bootstrap from the last state's maximum DND estimate unless it was terminal
        if terminated_last_state:
            last_state_max_q_val = 0
        else:
            last_state_max_q_val = np.max(self.q_val_estimates[-1])

        first_state_key = self.keys[0]
        n_step_q_val_estimate = accum_reward + \
            (self.args.gamma ** len(self.experiences)) * last_state_max_q_val

        # Write to the DND of the first action: update the stored value if the key
        # already exists, otherwise insert a new entry
        if self.dnds[first_action].is_present(key=first_state_key):
            current_q_val = self.dnds[first_action].get_value(key=first_state_key)
            new_q_val = current_q_val + self.args.nec_alpha * (n_step_q_val_estimate - current_q_val)
            self.dnds[first_action].upsert(key=first_state_key, value=new_q_val)
            self.table_updates += 1
            self.log("NEC/Table_Updates", self.table_updates, step=self.T)
        else:
            self.dnds[first_action].upsert(key=first_state_key, value=n_step_q_val_estimate)

        # Add to the replay buffer
        self.replay.Add_Exp(first_state, first_action, n_step_q_val_estimate)

        # Remove the first (now stored) experience
        self.experiences = self.experiences[1:]

    def train(self):
        info = {}
        if self.T % self.args.nec_update != 0:
            return info

        for _ in range(self.args.iters):
            # TODO: Use a named tuple for experience replay
            batch = self.replay.Sample(self.args.batch_size)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = columns[1]
            targets = Variable(torch.FloatTensor(columns[2]))

            # Re-embed the states and look each key up in the DND of the stored action
            keys = self.embedding(states).cpu()
            model_predictions = torch.cat([self.dnds[action].lookup(key.unsqueeze(0))
                                           for action, key in zip(actions, keys)])

            td_error = model_predictions - targets
            info["TD_Error"] = td_error.mean().data[0]

            l2_loss = td_error.pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update the embedding network
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm;
            # remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.embedding.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        return info
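The DND class used above is not shown in this listing. For orientation, here is a minimal sketch of the kind of kernel-weighted lookup NEC performs over its k nearest stored keys (inverse-distance kernel); the function name, dense-tensor layout, and delta constant are assumptions, not the actual DND implementation.

import torch

def dnd_lookup(query, stored_keys, stored_values, num_neighbors=50, delta=1e-3):
    """Sketch of a differentiable neural dictionary lookup.

    query:         (1, embedding_size)
    stored_keys:   (memory_size, embedding_size)
    stored_values: (memory_size,)
    """
    dists = ((stored_keys - query) ** 2).sum(dim=1)           # squared distance to every stored key
    k = min(num_neighbors, stored_keys.size(0))
    nn_dists, nn_idx = torch.topk(dists, k, largest=False)    # k nearest neighbours
    weights = 1.0 / (nn_dists + delta)                        # inverse-distance kernel
    weights = weights / weights.sum()
    return (weights * stored_values[nn_idx]).sum()            # kernel-weighted value estimate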