def one_model_evaluation(self, model, epsilon, exclude_wrong_predictions,
                         targeted, true_labels, all_preds, entropies):
    adv_acc = []
    ood_entropies = np.zeros(0)
    for im, crit in self.test_loader:
        adv_results, predictions = construct_adversarial_examples(
            im, crit, self.method, model, self.device, epsilon,
            exclude_wrong_predictions, targeted)
        _, advs, _ = adv_results
        advs = advs.cpu()
        new_x = []
        for image in advs:
            if image.dim() > 3:
                image = self.transform(image.squeeze(0)).unsqueeze(0)
            else:
                image = self.transform(image).unsqueeze(0)
            new_x.append(image)
        advs = torch.cat(new_x)
        advs = advs.to(self.device)
        adv_acc.append((model.forward(advs).argmax(dim=-1).cpu().flatten()
                        == crit).float().sum().numpy() / len(im))
        x = advs
        out = model(x)
        probs = F.softmax(out, dim=-1)
        preds, indices = torch.max(probs, dim=-1)
        entropy = Categorical(probs).entropy().squeeze()
        ood_entropies = np.concatenate(
            (ood_entropies, entropy.detach().cpu().numpy()))
        entropies = np.concatenate(
            (entropies, entropy.detach().cpu().numpy()))
        true_labels = np.concatenate((true_labels, np.zeros(len(x))))
        all_preds = np.concatenate(
            (all_preds, preds.detach().cpu().reshape((-1))))
    auroc = calculate_auroc(true_labels, all_preds)
    aupr = calculate_aupr(true_labels, all_preds)
    auroc_entropy = calculate_auroc(1 - true_labels, entropies)
    aupr_entropy = calculate_aupr(1 - true_labels, entropies)
    return (np.mean(adv_acc), auroc, aupr, auroc_entropy, aupr_entropy,
            np.mean(ood_entropies))
def forward(self, state_rep):
    logits = self.net(state_rep)
    log_probs = F.log_softmax(logits, dim=-1)
    probs = torch.exp(log_probs)
    arg = Categorical(probs).sample()
    arg = arg.detach().cpu().numpy()
    return arg.reshape(-1, 1), log_probs[range(len(arg)), arg], log_probs
def forward(self, state_rep): """ Input ----- state_rep: (batch_size, n_features) Returns ------- arg: (batch_size, n_args) log_prob: (batch_size, n_args) """ logits = self.net(state_rep).view( -1, self.n_args, self.max_size) # (batch_size, n_args, max_size) # Infer device from spatial_params_net output with parallel_log_prob.is_cuda if logits.is_cuda: device = 'cuda' # Assume only 1 GPU device is used else: device = 'cpu' self.sizes_mask = self.sizes_mask.to(device) log_probs = F.log_softmax(logits.masked_fill(self.sizes_mask.bool(), float('-inf')), dim=(-1)) probs = torch.exp(log_probs) arg = Categorical(probs).sample() #(batch_size, n_args) log_prob = log_probs.view(-1, self.max_size)[torch.arange(arg.shape[0]*arg.shape[1]), arg.flatten()]\ .view(arg.shape[0], arg.shape[1]) arg = arg.detach().cpu().numpy() return arg, log_prob
def actor_step(self, env_output):
    spatial_state = env_output['spatial_state'].unsqueeze(0).to(self.device)
    player_state = env_output['player_state'].unsqueeze(0).to(self.device)
    action_mask = env_output['action_mask'].to(self.device)
    #print("action_mask: ", action_mask)
    log_probs, spatial_features, nonspatial_features = self.pi(
        spatial_state, player_state, action_mask)
    #print("log_probs: ", log_probs)
    probs = torch.exp(log_probs)
    #print("probs: ", probs)
    main_action_torch = Categorical(probs).sample()  # check probs < 0?!
    main_action = main_action_torch.detach().cpu().numpy()
    log_prob = log_probs[range(len(main_action)), main_action]
    args, args_log_prob, args_indexes = self.sample_params(
        nonspatial_features, spatial_features, main_action)
    assert args_log_prob.shape == log_prob.shape, (
        "Shape mismatch between arg_log_prob and log_prob ",
        args_log_prob.shape, log_prob.shape)
    log_prob = log_prob + args_log_prob
    action_id = np.array([self.action_table[act] for act in main_action])
    sc2_env_action = [sc_actions.FunctionCall(action_id[i], args[i])
                      for i in range(len(action_id))]
    actor_output = {'log_prob': log_prob.flatten(),
                    'main_action': main_action_torch.flatten(),
                    'sc_env_action': sc2_env_action,
                    **args_indexes}  # args_indexes = {'categorical_args_indexes', 'spatial_args_indexes'}
    return actor_output
def selfplay_batch_a2c(objs, l_opt, listener, s_opt, speaker, value_coef, ent_coef):
    """ Use a learnt value function """
    # Generate batch
    a2c_info = speaker.a2c(objs)
    oh_msgs = listener.one_hot(a2c_info['msgs'])
    l_logits = listener.get_logits(oh_msgs)

    # Train listener
    l_logprobs = Categorical(logits=l_logits).log_prob(objs)
    l_logprobs = l_logprobs.sum(-1)
    l_opt.zero_grad()
    (-l_logprobs.mean()).backward(retain_graph=True)
    l_opt.step()

    # Policy gradient
    rewards = l_logprobs.detach()
    v_loss = torch.mean((a2c_info['values'] - rewards[:, None]).pow(2))
    adv = (rewards[:, None] - a2c_info['values']).detach()
    reinforce = adv * a2c_info['logprobs']
    p_loss = -reinforce.mean()
    ent_loss = -a2c_info['ents'].mean()
    s_opt.zero_grad()
    (p_loss + value_coef * v_loss + ent_coef * ent_loss).backward()
    s_opt.step()
def forward(self, state_rep):
    log_probs = self.get_log_probs(state_rep)
    probs = torch.exp(log_probs)
    torch_arg = Categorical(probs).sample()  # (batch_size, n_args)
    log_prob = log_probs.view(-1, self.max_size)[
        torch.arange(torch_arg.shape[0] * torch_arg.shape[1]), torch_arg.flatten()
    ].view(torch_arg.shape[0], torch_arg.shape[1])
    arg = torch_arg.detach().cpu().numpy()
    return arg, log_prob, torch_arg
def inspection_step(agent, inspector, state, action_mask):
    spatial_state = state['spatial']
    player_state = state['player']
    spatial_state = torch.from_numpy(spatial_state).float().to(agent.device)
    player_state = torch.from_numpy(player_state).float().to(agent.device)
    action_mask = torch.tensor(action_mask).to(agent.device)
    log_probs, spatial_features, nonspatial_features = agent.AC.pi(
        spatial_state, player_state, action_mask)
    entropy = agent.compute_entropy(log_probs)
    probs = torch.exp(log_probs)
    a = Categorical(probs).sample()
    a = a.detach().cpu().numpy()
    log_prob = log_probs[range(len(a)), a]

    ### Inspection ###
    step_dict = {}
    p = probs.detach().cpu().numpy()
    step_dict['action_distr'] = p
    step_dict['action_sel'] = a
    # Choose top 5 actions from the probabilities - check about the batch dim
    top_5 = np.argsort(p)[:, -5:]
    top_5_actions = np.array(top_5[:, ::-1])[0]  # some issues in accessing p if I don't call np.array()
    step_dict['top_5_actions'] = top_5_actions
    # Save SPATIAL distributions only of the top 5 actions + THEIR NAMES
    with torch.no_grad():
        _, _, log_probs = agent.AC.spatial_params_net(spatial_features)
        log_probs = log_probs.detach().cpu().numpy()[0]  # batch dim 1 during inspection
    step_dict['top_5_action_distr'] = {}
    for act in top_5_actions:
        step_dict['top_5_action_distr'][act] = {}
        arg_mask = agent.AC.spatial_arg_mask[act, :].astype(bool)
        arg_names = np.array(agent.AC.spatial_arg_names)[arg_mask]
        distr = log_probs[arg_mask].reshape((-1, ) + agent.AC.screen_res)
        for i, name in enumerate(arg_names):
            step_dict['top_5_action_distr'][act][name + '_distr'] = distr[i]
    ### End inspection ###

    args, args_log_prob = agent.AC.sample_params(nonspatial_features,
                                                 spatial_features, a)
    step_dict['args'] = args
    log_prob = log_prob + args_log_prob
    action_id = np.array([agent.AC.action_table[act] for act in a])
    action = [actions.FunctionCall(action_id[i], args[i])
              for i in range(len(action_id))]
    inspector.store_step(step_dict)
    return action, log_prob, torch.mean(entropy)
def evaluate(self, true_labels, all_preds, entropies, **kwargs):
    ood_entropies = np.zeros(0)
    accuracies = []
    with torch.no_grad():
        for batch_num, batch in enumerate(self.ds_loader):
            x, y = batch
            x = x.to(self.device)
            if not self.ensemble:
                out = self.model(x)
            else:
                out = 0
                for model in self.ensemble:
                    out += model(x)
                out /= len(self.ensemble)
            probs = F.softmax(out, dim=-1)
            preds, _ = torch.max(probs, dim=-1)

            # entropy
            entropy = Categorical(probs).entropy().squeeze()
            entropies = np.concatenate(
                (entropies, entropy.detach().cpu().numpy()))
            ood_entropies = np.concatenate(
                (ood_entropies, entropy.cpu().numpy()))

            # accuracy
            predictions = out.argmax(dim=-1, keepdim=True).view_as(y).cpu()
            correct = y.eq(predictions).sum().item()
            acc = correct / out.shape[0]
            accuracies.append(acc)

            true_labels = np.concatenate((true_labels, np.zeros(len(x))))
            all_preds = np.concatenate((all_preds, preds.cpu().reshape((-1))))

    auroc = calculate_auroc(true_labels, all_preds)
    aupr = calculate_aupr(true_labels, all_preds)
    auroc_entropy = calculate_auroc(1 - true_labels, entropies)
    aupr_entropy = calculate_aupr(1 - true_labels, entropies)

    auroc_name = f'auroc_{self.ds_dataset}'
    aupr_name = f'aupr_{self.ds_dataset}'
    auroc_ent_name = f'auroc_entropy_{self.ds_dataset}'
    aupr_ent_name = f'aupr_entropy_{self.ds_dataset}'
    entropy_name = f'entropy_{self.ds_dataset}'
    acc_name = f"acc_{self.ds_dataset}"
    return {
        acc_name: np.mean(accuracies),
        auroc_name: auroc,
        aupr_name: aupr,
        entropy_name: np.mean(ood_entropies),
        auroc_ent_name: auroc_entropy,
        aupr_ent_name: aupr_entropy
    }
def inspection_step(agent, state, action_mask):
    state = torch.from_numpy(state).float().to(agent.device)
    action_mask = torch.tensor(action_mask).to(agent.device)
    log_probs, spatial_features, nonspatial_features = agent.AC.pi(state, action_mask)
    probs = torch.exp(log_probs)
    entropy = agent.compute_entropy(probs)
    a = Categorical(probs).sample()
    a = a.detach().cpu().numpy()

    ### Inspection ###
    step_dict = {}
    p = probs.detach().cpu().numpy()
    step_dict['action_distr'] = p
    step_dict['action_sel'] = a
    # All this sampling is completely wrong
    with torch.no_grad():
        # select_add
        sel_arg, sel_log_prob, sel_distr = agent.AC.sample_param(
            nonspatial_features, 'select_add')
        p = sel_distr.detach().cpu().numpy()
        step_dict['selectall_distr'] = p
        #step_dict['selectall_sel'] = sel_arg
        # queued
        q_arg, q_log_prob, q_distr = agent.AC.sample_param(
            nonspatial_features, 'queued')
        p = q_distr.detach().cpu().numpy()
        step_dict['queue_distr'] = p
        #step_dict['queue_sel'] = q_arg
        # screen
        screen_arg, screen_log_prob, screen_distr = agent.AC.sample_param(
            spatial_features, 'screen')
        p = screen_distr.detach().cpu().numpy().reshape(state.shape[-2:])
        step_dict['spatial_distr'] = p
        #step_dict['spatial_sel'] = screen_arg
    ### End inspection ###

    log_prob = log_probs[range(len(a)), a]
    action_id = np.array([agent.AC.action_dict[act] for act in a])
    args, args_log_prob, args_entropy = agent.get_arguments(
        spatial_features, nonspatial_features, a)
    if move_only:
        if a[0] != 2:
            step_dict['spatial_sel'] = [0, 0]
        else:
            step_dict['spatial_sel'] = args[0][1]
    log_prob = log_prob + args_log_prob
    entropy = entropy + args_entropy
    action = [actions.FunctionCall(action_id[i], args[i])
              for i in range(len(action_id))]
    return action, log_prob, entropy, step_dict
def inspection_step(agent, inspector, state, action_mask):
    state = torch.from_numpy(state).float().to(agent.device)
    action_mask = torch.tensor(action_mask).to(agent.device)
    log_probs, spatial_features, nonspatial_features = agent.AC.pi(state, action_mask)
    probs = torch.exp(log_probs)
    entropy = agent.compute_entropy(probs)
    a = Categorical(probs).sample()
    a = a.detach().cpu().numpy()
    #embedded_a = agent._embed_action(a)

    ### Inspection ###
    step_dict = {}
    p = probs.detach().cpu().numpy()
    step_dict['action_distr'] = p
    step_dict['action_sel'] = a
    # Concatenate embedded action to spatial and nonspatial features
    #spatial_features = agent._cat_action_to_spatial(embedded_a, spatial_features)
    #nonspatial_features = agent._cat_action_to_nonspatial(embedded_a, nonspatial_features)
    # All this sampling is completely wrong - but distributions are ok
    with torch.no_grad():
        for i, name in enumerate(inspector.arg_names):
            if inspector.spatial[i]:
                insp_arg, insp_log_prob, insp_distr = agent.AC.sample_param(
                    spatial_features, name)
                p = insp_distr.detach().cpu().numpy().reshape(state.shape[-2:])
                step_dict[name + '_distr'] = p
            else:
                insp_arg, insp_log_prob, insp_distr = agent.AC.sample_param(
                    nonspatial_features, name)
                p = insp_distr.detach().cpu().numpy()
                step_dict[name + '_distr'] = p
    ### End inspection ###

    log_prob = log_probs[range(len(a)), a]
    action_id = np.array([agent.AC.action_dict[act] for act in a])
    args, args_log_prob, args_entropy = agent.get_arguments(
        spatial_features, nonspatial_features, a)
    step_dict['args'] = args
    log_prob = log_prob + args_log_prob
    entropy = entropy + args_entropy
    action = [actions.FunctionCall(action_id[i], args[i])
              for i in range(len(action_id))]
    inspector.store_step(step_dict)
    return action, log_prob, entropy
def evaluate(self, true_labels, all_preds, entropies, **kwargs):
    ood_entropies = np.zeros(0)
    with torch.no_grad():
        for batch_num, batch in enumerate(self.ood_loader):
            x, y = batch
            x = x.float().to(self.device)
            if not self.ensemble:
                out = self.model(x)
            else:
                out = 0
                for model in self.ensemble:
                    out += model(x)
                out /= len(self.ensemble)
            probs = F.softmax(out, dim=-1)
            preds, _ = torch.max(probs, dim=-1)
            entropy = Categorical(probs).entropy().squeeze()
            entropies = np.concatenate(
                (entropies, entropy.detach().cpu().numpy()))
            ood_entropies = np.concatenate(
                (ood_entropies, entropy.cpu().numpy()))
            true_labels = np.concatenate((true_labels, np.zeros(len(x))))
            all_preds = np.concatenate((all_preds, preds.cpu().reshape((-1))))

    auroc = calculate_auroc(true_labels, all_preds)
    aupr = calculate_aupr(true_labels, all_preds)
    auroc_entropy = calculate_auroc(1 - true_labels, entropies)
    aupr_entropy = calculate_aupr(1 - true_labels, entropies)

    auroc_name = f'auroc_{self.ood_dataset}'
    aupr_name = f'aupr_{self.ood_dataset}'
    auroc_ent_name = f'auroc_entropy_{self.ood_dataset}'
    aupr_ent_name = f'aupr_entropy_{self.ood_dataset}'
    entropy_name = f'entropy_{self.ood_dataset}'
    return {
        auroc_name: auroc,
        aupr_name: aupr,
        entropy_name: np.mean(ood_entropies),
        auroc_ent_name: auroc_entropy,
        aupr_ent_name: aupr_entropy
    }
def actor_step(self, env_output, hidden_state=None, cell_state=None):
    screen_layers = env_output['screen_layers'].to(self.device)
    minimap_layers = env_output['minimap_layers'].to(self.device)
    done = env_output['done'].to(self.device).view(1, 1)
    player_state = env_output['player_state'].unsqueeze(0).to(self.device)
    last_action = env_output['last_action'].to(self.device)  # add it to the output of the environment
    action_mask = env_output['action_mask'].to(self.device)

    # add time and batch dimension
    screen_layers = screen_layers.view(1, 1, *screen_layers.shape[-3:])
    minimap_layers = minimap_layers.view(1, 1, *minimap_layers.shape[-3:])

    results = self.compute_features(screen_layers, minimap_layers, player_state,
                                    last_action, hidden_state, cell_state, done)
    spatial_features, shared_features, hidden_state, cell_state = results

    log_probs = self.pi(shared_features, action_mask)
    probs = torch.exp(log_probs)
    main_action_torch = Categorical(probs).sample()  # check probs < 0?!
    main_action = main_action_torch.detach().cpu().numpy()
    log_prob = log_probs[range(len(main_action)), main_action]
    args, args_log_prob, args_indexes = self.sample_params(
        shared_features, spatial_features, main_action)
    assert args_log_prob.shape == log_prob.shape, (
        "Shape mismatch between arg_log_prob and log_prob ",
        args_log_prob.shape, log_prob.shape)
    log_prob = log_prob + args_log_prob
    action_id = np.array([self.action_table[act] for act in main_action])
    sc2_env_action = [sc_actions.FunctionCall(action_id[i], args[i])
                      for i in range(len(action_id))]
    actor_output = {'log_prob': log_prob.flatten(),
                    'main_action': main_action_torch.flatten(),
                    'sc_env_action': sc2_env_action,
                    **args_indexes}  # args_indexes = {'categorical_args_indexes', 'spatial_args_indexes'}
    return actor_output, (hidden_state, cell_state)
def selfplay_batch(objs, l_opt, listener, s_opt, speaker, ema_reward=None):
    """ Use an exponential moving-average reward (deprecated, not working).
    :return: updated average reward
    """
    # Generate batch
    idxes = (objs[:, 5] * 100000 + objs[:, 4] * 10000 + objs[:, 3] * 1000 +
             objs[:, 2] * 100 + objs[:, 1] * 10 + objs[:, 0])
    s_logits = speaker(objs)
    msgs = Categorical(logits=s_logits).sample()
    oh_msgs = listener.one_hot(msgs)
    l_logits = listener(oh_msgs)

    # Train listener
    l_logprobs = Categorical(logits=l_logits).log_prob(objs)
    l_logprobs = l_logprobs.sum(-1)
    l_opt.zero_grad()
    (-l_logprobs.mean()).backward(retain_graph=True)
    l_opt.step()

    # Policy gradient
    rewards = l_logprobs.detach()
    values = rewards.numpy()

    # Compute reward average
    if ema_reward is not None:
        ema_reward.update(values, idxes)
    else:
        ema_reward = ExponentialMovingAverager()
        ema_reward.update(values, idxes)

    s_dist = Categorical(s_logits)
    s_logprobs = s_dist.log_prob(msgs).sum(-1)
    reinforce = (rewards - torch.tensor(ema_reward.mean[idxes])) * s_logprobs
    entropy = s_dist.entropy().sum(-1)
    s_opt.zero_grad()
    (-reinforce.mean() - 0.0001 * entropy.mean()).backward()
    s_opt.step()
    return ema_reward
def evaluate(args, model, tokenizer, prefix="", test=False):
    eval_task_names = (args.task_name, )
    eval_outputs_dirs = (args.output_dir, )
    m = torch.nn.Softmax(dim=1)
    with open(args.data_dir, "r", encoding='utf8') as f:
        inputjson = [json.loads(jline) for jline in f.readlines()]
    results = {}
    ABCD = ["A", "B", "C", "D"]
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               evaluate=not test, test=test)
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            print("===evaluate===", eval_output_dir)
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu evaluate
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        entropys = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2]
                    if args.model_type in ["bert", "xlnet"] else None,  # XLM doesn't use segment_ids
                    "labels": batch[3],
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                logits = m(logits)
                entropy = Categorical(probs=logits).entropy()
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                entropys = entropy.detach().cpu().numpy()
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                entropys = np.append(entropys, entropy.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        max_pred = np.argmax(preds, axis=1)
        error = {"errors": []}
        entropy_result = defaultdict(list)
        entropy_list = []
        for i in range(len(max_pred)):
            entropy_result[inputjson[i]['id']].append([entropys[i], inputjson[i]])
        for id, eitem in entropy_result.items():
            max_e_item = sorted(eitem, key=lambda l: l[0], reverse=True)[0]
            entropy_list.append(max_e_item[1])
        for i, (p, t) in enumerate(zip(max_pred, out_label_ids)):
            error['errors'].append([
                entropys[i], "predict:" + ABCD[p], "answer:" + ABCD[t], inputjson[i]
            ])
        error['errors'] = sorted(error['errors'], key=lambda l: l[0], reverse=True)
        results.update(error)

        preds = np.argmax(preds, axis=1)
        acc = simple_accuracy(preds, out_label_ids)
        result = {"eval_acc": acc, "eval_loss": eval_loss}
        results.update(result)

        output_eval_file = os.path.join(
            eval_output_dir,
            "datadir_" + args.data_dir.replace("/", "").replace(".", "") + "_eval_results.txt")
        output_entropy_file = os.path.join(
            eval_output_dir,
            "datadir_" + args.data_dir.replace("/", "").replace(".", "") + "_entropy.jsonl")
        with jsonlines.open(output_entropy_file, mode='w') as writer:
            writer.write_all(entropy_list)
        with open(output_eval_file, "w") as w:
            w.writelines(json.dumps(i) for i in entropy_list)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(
                str(prefix) + " is test:" + str(test)))
            writer.write("model =%s\n" % str(args.model_name_or_path))
            writer.write("total batch size=%d\n" %
                         (args.per_gpu_train_batch_size *
                          args.gradient_accumulation_steps *
                          (torch.distributed.get_world_size()
                           if args.local_rank != -1 else 1)))
            writer.write("train num epochs=%d\n" % args.num_train_epochs)
            writer.write("fp16 =%s\n" % args.fp16)
            writer.write("max seq length =%d\n" % args.max_seq_length)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            for e in error['errors']:
                writer.write(str(e) + "\n")
    return results
def _batch_iteration(self,
                     x: torch.Tensor,
                     y: torch.Tensor,
                     train: bool = True,
                     targeted=False,
                     exclude_wrong_predictions=False,
                     return_acc: bool = True,
                     **kwargs):
    """ one iteration of forward-backward """

    # attack stuff
    if train:
        adv_batch_size = int(self._arguments['batch_size'] / 2)
        x_to_adv = kwargs['xun'][:adv_batch_size]
        y_to_adv = y[:adv_batch_size]
        self._model.eval()
        if self._model.is_maskable:
            self._model.apply_weight_mask()
        adv_results, _ = construct_adversarial_examples(
            x_to_adv, y_to_adv, self.attack_method, self._model, self._device,
            self.epsilon, exclude_wrong_predictions, targeted)
        _, advs, _ = adv_results
        advs = advs.cpu()
        new_advs = []
        for image in advs:
            image = self.transform(image.squeeze()).unsqueeze(0)
            new_advs.append(image)
        advs = torch.cat(new_advs)
        x = torch.cat((advs, x[adv_batch_size:]))
        self._model.train()

    # unpack
    x, y = x.to(self._device).float(), y.to(self._device)

    # update metrics
    self._metrics.update_batch(train)

    # record time
    if "cuda" in str(self._device):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    # forward pass
    if return_acc:
        accuracy, loss, out = self._forward_pass(x, y, train=train, return_acc=return_acc)
    else:
        loss, out = self._forward_pass(x, y, train=train, return_acc=return_acc)

    # backward pass
    if train:
        self._backward_pass(loss)

    # compute entropy
    probs = F.softmax(out, dim=-1)
    entropy = Categorical(probs).entropy().squeeze().mean()

    # get max predicted prob
    preds, _ = torch.max(probs, dim=-1)

    # record time
    if "cuda" in str(self._device):
        end.record()
        torch.cuda.synchronize(self._device)
        time = start.elapsed_time(end)
    else:
        time = 0

    # free memory
    for tens in [out, y, x, loss, entropy, preds]:
        tens.detach()

    if return_acc:
        return accuracy, loss.item(), time, entropy.detach().cpu(), preds.cpu()
    else:
        return loss.item(), time, entropy.detach().cpu(), preds.cpu()
def inspection_step(agent, inspector, state, action_mask):
    spatial_state = state['spatial']
    player_state = state['player']
    spatial_state = torch.from_numpy(spatial_state).float().to(agent.device)
    player_state = torch.from_numpy(player_state).float().to(agent.device)
    action_mask = torch.tensor(action_mask).to(agent.device)
    log_probs, spatial_features, nonspatial_features = agent.AC.pi(
        spatial_state, player_state, action_mask)
    entropy = agent.compute_entropy(log_probs)
    probs = torch.exp(log_probs)
    a = Categorical(probs).sample()
    a = a.detach().cpu().numpy()
    log_prob = log_probs[range(len(a)), a]

    ### Inspection ###
    step_dict = {}
    p = probs.detach().cpu().numpy()
    step_dict['action_distr'] = p
    step_dict['action_sel'] = a
    # Choose top 5 actions from the probabilities - check about the batch dim
    top_5 = np.argsort(p)[:, -5:]
    top_5_actions = np.array(top_5[:, ::-1])[0]  # some issues in accessing p if I don't call np.array()
    #print("top_5_actions: ", top_5_actions, top_5_actions.shape)
    step_dict['top_5_actions'] = top_5_actions
    # Save distributions only of the top 5 actions
    step_dict['top_5_action_distr'] = {}
    with torch.no_grad():
        for act in top_5_actions:
            step_dict['top_5_action_distr'][act] = {}  # first nested level
            arg_names = inspector.act_to_arg_names[act]
            for arg_name in arg_names:
                if inspector.arguments_type[arg_name] == 'spatial':  # it's either 'spatial' or 'categorical'
                    insp_arg, insp_log_prob, insp_distr = agent.AC.sample_param(
                        spatial_features, arg_name)
                    p = insp_distr.detach().cpu().numpy().reshape(
                        spatial_state.shape[-2:])
                else:
                    insp_arg, insp_log_prob, insp_distr = agent.AC.sample_param(
                        nonspatial_features, arg_name)
                    p = insp_distr.detach().cpu().numpy()
                step_dict['top_5_action_distr'][act][arg_name + '_distr'] = p  # second nested level
    ### End inspection ###

    args, args_log_prob, args_entropy = agent.get_arguments(
        spatial_features, nonspatial_features, a)
    step_dict['args'] = args
    log_prob = log_prob + args_log_prob
    action = [actions.FunctionCall(a[i], args[i]) for i in range(len(a))]
    inspector.store_step(step_dict)
    return action, log_prob, torch.mean(entropy)
def _batch_iteration_ood(self,
                         x: torch.Tensor,
                         y: torch.Tensor,
                         ood_x: torch.Tensor,
                         ood_y: torch.Tensor,
                         train: bool = True,
                         return_acc: bool = True):
    """ one iteration of forward-backward """

    # unpack
    x, y = x.to(self._device).float(), y.to(self._device)
    ood_x, ood_y = ood_x.to(self._device).float(), ood_y.to(self._device)

    # update metrics
    self._metrics.update_batch(train)

    # record time
    if "cuda" in str(self._device):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    # forward pass
    if return_acc:
        accuracy, loss, out = self._forward_pass_ood(x, y, ood_x, ood_y,
                                                     train=train, return_acc=return_acc)
    else:
        loss, out = self._forward_pass_ood(x, y, ood_x, ood_y,
                                           train=train, return_acc=return_acc)

    if self._arguments['prune_criterion'] == 'RigL':
        self._handle_pruning(self._metrics._epoch)

    # backward pass
    if train:
        self._backward_pass(loss)

    # compute entropy
    probs = F.softmax(out, dim=-1)
    entropy = Categorical(probs).entropy().squeeze().mean()

    # get max predicted prob
    preds, _ = torch.max(probs, dim=-1)

    # record time
    if "cuda" in str(self._device):
        end.record()
        torch.cuda.synchronize(self._device)
        time = start.elapsed_time(end)
    else:
        time = 0

    # free memory
    for tens in [out, y, x, loss, entropy, preds]:
        tens.detach()

    if return_acc:
        return accuracy, loss.item(), time, entropy.detach().cpu(), preds.cpu()
    else:
        return loss.item(), time, entropy.detach().cpu(), preds.cpu()
def train(args, nets, optimizers, env, obs_size, n_drones):
    icm_model_name = "ICM_" if args.enable_icm else ""
    log_file = f"A2C_{icm_model_name}{args.policy}.log"
    logging.basicConfig(filename=log_file, level=logging.INFO, format="%(message)s")
    steps = []
    total_steps = 0
    ep_rewards = 0.0
    grad_step = 0
    if args.enable_icm:
        icm = ICM(obs_size=obs_size, action_space=env.action_size)
    pbar = tqdm(total=args.total_steps)
    while total_steps < args.total_steps:
        obs = env.reset()
        drone_pos = np.array(env.n_drones_pos)
        obs, drone_pos = prepare_inputs(args, obs, drone_pos, n_drones, obs_size)
        curr_state = obs
        avg_rewards = []
        for _ in range(args.rollout_steps):
            # network forward pass
            policies = []
            values = []
            actions = []
            for i in range(n_drones):
                p, v = nets[i](obs, drone_pos)
                probs = F.softmax(p, dim=-1)
                a = Categorical(probs).sample()[0]
                policies.append(p)
                values.append(v)
                actions.append(a.detach().unsqueeze(0).numpy())

            # gather env data, reset done envs and update their obs
            obs, rewards, dones = env.step(actions)
            ep_rewards += rewards
            if dones:
                ep_rewards = 0.0
                obs = env.reset()
                drone_pos = np.array(env.n_drones_pos)
            obs, drone_pos = prepare_inputs(args, obs, drone_pos, n_drones, obs_size)
            next_state = obs
            avg_rewards.append(rewards)
            if args.enable_icm:
                ## ICM for one drone
                rewards += icm(
                    a_t=torch.tensor(actions[0], dtype=torch.long),
                    a_t_logits=policies[0].detach(),
                    s_t=curr_state,
                    s_t1=next_state,
                )

            # reset the LSTM state for done envs
            masks = (
                1.0 - torch.from_numpy(np.array([dones], dtype=np.float32))
            ).unsqueeze(1)
            total_steps += 1
            pbar.update(1)
            rewards = torch.tensor([rewards]).float().unsqueeze(1)
            actions = torch.tensor(actions)
            policies = torch.cat(policies)
            values = torch.cat(values)
            steps.append((rewards, masks, actions, policies, values))

        final_obs = obs
        final_drone_pos = drone_pos
        final_values = []
        for i in range(n_drones):
            _, final_v = nets[i](final_obs, final_drone_pos)
            final_values.append(final_v)
        final_values = torch.cat(final_values)
        steps.append((None, None, None, None, final_values))
        actions, policies, values, returns, advantages = process_rollout(args, steps)

        probs = F.softmax(policies, dim=-1)
        log_probs = F.log_softmax(policies, dim=-1)
        log_action_probs = log_probs.clone()
        policy_loss = (-log_action_probs * Variable(advantages)).mean()
        value_loss = advantages.pow(2).mean()
        entropy_loss = (-log_probs * probs).mean()
        loss = (
            policy_loss
            + value_loss * args.value_coeff
            + entropy_loss * args.entropy_coeff
        )
        loss.backward()
        if (grad_step + 1) % args.grad_acc == 0:
            for i in range(n_drones):
                torch.nn.utils.clip_grad_norm_(
                    nets[i].parameters(), args.grad_norm_limit
                )
                optimizers[i].step()
                optimizers[i].zero_grad()
        grad_step += 1
        steps = []
        if total_steps % args.save_freq == 0:
            for i in range(n_drones):
                torch.save(
                    nets[i].state_dict(),
                    f"A2C_models/{args.policy}_policy/A2C_drone_{icm_model_name}{i}.bin",
                )
        pbar.set_postfix(loss=loss.item(), reward=np.mean(avg_rewards))
        logging.info(f"loss: {loss.item()}, reward: {np.mean(avg_rewards)}")
def _batch_iteration(self,
                     x: torch.Tensor,
                     y: torch.Tensor,
                     train: bool = True,
                     return_acc: bool = True,
                     **kwargs):
    """ one iteration of forward-backward """

    # unpack
    x, y = x.to(self._device).float(), y.to(self._device)

    # update metrics
    self._metrics.update_batch(train)

    # record time
    if "cuda" in str(self._device):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

    # forward pass
    if return_acc:
        accuracy, loss, out = self._forward_pass(x, y, train=train, return_acc=return_acc)
    else:
        loss, out = self._forward_pass(x, y, train=train, return_acc=return_acc)

    if self._arguments['prune_criterion'] == 'RigL':
        self._handle_pruning(self._metrics._epoch)

    # backward pass
    if train:
        self._backward_pass(loss)

    # compute entropy
    probs = F.softmax(out, dim=-1)
    entropy = Categorical(probs).entropy().squeeze().mean()

    # get max predicted prob
    preds, _ = torch.max(probs, dim=-1)

    # AUGERINO (disabled)
    # loss = 0
    # accuracy = 0
    # out = 0
    # for i, (im, crit) in enumerate(zip(x, y)):
    #     batch = [im.unsqueeze(0)]
    #     batch_y = [torch.tensor(crit)]
    #     for _ in range(4):
    #         batch.append(self.augerino(im.unsqueeze(0)))
    #         batch_y.append(torch.tensor(crit))
    #     batch = torch.cat(batch)
    #     batch_y = torch.tensor(batch_y, device=self._device)
    #     if return_acc:
    #         _accuracy, _loss, _out = self._forward_pass(batch, batch_y, train=train, return_acc=return_acc)
    #         accuracy += (_accuracy)
    #     else:
    #         _loss, _out = self._forward_pass(batch, batch_y, train=train, return_acc=return_acc)
    #     loss += _loss
    #     out += _out
    # loss = loss / len(x)
    # accuracy = accuracy / len(x)
    # out = out / len(x)

    # record time
    if "cuda" in str(self._device):
        end.record()
        torch.cuda.synchronize(self._device)
        time = start.elapsed_time(end)
    else:
        time = 0

    # free memory
    for tens in [out, y, x, loss, entropy, preds]:
        tens.detach()

    if return_acc:
        return accuracy, loss.item(), time, entropy.detach().cpu(), preds.cpu()
    else:
        return loss.item(), time, entropy.detach().cpu(), preds.cpu()
def entropy_categorical(categorical_parameters):
    entropy = Categorical(categorical_parameters).entropy()
    # TODO: discuss whether we want numpy in these functions
    assert_no_nan_no_inf(entropy)
    entropy = entropy.detach().numpy()
    return entropy
def get_llhs(self, obs, actions):
    with torch.no_grad():
        pi = self.p_net(obs)
        llhs = Categorical(pi).log_prob(actions)
        return llhs.detach()