def play_episode(self, episode: int):
    # reset the environment to a random initial state at the start of every episode
    state = self.env.reset()
    previous_x = None

    episode_actions = torch.empty(size=(0,), dtype=torch.long, device=self.device)
    episode_logits = torch.empty(size=(0, self.env.action_space.n), device=self.device)
    average_rewards = numpy.empty(shape=(0,), dtype=numpy.float64)
    episode_rewards = numpy.empty(shape=(0,), dtype=numpy.float64)

    while True:
        # if not self.render:
        #     self.env.render()

        # use the difference of consecutive preprocessed frames to capture motion
        current_x = self.PreProcessing(state)
        x = current_x - previous_x if previous_x is not None else numpy.zeros_like(current_x)
        previous_x = current_x

        action_logits = self.agent(torch.tensor(x).float().unsqueeze(dim=0).to(self.device))
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)

        action = Categorical(logits=action_logits).sample()
        episode_actions = torch.cat((episode_actions, action), dim=0)

        state, reward, done, _ = self.env.step(action=action.cpu().item())
        episode_rewards = numpy.concatenate((episode_rewards, numpy.array([reward])), axis=0)
        # running mean of the rewards collected so far, used as a state-specific baseline
        average_rewards = numpy.concatenate(
            (average_rewards, numpy.expand_dims(numpy.mean(episode_rewards), axis=0)), axis=0)

        if done:
            episode += 1

            # rewards-to-go, baselined and normalised
            discounted_rewards = PG_RL.get_discounted_rewards(rewards=episode_rewards, gamma=self.gamma)
            discounted_rewards -= average_rewards
            discounted_rewards /= numpy.std(discounted_rewards)

            sum_of_rewards = numpy.sum(episode_rewards)

            # mask selects the log-probabilities of the actions actually taken
            mask = one_hot(episode_actions, num_classes=self.env.action_space.n)
            episode_log_probs = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)
            episode_weighted_log_probs = episode_log_probs * torch.tensor(discounted_rewards).float().to(self.device)
            sum_weighted_log_probs = torch.sum(episode_weighted_log_probs).unsqueeze(dim=0)

            # show_video()
            return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
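# Hedged sketch (assumption): `PG_RL.get_discounted_rewards` is referenced above but not
# shown in this section. A common implementation computes the discounted rewards-to-go
# for each time step with discount factor gamma.
import numpy as np


def get_discounted_rewards(rewards: np.ndarray, gamma: float) -> np.ndarray:
    """Return the discounted rewards-to-go for each time step of an episode."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running_sum = 0.0
    # iterate backwards so each entry accumulates the discounted future return
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted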
def agent_step(self):
    with torch.no_grad():
        action_probs, log_probs, termination_probs, q_u, q_omega = self.policy(self.state)

        # pick a new option epsilon-greedily over the option values when none is active
        if self.current_option is None:
            self.current_option = Categorical(probs=self._epsilon_probs(q_omega[0])).sample()

        # sample a primitive action from the intra-option policy of the current option
        action = Categorical(probs=action_probs[0, self.current_option, :]).sample()
        action = action.cpu().detach().numpy()
        # action = self.env.action_space.sample()
        action = int(action)

        next_state, reward, done, info = self.env.step(action)
        # self.env.render()

        self.replay_buffer.add([
            self.state, action, self.current_option, self.previous_option,
            reward, next_state, done
        ])

        self.state = next_state
        self.previous_option = self.current_option

        if done:
            self.agent_reset()
        elif termination_probs[0, self.current_option] >= torch.rand(1):
            # terminate the current option stochastically according to its termination probability
            self.current_option = None

        return reward, done
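# Hedged sketch (assumption): `self._epsilon_probs` is not defined in this section.
# One plausible implementation returns an epsilon-greedy distribution over the option
# values q_omega, so that Categorical(probs=...) picks the greedy option with
# probability 1 - epsilon and explores uniformly otherwise.
import torch


def epsilon_probs(q_omega: torch.Tensor, epsilon: float = 0.1) -> torch.Tensor:
    # uniform exploration mass plus a greedy bonus on the best-valued option
    probs = torch.full_like(q_omega, epsilon / q_omega.numel())
    probs[q_omega.argmax()] += 1.0 - epsilon
    return probs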
def evaluate(self, true_labels, all_preds, entropies, **kwargs):
    ood_entropies = np.zeros(0)
    accuracies = []

    with torch.no_grad():
        for batch_num, batch in enumerate(self.ds_loader):
            x, y = batch
            x = x.to(self.device)

            if not self.ensemble:
                out = self.model(x)
            else:
                out = 0
                for model in self.ensemble:
                    out += model(x)
                out /= len(self.ensemble)

            probs = F.softmax(out, dim=-1)
            preds, _ = torch.max(probs, dim=-1)

            # entropy
            entropy = Categorical(probs).entropy().squeeze()
            entropies = np.concatenate((entropies, entropy.detach().cpu().numpy()))
            ood_entropies = np.concatenate((ood_entropies, entropy.cpu().numpy()))

            # accuracy
            predictions = out.argmax(dim=-1, keepdim=True).view_as(y).cpu()
            correct = y.eq(predictions).sum().item()
            acc = correct / out.shape[0]
            accuracies.append(acc)

            true_labels = np.concatenate((true_labels, np.zeros(len(x))))
            all_preds = np.concatenate((all_preds, preds.cpu().reshape((-1))))

    auroc = calculate_auroc(true_labels, all_preds)
    aupr = calculate_aupr(true_labels, all_preds)
    auroc_entropy = calculate_auroc(1 - true_labels, entropies)
    aupr_entropy = calculate_aupr(1 - true_labels, entropies)

    auroc_name = f'auroc_{self.ds_dataset}'
    aupr_name = f'aupr_{self.ds_dataset}'
    auroc_ent_name = f'auroc_entropy_{self.ds_dataset}'
    aupr_ent_name = f'aupr_entropy_{self.ds_dataset}'
    entropy_name = f'entropy_{self.ds_dataset}'
    acc_name = f"acc_{self.ds_dataset}"

    return {
        acc_name: np.mean(accuracies),
        auroc_name: auroc,
        aupr_name: aupr,
        entropy_name: np.mean(ood_entropies),
        auroc_ent_name: auroc_entropy,
        aupr_ent_name: aupr_entropy
    }
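# Hedged sketch (assumption): `calculate_auroc` and `calculate_aupr` are imported from
# elsewhere in this codebase; they are presumably thin wrappers around scikit-learn's
# ranking metrics, as sketched below.
from sklearn.metrics import average_precision_score, roc_auc_score


def calculate_auroc(true_labels, scores):
    # area under the ROC curve for the in- vs. out-of-distribution labels
    return roc_auc_score(true_labels, scores)


def calculate_aupr(true_labels, scores):
    # area under the precision-recall curve
    return average_precision_score(true_labels, scores)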
def play_ep(self):
    # reset env state after every episode
    state = self.env.reset()
    prev_x = None

    episode_actions = torch.empty(size=(0,), dtype=torch.long, device=self.device)
    episode_logits = torch.empty(size=(0, 2), device=self.device)
    average_rewards = np.empty(shape=(0,), dtype=np.float64)
    episode_rewards = np.empty(shape=(0,), dtype=np.float64)

    while True:
        # render env for display
        if self.render_env:
            self.env.render()

        # pre-process the current state and subtract the previous state to add in motion information
        cur_x = prepro(state)
        x = cur_x - prev_x if prev_x is not None else np.zeros(self.in_sz).astype(np.float32)
        prev_x = cur_x

        # get action logits from the network
        action_logit = self.agent(torch.tensor(x).float().unsqueeze(0).to(self.device))

        # add to buffer
        episode_logits = torch.cat((episode_logits, action_logit), dim=0)

        # sample an action and execute it
        action = Categorical(logits=action_logit).sample()

        # add to buffer
        episode_actions = torch.cat((episode_actions, action), dim=0)

        state, reward, done, _ = self.env.step(action=action.cpu().item())

        # add to buffer
        episode_rewards = np.concatenate((episode_rewards, np.array([reward])), axis=0)

        # running average from step 1 to the nth time step (on-average return up to that step)
        average_rewards = np.concatenate(
            (average_rewards, np.expand_dims(np.mean(episode_rewards), axis=0)), axis=0)

        if reward != 0:
            # Pong gives a +1 or -1 reward exactly when a point ends
            print(('ep #: game finished, reward: %f' % (reward)) +
                  ('' if reward == -1 else ' !!!!!!!!'))

        if done:  # end of episode
            # get discounted rewards-to-go
            discounted_rewards = discount_rewards(episode_rewards, gamma=self.gamma)
            # subtract baseline rewards
            discounted_rewards -= average_rewards

            # set mask for the actions executed
            mask = one_hot(episode_actions, num_classes=2)

            # similar to cross-entropy for classification, but with the sampled actions as labels
            weighted_ps = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)

            # weight the log-probabilities with the discounted rewards to get the expected return
            episode_weighted_loss = weighted_ps * torch.tensor(discounted_rewards).float().to(self.device)

            return episode_weighted_loss, episode_logits, episode_rewards
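# Hedged sketch (assumption): `prepro` is used above but not defined here. A common Pong
# preprocessing (in the style of Karpathy's pg-pong) crops, downsamples and binarizes the
# 210x160x3 frame into a flat 6400-dim float vector; `discount_rewards` would mirror the
# rewards-to-go helper sketched earlier in this section.
import numpy as np


def prepro(frame: np.ndarray) -> np.ndarray:
    frame = frame[35:195][::2, ::2, 0].copy()  # crop the playing field, downsample by 2, keep one channel
    frame[frame == 144] = 0                    # erase background type 1
    frame[frame == 109] = 0                    # erase background type 2
    frame[frame != 0] = 1                      # paddles and ball set to 1
    return frame.astype(np.float32).ravel()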
def get_action(self, state_np):
    state_th = torch.tensor(state_np).float()
    action_th = self.forward(state_th)

    if self.type == 'discrete':
        action_sampled_th = Categorical(logits=action_th).sample()
    else:
        raise NotImplementedError

    action_sampled_np = action_sampled_th.cpu().detach().numpy()
    return action_sampled_np
def choose_action(self, states, buffer=True):
    probs, values = self.forward(states)
    # print("values:", values)
    # print("probs:", probs)
    actions = Categorical(probs).sample()

    if buffer:
        self.state_buffer.append(states)
        self.value_buffer.append(values)
        self.prob_buffer.append(probs)
        self.action_buffer.append(torch.unsqueeze(actions, 1))

    # print("actions:", actions)
    actions = actions.cpu().numpy() + 1
    values = values.detach().cpu().numpy()
    probs = probs.detach().cpu().numpy()
    return actions, values, probs
def evaluate(self, true_labels, all_preds, entropies, **kwargs):
    ood_entropies = np.zeros(0)

    with torch.no_grad():
        for batch_num, batch in enumerate(self.ood_loader):
            x, y = batch
            x = x.float().to(self.device)

            if not self.ensemble:
                out = self.model(x)
            else:
                out = 0
                for model in self.ensemble:
                    out += model(x)
                out /= len(self.ensemble)

            probs = F.softmax(out, dim=-1)
            preds, _ = torch.max(probs, dim=-1)

            entropy = Categorical(probs).entropy().squeeze()
            entropies = np.concatenate((entropies, entropy.detach().cpu().numpy()))
            ood_entropies = np.concatenate((ood_entropies, entropy.cpu().numpy()))

            true_labels = np.concatenate((true_labels, np.zeros(len(x))))
            all_preds = np.concatenate((all_preds, preds.cpu().reshape((-1))))

    auroc = calculate_auroc(true_labels, all_preds)
    aupr = calculate_aupr(true_labels, all_preds)
    auroc_entropy = calculate_auroc(1 - true_labels, entropies)
    aupr_entropy = calculate_aupr(1 - true_labels, entropies)

    auroc_name = f'auroc_{self.ood_dataset}'
    aupr_name = f'aupr_{self.ood_dataset}'
    auroc_ent_name = f'auroc_entropy_{self.ood_dataset}'
    aupr_ent_name = f'aupr_entropy_{self.ood_dataset}'
    entropy_name = f'entropy_{self.ood_dataset}'

    return {
        auroc_name: auroc,
        aupr_name: aupr,
        entropy_name: np.mean(ood_entropies),
        auroc_ent_name: auroc_entropy,
        aupr_ent_name: aupr_entropy
    }
def _step(self, obs, hiddens, masks):
    with torch.no_grad():
        values, action_probs, hiddens = self.model(obs, hiddens, masks)
        # Sample actions from the output distributions
        actions = Categorical(action_probs.detach()).sample()

    obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

    obs = torch.from_numpy(obs)
    rewards = torch.from_numpy(rewards).unsqueeze(1)
    masks = torch.from_numpy(1 - dones).unsqueeze(1)
    actions = actions.unsqueeze(1)

    self.rollouts.insert(
        obs,           # next
        hiddens,       # next
        actions,       # now
        action_probs,  # now
        values,        # now
        rewards,       # now
        masks)         # next
def get_next_batch(self, env):
    for _ in range(C.NUM_EPOCHS):
        epoch_logits = torch.empty(size=(0, self.action_space_size), device=self.DEVICE)
        epoch_weighted_log_probs = torch.empty(size=(0,), dtype=torch.float, device=self.DEVICE)
        total_rewards = deque([], maxlen=C.BATCH_SIZE_PER_THREAD)

        episode_counter = 0
        while episode_counter < C.BATCH_SIZE_PER_THREAD:
            episode_counter += 1

            # reset the environment to a random initial state every episode
            state = env.reset()

            # initialize the episode arrays
            episode_actions = torch.empty(size=(0,), dtype=torch.long, device=self.DEVICE)
            episode_logits = torch.empty(size=(0, C.action_space_size), device=self.DEVICE)
            average_rewards = np.empty(shape=(0,), dtype=np.float64)
            episode_rewards = np.empty(shape=(0,), dtype=np.float64)

            # episode loop
            for step_index in range(0, C.max_simulation_length):
                # get the action logits from the agent - (preferences)
                action_logits = self.m(
                    torch.tensor(state).float().unsqueeze(dim=0).to(self.DEVICE))

                # append the logits to the episode logits list
                episode_logits = torch.cat((episode_logits, action_logits), dim=0)

                # sample an action according to the action distribution
                action = Categorical(logits=action_logits).sample()

                # append the action to the episode action list to obtain the trajectory
                # we need to store the actions and logits so we could calculate the gradient of the performance
                episode_actions = torch.cat((episode_actions, action), dim=0)

                # take the chosen action, observe the reward and the next state
                state, reward, done, _ = env.step(action=action.cpu().item())

                # append the reward to the rewards pool that we collect during the episode
                # we need the rewards so we can calculate the weights for the policy gradient
                # and the baseline of average
                episode_rewards = np.concatenate((episode_rewards, np.array([reward])), axis=0)

                # here the average reward is state specific
                average_rewards = np.concatenate(
                    (average_rewards, np.expand_dims(np.mean(episode_rewards), axis=0)),
                    axis=0)

            # turn the rewards we accumulated during the episode into the rewards-to-go:
            # earlier actions are responsible for more rewards than the later taken actions
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=C.GAMMA)
            discounted_rewards_to_go -= average_rewards  # baseline - state specific average

            # calculate the sum of the rewards for the running average metric
            sum_of_rewards = np.sum(episode_rewards)

            # after each episode append the sum of total rewards to the deque
            total_rewards.append(sum_of_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions, num_classes=C.action_space_size)

            # calculate the log-probabilities of the taken actions
            # mask is needed to filter out log-probabilities of unrelated logits
            episode_log_probs = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(self.DEVICE)

            # calculate the sum over trajectory of the weighted log-probabilities
            sum_weighted_log_probs = torch.sum(episode_weighted_log_probs).unsqueeze(dim=0)

            # append the weighted log-probabilities of actions
            epoch_weighted_log_probs = torch.cat(
                (epoch_weighted_log_probs, sum_weighted_log_probs), dim=0)

            # append the logits - needed for the entropy bonus calculation
            epoch_logits = torch.cat((epoch_logits, episode_logits), dim=0)

        # calculate the loss
        loss, entropy = utils.calculate_loss(
            C.BETA,
            epoch_logits=epoch_logits,
            weighted_log_probs=epoch_weighted_log_probs)

        yield loss, total_rewards
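# Hedged sketch (assumption): `utils.calculate_loss` is not shown in this section. In this
# vanilla-policy-gradient setup it typically averages the negative weighted log-probabilities
# and subtracts a BETA-scaled entropy bonus computed from the collected logits.
import torch
from torch.nn.functional import log_softmax, softmax


def calculate_loss(beta: float, epoch_logits: torch.Tensor, weighted_log_probs: torch.Tensor):
    # policy-gradient surrogate: maximize the reward-weighted log-probabilities
    policy_loss = -torch.mean(weighted_log_probs)
    # mean entropy of the per-step action distributions over the epoch
    probs = softmax(epoch_logits, dim=1)
    entropy = -torch.mean(torch.sum(probs * log_softmax(epoch_logits, dim=1), dim=1))
    # entropy bonus encourages exploration
    return policy_loss - beta * entropy, entropy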
def _update(self, states, actions, rewards, advantages, returns, masks, epoch):
    old_model = copy.deepcopy(self.model)
    policy_losses = np.array([])
    entropies = np.array([])
    value_losses = np.array([])
    losses = np.array([])

    for _ in range(self.ppo_epochs):
        rand_list = (torch.randperm(self.batch_num * self.batch_size).view(
            -1, self.batch_size).tolist())

        for ind in rand_list:
            batch = states[ind]
            actor_logits, vals, _ = self.model(batch)
            log_probs = F.log_softmax(actor_logits, dim=1)

            with torch.no_grad():
                old_actor_logits, _, _ = old_model(batch)
                old_log_probs = F.log_softmax(old_actor_logits, dim=1)

            adv = advantages[ind].to(self.device)
            advs = advantages.to(self.device)
            adv = (adv - advs.mean()) / (advs.std() + 1e-8)
            A = returns[ind].to(self.device) - vals

            action = actions[ind].to(self.device)
            old_log_probs = old_log_probs.gather(1, action)
            log_probs = log_probs.gather(1, action)

            r = (log_probs - old_log_probs).exp()
            clip = r.clamp(min=1 - self.epsilon, max=1 + self.epsilon)
            L, _ = torch.stack([r * adv.detach(), clip * adv.detach()]).min(0)

            v_l = A.pow(2).mean()
            L = L.mean()
            entropy = Categorical(F.softmax(actor_logits, dim=1)).entropy().mean()
            loss = -L + self.v_loss_coef * v_l - self.entropy_coef * entropy

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()

            policy_losses = np.append(policy_losses, L.cpu().detach().numpy())
            value_losses = np.append(value_losses, v_l.cpu().detach().numpy())
            losses = np.append(losses, loss.cpu().detach().numpy())
            entropies = np.append(entropies, entropy.cpu().detach().numpy())

    policy_loss = policy_losses.mean()
    value_loss = value_losses.mean()
    loss = losses.mean()
    entropy = entropies.mean()

    self.writer.add_scalar("PolicyLoss", policy_loss, epoch + 1)
    self.writer.add_scalar("ValueLoss", value_loss, epoch + 1)
    self.writer.add_scalar("Loss", loss, epoch + 1)
    self.writer.add_scalar("Entropy", entropy, epoch + 1)

    del states, actions, rewards, advantages, returns, masks
def play_episode(environment, device, action_space_size, agent, gamma, episode: int):
    """
    Plays an episode of the environment.
    episode: the episode counter
    Returns:
        sum_weighted_log_probs: the sum of the log-prob of an action multiplied by the reward-to-go from that state
        episode_logits: the logits of every step of the episode - needed to compute entropy for entropy bonus
        sum_of_rewards: sum of the rewards for the episode - needed for the average over 200 episode statistic
        episode: the incremented episode counter
    """
    agent.to('cpu')
    device = 'cpu'

    # reset the environment to a random initial state every episode
    state = environment.reset()

    # initialize the episode arrays
    episode_actions = torch.empty(size=(0,), dtype=torch.long, device=device)
    episode_logits = torch.empty(size=(0, action_space_size), device=device)
    average_rewards = np.empty(shape=(0,), dtype=np.float64)
    episode_rewards = np.empty(shape=(0,), dtype=np.float64)

    # episode loop
    while True:
        # get the action logits from the agent - (preferences)
        action_logits = agent(torch.tensor(state).float().unsqueeze(dim=0).to(device))
        # print('action logits is', action_logits)

        # append the logits to the episode logits list
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)

        # sample an action according to the action distribution
        action = Categorical(logits=action_logits).sample()
        # print('the action after categorical is', action)

        # append the action to the episode action list to obtain the trajectory
        # we need to store the actions and logits so we could calculate the gradient of the performance
        episode_actions = torch.cat((episode_actions, action), dim=0)

        # take the chosen action, observe the reward and the next state
        state, reward, done, _ = environment.step(action=action.cpu().item())

        # append the reward to the rewards pool that we collect during the episode
        # we need the rewards so we can calculate the weights for the policy gradient
        # and the baseline of average
        episode_rewards = np.concatenate((episode_rewards, np.array([reward])), axis=0)

        # here the average reward is state specific
        average_rewards = np.concatenate(
            (average_rewards, np.expand_dims(np.mean(episode_rewards), axis=0)), axis=0)

        # the episode is over
        if done:
            # increment the episode
            episode += 1

            # turn the rewards we accumulated during the episode into the rewards-to-go:
            # earlier actions are responsible for more rewards than the later taken actions
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=gamma)
            discounted_rewards_to_go -= average_rewards  # baseline - state specific average

            # calculate the sum of the rewards for the running average metric
            sum_of_rewards = np.sum(episode_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions, num_classes=environment.action_space.n)

            # calculate the log-probabilities of the taken actions
            # mask is needed to filter out log-probabilities of unrelated logits
            episode_log_probs = torch.sum(mask.float() * log_softmax(episode_logits, dim=1), dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(device)

            # calculate the sum over trajectory of the weighted log-probabilities
            sum_weighted_log_probs = torch.sum(episode_weighted_log_probs).unsqueeze(dim=0)

            # make sure the returned tensors live on the (CPU) device
            sum_weighted_log_probs = sum_weighted_log_probs.to(device)
            episode_logits = episode_logits.to(device)

            return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
def inference(
    self,
    sent_memory_emb,
    graph_memory_emb,
    sent_memory_mask,
    graph_memory_mask,
    max_step,
    use_sampling=False,
):
    batch_size, sent_memory_seq, dim = list(sent_memory_emb.shape)
    _, graph_memory_seq, _ = list(graph_memory_emb.shape)

    sent_memory_mask_inv = sent_memory_mask == 0  # [batch, sent_memory_seq]
    graph_memory_mask_inv = graph_memory_mask == 0  # [batch, graph_memory_seq]

    target_ids = [[self.BOS for i in range(batch_size)]]  # [target_seq, batch]
    target_mask = [[1.0] for i in range(batch_size)]  # [batch, target_seq]
    target_prob = []  # [target_seq, batch]
    is_finish = [False for _ in range(batch_size)]
    rows = torch.arange(batch_size).to(device)

    for step in range(max_step):
        cur_seq = step + 1

        cur_emb = self.dec_word_embedding(
            torch.tensor(target_ids).to(device))  # [cur_seq, batch, dim]
        cur_emb = self.position_encoder(cur_emb)  # [cur_seq, batch, dim]

        cur_mask = torch.tensor(target_mask).to(device)
        cur_mask_inv = cur_mask == 0.0  # [batch, cur_seq]

        cur_triu_mask = torch.triu(torch.ones(cur_seq, cur_seq).to(device),
                                   diagonal=1)  # [cur_seq, cur_seq]
        cur_triu_mask.masked_fill_(cur_triu_mask == 1, -1e20)

        cur_emb = self.decoder(
            cur_emb,
            sent_memory_emb,  # [batch, sent_len, dim]
            graph_memory_emb,  # [batch, graph_len, dim]
            tgt_mask=cur_triu_mask,
            tgt_key_padding_mask=cur_mask_inv,
            sent_memory_key_padding_mask=sent_memory_mask_inv,
            graph_memory_key_padding_mask=graph_memory_mask_inv,
        )  # [batch, cur_seq, dim]
        assert has_nan(cur_emb) is False

        # break after the first time when all items are finished
        if all(is_finish) or step == max_step - 1:
            cur_len = cur_mask.sum(dim=1).long()
            target_vec = universal_sentence_embedding(cur_emb, cur_mask, cur_len)
            break

        # generating step outputs
        logits = self.projector(cur_emb[:, -1, :]).view(
            batch_size, self.word_vocab_size)  # [batch, vocab]

        if use_sampling is False:
            indices = logits.argmax(dim=1)  # [batch]
        else:
            indices = Categorical(logits=logits).sample()  # [batch]

        prob = F.softmax(logits, dim=1)[rows, indices]  # [batch]
        target_prob.append(prob)

        indices = indices.cpu().tolist()
        target_ids.append(indices)
        for i in range(batch_size):
            # based on whether the item was already finished at the last step
            target_mask[i].append(0.0 if is_finish[i] else 1.0)
        for i in range(batch_size):
            is_finish[i] |= indices[i] == self.EOS

    target_ids = list(map(list, zip(*target_ids[1:])))  # [batch, target_seq]
    target_mask = torch.tensor([x[1:] for x in target_mask]).to(device)  # [batch, target_seq]
    target_prob = torch.stack(target_prob, dim=1)  # [batch, target_seq]

    return target_vec, target_ids, target_prob, target_mask
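# Hedged sketch (assumption): `universal_sentence_embedding` is imported from elsewhere;
# it is presumably a masked mean-pool over the sequence dimension of the decoder output,
# using the lengths computed from the target mask.
import torch


def universal_sentence_embedding(hidden: torch.Tensor, mask: torch.Tensor,
                                 lengths: torch.Tensor) -> torch.Tensor:
    # hidden: [batch, seq, dim], mask: [batch, seq] of 0/1, lengths: [batch]
    summed = torch.sum(hidden * mask.unsqueeze(-1), dim=1)      # [batch, dim]
    return summed / lengths.clamp(min=1).unsqueeze(-1).float()  # [batch, dim]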
def predict_mstcn(self,
                  model_dir,
                  results_dir,
                  features_path,
                  vid_list_file,
                  epoch,
                  actions_dict,
                  device,
                  sample_rate,
                  bsn_result_path,
                  mstcn_use_lbp,
                  poolingLength=99):
    self.model.eval()
    inverse_dict = {v: k for k, v in actions_dict.items()}
    lbp = LocalBarrierPooling(poolingLength)
    lbp = lbp.to(device)

    with torch.no_grad():
        self.model.to(device)
        self.model.load_state_dict(
            torch.load(model_dir + "/epoch-" + str(epoch) + ".model"))

        file_ptr = open(vid_list_file, 'r')
        list_of_vids = file_ptr.read().split('\n')[:-1]
        file_ptr.close()

        for vid in list_of_vids:
            print(vid)
            features = np.load(features_path + vid.split('.')[0] + '.npy')
            features = features[:, ::sample_rate]

            if mstcn_use_lbp:
                num_frames = np.shape(features)[1]
                barrier_file = bsn_result_path + vid + ".csv"
                barrier = np.array(pd.read_csv(barrier_file))
                temporal_scale = np.shape(barrier)[0]
                barrier = np.transpose(barrier)
                barrier = torch.tensor(barrier, dtype=torch.float)  # size=[num_frames]
                if temporal_scale <= num_frames:
                    resize_barrier = F.interpolate(barrier,
                                                   size=num_frames,
                                                   mode='nearest')
                else:
                    resize_barrier = barrier
                resize_barrier = resize_barrier.unsqueeze(0)
                resize_barrier = resize_barrier.unsqueeze(0)  # size=[1,1,num_frames]
                resize_barrier = resize_barrier.to(device)

            input_x = torch.tensor(features, dtype=torch.float)
            input_x.unsqueeze_(0)
            input_x = input_x.to(device)

            predictions = self.model(input_x,
                                     torch.ones(input_x.size(), device=device))
            predictions = predictions[-1]

            if mstcn_use_lbp:
                if temporal_scale <= num_frames:
                    predictions = lbp(predictions, resize_barrier)
                else:
                    predictions = F.interpolate(predictions,
                                                size=temporal_scale,
                                                mode='linear',
                                                align_corners=False)
                    predictions = lbp(predictions, resize_barrier)
                    predictions = F.interpolate(predictions,
                                                size=num_frames,
                                                mode='linear',
                                                align_corners=False)

            predictions = F.softmax(predictions, dim=1)

            # per-frame prediction entropy, written to a separate file
            entropy = Categorical(
                probs=predictions.squeeze(0).transpose(1, 0)).entropy()
            entropy = entropy.cpu().numpy().astype(str)
            f_name = vid.split('/')[-1].split('.')[0]
            f_ptr = open(results_dir + "/entropy_" + f_name, "w")
            f_ptr.write(' '.join(entropy))
            f_ptr.close()

            _, predicted = torch.max(predictions.data, 1)
            predicted = predicted.squeeze()
            recognition = []
            for i in range(len(predicted)):
                recognition = np.concatenate(
                    (recognition,
                     [inverse_dict[predicted[i].item()]] * sample_rate))

            f_name = vid.split('/')[-1].split('.')[0]
            f_ptr = open(results_dir + "/" + f_name, "w")
            f_ptr.write("### Frame level recognition: ###\n")
            f_ptr.write(' '.join(recognition))
            f_ptr.close()
def evaluate(self, **kwargs):
    true_labels = np.zeros(0)
    all_preds = np.zeros(0)
    all_correct = np.zeros(0)
    conf_true_labels = np.zeros(0)
    brier_scores = []
    entropies = np.zeros(0)
    acc = []
    nll = []

    with torch.no_grad():
        for batch_num, batch in enumerate(self.test_loader):
            x, y = batch
            x = x.to(self.device)

            if not self.ensemble:
                out = self.model(x)
            else:
                out = 0
                for model in self.ensemble:
                    out += model(x)
                out /= len(self.ensemble)

            # Logits to probability distribution
            probs = F.softmax(out, dim=-1)

            # Maximum softmax probability
            preds, indices = torch.max(probs, dim=-1)

            # Label predictions
            label_preds = probs.argmax(dim=-1, keepdim=True).view_as(y)

            # Compute accuracy
            corrects = y.eq(label_preds.cpu())
            correct = corrects.sum().item()
            acc.append(correct / out.shape[0])
            all_correct = np.concatenate((all_correct, corrects.cpu().numpy()))

            # Compute entropy
            entropy = Categorical(probs).entropy().squeeze()
            entropies = np.concatenate((entropies, entropy.cpu().numpy()))

            # Compute brier score
            brier_scores.append(calculate_brier_score(probs, y))

            # Compute NLL
            nll.append(-np.mean(np.log(preds.cpu().numpy())))

            true_labels = np.concatenate((true_labels, np.ones(len(x))))
            all_preds = np.concatenate((all_preds, preds.cpu().reshape((-1))))
            conf_true_labels = np.concatenate(
                (conf_true_labels,
                 torch.isclose(y.cpu(), indices.cpu()).numpy().astype(float).reshape(-1)))

    conf_auroc = calculate_auroc(conf_true_labels, all_preds)
    conf_aupr = calculate_aupr(conf_true_labels, all_preds)
    brier_score = np.mean(np.array(brier_scores))
    ece = calculate_ece(all_preds, all_correct)

    return {
        'conf_auroc': conf_auroc,
        'conf_aupr': conf_aupr,
        'brier_score': brier_score,
        'entropy': np.mean(entropies),
        'test_acc': np.mean(acc),
        'nll': np.mean(nll),
        'ece': ece,
    }, true_labels, all_preds, entropies
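# Hedged sketch (assumption): `calculate_brier_score` and `calculate_ece` are helpers
# defined elsewhere in this codebase; plausible implementations are given below.
import numpy as np
import torch
import torch.nn.functional as F


def calculate_brier_score(probs: torch.Tensor, targets: torch.Tensor) -> float:
    # mean squared error between predicted probabilities and one-hot targets
    one_hot_targets = F.one_hot(targets.long().to(probs.device),
                                num_classes=probs.shape[-1]).float()
    return torch.mean(torch.sum((probs - one_hot_targets) ** 2, dim=-1)).item()


def calculate_ece(confidences: np.ndarray, correct: np.ndarray, n_bins: int = 15) -> float:
    # expected calibration error: bin-weighted gap between accuracy and mean confidence
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap
    return ece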
def act(self, x):
    with torch.no_grad():
        logits = self(x)
        m = Categorical(logits=logits).sample().squeeze()
    return m.cpu().item()
def main():
    # make the environments
    if args.num_envs == 1:
        env = [gym.make(args.env_name)]
    else:
        env = [gym.make(args.env_name) for i in range(args.num_envs)]
    env = MultiGym(env, render=args.render)

    n_states = env.observation_space.shape
    n_actions = env.action_space.n
    print('state shape:', n_states, 'actions:', n_actions)

    policy = ConvPolicy(n_actions).to(device)
    optimizer = optim.RMSprop(policy.parameters(), lr=args.lr)

    if args.algo == 'ppo':
        sys.path.append('../')
        from algorithms.ppo import PPO
        update_algo = PPO(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device,
                          epochs=args.ppo_epochs)
    else:
        sys.path.append('../')
        from algorithms.a2c import A2C
        update_algo = A2C(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device)

    end_rewards = []

    try:
        print('starting episodes')
        idx = 0
        d = False
        reward_sum = np.zeros((args.num_envs))
        restart = True
        frame = env.reset()
        mask = torch.ones(args.num_envs)
        all_start = time.time()

        for update_idx in range(args.num_updates):
            update_algo.policy.train()

            # stack the frames
            s = train_state_proc.proc_state(frame, mask=mask)

            # insert state before getting actions
            update_algo.states[0].copy_(s)

            start = time.time()
            for step in range(args.num_steps):
                with torch.no_grad():
                    # get probability dist and values
                    p, v = update_algo.policy(update_algo.states[step])
                    a = Categorical(p).sample()

                # take action get response
                frame, r, d = env.step(
                    a.cpu().numpy() if args.num_envs > 1 else [a.item()])
                s = train_state_proc.proc_state(frame, mask)

                update_algo.insert_experience(step=step, s=s, a=a, v=v, r=r, d=d)

                mask = torch.tensor(1. - d).float()
                reward_sum = (reward_sum + r)

                # if any episode finished append episode reward to list
                if d.any():
                    end_rewards.extend(reward_sum[d])

                # reset any rewards that finished
                reward_sum = reward_sum * mask.numpy()

            idx += 1

            with torch.no_grad():
                _, next_val = update_algo.policy(update_algo.states[-1])

            update_algo.update(next_val.view(1, args.num_envs).to(device),
                               next_mask=mask.to(device))

            if args.lr_decay:
                for params in update_algo.optimizer.param_groups:
                    params['lr'] = (
                        lr_min + 0.5 * (args.lr - lr_min) *
                        (1 + np.cos(np.pi * idx / args.num_updates)))

            # update every so often by displaying results in term
            if (update_idx % args.log_interval == 0) and (len(end_rewards) > 0):
                total_steps = (idx + 1) * args.num_envs * args.num_steps
                end = time.time()
                print(end_rewards[-10:])
                print('Updates {}\t Time: {:.4f} \t FPS: {}'.format(
                    update_idx, end - start,
                    int(total_steps / (end - all_start))))
                print('Mean Episode Rewards: {:.2f} \t Min/Max Current Rewards: {}/{}'
                      .format(np.mean(end_rewards[-10:]),
                              reward_sum.min(), reward_sum.max()))

    except KeyboardInterrupt:
        pass

    torch.save(
        update_algo.policy.state_dict(),
        '../model_weights/{}_{}_conv.pth'.format(args.env_name, args.algo))

    import pandas as pd
    out_dict = {'avg_end_rewards': end_rewards}
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_rewards.csv'.format(args.env_name, args.algo),
                   index=False)

    out_dict = {
        'actor losses': update_algo.actor_losses,
        'critic losses': update_algo.critic_losses,
        'entropy': update_algo.entropy_logs
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_training_behavior.csv'.format(
        args.env_name, args.algo), index=False)

    plt.plot(end_rewards)
    plt.show()