def evaluate(self, transition):
    output, hn = self.w_lstm(transition["inputs"], transition["h0"])
    output = output.squeeze(0)
    h0 = hn
    logits = self.w_soft(output)
    # if self.temperature is not None: logits /= self.temperature
    out_dist = Categorical(logits=logits)
    branch_logprob = out_dist.log_prob(transition["branch"])
    branch_entropy = out_dist.entropy()

    inputs = self.w_emb(transition["branch"])
    inputs = inputs.unsqueeze(0)
    output, hn = self.w_lstm(inputs, h0)
    output = output.squeeze(0)

    if transition["layer_id"] > 0:
        query = torch.cat(transition["anchors_w1"], dim=0)
        query = torch.tanh(query + self.w_attn_2(output))
        query = self.v_attn(query)
        logits = torch.cat([query, -query], dim=1)
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            logits = self.tanh_constant * torch.tanh(logits)
        skip_distribution = Categorical(logits=logits)
        skip_logprob = skip_distribution.log_prob(transition["skip_connections"])
        skip_logprob = torch.sum(skip_logprob)
        skip_entropy = skip_distribution.entropy()
        skip_entropy = torch.sum(skip_entropy)
    else:
        skip_logprob = 0
        skip_entropy = 0

    return branch_logprob[0], skip_logprob, branch_entropy[0], skip_entropy
def forward_entropy(model, loader, device, max_item_id=0):
    # Collect the per-sample entropy of the model's predictive distribution
    # over the whole loader.
    for i, batch in enumerate(loader):
        scores = softmax(model(batch.to(device), max_item_id), dim=1)
        dis_score = Categorical(scores)
        if i == 0:
            entropy = dis_score.entropy()
        else:
            entropy = torch.cat((entropy, dis_score.entropy()))
    pro = entropy.cpu().detach().numpy()
    # Rank-based exponential weighting: higher-entropy samples get larger weights.
    weights = np.exp((pd.Series(pro).rank() / len(pro)).values)
    return weights / np.sum(weights)
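# A minimal sketch (dummy values, not from the function above) of how the
# rank-based exponential weighting in forward_entropy behaves; `pro` stands in
# for the per-sample entropies it computes.
import numpy as np
import pandas as pd

pro = np.array([0.1, 1.2, 0.7, 2.3])            # hypothetical entropies
weights = np.exp((pd.Series(pro).rank() / len(pro)).values)
weights = weights / weights.sum()
print(weights)  # monotone in entropy rank: the highest-entropy sample gets the largest weight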
def get_action(self, x, action=None):
    logits = self.actor(x)
    probs = Categorical(logits=logits)
    # If no action is specified, select one stochastically.
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy()
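# A self-contained sketch of the sample/log_prob/entropy pattern shared by the
# actor heads in this file, using a plain linear layer as a stand-in policy
# network (the layer and shapes here are assumptions for illustration).
import torch
from torch import nn
from torch.distributions import Categorical

actor = nn.Linear(8, 4)                      # hypothetical 4-action policy head
x = torch.randn(2, 8)                        # batch of 2 observations
dist = Categorical(logits=actor(x))
a = dist.sample()                            # shape (2,)
print(a, dist.log_prob(a), dist.entropy())   # per-sample log-probs and entropies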
def PPO_update(self):
    # Compute and normalize discounted rewards (use the discount_rewards function).
    # Normalize by the standard deviation, with epsilon inside the denominator.
    rewards = np.asarray(self.discount_rewards())
    rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-5)

    for _ in range(self.K_epochs):
        # Sample a random 70% of the stored transitions every epoch.
        len_history = len(self.actions)
        n_batch = round(len_history * 0.7)
        idxs = random.sample(range(len_history), n_batch)
        old_rewards = torch.tensor([rewards[idx] for idx in idxs]).to(self.train_device)
        old_states = [self.states[idx] for idx in idxs]
        old_action_probs = [self.action_probs[idx] for idx in idxs]
        old_actions = [self.actions[idx] for idx in idxs]

        # Convert lists to tensors.
        old_states = torch.stack(old_states, dim=0).to(self.train_device).detach()
        old_action_probs = torch.stack(old_action_probs, dim=0).to(self.train_device).detach()
        old_actions = torch.stack(old_actions, dim=0).to(self.train_device).detach()

        # Evaluate batch actions and values: pass batch states to the actor layers.
        action_logits, values = self.policy.forward(old_states)
        action_distribution = Categorical(logits=action_logits)
        # Calculate the entropy of the current action distribution.
        dist_entropy = action_distribution.entropy()

        # Calculate the loss: find the ratio (pi_theta / pi_theta_old).
        # One-hot masks pick out the probability of the action actually taken.
        vs = np.array([[1., 0.], [0., 1.]])
        ts = torch.FloatTensor(vs[old_actions.cpu().numpy()])
        ratios = torch.sum(F.softmax(action_logits, dim=1) * ts, dim=1) / old_action_probs

        # Clipped surrogate loss.
        advantages = old_rewards - values.detach()
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
        loss = (-torch.min(surr1, surr2).mean()
                + 0.5 * self.MseLoss(values.squeeze(1), old_rewards)
                - 0.01 * dist_entropy.mean())

        # Take a gradient step to update the network parameters
        # (zero the gradients each iteration so they do not accumulate).
        self.optimizer.zero_grad()
        loss.backward()
        print('Loss:', loss)
        self.optimizer.step()

    # Copy new weights into the old policy.
    self.policy_old.load_state_dict(self.policy.state_dict())
    # Clear memory.
    self.states, self.action_probs, self.actions, self.rewards, self.dones = [], [], [], [], []
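# A minimal numeric sketch (assumed values, not from the trainer above) of the
# clipped PPO surrogate: once the ratio drifts outside [1 - eps, 1 + eps], the
# clipped branch caps the objective so the update stays conservative.
import torch

eps_clip = 0.2
ratios = torch.tensor([0.5, 1.0, 1.5])        # hypothetical pi_new / pi_old
advantages = torch.tensor([1.0, 1.0, -1.0])
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
print(torch.min(surr1, surr2))                # tensor([ 0.5000,  1.0000, -1.5000])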
def act(self, state, action=None, calc_ent=False):
    """Returns dict of trajectory info.

    Shape
    ======
    state (uint8) : (batch_size, framestack=4, 84, 84)

    Returns example
    {'a': tensor([10, 5, 1]),
     'ent': None,
     'log_pi_a': tensor([-2.8904, -2.8904, -2.8904], grad_fn=<SqueezeBackward1>),
     'v_ext': tensor([0.0012, 0.0012, 0.0012], grad_fn=<SqueezeBackward0>),
     'v_int': tensor([-0.0013, -0.0013, -0.0013], grad_fn=<SqueezeBackward0>)}
    """
    assert state.dtype == 'uint8'
    state = torch.tensor(state / 255., dtype=torch.float, device=self.device)
    action_probs, value_ext, value_int = self.model(state)
    dist = Categorical(action_probs)
    if action is None:
        action = dist.sample()
    log_prob = dist.log_prob(action)
    entropy = dist.entropy() if calc_ent else None
    return {
        'a': action,
        'log_pi_a': log_prob,
        'ent': entropy,
        'v_ext': value_ext.squeeze(),
        'v_int': value_int.squeeze()
    }
def evaluate(self, state, action):
    pred = self.policy(state)
    value = self.vf(state).squeeze()
    dist = Categorical(pred)
    log_prob = dist.log_prob(action).squeeze()
    entropy = dist.entropy().squeeze()
    return value, log_prob, entropy
def forward(self, encoder_inputs, hx, n_steps, greedy=False):
    _input = encoder_inputs.new_zeros((encoder_inputs.size(0), encoder_inputs.size(2)))
    mask = encoder_inputs.new_zeros((encoder_inputs.size(0), encoder_inputs.size(1)))
    log_ps = []
    actions = []
    entropys = []
    for i in range(n_steps):
        hx = self.cell(_input, hx)
        p = self.attn(hx, encoder_inputs, mask)
        dist = Categorical(p)
        entropy = dist.entropy()
        # Greedy decoding takes the argmax; otherwise sample from the distribution.
        if greedy:
            _, index = p.max(dim=-1)
        else:
            index = dist.sample()
        actions.append(index)
        log_p = dist.log_prob(index)
        log_ps.append(log_p)
        entropys.append(entropy)
        # Mark the chosen position so it cannot be selected again.
        mask = mask.scatter(1, index.unsqueeze(-1).expand(mask.size(0), -1), 1)
        # Feed the embedding of the chosen position as the next decoder input.
        _input = torch.gather(
            encoder_inputs, 1,
            index.unsqueeze(-1).unsqueeze(-1).expand(encoder_inputs.size(0), -1, encoder_inputs.size(2))
        ).squeeze(1)
    log_ps = torch.stack(log_ps, 1)
    actions = torch.stack(actions, 1)
    entropys = torch.stack(entropys, 1)
    log_p = log_ps.sum(dim=1)
    entropy = entropys.mean(dim=1)
    return actions, log_p, entropy
def forward(self):
    inputs, h0 = self.input_vars, None
    log_probs, entropys, sampled_arch = [], [], []
    for iedge in range(self.num_edge):
        outputs, h0 = self.w_lstm(inputs, h0)
        logits = self.w_pred(outputs)
        logits = logits / self.temperature
        logits = self.tanh_constant * torch.tanh(logits)
        # Sample an operation for this edge from the distribution.
        op_distribution = Categorical(logits=logits)
        op_index = op_distribution.sample()
        sampled_arch.append(op_index.item())

        op_log_prob = op_distribution.log_prob(op_index)
        log_probs.append(op_log_prob.view(-1))
        op_entropy = op_distribution.entropy()
        entropys.append(op_entropy.view(-1))

        # Obtain the input embedding for the next step.
        inputs = self.w_embd(op_index)
    return torch.sum(torch.cat(log_probs)), torch.sum(torch.cat(entropys)), sampled_arch
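# A small sketch (dummy logits, assumed temperature and constant) of the
# ENAS-style logit shaping used by the controllers above: dividing by a
# temperature flattens the distribution, and the tanh constant bounds logits
# to [-c, c], which caps how peaked the sampling policy can become.
import torch

logits = torch.tensor([2.0, 0.0, -2.0])
temperature, c = 5.0, 1.5
shaped = c * torch.tanh(logits / temperature)
print(torch.softmax(logits, dim=0))   # peaked distribution
print(torch.softmax(shaped, dim=0))   # flatter distribution, higher entropy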
def get_action(self, x, action=None):
    logits = self.actor(self.forward(x))
    probs = Categorical(logits=logits)
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy()
def forward(self):
    entropys = []
    log_probs = []
    sampled_arcs = []
    self.op_dist = []
    for layer_id in range(self.num_layers):
        logit = self.alpha[layer_id]
        # Optional temperature / tanh-constant scaling of the logits is disabled here.
        op_dist = Categorical(logits=logit)
        self.op_dist.append(op_dist)

        sampled_op = op_dist.sample()
        sampled_arcs.append(sampled_op.view(-1, 1))

        log_prob = op_dist.log_prob(sampled_op)
        log_probs.append(log_prob.view(-1, 1))

        entropy = op_dist.entropy()
        entropys.append(entropy.view(-1, 1))

    self.sampled_arcs = torch.cat(sampled_arcs, dim=1)
    self.sample_entropy = torch.cat(entropys, dim=1)
    self.sample_log_prob = torch.cat(log_probs, dim=1)
    return self.sampled_arcs
def evaluate(self, encoder_inputs, hx, actions):
    _input = encoder_inputs.new_zeros((encoder_inputs.size(0), encoder_inputs.size(2)))
    mask = encoder_inputs.new_zeros((encoder_inputs.size(0), encoder_inputs.size(1)))
    log_ps = []
    entropys = []
    actions = actions.transpose(0, 1)
    for act in actions:
        hx = self.cell(_input, hx)
        p = self.attn(hx, encoder_inputs, mask)
        dist = Categorical(p)
        entropy = dist.entropy()
        log_p = dist.log_prob(act)
        log_ps.append(log_p)
        mask = mask.scatter(1, act.unsqueeze(-1).expand(mask.size(0), -1), 1)
        _input = torch.gather(
            encoder_inputs, 1,
            act.unsqueeze(-1).unsqueeze(-1).expand(encoder_inputs.size(0), -1, encoder_inputs.size(2))
        ).squeeze(1)
        entropys.append(entropy)
    log_ps = torch.stack(log_ps, 1)
    entropys = torch.stack(entropys, 1)
    log_p = log_ps.sum(dim=1)
    entropy = entropys.mean(dim=1)
    return log_p, entropy
def forward(self, img1, img2): #print("img1.shape", img1.shape) img1 = img1.view(img1.size(0), -1) #print("img1.shape", img1.shape) img2 = img2.view(img2.size(0), -1) out1 = self.policy_single(img1) #print("out1.shape", out1.shape) out2 = self.policy_single(img2) combined = torch.cat( (out1, out2), dim=1) # attaccate usando asse x (una sopra l'altra) probs = self.policy_combined(combined) # sampling dist = Categorical(probs=probs) if self.training: actions = dist.sample() else: actions = dist.argmax(dim=1) logprobs = dist.log_prob(actions) entropy = dist.entropy() return probs, actions, logprobs, entropy
def get_action_and_value(self, x, action=None):
    hidden = self.network(x.permute((0, 3, 1, 2)) / 255.0)  # "bhwc" -> "bchw"
    logits = self.actor(hidden)
    probs = Categorical(logits=logits)
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
def train_model(args, device, output_size, model, rnd, optimizer, s_batch,
                target_ext_batch, target_int_batch, y_batch, adv_batch,
                next_obs_batch, old_action_probs):
    epoch = 3
    update_proportion = 0.25
    s_batch = torch.FloatTensor(s_batch).to(device)
    target_ext_batch = torch.FloatTensor(target_ext_batch).to(device)
    target_int_batch = torch.FloatTensor(target_int_batch).to(device)
    y_batch = torch.LongTensor(y_batch).to(device)
    adv_batch = torch.FloatTensor(adv_batch).to(device)
    next_obs_batch = torch.FloatTensor(next_obs_batch).to(device)

    sample_range = np.arange(len(s_batch))
    forward_mse = nn.MSELoss(reduction='none')

    # Log-probabilities of the taken actions under the old policy.
    with torch.no_grad():
        action_probs_old_list = torch.stack(old_action_probs).permute(1, 0, 2).contiguous().view(-1, output_size).to(device)
        m_old = Categorical(action_probs_old_list)
        log_prob_old = m_old.log_prob(y_batch)

    for i in range(epoch):
        np.random.shuffle(sample_range)
        for j in range(int(len(s_batch) / args.batch_size)):
            sample_idx = sample_range[args.batch_size * j:args.batch_size * (j + 1)]

            # Curiosity-driven exploration (Random Network Distillation):
            # train the predictor to match the fixed target network.
            predict_next_state_feature, target_next_state_feature = rnd(next_obs_batch[sample_idx])
            forward_loss = forward_mse(predict_next_state_feature, target_next_state_feature.detach()).mean(-1)

            # Use only a proportion of the experience for the predictor update.
            mask = torch.rand(len(forward_loss)).to(device)
            mask = (mask < update_proportion).type(torch.FloatTensor).to(device)
            forward_loss = (forward_loss * mask).sum() / torch.max(mask.sum(), torch.Tensor([1]).to(device))

            action_probs, value_ext, value_int = model(s_batch[sample_idx])
            m = Categorical(action_probs)
            log_prob = m.log_prob(y_batch[sample_idx])

            # PPO clipped surrogate objective.
            ratio = torch.exp(log_prob - log_prob_old[sample_idx])
            surr1 = ratio * adv_batch[sample_idx]
            surr2 = torch.clamp(ratio, 1.0 - args.eps, 1.0 + args.eps) * adv_batch[sample_idx]
            actor_loss = -torch.min(surr1, surr2).mean()

            critic_ext_loss = F.mse_loss(value_ext.sum(1), target_ext_batch[sample_idx])
            critic_int_loss = F.mse_loss(value_int.sum(1), target_int_batch[sample_idx])
            critic_loss = critic_ext_loss + critic_int_loss

            entropy = m.entropy().mean()

            optimizer.zero_grad()
            loss = actor_loss + 0.5 * critic_loss - args.entropy_coef * entropy + forward_loss
            loss.backward()
            optimizer.step()
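# A minimal sketch (dummy per-sample losses) of the update-proportion masking
# used in the RND training loops above: only a random ~25% of predictor losses
# contribute to the mean, with a max(mask.sum(), 1) guard against an all-zero
# mask. Values here are assumptions for illustration.
import torch

update_proportion = 0.25
forward_loss = torch.rand(8)                        # hypothetical per-sample losses
mask = (torch.rand(8) < update_proportion).float()
loss = (forward_loss * mask).sum() / torch.max(mask.sum(), torch.tensor(1.0))
print(loss)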
def forward(self, x, a=None):
    policy = Categorical(logits=self.logits(x))
    if a is None:
        a = policy.sample().squeeze()
    logp_a = policy.log_prob(a).squeeze()
    ent = policy.entropy().squeeze()
    return a, logp_a, ent
def train_model(self, s_batch, target_batch, y_batch, adv_batch, actor_agent):
    s_batch = torch.FloatTensor(s_batch)
    target_batch = torch.FloatTensor(target_batch)
    adv_batch = torch.FloatTensor(adv_batch)

    # Log-probabilities of the taken actions under the old (frozen) policy.
    with torch.no_grad():
        policy_old, value_old = actor_agent.model_old(s_batch)
        m_old = Categorical(policy_old)
        y_batch_old = torch.LongTensor(y_batch)
        log_prob_old = m_old.log_prob(y_batch_old)

    # Log-probabilities of the same recorded actions under the current policy
    # (evaluating the stored actions, not freshly sampled ones).
    policy, value = self.model(s_batch)
    m = Categorical(policy)
    log_prob = m.log_prob(y_batch_old)
    entropy = m.entropy().mean()

    for i in range(EPOCH):
        minibatch = random.sample(range(len(s_batch)), BATCH_SIZE)

        ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch])
        surr1 = ratio * adv_batch[minibatch].sum(1)
        surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv_batch[minibatch].sum(1)
        actor_loss = -torch.min(surr1, surr2).mean()
        # The critic loss must use the current value head so gradients can flow;
        # value_old was computed under no_grad.
        critic_loss = F.mse_loss(value[minibatch], target_batch[minibatch])

        self.optimizer.zero_grad()
        loss = actor_loss + V_COEF * critic_loss - 0.01 * entropy
        loss.backward(retain_graph=True)
        self.optimizer.step()
def get_action_and_value(self, x, action=None):
    hidden = self.network(x / 255.0)
    logits = self.actor(hidden)
    probs = Categorical(logits=logits)
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)
def rl_get_action(self, state, compass):
    """Select an action by running a tile-input through the neural network.

    :param state: tile-grid; numpy tensor
    :return: int of selected action
    """
    logits = self.nn(state, compass)
    probs = Categorical(logits=logits)
    action = probs.sample()

    if state.shape[0] == 1:
        a2 = action.item()
        # Update the agent's orientation when a rotating action is taken.
        if not hasattr(self, '_env') and a2 != self.prev_move:
            if a2 in self.rotating_actions:
                self.prev_move = a2
                self.compass_info = self.orientation[self.prev_move - 1]
        elif hasattr(self, '_env'):
            self.compass_info = self.env.orientation[self.env.prev_move - 1]

    return action, -probs.log_prob(action), probs.entropy()
def get_action(self, x, action=None):
    logits = self.actor(self.network(x.permute((0, 3, 1, 2))))  # "bhwc" -> "bchw"
    probs = Categorical(logits=logits)
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy()
def _evaluate_actions(self, env_states, rec_hs, rec_cs, actions):
    """
    See how likely these actions are (under the current model) in the given env_states
    Called when updating, on batches of transitions

    Args:
        env_states: float tensor of shape [batch_size, *env_state_shape]
        rec_hs: float tensor of shape [num_recurrent_layers, batch_size, recurrent_layer_size]
        rec_cs: float tensor of shape [num_recurrent_layers, batch_size, recurrent_layer_size]
        actions: int tensor of shape [batch_size,]

    Returns:
        latent_means, latent_log_vars: outputs of the encoder's latent head
        encoder_out: float tensor of shape [batch_size, m] -- returned so it's not recomputed again for values
        action_log_probs: float tensor of shape [batch_size,]
        entropy: float tensor of shape [batch_size,]
    """
    latent_means, latent_log_vars, encoder_out, _, _ = self.controller.encode(env_states, rec_hs, rec_cs)
    actor_logits = self.controller.actor(encoder_out)  # [batch_size, num_actions]
    action_distributions = Categorical(logits=actor_logits)
    action_log_probs = action_distributions.log_prob(actions)  # [batch_size,]
    entropy = action_distributions.entropy()  # [batch_size,]
    return latent_means, latent_log_vars, encoder_out, action_log_probs, entropy
def sample_action(self, probs, already_selected=None, greedy=False):
    # probs = (B, k+1)
    # already_selected = (num_timesteps, B)
    if already_selected is None:
        mask = 1
    else:
        mask = Variable(torch.ones(probs.size()))
        if USE_CUDA:
            mask = mask.cuda()
        # Zero out the probabilities of indices that were already selected.
        mask = mask.scatter_(1, already_selected.t(), 0)  # (B, k+1)
    # Add epsilon to make sure no non-masked value is exactly zero.
    masked_probs = mask * (probs + 1e-20)  # (B, k+1)
    dist = Categorical(probs=masked_probs)
    if greedy:
        _, a = masked_probs.max(dim=1)  # (B)
    else:
        a = dist.sample()  # (B)
    log_prob = dist.log_prob(a)  # (B)
    entropy = dist.entropy()  # (B)
    return a, log_prob, entropy
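# A minimal sketch (dummy tensors, not tied to the class above) showing that
# Categorical renormalizes masked probabilities, so already-selected indices
# get zero probability and can never be sampled again.
import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.4, 0.3, 0.2, 0.1]])
mask = torch.tensor([[1., 0., 1., 1.]])      # index 1 was already selected
dist = Categorical(probs=mask * (probs + 1e-20))
print(dist.probs)                            # renormalized over the unmasked indices
print(dist.log_prob(torch.tensor([1])))      # -inf: the masked index is impossible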
def train_model(self, s_batch, target_batch, y_batch, adv_batch):
    s_batch = torch.FloatTensor(s_batch)
    target_batch = torch.FloatTensor(target_batch)
    y_batch = torch.LongTensor(y_batch)
    adv_batch = torch.FloatTensor(adv_batch)

    # Evaluate the current policy and value for the batch.
    policy, value = self.model(s_batch)
    m = Categorical(policy)
    mse = nn.MSELoss()

    # Actor loss: policy gradient weighted by the advantage.
    actor_loss = -m.log_prob(y_batch) * adv_batch.sum(1)
    # Entropy bonus (for more exploration).
    entropy = m.entropy()
    # Critic loss.
    critic_loss = mse(value, target_batch)
    # Total loss.
    loss = actor_loss.mean() + 0.5 * critic_loss - 0.01 * entropy.mean()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
def get_action_and_value(self, x, lstm_state, done, action=None):
    hidden, lstm_state = self.get_states(x, lstm_state, done)
    logits = self.actor(hidden)
    probs = Categorical(logits=logits)
    if action is None:
        action = probs.sample()
    return action, probs.log_prob(action), probs.entropy(), self.critic(hidden), lstm_state
def evaluate(self, state, action):
    action_probs = self.action_layer(state)
    dist = Categorical(action_probs)
    action_logprobs = dist.log_prob(action)
    dist_entropy = dist.entropy()
    state_value = self.value_layer(state)
    return action_logprobs, state_value, dist_entropy
def act(self, s, action=None):
    prob, v = self.forward(s)
    dist = Categorical(prob)
    if action is None:
        action = dist.sample()
    log_prob = dist.log_prob(action)
    entropy = dist.entropy()
    return action, log_prob, entropy, v.squeeze()
def train_model(self, observations_tensor, ext_returns_tensor, int_returns_tensor,
                actions_tensor, advantages_tensor, one_channel_observations_tensor,
                old_log_prob):
    if flag.DEBUG:
        print("input observations shape", observations_tensor.shape)
        print("ext returns shape", ext_returns_tensor.shape)
        print("int returns shape", int_returns_tensor.shape)
        print("input actions shape", actions_tensor.shape)
        print("input advantages shape", advantages_tensor.shape)
        print("one channel observations", one_channel_observations_tensor.shape)

    self.new_model.train()
    self.predictor_model.train()

    # RND predictor loss against the fixed target network, using only a random
    # proportion of the batch for the predictor update.
    target_value = self.target_model(one_channel_observations_tensor)
    predictor_value = self.predictor_model(one_channel_observations_tensor)
    predictor_loss = self.predictor_mse_loss(predictor_value, target_value).mean(-1)
    mask = torch.rand(len(predictor_loss)).to(self.device)
    mask = (mask < self.predictor_update_proportion).type(torch.FloatTensor).to(self.device)
    predictor_loss = (predictor_loss * mask).sum() / torch.max(mask.sum(), torch.Tensor([1]).to(self.device))

    # Value losses for the extrinsic and intrinsic heads.
    new_policy, ext_new_values, int_new_values = self.new_model(observations_tensor)
    ext_value_loss = self.mse_loss(ext_new_values, ext_returns_tensor)
    int_value_loss = self.mse_loss(int_new_values, int_returns_tensor)
    value_loss = ext_value_loss + int_value_loss

    softmax_policy = F.softmax(new_policy, dim=1)
    new_dist = Categorical(softmax_policy)
    new_log_prob = new_dist.log_prob(actions_tensor)

    # PPO clipped surrogate objective.
    ratio = torch.exp(new_log_prob - old_log_prob)
    clipped_policy_loss = torch.clamp(ratio, 1.0 - self.clip_range, 1 + self.clip_range) * advantages_tensor
    policy_loss = ratio * advantages_tensor
    selected_policy_loss = -torch.min(clipped_policy_loss, policy_loss).mean()
    entropy = new_dist.entropy().mean()

    self.optimizer.zero_grad()
    loss = selected_policy_loss + (self.value_coef * value_loss) \
        - (self.entropy_coef * entropy) + predictor_loss
    loss.backward()
    global_grad_norm_(list(self.new_model.parameters()) + list(self.predictor_model.parameters()))
    self.optimizer.step()

    return loss, selected_policy_loss, value_loss, predictor_loss, entropy
def evaluate(self, state, action):
    action_probs = self.action_layer(state)
    dist = Categorical(action_probs)
    # Recover the action index from the encoded (one-hot) action before
    # computing its log-probability.
    actInx = torch.argmax(action, dim=1)
    action_logprobs = dist.log_prob(actInx)
    dist_entropy = dist.entropy()
    state_value = self.value_layer(state)
    return action_logprobs, torch.squeeze(state_value), dist_entropy
def forward(self, class_ids, determine_sample=False):
    """
    https://github.com/melodyguan/enas/blob/master/src/cifar10/general_controller.py#L126
    """
    h0 = None  # setting h0 to None will initialize the LSTM state with 0s
    arc_seq = []
    entropys = []
    log_probs = []

    if isinstance(class_ids, int):
        class_ids = [class_ids]
    if isinstance(class_ids, list):
        class_ids = torch.tensor(class_ids, dtype=torch.int64)
    class_ids = class_ids.type(torch.int64)
    inputs = self.g_emb.weight[class_ids]

    for layer_id in range(self.num_layers):
        if self.search_whole_channels:
            inputs = inputs.unsqueeze(dim=0)
            output, hn = self.w_lstm(inputs, h0)
            output = output.squeeze(dim=0)
            h0 = hn

            logit = self.w_soft(output)
            if self.temperature > 0:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * torch.tanh(logit)

            branch_id_dist = Categorical(logits=logit)
            # Either take the argmax deterministically or sample a branch.
            if determine_sample:
                branch_id = logit.argmax(dim=1)
            else:
                branch_id = branch_id_dist.sample()
            arc_seq.append(branch_id)

            log_prob = branch_id_dist.log_prob(branch_id)
            log_probs.append(log_prob.view(-1))
            entropy = branch_id_dist.entropy()
            entropys.append(entropy.view(-1))
        else:
            # https://github.com/melodyguan/enas/blob/master/src/cifar10/general_controller.py#L171
            assert False, "Not implemented error: search_whole_channels = False"

        # Calculate the average of the class and branch embeddings
        # and use it as the input for the next step.
        inputs = self.w_emb(branch_id) + self.g_emb.weight[class_ids]
        inputs /= 2

    self.sample_arc = torch.stack(arc_seq, dim=1)
    self.sample_entropy = torch.stack(entropys, dim=1)
    self.sample_log_prob = torch.stack(log_probs, dim=1)
    self.sample_prob = self.sample_log_prob.exp()
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by `probs`.

    Samples are one-hot coded vectors of size probs.size(-1).

    See also: :func:`torch.distributions.Categorical`

    Example::

        >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
         0
         0
         1
         0
        [torch.FloatTensor of size 4]

    Args:
        probs (Tensor or Variable): event probabilities
    """
    params = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None):
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.probs.size()[:-1]
        event_shape = self._categorical.probs.size()[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape)

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        probs = self._categorical.probs
        n = self.event_shape[0]
        if isinstance(probs, Variable):
            values = Variable(torch.eye(n, out=probs.data.new(n, n)))
        else:
            values = torch.eye(n, out=probs.new(n, n))
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by `probs`.

    Samples are one-hot coded vectors of size probs.size(-1).

    See also: :func:`torch.distributions.Categorical`

    Example::

        >>> m = OneHotCategorical(torch.Tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
         0
         0
         1
         0
        [torch.FloatTensor of size 4]

    Args:
        probs (Tensor or Variable): event probabilities
    """
    params = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs):
        self._categorical = Categorical(probs)
        batch_shape = probs.size()[:-1]
        event_shape = probs.size()[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape)

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        probs = self._categorical.probs
        n = self.event_shape[0]
        if isinstance(probs, Variable):
            values = Variable(torch.eye(n, out=probs.data.new(n, n)))
        else:
            values = torch.eye(n, out=probs.new(n, n))
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
def __preprocess_ac_space_discrete(logits: torch.Tensor, ac_space: Space,
                                   stochastic=True, action=()):
    probs = Categorical(logits=logits)
    if len(action) == 0:
        if stochastic:
            action = probs.sample()
        else:
            action = torch.argmax(probs.probs, dim=1)
    else:
        # np.int is removed in recent NumPy; cast explicitly to int64.
        action = torch.LongTensor(action.astype(np.int64))
    return probs, action.tolist(), -probs.log_prob(action), probs.entropy()
class OneHotCategorical(Distribution):
    r"""
    Creates a one-hot categorical distribution parameterized by :attr:`probs` or
    :attr:`logits`.

    Samples are one-hot coded vectors of size ``probs.size(-1)``.

    .. note:: :attr:`probs` will be normalized to be summing to 1.

    See also: :func:`torch.distributions.Categorical` for specifications of
    :attr:`probs` and :attr:`logits`.

    Example::

        >>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
        >>> m.sample()  # equal probability of 0, 1, 2, 3
        tensor([ 0.,  0.,  0.,  1.])

    Args:
        probs (Tensor): event probabilities
        logits (Tensor): event log probabilities
    """
    arg_constraints = {'probs': constraints.simplex}
    support = constraints.simplex
    has_enumerate_support = True

    def __init__(self, probs=None, logits=None, validate_args=None):
        self._categorical = Categorical(probs, logits)
        batch_shape = self._categorical.batch_shape
        event_shape = self._categorical.param_shape[-1:]
        super(OneHotCategorical, self).__init__(batch_shape, event_shape,
                                                validate_args=validate_args)

    def _new(self, *args, **kwargs):
        return self._categorical._new(*args, **kwargs)

    @property
    def probs(self):
        return self._categorical.probs

    @property
    def logits(self):
        return self._categorical.logits

    @property
    def mean(self):
        return self._categorical.probs

    @property
    def variance(self):
        return self._categorical.probs * (1 - self._categorical.probs)

    @property
    def param_shape(self):
        return self._categorical.param_shape

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        probs = self._categorical.probs
        one_hot = probs.new(self._extended_shape(sample_shape)).zero_()
        indices = self._categorical.sample(sample_shape)
        if indices.dim() < one_hot.dim():
            indices = indices.unsqueeze(-1)
        return one_hot.scatter_(-1, indices, 1)

    def log_prob(self, value):
        if self._validate_args:
            self._validate_sample(value)
        indices = value.max(-1)[1]
        return self._categorical.log_prob(indices)

    def entropy(self):
        return self._categorical.entropy()

    def enumerate_support(self):
        n = self.event_shape[0]
        values = self._new((n, n))
        torch.eye(n, out=values)
        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
        return values.expand((n,) + self.batch_shape + (n,))
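# A short usage sketch of torch.distributions.OneHotCategorical, matching the
# class defined above: samples are one-hot vectors, and log_prob recovers the
# category index from the position of the maximum.
import torch
from torch.distributions import OneHotCategorical

m = OneHotCategorical(torch.tensor([0.25, 0.25, 0.25, 0.25]))
s = m.sample()             # e.g. tensor([0., 0., 1., 0.])
print(s, m.log_prob(s))    # log(0.25) ~ -1.3863 for every one-hot sample here
print(m.entropy())         # log(4) ~ 1.3863 for the uniform case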