def _forward_alg(self, feats):
    """Run the CRF forward algorithm and return the log partition value.

    Args:
        feats: iterable of per-token emission scores; each item is indexed
            by tag id (presumably a 1-D tensor of length ``tagset_size`` —
            confirm against the caller).

    Returns:
        The result of ``log_sum_exp`` over the final forward scores plus the
        transition scores into ``STOP_TAG``.
    """
    num_tags = self.tagset_size

    # Every tag starts at a very low score except START_TAG, which gets 0.
    start_scores = torch.Tensor(1, num_tags).fill_(-10000.)
    start_scores[0][self.tag_to_ix[START_TAG]] = 0.

    # Wrapped in a Variable so the recursion takes part in autograd.
    forward_var = autograd.Variable(start_scores)

    for feat in feats:
        step_scores = []
        for next_tag in range(num_tags):
            # The emission score does not depend on the previous tag, so it
            # is broadcast across all of them.
            emission = feat[next_tag].view(1, -1).expand(1, num_tags)
            # Entry i is the score of moving from tag i to next_tag.
            transition = self.transitions[next_tag].view(1, -1)
            # Edge scores (i -> next_tag), reduced with log-sum-exp.
            edge_scores = forward_var + transition + emission
            step_scores.append(log_sum_exp(edge_scores))
        forward_var = torch.cat(step_scores).view(1, -1)

    stop_row = self.transitions[self.tag_to_ix[STOP_TAG]]
    return log_sum_exp(forward_var + stop_row)
def _forward_alg(self, fts):
    """Batched CRF forward algorithm; returns the log partition values.

    Args:
        fts: emission scores, indexed as ``(seq_len, bsz, tag_size)``
            (shape taken from ``fts.shape[1]`` / ``fts.shape[2]`` and the
            iteration over the first axis).

    Returns:
        ``utils.log_sum_exp`` of the final forward scores plus the
        transition scores into ``TAG_EOS``.
    """
    bsz, tag_size = fts.shape[1], fts.shape[2]

    # init_alphas: (bsz, tag_size), everything at NINF except the BOS column.
    init_alphas = self.dummy.new(bsz, self.tag_size).fill_(NINF)
    # Index the *tag* axis. The previous ``init_alphas[:][idx]`` indexed the
    # batch axis instead (``x[:]`` is just a view of ``x``), zeroing a whole
    # batch row and leaving the BOS column at NINF.
    init_alphas[:, self.tag2idx[TAG_BOS]] = 0.

    # forward_var: (bsz, tag_size)
    forward_var = init_alphas

    # ft: (bsz, tag_size); self.trans: (tag_size, tag_size)
    for ft in fts:
        alphas_t = []
        for next_tag in range(self.tag_size):
            # emit_score: (bsz, 1), broadcast over the previous tags
            emit_score = ft[:, next_tag].unsqueeze(-1)
            # trans_score: (bsz, tag_size), scores of moving into next_tag
            trans_score = self.trans[next_tag].expand(bsz, tag_size)
            next_tag_var = forward_var + trans_score + emit_score
            alphas_t.append(utils.log_sum_exp(next_tag_var))
        # alphas_t holds tag_size pieces that are concatenated along dim 1
        # (presumably each (bsz, 1) — depends on utils.log_sum_exp).
        forward_var = torch.cat(alphas_t, dim=1)

    terminal_var = forward_var + self.trans[self.tag2idx[TAG_EOS]]
    alpha = utils.log_sum_exp(terminal_var)
    return alpha
def _forward_alg(self, feats):
    """Compute the CRF log partition value with the forward algorithm.

    Args:
        feats: iterable of per-token emission scores, each indexed by tag id.

    Returns:
        ``log_sum_exp`` of the last forward scores plus the transition
        scores into ``STOP_TAG``.
    """
    n_tags = self.tagset_size
    shape = (1, n_tags)

    # The initial scores live on 'cuda:0' when the gpu flag is set.
    if not self.gpu:
        forward_var = torch.full(shape, -10000.)
    else:
        forward_var = torch.full(shape, -10000., device='cuda:0')
    # START_TAG carries all of the initial score.
    forward_var[0][self.tag_to_ix[START_TAG]] = 0.

    for feat in feats:
        per_tag = []
        for next_tag in range(n_tags):
            # Same emission score for every previous tag.
            emission = feat[next_tag].view(1, 1).expand(1, n_tags)
            # Entry i: score of the transition i -> next_tag.
            transition = self.transitions[next_tag].view(1, -1)
            edge_scores = forward_var + transition + emission
            per_tag.append(log_sum_exp(edge_scores).view(1))
        forward_var = torch.cat(per_tag).view(1, -1)

    stop_scores = self.transitions[self.tag_to_ix[STOP_TAG]]
    return log_sum_exp(forward_var + stop_scores)
def _e_step(self):
    """Run the E-step: fill ``self.gammas`` and ``self.etas``.

    Calls ``self._forward()`` and ``self._backward()`` first, then
    normalises ``alphas + betas`` per timestep (gammas) and builds the
    pairwise terms (etas). Both are exponentiated before being stored.

    Returns:
        ``np.random.rand() * 1000`` — a random number, exactly as the
        original code returns (NOTE(review): looks like a placeholder).
    """
    # alphas and betas are produced by the forward / backward passes
    self._forward()
    self._backward()

    # gammas: normalise each timestep in log space, then exponentiate
    self.gammas = self.alphas + self.betas
    for t in range(self.T):
        self.gammas[t, :] -= utils.log_sum_exp(self.gammas[t, :])
    self.gammas = np.exp(self.gammas)

    # etas: one entry per (timestep, from-state, to-state)
    n_states = self.k
    for t in range(self.T - 1):
        nxt = t + 1
        for src in range(n_states):
            for dst in range(n_states):
                log_alpha = self.alphas[t, src]
                log_beta = self.betas[nxt, dst]
                log_trans = np.log(self.A[src, dst])
                log_emit = log_poisson_density(self.data[nxt], self.B[dst])
                self.etas[t, src, dst] = (
                    log_alpha + log_trans + log_emit + log_beta)

    # normalise by the log-sum-exp of the last alpha row, then exponentiate
    self.etas -= utils.log_sum_exp(self.alphas[-1, :])
    self.etas = np.exp(self.etas)

    return np.random.rand() * 1000
def _forward_alg_pp(self, feats):
    """Forward algorithm over pairwise potentials; returns the log partition.

    Args:
        feats: sequence of score tensors; each one is viewed as a
            ``(tagset_size, tagset_size)`` matrix. All but the last drive
            the recursion; the last one supplies the scores for the
            transition into the stop tag.

    Returns:
        ``utils.log_sum_exp`` of the final forward scores plus the stop-tag
        row of ``feats[-1]``.
    """
    n_tags = self.tagset_size

    # Constant initial scores, created directly on the target device.
    # They are deliberately NOT created with requires_grad=True: on CPU
    # ``.to(device)`` returns the same leaf tensor and the in-place
    # assignment below would raise "a view of a leaf Variable that requires
    # grad is being used in an in-place operation".
    init_alphas = torch.full((1, n_tags), -100, dtype=torch.float,
                             device=self.device)
    # START_TAG has all of the score.
    init_alphas[0][self.tag_to_ix[DatasetPreprosessed.__START_TAG__]] = 0.

    forward_var = init_alphas

    # Iterate through the sentence (the last potential is handled below).
    for feat in feats[:-1]:
        pairwise = feat.view(n_tags, n_tags)
        alphas_t = []  # forward scores at this timestep
        for next_tag in range(n_tags):
            # Row next_tag of the potential matrix, shaped (1, n_tags).
            trans_score = pairwise[next_tag].view(1, -1).expand(1, n_tags)
            # Edge scores (i -> next_tag) before the log-sum-exp.
            next_tag_var = forward_var + trans_score
            alphas_t.append(utils.log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)

    stop_idx = self.tag_to_ix[DatasetPreprosessed.__STOP_TAG__]
    terminal_var = forward_var + feats[-1].view(n_tags, n_tags)[stop_idx]
    alpha = utils.log_sum_exp(terminal_var)
    return alpha
def U_z(z, uid=0):
    """Evaluate one of four 2-D test potentials at the points ``z``.

    Args:
        z: tensor whose columns 0 and 1 are read as ``z1`` and ``z2``.
        uid: which potential to evaluate (1-4, labelled "Potential 1..4 in
            NF paper" in the original comments). Any other value selects no
            potential.

    Returns:
        A tensor with one value per row of ``z`` for ``uid`` in 1..4,
        otherwise the constant ``1``.
    """
    z1 = z[:, 0]
    z2 = z[:, 1]

    if uid == 1:
        # Potential 1: ring of radius 2 with two modes at z1 = +/-2.
        tmp = torch.cat(((-0.5 * ((z1 - 2) / 0.6)**2).view(-1, 1),
                         (-0.5 * ((z1 + 2) / 0.6)**2).view(-1, 1)), 1)
        return 0.5 * (
            (z.norm(p=2, dim=1) - 2) / 0.4)**2 - log_sum_exp(tmp, dim=1)

    if uid == 2:
        # Potential 2: sinusoidal ridge.
        w1z = torch.sin(math.pi / 2 * z1)
        return 0.5 * ((z2 - w1z) / 0.4)**2

    if uid == 3:
        # Potential 3: two ridges, the second offset by a Gaussian bump.
        w1z = torch.sin(math.pi / 2 * z1)
        w2z = 3.0 * torch.exp(-0.5 * ((z1 - 1) / 0.6)**2)
        tmp = torch.cat(((-0.5 * ((z2 - w1z) / 0.35)**2).view(-1, 1),
                         (-0.5 * ((z2 - w1z + w2z) / 0.35)**2).view(-1, 1)),
                        1)
        return -log_sum_exp(tmp, dim=1)

    if uid == 4:
        # Potential 4: two ridges, the second offset by a sigmoid step.
        # torch.sigmoid replaces the deprecated F.sigmoid (same values).
        w1z = torch.sin(math.pi / 2 * z1)
        w3z = 3.0 * torch.sigmoid((z1 - 1) / 0.3)
        tmp = torch.cat(((-0.5 * ((z2 - w1z) / 0.4)**2).view(-1, 1),
                         (-0.5 * ((z2 - w1z + w3z) / 0.35)**2).view(-1, 1)),
                        1)
        return -log_sum_exp(tmp, dim=1)

    return 1
def train_c(self, labeled_loader, unlabeled_loader):
    """Run one optimisation step of the classifier.

    Draws one labeled batch, one unlabeled batch and a batch of generated
    samples, builds the combined loss (supervised cross-entropy, GAN-style
    unlabeled loss, entropy term and optional consistency term) and steps
    ``self.classifier_opt``.

    Args:
        labeled_loader: object whose ``next()`` returns ``(data, labels)``.
        unlabeled_loader: object whose ``next()`` returns ``(data, _)``.

    Returns:
        Tuple of Python floats: ``(lab_loss, unl_loss, entropy)``.
    """
    args = self.args
    set_require_grad(self.classifier, requires_grad=True)

    # --- fetch the batches -------------------------------------------------
    lab_data, lab_labels = labeled_loader.next()
    lab_data, lab_labels = tensor2Var(lab_data), tensor2Var(lab_labels)
    lab_labels = lab_labels.view(-1)

    unl_data, _ = unlabeled_loader.next()
    unl_data = tensor2Var(unl_data)

    # One generated sample per unlabeled sample; detached so that no
    # gradient reaches the generator.
    noise = tensor2Var(create_noise(unl_data.size(0), args.noise_size))
    gen_data = self.gen(noise).detach()

    # --- classifier forward passes ----------------------------------------
    lab_logits = self.classifier(lab_data, 'class')
    unl_logits = self.classifier(unl_data, 'class')
    gen_logits = self.classifier(gen_data, 'class')

    # standard classification loss
    lab_loss = F.cross_entropy(lab_logits, lab_labels)

    unl_logsumexp = log_sum_exp(unl_logits)
    gen_logsumexp = log_sum_exp(gen_logits)

    # Diagnostics only (not returned): share of unlabeled samples scored as
    # real and of generated samples scored as fake.
    unl_acc = torch.mean(
        torch.sigmoid(unl_logsumexp.detach()).gt(0.5).float())
    gen_acc = torch.mean(
        torch.sigmoid(gen_logsumexp.detach()).lt(0.5).float())

    # Typical GAN cost where logsumexp(logits) is the sigmoid's input.
    true_loss = (- 0.5 * torch.mean(unl_logsumexp)
                 + 0.5 * torch.mean(F.softplus(unl_logsumexp)))
    fake_loss = 0.5 * torch.mean(F.softplus(gen_logsumexp))
    unl_loss = true_loss + fake_loss

    # Entropy of the predicted class distribution on unlabeled data.
    unl_prob = F.softmax(unl_logits, dim=1)
    entropy = -(unl_prob * torch.log(unl_prob + 1e-8)).sum(1).mean()

    c_loss = lab_loss + args.lambda_gan * unl_loss + args.lambda_e * entropy

    if args.lambda_consistency > 0:
        # Second forward pass on the same unlabeled batch; the squared
        # difference between both predictions is penalised.
        second_logits = self.classifier(unl_data, 'class')
        second_prob = F.softmax(second_logits, dim=1)
        consistency_loss = ((unl_prob - second_prob) ** 2).mean()
        c_loss += args.lambda_consistency * consistency_loss
        if self.total_iter % 1000 == 0:
            print(consistency_loss)

    # --- optimisation step -------------------------------------------------
    self.classifier_opt.zero_grad()
    c_loss.backward()
    self.classifier_opt.step()

    losses = (lab_loss, unl_loss, entropy)
    return tuple(value.cpu().item() for value in losses)
def _forward_alg(self, feats, sentence_masks, device):
    """
    Get alpha values for CRF

    :param feats: LSTM output, batch x max_seq x tag
    :param sentence_masks: binary (0,1) int matrix, batch x max_seq
    :param device: device info (only used when ``self.use_gpu`` is set)
    :return: alpha values for each sentence
    """
    batch_size, max_seq_length, tag_num = feats.size()
    # Number of real tokens per sentence (row sums of the mask).
    lengths = torch.sum(sentence_masks, 1)

    # 1 x tag: every tag invalid except the start tag.
    start = torch.Tensor(1, self.tag_set_size).fill_(
        Constants.Invalid_Transition)
    start[0][self.tag_to_id[Constants.Tag_Start]] = 0.

    # batch x 1 x tag
    forward_var = start.view(1, 1, tag_num).expand(batch_size, 1, tag_num)
    # max_seq x batch x tag: forward scores of every timestep
    all_alphas = torch.zeros((max_seq_length, batch_size, tag_num),
                             dtype=torch.float)
    if self.use_gpu:
        forward_var = forward_var.to(device)
        all_alphas = all_alphas.to(device)

    # batch x tag x tag; identical for every timestep
    transitions = self.transitions.view(1, tag_num, tag_num).expand(
        batch_size, tag_num, tag_num)

    for step in range(max_seq_length):
        # batch x tag -> batch x tag x 1
        emissions = feats[:, step, :].view(batch_size, tag_num, 1)
        # batch x tag x tag
        scores = forward_var + transitions + emissions
        # batch x tag
        step_alphas = log_sum_exp(scores, dim=2)
        forward_var = step_alphas.unsqueeze(1)
        all_alphas[step] = step_alphas

    # Pick, for each sentence, the alphas at its last real token.
    last_index = (lengths - 1).view(1, -1, 1).expand(1, -1, tag_num)
    # batch x tag
    last_alphas = torch.gather(all_alphas, 0, last_index).squeeze(0)

    end_row = self.transitions[self.tag_to_id[Constants.Tag_End], :]
    terminal = last_alphas + end_row.view(1, -1)
    # batch
    return log_sum_exp(terminal, dim=1)
def forward(self, h, mask):
    """Forward algorithm: return the partition function scores.

    Args:
        h: emission scores, indexed as ``h[:, t]`` per timestep.
        mask: per-position mask, indexed as ``mask[:, t]``; where it is 0
            the previous score is carried over unchanged.

    Returns:
        ``log_sum_exp`` of the final scores plus the ``EOS_IDX`` row of
        ``self.trans``.
    """
    # forward variables in log space, [B, C]; only SOS starts at 0
    alpha = Tensor(BATCH_SIZE, self.num_tags).fill_(-10000.)
    alpha[:, SOS_IDX] = 0.
    trans = self.trans.unsqueeze(0)  # [1, C, C]

    n_steps = h.size(1)
    for t in range(n_steps):  # recursion through the sequence
        keep = mask[:, t].unsqueeze(1)
        emit = h[:, t].unsqueeze(2)  # [B, C, 1]
        # [B, 1, C] -> [B, C, C]
        candidate = alpha.unsqueeze(1) + emit + trans
        # [B, C, C] -> [B, C]
        candidate = log_sum_exp(candidate)
        # masked positions keep their old score
        alpha = candidate * keep + alpha * (1 - keep)

    return log_sum_exp(alpha + self.trans[EOS_IDX])  # partition function
def partition(self, unary_pot):
    """Compute the partition function scores for a batch of sequences.

    Args:
        unary_pot: unary potentials, indexed as ``unary_pot[:, t, :]``
            (first axis is the batch, second the sequence position).

    Returns:
        ``utils.log_sum_exp`` of the final scores after adding the
        transition scores into ``self.END_IDX``.
    """
    batch = unary_pot.size()[0]
    n_labels = self.total_labels

    init = utils.Tensor(batch, n_labels).fill_(LOW_POT)
    init[:, self.START_IDX] = 0.0
    score = Variable(init)

    for t in range(unary_pot.size(1)):  # iterate through the sequence
        prev = score.unsqueeze(-1).expand(-1, -1, n_labels)
        emit = (unary_pot[:, t, :].unsqueeze(-1)
                .expand(-1, -1, n_labels)
                .transpose(1, 2))
        trans = (self.transition_table.unsqueeze(0)
                 .expand(batch, -1, -1)
                 .transpose(1, 2))
        score = utils.log_sum_exp(prev + emit + trans, 1)

    # take care of the transition to self.END_IDX
    end_row = self.transition_table[self.END_IDX].unsqueeze(0)
    score = score + end_row.expand_as(score)
    return utils.log_sum_exp(score)  # partition function
def iwae(nll, p_nu, q_nu, p_z, q_z, p_a, q_a, batch_sz, sz,
         num_importance_samples):
    """Build the four loss components of the importance-weighted objective.

    The ``nll`` and ``z`` terms are reduced over the importance samples with
    ``log_sum_exp``; the ``nu`` and ``a`` KL terms are summed, repeated once
    per batch element and divided by ``sz``.

    Returns:
        Tuple ``(nll_term, kl_nu_term, kl_z_term, kl_a_term)``.
    """
    kl = distributions.kl_divergence
    # log K corrects the log-sum-exp into a log-mean-exp
    log_k = np.log(num_importance_samples)
    sample_shape = (num_importance_samples, batch_sz)

    nll_term = -log_sum_exp(-nll.sum(1).view(*sample_shape), 0) + log_k
    # global variables: sampled once per batch, hence repeated and scaled
    kl_nu_term = kl(q_nu, p_nu).sum().repeat(batch_sz) / sz
    kl_z_term = -log_sum_exp(
        -kl(q_z, p_z).sum(1).view(*sample_shape), 0) + log_k
    kl_a_term = kl(q_a, p_a).sum().repeat(batch_sz) / sz

    return (nll_term, kl_nu_term, kl_z_term, kl_a_term)
def calc_weights(self, beta=1.0):
    """
    Calculate the canonical weights to be used in a canonical average
    for the stored list of energies.

    Returns a list with one weight per entry of
    ``self.bin_number_for_energies``, or one ``0.0`` per entry of
    ``self.energies`` when the support is empty.
    """
    if not numpy.any(self.support):
        return [0.0 for _ in self.energies]

    # Probability of each supported bin according to the canonical ensemble.
    lnGs = self.lnG[self.support]
    Es = self.bin_centers[self.support]
    lnZ = log_sum_exp(lnGs - beta*Es)
    probabilities = numpy.zeros(self.support.shape)
    probabilities[self.support] = exp(lnGs - beta*Es - lnZ)

    # Normalize the probabilities by the counts in the histogram.
    probabilities[self.support] /= self.histogram[self.support]

    # One weight per energy; negative bin numbers get weight zero.
    return [
        probabilities[bin_number] if 0 <= bin_number else 0.0
        for bin_number in self.bin_number_for_energies
    ]
def get_loss(self, scores, target, mask):
    """
    calculate viterbi loss

    args:
        scores (seq_len, bat_size, target_size_from, target_size_to) : class score for CRF
        target (seq_len, bat_size, 1) : crf label
        mask (seq_len, bat_size) : mask for crf label

    returns:
        (partition - gold path energy) / bat_size
    """
    seq_len = scores.size(0)
    bat_size = scores.size(1)

    # Energy of the gold path: pick the target entry at every position and
    # sum over the unmasked positions.
    tg_energy = torch.gather(scores.view(seq_len, bat_size, -1), 2,
                             target).view(seq_len, bat_size)
    tg_energy = tg_energy.masked_select(mask).sum()

    seq_iter = enumerate(scores)
    # Built-in next() works on Python 2 and 3; the former ``.next()``
    # method does not exist on Python 3 iterators.
    _, inivalues = next(seq_iter)
    # Only paths that leave the start tag are considered.
    partition = inivalues[:, self.start_tag, :].clone()

    for idx, cur_values in seq_iter:
        # Add the running partition (per from-tag) to every to-tag column.
        cur_values = cur_values + partition.contiguous().view(
            bat_size, self.tagset_size, 1).expand(
                bat_size, self.tagset_size, self.tagset_size)
        cur_partition = utils.log_sum_exp(cur_values, self.tagset_size)

        # Update only the sequences that are still active at this position.
        mask_idx = mask[idx, :].view(bat_size, 1).expand(
            bat_size, self.tagset_size)
        partition.masked_scatter_(mask_idx,
                                  cur_partition.masked_select(mask_idx))

    # Only paths that end in the end tag count.
    partition = partition[:, self.end_tag].sum()
    loss = (partition - tg_energy) / bat_size
    return loss
def forward(self, x, logdet, dsparams, mollify=0.0, delta=nn_.delta):
    """Apply the sigmoidal flow transform and update the log-determinant.

    Args:
        x: input, indexed as ``x[:, :, None]`` (two axes).
        logdet: running log-determinant that this layer's term is added to.
        dsparams: parameters; the last axis is split into three chunks of
            ``self.num_ds_dim`` (slopes, shifts, mixture weights).
        mollify: blend factor pulling the slope towards 1 and the shift
            towards 0.
        delta: squashes the pre-logit value away from 0 and 1.

    Returns:
        Tuple ``(xnew, logdet)``.
    """
    n = self.num_ds_dim
    raw_a = dsparams[:, :, :n]
    raw_b = dsparams[:, :, n:2 * n]
    raw_w = dsparams[:, :, 2 * n:3 * n]

    a_ = self.act_a(raw_a)
    b_ = self.act_b(raw_b)
    w = self.act_w(raw_w)

    # mollify = 1 gives slope 1 and shift 0
    a = a_ * (1 - mollify) + 1.0 * mollify
    b = b_ * (1 - mollify) + 0.0 * mollify

    pre_sigm = a * x[:, :, None] + b
    sigm = torch.sigmoid(pre_sigm)
    x_pre = torch.sum(w * sigm, dim=2)
    x_pre_clipped = x_pre * (1 - delta) + delta * 0.5
    # logit of the clipped mixture output
    xnew = log(x_pre_clipped) - log(1 - x_pre_clipped)

    # log-derivative of each mixture component, combined by log-sum-exp
    logj = (F.log_softmax(raw_w, dim=2)
            + nn_.logsigmoid(pre_sigm)
            + nn_.logsigmoid(-pre_sigm)
            + log(a))
    logj = utils.log_sum_exp(logj, 2).sum(2)
    # add the log-derivative of the clipping and of the logit
    logdet_ = (logj + np.log(1-delta)
               - (log(x_pre_clipped) + log(-x_pre_clipped+1)))
    logdet = logdet_.sum(1) + logdet

    return xnew, logdet
def _backward(self):
    """Backward pass: fill ``self.betas`` in log space.

    The last timestep is set to zero (log of one); earlier timesteps are
    filled from the end towards the start.
    """
    # zero because we operate in log space
    self.betas[-1, :] = 0

    n_states = self.k
    # buffer reused by the inner loop
    scratch = np.empty(n_states)

    # start from the second-to-last timestep and walk backwards
    for t in reversed(range(self.T - 1)):
        nxt = t + 1
        # state at timestep t (i and j are flipped w.r.t. the forward pass)
        for i in range(n_states):
            # state at timestep t + 1
            for j in range(n_states):
                log_emit = log_poisson_density(self.data[nxt], self.B[j])
                log_trans = np.log(self.A[i, j])
                log_beta = self.betas[nxt, j]
                scratch[j] = log_emit + log_trans + log_beta
            # numerically stable sum; value for state i at time t
            self.betas[t, i] = utils.log_sum_exp(scratch)
def forward_unlabeled(self, features):
    """Forward algorithm over ``features``; returns the log partition value.

    Args:
        features: iterable of per-position score expressions, picked by
            label index with ``dy.pick``.

    Returns:
        ``log_sum_exp`` of the final forward scores plus the transition
        scores into ``STOP``.
    """
    n_labels = self.num_labels

    # only START begins with a non-negligible score
    start_scores = [-1e10] * n_labels
    start_scores[self.label2idx[START]] = 0
    for_expr = dy.inputVector(start_scores)

    for obs in features:
        step = []
        for next_tag in range(n_labels):
            # same observation score for every previous label
            obs_broadcast = dy.concatenate(
                [dy.pick(obs, next_tag)] * n_labels)
            candidates = for_expr + self.transition[next_tag] + obs_broadcast
            step.append(log_sum_exp(candidates, n_labels))
        for_expr = dy.concatenate(step)

    terminal = for_expr + self.transition[self.label2idx[STOP]]
    return log_sum_exp(terminal, n_labels)
def latent_loss(outputs, target, device):
    """Numerically stable implementation of the language modeling loss.

    Args:
        outputs: pair ``(tag_logits, word_dist_logits)``; ``tag_logits`` is
            batch x sent_len x num_tags, ``word_dist_logits`` is a list with
            one batch x sent_len x vocab_j tensor per tag.
        target: batch x num_tags x sent_len word ids per tag vocabulary.
        device: device for the constant tag-index tensors.

    Returns:
        Tuple ``(summed loss over useful tokens, number of useful tokens)``.
    """
    tag_logits, word_dist_logits = outputs[0], outputs[1]
    n_tags = len(word_dist_logits)
    batch, sent_len = tag_logits.shape[0], tag_logits.shape[1]

    # log-probability of every tag at every position
    tag_criterion = nn.CrossEntropyLoss(reduction='none')
    tag_scores = []
    for j in range(n_tags):
        tag_index = torch.zeros(
            (batch, sent_len), dtype=torch.long, device=device) + j
        tag_scores.append(
            -tag_criterion(tag_logits.transpose(1, 2), tag_index))

    # log-probability of the word under every tag; targets that are missing
    # from a tag's vocabulary or are padding are ignored
    skip = ((target == Vocabulary.TOKEN_NOT_IN_TAGVOCAB) |
            (target == Vocabulary.PADTOKEN_FOR_TAGVOCAB))
    masked_target = target.clone()
    masked_target[skip] = -100
    word_criterion = nn.CrossEntropyLoss(reduction='none',
                                         ignore_index=-100)
    word_scores = []
    for j, word_logit in enumerate(word_dist_logits):
        word_scores.append(
            -word_criterion(word_logit.transpose(1, 2),
                            masked_target[:, j, :]))

    joint = torch.stack(tag_scores) + torch.stack(word_scores)

    # a tag whose vocabulary lacks the word contributes nothing
    out_of_vocab = (torch.transpose(target, 0, 1) ==
                    Vocabulary.TOKEN_NOT_IN_TAGVOCAB)
    joint[out_of_vocab] = float('-inf')
    per_token = -log_sum_exp(joint, dim=0)

    # drop tokens that appear in no tag vocabulary at all ...
    in_no_tag = torch.all(
        (torch.transpose(target, 1, 2) == Vocabulary.TOKEN_NOT_IN_TAGVOCAB),
        dim=-1)
    # ... and padding tokens
    is_padding = (target[:, 0, :] == Vocabulary.PADTOKEN_FOR_TAGVOCAB)
    excluded = (in_no_tag | is_padding)

    num_useful_tokens = (~excluded).sum().item()
    return torch.sum(per_token[~excluded]), num_useful_tokens
def forward(self, scores, target, mask):
    """
    args:
        scores (seq_len, bat_size, target_size_from, target_size_to) : crf scores
        target (seq_len, bat_size, 1) : golden state
        mask (size seq_len, bat_size) : mask for padding
    return:
        loss (divided by bat_size when ``self.average_batch`` is set)
    """
    seq_len, bat_size = scores.size(0), scores.size(1)
    n_tags = self.tagset_size

    # sentence score: gather the gold entries and sum the unmasked ones
    flat_scores = scores.view(seq_len, bat_size, -1)
    gold = torch.gather(flat_scores, 2, target).view(seq_len, bat_size)
    tg_energy = gold.masked_select(mask).sum()

    # forward partition score
    steps = enumerate(scores)
    # first position: bat_size * from_target_size * to_target_size,
    # only the row leaving <start> is needed
    _, first = next(steps)
    partition = first[:, self.start_tag, :].clone()  # bat_size * to_target

    for idx, cur_values in steps:
        # the previous to_target is the current from_target
        prev = partition.contiguous().view(bat_size, n_tags, 1)
        cur_values = cur_values + prev.expand(bat_size, n_tags, n_tags)
        # (bat_size * from_target * to_target) -> (bat_size * to_target)
        cur_partition = utils.log_sum_exp(cur_values, n_tags)

        # keep the old partition where the mask is 0, take the new one
        # where it is 1
        active = mask[idx, :].view(bat_size, 1).expand(bat_size, n_tags)
        partition.masked_scatter_(active,
                                  cur_partition.masked_select(active))

    # only paths that end at end_tag count
    partition = partition[:, self.end_tag].sum()

    loss = partition - tg_energy
    if self.average_batch:
        return loss / bat_size
    return loss
def forward_alg_pairwise(self, feats):
    """Forward algorithm using only pairwise potentials.

    Args:
        feats: iterable of score tensors, each viewed as a
            ``(tagset_size, tagset_size)`` matrix whose column ``next_tag``
            holds the scores of moving into ``next_tag``.

    Returns:
        ``utils.log_sum_exp`` of the final forward scores. All tags start
        with score 0 and no stop transition is added.
    """
    n_tags = self.tagset_size
    forward_var = torch.full((1, n_tags), 0, dtype=torch.float,
                             requires_grad=True).to(device=self.device)

    for feat in feats:
        pairwise = feat.view(n_tags, n_tags)
        step = []
        for next_tag in range(n_tags):
            incoming = pairwise[:, next_tag].view(1, -1)
            step.append(utils.log_sum_exp(forward_var + incoming).view(1))
        forward_var = torch.cat(step).view(1, -1)

    return utils.log_sum_exp(forward_var)
def _log_p(self, data, params):
    """Return the log-sum-exp over all states of prior plus likelihood.

    For every aligned entry of the per-state sequences on ``data``, the
    state's ``log_pi`` is added to ``self._log_binomial_likelihood(...)``
    evaluated at ``params.x``; the resulting list is reduced with
    ``log_sum_exp``.
    """
    states = zip(data.cn_n, data.cn_r, data.cn_v,
                 data.mu_n, data.mu_r, data.mu_v, data.log_pi)
    return log_sum_exp([
        log_pi + self._log_binomial_likelihood(
            data.b, data.d, cn_n, cn_r, cn_v, mu_n, mu_r, mu_v, params.x)
        for cn_n, cn_r, cn_v, mu_n, mu_r, mu_v, log_pi in states
    ])
def forward(self, x, logdet, dsparams):
    """Apply the dense sigmoidal flow layer and propagate the log-Jacobian.

    Args:
        x: input, indexed as ``x[:, :, None, :]`` (three axes).
        logdet: incoming log-Jacobian, indexed as
            ``logdet[:, :, None, :, :]`` (four axes).
        dsparams: parameters; along the last axis the first three chunks of
            ``self.hidden_dim`` give slope, shift and a mixture-weight
            offset, and the last ``self.in_dim`` entries an input-weight
            offset.

    Returns:
        Tuple ``(xnew, logdet)``.
    """
    delta = nn_.delta
    # offset added to the raw slope parameters before their activation
    inv = np.log(np.exp(1 - delta) - 1)
    h = self.hidden_dim

    pre_u = (self.u_[None, None, :, :]
             + dsparams[:, :, -self.in_dim:][:, :, None, :])
    pre_w = (self.w_[None, None, :, :]
             + dsparams[:, :, 2 * h:3 * h][:, :, None, :])
    a = self.act_a(dsparams[:, :, :h] + inv)
    b = self.act_b(dsparams[:, :, h:2 * h])
    w = self.act_w(pre_w)
    u = self.act_u(pre_u)

    pre_sigm = torch.sum(u * a[:, :, :, None] * x[:, :, None, :], 3) + b
    sigm = torch.sigmoid(pre_sigm)
    x_pre = torch.sum(w * sigm[:, :, None, :], dim=3)
    x_pre_clipped = x_pre * (1 - delta) + delta * 0.5
    # logit of the clipped mixture output
    xnew = log(x_pre_clipped) - log(1 - x_pre_clipped)

    pre_sigm_b = pre_sigm[:, :, None, :]
    # n, d, d2, dh
    logj = (F.log_softmax(pre_w, dim=3)
            + nn_.logsigmoid(pre_sigm_b)
            + nn_.logsigmoid(-pre_sigm_b)
            + log(a[:, :, None, :]))
    # n, d, d2, dh, d1
    logj = (logj[:, :, :, :, None]
            + F.log_softmax(pre_u, dim=3)[:, :, None, :, :])
    # n, d, d2, d1
    logj = utils.log_sum_exp(logj, 3).sum(3)

    logit_term = (log(x_pre_clipped) + log(-x_pre_clipped+1))
    logdet_ = logj + np.log(1-delta) - logit_term[:, :, :, None]

    # n, d, d2, d1, d0 -> n, d, d2, d0
    logdet = utils.log_sum_exp(
        logdet_[:, :, :, :, None] + logdet[:, :, None, :, :], 3).sum(3)

    return xnew, logdet
def _forward_alg(self, feats):
    """Run the CRF forward algorithm and return the log partition value.

    Args:
        feats: iterable of per-token emission scores, each indexed by tag id.

    Returns:
        ``utils.log_sum_exp`` of the final forward scores plus the
        transition scores into the stop tag.
    """
    n_tags = self.tagset_size

    # Constant initial scores, created directly on the target device.
    # They are deliberately NOT created with requires_grad=True: on CPU
    # ``.to(device)`` returns the same leaf tensor and the in-place
    # assignment below would raise "a view of a leaf Variable that requires
    # grad is being used in an in-place operation".
    init_alphas = torch.full((1, n_tags), -100, dtype=torch.float,
                             device=self.device)
    # START_TAG has all of the score.
    init_alphas[0][self.tag_to_ix[DatasetPreprosessed.__START_TAG__]] = 0.

    forward_var = init_alphas

    # Iterate through the sentence
    for feat in feats:
        alphas_t = []  # The forward tensors at this timestep
        for next_tag in range(n_tags):
            # broadcast the emission score: it is the same regardless of
            # the previous tag
            emit_score = feat[next_tag].view(1, -1).expand(1, n_tags)
            # the ith entry of trans_score is the score of transitioning to
            # next_tag from i
            trans_score = self.transitions[next_tag].view(1, -1).expand(
                1, n_tags)
            assert emit_score.size() == trans_score.size()
            # The ith entry of next_tag_var is the value for the
            # edge (i -> next_tag) before we do log-sum-exp
            next_tag_var = forward_var + trans_score + emit_score
            # The forward variable for this tag is log-sum-exp of all the
            # scores.
            alphas_t.append(utils.log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)

    terminal_var = forward_var + self.transitions[self.tag_to_ix[
        DatasetPreprosessed.__STOP_TAG__]]
    alpha = utils.log_sum_exp(terminal_var)
    return alpha
def fit(self, x, max_iter=1):
    """Fit the mixture parameters to the sequences ``x`` with EM.

    Args:
        x: 2-D array of integer state sequences, one sequence per row
            (``n_seq, n_t = x.shape``; entries are used as state indices).
        max_iter: number of EM iterations.

    Side effects:
        Overwrites ``self.params['transition_matrix']``,
        ``self.params['initial_probs']`` and
        ``self.params['component_probs']``, and appends one value per
        iteration to ``self.history['train_loglike']``.
    """
    # ---- random initialisation -------------------------------------------
    params = self.params
    n_comp = params['n_components']
    n_states = params['n_states']

    transition_matrix = normalize(
        np.random.rand(n_comp, n_states, n_states), axis=1)
    self.params['transition_matrix'] = transition_matrix
    init_probs = normalize(np.random.rand(n_comp, n_states), axis=1)
    self.params['initial_probs'] = init_probs

    n_seq, n_t = x.shape
    # Drawn (and immediately superseded by the first E-step) so that the
    # random stream matches the original implementation.
    comp_post = normalize(np.random.rand(n_comp, n_seq))
    comp_probs = normalize(np.random.rand(n_comp))
    self.params['component_probs'] = comp_probs

    # ---- sufficient statistics per sequence ------------------------------
    # transition_counts[n, cur, prev]: how often sequence n moves prev->cur
    transition_counts = np.zeros((n_seq, n_states, n_states))
    # one-hot encoding of each sequence's first state
    init_states_dummy = np.zeros((n_seq, n_states))
    for n, seq in enumerate(x):
        init_states_dummy[n, seq[0]] = 1
        for pos in range(1, n_t):
            transition_counts[n, seq[pos], seq[pos - 1]] += 1
    transition_counts = transition_counts.reshape(-1, n_states**2)

    # ---- EM iterations ---------------------------------------------------
    iters = 0
    for _ in range(max_iter):
        log_trans = log_clip(transition_matrix).reshape(n_comp, -1)
        log_init = log_clip(init_probs)
        log_comp = log_clip(comp_probs)

        # E-step: component log-likelihood of every sequence
        comp_loglikes = ((log_init @ init_states_dummy.T) +
                         (log_trans @ transition_counts.T) +
                         log_comp.reshape(-1, 1))
        comp_post = exp_normalize(comp_loglikes)
        self.history['train_loglike'].append(log_sum_exp(comp_loglikes))

        # M-step: re-estimate from the posterior-weighted statistics
        init_probs = normalize(comp_post @ init_states_dummy, axis=1)
        weighted_counts = (comp_post @ transition_counts).reshape(
            -1, n_states, n_states)
        transition_matrix = normalize(weighted_counts, axis=1)
        comp_probs = normalize(
            comp_post.sum(axis=1, keepdims=True)).reshape(-1)

        # Update the stored parameters
        self.params['transition_matrix'] = transition_matrix
        self.params['initial_probs'] = init_probs
        self.params['component_probs'] = comp_probs
def forward_alg_unary(self, feats):
    """Forward algorithm with unary scores and a learned transition matrix.

    Args:
        feats: iterable of per-token emission scores, each indexed by tag id.

    Returns:
        ``utils.log_sum_exp`` of the final forward scores plus the column
        of ``self.transitions`` that leads into ``self.__stop__``.
    """
    n_tags = self.tagset_size

    # Constant initial scores, created directly on the target device.
    # They are deliberately NOT created with requires_grad=True: on CPU
    # ``.to(device)`` returns the same leaf tensor and the in-place
    # assignment below would raise "a view of a leaf Variable that requires
    # grad is being used in an in-place operation".
    init_alphas = torch.full((1, n_tags), -100., dtype=torch.float,
                             device=self.device)
    # the start tag has all of the score
    init_alphas[0][self.__start__] = 0.
    forward_var = init_alphas

    for feat in feats:
        alphas_t = []
        for next_tag in range(n_tags):
            # emission score is the same for every previous tag
            emit_score = feat[next_tag].view(1, -1).expand(1, n_tags)
            # column next_tag: scores of moving from each tag into next_tag
            trans_score = self.transitions[:, next_tag].view(1, -1)
            assert emit_score.size() == trans_score.size()
            next_tag_var = forward_var + emit_score + trans_score
            alphas_t.append(utils.log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)

    terminal_var = forward_var + self.transitions[:, self.__stop__].view(
        1, -1)
    alpha = utils.log_sum_exp(terminal_var)
    return alpha
def estimate_agg_posterior(z: MultiGaussian, z_samples=None
                           ) -> torch.Tensor:
    """Estimate the log aggregate posterior at each sample.

    Every sample is scored under every posterior in the batch; the
    ``batch_size x batch_size`` log-probabilities are reduced over the
    posteriors with ``utils.log_sum_exp`` and ``log(batch_size)`` is
    subtracted.

    Args:
        z: batch of Gaussian posteriors (``z.mu`` is batch_size x zdim).
        z_samples: optional samples; drawn with ``z.sample()`` when omitted.

    Returns:
        The reduced log-probabilities (one per sample).
    """
    batch_size, zdim = z.mu.size()
    if z_samples is None:
        z_samples = z.sample()

    def tile_rows(t):
        # repeat the whole batch once per sample: (B, D) -> (B * B, D)
        return (t.unsqueeze(0).expand(batch_size, batch_size, -1)
                .reshape(-1, zdim))

    all_posteriors = MultiGaussian(mu=tile_rows(z.mu),
                                   var_logit=tile_rows(z.var_logit))
    # repeat every sample once per posterior: (B, D) -> (B * B, D)
    tiled_samples = (z_samples.unsqueeze(1)
                     .expand(batch_size, batch_size, -1)
                     .reshape(-1, zdim))
    log_qzx = all_posteriors.log_prob(tiled_samples)

    pairwise = log_qzx.reshape(batch_size, batch_size)
    return (utils.log_sum_exp(pairwise, 1) - math.log(batch_size))
def _get_useful_funcs(self):
    """Compile the helper functions and attach the two samplers.

    After the parent's helpers are built this sets ``self.project``,
    ``self.dais_``, ``self.dais_y`` and ``self.dais_h``.
    """
    super(MLPWeightNorm_BHN_dais, self)._get_useful_funcs()

    self.project = theano.function([self.input_var], self.hs)

    # second symbolic input: the points to evaluate
    input2 = T.matrix('input2')
    h2 = get_output(self.hiddens, input2)
    y2 = get_output(self.p_net, input2)
    dais_inputs = [
        self.input_var, self.target_var, input2, self.weight,
        self.dataset_size
    ]
    # returns [loss, y2, *h2]
    self.dais_ = theano.function(dais_inputs, [self.loss, y2] + h2)

    # exp(x - logsumexp(x)): normalised weights from log weights
    imps_ = T.vector('imps_')
    logsoftmax_exp = theano.function([imps_],
                                     T.exp(imps_ - log_sum_exp(imps_)))

    def dais_y(refx, refy, newx, n_iw, n=None):
        # weighted average of the outputs over n_iw draws
        if n is None:
            n = refx.shape[0]
        log_weights = np.zeros(n_iw).astype('float32')
        outputs = np.zeros(
            (n_iw, newx.shape[0], self.n_classes)).astype('float32')
        for draw in range(n_iw):
            result = self.dais_(refx, refy, newx, 1.0, n)
            log_weights[draw] = result[0]
            outputs[draw] = result[1]
        weights = logsoftmax_exp(log_weights)
        return (outputs * weights[:, None, None]).sum(0)

    def dais_h(refx, refy, newx, n_iw, n=None):
        # hidden activations of one draw, chosen according to the weights
        if n is None:
            n = refx.shape[0]
        log_weights = np.zeros(n_iw).astype('float32')
        hiddens = list()
        for draw in range(n_iw):
            result = self.dais_(refx, refy, newx, 1.0, n)
            log_weights[draw] = result[0]
            hiddens.append(result[2:])
        weights = logsoftmax_exp(log_weights)
        chosen = np.random.multinomial(1, weights).argmax()
        return hiddens[chosen]

    self.dais_y = dais_y
    self.dais_h = dais_h
def forward_labeled(self, id, features, marginals):
    """Forward algorithm with a per-position marginal term added.

    Args:
        id: unused by this method.
        features: iterable of per-position score expressions, picked by
            label index with ``dy.pick``.
        marginals: array turned into a tensor and indexed by position; row
            ``pos`` is added to the forward scores after position ``pos``.

    Returns:
        ``log_sum_exp`` of the final forward scores plus the transition
        scores into ``STOP``.
    """
    n_labels = self.num_labels

    # only START begins with a non-negligible score
    start_scores = [-1e10] * n_labels
    start_scores[self.label2idx[START]] = 0
    for_expr = dy.inputVector(start_scores)

    marginal = dy.inputTensor(marginals)

    for pos, obs in enumerate(features):
        step = []
        for next_tag in range(n_labels):
            # same observation score for every previous label
            obs_broadcast = dy.concatenate(
                [dy.pick(obs, next_tag)] * n_labels)
            candidates = for_expr + self.transition[next_tag] + obs_broadcast
            step.append(log_sum_exp(candidates, n_labels))
        for_expr = dy.concatenate(step) + marginal[pos]

    terminal = for_expr + self.transition[self.label2idx[STOP]]
    return log_sum_exp(terminal, n_labels)
def _calc_log_probs(self, labels, q):
    """Return the log-probabilities of ``labels`` under the mixture ``q``.

    Dispatches on ``self._dist_type`` ('logistic' or 'gaussian'); any other
    value falls through and returns ``None``.
    """
    dist_type = self._dist_type
    k = self._num_components

    if dist_type == 'logistic':
        return discretized_mix_logistic_log_probs_nd(
            labels, q, nr_mix=k, ndims=self._ndims)

    if dist_type == 'gaussian':
        if len(self._num_classes) != 1:
            return mvn_mix_log_probs(labels, q, self._ndims, k)

        # single-dimensional case: q holds [weights | means | raw scales]
        pi = tf.nn.softmax(q[:, :k])
        mus = q[:, k:2 * k]
        sigmas = tf.nn.softplus(q[:, k * 2:])
        component_log_probs = tf.contrib.distributions.Normal(
            mus, sigmas).log_prob(labels)
        return log_sum_exp(tf.log(pi) + component_log_probs)
def elbo(self, logits, targets, criterion, means, logvars, args,
         iwae=False, num_importance_samples=3, prior_means=None):
    """
    If iwae == False, then this returns (elbo, elbo, ...), otherwise it
    returns (iwae, elbo, ...)

    The remaining tuple entries are the NLL term, the KL term and a token
    count (``seq_len * batch_size``, divided by the number of importance
    samples in the IWAE case).
    """
    seq_len, batch_size, ntokens = logits.size()

    # NLL (the criterion takes the sum, not the mean)
    NLL = criterion(logits.view(-1, ntokens), targets.view(-1))
    if iwae:
        per_sequence = NLL.view(seq_len, batch_size).sum(0)
        NLL = torch.stack(
            torch.chunk(per_sequence, num_importance_samples, 0))

    # KL against a unit-variance Gaussian, centred at prior_mean if given
    KL = 0
    if prior_means is not None:
        for mean, prior_mean, logvar in zip(means, prior_means, logvars):
            KL += -0.5 * torch.sum(
                1 + logvar - (mean - prior_mean).pow(2) - logvar.exp(), -1)
    else:
        for mean, logvar in zip(means, logvars):
            KL += -0.5 * torch.sum(
                1 + logvar - mean.pow(2) - logvar.exp(), -1)

    if not iwae:
        elbo_loss = NLL.sum() + args.anneal * KL.sum()
        return (elbo_loss, elbo_loss, NLL.sum(), KL.sum(),
                seq_len * batch_size)

    KL = torch.stack(torch.chunk(KL, num_importance_samples, 0))
    assert args.anneal == 1, "can't mix annealing and IWAE"
    iwae_loss = (-(log_sum_exp(-(NLL + KL), dim=0)) +
                 math.log(num_importance_samples)).sum()
    elbo_loss = (NLL + KL).mean(0).sum()
    n_tokens = (seq_len * batch_size) / num_importance_samples
    return (iwae_loss, elbo_loss, NLL.mean(0).sum(), KL.mean(0).sum(),
            n_tokens)
def hard_mining(self, conf_data, conf_t, pos, num):
    """Select hard negative boxes by classification loss.

    Args:
        conf_data: class scores, viewed as ``(-1, self.num_classes)``.
        conf_t: target class per box.
        pos: boolean mask of positive boxes, one row per image.
        num: number of images; the per-box loss is viewed as ``(num, -1)``.

    Returns:
        Boolean mask (same shape as the ranked loss) marking the selected
        negatives.
    """
    # per-box confidence loss across the whole batch
    scores = conf_data.view(-1, self.num_classes)
    target_scores = scores.gather(1, conf_t.long().view(-1, 1))
    loss_c = log_sum_exp(scores) - target_scores

    # positives are filtered out for now
    loss_c[pos.view(-1, 1)] = 0
    loss_c = loss_c.view(num, -1)

    # rank of every box when sorted by decreasing loss
    order = loss_c.sort(1, descending=True)[1]
    rank = order.sort(1)[1]

    # keep at most negpos_ratio negatives per positive
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    return rank < num_neg.expand_as(rank)
def _forward_alg(self, feats):
    '''
    Forward algorithm in the log domain; returns Z(x).

    feats is len(sentence) * tagset_size. All tags of a timestep are
    processed at once and the log-sum-exp is computed inline.
    '''
    # every tag starts at -10000. except START_TAG, which has all the score
    start = torch.Tensor(1, self.tagset_size).fill_(-10000.)
    start[0][self.tag_to_ix[START_TAG]] = 0.

    # wrapped in a Variable so that we get automatic backprop
    forward_var = autograd.Variable(start)
    if self.use_gpu:
        forward_var = forward_var.cuda()

    for feat in feats:
        # column vector: the emission score is the same regardless of the
        # previous tag
        emissions = feat.view(-1, 1)
        # entry (next_tag, i) is the score of the edge i -> next_tag
        edge_scores = forward_var + self.transitions + emissions

        # numerically stable log-sum-exp over the previous tags
        row_max, _ = torch.max(edge_scores, dim=1)
        shifted = edge_scores - row_max.view(-1, 1)
        log_sums = torch.log(torch.sum(torch.exp(shifted), dim=1))
        forward_var = row_max + log_sums.view(1, -1)

    stop_row = self.transitions[self.tag_to_ix[STOP_TAG]]
    terminal = (forward_var + stop_row).view(1, -1)
    return log_sum_exp(terminal)  # Z(x)
def _log_p(self, data, params):
    """Combine the per-state log terms of ``data`` with ``log_sum_exp``.

    The sequences ``data.cn_n``, ``data.cn_r``, ``data.cn_v``, ``data.mu_n``,
    ``data.mu_r``, ``data.mu_v`` and ``data.log_pi`` are walked in lockstep.
    Each step contributes its ``log_pi`` plus
    ``self._log_binomial_likelihood`` evaluated for that step with
    ``data.b``, ``data.d`` and ``params.x``.
    """
    states = zip(
        data.cn_n, data.cn_r, data.cn_v,
        data.mu_n, data.mu_r, data.mu_v,
        data.log_pi,
    )
    ll = [
        log_pi + self._log_binomial_likelihood(
            data.b, data.d, cn_n, cn_r, cn_v, mu_n, mu_r, mu_v, params.x)
        for cn_n, cn_r, cn_v, mu_n, mu_r, mu_v, log_pi in states
    ]
    return log_sum_exp(ll)
def calc_weights(self, energies, beta=1.0):
    """
    Return one canonical-average weight per entry of ``energies``.

    The energies are histogrammed over ``self.binning``.  Bins that are both
    flagged in ``self.lnG_support`` and populated by at least one energy form
    the common support.  On that support each bin receives
    ``exp(lnG - beta*E - lnZ)`` divided by its histogram count, and every
    energy is given the value of the bin ``self.calc_bin`` assigns it to.
    Energies whose bin index is out of range, or whose bin lies outside the
    support, get ``0.0``.  If the support is empty all weights are ``0.0``.
    """
    counts = numpy.histogram(energies, bins=self.binning)[0]
    # Some numpy versions return an extra trailing bin holding the values
    # that fall outside the histogram; discard it.
    if len(counts) == len(self.lnG_support) + 1:
        counts = counts[:-1]

    support = self.lnG_support & (counts > 0)
    if not numpy.any(support):
        return [0.0 for _ in energies]

    # Canonical log-weights of the supported bins, normalised by lnZ.
    log_terms = self.lnG[support] - beta * self.bin_centers[support]
    lnZ = log_sum_exp(log_terms)
    P = numpy.zeros(support.shape)
    P[support] = exp(log_terms - lnZ)
    # Spread each bin's probability over the energies that landed in it.
    P[support] /= counts[support]

    def weight_of(energy):
        bin_number = self.calc_bin(energy)
        if 0 <= bin_number < len(self.bin_centers):
            return P[bin_number]
        return 0.0

    return [weight_of(energy) for energy in energies]
def _forward(self):
    """
    Fill ``self.alphas`` with the log-domain forward values of the sequence
    ``self.data``.

    Row 0 is ``log(self.pi)`` plus the log emission density of the first
    observation under each of the ``self.k`` classes.  Every later row
    ``t`` is built from row ``t - 1``: for each class ``j`` the terms
    ``log(A[i, j]) + alphas[t - 1, i]`` are combined over all previous
    classes ``i`` with ``utils.log_sum_exp``, and the log emission density
    of observation ``t`` under class ``j`` is added.

    Nothing is returned; ``self.alphas`` is updated in place.
    """
    # The transition matrix is fixed for the whole pass, so its logarithm
    # is taken once here rather than once per (timestep, i, j) triple.
    log_A = np.log(self.A)

    # First timestep: start log-probabilities plus the first emission.
    self.alphas[0, :] = np.log(self.pi)
    for i in range(self.k):
        self.alphas[0, i] += log_poisson_density(self.data[0], self.B[i])

    # Scratch buffer reused for every (timestep, class) pair.
    timestep_values = np.empty(self.k)

    # tidx starts at 1 because row 0 of alphas is already initialised.
    for tidx, value in enumerate(self.data[1:], 1):
        for j in range(self.k):
            # Log-score of arriving in class j from each previous class i.
            for i in range(self.k):
                timestep_values[i] = log_A[i, j] + self.alphas[tidx - 1, i]
            # Numerically stable sum over the previous classes.
            timestep_total = utils.log_sum_exp(timestep_values)
            # Log-density of emitting `value` from class j.
            emission_prob = log_poisson_density(value, self.B[j])
            self.alphas[tidx, j] = timestep_total + emission_prob
def e_step(self):
    """Iteratively refresh ``self.taus`` from the current parameters.

    Up to ``self.e_iterations`` sweeps are made.  In each sweep, entry
    ``(i, k)`` becomes ``log(pis[k])`` plus, over every other index ``j`` and
    every class ``l``, the previous ``taus[j, l]`` times the log Poisson
    density of ``data[i, j]`` under ``gammas[k, l]``.  Rows are then
    normalised with ``utils.log_sum_exp`` and exponentiated.  The sweeps
    stop early once the largest absolute change falls below 1.

    The original carried commented-out sanity checks, not enforced here:
    rows of ``taus`` sum to 1, ``pis`` sums to 1, and ``taus``, ``pis`` and
    ``gammas`` are all non-negative.
    """
    for _ in range(self.e_iterations):
        # Snapshot so every update in this sweep reads the previous values.
        previous_taus = copy.deepcopy(self.taus)

        for node in range(self.N):
            for label in range(self.k):
                log_score = np.log(self.pis[label])
                for other in range(self.N):
                    if other != node:
                        for other_label in range(self.k):
                            log_edge = log_poisson_density(
                                self.data[node, other],
                                self.gammas[label, other_label])
                            log_score += previous_taus[other, other_label] * log_edge
                self.taus[node, label] = log_score

        # Normalise every row in the log domain ...
        for node in range(self.N):
            self.taus[node, :] -= utils.log_sum_exp(self.taus[node, :])
        # ... and map back out of the log domain.
        self.taus = np.exp(self.taus)

        # NOTE(review): a threshold of 1 looks very loose for a convergence
        # check on values the original notes describe as probabilities -
        # confirm the intended tolerance.
        largest_change = np.max(np.abs(self.taus - previous_taus))
        if largest_change < 1:
            break
def log_likelihood(self):
    """Return ``utils.log_sum_exp`` applied to the values from ``self.log_c()``."""
    log_terms = self.log_c()
    return utils.log_sum_exp(log_terms)
def lnZ(self, beta):
    """
    Return the logarithm of the partition function at ``beta``.

    Computed as ``log_sum_exp`` over ``self.lnGs - beta * self.Es``.
    """
    log_terms = self.lnGs - beta * self.Es
    return log_sum_exp(log_terms)