def merge_final_log_probs(self, source1_decoder_attention_score,
                          source2_decoder_attention_score,
                          source1_local_words_ids, source2_local_words_ids,
                          gate_score):
    """Combine the two gated attention distributions into one log-prob table.

    Source-1 scores are weighted by ``gate_score`` and source-2 scores by
    ``1 - gate_score``. Each time step's log-probability is then merged
    (via ``logsumexp``) into the column selected by the corresponding entry
    of the local word ids.

    Returns:
        Tensor of shape (group_size, 2 * self.max_seq_len) holding the
        accumulated log-probabilities.
    """
    tiny = 1e-45  # keeps log() finite for zero probabilities

    group_size, seq_max_len_1 = source1_decoder_attention_score.size()
    group_size, seq_max_len_2 = source2_decoder_attention_score.size()

    # Gate broadcast along each source's time axis:
    # (group_size, seq_max_len_1) and (group_size, seq_max_len_2).
    gate_1 = gate_score.expand(seq_max_len_1, -1).t()
    gate_2 = (1 - gate_score).expand(seq_max_len_2, -1).t()

    # Gate-weighted scores and their logs, same shapes as the inputs.
    weighted_1 = source1_decoder_attention_score * gate_1
    weighted_2 = source2_decoder_attention_score * gate_2
    log_probs_1 = (weighted_1 + tiny).log()
    log_probs_2 = (weighted_2 + tiny).log()

    # Start from (almost) zero probability everywhere.
    final_log_probs = (
        weighted_1.new_zeros((group_size, 2 * self.max_seq_len)) + tiny
    ).log()

    # Source 1 is folded in first, then source 2, one time step at a time.
    passes = (
        (log_probs_1, source1_local_words_ids, seq_max_len_1),
        (log_probs_2, source2_local_words_ids, seq_max_len_2),
    )
    for log_probs, word_ids, num_steps in passes:
        for step in range(num_steps):
            # (group_size, 1): log-prob and target column at this step.
            step_log_prob = log_probs[:, step].unsqueeze(-1)
            target_column = word_ids[:, step].unsqueeze(-1)
            # Value currently stored at the target column.
            current = final_log_probs.gather(-1, target_column)
            # Mix old and new mass in log space.
            merged = logsumexp(
                torch.cat((current, step_log_prob), dim=-1)).unsqueeze(-1)
            final_log_probs = final_log_probs.scatter(
                -1, target_column, merged)

    return final_log_probs
def loss(self, input, target, mask=None):
    '''
    Args:
        input: Tensor
            the input tensor with shape = [batch, length, input_size]
        target: Tensor
            the tensor of target labels with shape [batch, length]
        mask: Tensor or None
            the mask tensor with shape = [batch, length]

    Returns: Tensor
        A 1D tensor for minus log likelihood loss
    '''
    batch, length, _ = input.size()
    energy = self.forward(input, mask=mask)
    # shape = [length, batch, num_label, num_label]
    energy_transpose = energy.transpose(0, 1)
    # shape = [length, batch]
    target_transpose = target.transpose(0, 1)
    # shape = [length, batch, 1]
    mask_transpose = None
    if mask is not None:
        mask_transpose = mask.unsqueeze(2).transpose(0, 1)

    # Allocate the helper tensors on the same device as the input instead of
    # branching on is_cuda and calling .cuda(), which always targets the
    # *current* CUDA device and breaks when the input lives on another GPU.
    device = input.device
    # shape = [batch]
    batch_index = torch.arange(0, batch, dtype=torch.long, device=device)
    # The last label index is used as the "previous label" of step 0.
    prev_label = torch.full((batch,), self.num_labels - 1,
                            dtype=torch.long, device=device)
    tgt_energy = torch.zeros(batch, device=device)

    # shape = [batch, num_label]
    partition = None
    for t in range(length):
        # shape = [batch, num_label, num_label]
        curr_energy = energy_transpose[t]
        if t == 0:
            partition = curr_energy[:, -1, :]
        else:
            # shape = [batch, num_label]
            partition_new = utils.logsumexp(
                curr_energy + partition.unsqueeze(2), dim=1)
            if mask_transpose is None:
                partition = partition_new
            else:
                # Only advance the partition where the mask is set.
                mask_t = mask_transpose[t]
                partition = partition + (partition_new - partition) * mask_t
        # Energy of the gold transition prev_label -> target at step t.
        tgt_energy += curr_energy[batch_index, prev_label,
                                  target_transpose[t].data]
        prev_label = target_transpose[t].data

    return utils.logsumexp(partition, dim=1) - tgt_energy
def gammaKsi(self, logB):
    """ Compute gamma (posterior distribution) and Ksi (joint successive
    posterior distribution) values.

    gamma[i,n] = conditional probability of the event state 'i' at time 'n',
    given the complete observation sequence.
    ksi[n,i,j] = joint posterior probability of two successive hidden states
    'i' and 'j' at time 'n'.

    Parameters
    ----------
    logB : ndarray
        The observation probability matrix in logarithmic space.

    Returns
    -------
    llh : float
        The normalized log-likelihood.
    logGamma : ndarray
        The log posterior distribution.
    logKsi : ndarray
        The log joint posterior probability distribution.
    logAlpha : ndarray
        The log scaled alpha distribution.
    logBeta : ndarray
        The log scaled beta distribution.
    """
    K = self.K
    N = logB.shape[1]
    # Builtin float replaces the removed np.float alias (same dtype).
    logKsi = np.zeros((N - 1, K, K), dtype=float)

    logAlpha = self.alpha(logB)
    logBeta = self.beta(logB)
    loglikelihood = logsumexp(logAlpha[:, -1])

    # compute gamma, normalized over states at every time step
    logGamma = logAlpha + logBeta
    logGamma -= logsumexp(logGamma, 0)

    # compute ksi, normalized over all (i, j) pairs at every time step
    for n in range(N - 1):
        temp = logB[:, n + 1] + logBeta[:, n + 1]
        # The last row of logA is skipped here; alpha() uses it for n = 0.
        logKsi[n, :, :] = (logAlpha[:, n][:, np.newaxis]
                           + self.logA[:-1][:, :] + temp)
        logKsi[n, :, :] -= logsumexp(logKsi[n, :, :].flatten())

    return (loglikelihood, logGamma, logKsi, logAlpha, logBeta)
def compute_estimator(self, log_p_all, log_q_all):
    """Build the multi-sample likelihood estimate and parameter gradients.

    Both inputs are symbolic matrices of shape (batch_size, n_samples).
    Returns the batch-averaged likelihood estimate and the gradients of the
    surrogate estimator with respect to ``self.params``.
    """
    num_samples = tt.shape(log_p_all)[1]

    # See equation 14, for definition of I see equation 2.
    # Log importance ratios, shape (batch_size, n_samples).
    log_ratio = log_p_all - log_q_all
    # Log of the summed ratios, shape (batch_size,), plus a column view.
    log_ratio_total = logsumexp(log_ratio, axis=1)
    log_ratio_total_col = log_ratio_total.dimshuffle(0, 'x')
    # Per-example estimate, shape (batch_size,).
    bound = log_ratio_total - tt.log(num_samples)

    # Equation 10: leave-one-out sums and the resulting learning signal.
    leave_one_out = logsubexp(log_ratio_total_col, log_ratio)
    leave_one_out_mean = leave_one_out - np.log(num_samples - 1).astype(
        theano.config.floatX)
    learning_signal = bound.dimshuffle(0, 'x') - leave_one_out_mean

    # Equation 11: score-function term plus normalized-weight term.
    score_term = learning_signal * log_q_all
    normalized_weights = tt.exp(log_ratio - log_ratio_total_col)
    weight_term = normalized_weights * log_ratio

    estimator = (score_term + weight_term).sum() / self.batch_size
    # The weights and the learning signal are held fixed when differentiating.
    gradients = tt.grad(
        estimator,
        self.params.values(),
        consider_constant=[normalized_weights, learning_signal])

    likelihood = bound.sum() / self.batch_size
    return likelihood, gradients
def log_cond_prob(self, y, y_pred):
    """Return ``sum(y * y_pred, 2) - logsumexp(y_pred, 2)``.

    NOTE(review): presumably ``y`` is one-hot along axis 2, making this the
    log-softmax value of the labelled class -- confirm against callers.
    """
    log_normalizer = logsumexp(y_pred, 2)
    selected_score = torch.sum(y * y_pred, 2)
    return selected_score - log_normalizer
def compute_estimator(self, log_p_all, log_q_all):
    """Return the batch-averaged likelihood estimate and its gradients.

    ``log_p_all`` and ``log_q_all`` are symbolic (batch_size, n_samples)
    matrices. Gradients are taken with respect to ``self.params`` while the
    normalized weights and the learning signal are treated as constants.
    """
    def as_column(vector):
        # (batch_size,) -> (batch_size, 1) so it broadcasts over samples.
        return vector.dimshuffle(0, 'x')

    n_samples = tt.shape(log_p_all)[1]

    # See equation 14, for definition of I see equation 2.
    log_w = log_p_all - log_q_all             # (batch_size, n_samples)
    log_w_sum = logsumexp(log_w, axis=1)      # (batch_size,)
    per_example = log_w_sum - tt.log(n_samples)   # (batch_size,)

    # Equation 10: estimate with sample i removed, then the learning signal.
    without_i = logsubexp(as_column(log_w_sum), log_w)
    without_i_normalized = without_i - np.log(n_samples - 1).astype(
        theano.config.floatX)
    signal = as_column(per_example) - without_i_normalized

    # Equation 11, part 1: learning signal times log q.
    first_part = signal * log_q_all
    # Equation 11, part 2: normalized importance weights times log ratio.
    norm_w = tt.exp(log_w - as_column(log_w_sum))
    second_part = norm_w * log_w

    estimator = (first_part + second_part).sum() / self.batch_size
    gradients = tt.grad(estimator, self.params.values(),
                        consider_constant=[norm_w, signal])
    likelihood = per_example.sum() / self.batch_size
    return likelihood, gradients
def logProbability(self, x):
    """Average the flow's probability of ``x`` over the symmetry operations.

    The flow's log-probability is evaluated on ``x`` and on ``op(x)`` for
    every ``op`` in ``self.symmetryList``; the results are combined with
    ``logsumexp`` and divided (in log space) by the number of terms.
    """
    per_view = [self.flow.logProbability(x)]
    per_view.extend(
        self.flow.logProbability(op(x)) for op in self.symmetryList)
    num_views = len(self.symmetryList) + 1
    return logsumexp(per_view).view(-1) - math.log(num_views)
def dp_inside_batch(batch_size, sentence_len, tags_dim, weights):
    """Run the batched inside dynamic program over constituent spans.

    A table of log-scores with shape
    (batch_size, sentence_len * sentence_len * 8, tags_dim, tags_dim) is
    filled span by span, in the order given by ``test_constituent_indexes``.

    Args:
        batch_size: number of sentences in the batch.
        sentence_len: sentence length used to size and index the table.
        tags_dim: size of each of the two tag axes.
        weights: 5-d tensor of log-weights, indexed here as
            ``weights[:, l, :, r, :]``.

    Returns:
        (inside_table, ll): the filled table and the combined log-score of
        the two full-sentence spans, of shape (batch_size, 1).
    """
    # Allocate next to `weights` rather than on "CUDA if available", so CPU
    # weights on a CUDA machine (or weights on a non-default GPU) do not
    # trigger cross-device arithmetic errors below.
    inside_table = torch.full(
        (batch_size, sentence_len * sentence_len * 8, tags_dim, tags_dim),
        -np.inf, dtype=torch.double, device=weights.device)

    m = sentence_len
    (seed_spans, base_left_spans, base_right_spans, left_spans, right_spans,
     ijss, ikss, kjss, id_span_map, span_id_map) = test_constituent_indexes(
         m, False)

    # View with the two tag axes swapped; loop-invariant, so built once.
    swap_weights = weights.permute(0, 1, 4, 3, 2)

    for ii in seed_spans:
        inside_table[:, ii, :, :] = 0.0

    for ii in base_right_spans:
        (l, r, c) = id_span_map[ii]
        inside_table[:, ii, :, :] = swap_weights[:, r, :, l, :]

    for ii in base_left_spans:
        (l, r, c) = id_span_map[ii]
        inside_table[:, ii, :, :] = weights[:, l, :, r, :]

    for ij in ijss:
        (l, r, c) = id_span_map[ij]
        if ij in left_spans:
            ids = span_id_map.get((l, r, get_state_code(0, 0, 0)), -1)
            prob = inside_table[:, ids, :, :] + weights[:, l, :, r, :]
            inside_table[:, ij, :, :] = utils.logaddexp(
                inside_table[:, ij, :, :], prob)
        elif ij in right_spans:
            ids = span_id_map.get((l, r, get_state_code(0, 0, 0)), -1)
            prob = inside_table[:, ids, :, :] + swap_weights[:, r, :, l, :]
            inside_table[:, ij, :, :] = utils.logaddexp(
                inside_table[:, ij, :, :], prob)
        else:
            # Combine every (i..k, k..j) split; sum out the split index and
            # the shared middle tag axis.
            num_k = len(ikss[ij])
            beta_ik = inside_table[:, ikss[ij], :, :]
            beta_kj = inside_table[:, kjss[ij], :, :]
            probs = (
                beta_ik.contiguous().view(
                    batch_size, num_k, tags_dim, tags_dim, 1)
                + beta_kj.contiguous().view(
                    batch_size, num_k, 1, tags_dim, tags_dim))
            probs = utils.logsumexp(probs, axis=(1, 3))
            inside_table[:, ij, :, :] = utils.logaddexp(
                inside_table[:, ij, :, :], probs)

    id1 = span_id_map.get((0, m - 1, get_state_code(0, 1, 0)), -1)
    id2 = span_id_map.get((0, m - 1, get_state_code(0, 1, 1)), -1)
    score1 = inside_table[:, id1, 0, :].contiguous().view(
        batch_size, 1, tags_dim)
    score2 = inside_table[:, id2, 0, :].contiguous().view(
        batch_size, 1, tags_dim)
    ll = utils.logaddexp(utils.logsumexp(score1, axis=2),
                         utils.logsumexp(score2, axis=2))
    return inside_table, ll
def predict(self, data, mc_test):
    """Run the model on ``data`` and return (mean prediction, negative LL).

    The prediction is averaged over axis 0 of the likelihood output. The
    negative log-likelihood combines the per-sample log conditional
    probabilities with ``logsumexp`` over axis 0 and subtracts
    ``log(mc_test)``.
    """
    predictions_op = self.likelihood.predict(self.layer_out)
    log_cond = self.likelihood.log_cond_prob(self.Y, self.layer_out)
    per_point_ll = -np.log(mc_test) + utils.logsumexp(log_cond, 0)
    nll_op = - tf.reduce_sum(per_point_ll)

    feeds = {self.X: data.X, self.Y: data.Y, self.mc: mc_test}
    pred, neg_ll = self.session.run([predictions_op, nll_op], feed_dict=feeds)
    return np.mean(pred, 0), neg_ll
def _calculate_log_p(self, W, b):
    """Log-density of ``W`` and ``b`` under a two-component Gaussian mixture.

    Each element is scored under zero-mean Gaussians with scales
    ``self.sigma1`` and ``self.sigma2``, mixed with weights ``self.pi`` and
    ``1 - self.pi``. The element-wise results are summed over both tensors.
    """
    log_weight_1 = float(np.log(self.pi))
    log_weight_2 = float(np.log(1 - self.pi))

    total = 0.
    for param in (W, b):
        components = torch.stack([
            log_weight_1 + self._calculate_log_gaussian(param, 0, self.sigma1),
            log_weight_2 + self._calculate_log_gaussian(param, 0, self.sigma2),
        ])
        total += logsumexp(components).sum()
    return total
def estimatepostduration(self, logalpha, logbeta, logB, rankn, g, llh):
    """Estimate state durations based on the posterior distribution.

    Since the durations are truncated by the timeout parameter, we use a
    distribution free method.

    Parameters
    -----------
    logalpha : ndarray
        Log scaled alpha distribution.
    logbeta : ndarray
        Log scaled beta values.
    logB : ndarray
        Observation probability distribution in log-space.
    rankn : ndarray
        The top ranked 'n' for each state 'k', used to estimate state
        durations.
    g : ndarray
        Log scaled posterior distribution ('logGamma').
    llh : float
        The normalized log-likelihood.

    Returns
    --------
    ndarray of int
        The estimated duration in each state (largest expected value over
        that state's ranked positions, truncated to int).
    ndarray
        The expected value of the state duration at each 'rankn' entry.

    Notes
    ------
    The QDHMM EM algorithm requires good initial estimates of the model
    parameters in order to converge to a good solution. We propose a
    distribution free method to find the expected value of state durations
    in a standard HMM model, which is then used to initialize the QDHMM
    'tau' parameters.
    """
    sub = len(rankn[0])
    N = logalpha.shape[1]
    K = self.K
    res = np.zeros((K, sub))

    for k in range(K):
        # Every state other than k, in ascending order.
        others = [i for i in range(K) if i != k]
        for idx, n in enumerate(rankn[k]):
            # Base case
            tmp = ((N - 1 - n) * self.logA[k, k]
                   - np.log(1 - np.exp(self.logA[k, k])))
            # Induction, walking backwards from N-2 down to n+1
            for t in range(N - 2, n, -1):
                tmp = np.logaddexp(
                    logbeta[k, t], logB[k, t + 1] + self.logA[k, k] + tmp)
            const = np.array([
                logalpha[i, n] + self.logA[i, k] + logB[k, n + 1]
                - (g[i, n] + llh)
                for i in others
            ])
            res[k, idx] = logsumexp(const) + tmp

    expected = np.exp(res)
    durations = np.max(expected, 1)
    return durations.astype(int), expected
def predict(self, latent_val):
    """Return probabilities by normalizing ``latent_val`` over axis 2.

    :param latent_val: tensor of unnormalized log-scores
    :return: ``exp(latent_val - logsumexp(latent_val, axis 2))``
    """
    log_normalizer = utils.logsumexp(latent_val, 2)
    # Re-insert the reduced axis so the subtraction broadcasts.
    normalized = latent_val - tf.expand_dims(log_normalizer, 2)
    return tf.exp(normalized)
def predict(self, y_pred):
    """Return probabilities by normalizing ``y_pred`` over axis 2.

    param: y_pred -- unnormalized log-scores; the normalizer is expanded to
        (self.mc, self.batch_size, self.classes) before subtraction.
    return: tensor of probabilities with that same shape.
    """
    full_shape = (self.mc, self.batch_size, self.classes)
    log_normalizer = logsumexp(y_pred, 2).unsqueeze(2).expand(*full_shape)
    return torch.exp(y_pred - log_normalizer)
def log_likelihood(X_apps, theta_unconstrained, pi_unconstrained):
    r"""Total log-likelihood of ``X_apps`` under a Bernoulli mixture.

    X_apps.shape = [n, 1, d]
    theta_unconstrained \in \mathbb{R}^{k, d}, where d is the number of app
    cats; used as the Bernoulli logits of each of the k components.
    pi_unconstrained holds the unnormalized mixture weights (normalized here
    with a log-softmax over the last axis).

    Returns the sum over data points of the log mixture density.
    """
    # NOTE: the docstring is raw because it contains LaTeX backslashes,
    # which are invalid escape sequences in a normal string literal.
    log_pi = logsoftmax(pi_unconstrained, dim=-1)
    # Per-component log-probability of each row, summed over the d features.
    logp_theta = torch.sum(
        Bernoulli(logits=theta_unconstrained).log_prob(X_apps), dim=-1)
    # Marginalize over mixture components.
    logps = logsumexp(log_pi + logp_theta, dim=-1)
    return logps.sum()
def logposterior(self, data):
    """
    Calculate log-posterior over scales given the data points.

    The joint from ``self.logjoint`` is normalized along axis 0.

    @type  data: array_like
    @param data: data stored in columns
    """
    log_joint = self.logjoint(data)
    log_evidence = logsumexp(log_joint, 0)
    return log_joint - log_evidence
def predict_log_proba(self, X):
    """
    Estimate log-probability

    The decision values of each row are shifted by the row maximum and then
    normalized with ``logsumexp`` along axis 1.

    :param X: array-like passed to ``self._decision_function``
    :return: ndarray with the same shape as the decision values, holding
        row-normalized log-probabilities
    """
    scores = self._decision_function(X)
    row_max = scores.max(axis=1)[:, np.newaxis]
    shifted = scores - row_max
    log_normalizer = logsumexp(shifted, axis=1)[:, np.newaxis]
    return shifted - log_normalizer
def get_const(log_lambdas, desired_k):
    """Find the scale whose expected sample size matches ``desired_k``.

    A log-space offset ``x`` is optimized so that
    ``expected_k(log_lambdas + x) + desired_k * remaining_prob`` is as close
    as possible to ``desired_k`` (squared error); ``exp(x)`` is returned.
    """
    remaining_prob = 1 - np.exp(utils.logsumexp(log_lambdas))

    def squared_gap(x):
        achieved = expected_k(log_lambdas + x) + desired_k * remaining_prob
        return (desired_k - achieved) ** 2

    # Initial guess derived from the naive log(desired_k) offset.
    naive_inc_probs = np.log(desired_k) + log_lambdas
    correction = desired_k / expected_k(naive_inc_probs)
    initial = correction * desired_k

    solution = opt.minimize(squared_gap, np.log(initial))
    return np.exp(solution.x[0])
def step(obs, prev):
    # Pair every previous-state log-probability with every observation
    # log-probability ("outer sum"), then add the transition
    # log-probabilities to obtain the (num_states, num_states) matrix of
    # log-probabilities of moving between states while emitting `obs`.
    paired = outer_sum(prev, obs, batch)
    moves = paired + transitions
    # Summing in probability space (logsumexp in log space) along `axis`
    # leaves the total log-probability of each state after the step.
    return logsumexp(moves, axis=axis)
def get_agg_kl(data, model, meta_optimizer):
    """Estimate the KL between the aggregate posterior and the prior.

    Every batch in ``data`` is encoded (and, when ``args.model == 'savae'``,
    refined by ``meta_optimizer``); one latent sample per data point is
    kept. For each sample the aggregate-posterior log-density (a mixture
    over all N encodings) is compared with a zero-mean, zero-log-variance
    Gaussian prior.

    Also prints the number of "active units": latent dimensions whose
    posterior-mean variance across the whole data set exceeds 0.02.

    Returns the average of ``log q(z) - log p(z)`` over all samples.
    """
    model.eval()
    criterion = nn.NLLLoss().cuda()
    means = []
    logvars = []
    all_z = []
    for i in range(len(data)):
        sents, length, batch_size = data[i]
        if args.gpu >= 0:
            sents = sents.cuda()
        mean, logvar = model._enc_forward(sents)
        z_samples = model._reparameterize(mean, logvar)
        if args.model == 'savae':
            mean_svi = Variable(mean.data, requires_grad=True)
            logvar_svi = Variable(logvar.data, requires_grad=True)
            var_params_svi = meta_optimizer.forward(
                [mean_svi, logvar_svi], sents)
            mean_svi_final, logvar_svi_final = var_params_svi
            z_samples = model._reparameterize(
                mean_svi_final, logvar_svi_final)
            # NOTE(review): nll_svi and kl_svi are not used below; the calls
            # are kept in case the forward passes matter to callers.
            preds = model._dec_forward(sents, z_samples)
            nll_svi = sum([
                criterion(preds[:, l], sents[:, l + 1])
                for l in range(length)
            ])
            kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
            mean, logvar = mean_svi_final, logvar_svi_final
        means.append(mean.data)
        logvars.append(logvar.data)
        all_z.append(z_samples.data)

    means = torch.cat(means, 0)
    logvars = torch.cat(logvars, 0)
    all_z = torch.cat(all_z, 0)
    N = float(means.size(0))
    mean_prior = torch.zeros(1, means.size(1)).cuda()
    logvar_prior = torch.zeros(1, means.size(1)).cuda()

    agg_kl = 0.
    count = 0.
    for i in range(all_z.size(0)):
        z_i = all_z[i].unsqueeze(0).expand_as(means)
        # log q(z|x) for all x
        log_agg_density = utils.log_gaussian(z_i, means, logvars)
        log_q = utils.logsumexp(log_agg_density, 0)
        log_q = -np.log(N) + log_q
        log_p = utils.log_gaussian(
            all_z[i].unsqueeze(0), mean_prior, logvar_prior)
        agg_kl += log_q.sum() - log_p.sum()
        count += 1

    # Use the posterior means of ALL data points; the previous code used
    # `mean`, i.e. only the last batch processed by the loop above.
    mean_var = means.var(0)
    print('active units', (mean_var > 0.02).float().sum())
    print(mean_var)
    return agg_kl / count
def log_sample_poisson(log_lambdas, k=1, normalize=True, seed=0):
    """Draw an independent-inclusion sample from log-space weights.

    Item ``i`` is included when the log of a fresh uniform draw falls below
    its log inclusion probability ``log(k) + log_lambdas[i]`` (minus the
    ``logsumexp`` of ``log_lambdas`` when ``normalize`` is set).

    NumPy's global random state is re-seeded with ``seed`` on every call.

    Returns (list of selected indices, log inclusion probabilities).
    """
    np.random.seed(seed=seed)

    inc_probs = np.log(k) + log_lambdas
    if normalize:
        inc_probs -= utils.logsumexp(log_lambdas)

    # One uniform draw per item, consumed in index order.
    selected = [
        index
        for index, log_prob in enumerate(inc_probs)
        if np.log(np.random.uniform()) < log_prob
    ]
    return selected, inc_probs
def logdrcdf(norm):
    """
    Logarithm of the derivative of the radial CDF.

    One row per scale of ``self.gsm`` is computed and the rows are combined
    with ``logsumexp`` along axis 0.
    """
    num_scales = self.gsm.num_scales
    root_scales = sqrt(self.gsm.scales)

    # one row of log-terms per mixture scale
    terms = zeros([num_scales, len(norm)])
    for j in range(num_scales):
        log_prior = log(self.gsm.priors[j])
        log_radial = logdgrcdf(root_scales[j] * norm, self.gsm.dim)
        terms[j, :] = log_prior + log_radial + log(root_scales[j])

    return logsumexp(terms, 0)
def posterior(self, data):
    """
    Compute posterior over component indices.

    Each row holds one component's expected log-likelihood of the data
    columns, offset by ``psi(self.gamma) - psi(sum(self.gamma))``; rows are
    normalized along axis 0 and exponentiated.
    """
    log_resp = empty([len(self), data.shape[1]])
    for k, component in enumerate(self.components):
        log_resp[k] = component.expected_log_likelihood(data)

    log_resp += (psi(self.gamma) - psi(sum(self.gamma)))
    log_resp -= logsumexp(log_resp, 0)
    return exp(log_resp)
def logdrcdf(norm):
    """
    Logarithm of the derivative of the radial CDF.

    Evaluated per scale of ``self.gsm`` and marginalized over scales in
    log space.
    """
    gsm = self.gsm

    # allocate memory
    per_scale = zeros([gsm.num_scales, len(norm)])
    std_factors = sqrt(gsm.scales)

    for j in range(gsm.num_scales):
        factor = std_factors[j]
        per_scale[j, :] = (log(gsm.priors[j])
                           + logdgrcdf(factor * norm, gsm.dim)
                           + log(factor))

    return logsumexp(per_scale, 0)
def compute_log_model_probability(scores, ranking, gpu_id=None):
    """Log-probability of ``ranking`` under sequential softmax selection.

    At each position the chosen item's score is normalized against every
    item not yet chosen. Items already chosen are pushed out of the
    normalizer by subtracting ``score + 1e6`` from them, which is the
    "more stable version" of removing them.

    Returns the sum of the per-position log-probabilities.
    """
    masked_out = torch.zeros_like(scores)
    step_log_probs = torch.zeros_like(scores)
    if gpu_id is not None:
        masked_out, step_log_probs = convert_vars_to_gpu(
            [masked_out, step_log_probs], gpu_id)

    num_items = scores.size()[0]
    for position in range(num_items):
        chosen = ranking[position]
        log_normalizer = logsumexp(scores - masked_out, dim=0)
        step_log_probs[position] = scores[chosen] - log_normalizer
        # Exclude the chosen item from all later normalizers.
        masked_out[chosen] = scores[chosen] + 1e6

    return torch.sum(step_log_probs)
def sample_nav_article_nav_assignments(self, article_id):
    """Resample a topic assignment for every nav of ``article_id``.

    For each nav the article's latest log topic proportions are added to
    the per-topic nav log-probabilities, normalized, and one topic index is
    drawn with ``choice``.

    Returns the list of sampled topic indices, one per nav.
    """
    log_article_proportions = np.log(
        self.nav_article_proportions[-1][article_id])
    topic_ids = range(self.num_nav_topics)

    sampled = []
    for nav_id in self.training_data.article_navs[article_id]:
        log_nav_given_topic = np.array([
            self.calculate_topic_nav_logprob(topic_id, nav_id)
            for topic_id in topic_ids
        ])
        log_unnormalized = log_article_proportions + log_nav_given_topic
        probabilities = np.exp(
            log_unnormalized - logsumexp(log_unnormalized))
        sampled.append(choice(self.num_nav_topics, p=probabilities))
    return sampled
def xe(z, targets, predict=False, error=False, addon=0):
    """
    Cross entropy error.

    With ``predict`` set, returns the argmax of ``z`` along axis 1 instead.
    Otherwise returns the mean negative normalized log-score at ``targets``
    plus ``addon``; with ``error`` set, additionally returns
    ``(exp(normalized scores) - one_hot(targets)) / n``.
    """
    if predict:
        return gpu.argmax(z, axis=1)

    log_probs = z - logsumexp(z, axis=1)
    n, _ = log_probs.shape
    rows = np.arange(n)
    loss = -gpu.mean(log_probs[rows, targets])

    if not error:
        return loss + addon

    err = gpu.exp(log_probs)
    err[rows, targets] -= 1
    return loss + addon, err / n
def calc_marg_log_prob(X, obs_rvs, w, all_rvs_params=None, g=None):
    """
    Calculate marginal log probabilities of observations

    :param X: M x N_o matrix of (partial) observations, where N_o is the
        number of obs nodes; alternatively a N_o vector
    :param obs_rvs: length N_o list of observed rvs
    :param w: mixture weights; their log is added to the component
        log-probabilities before reducing
    :param all_rvs_params: forwarded to ``calc_marg_comp_log_prob``
    :param g: forwarded to ``calc_marg_comp_log_prob``
    :return: log-probabilities with the last (component) axis reduced
    """
    # M x K component-wise log probabilities
    per_component = calc_marg_comp_log_prob(
        X, obs_rvs, all_rvs_params=all_rvs_params, g=g)
    weighted = np.log(w) + per_component
    # reduce along the last dimension
    return utils.logsumexp(weighted, axis=-1)
def logLikelihood(self, X, Xcov):
    """Compute the log-likelihood of data given the model

    Parameters
    ----------
    X: array_like
        data, shape = (n_samples, n_features)
    Xcov: array_like
        covariances, shape = (n_samples, n_features, n_features)

    Returns
    -------
    logL : float
        log-likelihood
    """
    # Reduce the last axis of logprob_a's output, then total everything.
    per_sample = logsumexp(self.logprob_a(X, Xcov), -1)
    return np.sum(per_sample)
def logposterior(self, data):
    """
    Computes the log-posterior distribution over components.

    @type  data: array_like
    @param data: data points stored in columns

    @rtype: ndarray
    @return: a posterior distribution for each data point
    """
    # compute unnormalized log-posterior, one row per component
    # (a list is passed because vstack requires a sequence; a bare map()
    # is a one-shot iterator on Python 3 and is rejected)
    logpost = vstack([
        self[i].loglikelihood(data) + log(self.priors[i])
        for i in range(len(self))
    ])

    # normalize posterior
    return asarray(logpost) - logsumexp(logpost, 0)
def loglikelihood(self, data):
    """
    Computes the log-likelihood of the model for the given data.

    @type  data: array_like
    @param data: data points stored in columns

    @rtype: ndarray
    @return: a log-likelihood for each data point
    """
    # compute joint density over components and data points
    # (a list is passed because vstack requires a sequence; a bare map()
    # is a one-shot iterator on Python 3 and is rejected)
    logjoint = vstack([
        self[i].loglikelihood(data) + log(self.priors[i])
        for i in range(len(self))
    ])

    # marginalize over components
    return logsumexp(logjoint, 0).flatten()
def get_negative_log_likelihood(self, source1_decoder_attention_score,
                                source2_decoder_attention_score,
                                source1_token_mask, source2_token_mask,
                                target_to_source1, target_to_source2,
                                target_tokens, gate_score):
    """Return ``log(p1 * gate + p2 * (1 - gate))`` for the current step.

    ``p1`` / ``p2`` are the attention mass each source places on positions
    selected by ``target_to_source1`` / ``target_to_source2``.

    NOTE(review): despite the name, no negation is applied here; the value
    returned is the step log-likelihood. The token masks and
    ``target_tokens`` are not read by this method.
    """
    tiny = 1e-20  # keeps log() finite

    def _selected_log_mass(attention_score, target_to_source):
        # Log of the attention mass on the selected source positions.
        mass = (attention_score * target_to_source.float()).sum(-1)
        return (mass + tiny).log()

    combined_log_probs_1 = _selected_log_mass(
        source1_decoder_attention_score, target_to_source1)
    combined_log_probs_2 = _selected_log_mass(
        source2_decoder_attention_score, target_to_source2)

    # Gate and complementary gate in log space, shape (batch_size,).
    log_gate_1 = (gate_score + tiny).log()
    log_gate_2 = (1 - gate_score + tiny).log()

    # Stack both gated terms on a new last axis and merge them.
    gated_1 = (log_gate_1 + combined_log_probs_1).unsqueeze(-1)
    gated_2 = (log_gate_2 + combined_log_probs_2).unsqueeze(-1)
    return logsumexp(torch.cat((gated_1, gated_2), -1))
def alpha(self, logB):
    """ Compute alpha (forward) distribution.

    alpha[i,n] = joint probability of being in state i, after observing
    1..N observations.

    Parameters
    ----------
    logB : ndarray
        The observation probability matrix in logarithmic space, of shape
        (K, N).

    Returns
    -------
    logalpha : ndarray
        The log scaled alpha distribution.

    Notes
    -----
    Refer to Tobias Mann's paper [1]_ for the motivation behind the scaling
    factors used here. Note that this scaling method is suitable when the
    dynamics of the system is not highly sparse. Adaptation of log-scaling
    in the QDHMM would require the user to construct a new sparse data
    structure.

    References
    ----------
    .. [1] Mann, T. P. Numerically Stable Hidden Markov Model
       Implementation 2006.
    """
    K = self.K
    N = logB.shape[1]
    assert logB.shape == (K, N)

    # Builtin float replaces the removed np.float alias (same dtype).
    logAlpha = np.zeros((K, N), dtype=float)

    # Base case, when n=0: the last row of logA supplies the start term.
    logAlpha[:, 0] = self.logA[-1] + logB[:, 0]

    # Induction: the remaining rows of logA are the state transitions.
    for n in range(1, N):
        logAlpha[:, n] = logsumexp(
            self.logA[:-1][:, :].T + logAlpha[:, n - 1], 1) + logB[:, n]

    return logAlpha
def beta(self, logB):
    """ Compute beta (backward) distribution.

    beta[i,n] = conditional probability of generating observations
    Y_n+1..Y_N, given Z_n.

    Parameters
    ----------
    logB : ndarray
        The observation probability matrix in logarithmic space.

    Returns
    -------
    logbeta : ndarray
        The log scaled beta distribution.

    Notes
    -----
    Refer to Tobias Mann's paper [1]_ for the motivation behind the scaling
    factors used here. Note that this scaling method is suitable when the
    dynamics of the system is not highly sparse. Adaptation of log-scaling
    in the QDHMM would require the user to construct a new sparse data
    structure.

    References
    ----------
    .. [1] Mann, T. P. Numerically Stable Hidden Markov Model
       Implementation 2006.
    """
    K = self.K
    N = logB.shape[1]

    # Builtin float replaces the removed np.float alias (same dtype).
    logBeta = np.zeros((K, N), dtype=float)

    # Base case when n = N
    logBeta[:, -1] = 0.0

    # Induction, walking backwards; the last row of logA is skipped.
    for n in range(N - 2, -1, -1):
        logBeta[:, n] = logsumexp(
            logBeta[:, n + 1] + self.logA[:-1][:, :] + logB[:, n + 1], 1)

    return logBeta
def get_agg_kl(q_data, test_data, model):
    """Estimate the KL between the aggregate posterior and the prior.

    The aggregate posterior is the mixture of the encoder distributions of
    every image in ``q_data``. It is evaluated at one latent sample per
    image of ``test_data`` and compared with a zero-mean,
    zero-log-variance Gaussian prior.

    Also prints the number of latent dimensions whose posterior-mean
    variance over ``q_data`` exceeds 0.02 ("active units").

    Returns the average of ``log q(z) - log p(z)`` over the test samples.
    """
    model.eval()

    # Encode the reference set that defines the aggregate posterior.
    means, logvars, all_z = [], [], []
    for img, _ in q_data:
        img = Variable(img.cuda())
        mean, logvar = model._enc_forward(img)
        z_samples = model._reparameterize(mean, logvar)
        means.append(mean.data)
        logvars.append(logvar.data)
        all_z.append(z_samples.data)
    means = torch.cat(means, 0)
    logvars = torch.cat(logvars, 0)

    N = float(means.size(0))
    latent_dim = means.size(1)
    mean_prior = torch.zeros(1, latent_dim).cuda()
    logvar_prior = torch.zeros(1, latent_dim).cuda()

    agg_kl = 0.
    count = 0.
    for img, _ in test_data:
        img = Variable(img.cuda())
        mean, logvar = model._enc_forward(img)
        z_samples = model._reparameterize(mean, logvar).data
        for i in range(z_samples.size(0)):
            z_row = z_samples[i].unsqueeze(0)
            # log q(z|x) for all x
            log_agg_density = utils.log_gaussian(
                z_row.expand_as(means), means, logvars)
            log_q = -np.log(N) + utils.logsumexp(log_agg_density, 0)
            log_p = utils.log_gaussian(z_row, mean_prior, logvar_prior)
            agg_kl += log_q.sum() - log_p.sum()
            count += 1

    mean_var = means.var(0)
    print('active units', (mean_var > 0.02).float().sum())
    print(mean_var)
    return agg_kl / count
def fit(self, obs, logweights):
    """Fit a Gaussian to the state distributions after observing the data.

    Updates ``self.mu`` and ``self.covar`` in place for every state.

    Parameters
    -----------
    obs : ndarray
        Observation sequence.
    logweights : ndarray
        The weights attached to each state (posterior distribution).
        In log-space; the pieces are concatenated along axis 1.
    """
    log_gamma = np.concatenate(logweights, 1)
    # Per-state weights normalized to sum to one over time.
    log_totals = logsumexp(log_gamma, 1)[:, np.newaxis]
    state_weights = np.exp(log_gamma - log_totals)

    for k in range(self.K):
        weights_k = state_weights[k, :]
        # Weighted mean of the observations for state k.
        self.mu[:, k] = np.dot(weights_k[np.newaxis, :], obs.T)
        # Weighted covariance around the freshly updated mean.
        centered = obs - self.mu[:, k][:, np.newaxis]
        self.covar[k, :, :] = np.dot(centered * weights_k, centered.T)
def forward(observations, transitions, sequence_len, batch=False):
    """Implementation of the forward algorithm in Keras.

    Returns the log probability of the given observations and transitions
    by recursively summing over the probabilities of all paths through
    the state space. All probabilities are in logarithmic space.
    See e.g. https://en.wikipedia.org/wiki/Forward_algorithm .

    Args:
        observations (tensor): A tensor of the observation log
            probabilities, shape (sequence_len, num_states) if
            batch is False, (batch_size, sequence_len, num_states)
            otherwise.
        transitions (tensor): A (num_states, num_states) tensor of
            the transition weights (log probabilities).
        sequence_len (int): The number of steps in the sequence.
            This must be given because unrolling scan() requires a
            definite (not tensor) value.
        batch (bool): Whether to run in batchwise mode. If True, the
            first dimension of observations corresponds to the batch.

    Returns:
        Total log probability if batch is False or vector of log
        probabilities otherwise.
    """
    step = make_forward_step(transitions, batch)

    # The first observation seeds the recursion; the rest are scanned.
    if batch:
        first, rest = observations[:, 0, :], observations[:, 1:, :]
    else:
        first, rest = observations[0, :], observations[1:, :]
    remaining_steps = sequence_len - 1

    outputs, _ = scan(step, rest, first, n_steps=remaining_steps, batch=batch)

    # Sum out the states of the final step.
    if batch:
        last, axis = outputs[:, remaining_steps - 1], 1
    else:
        last, axis = outputs[remaining_steps - 1], 0
    return logsumexp(last, axis=axis)
def log_likelihood(self, reward_hypothesis, q_values, demonstrations):
    """Log-likelihood of the demonstrations under the given Q-values.

    Args:
        reward_hypothesis: reward weights; only inspected when
            ``self.prior == "non-pos"``, in which case any positive weight
            yields ``-inf``.
        q_values: flat list [q(s0,a0), q(s1,a0), ..., q(sn,am)], indexed as
            ``s + self.num_states * a``.
        demonstrations: list of (state, action) pairs.

    Returns:
        The summed log-likelihood. Terminal states and pairs whose action
        is None contribute nothing.

    Raises:
        NotImplementedError: if ``self.likelihood`` is neither "birl" nor
            "uniform".
    """
    # A non-positive prior rules out any hypothesis with a positive weight.
    if self.prior == "non-pos" and any(r > 0 for r in reward_hypothesis):
        return -np.inf

    def q(state, action):
        return q_values[state + self.num_states * action]

    actions = range(self.num_actions)
    total = 0.0
    for s, a in demonstrations:
        # there are no counterfactuals in a terminal state
        if s in self.mdp_env.terminals or a is None:
            continue

        if self.likelihood == "birl":
            # Softmax choice model: chosen exponent minus log-partition.
            exponents = [self.beta * q(s, b) for b in actions]
            total += self.beta * q(s, a) - utils.logsumexp(exponents)
        elif self.likelihood == "uniform":
            # Penalize every action that looks better than the demo action.
            hinge = sum((max(q(s, b) - q(s, a), 0.0) for b in actions), 0.0)
            total += -self.beta * hinge
        else:
            raise NotImplementedError

    return total
def runSmc(args):
    # Run one SMC experiment end to end: initialize, run, print diagnostics,
    # optionally sanity-check the particle weights, then evaluate on the
    # training and test splits.
    #
    # `args` is a 3-tuple (smcData, settings, do_metrics).
    # Returns a 1-tuple holding the overall predictive probabilities on the
    # test data.
    #
    # NOTE(review): this block uses Python 2 print statements and relies on
    # integer `/` when splitting particles across islands.
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm that the weight checks belong under `settings.debug == 1`.
    smcData, settings, do_metrics = args
    print '\nInitializing SMC\n'
    # precomputation
    (particles, param, log_weights, cache, cache_tmp) = bdtsmc.init_smc(smcData, settings)
    # Run smc
    print '\nRunning SMC'
    (particles, ess_itr, log_weights_itr, log_pd, particle_stats_itr_d, particles_itr_d, log_pd_islands) = \
        bdtsmc.run_smc(particles, smcData, settings, param, log_weights, cache)
    # Printing some diagnostics
    print
    print 'Estimate of log marginal probability i.e. log p(Y|X) = %s ' % log_pd
    print 'Estimate of log marginal probability for different islands = %s' % log_pd_islands
    print 'logsumexp(log_pd_islands) - np.max(log_pd_islands) = %s\n' % \
        (logsumexp(log_pd_islands) - np.max(log_pd_islands))
    if settings.debug == 1:
        print 'log_weights_itr = \n%s' % log_weights_itr
        # check if log_weights are computed correctly
        for i_, p in enumerate(particles):
            # final-iteration weight of particle i_, rescaled by the
            # particle and island counts
            log_w = log_weights_itr[-1, i_] + np.log(settings.n_particles) - np.log(settings.n_islands)
            logprior_p = p.compute_logprior()
            loglik_p = p.compute_loglik()
            logprob_p = p.compute_logprob()
            if (np.abs(settings.ess_threshold) < 1e-15) and (settings.proposal == 'prior'):
                # for the criterion above, only loglik should contribute to the weight update
                try:
                    check_if_zero(log_w - loglik_p)
                except AssertionError:
                    print 'Incorrect weight computation: log_w (smc) = %s, loglik_p = %s' % (log_w, loglik_p)
                    raise AssertionError
            # log-probability must equal log-likelihood plus log-prior
            try:
                check_if_zero(logprob_p - loglik_p - logprior_p)
            except AssertionError:
                print 'Incorrect weight computation'
                print 'check if 0: %s, logprior_p = %s, loglik_p = %s' % (logprob_p - loglik_p - logprior_p, logprior_p, loglik_p)
                raise AssertionError
    # Evaluate
    print 'Results on training data (log predictive prob is bogus)'
    # log_predictive on training data is bogus ... you are computing something like \int_{\theta} p(data|\theta) p(\theta|data)
    if settings.weight_islands == 1:
        # each island's prediction is weighted by its marginal likelihood estimate which is equivalent to micro-averaging globally
        weights_prediction = softmax(log_weights_itr[-1, :])
        assert('islandv1' in settings.tag)
    else:
        # correction for macro-averaging predictions across islands
        weights_prediction = np.ones(settings.n_particles) / settings.n_islands
        n_particles_tmp = settings.n_particles / settings.n_islands
        for i_ in range(settings.n_islands):
            # inclusive particle-id range owned by island i_
            pid_min, pid_max = i_ * n_particles_tmp, (i_ + 1) * n_particles_tmp - 1
            pid_range_tmp = range(pid_min, pid_max+1)
            weights_prediction[pid_range_tmp] *= softmax(log_weights_itr[-1, pid_range_tmp])
    (pred_prob_overall_train, metrics_train) = \
        evaluate_predictions_smc(particles, smcData, smcData['x_train'], smcData['y_train'], settings, param, weights_prediction, do_metrics)
    print '\nResults on test data'
    (pred_prob_overall_test, metrics_test) = \
        evaluate_predictions_smc(particles, smcData, smcData['x_test'], smcData['y_test'], settings, param, weights_prediction, do_metrics)
    #return pred_prob_overall_test, particles, param, weights_prediction
    return pred_prob_overall_test,
def s82_star_galaxy_classification(model_parms_file, epoch, Nstar, features,
                                   filters, r_pmm, figname, threshold=0.,
                                   Nthreads=4):
    """
    Plot cut-based and model-based classifications of matched s82 data.

    Two label sets are built for the single-epoch objects: a simple cut on
    column `r_pmm` of the single-epoch data (|value| < 0.145 -> label 1), and
    a model label from the log odds of the first `Nstar` mixture components
    against the remaining ones (log odds > `threshold` -> label 1;
    presumably label 1 means "star" -- confirm).  A 2x2 figure is written to
    `figname`: the top row is coloured by the cut label, the bottom row by
    the model label; the left column shows the single-epoch values, the
    right column the coadd values with a horizontal line at 0.03.

    The model is loaded from `model_parms_file` with `load_xd_parms`, and the
    data for `epoch`, `features` and `filters` come from
    `fetch_matched_s82data`.  `Nthreads` is forwarded to
    `log_multivariate_gaussian_Nthreads`.  Nothing is returned.
    """
    # matched catalogues: single epoch first, then the coadd
    single, singlecov = fetch_matched_s82data(epoch, features=features,
                                              filters=filters)
    coadd, coaddecov = fetch_matched_s82data(epoch, features=features,
                                             filters=filters,
                                             use_single=False)
    n_objects = single.shape[0]

    # label from the cut on the single-epoch data
    single_class = np.zeros(n_objects)
    single_class[np.abs(single[:, r_pmm]) < 0.145] = 1.

    # label from the mixture model: first Nstar components vs. the rest
    alpha, mu, V, _, _ = load_xd_parms(model_parms_file)
    logls = log_multivariate_gaussian_Nthreads(single, mu, V, singlecov,
                                               Nthreads)
    logls += np.log(alpha)
    logodds = (logsumexp(logls[:, :Nstar], axis=1) -
               logsumexp(logls[:, Nstar:], axis=1))
    model_class = np.zeros(n_objects)
    model_class[logodds > threshold] = 1.

    # label from the tighter cut on the coadd (computed but not plotted)
    coadd_class = np.zeros(n_objects)
    coadd_class[np.abs(coadd[:, r_pmm]) < 0.03] = 1.

    fs = 10
    f = pl.figure(figsize=(2 * fs, 2 * fs))

    label_colours = ((0, '#ff6633'), (1, '#3b5998'))
    # (subplot position, catalogue to draw, labels to colour by, draw cut line)
    panels = (
        (221, single, single_class, False),
        (222, coadd, single_class, True),
        (223, single, model_class, False),
        (224, coadd, model_class, True),
    )
    for position, catalogue, labels, draw_cut_line in panels:
        pl.subplot(position)
        for label, colour in label_colours:
            pl.plot(catalogue[labels == label, 0],
                    catalogue[labels == label, r_pmm],
                    '.', color=colour, alpha=0.2)
        if draw_cut_line:
            pl.plot([17.5, 22.], [0.03, 0.03], 'k')
        pl.ylim(-0.2, 0.5)

    f.savefig(figname, bbox_inches='tight')
def hmmFit(self, obs, maxiter=50, epsilon=0.0001, debug=False):
    """Fit the standard HMM to the given data using the (adapted Baum-Welch)
    EM algorithm.

    Parameters
    ----------
    obs : list
        The list of observation sequences where every sequence is a ndarray
        whose second axis indexes time (``obsseq.shape[1]`` is used as the
        sequence length). The sequences can be of different length, but the
        dimension of the features needs to be identical.
    maxiter : int, optional
        The maximum number of iterations of the EM algorithm. Default = 50.
    epsilon : float, optional
        Convergence threshold on the change of the length-normalized
        log-likelihood between successive iterations. Once the absolute
        change drops below 'epsilon' the algorithm is said to have
        converged. Default = 1e-4.
    debug : bool, optional
        Display verbose On/off (sets the module logger to DEBUG).

    Returns
    -------
    tuple
        ``(loglikelihood, ll, duration, rankn, res)`` where

        * ``loglikelihood`` is the sum over sequences of the per-sequence
          log-likelihood divided by the sequence length, taken from the last
          E-step (``-inf`` if ``maxiter`` is 0);
        * ``ll`` is the list of those values for every iteration, useful to
          check for monotonicity;
        * ``duration`` and ``res`` are lists with, per sequence, the first and
          second value returned by ``self.estimatepostduration``;
        * ``rankn`` is the list with, per sequence, the value returned by
          ``self.rankn``.

    Notes
    -----
    ``self.logA[:-1]`` receives the re-estimated log transition matrix and
    ``self.logA[-1]`` the re-estimated log initial-state distribution.
    """
    if debug:
        logger.setLevel(logging.DEBUG)

    logger.debug('Running the HMM EM algorithm..')

    numseq = len(obs)
    is_discrete = self.O.__class__.__name__ == 'Discrete'

    ll = []
    # Defined up-front so that maxiter == 0 returns instead of failing on an
    # unbound local.
    loglikelihood = -np.inf
    lastavgloglikelihood = -np.inf
    converged = False

    logksilist = [None] * numseq
    logGammalist = [None] * numseq
    logAlphalist = [None] * numseq
    logBetalist = [None] * numseq
    llhlist = [None] * numseq
    obsmatrix = [None] * numseq
    logB = [None] * numseq
    duration = [None] * numseq
    res = [None] * numseq
    rankn = [None] * numseq
    N = [None] * numseq

    for iteration in range(maxiter):
        start_time = time.time()
        logger.debug('-------------------------------------------')
        logger.debug('iter: %d' % iteration)
        logger.debug('E step..')

        for seq, obsseq in enumerate(obs):
            N[seq] = obsseq.shape[1]
            if is_discrete:
                # One-hot encode the symbols of THIS sequence. The previous
                # code passed the whole list N instead of N[seq], which made
                # the Discrete branch raise.
                obsmatrix[seq] = np.zeros((len(self.O.c), N[seq]))
                obsmatrix[seq][np.int_(obsseq), np.arange(N[seq])] = 1

            # E-step
            # calculate the posterior probability for each sequence
            logB[seq] = self.O.loglikelihood(obsseq)
            (llhlist[seq], logGammalist[seq], logksilist[seq],
             logAlphalist[seq], logBetalist[seq]) = self.gammaKsi(logB[seq])

        normalizellhlist = np.divide(llhlist, N)
        loglikelihood = np.sum(normalizellhlist)
        ll.append(loglikelihood)
        logger.debug('LLH: %0.10f' % (loglikelihood))

        if abs(loglikelihood - lastavgloglikelihood) < epsilon:
            converged = True
            break
        lastavgloglikelihood = loglikelihood

        if is_discrete:
            obsarray = np.concatenate(obsmatrix, 1)
        else:
            obsarray = np.concatenate(obs, 1)

        # M-step
        logger.debug('M step..')
        # initial-state distribution: average of the first-step posteriors
        self.logA[-1] = (logsumexp(np.array([g[:, 0] for g in logGammalist]),
                                   axis=0)
                         - np.log(np.double(numseq)))
        logKsiarray = np.concatenate(logksilist, axis=0)
        # A real list (not a lazy map object) so np.concatenate also accepts
        # it on Python 3.
        logGammasArray = np.concatenate([g[:, :-1] for g in logGammalist],
                                        axis=1)
        self.logA[:-1] = (logsumexp(logKsiarray, axis=0)
                          - logsumexp(logGammasArray, axis=1)[:, np.newaxis])
        self.O.fit(obsarray, logGammalist)

        end_time = time.time()
        logger.debug('Time to run iter : %.5f s' % (end_time - start_time))

    if ll:
        # Post-processing runs once, after the loop ended (either by
        # convergence or by exhausting maxiter), using the quantities of the
        # last E-step.
        for seq in range(numseq):
            rankn[seq] = self.rankn(logksilist[seq])
            duration[seq], res[seq] = self.estimatepostduration(
                logAlphalist[seq], logBetalist[seq], logB[seq], rankn[seq],
                logGammalist[seq], llhlist[seq])
        if converged:
            logger.info("Convergence after %d iterations" % iteration)
        else:
            logger.info('No convergence after %d iterations' % (iteration + 1))

    return (loglikelihood, ll, duration, rankn, res)
def fit(self, obs, logweights):
    """Re-estimate ``self.p`` from posterior-weighted observations.

    Parameters
    ----------
    obs : ndarray
        Observation matrix whose second axis lines up with the concatenated
        columns of `logweights`.
        NOTE(review): presumably a one-hot (n_symbols, n_steps) indicator
        matrix -- confirm against the caller.
    logweights : sequence of ndarray
        Per-sequence log weights; they are concatenated along axis 1 and row
        ``k`` is used for state ``k`` (``k`` in ``range(self.K)``).

    Notes
    -----
    Each row of the weights is normalized to sum to one, then ``self.p[k]``
    is set to the weighted row sums of `obs` divided by the total weight.
    The ratio is taken directly instead of via ``exp(log(a) - log(b))``, so a
    zero weighted count no longer evaluates ``log(0)`` (which emitted
    divide-by-zero warnings, or raised under ``np.errstate(divide='raise')``).
    """
    logGamma = np.concatenate(logweights, 1)
    normalizer = np.exp(logGamma - logsumexp(logGamma, axis=1)[:, np.newaxis])
    for k in range(self.K):
        state_weights = normalizer[k, :]
        self.p[k] = np.sum(state_weights * obs, 1) / np.sum(state_weights)