def obj(w):
    p = unpack(w)
    p.update(fixed_params)
    f = 0.0

    # ln_p_a, ln_p_mix = self.class_prior()
    ln_p_a = np.log(self.action(p))  # individual- and time-invariant
    logits_mix = p[self.mixture_param_key]
    ln_p_mix = logits_mix - logsumexp(logits_mix)

    for y, x in samples:
        # Outcome model
        mixture = log_likelihood(p, y, x, self.mean, self.cov, self.tr, ln_p_a, ln_p_mix)
        f -= logsumexp(np.array(mixture))

        # Action model
        _, rx = x
        f -= action_log_likelihood(rx, ln_p_a, self.tr_cont_flag)

    # Regularizers
    for k, _ in trainable_params.items():
        if k.endswith('_F'):
            f += np.sum(p[k]**2)

    return f
def sinkhorn_logspace(logP, niters=10):
    for _ in range(niters):
        # Normalize columns and take the log again
        logP = logP - logsumexp(logP, axis=0, keepdims=True)
        # Normalize rows and take the log again
        logP = logP - logsumexp(logP, axis=1, keepdims=True)
    return logP
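# A minimal usage sketch for sinkhorn_logspace above (not part of the original
# snippet; assumes numpy and a scipy-style logsumexp, as used throughout these
# snippets): after enough iterations the exponentiated matrix should be
# approximately doubly stochastic.
import numpy as np
from scipy.special import logsumexp

rng = np.random.default_rng(0)
logP0 = rng.normal(size=(5, 5))                  # arbitrary log-potentials
P = np.exp(sinkhorn_logspace(logP0, niters=50))
print(P.sum(axis=1))                             # rows sum to 1 (last normalization)
print(P.sum(axis=0))                             # columns sum to ~1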
def hmm_expected_states(log_pi0, log_Ps, ll):
    T, K = ll.shape

    # Make sure everything is C contiguous
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])

    betas = np.zeros((T, K))
    backward_pass(log_Ps, ll, betas)

    expected_states = alphas + betas
    expected_states -= logsumexp(expected_states, axis=1, keepdims=True)
    expected_states = np.exp(expected_states)

    expected_joints = alphas[:-1, :, None] + betas[1:, None, :] + ll[1:, None, :] + log_Ps
    expected_joints -= expected_joints.max((1, 2))[:, None, None]
    expected_joints = np.exp(expected_joints)
    expected_joints /= expected_joints.sum((1, 2))[:, None, None]

    return expected_states, expected_joints, normalizer
def hmm_expected_states(log_pi0, log_Ps, ll):
    T, K = ll.shape

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'] else arr
    log_pi0 = to_c(getval(log_pi0))
    log_Ps = to_c(getval(log_Ps))
    ll = to_c(getval(ll))

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])

    betas = np.zeros((T, K))
    backward_pass(log_Ps, ll, betas)

    expected_states = alphas + betas
    expected_states -= logsumexp(expected_states, axis=1, keepdims=True)
    expected_states = np.exp(expected_states)

    expected_joints = alphas[:-1, :, None] + betas[1:, None, :] + ll[1:, None, :] + log_Ps
    expected_joints -= expected_joints.max((1, 2))[:, None, None]
    expected_joints = np.exp(expected_joints)
    expected_joints /= expected_joints.sum((1, 2))[:, None, None]

    return expected_states, expected_joints, normalizer
def location_mixture_logpdf(samps, locations, location_weights, distr_at_origin,
                            contr_var=False, variant=1):
    # lpdfs = zeroprop.logpdf()
    diff = samps - locations[:, np.newaxis, :]
    lpdfs = distr_at_origin.logpdf(
        diff.reshape([np.prod(diff.shape[:2]), diff.shape[-1]])
    ).reshape(diff.shape[:2])
    logprop_weights = log(location_weights / location_weights.sum())[:, np.newaxis]
    if not contr_var:
        return logsumexp(lpdfs + logprop_weights, 0)
        # time_m1 = np.hstack([time0[:,:-1], time0[:,-1:]])
    else:
        time0 = lpdfs + logprop_weights + log(len(location_weights))
        if variant == 1:
            time1 = np.hstack([time0[:, 1:], time0[:, :1]])
            cov = np.mean(time0**2 - time0 * time1)
            var = np.mean((time0 - time1)**2)
            lpdfs = lpdfs - cov / var * (time0 - time1)
            return logsumexp(lpdfs - log(len(location_weights)), 0)
        elif variant == 2:
            cvar = (time0[:, :, np.newaxis]
                    - np.dstack([np.hstack([time0[:, 1:], time0[:, :1]]),
                                 np.hstack([time0[:, -1:], time0[:, :-1]])]))
            ## self-covariance matrix of control variates
            K_cvar = np.diag(np.mean(cvar**2, (0, 1)))
            # add off-diagonal
            K_cvar = K_cvar + (1. - np.eye(2)) * np.mean(cvar[:, :, 0] * cvar[:, :, 1])
            ## covariance of control variates with random variable
            cov = np.mean(time0[:, :, np.newaxis] * cvar, 0).mean(0)
            optimal_comb = np.linalg.inv(K_cvar) @ cov
            lpdfs = lpdfs - cvar @ optimal_comb
            return logsumexp(lpdfs - log(len(location_weights)), 0)
def hmm_logZ_python(natparam):
    init_params, pair_params, node_params = natparam

    log_alpha = init_params + node_params[0]
    for node_param in node_params[1:]:
        log_alpha = logsumexp(log_alpha[:, None] + pair_params, axis=0) + node_param

    return logsumexp(log_alpha)
def single_episode_log_partition_function(episode):
    log_p_state = log_p_init
    for action, rendering in episode:
        log_p_state = (logsumexp(log_p_state[:, None] + log_p_dynamics[action], axis=0)
                       + log_p_render[:, rendering])
    return logsumexp(log_p_state)
def hmm_expected_states(log_pi0, log_Ps, ll, memlimit=2**31):
    T, K = ll.shape

    # Make sure everything is C contiguous
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])

    betas = np.zeros((T, K))
    backward_pass(log_Ps, ll, betas)

    # Compute E[z_t] for t = 1, ..., T
    expected_states = alphas + betas
    expected_states -= logsumexp(expected_states, axis=1, keepdims=True)
    expected_states = np.exp(expected_states)

    # Compute E[z_t, z_{t+1}] for t = 1, ..., T-1
    # Note that this is an array of size T*K*K, which can be quite large.
    # To be a bit more frugal with memory, first check if the given log_Ps
    # are TxKxK.  If so, instantiate the full expected joints as well, since
    # we will need them for the M-step.  However, if log_Ps is 1xKxK then we
    # know that the transition matrix is stationary, and all we need for the
    # M-step is the sum of the expected joints.
    stationary = (log_Ps.shape[0] == 1)
    if not stationary:
        expected_joints = alphas[:-1, :, None] + betas[1:, None, :] + ll[1:, None, :] + log_Ps
        expected_joints -= expected_joints.max((1, 2))[:, None, None]
        expected_joints = np.exp(expected_joints)
        expected_joints /= expected_joints.sum((1, 2))[:, None, None]
    else:
        # Compute the sum over the time axis of the expected joints.
        # Limit ourselves to approximately 1GB of memory, assuming
        # the entries are float64's (8 bytes).
        batch_size = int(memlimit / (8 * K * K))
        assert batch_size > 0
        expected_joints = np.zeros((1, K, K))
        for start in range(0, T - 1, batch_size):
            stop = min(T - 1, start + batch_size)

            # Compute expectations in this batch
            tmp = (alphas[start:stop, :, None] + betas[start + 1:stop + 1, None, :]
                   + ll[start + 1:stop + 1, None, :] + log_Ps)
            tmp -= tmp.max((1, 2))[:, None, None]
            tmp = np.exp(tmp)
            tmp /= tmp.sum((1, 2))[:, None, None]
            expected_joints += tmp.sum(axis=0)

    return expected_states, expected_joints, normalizer
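# Reference sketch (an assumption about this codebase, not the actual compiled
# forward_pass / backward_pass helpers called above): pure-NumPy log-space HMM
# messages for a transition tensor log_Ps of shape (T-1, K, K) or (1, K, K),
# filling the preallocated alphas / betas arrays in place.
import numpy as np
from scipy.special import logsumexp

def forward_pass_ref(log_pi0, log_Ps, ll, alphas):
    T, K = ll.shape
    alphas[0] = log_pi0 + ll[0]
    for t in range(T - 1):
        P = log_Ps[t] if log_Ps.shape[0] > 1 else log_Ps[0]
        alphas[t + 1] = logsumexp(alphas[t][:, None] + P, axis=0) + ll[t + 1]
    return logsumexp(alphas[-1])

def backward_pass_ref(log_Ps, ll, betas):
    T, K = ll.shape
    betas[T - 1] = 0.0
    for t in range(T - 2, -1, -1):
        P = log_Ps[t] if log_Ps.shape[0] > 1 else log_Ps[0]
        betas[t] = logsumexp(P + ll[t + 1] + betas[t + 1], axis=1)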
def log_partition_function(natural_params, data):
    if isinstance(data, list):
        return sum(map(partial(log_partition_function, natural_params), data))

    log_pi, log_A, log_B = natural_params

    log_alpha = log_pi
    for y_t in data:
        log_alpha = logsumexp(log_alpha[:, None] + log_A, axis=0) + log_B[:, y_t]

    return logsumexp(log_alpha)
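# Quick sanity check for log_partition_function (hypothetical sizes): with
# uniform initial, transition, and emission distributions, the log partition
# function of a length-T observation sequence is T * log(1/V), where V is the
# number of emission symbols.  Note data is passed as an array, not a list,
# so the list branch (summing over multiple sequences) is not taken.
import numpy as np

K, V, T = 3, 4, 5
natural_params = (np.full(K, np.log(1.0 / K)),
                  np.full((K, K), np.log(1.0 / K)),
                  np.full((K, V), np.log(1.0 / V)))
data = np.array([0, 2, 1, 3, 0])
print(log_partition_function(natural_params, data))   # ~ 5 * log(1/4) = -6.93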
def energy(w, X, y, v_prior, m_prior, K, N, alpha):
    """Extract parameters"""
    q = get_parameters_q(w, v_prior)
    v_noise = np.exp(parser.get(w, 'log_v_noise')[0, 0])

    """Note: A-approx computes its own log_factor value inside the helper
    function log_Z_likelihood, so we can shave off some computation time"""
    if Dtype != "A-approx":
        samples_q = draw_samples(q, K)
        log_factor_value = 1.0 * N * log_likelihood_factor(samples_q, v_noise, X, y)

    if Dtype == "KL":
        """I.e., standard VI"""
        KL = np.sum(-0.5 * np.log(2 * math.pi * v_prior)
                    - 0.5 * ((q['m'] - m_prior)**2 + q['v']) / v_prior) - \
             np.sum(-0.5 * np.log(2 * math.pi * q['v'] * np.exp(1)))
        vfe = -(np.mean(log_factor_value) + KL)

    elif Dtype == "AR-approx":
        """NOTE: Needs modification to be GVI"""
        logp0 = log_prior(samples_q, v_prior, m_prior)
        logq = log_q(samples_q, q)
        logF = logp0 + log_factor_value - logq
        logF = (1 - alpha) * logF
        vfe = -(logsumexp(logF) - np.log(K))
        vfe = vfe / (1 - alpha)

    elif Dtype == "AB-approx":
        logp0 = log_prior(samples_q, v_prior, m_prior)
        logq = log_q(samples_q, q)
        part1 = (alpha + beta_D) * (log_factor_value + logp0) - logq
        part2 = (alpha + beta_D - 1) * logq
        part3 = (beta_D * (log_factor_value + logp0) + (alpha - 1) * logq)
        vfe = ((1.0 / (alpha * (alpha + beta_D))) * (logsumexp(part1) - np.log(K))
               + (1.0 / (beta_D * (alpha + beta_D))) * (logsumexp(part2) - np.log(K))
               - (1.0 / (alpha * beta_D)) * (logsumexp(part3) - np.log(K)))

    elif Dtype == "A-approx":
        f_hat = get_parameters_f_hat(q, v_prior, m_prior, N)
        vfe = (-log_normalizer(q)
               - 1.0 * N / X.shape[0] / alpha * log_Z_likelihood(q, f_hat, v_noise, X, y, K)
               + log_Z_prior(v_prior, m_prior))

    elif Dtype == "AR":
        prior_reg = (1 / (alpha * (alpha - 1))) * prior_regularizer(q, v_prior, m_prior, alpha)
        vfe = -np.mean(log_factor_value) + prior_reg

    # NOTE: While this should work, this is the alpha-divergence regularizer, which
    # overconcentrates substantially.  We refer to the appendix of our paper for some
    # visuals on this phenomenon.  The performance from this divergence should be
    # expected to be much worse than that for the Alpha-Renyi as Uncertainty Quantifier.
    elif Dtype == "A":
        prior_reg = (1 / (alpha * (alpha - 1))) * (
            np.exp(prior_regularizer(q, v_prior, m_prior, alpha)) - 1)
        vfe = -np.mean(log_factor_value) + prior_reg

    return vfe
def single_update_belief_log_probas(prev_belief_log_proba_K, curr_data_log_proba_K, ltrans, a):
    trans_log_proba_KK = ltrans[:, int(a), :]
    # Propagate the belief through the transition model, summing over the previous state
    curr_belief_log_proba_K = logsumexp(
        trans_log_proba_KK + prev_belief_log_proba_K[:, np.newaxis], axis=0)
    curr_belief_log_proba_K = curr_belief_log_proba_K + curr_data_log_proba_K
    log_norm_const = logsumexp(curr_belief_log_proba_K)
    curr_belief_log_proba_K = curr_belief_log_proba_K - log_norm_const
    return curr_belief_log_proba_K
def predict_half(self, X_top):
    """Plot the top half of the image concatenated with the marginal
    distribution over each pixel in the bottom half."""
    X_bot = np.zeros((X_top.shape[0], X_top.shape[1]))
    theta_top, theta_bot = self.theta[:, :392].T, self.theta[:, 392:].T
    for i in range(392):
        constant = np.dot(X_top, np.log(theta_top)) + np.dot(1 - X_top, np.log(1 - theta_top))
        X_bot[:, i] = (logsumexp(np.add(constant, np.log(theta_bot[i])), axis=1)
                       - logsumexp(constant, axis=1))
    save_images(np.concatenate((X_top, np.exp(X_bot)), axis=1), "predict_half.png")
def log_partition_function(natural_params, data):
    if isinstance(data, list):
        return sum(map(partial(log_partition_function, natural_params), data))

    log_pi, log_A, log_B = natural_params

    log_alpha = log_pi
    for y_t in data:
        log_alpha = logsumexp(log_alpha[:, None] + log_A, axis=0) + log_B[:, y_t]

    return logsumexp(log_alpha)
def build_pomdp(pi, trans, emission_mu, emission_std, data, fcpt, args):
    lpi = pi - logsumexp(pi, axis=0)
    ltrans = trans - logsumexp(trans, axis=-1, keepdims=True)

    ll = 0
    lbelief_state_set_TK = None

    # Collect the complete set of beliefs over all sequences
    ll, lbelief_state_set_TK = calc_log_proba_for_many_sequences(
        lpi, ltrans, emission_mu, emission_std, data, fcpt, args)

    return lbelief_state_set_TK, ll
def label_meanfield(label_global, gaussian_globals, gaussian_stats):
    partial_contract = lambda a, b: \
        sum(np.tensordot(x, y, axes=np.ndim(y)) for x, y in zip(a, b))

    gaussian_local_natparams = map(niw.expectedstats, gaussian_globals)
    node_params = np.array([
        partial_contract(gaussian_stats, natparam)
        for natparam in gaussian_local_natparams]).T

    local_natparam = dirichlet.expectedstats(label_global) + node_params
    stats = normalize(np.exp(local_natparam - logsumexp(local_natparam, axis=1, keepdims=True)))
    vlb = np.sum(logsumexp(local_natparam, axis=1)) - contract(stats, node_params)

    return local_natparam, stats, vlb
def logpdf(self, x):
    comp_logpdf = np.array([self.dist_cat.logpdf(i) + self.comp_dist[i].logpdf(x)
                            for i in range(len(self.comp_dist))])
    rval = logsumexp(comp_logpdf, 0)
    if len(comp_logpdf.shape) > 1:
        rval = rval.reshape((rval.size, 1))
    return rval
def logpdf_grad(self, x):
    rval = np.array([exp(self.dist_cat.logpdf(i)) * self.comp_dist[i].logpdf_grad(x)
                     for i in range(len(self.comp_dist))])
    rval = logsumexp(rval, 0)
    return rval
def categorical_logpdf(data, logits, mask=None):
    """
    Compute the log probability density of a categorical distribution.
    This will broadcast as long as data and logits have the same
    (or at least compatible) leading dimensions.

    Parameters
    ----------
    data : array_like (..., D) int (0 <= data < C)
        The points at which to evaluate the log density

    logits : array_like (..., D, C)
        The logits of the categorical distribution(s) with C classes

    mask : array_like (..., D) bool
        Optional mask indicating which entries in the data are observed

    Returns
    -------
    lps : array_like (...,)
        Log probabilities under the categorical distribution(s).
    """
    D = data.shape[-1]
    C = logits.shape[-1]
    assert data.dtype in (int, np.int8, np.int16, np.int32, np.int64)
    assert np.all((data >= 0) & (data < C))
    assert logits.shape[-2] == D

    # Check mask
    mask = mask if mask is not None else np.ones_like(data, dtype=bool)
    assert mask.shape == data.shape

    logits = logits - logsumexp(logits, axis=-1, keepdims=True)  # (..., D, C)
    x = one_hot(data, C)                                         # (..., D, C)
    lls = np.sum(x * logits, axis=-1)                            # (..., D)
    return np.sum(lls * mask, axis=-1)                           # (...,)
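# Small usage sketch for categorical_logpdf with hypothetical shapes.  It
# assumes the module-level one_hot helper behaves like the stand-in below
# (rows of an identity matrix indexed by the data).
import numpy as np

def one_hot(z, K):
    return np.eye(K)[z]

data = np.array([[0, 2, 1]])             # (..., D) with D = 3
logits = np.zeros((1, 3, 4))             # (..., D, C) with C = 4: uniform classes
print(categorical_logpdf(data, logits))  # ~ 3 * log(1/4) = -4.159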
def logloss(K_conj):
    """
    K is a tensor of CONJUGATE Kraus operators of dim s x y x x x x
        s: dim of features
        y: number of features
        x: number of labels
    """
    total_loss = 0.0

    # Iterate over each sequence in the batch
    for i in range(labels.shape[0]):
        features = feats_matrix[i, :]
        label = labels[i] - 1

        # Compute the likelihood of the label generating the given features
        conjKrausProduct = np.log(K_conj[features[0] - 1, 0, :, :])
        for s in range(1, features.shape[0]):
            conjKrausProduct = logdotexp(
                np.log(K_conj[features[s] - 1, s, :, :]), conjKrausProduct)

        eta = np.zeros([K_conj.shape[3], K_conj.shape[3]], dtype='complex128')
        eta[label, label] = 1

        prod1 = logdotexp(np.conjugate(conjKrausProduct), np.log(eta))
        prod2 = logdotexp(prod1, conjKrausProduct.T)
        total_loss += np.real(logsumexp(np.diag(prod2)))
        # total_loss += np.real(np.trace(np.kron(np.conjugate(conjKrausProduct)[:, label],
        #     conjKrausProduct.T[:, label]).reshape(K_conj.shape[2], K_conj.shape[3])))

    return -total_loss / labels.shape[0]
def avg_log_likelihood(self, X, y, theta):
    ll = 0
    for c in range(10):
        X_c = get_images_by_label(X, y, c)
        log_p_x = logsumexp(np.log(0.1) + np.dot(X_c, np.log(theta.T))
                            + np.dot((1. - X_c), np.log(1. - theta.T)), axis=1)
        ll += np.sum(np.dot(X_c, np.log(theta[c])) + np.dot((1. - X_c), np.log(1. - theta[c]))
                     + np.log(0.1) - log_p_x)
    return ll / X.shape[0]
def _cost_with_vis(self, inputs, targets, hprev, weights, disable_tqdm=True, epoch=None):
    if epoch is not None:
        f = open('values' + str(epoch) + '.txt', "w+")
    W_hh, W_xh, b_h, W_hy, b_y = weights
    h = np.copy(hprev)
    loss = 0
    for t in tqdm(range(len(inputs)), disable=disable_tqdm):
        x = char_to_one_hot(inputs[t])
        h = np.tanh(W_hh @ h + W_xh @ x + b_h)
        if epoch is not None:
            f.write(','.join(h.astype(str)) + '\n')
        y = W_hy @ h + b_y
        target_index = char_to_index[targets[t]]
        # ps_target[t] = np.exp(ys[t][target_index]) / np.sum(np.exp(ys[t]))  # probability for next chars being target
        # loss += -np.log(ps_target[t])
        loss += -(y[target_index] - logsumexp(y))
    if epoch is not None:
        f.close()
    loss = loss / len(inputs)
    return loss
def nn_predict_GCN(params, x):
    # x: NSAMPLES x NFEATURES
    U = hyper['U']
    xf = np.matmul(x, U)
    xf = np.expand_dims(xf, 1)   # NSAMPLES x 1 x NFEATURES
    xf = np.transpose(xf)        # NFEATURES x 1 x NSAMPLES

    # Filter
    yf = np.matmul(params['W1'], xf)  # for each feature
    yf = np.transpose(yf)             # NSAMPLES x NFILTERS x NFEATURES
    yf = np.reshape(yf, [-1, hyper['NFEATURES']])

    # Transform back to graph domain
    Ut = np.transpose(U)
    y = np.matmul(yf, Ut)
    y = np.reshape(y, [-1, hyper['F'], hyper['NFEATURES']])
    y += params['b1']                 # NSAMPLES x NFILTERS x NFEATURES

    # nonlinear layer
    y = ReLU(y)
    # y = np.tanh(y)

    # dense layer
    y = np.reshape(y, [-1, hyper['F'] * hyper['NFEATURES']])
    y = np.matmul(y, params['W2']) + params['b2']

    outputs = y
    return outputs - logsumexp(outputs, axis=1, keepdims=True)
def _cost(self, inputs, targets, hprev, Cprev, weights, disable_tqdm=True):
    W_1, b_1, W_f, b_f, W_i, b_i, W_c, b_c, W_o, b_o, W_2, b_2 = weights
    h = np.copy(hprev)
    C = np.copy(Cprev)
    loss = 0
    for t in tqdm(range(len(inputs)), disable=disable_tqdm):
        x = char_to_one_hot(inputs[t])
        x = np.matmul(W_1, x) + b_1
        f = sigmoid(np.matmul(W_f, np.concatenate((h, x))) + b_f)
        i = sigmoid(np.matmul(W_i, np.concatenate((h, x))) + b_i)
        C_hat = np.tanh(np.matmul(W_c, np.concatenate((h, x))) + b_c)
        C = f * C + i * C_hat
        o = sigmoid(np.matmul(W_o, np.concatenate((h, x))) + b_o)
        h = o * np.tanh(C)
        y = np.matmul(W_2, h) + b_2
        target_index = char_to_index[targets[t]]
        # ps_target[t] = np.exp(ys[t][target_index]) / np.sum(np.exp(ys[t]))  # probability for next chars being target
        # loss += -np.log(ps_target[t])
        loss += -(y[target_index] - logsumexp(y))
    loss = loss / len(inputs)
    return loss
def _cost_batched(self, inputs, targets, hprev, Cprev, weights, disable_tqdm=True):
    W_1, b_1, W_f, b_f, W_i, b_i, W_c, b_c, W_o, b_o, W_2, b_2 = weights
    h = np.copy(hprev)
    C = np.copy(Cprev)
    h = h.reshape((self.batch_size, self.h_size, 1))
    C = C.reshape((self.batch_size, self.h_size, 1))
    loss = 0

    # W_sth_dropout = get_dropout_function((self.h_size, self.h_size + self.x_size), self.keep_prob)
    # b_sth_dropout = get_dropout_function((self.h_size,), self.keep_prob)
    # W_dropout = get_dropout_function((self.y_size, self.h_size), self.keep_prob)
    # b_dropout = get_dropout_function((self.y_size,), self.keep_prob)
    cell_dropout = get_dropout_function((self.batch_size, self.h_size, 1), self.keep_prob)
    y_dropout = get_dropout_function((self.batch_size, self.y_size, 1), self.keep_prob)

    for t in tqdm(range(len(inputs)), disable=disable_tqdm):
        x = np.array([char_to_one_hot(c) for c in inputs[:, t]])
        x = x.reshape((self.batch_size, -1, 1))
        x = np.matmul(W_1, x) + np.reshape(b_1, (-1, 1))
        x = cell_dropout(x)
        f = sigmoid(np.matmul(W_f, np.concatenate((h, x), axis=1)) + np.reshape(b_f, (-1, 1)))
        f = cell_dropout(f)
        i = sigmoid(np.matmul(W_i, np.concatenate((h, x), axis=1)) + np.reshape(b_i, (-1, 1)))
        i = cell_dropout(i)
        C_hat = np.tanh(np.matmul(W_c, np.concatenate((h, x), axis=1)) + np.reshape(b_c, (-1, 1)))
        C_hat = cell_dropout(C_hat)
        C = f * C + i * C_hat
        C = cell_dropout(C)
        o = sigmoid(np.matmul(W_o, np.concatenate((h, x), axis=1)) + np.reshape(b_o, (-1, 1)))
        o = cell_dropout(o)
        h = o * np.tanh(C)
        h = cell_dropout(h)
        ys = np.matmul(W_2, h) + np.reshape(b_2, (-1, 1))
        ys = y_dropout(ys)
        target_indices = np.array([char_to_index[c] for c in targets[:, t]])
        # ps_target[t] = np.exp(ys[t][target_index]) / np.sum(np.exp(ys[t]))  # probability for next chars being target
        # loss += -np.log(ps_target[t])
        loss += np.sum([
            -(y[target_index] - logsumexp(y))
            for y, target_index in zip(ys, target_indices)
        ]) / (self.number_of_steps * self.batch_size)
    return loss
def _backprop_single(self, params, x, num_samples=1, alpha=1.0):
    """
    Efficient training by computing k forward passes and only 1 backward pass
    (by sampling particles according to the weights).
    For VI all the weights are equal.
    """
    # compute weights
    logF = self._comp_log_weights(params, x, num_samples)
    batchsize = x.shape[1]
    lowerbound = 0.0
    logFa = (1 - alpha) * logF
    for i in xrange(batchsize):
        indl = int(i * num_samples)
        indr = int((i + 1) * num_samples)
        log_weights = logFa[indl:indr] - logsumexp(logFa[indl:indr])
        prob = list(np.exp(log_weights))
        # current autograd doesn't support np.random.choice!
        sample_uniform = np.random.random()
        for j in xrange(num_samples):
            sample_uniform = sample_uniform - prob[j]
            if sample_uniform <= 0.0:
                break
        ind_current = indl + j
        lowerbound = lowerbound + logF[ind_current]
    return lowerbound
def c_given_x(x):
    p = np.ndarray(shape=(x.shape[0], 10))
    for c in range(10):
        p[:, c] = np.log(theta[c]**x * (1 - theta[c])**(1 - x)).sum(axis=1)
    p = p - logsumexp(p, axis=1, keepdims=True)
    p = np.exp(p)
    return p
def log_transition_matrices(self, data, input, mask, tag):
    T, D = data.shape
    log_Ps = np.dot(input[1:], self.Ws.T)[:, None, :]           # inputs
    log_Ps = log_Ps + np.dot(data[:-1], self.Rs.T)[:, None, :]  # past observations
    log_Ps = log_Ps + self.r                                    # bias
    log_Ps = np.tile(log_Ps, (1, self.K, 1))                    # expand
    return log_Ps - logsumexp(log_Ps, axis=2, keepdims=True)    # normalize
def grad_pred_ll(self, X, W, c):
    """Calculate the gradient of the predictive log-likelihood.

    Returns a 10 * 784 vector.
    """
    constant = np.exp(logsumexp(np.dot(X, W.T), axis=1))
    return np.sum(X - (X.T * np.divide(np.exp(np.dot(X, W[c])), constant)).T, axis=0)
def calc_log_proba_for_one_seq(x_n_TD, a_n_T, lpi, ltrans, emission_mu, emission_std):
    n_timesteps = x_n_TD.shape[0]
    n_states = lpi.shape[0]
    belief_log_proba_TK = np.zeros((n_timesteps, n_states))

    # Compute log proba array
    x_n_log_proba_TK = calc_log_proba_arr_for_x(x_n_TD, emission_mu, emission_std,
                                                n_states, a_n_T)
    x_n_log_proba_TK = x_n_log_proba_TK.flatten()

    # Initialise the forward belief vector at t = 0
    curr_belief_log_proba_K = lpi + x_n_log_proba_TK[0]
    curr_x_log_proba = logsumexp(curr_belief_log_proba_K)
    curr_belief_log_proba_K = curr_belief_log_proba_K - curr_x_log_proba
    belief_log_proba_TK[0, :] = curr_belief_log_proba_K
    log_proba_x = curr_x_log_proba

    for t in range(1, n_timesteps):
        # Update the beliefs over time
        curr_belief_log_proba_K, curr_x_log_proba = update_belief_log_probas(
            curr_belief_log_proba_K, x_n_log_proba_TK[t], ltrans, a_n_T[t])
        belief_log_proba_TK[t, :] = curr_belief_log_proba_K
        log_proba_x += curr_x_log_proba

    return log_proba_x, belief_log_proba_TK
def class_prior(self):
    ln_p_a = np.log(self.action(self.params))  # individual- and time-invariant
    logits_mix = self.params[self.mixture_param_key]
    ln_p_mix = logits_mix - logsumexp(logits_mix)
    return ln_p_a, ln_p_mix
def lower_bound_MoG(theta, s2min=1e-7, return_dmu=False, n=0, return_ds2=False):
    """
    Lower bound on the entropy of a mixture of Gaussians.

    INPUT:
        theta      --- all MoG parameters in the [mu; lns2] format
        s2min      --- minimum variance
        return_dmu --- returns gradient with respect to mu_n, for the input n
        n          --- see above
        return_ds2 --- returns grad with respect to all s2 params
    """
    # unpack num components and dimensionality
    N, Dpp = theta.shape
    D = Dpp - 1

    # unpack mean and variance parameters
    mu = theta[:, :D]
    s2 = np.exp(theta[:, -1]) + s2min

    # compute lower bound to entropy, Eq (7) --- we compute the
    # Normal probability N(mu_n | mu_j, s2_n + s2_j)
    S = sq_dist(mu)
    s = s2[:, None] + s2[None, :]
    lnP = (-.5 * S / s) - .5 * D * np.log(2 * np.pi) - .5 * D * np.log(s)
    lnqn = scpm.logsumexp(lnP, 1) - np.log(N)
    H = np.sum(lnqn) / float(N)

    # TODO implement gradients in the same matlab style
    return -1. * H
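# Worked check of the Eq (7) bound for a single 1-D component with variance s2
# (illustrative only; does not call lower_bound_MoG or its sq_dist helper):
# with N = 1 the bound is 0.5*log(4*pi*s2), which sits below the exact
# Gaussian entropy 0.5*log(2*pi*e*s2).
import numpy as np

s2 = 1.5
bound = 0.5 * np.log(4 * np.pi * s2)           # Eq (7) with N = 1, D = 1
exact = 0.5 * np.log(2 * np.pi * np.e * s2)    # exact Gaussian entropy
assert bound < exact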
def avg_pred_log(w, images):
    log_pc_x = 0
    for i in range(0, images.shape[0]):
        current_log_pc_x = (np.dot(np.transpose(w), images[i, :])
                            - logsumexp(np.dot(np.transpose(w), images[i, :])))
        log_pc_x = log_pc_x + current_log_pc_x
    return np.sum(log_pc_x) / float(images.shape[0])
def _cost_batched(self, inputs, targets, hprev, weights, disable_tqdm=True):
    W_hh, W_xh, b_h, W_hy, b_y = weights
    h = np.copy(hprev)
    h = h.reshape((self.batch_size, self.hidden_size, 1))
    loss = 0
    for t in tqdm(range(self.number_of_steps), disable=disable_tqdm):
        x = np.array([char_to_one_hot(c) for c in inputs[:, t]])
        x = x.reshape((self.batch_size, -1, 1))
        h = np.tanh(W_hh @ h + W_xh @ x + np.reshape(b_h, (-1, 1)))
        ys = W_hy @ h + np.reshape(b_y, (-1, 1))
        ys = np.squeeze(ys)
        target_indices = np.array([char_to_index[c] for c in targets[:, t]])
        # ps_target[t] = np.exp(ys[t][target_index]) / np.sum(np.exp(ys[t]))  # probability for next chars being target
        # loss += -np.log(ps_target[t])
        loss += np.sum([
            -(y[target_index] - logsumexp(y))
            for y, target_index in zip(ys, target_indices)
        ]) / (self.number_of_steps * self.batch_size)
    return loss
def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    # print(params.shape)
    log_weights = params[:k] - logsumexp(params[:k])
    print(np.exp(log_weights))
    # params2 = np.reshape(params[10:], (10, -1))
    # print(params2.shape)
    # print(params2)

    plt.cla()
    target_distribution = lambda x: np.exp(log_density(x, t))
    var_distribution = lambda x: np.exp(variational_log_density(params, x))
    plot_isocontours(ax, target_distribution)
    plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
    ax.set_autoscale_on(False)

    rs = npr.RandomState(0)
    samples = variational_sampler(params, num_plotting_samples, rs)
    plt.plot(samples[:, 0], samples[:, 1], 'x')

    plt.draw()
    plt.pause(1.0 / 30.0)
def loss(params, X, T):
    W_vect = params[:-1]
    alpha = params[-1]
    log_prior = -L2_reg * np.dot(W_vect, W_vect)
    preds = predictions(W_vect, X, alpha)
    normalised_log_probs = preds - logsumexp(preds)
    log_lik = np.sum(normalised_log_probs * T)
    return -1.0 * (log_prior + log_lik)
def log_likelihood(all_params, X, y, n_samples):
    rs = npr.RandomState(0)
    samples = [sample_mean_cov_from_deep_gp(all_params, X, True, rs, FITC=True)
               for i in xrange(n_samples)]
    return (logsumexp(np.array([mvn.logpdf(y, mean, var) for mean, var in samples]))
            - np.log(n_samples)
            + evaluate_prior(all_params))
def log_marginal_likelihood(params, data):
    cluster_lls = []
    for log_proportion, mean, chol in zip(*unpack_params(params)):
        cov = np.dot(chol.T, chol) + 0.000001 * np.eye(D)
        cluster_log_likelihood = log_proportion + mvn.logpdf(data, mean, cov)
        cluster_lls.append(np.expand_dims(cluster_log_likelihood, axis=0))
    cluster_lls = np.concatenate(cluster_lls, axis=0)
    return np.sum(logsumexp(cluster_lls, axis=0))
def get_error_and_ll(w, v_prior, X, y, K, location, scale):
    v_noise = np.exp(parser.get(w, 'log_v_noise')[0, 0]) * scale**2
    q = get_parameters_q(w, v_prior)
    samples_q = draw_samples(q, K)
    outputs = predict(samples_q, X) * scale + location
    log_factor = (-0.5 * np.log(2 * math.pi * v_noise)
                  - 0.5 * (np.tile(y, (1, K)) - np.array(outputs))**2 / v_noise)
    ll = np.mean(logsumexp(log_factor - np.log(K), 1))
    error = np.sqrt(np.mean((y - np.mean(outputs, 1, keepdims=True))**2))
    return error, ll
def neural_net_predict(params, inputs):
    """Implements a deep neural network for classification.
       params is a list of (weights, bias) tuples.
       inputs is an (N x D) matrix.
       returns normalized class log-probabilities."""
    for W, b in params:
        outputs = np.dot(inputs, W) + b
        inputs = np.tanh(outputs)
    return outputs - logsumexp(outputs, axis=1, keepdims=True)
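# Usage sketch for neural_net_predict with hypothetical layer sizes: each row
# of the exponentiated output should sum to one, since the last line subtracts
# the per-row logsumexp.
import numpy as np

rng = np.random.RandomState(0)
layer_sizes = [4, 8, 3]
params = [(rng.randn(m, n) * 0.1, rng.randn(n) * 0.1)
          for m, n in zip(layer_sizes[:-1], layer_sizes[1:])]
inputs = rng.randn(5, 4)
logprobs = neural_net_predict(params, inputs)
print(np.exp(logprobs).sum(axis=1))   # ~ [1. 1. 1. 1. 1.]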
def predicted_class_logprobs(self, W_vect, inputs):
    for W, b in self.unpack_layers(W_vect):
        outputs = np.dot(inputs, W) + b
        if self.activation_type == 'tanh':
            inputs = np.tanh(outputs)
        elif self.activation_type == 'relu':
            inputs = relu(outputs)
        else:
            raise ValueError('unknown activation_type {}'.format(self.activation_type))
    return outputs - logsumexp(outputs, axis=1, keepdims=True)
def predictions(self, W_vect, inputs):
    '''For classification, returns an N*C matrix of log probabilities.
    For regression, returns an N*K matrix of predicted means.'''
    for W, b in self.unpack_layers(W_vect):
        outputs = np.dot(inputs, W) + b
        inputs = self.nonlinearity(outputs)
    if self.output_type == 'regression':
        return outputs
    if self.output_type == 'classification':
        logprobs = outputs - logsumexp(outputs, axis=1, keepdims=True)
        return logprobs
def unpack_params(params):
    """Unpacks parameter vector into the proportions, means and covariances
    of each mixture component.  The covariance matrices are parametrized by
    their Cholesky decompositions."""
    log_proportions = parser.get(params, "log proportions")
    normalized_log_proportions = log_proportions - logsumexp(log_proportions)
    means = parser.get(params, "means")
    lower_tris = np.tril(parser.get(params, "lower triangles"), k=-1)
    diag_chols = np.exp(parser.get(params, "log diagonals"))
    chols = lower_tris + np.make_diagonal(diag_chols, axis1=-1, axis2=-2)
    return normalized_log_proportions, means, chols
def unpack_params(params):
    """Unpacks parameter vector into the proportions, means and covariances
    of each mixture component.  The covariance matrices are parametrized by
    their Cholesky decompositions."""
    log_proportions = parser.get(params, 'log proportions')
    normalized_log_proportions = log_proportions - logsumexp(log_proportions)
    means = parser.get(params, 'means')
    lower_tris = np.tril(parser.get(params, 'lower triangles'), k=-1)
    diag_chols = np.exp(parser.get(params, 'log diagonals'))
    chols = []
    for lower_tri, diag in zip(lower_tris, diag_chols):
        chols.append(np.expand_dims(lower_tri + np.diag(diag), 0))
    chols = np.concatenate(chols, axis=0)
    return normalized_log_proportions, means, chols
def _m_step(self):
    assert self.resp.shape[0] == self.num_samp
    pseud_lcount = logsumexp(self.resp, axis=0).flat
    r = exp(self.resp)
    self.comp_dist = []
    for c in range(self.num_components):
        norm = exp(pseud_lcount[c])
        mu = np.sum(r[:, c:c+1] * self.samples, axis=0) / norm
        diff = self.samples - mu
        scatter_matrix = np.zeros([self.samples.shape[1]] * 2)
        for i in range(diff.shape[0]):
            scatter_matrix += r[i, c:c+1] * diff[i:i+1, :].T.dot(diff[i:i+1, :])
        scatter_matrix /= norm
        self.comp_dist.append(mvnorm(mu, scatter_matrix))
    self.comp_lprior = pseud_lcount - log(self.num_samp)
def mog_like(x, means, icovs, dets, pis):
    """
    Compute the likelihood according to a mixture of Gaussians with
        means = [mu0, mu1, ..., muK]
        icovs = [C0^-1, ..., CK^-1]
        dets  = [|C0|, ..., |CK|]
        pis   = [pi1, ..., piK]   (sum to 1)
    at locations given by x = [x1, ..., xN]
    """
    xx = np.atleast_2d(x)
    centered = xx[:, :, np.newaxis] - means.T[np.newaxis, :, :]
    solved = np.einsum('ijk,lji->lki', icovs, centered)
    logprobs = (-0.5 * np.sum(solved * centered, axis=1)
                - np.log(2 * np.pi) - 0.5 * np.log(dets) + np.log(pis))
    logprob = scpm.logsumexp(logprobs, axis=1)
    if len(x.shape) == 1:
        return np.exp(logprob[0])
    else:
        return np.exp(logprob)
def mog_logmarglike(x, means, covs, pis, ind=0):
    """ marginal x or y (depending on ind) """
    K = pis.shape[0]
    xx = np.atleast_2d(x)
    centered = xx.T - means[:, ind, np.newaxis].T
    logprobs = []
    for kk in xrange(K):
        quadterm = centered[:, kk] * centered[:, kk] * (1. / covs[kk, ind, ind])
        logprobsk = (-.5 * quadterm - .5 * np.log(2 * np.pi)
                     - .5 * np.log(covs[kk, ind, ind]) + np.log(pis[kk]))
        logprobs.append(np.squeeze(logprobsk))
    logprobs = np.array(logprobs)
    logprob = scpm.logsumexp(logprobs, axis=0)
    if np.isscalar(x):
        return logprob[0]
    else:
        return logprob
def cost(theta):
    # Unpack parameters
    nu = np.concatenate([theta[1], [0]], axis=0)
    S = theta[0]
    logdetS = np.expand_dims(np.linalg.slogdet(S)[1], 1)
    y = np.concatenate([samples.T, np.ones((1, N))], axis=0)

    # Calculate log_q
    y = np.expand_dims(y, 0)

    # 'Probability' of y belonging to each cluster
    log_q = -0.5 * (np.sum(y * np.linalg.solve(S, y), axis=1) + logdetS)

    alpha = np.exp(nu)
    alpha = alpha / np.sum(alpha)
    alpha = np.expand_dims(alpha, 1)

    loglikvec = logsumexp(np.log(alpha) + log_q, axis=0)
    return -np.sum(loglikvec)
def gmm_logprob(x, ws, mus, sigs, invsigs=None, logdets=None):
    """ Gaussian Mixture Model likelihood
        Input:
          - x       = N x D array of data (N iid)
          - ws      = K length vector that sums to 1, mixing weights
          - mus     = K x D array of mixture component means
          - sigs    = K x D x D array of mixture component covariances
          - invsigs = K x D x D array of mixture component covariance inverses
          - logdets = K array of mixture component covariance logdets

        Output:
          - N length array of log likelihood values

        TODO: speed this up
    """
    if sigs is None:
        assert invsigs is not None and logdets is not None, \
            "need sigs if you don't include logdets and invsigs"

    # compute invsigs if needed
    if invsigs is None:
        invsigs = np.array([np.linalg.inv(sig) for sig in sigs])
        logdets = np.array([np.linalg.slogdet(sig)[1] for sig in sigs])

    # compute each gauss component separately
    xx = np.atleast_2d(x)
    centered = xx[:, :, np.newaxis] - mus.T[np.newaxis, :, :]
    solved = np.einsum('ijk,lji->lki', invsigs, centered)
    logprobs = (-0.5 * np.sum(solved * centered, axis=1)
                - np.log(2 * np.pi) - 0.5 * logdets + np.log(ws))
    logprob = scpm.logsumexp(logprobs, axis=1)
    if len(x.shape) == 1:
        return logprob[0]
    else:
        return logprob
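# Usage sketch for gmm_logprob (illustrative values): with a single component
# and D = 2 it should match scipy's multivariate normal log-density.  The
# mvn_ref alias is local to this example.
import numpy as np
from scipy.stats import multivariate_normal as mvn_ref

x = np.array([[0.3, -0.7], [1.2, 0.5]])
ws = np.array([1.0])
mus = np.array([[0.0, 0.0]])
sigs = np.array([np.eye(2)])
print(gmm_logprob(x, ws, mus, sigs))
print(mvn_ref.logpdf(x, mus[0], sigs[0]))   # same values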
def logsoftmax(v):
    return v - logsumexp(v, 1).reshape(-1, 1)
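# Tiny check for logsoftmax (assumes the same numpy / logsumexp imports used by
# these snippets): exponentiated rows sum to one, and the result is invariant
# to adding a per-row constant.
import numpy as np
from scipy.special import logsumexp

v = np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])
print(np.exp(logsoftmax(v)).sum(axis=1))                # [1. 1.]
print(np.allclose(logsoftmax(v), logsoftmax(v + 5.0)))  # True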
def hiddens_to_output_probs(hiddens):
    output = concat_and_multiply(params['predict'], hiddens)
    return output - logsumexp(output, axis=1, keepdims=True)
def loss(W_vect, X, T):
    log_prior = -L2_reg * np.dot(W_vect, W_vect)
    preds = predictions(W_vect, X)
    normalised_log_probs = preds - logsumexp(preds)
    log_lik = np.sum(normalised_log_probs * T)
    return -1.0 * (log_prior + log_lik)
def mixture_log_density(var_mixture_params, x):
    """Returns a weighted average over component densities."""
    log_weights, var_params = unpack_mixture_params(var_mixture_params)
    component_log_densities = np.vstack([component_log_density(params_k, x)
                                         for params_k in var_params]).T
    return logsumexp(component_log_densities + log_weights, axis=1, keepdims=False)
def _e_step(self):
    lpdfs = (np.array([d.logpdf(self.samples).flat[:] for d in self.comp_dist]).T
             + self.comp_lprior)
    self.resp = lpdfs - logsumexp(lpdfs, axis=1).reshape((self.num_samp, 1))
def log_likelihood(all_params):
    # implement mini batches later?
    n_samples = 1
    samples = [sample_mean_cov_from_deep_gp(all_params, X, True)
               for i in xrange(n_samples)]
    return (logsumexp(np.array([mvn.logpdf(y, mean, var + 1e-6 * np.eye(len(var)) * np.max(np.diag(var)))
                                for mean, var in samples]))
            - np.log(n_samples)
            + evaluate_prior(all_params))
def log_Z_likelihood(q, f_hat, v_noise, X, y, K):
    samples = draw_samples(q, K)
    log_f_hat = np.sum(-0.5 / f_hat['v'] * samples**2 + f_hat['m'] / f_hat['v'] * samples, 1)
    log_factor_value = alpha * (log_likelihood_factor(samples, v_noise, X, y) - log_f_hat)
    return np.sum(logsumexp(log_factor_value, 1) + np.log(1.0 / K))
def predictions(W_vect, inputs):
    for W, b in unpack_layers(W_vect):
        outputs = np.dot(inputs, W) + b
        inputs = np.tanh(outputs)
    return outputs - logsumexp(outputs, axis=1, keepdims=True)
def gaussian_loglike(x, mu, log_sigmasq):
    return np.mean(logsumexp(
        -0.5 * ((np.log(2 * np.pi) + log_sigmasq) + (x - mu)**2. / np.exp(log_sigmasq)),
        axis=0))
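# Sanity-check sketch for gaussian_loglike: with a single row, the logsumexp
# over axis 0 is the identity, so the result reduces to the mean per-dimension
# Gaussian log-density (compared here against scipy.stats.norm).
import numpy as np
from scipy.stats import norm

x = np.array([[0.5, -1.0, 2.0]])
mu = np.zeros((1, 3))
log_sigmasq = np.zeros((1, 3))   # unit variance
print(gaussian_loglike(x, mu, log_sigmasq))
print(np.mean(norm.logpdf(x, mu, np.exp(0.5 * log_sigmasq))))   # same value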
def log_softmax(self, batch):
    batch = batch - np.max(batch, axis=1, keepdims=True)
    return batch - logsumexp(batch, axis=1).reshape((batch.shape[0], -1))
def logpdf(self, x):
    rval = np.array([self.comp_lprior[i] + self.comp_dist[i].logpdf(x)
                     for i in range(self.comp_lprior.size)])
    rval = logsumexp(rval, 0).flatten()
    return rval