import numpy as np
import numpy.random as npr
import matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from scipy.special import expit

# autograd is optional; `has_autograd` gates its use in fit() below
try:
    from autograd import jacobian, hessian
    has_autograd = True
except ImportError:
    has_autograd = False

EPS = 1e-10  # assumed small constant guarding against division by zero

# load_mnist, _get_f_log_posterior and _get_H_log_post are assumed to be
# defined elsewhere in the project.


def _get_grad_log_post(W1D, Wprior, H, y, X, testing=False):
    """Returns the multinomial gradient of the negative log posterior
    probability with C classes.

    Parameters
    ----------
    W1D : array-like, shape (C*p, )
        Flattened vector of parameters at which the negative log posterior
        is to be evaluated
    Wprior : array-like, shape (C, p)
        Array of prior means on the parameters to be fit
    H : array-like, shape (C*p, C*p) or, if independent between classes, (C, p, p)
        Array of prior Hessian (inverse covariance of prior distribution
        of parameters)
    y : array-like, shape (N, )
        Vector of integer responses in {0, 1, ..., C-1}
    X : array-like, shape (N, p)
        Array of features
    testing : bool
        If True, also return the likelihood and prior parts of the gradient

    Returns
    -------
    grad_log_post1D : array-like, shape (C*p, )
        Flattened gradient of the negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning: A Probabilistic Perspective',
    MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning',
    Springer (2006)
    """
    # calculate the gradient of the negative log posterior
    C, p = Wprior.shape
    W = W1D.reshape(C, p)
    mu = _get_softmax_probs(X, W)  # shape (N, C)
    grad_log_likelihood = np.zeros_like(W)
    grad_log_prior = np.zeros_like(W)
    for c in range(C):
        # gradient of the negative log likelihood for class c, shape (p, )
        grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))
        if H.shape == (C, p, p):
            # block-diagonal prior: each class has its own (p, p) Hessian
            K = (W[c] - Wprior[c]).reshape(-1)
            grad_log_prior[c] = H[c] @ K
    if H.shape == (C * p, C * p):
        # full prior Hessian couples all classes
        K = (W - Wprior).reshape(-1)  # shape (C*p, )
        grad_log_prior = (H @ K).reshape(C, p)  # back to shape (C, p)
    grad_log_posterior = grad_log_likelihood + grad_log_prior
    grad_log_post1D = grad_log_posterior.reshape(-1)
    if testing:
        return [grad_log_post1D,
                grad_log_likelihood.reshape(-1),
                grad_log_prior.reshape(-1)]
    return grad_log_post1D
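# The gradient above relies on the helper `_get_softmax_probs`, whose body is
# not shown in this section. A minimal sketch of what it presumably computes,
# assuming the (N, C) output shape used above: row-wise softmax probabilities
# of the class logits X @ W.T, stabilized against overflow.
def _get_softmax_probs(X, W):
    """Hypothetical sketch: softmax class probabilities, shape (N, C)."""
    logits = X @ W.T  # shape (N, C)
    logits = logits - logits.max(axis=1, keepdims=True)  # stabilize exp
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum(axis=1, keepdims=True)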
def plot_images(images, ax, ims_per_row=5, padding=5,
                digit_dimensions=(28, 28), cmap=matplotlib.cm.binary,
                vmin=None, vmax=None):
    """Images should be a (N_images x pixels) matrix."""
    N_images = images.shape[0]
    N_rows = np.int32(np.ceil(float(N_images) / ims_per_row))
    pad_value = np.min(images.ravel())
    # one big canvas, padded with the darkest pixel value
    concat_images = np.full(
        ((digit_dimensions[0] + padding) * N_rows + padding,
         (digit_dimensions[1] + padding) * ims_per_row + padding),
        pad_value)
    for i in range(N_images):
        cur_image = np.reshape(images[i, :], digit_dimensions)
        row_ix = i // ims_per_row
        col_ix = i % ims_per_row
        row_start = padding + (padding + digit_dimensions[0]) * row_ix
        col_start = padding + (padding + digit_dimensions[1]) * col_ix
        concat_images[row_start:row_start + digit_dimensions[0],
                      col_start:col_start + digit_dimensions[1]] = cur_image
    cax = ax.matshow(concat_images, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    return cax
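# A minimal usage sketch (not from the original): ten random arrays stand in
# for flattened 28x28 digits, just to demonstrate the call signature. Wrapped
# in a function so importing this module has no plotting side effects.
def _demo_plot_images():
    fig, ax = plt.subplots(figsize=(6, 3))
    plot_images(np.random.rand(10, 784), ax, ims_per_row=5)
    plt.show()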
def get_binary_monte_carlo_probs(X, w, H, num_samples=100):
    """Monte Carlo approximation to the posterior predictive probability
    for binary logistic regression.

    Parameters
    ----------
    X : array-like, shape (N, p)
        array of covariates
    w : array-like, shape (p, )
        array of fitted MAP parameters
    H : array-like, shape (p, p) or (p, )
        array of log posterior Hessian (inverse covariance of the fitted
        MAP parameters); a 1D array is treated as the Hessian diagonal
    num_samples : int
        number of samples used to approximate the posterior

    Returns
    -------
    probs : array-like, shape (N, )
        moderated (by the full posterior distribution) logistic probability
    preds : array-like, shape (N, )
        predicted classes ({0, 1})

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning: A Probabilistic Perspective',
    MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning',
    Springer (2006)
    """
    N, _ = X.shape
    if len(H.shape) == 2:
        # full Hessian: the posterior covariance is its inverse
        w_sample = np.random.multivariate_normal(w, np.linalg.inv(H),
                                                 num_samples)
    elif len(H.shape) == 1:
        # diagonal Hessian: invert elementwise (EPS guards against
        # division by zero)
        w_sample = np.random.multivariate_normal(w, np.diag(1 / (H + EPS)),
                                                 num_samples)
    else:
        raise ValueError('Incompatible Hessian')
    # average the sigmoid over posterior samples of the weights
    probs = np.mean(expit(X @ w_sample.T), axis=1)
    preds = np.int32(probs > 0.5)
    return probs, preds
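# Bishop (Sec. 4.5.2) gives a deterministic alternative to the sampling above:
# the probit ("moderated output") approximation, where the predictive
# probability is expit(mu / sqrt(1 + pi * s^2 / 8)) with mu = x.w and
# s^2 = x^T Sigma x. A sketch under the same full (p, p) Hessian convention
# as above; the function name is illustrative, not from the original module.
def get_binary_probit_probs(X, w, H):
    """Hypothetical sketch: moderated probability via the probit
    approximation, assuming H is the full (p, p) posterior Hessian
    (inverse covariance)."""
    Sigma = np.linalg.inv(H)              # posterior covariance of w
    mu = X @ w                            # predictive mean of the logit
    s2 = np.sum((X @ Sigma) * X, axis=1)  # predictive variance x^T Sigma x
    kappa = 1.0 / np.sqrt(1.0 + np.pi * s2 / 8.0)
    probs = expit(kappa * mu)
    return probs, np.int32(probs > 0.5)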
# -------------- LOADING DATASET ------------------------
# load the images
npr.seed(0)
_, train_images, train_labels, test_images, test_labels = load_mnist()
rand_idx = np.arange(train_images.shape[0])
npr.shuffle(rand_idx)
train_images = train_images[rand_idx]
train_labels = train_labels[rand_idx]

# uniformly sample a small, class-balanced training set: draw 30 class labels
# uniformly from {0, ..., 9}, then pick a random example of each drawn class
cls_labels = train_labels.argmax(axis=1)
cls_images = [train_images[cls_labels == i] for i in range(10)]
rand_cls = np.int32(npr.random(30) / 0.1)  # 30 uniform draws from {0, ..., 9}
rand_idx = [npr.randint(cls_images[cls].shape[0]) for cls in rand_cls]
train_images = np.vstack(
    [cls_images[rand_cls[i]][rand_idx[i]] for i in range(30)])
train_labels = np.vstack([
    train_labels[cls_labels == rand_cls[i]][rand_idx[i]] for i in range(30)
])

# binarize pixel intensities to {0, 1}
train_images = np.round(train_images)
test_images = np.round(test_images)
# -------------- LOADING DATASET ------------------------
print('LOADED DATASET')
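# A quick sanity check (not in the original script): count how many examples
# of each digit made it into the 30-example subset; the draws above give
# roughly 3 per class on average.
print('class counts:',
      np.bincount(train_labels.argmax(axis=1), minlength=10))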
def fit(y, X, Wprior, H, solver='BFGS', use_autograd=True, bounds=None,
        maxiter=10000, disp=False):
    """Bayesian multinomial logistic regression solver.

    Assumes a Laplace (Gaussian) approximation to the posterior of the
    fitted parameter vector. Uses scipy.optimize.minimize.

    Parameters
    ----------
    y : array-like, shape (N, )
        vector of integer responses in {0, 1, ..., C-1}
    X : array-like, shape (N, p)
        array of features
    Wprior : array-like, shape (C, p)
        array of prior means on the parameters to be fit
    H : array-like, shape (C*p, C*p) or, if independent between classes, (C, p, p)
        array of prior Hessian (inverse covariance of prior distribution
        of parameters)
    solver : string
        scipy.optimize solver; one of 'Newton-CG', 'BFGS' or 'L-BFGS-B'.
        The default is 'BFGS'.
    use_autograd : bool
        whether to use autograd's jacobian and hessian functions instead of
        the manually coded gradient and Hessian
    bounds : iterable of length C*p
        a list (or tuple) of (lower_bound, upper_bound) tuples, both floats,
        one per parameter. Only used when the solver is 'L-BFGS-B'. See the
        scipy.optimize.minimize docs for further information.
    maxiter : int
        maximum number of iterations for the scipy.optimize.minimize solver
    disp : bool
        whether to print convergence messages and additional information

    Returns
    -------
    W_results : array-like, shape (C, p)
        posterior parameters (MAP estimate)
    H_results : array-like, shape like `H`
        posterior Hessian (Hessian of negative log posterior evaluated at
        MAP parameters)

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning: A Probabilistic Perspective',
    MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning',
    Springer (2006)
    """
    # Check dimensionalities and data types
    # check X
    if len(X.shape) != 2:
        raise ValueError('X should be a matrix of shape (N, p)')
    (nX, pX) = X.shape
    if not np.issubdtype(X.dtype, np.floating):
        X = np.float32(X)

    # check y
    if len(y.shape) > 1:
        raise ValueError('y should be a vector of shape (N, )')
    if len(y) != nX:
        raise ValueError('y and X should have the same number of examples')
    if not np.issubdtype(y.dtype, np.integer):
        y = np.int32(y)

    # check Wprior
    if len(Wprior.shape) != 2:
        raise ValueError('prior mean should be a matrix of shape (C, p)')
    cW, pW = Wprior.shape
    if cW == 1:
        raise ValueError('please use binary logistic regression since the '
                         'number of classes is 1')
    if pW != pX:
        raise ValueError('prior mean should have the same number of '
                         'features as X')
    if not np.issubdtype(Wprior.dtype, np.floating):
        Wprior = np.float32(Wprior)

    # check H
    if len(H.shape) == 3:
        cH, pH1, pH2 = H.shape
        if cH != cW:
            raise ValueError('prior Hessian does not have the same number '
                             'of classes as prior mean')
        if pH1 != pX:
            raise ValueError('prior Hessian does not have the same number '
                             'of features as prior mean')
        if pH1 != pH2:
            raise ValueError('prior Hessian should be a square matrix of '
                             'shape (C, p, p)')
    elif len(H.shape) == 2:
        cpH1, cpH2 = H.shape
        if cpH1 != cpH2 or cpH1 != pX * cW:
            raise ValueError('prior Hessian should be a square matrix of '
                             'shape (C*p, C*p)')
    else:
        raise ValueError('prior Hessian should be of shape (C*p, C*p) '
                         'or (C, p, p)')
    if not np.issubdtype(H.dtype, np.floating):
        H = np.float32(H)

    if not has_autograd:
        use_autograd = False

    # choose between the manually coded and autograd's jacobian and hessian
    # functions; the Newton-CG solver uses a Hessian-vector product rather
    # than the full Hessian
    if use_autograd:
        jac_f = jacobian(_get_f_log_posterior)
        hess_f = hessian(_get_f_log_posterior)
    else:
        jac_f = _get_grad_log_post
        hess_f = _get_H_log_post

    # Do the regression
    if solver == 'Newton-CG':
        # Hessian-vector product for Newton-CG
        hessp_f = lambda W1D, q, Wprior, H, y, X: \
            hess_f(W1D, Wprior, H, y, X) @ q
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1),
                           args=(Wprior, H, y, X), jac=jac_f, hessp=hessp_f,
                           method='Newton-CG',
                           options={'maxiter': maxiter, 'disp': disp})
    elif solver == 'BFGS':
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1),
                           args=(Wprior, H, y, X), jac=jac_f, method='BFGS',
                           options={'maxiter': maxiter, 'disp': disp})
    elif solver == 'L-BFGS-B':
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1),
                           args=(Wprior, H, y, X), jac=jac_f,
                           method='L-BFGS-B', bounds=bounds,
                           options={'maxiter': maxiter, 'disp': disp})
    else:
        raise ValueError('Unknown solver specified: "{0}"'.format(solver))

    # posterior Hessian evaluated at the MAP estimate
    W_results1D = results.x
    H_results = hess_f(W_results1D, Wprior, H, y, X)
    W_results = W_results1D.reshape(Wprior.shape)
    return W_results, H_results
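# A minimal end-to-end sketch of calling fit() on synthetic data. The data,
# the prior precision `alpha`, and the demo function name are illustrative,
# not from the original; it assumes the module's _get_f_log_posterior and
# _get_H_log_post helpers are available.
def _demo_fit():
    np.random.seed(0)
    N, C, p = 150, 3, 2
    # three Gaussian blobs with distinct means, 50 examples per class
    X = np.random.randn(N, p) + np.repeat(np.eye(C, p) * 3, N // C, axis=0)
    y = np.repeat(np.arange(C), N // C)

    # zero-mean Gaussian prior with precision alpha on every weight,
    # independent between classes, i.e. H has shape (C, p, p)
    alpha = 1.0
    Wprior = np.zeros((C, p))
    H = alpha * np.stack([np.eye(p) for _ in range(C)])

    W_map, H_post = fit(y, X, Wprior, H, solver='BFGS', use_autograd=False)
    print('MAP weights shape:', W_map.shape)  # (C, p)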