def free_energy(self, v, beta=1.0, raw=False):
    if self.hidden_type == UnitType.BERNOULLI:
        hidden = self.h_bias + np.matmul((v / self.v_sigma), self.W)
        hidden *= beta
        hidden = -np.sum(np.log(1.0 + np.exp(np.clip(hidden, -30, 30))), axis=1)
    elif self.hidden_type == UnitType.GAUSSIAN:
        # TODO: verify this closed form for Gaussian hidden units
        hidden = np.sum(
            1 / (2 * self.h_sigma) *
            (self.h_bias**2 -
             (self.h_bias + self.h_sigma * np.matmul(v / self.v_sigma, self.W))**2),
            axis=1)
        hidden -= 0.5 * self.num_hidden * np.log(2 * np.pi) \
            + np.sum(np.log(self.h_sigma))
    else:
        raise ValueError(f"unknown type {self.hidden_type}")

    if self.visible_type == UnitType.BERNOULLI:
        visible = -np.matmul(v, self.v_bias)
        visible *= beta
    elif self.visible_type == UnitType.GAUSSIAN:
        visible = 0.5 * np.sum(
            ((v - self.v_bias)**2) /
            (self.v_sigma**2 / beta + np.finfo(np.float32).eps),
            axis=1)
    else:
        raise ValueError(f"unknown type {self.visible_type}")

    if raw:
        # per-sample free energies
        return hidden + visible
    else:
        # average over the batch
        return np.mean(hidden + visible)
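# A standalone sanity check (not part of the class) of the closed-form free
# energy used above, for the Bernoulli-Bernoulli case with unit sigmas and
# beta = 1:
#
#     F(v) = -v.b - sum_j log(1 + exp(c_j + (v W)_j))
#
# compared against brute-force enumeration over all hidden configurations.
# All names below (`W`, `b`, `c`, `v`) are local to this sketch and are not
# attributes of the RBM class; `np` is numpy, as used throughout.
def _check_bernoulli_free_energy_closed_form():
    import itertools

    rng = np.random.default_rng(0)
    num_visible, num_hidden = 4, 3
    W = rng.normal(size=(num_visible, num_hidden))
    b = rng.normal(size=num_visible)   # visible bias
    c = rng.normal(size=num_hidden)    # hidden bias
    v = rng.integers(0, 2, size=num_visible).astype(float)

    # closed form, mirroring the Bernoulli branch of `free_energy`
    closed = -v @ b - np.sum(np.log1p(np.exp(c + v @ W)))

    # brute force: F(v) = -log sum_h exp(-E(v, h)),
    # with E(v, h) = -v.b - h.c - v^T W h
    energies = [-(v @ b) - (np.array(h) @ c) - v @ W @ np.array(h)
                for h in itertools.product([0.0, 1.0], repeat=num_hidden)]
    brute = -np.log(np.sum(np.exp(-np.array(energies))))

    assert np.isclose(closed, brute)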
def test_metropolis_hastings(samples, proba, proposal_proba, proposal_sample):
    # initialize kernel
    kernel = sampling.MetropolisHastingsKernel(proba, proposal_sample,
                                               proposal_proba)

    # test kernel
    state = 1.0
    state = kernel.sample(state)
    print(state)

    # test sampler
    sampler = sampling.Sampler(kernel, show_progress=True)
    trace = sampler.run(initial=state)

    # verify the sampler produces reasonable results
    target_mean = np.mean(samples)
    target_std = np.std(samples)

    assert np.abs(np.mean(trace) - target_mean) < 0.1
    assert np.abs(np.std(trace) - target_std) < 0.1
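# Hypothetical construction of the objects this test expects (in the real test
# suite these are presumably pytest fixtures): `proba(x)` is the target
# density, `proposal_sample(x)` draws a candidate given the current state, and
# `proposal_proba(x, y)` is the density of proposing `y` from `x`. The target
# here is a standard normal; the exact signatures expected by
# `sampling.MetropolisHastingsKernel` are an assumption.
def _gaussian_pdf(x, loc=0.0, scale=1.0):
    return np.exp(-0.5 * ((x - loc) / scale)**2) / (scale * np.sqrt(2 * np.pi))

def example_proba(x):
    return _gaussian_pdf(x)

def example_proposal_sample(x):
    # symmetric random-walk proposal around the current state
    return np.random.normal(loc=x, scale=0.5)

def example_proposal_proba(x, y):
    # density of proposing `y` given `x`; symmetric, so it cancels in the
    # Metropolis-Hastings acceptance ratio
    return _gaussian_pdf(y, loc=x, scale=0.5)

example_samples = np.random.normal(size=10_000)  # reference draws from the target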
def test_gaussian_rbm(mnist_data):
    np.random.seed(RANDOM_SEED)

    X, _, _, _ = mnist_data

    batch_size = 10
    visible_size = X.shape[1]
    # Gaussian RBMs are VERY sensitive to params on MNIST
    # and `hidden_size == 300` just happens to work.
    hidden_size = 300

    # standardize each pixel to zero mean and unit variance
    X = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + np.finfo(np.float32).eps)
    X[np.isnan(X)] = 1.0

    v = X[:batch_size].astype(np.float64)

    model = GaussianRBM(visible_size, hidden_size, estimate_visible_sigma=False)

    # verify shapes
    rbm_verify_shapes(model, v, batch_size, visible_size, hidden_size)

    # train :)
    rbm_train_single_sample(model, v)
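# Minimal standalone check (local names only) of the per-feature
# standardization used in `test_gaussian_rbm`: after the transform each
# non-constant column has mean ~0 and std ~1, while the epsilon keeps
# constant columns finite.
def _check_standardization():
    rng = np.random.default_rng(0)
    X = rng.random((100, 5))
    X[:, 0] = 0.3  # constant feature, std == 0

    Z = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + np.finfo(np.float32).eps)

    assert np.all(np.isfinite(Z))
    assert np.allclose(np.mean(Z[:, 1:], axis=0), 0.0, atol=1e-6)
    assert np.allclose(np.std(Z[:, 1:], axis=0), 1.0, atol=1e-4)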
def fit(self,
        train_data,
        k=1,
        learning_rate=0.01,
        num_epochs=5,
        batch_size=64,
        test_data=None,
        show_progress=True,
        weight_decay=0.0,
        early_stopping=-1,
        callbacks={},
        **sampler_kwargs):
    """
    Parameters
    ----------
    train_data: array-like
        Data to fit RBM on.
    k: int, default=1
        Number of sampling steps to perform. Used by CD-k, PCD-k and PT.
    learning_rate: float or array, default=0.01
        Learning rate used when updating the parameters.
        Can also be an array of the same length as ``self.variables``,
        in which case the learning rate at index ``i`` will be used to
        update ``RBM.variables[i]``.
    num_epochs: int, default=5
        Number of epochs to train.
    batch_size: int, default=64
        Batch size to use within each epoch.
    test_data: array-like, default=None
        Data similar to ``train_data``, but this will only be used as
        validation data, not trained on. If specified, will compute and
        print the free energy / negative log-likelihood on this dataset
        after each epoch.
    show_progress: bool, default=True
        If true, will display progress bar for each epoch.
    weight_decay: float, default=0.0
        If greater than 0.0, weight decay will be applied to the
        parameter updates. See :func:`RBM.step` for more information.
    early_stopping: int, default=-1
        If ``test_data`` is given and ``early_stopping > 0``, training
        will terminate early if the free energy of ``test_data`` did not
        improve over the last ``early_stopping`` epochs.
    callbacks: dict, default={}
        Optional callbacks, keyed by ``"pre_epoch"``, ``"post_step"`` and
        ``"post_epoch"``; each value is a list of callables invoked at
        the corresponding point of training.
    **sampler_kwargs:
        Additional keyword arguments passed on to :func:`RBM.step`.

    Returns
    -------
    nlls_train, nlls_test : array-like, array-like
        Returns the free energy of both ``train_data`` and ``test_data``
        as computed at each epoch.
    """
    num_samples = train_data.shape[0]
    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    nlls_train = []
    nlls = []

    prev_best = None

    for epoch in range(1, num_epochs + 1):
        if "pre_epoch" in callbacks:
            for c in callbacks["pre_epoch"]:
                c(self, epoch)

        # reset sampler at beginning of epoch
        # Used by methods such as PCD to reset the
        # initialization value.
        self.reset_sampler()

        # compute train & test negative log-likelihood
        # TODO: compute train- and test-nll in mini-batches
        # to avoid numerical problems
        nll_train = float(np.mean(self.free_energy(train_data)))
        nlls_train.append(nll_train)
        _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (train):"
                  f" {nll_train:>20.5f}")

        if test_data is not None:
            nll = float(np.mean(self.free_energy(test_data)))
            _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (test):"
                      f" {nll:>20.5f}")
            nlls.append(nll)

        # stop early if none of the `early_stopping` previous
        # evaluations on `test_data` improved.
        if early_stopping > 0 and test_data is not None:
            if epoch > early_stopping and \
               np.all([a >= prev_best for a in nlls[epoch - early_stopping:]]):
                _log.info(
                    f"Hasn't improved in {early_stopping} epochs; stopping early"
                )
                break
            else:
                # update `prev_best`
                if prev_best is None:
                    prev_best = nll
                elif nll < prev_best:
                    prev_best = nll

        # iterate through dataset in batches
        if show_progress:
            bar = tqdm(total=num_samples)

        for start in range(0, num_samples, batch_size):
            # ensure we don't go out-of-bounds
            end = min(start + batch_size, num_samples)

            # take a gradient-step on a shuffled mini-batch
            self.step(train_data[indices[start:end]],
                      k=k,
                      lr=learning_rate,
                      lmbda=weight_decay,
                      **sampler_kwargs)

            if "post_step" in callbacks:
                for c in callbacks["post_step"]:
                    c(self, epoch, end)

            # update progress
            if show_progress:
                bar.update(end - start)

        if show_progress:
            bar.close()

        # shuffle indices for next epoch
        np.random.shuffle(indices)

        if "post_epoch" in callbacks:
            for c in callbacks["post_epoch"]:
                c(self, epoch)

    # compute train & test negative log-likelihood after the final epoch
    nll_train = float(np.mean(self.free_energy(train_data)))
    nlls_train.append(nll_train)
    _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (train): "
              f"{nll_train:>20.5f}")

    if test_data is not None:
        nll = float(np.mean(self.free_energy(test_data)))
        _log.info(f"[{epoch:03d} / {num_epochs:03d}] NLL (test): "
                  f"{nll:>20.5f}")
        nlls.append(nll)

    return nlls_train, nlls
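# Hypothetical usage sketch of `fit`. The `GaussianRBM` constructor call
# mirrors the one in `test_gaussian_rbm` above; the data here is random
# standardized noise purely to keep the snippet self-contained, and the
# hyperparameters are illustrative, not recommendations.
def _example_fit_usage():
    X_train = np.random.randn(256, 20)
    X_val = np.random.randn(64, 20)

    def log_weight_norm(model, epoch):
        # `post_epoch` callbacks are invoked as `c(self, epoch)` in the loop above
        print(f"epoch {epoch:03d}: ||W|| = {np.linalg.norm(model.W):.4f}")

    model = GaussianRBM(20, 16, estimate_visible_sigma=False)
    nlls_train, nlls_test = model.fit(
        X_train,
        k=1,
        learning_rate=1e-3,
        num_epochs=5,
        batch_size=32,
        test_data=X_val,
        early_stopping=2,
        callbacks={"post_epoch": [log_weight_norm]},
    )
    return nlls_train, nlls_test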
def grad(self, v, burnin=-1, persist=False, **sampler_kwargs):
    if self.sampler_method.lower() == 'cd':
        v_0, h_0, v_k, h_k = self.contrastive_divergence(v, **sampler_kwargs)
    elif self.sampler_method.lower() == 'pcd':
        # Persistent Contrastive Divergence
        if self._prev is not None:
            v_0, h_0 = self._prev
        else:
            # if ``burnin`` was specified, run it to initialize the chain
            if burnin > 0:
                _log.info(
                    f"Performing burnin of {burnin} steps to initialize PCD"
                )
                _, _, v_0, h_0 = self.contrastive_divergence(
                    v, k=burnin, **sampler_kwargs)
            else:
                h_0 = self.sample_hidden(v, **sampler_kwargs)
                v_0 = v

        v_0, h_0, v_k, h_k = self.contrastive_divergence(v, h_0=h_0,
                                                         **sampler_kwargs)

        # persist
        self._prev = (v_k, h_k)
    elif self.sampler_method.lower() == 'pt':
        h_0 = None
        if self._prev is not None:
            v_0, h_0 = self._prev
        else:
            _log.info("Initializing PT chain...")
            v_0 = self._init_parallel_tempering(v, **sampler_kwargs)

        # FIXME: make compatible with `parallel_tempering` returning
        # all the states
        if h_0 is None:
            v_0, h_0, v_k, h_k = self.parallel_tempering(
                v_0, hs=h_0, include_negative_shift=True, **sampler_kwargs)
        elif sampler_kwargs.get("include_negative_shift", False):
            v_0, h_0, v_k, h_k = self.parallel_tempering(v_0, hs=h_0,
                                                         **sampler_kwargs)
        else:
            # FIXME: make compatible with `parallel_tempering` returning
            # all the states
            v_k, h_k = self.parallel_tempering(v_0, hs=h_0, **sampler_kwargs)

        if persist:
            self._prev = (v_k, h_k)

        # take the first tempered distribution, i.e. the one corresponding
        # to the target distribution
        v_0 = v_0[0]
        h_0 = h_0[0]
        v_k = v_k[0]
        h_k = h_k[0]
    else:
        raise ValueError(f"{self.sampler_method} is not supported")

    # all expressions below using `v` or `mean_h` will contain
    # AT LEAST one factor of `1 / v_sigma` and `1 / h_sigma`, respectively,
    # so we include those right away
    v_0 = v_0 / self.v_sigma
    v_k = v_k / self.v_sigma
    mean_h_0 = self.mean_hidden(v_0) / self.h_sigma
    mean_h_k = self.mean_hidden(v_k) / self.h_sigma

    # Recall: `v_sigma` and `h_sigma` have no effect if they are set to 1

    # v_0 / (v_sigma^2) - v_k / (v_sigma^2)
    delta_v_bias = (v_0 - v_k) / self.v_sigma
    # E[h_0 | v_0] / (h_sigma^2) - E[h_k | v_k] / (h_sigma^2)
    delta_h_bias = (mean_h_0 - mean_h_k) / self.h_sigma
    # Gradient wrt. W
    # (v_0 / v_sigma) (1 / h_sigma) E[h_0 | v_0] - (v_k / v_sigma) (1 / h_sigma) E[h_k | v_k]
    x = mean_h_0.reshape(mean_h_0.shape[0], 1, mean_h_0.shape[1])
    y = v_0.reshape(v_0.shape[0], v_0.shape[1], 1)
    z_0 = np.matmul(y, x)

    x = mean_h_k.reshape(mean_h_k.shape[0], 1, mean_h_k.shape[1])
    y = v_k.reshape(v_k.shape[0], v_k.shape[1], 1)
    z_k = np.matmul(y, x)

    delta_W = z_0 - z_k

    # average over batch and take the negative
    delta_v_bias = -np.mean(delta_v_bias, axis=0)
    delta_h_bias = -np.mean(delta_h_bias, axis=0)
    delta_W = -np.mean(delta_W, axis=0)

    grads = [delta_v_bias, delta_h_bias, delta_W]

    # variances
    if self.visible_type == UnitType.GAUSSIAN \
       and self.estimate_visible_sigma:
        # in `GaussianRBM`, where only the VISIBLE units are Gaussian,
        # we only compute `v_sigma`
        # (((v_0 - b)^2 / (v_sigma^2)) - (v / v_sigma) \sum_{\mu} E[h_{\mu} | v] / sigma_{\mu}) / v_sigma
        delta_v_sigma_data = (((v_0 - (self.v_bias / self.v_sigma))**2)
                              - v_0 * (np.matmul(mean_h_0, self.W.T)))
        delta_v_sigma_model = (((v_k - (self.v_bias / self.v_sigma))**2)
                               - v_k * (np.matmul(mean_h_k, self.W.T)))
        delta_v_sigma = (delta_v_sigma_data - delta_v_sigma_model) / self.v_sigma

        # average over batch and take the negative
        delta_v_sigma = -np.mean(delta_v_sigma, axis=0)
        grads.append(delta_v_sigma)

    if self.hidden_type == UnitType.GAUSSIAN \
       and self.estimate_hidden_sigma:
        # TODO: Implement
        raise NotImplementedError("gradients for gaussian hidden"
                                  " units not yet implemented")

        # NOTE: unreachable draft, kept for future reference
        delta_h_sigma_data = (((h_0 - (self.h_bias / self.h_sigma))**2)
                              - h_0 * (np.matmul(mean_h_0, self.W.T)))
        delta_h_sigma_model = (((h_k - (self.h_bias / self.h_sigma))**2)
                               - h_k * (np.matmul(mean_h_k, self.W.T)))
        delta_h_sigma = delta_h_sigma_data - delta_h_sigma_model

        # average over batch and take the negative
        delta_h_sigma = -np.mean(delta_h_sigma, axis=0)
        grads.append(delta_h_sigma)

    return grads
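# A minimal sketch (not the repo's actual `step`) of how the list returned by
# `grad` could drive a parameter update. It assumes `model.variables` holds
# the parameter arrays in the same order as `grads` above (v_bias, h_bias, W,
# then any estimated sigmas), which is how the docstring of `fit` describes
# per-variable learning rates.
def sgd_step_sketch(model, v_batch, lr=1e-2, lmbda=0.0, **sampler_kwargs):
    grads = model.grad(v_batch, **sampler_kwargs)
    for variable, g in zip(model.variables, grads):
        # `grad` returns batch-averaged, negated (data - model) differences,
        # i.e. gradients of the negative log-likelihood, so plain descent
        # applies; `lmbda` adds L2 weight decay analogous to `fit`'s
        # `weight_decay` parameter.
        variable -= lr * (g + lmbda * variable)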