def free_energy(self, v, beta=1.0, raw=False):
    """Computes the free energy F(v), averaged over the batch unless ``raw=True``."""
    # contribution from the hidden units (marginalized out)
    if self.hidden_type == UnitType.BERNOULLI:
        hidden = self.h_bias + np.matmul(v / self.v_sigma, self.W)
        hidden *= beta
        hidden = -np.sum(np.log(1.0 + np.exp(np.clip(hidden, -30, 30))), axis=1)
    elif self.hidden_type == UnitType.GAUSSIAN:
        # TODO: verify this Gaussian free-energy expression before relying on it
        hidden = np.sum(
            1 / (2 * self.h_sigma) *
            (self.h_bias ** 2 -
             (self.h_bias + self.h_sigma * np.matmul(v / self.v_sigma, self.W)) ** 2),
            axis=1)
        hidden -= 0.5 * self.num_hidden * np.log(2 * np.pi) + np.sum(np.log(self.h_sigma))

    # contribution from the visible units
    if self.visible_type == UnitType.BERNOULLI:
        visible = -np.matmul(v, self.v_bias)
        visible *= beta
    elif self.visible_type == UnitType.GAUSSIAN:
        visible = 0.5 * np.sum(
            ((v - self.v_bias) ** 2) /
            (self.v_sigma ** 2 / beta + np.finfo(np.float32).eps),
            axis=1)
    else:
        raise ValueError(f"unknown type {self.visible_type}")

    if raw:
        # per-sample free energies
        return hidden + visible
    else:
        # average over the batch
        return np.mean(hidden + visible)
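# For concreteness, a minimal standalone sketch (not a method of this class) of the
# Bernoulli-Bernoulli free energy that the branch above computes, i.e.
#   F(v) = -v.b - sum_mu log(1 + exp(c_mu + (v W)_mu)),
# assuming sigma = 1 everywhere. The function name and array shapes are illustrative only.
def _bernoulli_free_energy_sketch(v, W, v_bias, h_bias):
    import numpy as np  # `np` is already imported at module level; repeated so the sketch stands alone
    pre = h_bias + v @ W                                               # (batch, num_hidden) pre-activations
    hidden = -np.sum(np.log1p(np.exp(np.clip(pre, -30, 30))), axis=1)  # -sum_mu log(1 + exp(...))
    visible = -(v @ v_bias)                                            # -v . b per sample
    return np.mean(hidden + visible)                                   # batch-averaged free energy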
def mean_hidden(self, v, beta=1.0):
    "Computes conditional expectation E[h | v]."
    mean = self.h_bias + self.h_sigma * np.matmul(v / self.v_sigma, self.W)
    if self.hidden_type == UnitType.BERNOULLI:
        return sigmoid(mean * beta)
    elif self.hidden_type == UnitType.GAUSSIAN:
        return mean
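# Hypothetical usage sketch for `mean_hidden` (the `rbm` instance, batch size, and
# `num_visible` attribute below are assumptions for illustration, not part of this file):
#
#     v = np.random.binomial(1, 0.5, size=(32, rbm.num_visible)).astype(float)
#     p_h = rbm.mean_hidden(v)         # E[h | v]; activation probabilities for Bernoulli hidden units
#     h = np.random.binomial(1, p_h)   # one stochastic sample of the hidden layer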
def energy(self, v, h):
    """Computes the joint energy E(v, h) per sample, using the same sign convention as ``free_energy``."""
    # term only dependent on visible
    if self.visible_type == UnitType.BERNOULLI:
        visible = -np.matmul(v, self.v_bias)
    elif self.visible_type == UnitType.GAUSSIAN:
        visible = ((v - self.v_bias) ** 2) / (self.v_sigma ** 2 + np.finfo(np.float32).eps)
        visible = 0.5 * np.sum(visible, axis=1)

    # term only dependent on hidden
    if self.hidden_type == UnitType.BERNOULLI:
        hidden = -np.matmul(h, self.h_bias)
    elif self.hidden_type == UnitType.GAUSSIAN:
        hidden = ((h - self.h_bias) ** 2) / (self.h_sigma ** 2 + np.finfo(np.float32).eps)
        hidden = 0.5 * np.sum(hidden, axis=1)

    # "covariance" term
    # (v / sigma)^T W = sum_j( (v_j / sigma_j) W_{j \mu} )
    covariance = np.matmul(v / self.v_sigma, self.W)
    # h^T (v / sigma)^T W = sum_{\mu} h_{\mu} sum_j( (v_j / sigma_j) W_{j \mu} )
    covariance = dot_batch(h, covariance)

    return visible + hidden - covariance
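# Worked standalone sketch (illustrative names, uses the module-level `np`) of the
# Gaussian-visible / Bernoulli-hidden energy under the sign convention that
# `free_energy` above marginalizes:
#   E(v, h) = sum_i (v_i - b_i)^2 / (2 sigma_i^2) - h.c - (v / sigma)^T W h
def _gaussian_bernoulli_energy_sketch(v, h, W, v_bias, h_bias, v_sigma):
    quadratic = 0.5 * np.sum(((v - v_bias) / v_sigma) ** 2, axis=1)   # visible-only term
    linear = h @ h_bias                                               # hidden-only term
    interaction = np.einsum('bi,ij,bj->b', v / v_sigma, W, h)         # "covariance" term
    return quadratic - linear - interaction                           # per-sample energies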
def mean_visible(self, h, beta=1.0):
    r"""
    Computes :math:`\mathbb{E}[\mathbf{v} \mid \mathbf{h}]`.

    It can be shown that this expectation equals: [1]_

    - Bernoulli:

      .. math::
         :nowrap:

         \begin{equation}
             \mathbb{E}[\mathbf{v} \mid \mathbf{h}]
             = p \big( V_{i} = 1 \mid \mathbf{h} \big)
             = \text{sigmoid} \Bigg( \beta \bigg( b_{i}
               + \sum_{\mu=1}^{|\mathcal{H}|} W_{i \mu} \frac{h_{\mu}}{\sigma_{\mu}} \bigg) \Bigg)
         \end{equation}

    - Gaussian:

      .. math::
         :nowrap:

         \begin{equation*}
             \mathbb{E}[\mathbf{v} \mid \mathbf{h}]
             = b_i + \sigma_i \sum_{\mu=1}^{|\mathcal{H}|} W_{i \mu} \frac{h_{\mu}}{\sigma_{\mu}}
         \end{equation*}

    where :math:`\sigma_{\mu} = 1` if :math:`H_\mu` is a Bernoulli random variable.

    Notes
    -----
    Observe that the expectation when using Gaussian units is independent of
    :math:`\beta`. To see the effect :math:`\beta` has on the Gaussian case,
    see :func:`RBM.proba_visible`.

    References
    ----------
    .. [1] Fjelde, T. E. (2018). Restricted Boltzmann Machines.
    """
    mean = self.v_bias + self.v_sigma * np.matmul(h / self.h_sigma, self.W.T)
    if self.visible_type == UnitType.BERNOULLI:
        return sigmoid(mean * beta)
    elif self.visible_type == UnitType.GAUSSIAN:
        return mean
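# Hypothetical usage sketch combining `mean_hidden` and `mean_visible` for the usual
# mean-field "reconstruction" (the `rbm` instance and `v` batch are assumptions):
#
#     p_h = rbm.mean_hidden(v)          # E[h | v]
#     v_recon = rbm.mean_visible(p_h)   # E[v | h] evaluated at the hidden means
#     recon_err = np.mean((v - v_recon) ** 2)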
def grad(self, v, burnin=-1, persist=False, **sampler_kwargs):
    """Computes the negative log-likelihood gradients wrt. the parameters, using the configured sampler."""
    if self.sampler_method.lower() == 'cd':
        # Contrastive Divergence
        v_0, h_0, v_k, h_k = self.contrastive_divergence(v, **sampler_kwargs)
    elif self.sampler_method.lower() == 'pcd':
        # Persistent Contrastive Divergence
        if self._prev is not None:
            v_0, h_0 = self._prev
        else:
            # ``burnin`` specified; we perform this to initialize the chain
            if burnin > 0:
                _log.info(f"Performing burnin of {burnin} steps to initialize PCD")
                # seed the persistent chain with the (v, h) state at the end of the burn-in
                _, _, v_0, h_0 = self.contrastive_divergence(v, k=burnin, **sampler_kwargs)
            else:
                h_0 = self.sample_hidden(v, **sampler_kwargs)
                v_0 = v

        v_0, h_0, v_k, h_k = self.contrastive_divergence(v, h_0=h_0, **sampler_kwargs)

        # persist the final state of the chain
        self._prev = (v_k, h_k)
    elif self.sampler_method.lower() == 'pt':
        # Parallel Tempering
        h_0 = None
        if self._prev is not None:
            v_0, h_0 = self._prev
        else:
            _log.info("Initializing PT chain...")
            v_0 = self._init_parallel_tempering(v, **sampler_kwargs)

        # FIXME: make compatible with `parallel_tempering` returning
        # all the states
        if h_0 is None:
            v_0, h_0, v_k, h_k = self.parallel_tempering(
                v_0, hs=h_0, include_negative_shift=True, **sampler_kwargs)
        elif sampler_kwargs.get("include_negative_shift", False):
            v_0, h_0, v_k, h_k = self.parallel_tempering(v_0, hs=h_0, **sampler_kwargs)
        else:
            # FIXME: make compatible with `parallel_tempering` returning
            # all the states
            v_k, h_k = self.parallel_tempering(v_0, hs=h_0, **sampler_kwargs)

        if persist:
            self._prev = (v_k, h_k)

        # take the first tempered distribution, i.e. the one corresponding
        # to the target distribution
        v_0 = v_0[0]
        h_0 = h_0[0]
        v_k = v_k[0]
        h_k = h_k[0]
    else:
        raise ValueError(f"{self.sampler_method} is not supported")

    # all expressions below using `v` or `mean_h` will contain AT LEAST one factor
    # of `1 / v_sigma` and `1 / h_sigma`, respectively, so we include those right away
    v_0 = v_0 / self.v_sigma
    v_k = v_k / self.v_sigma
    mean_h_0 = self.mean_hidden(v_0) / self.h_sigma
    mean_h_k = self.mean_hidden(v_k) / self.h_sigma

    # Recall: `v_sigma` and `h_sigma` have no effect if they are set to 1

    # v_0 / (v_sigma^2) - v_k / (v_sigma^2)
    delta_v_bias = (v_0 - v_k) / self.v_sigma
    # E[h_0 | v_0] / (h_sigma^2) - E[h_k | v_k] / (h_sigma^2)
    delta_h_bias = (mean_h_0 - mean_h_k) / self.h_sigma

    # Gradient wrt. W
    # (v_0 / v_sigma) (1 / h_sigma) E[h_0 | v_0] - (v_k / v_sigma) (1 / h_sigma) E[h_k | v_k]
    # computed as a per-sample outer product: (batch, num_visible, 1) x (batch, 1, num_hidden)
    x = mean_h_0.reshape(mean_h_0.shape[0], 1, mean_h_0.shape[1])
    y = v_0.reshape(v_0.shape[0], v_0.shape[1], 1)
    z_0 = np.matmul(y, x)

    x = mean_h_k.reshape(mean_h_k.shape[0], 1, mean_h_k.shape[1])
    y = v_k.reshape(v_k.shape[0], v_k.shape[1], 1)
    z_k = np.matmul(y, x)

    delta_W = z_0 - z_k

    # average over the batch and take the negative
    delta_v_bias = -np.mean(delta_v_bias, axis=0)
    delta_h_bias = -np.mean(delta_h_bias, axis=0)
    delta_W = -np.mean(delta_W, axis=0)

    grads = [delta_v_bias, delta_h_bias, delta_W]

    # variances
    if self.visible_type == UnitType.GAUSSIAN \
       and self.estimate_visible_sigma:
        # in `GaussianRBM`, where only the VISIBLE units are Gaussian,
        # we only compute `v_sigma`
        # (((v_0 - b)^2 / (v_sigma^2)) - (v / v_sigma) \sum_{\mu} E[h_{\mu} | v] / sigma_{\mu}) / v_sigma
        delta_v_sigma_data = (((v_0 - (self.v_bias / self.v_sigma)) ** 2)
                              - v_0 * (np.matmul(mean_h_0, self.W.T)))
        delta_v_sigma_model = (((v_k - (self.v_bias / self.v_sigma)) ** 2)
                               - v_k * (np.matmul(mean_h_k, self.W.T)))
        delta_v_sigma = (delta_v_sigma_data - delta_v_sigma_model) / self.v_sigma

        # average over the batch and take the negative
        delta_v_sigma = -np.mean(delta_v_sigma, axis=0)

        grads.append(delta_v_sigma)

    if self.hidden_type == UnitType.GAUSSIAN \
       and self.estimate_hidden_sigma:
        # TODO: Implement
        raise NotImplementedError("gradients for gaussian hidden"
                                  " units not yet implemented")

        # NOTE: unreachable sketch of the hidden-sigma gradient, kept for reference
        delta_h_sigma_data = (((h_0 - (self.h_bias / self.h_sigma)) ** 2)
                              - h_0 * (np.matmul(mean_h_0, self.W.T)))
        delta_h_sigma_model = (((h_k - (self.h_bias / self.h_sigma)) ** 2)
                               - h_k * (np.matmul(mean_h_k, self.W.T)))
        delta_h_sigma = delta_h_sigma_data - delta_h_sigma_model

        # average over the batch and take the negative
        delta_h_sigma = -np.mean(delta_h_sigma, axis=0)

        grads.append(delta_h_sigma)

    return grads
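# Hedged usage sketch: applying the gradients returned by `grad` with plain SGD.
# The list ordering matches `grads` as built above; `rbm`, `v_batch`, and `lr` are
# illustrative names, and the gradients already carry the sign for descent on the
# negative log-likelihood:
#
#     grads = rbm.grad(v_batch)
#     params = [rbm.v_bias, rbm.h_bias, rbm.W]
#     if rbm.visible_type == UnitType.GAUSSIAN and rbm.estimate_visible_sigma:
#         params.append(rbm.v_sigma)
#     for p, g in zip(params, grads):
#         p -= lr * g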