Example #1
File: core.py  Project: torfjelde/ml
    def free_energy(self, v, beta=1.0, raw=False):
        if self.hidden_type == UnitType.BERNOULLI:
            hidden = self.h_bias + np.matmul((v / self.v_sigma), self.W)
            hidden *= beta
            hidden = -np.sum(np.log(1.0 + np.exp(np.clip(hidden, -30, 30))),
                             axis=1)
        elif self.hidden_type == UnitType.GAUSSIAN:
            # TODO: verify; the formula below follows the Gaussian
            # hidden-unit derivation but has not been tested
            hidden = np.sum(
                1 / (2 * self.h_sigma) *
                (self.h_bias**2 -
                 (self.h_bias +
                  self.h_sigma * np.matmul(v / self.v_sigma, self.W))**2),
                axis=1)
            hidden -= 0.5 * self.num_hidden * np.log(2 * np.pi) + np.sum(
                np.log(self.h_sigma))
        else:
            raise ValueError(f"unknown type {self.hidden_type}")

        if self.visible_type == UnitType.BERNOULLI:
            visible = -np.matmul(v, self.v_bias)
            visible *= beta
        elif self.visible_type == UnitType.GAUSSIAN:
            visible = 0.5 * np.sum(
                ((v - self.v_bias)**2) /
                (self.v_sigma**2 / beta + np.finfo(np.float32).eps),
                axis=1)
        else:
            raise ValueError(f"unknown type {self.visible_type}")

        # average across the batch; with ``raw=True``, return per-sample free energies
        if raw:
            return hidden + visible
        else:
            return np.mean(hidden + visible)
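
For the Bernoulli-Bernoulli case the branch above reduces to the standard
closed form F(v) = -beta * v.b - sum_mu log(1 + exp(beta * (c_mu + (vW)_mu))).
A minimal standalone NumPy check of that branch; the toy shapes and parameter
names below are illustrative assumptions, not the class's real initializer:

import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))                          # visible-to-hidden weights
v_bias = rng.normal(size=4)                          # visible biases b
h_bias = rng.normal(size=3)                          # hidden biases c
v = rng.integers(0, 2, size=(2, 4)).astype(float)    # batch of 2 binary vectors

beta = 1.0
# mirrors the Bernoulli branch above with v_sigma = 1
hidden = -np.sum(np.log1p(np.exp(np.clip(beta * (h_bias + v @ W), -30, 30))),
                 axis=1)
visible = -beta * (v @ v_bias)
print(np.mean(hidden + visible))                     # raw=False averages over the batch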
Example #2
File: core.py  Project: torfjelde/ml
    def mean_hidden(self, v, beta=1.0):
        """Computes the conditional expectation E[h | v]."""
        mean = self.h_bias + self.h_sigma * np.matmul(v / self.v_sigma, self.W)
        if self.hidden_type == UnitType.BERNOULLI:
            return sigmoid(mean * beta)
        elif self.hidden_type == UnitType.GAUSSIAN:
            return mean
        else:
            raise ValueError(f"unknown type {self.hidden_type}")
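
`sigmoid` is a project helper that is not shown in these excerpts; a
numerically safe stand-in consistent with how it is used above (an assumption
about the project's actual implementation):

import numpy as np

def sigmoid(x):
    # stable logistic for arrays: exp only ever sees non-positive arguments
    out = np.empty_like(x, dtype=float)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    ex = np.exp(x[~pos])
    out[~pos] = ex / (1.0 + ex)
    return out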
Example #3
File: core.py  Project: torfjelde/ml
    def energy(self, v, h):
        # term only dependent on visible; each branch stores the NEGATIVE of
        # its energy contribution, since the final ``return`` negates the sum
        if self.visible_type == UnitType.BERNOULLI:
            visible = np.matmul(v, self.v_bias)
        elif self.visible_type == UnitType.GAUSSIAN:
            visible = ((v - self.v_bias)**2) / (self.v_sigma**2 +
                                                np.finfo(np.float32).eps)
            visible = -0.5 * np.sum(visible, axis=1)
        else:
            raise ValueError(f"unknown type {self.visible_type}")

        # term only dependent on hidden
        if self.hidden_type == UnitType.BERNOULLI:
            hidden = np.matmul(h, self.h_bias)
        elif self.hidden_type == UnitType.GAUSSIAN:
            hidden = ((h - self.h_bias)**2) / (self.h_sigma**2 +
                                               np.finfo(np.float32).eps)
            hidden = -0.5 * np.sum(hidden, axis=1)
        else:
            raise ValueError(f"unknown type {self.hidden_type}")

        # "covariance" term; the visible units are scaled by their sigmas,
        # matching `free_energy` and `mean_hidden`:
        # v^T W = \sum_j (v_j / \sigma_j) W_{j \mu}
        covariance = np.matmul(v / self.v_sigma, self.W)
        # v^T W h = \sum_\mu h_\mu \sum_j (v_j / \sigma_j) W_{j \mu}
        covariance = dot_batch(h, covariance)

        return -(visible + hidden + covariance)
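
Since the free energy is the marginal F(v) = -log sum_h exp(-E(v, h)), the two
methods can be cross-checked by brute-force enumeration in the
Bernoulli-Bernoulli case. A standalone sketch with toy parameters (the
enumeration is exponential in the number of hidden units, so this is only
viable for tiny models; all names are local stand-ins, not class attributes):

import itertools
import numpy as np

rng = np.random.default_rng(1)
n_v, n_h = 3, 2
W = rng.normal(size=(n_v, n_h))
v_bias = rng.normal(size=n_v)
h_bias = rng.normal(size=n_h)
v = rng.integers(0, 2, size=n_v).astype(float)

def energy(v, h):
    # Bernoulli-Bernoulli energy with unit sigmas, as in the method above
    return -(v @ v_bias + h @ h_bias + v @ W @ h)

# marginalize h explicitly: F(v) = -log sum_h exp(-E(v, h))
brute = -np.log(sum(np.exp(-energy(v, np.array(h, dtype=float)))
                    for h in itertools.product([0, 1], repeat=n_h)))

# closed form used by `free_energy` in example #1
closed = -(v @ v_bias) - np.sum(np.log1p(np.exp(h_bias + v @ W)))
assert np.isclose(brute, closed)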
Example #4
File: core.py  Project: torfjelde/ml
    def mean_visible(self, h, beta=1.0):
        r"""
        Computes :math:`\mathbb{E}[\mathbf{v} \mid \mathbf{h}]`.

        It can be shown that this expectation equals: [1]_

        - Bernoulli:

          .. math::
            :nowrap:

            \begin{equation}
            \mathbb{E}[\mathbf{v} \mid \mathbf{h}] =
            p \big( V_{i} = 1 \mid \mathbf{h} \big) = \text{sigmoid}
            \Bigg( \beta \bigg( b_{i} + \sum_{\mu=1}^{|\mathcal{H}|} W_{i \mu} \frac{h_{\mu}}{\sigma_{\mu}} \bigg) \Bigg)
            \end{equation}

        - Gaussian:

          .. math::
            :nowrap:

            \begin{equation*}
            \mathbb{E}[\mathbf{v} \mid \mathbf{h}] = b_i + \sigma_i \sum_{\mu=1}^{|\mathcal{H}|} W_{i \mu} \frac{h_{\mu}}{\sigma_{\mu}}
            \end{equation*}

        where :math:`\sigma_{\mu} = 1` if :math:`H_\mu` is a Bernoulli random variable.
        
        Notes
        -----
        Observe that the expectation when using Gaussian units is
        independent of :math:`\beta`. To see the effect :math:`\beta` has
        on the Gaussian case, see :func:`RBM.proba_visible`.

        References
        ----------
        .. [1] Fjelde, T. E. (2018). Restricted Boltzmann Machines.
        """
        mean = self.v_bias + (self.v_sigma *
                              np.matmul(h / self.h_sigma, self.W.T))
        if self.visible_type == UnitType.BERNOULLI:
            return sigmoid(mean * beta)
        elif self.visible_type == UnitType.GAUSSIAN:
            return mean
        else:
            raise ValueError(f"unknown type {self.visible_type}")
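
`mean_hidden` and `mean_visible` are the two conditionals a block Gibbs sweep
alternates between. A schematic sketch for the Bernoulli-Bernoulli case (`rbm`
is assumed to expose the methods above; thresholding uniforms against the
conditional means is one common sampling choice, not necessarily the
project's own sampler):

import numpy as np

def gibbs_step(rbm, v, rng):
    # v: (batch, num_visible); sample h ~ p(h | v), then v' ~ p(v | h)
    p_h = rbm.mean_hidden(v)                          # (batch, num_hidden)
    h = (rng.random(p_h.shape) < p_h).astype(float)
    p_v = rbm.mean_visible(h)                         # (batch, num_visible)
    v_new = (rng.random(p_v.shape) < p_v).astype(float)
    return v_new, h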
Example #5
File: core.py  Project: torfjelde/ml
    def grad(self, v, burnin=-1, persist=False, **sampler_kwargs):
        if self.sampler_method.lower() == 'cd':
            v_0, h_0, v_k, h_k = self.contrastive_divergence(
                v, **sampler_kwargs)
        elif self.sampler_method.lower() == 'pcd':
            # Persistent Contrastive Divergence
            if self._prev is not None:
                v_0, h_0 = self._prev
            else:
                # ``burnin`` specified; run the chain forward to initialize PCD
                if burnin > 0:
                    _log.info(
                        f"Performing burnin of {burnin} steps to initialize PCD"
                    )
                    # use the END of the burnin chain as the persistent state
                    _, _, v_0, h_0 = self.contrastive_divergence(
                        v, k=burnin, **sampler_kwargs)
                else:
                    h_0 = self.sample_hidden(v, **sampler_kwargs)
                    v_0 = v

            v_0, h_0, v_k, h_k = self.contrastive_divergence(v,
                                                             h_0=h_0,
                                                             **sampler_kwargs)

            # persist
            self._prev = (v_k, h_k)

        elif self.sampler_method.lower() == 'pt':
            h_0 = None
            if self._prev is not None:
                v_0, h_0 = self._prev
            else:
                _log.info("Initializing PT chain...")
                v_0 = self._init_parallel_tempering(v, **sampler_kwargs)

            # FIXME: make compatible with `parallel_tempering` returning
            # all the states
            if h_0 is None:
                v_0, h_0, v_k, h_k = self.parallel_tempering(
                    v_0, hs=h_0, include_negative_shift=True, **sampler_kwargs)
            elif sampler_kwargs.get("include_negative_shift", False):
                v_0, h_0, v_k, h_k = self.parallel_tempering(v_0,
                                                             hs=h_0,
                                                             **sampler_kwargs)
            else:
                # FIXME: make compatible with `parallel_tempering` returning
                # all the states
                v_k, h_k = self.parallel_tempering(v_0,
                                                   hs=h_0,
                                                   **sampler_kwargs)

            if persist:
                self._prev = (v_k, h_k)

            # take the first tempered distribution, i.e. the one corresponding
            # to the target distribution
            v_0 = v_0[0]
            h_0 = h_0[0]
            v_k = v_k[0]
            h_k = h_k[0]
        else:
            raise ValueError(f"{self.sampler_method} is not supported")

        # all expressions below using `v` or `mean_h` will contain
        # AT LEAST one factor of `1 / v_sigma` and `1 / h_sigma`, respectively
        # so we include those right away
        v_0 = v_0 / self.v_sigma
        v_k = v_k / self.v_sigma
        mean_h_0 = self.mean_hidden(v_0) / self.h_sigma
        mean_h_k = self.mean_hidden(v_k) / self.h_sigma

        # Recall: `v_sigma` and `h_sigma` have no effect if they are set to 1
        # v_0 / (v_sigma^2) - v_k / (v_sigma^2)
        delta_v_bias = (v_0 - v_k) / self.v_sigma
        # E[h_0 | v_0] / (h_sigma^2) - E[h_k | v_k] / (h_sigma^2)
        delta_h_bias = (mean_h_0 - mean_h_k) / self.h_sigma

        # Gradient wrt. W
        # (v_0 / v_sigma) (1 / h_sigma) E[h_0 | v_0] - (v_k / v_sigma) (1 / h_sigma) E[h_k | v_k]
        x = mean_h_0.reshape(mean_h_0.shape[0], 1, mean_h_0.shape[1])
        y = v_0.reshape(v_0.shape[0], v_0.shape[1], 1)
        z_0 = np.matmul(y, x)

        x = mean_h_k.reshape(mean_h_k.shape[0], 1, mean_h_k.shape[1])
        y = v_k.reshape(v_k.shape[0], v_k.shape[1], 1)
        z_k = np.matmul(y, x)

        delta_W = z_0 - z_k

        # average over the batch and take the negative
        delta_v_bias = -np.mean(delta_v_bias, axis=0)
        delta_h_bias = -np.mean(delta_h_bias, axis=0)
        delta_W = -np.mean(delta_W, axis=0)

        grads = [delta_v_bias, delta_h_bias, delta_W]

        # variances
        if self.visible_type == UnitType.GAUSSIAN \
           and self.estimate_visible_sigma:
            # in `GaussianRBM`, where only VISIBLE units Gaussian,
            # we only compute `v_sigma`
            # (((v_0 - b)^2 / (v_sigma^2)) - (v / (v_sigma)) \sum_{\mu} E[h_{\mu} | v] / sigma_{\mu}) / v_sigma
            delta_v_sigma_data = (((v_0 - (self.v_bias / self.v_sigma))**2) -
                                  v_0 * (np.matmul(mean_h_0, self.W.T)))
            delta_v_sigma_model = (((v_k - (self.v_bias / self.v_sigma))**2) -
                                   v_k * (np.matmul(mean_h_k, self.W.T)))
            delta_v_sigma = (delta_v_sigma_data -
                             delta_v_sigma_model) / self.v_sigma
            # average over the batch and take the negative
            delta_v_sigma = -np.mean(delta_v_sigma, axis=0)

            grads.append(delta_v_sigma)

        if self.hidden_type == UnitType.GAUSSIAN \
           and self.estimate_hidden_sigma:
            # TODO: implement; the draft below is unreachable until the
            # Gaussian hidden-unit gradients have been verified
            raise NotImplementedError("gradients for gaussian hidden"
                                      " units not yet implemented")

            delta_h_sigma_data = (((h_0 - (self.h_bias / self.h_sigma))**2) -
                                  h_0 * (np.matmul(mean_h_0, self.W.T)))
            delta_h_sigma_model = (((h_k - (self.h_bias / self.h_sigma))**2) -
                                   h_k * (np.matmul(mean_h_k, self.W.T)))
            delta_h_sigma = delta_h_sigma_data - delta_h_sigma_model
            # average over the batch and take the negative
            delta_h_sigma = -np.mean(delta_h_sigma, axis=0)

            grads.append(delta_h_sigma)

        return grads
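
The returned list lines up with [delta_v_bias, delta_h_bias, delta_W] (plus
delta_v_sigma when visible variances are estimated), and each entry is already
the negated batch mean of the log-likelihood gradient. A plain SGD step is
therefore a straight subtraction; a minimal sketch, assuming an `rbm` instance
exposing `grad` and the parameter arrays used above:

def sgd_step(rbm, batch, lr=1e-2):
    # one gradient step on the negative log-likelihood for one mini-batch
    params = [rbm.v_bias, rbm.h_bias, rbm.W]    # order matches `grads` above
    for p, g in zip(params, rbm.grad(batch)):
        p -= lr * g                             # in-place NumPy update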