def gaussian_cdf(x, mean, var):
    """
    Compute the probability that a random draw from a 1D Gaussian with mean
    `mean` and variance `var` is less than or equal to `x`.
    """
    eps = np.finfo(float).eps
    x_scaled = (x - mean) / np.sqrt(var + eps)
    return (1 + erf(x_scaled / np.sqrt(2))) / 2
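

# A quick sanity check (not part of the module; assumes scipy is installed and
# that `np` and `erf` are imported at module level): `gaussian_cdf` agrees with
# scipy.stats.norm.cdf up to the small `eps` added to the variance.
#
#   >>> from scipy.stats import norm
#   >>> np.isclose(gaussian_cdf(0.5, mean=1.0, var=4.0), norm.cdf(0.5, loc=1.0, scale=2.0))
#   True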
def DCT(frame, orthonormal=True):
    """
    A naive :math:`O(N^2)` implementation of the 1D discrete cosine
    transform-II (DCT-II).

    Notes
    -----
    For a signal :math:`\mathbf{x} = [x_0, \ldots, x_{N-1}]` consisting of `N`
    samples, the `k` th DCT coefficient, :math:`c_k`, is

    .. math::

        c_k = 2 \sum_{n=0}^{N-1} x_n \cos(\pi k (2 n + 1) / (2 N))

    where `k` ranges from :math:`0, \ldots, N-1`.

    The DCT is highly similar to the DFT -- whereas in a DFT the basis
    functions are sinusoids, in a DCT they are restricted solely to cosines. A
    signal's DCT representation tends to have more of its energy concentrated
    in a smaller number of coefficients when compared to the DFT, and is thus
    commonly used for signal compression. [1]

    .. [1] Smoother signals can be accurately approximated using fewer DFT /
       DCT coefficients, resulting in a higher compression ratio. The DCT
       naturally yields a continuous extension at the signal boundaries due to
       its use of even basis functions (cosines). This in turn produces a
       smoother extension than a DFT or DST approximation, resulting in higher
       compression.

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples
    orthonormal : bool
        Scale to ensure the coefficient vector is orthonormal. Default is True.

    Returns
    -------
    dct : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The discrete cosine transform of the samples in `frame`.
    """
    N = len(frame)
    out = np.zeros_like(frame)
    for k in range(N):
        for (n, xn) in enumerate(frame):
            out[k] += xn * np.cos(np.pi * k * (2 * n + 1) / (2 * N))
        scale = np.sqrt(1 / (4 * N)) if k == 0 else np.sqrt(1 / (2 * N))
        out[k] *= 2 * scale if orthonormal else 2
    return out
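

# A quick sanity check (not part of the module; assumes scipy is installed):
# with `orthonormal=True` the scaling used above matches scipy's FFT-based
# DCT-II with norm="ortho".
#
#   >>> from scipy.fft import dct as scipy_dct
#   >>> frame = np.random.randn(16)
#   >>> np.allclose(DCT(frame), scipy_dct(frame, type=2, norm="ortho"))
#   True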
def glorot_normal(weight_shape, gain=1.0):
    """
    Initialize network weights `W` using the Glorot normal initialization
    strategy.

    Notes
    -----
    The Glorot normal initialization strategy initializes weights with draws
    from TruncatedNormal(0, b) where the variance `b` is

    .. math::

        b = \\frac{2 \\text{gain}^2}{\\text{fan_in} + \\text{fan_out}}

    The motivation for Glorot normal initialization is to choose weights so
    that the variance of a layer's outputs is approximately equal to the
    variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.Tanh` and
    :class:`~numpy_ml.neural_nets.activations.Sigmoid` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        A multiplicative scaling factor applied to the standard deviation of
        the distribution. Default is 1.0.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    std = gain * np.sqrt(2 / (fan_in + fan_out))
    return truncated_normal(0, std, weight_shape)
def glorot_uniform(weight_shape, gain=1.0):
    """
    Initialize network weights `W` using the Glorot uniform initialization
    strategy.

    Notes
    -----
    The Glorot uniform initialization strategy initializes weights using draws
    from ``Uniform(-b, b)`` where:

    .. math::

        b = \\text{gain} \\sqrt{\\frac{6}{\\text{fan_in} + \\text{fan_out}}}

    The motivation for Glorot uniform initialization is to choose weights so
    that the variance of a layer's outputs is approximately equal to the
    variance of its inputs.

    This initialization strategy was primarily developed for deep networks with
    tanh and logistic sigmoid nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.
    gain : float
        A multiplicative scaling factor applied to `b`. Default is 1.0.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    b = gain * np.sqrt(6 / (fan_in + fan_out))
    return np.random.uniform(-b, b, size=weight_shape)
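

# A quick empirical check (not part of the module): draws from `glorot_uniform`
# have variance b**2 / 3 = 2 * gain**2 / (fan_in + fan_out), i.e. the same
# variance targeted by `glorot_normal`. For a 250 x 250 weight matrix the
# sample variance should land very close to 2 / 500 = 0.004:
#
#   >>> W = glorot_uniform((250, 250), gain=1.0)
#   >>> np.isclose(W.var(), 2 / (250 + 250), rtol=0.1)
#   True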
def he_normal(weight_shape):
    """
    Initialize network weights `W` using the He normal initialization strategy.

    Notes
    -----
    The He normal initialization strategy initializes the weights in `W` using
    draws from TruncatedNormal(0, b) where the variance `b` is

    .. math::

        b = \\frac{2}{\\text{fan_in}}

    He normal initialization was originally developed for deep networks with
    :class:`~numpy_ml.neural_nets.activations.ReLU` nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    std = np.sqrt(2 / fan_in)
    return truncated_normal(0, std, weight_shape)
def he_uniform(weight_shape):
    """
    Initialize network weights `W` using the He uniform initialization
    strategy.

    Notes
    -----
    The He uniform initialization strategy initializes the weights in `W` using
    draws from Uniform(-b, b) where

    .. math::

        b = \\sqrt{\\frac{6}{\\text{fan_in}}}

    Developed for deep networks with ReLU nonlinearities.

    Parameters
    ----------
    weight_shape : tuple
        The dimensions of the weight matrix/volume.

    Returns
    -------
    W : :py:class:`ndarray <numpy.ndarray>` of shape `weight_shape`
        The initialized weights.
    """
    fan_in, fan_out = calc_fan(weight_shape)
    b = np.sqrt(6 / fan_in)
    return np.random.uniform(-b, b, size=weight_shape)
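

# A quick empirical check (not part of the module): draws from `he_uniform`
# have variance b**2 / 3 = 2 / fan_in, the same variance targeted by
# `he_normal`. For a 500 x 500 weight matrix the sample variance should land
# very close to 2 / 500 = 0.004:
#
#   >>> W = he_uniform((500, 500))
#   >>> np.isclose(W.var(), 2 / 500, rtol=0.1)
#   True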
def update(self, param, param_grad, param_name, cur_loss=None):
    """
    Compute the Adam update for a given parameter.

    Parameters
    ----------
    param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of the parameter to be updated.
    param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The gradient of the loss function with respect to `param_name`.
    param_name : str
        The name of the parameter.
    cur_loss : float
        The training or validation loss for the current minibatch. Used for
        learning rate scheduling e.g., by
        :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
        None.

    Returns
    -------
    updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of `param` after applying the Adam update.
    """
    C = self.cache
    H = self.hyperparameters
    d1, d2 = H["decay1"], H["decay2"]
    eps, clip_norm = H["eps"], H["clip_norm"]
    lr = self.lr_scheduler(self.cur_step, cur_loss)

    if param_name not in C:
        C[param_name] = {
            "t": 0,
            "mean": np.zeros_like(param_grad),
            "var": np.zeros_like(param_grad),
        }

    # scale gradient to avoid explosion
    t = np.inf if clip_norm is None else clip_norm
    if norm(param_grad) > t:
        param_grad = param_grad * t / norm(param_grad)

    t = C[param_name]["t"] + 1
    var = C[param_name]["var"]
    mean = C[param_name]["mean"]

    # update cache
    C[param_name]["t"] = t
    C[param_name]["var"] = d2 * var + (1 - d2) * param_grad**2
    C[param_name]["mean"] = d1 * mean + (1 - d1) * param_grad
    self.cache = C

    # calc unbiased moment estimates and Adam update
    v_hat = C[param_name]["var"] / (1 - d2**t)
    m_hat = C[param_name]["mean"] / (1 - d1**t)
    update = lr * m_hat / (np.sqrt(v_hat) + eps)
    return param - update
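

# Standalone sketch (not part of the optimizer class) of a single Adam step,
# mirroring the bias-corrected moment estimates computed in `update` above.
# Gradient clipping is omitted, and the default hyperparameter values shown
# here are illustrative assumptions rather than the class's own defaults.
def _adam_step_sketch(param, grad, mean, var, t, lr=0.01, d1=0.9, d2=0.999, eps=1e-7):
    mean = d1 * mean + (1 - d1) * grad     # first-moment (mean) estimate
    var = d2 * var + (1 - d2) * grad**2    # second-moment (uncentered variance) estimate
    m_hat = mean / (1 - d1**t)             # bias-corrected mean, t >= 1
    v_hat = var / (1 - d2**t)              # bias-corrected variance
    return param - lr * m_hat / (np.sqrt(v_hat) + eps), mean, var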
def update(self, param, param_grad, param_name, cur_loss=None):
    """
    Compute the AdaGrad update for a given parameter.

    Notes
    -----
    Adjusts the learning rate of each weight based on the magnitudes of its
    gradients (big gradient -> small lr, small gradient -> big lr).

    Parameters
    ----------
    param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of the parameter to be updated
    param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The gradient of the loss function with respect to `param_name`
    param_name : str
        The name of the parameter
    cur_loss : float or None
        The training or validation loss for the current minibatch. Used for
        learning rate scheduling e.g., by
        :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
        None.

    Returns
    -------
    updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of `param` after applying the AdaGrad update
    """
    C = self.cache
    H = self.hyperparameters
    eps, clip_norm = H["eps"], H["clip_norm"]
    lr = self.lr_scheduler(self.cur_step, cur_loss)

    if param_name not in C:
        C[param_name] = np.zeros_like(param_grad)

    # scale gradient to avoid explosion
    t = np.inf if clip_norm is None else clip_norm
    if norm(param_grad) > t:
        param_grad = param_grad * t / norm(param_grad)

    C[param_name] += param_grad**2
    update = lr * param_grad / (np.sqrt(C[param_name]) + eps)
    self.cache = C
    return param - update
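

# Worked illustration of the per-parameter scaling (hypothetical numbers, not
# taken from this module): with lr = 0.1, an accumulated squared-gradient cache
# of [99.0, 0.0], and a new gradient of [1.0, 1.0], the cache becomes
# [100.0, 1.0] and the steps are roughly 0.1 * 1 / 10 = 0.01 and
# 0.1 * 1 / 1 = 0.1 -- the coordinate with the larger gradient history receives
# the smaller effective learning rate, as described in the Notes above.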
def euclidean(x, y):
    """
    Compute the Euclidean (`L2`) distance between two real vectors.

    Notes
    -----
    The Euclidean distance between two vectors **x** and **y** is

    .. math::

        d(\mathbf{x}, \mathbf{y}) = \sqrt{ \sum_i (x_i - y_i)^2 }

    Parameters
    ----------
    x,y : :py:class:`ndarray <numpy.ndarray>` s of shape `(N,)`
        The two vectors to compute the distance between

    Returns
    -------
    d : float
        The L2 distance between **x** and **y**.
    """
    return np.sqrt(np.sum((x - y)**2))
def _maximize_alpha(self, max_iters=1000, tol=0.1):
    """
    Optimize alpha using Blei's O(n) Newton-Raphson modification for a Hessian
    with special structure.
    """
    D = self.D
    T = self.T

    alpha = self.alpha
    gamma = self.gamma

    for _ in range(max_iters):
        alpha_old = alpha

        # Calculate gradient
        g = D * (digamma(np.sum(alpha)) - digamma(alpha)) + np.sum(
            digamma(gamma) - np.tile(digamma(np.sum(gamma, axis=1)), (T, 1)).T,
            axis=0,
        )

        # Calculate Hessian diagonal component
        h = -D * polygamma(1, alpha)

        # Calculate Hessian constant component
        z = D * polygamma(1, np.sum(alpha))

        # Calculate constant
        c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))

        # Update alpha
        alpha = alpha - (g - c) / h

        # Check convergence
        if np.sqrt(np.mean(np.square(alpha - alpha_old))) < tol:
            break

    return alpha
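

# Optional sanity check (hypothetical helper, not used by the class): the O(T)
# step above is the exact Newton direction for a Hessian of the form
# diag(h) + z * 1 1^T, by the Sherman-Morrison formula. Here `g`, `h`, and `z`
# stand for the gradient, Hessian diagonal, and Hessian constant computed in
# `_maximize_alpha`.
def _check_blei_newton_step(g, h, z):
    c = np.sum(g / h) / (z ** (-1.0) + np.sum(h ** (-1.0)))
    fast = (g - c) / h                                 # O(T) step used above
    H = np.diag(h) + z * np.ones((len(h), len(h)))     # explicit Hessian
    full = np.linalg.solve(H, g)                       # O(T^3) reference step
    return np.allclose(fast, full)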
def predict(self, X, conf_interval=0.95, return_cov=False):
    """
    Return the MAP estimate for :math:`y^*`, corresponding to the mean/mode of
    the posterior predictive distribution, :math:`p(y^* \mid x^*, X, y)`.

    Notes
    -----
    Under the GP regression model, the posterior predictive distribution is

    .. math::

        y^* \mid x^*, X, y \sim \mathcal{N}(\mu^*, \\text{cov}^*)

    where

    .. math::

        \mu^* &= K^{*'} (K + \\alpha I)^{-1} y \\\\
        \\text{cov}^* &= K^{**} - K^{*'} (K + \\alpha I)^{-1} K^*

    and

    .. math::

        K &= \\text{kernel}(X, X) \\\\
        K^* &= \\text{kernel}(X, X^*) \\\\
        K^{**} &= \\text{kernel}(X^*, X^*)

    NB. This implementation uses the inefficient but general purpose
    `np.linalg.inv` routine to invert :math:`(K + \\alpha I)`. A more efficient
    way is to rely on the fact that `K` (and hence also :math:`K + \\alpha I`)
    is symmetric positive (semi-)definite and take the inner product of the
    inverse of its (lower) Cholesky decomposition:

    .. math::

        Q^{-1} = \\text{cholesky}(Q)^{-1 \\top} \\text{cholesky}(Q)^{-1}

    For more details on a production-grade implementation, see Algorithm 2.1
    in Rasmussen & Williams (2006).

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape (N, M)
        The collection of datapoints to generate predictions on
    conf_interval : float in (0, 1)
        The percentage confidence bound to return for each prediction. If the
        scipy package is not available, this value is always set to 0.95.
        Default is 0.95.
    return_cov : bool
        If True, also return the covariance (`cov*`) of the posterior
        predictive distribution for the points in `X`. Default is False.

    Returns
    -------
    y_pred : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
        The predicted values for each point in `X`, each with dimensionality
        `O`.
    conf : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)`
        The % conf_interval confidence bound for each `y_pred`. The conf %
        confidence interval for the `i`'th prediction is
        ``[y[i] - conf[i], y[i] + conf[i]]``.
    cov : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
        The covariance (`cov*`) of the posterior predictive distribution for
        `X`. Only returned if `return_cov` is True.
    """
    if conf_interval != 0.95 and not _SCIPY:
        fstr = "Cannot compute {}% confidence score without scipy.stats"
        warnings.warn(fstr.format(conf_interval))

    X_star = X
    X = self.parameters["X"]
    y = self.parameters["y"]
    K = self.parameters["GP_cov"]
    alpha = self.hyperparameters["alpha"]

    K_star = self.kernel(X_star, X)
    K_star_star = self.kernel(X_star, X_star)

    sig = np.eye(K.shape[0]) * alpha
    K_y_inv = inv(K + sig)

    pp_mean = np.dot(np.dot(K_star, K_y_inv), y)
    pp_cov = K_star_star - np.dot(np.dot(K_star, K_y_inv), K_star.T)

    # if we can't use scipy, ignore the passed value for `conf_interval`
    # and return the 95% confidence bound.
    # (norm.ppf == inverse CDF for standard normal)
    percentile = 1.96 if not _SCIPY else norm.ppf(conf_interval)
    conf = percentile * np.sqrt(np.diag(pp_cov))

    return (pp_mean, conf) if not return_cov else (pp_mean, conf, pp_cov)
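

# Sketch of the Cholesky-based alternative described in the Notes above
# (hypothetical helper, not part of this class; assumes scipy is installed):
# factor (K + alpha * I) once and reuse the factorization rather than forming
# its inverse explicitly. Here `K_star` is kernel(X*, X) and `K_star_star` is
# kernel(X*, X*), as in `predict`.
def _gp_posterior_cholesky(K, K_star, K_star_star, y, alpha):
    from scipy.linalg import cho_factor, cho_solve

    L = cho_factor(K + alpha * np.eye(K.shape[0]), lower=True)
    pp_mean = K_star @ cho_solve(L, y)                      # K*' (K + alpha I)^{-1} y
    pp_cov = K_star_star - K_star @ cho_solve(L, K_star.T)  # K** - K*' (K + alpha I)^{-1} K*
    return pp_mean, pp_cov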