def var(x, axis=None, ddof=0, keepdims=False, constant=False):
    """ Compute the variance along the specified axis.

    Returns the variance of the array elements, a measure of the spread of a
    distribution. The variance is computed for the flattened array by default,
    otherwise over the specified axis.

    Parameters
    ----------
    x : array_like
        Array containing numbers whose variance is desired.

    axis : Optional[int, Tuple[int, ...]]
        Axis or axes along which the variance is computed. The default is to
        compute the variance of the flattened array.

    ddof : int, optional (default=0)
        "Delta Degrees of Freedom": the divisor used in the calculation is
        ``N - ddof``, where ``N`` represents the number of elements. By
        default `ddof` is zero.

    keepdims : bool, optional (default=False)
        If this is set to True, the axes which are reduced are left in the
        result as dimensions with size one. With this option, the result will
        broadcast correctly against the input array.

    constant : bool, optional (default=False)
        If ``True``, the returned tensor is a constant (it does not
        back-propagate a gradient)

    Returns
    -------
    variance : mygrad.Tensor

    Notes
    -----
    The variance is the average of the squared deviations from the mean,
    i.e., ``var = mean(abs(x - x.mean())**2)``.

    The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``.
    If, however, `ddof` is specified, the divisor ``N - ddof`` is used instead.

    In standard statistical practice, ``ddof=1`` provides an unbiased estimator
    of the variance of a hypothetical infinite population. ``ddof=0`` provides
    a maximum likelihood estimate of the variance for normally distributed
    variables.

    Examples
    --------
    >>> import mygrad as mg
    >>> import numpy as np
    >>> a = mg.Tensor([[1, 2],
    ...                [3, 4]])
    >>> mg.var(a)
    Tensor(1.25)
    >>> mg.var(a, axis=0)
    Tensor([ 1., 1.])
    >>> mg.var(a, axis=1)
    Tensor([ 0.25, 0.25])

    In single precision, ``var()`` can be inaccurate:

    >>> a = mg.zeros((2, 512*512), dtype=np.float32)
    >>> a[0, :] = 1.0
    >>> a[1, :] = 0.1
    >>> mg.var(a)
    Tensor(0.20250003)

    Casting the underlying data to float64 gives a more accurate result:

    >>> mg.var(a.data.astype(np.float64))
    Tensor(0.20249999932944759)
    >>> ((1-0.55)**2 + (0.1-0.55)**2)/2
    0.2025
    """
    return Tensor._op(
        Variance,
        x,
        op_kwargs=dict(axis=axis, keepdims=keepdims, ddof=ddof),
        constant=constant,
    )
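
# A minimal sketch (a hypothetical helper, not part of the library) illustrating the
# ``ddof`` behavior documented above: the divisor used by ``var`` is ``N - ddof``.
# Assumes ``var`` matches ``numpy.var`` in this respect and that NumPy is available.
def _example_var_ddof():
    import numpy as np

    data = np.array([1.0, 2.0, 3.0, 4.0])  # mean is 2.5; squared deviations sum to 5
    assert np.isclose(var(data).data, 5 / 4)          # population variance: divisor N = 4
    assert np.isclose(var(data, ddof=1).data, 5 / 3)  # unbiased estimate: divisor N - 1 = 3
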
def sum(x, axis=None, keepdims=False, constant=False):
    """ Sum of tensor elements over a given axis.

    Parameters
    ----------
    x : array_like

    axis : Optional[int, Tuple[int, ...]]
        Axis or axes along which a sum is performed.  The default,
        axis=None, will sum all of the elements of the input tensor.  If
        axis is negative it counts from the last to the first axis.

        If axis is a tuple of ints, a sum is performed on all of the axes
        specified in the tuple instead of a single axis or all the axes as
        before.

    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the input tensor.

    constant : bool, optional (default=False)
        If ``True``, the returned tensor is a constant (it does not
        back-propagate a gradient)

    Returns
    -------
    sum_along_axis : mygrad.Tensor
        A Tensor with the same shape as `x`, with the specified
        axis/axes removed. If `x` is a 0-d tensor, or if `axis` is None,
        a 0-dim Tensor is returned.

    See Also
    --------
    mygrad.Tensor.sum : Equivalent method.

    cumsum : Cumulative sum of array elements.

    mean, average

    Notes
    -----
    Arithmetic is modular when using integer types, and no error is
    raised on overflow. Unlike ``numpy.sum``, this function does not accept
    ``dtype``, ``initial``, or ``out`` arguments.

    The sum of an empty tensor is the neutral element 0:

    >>> mygrad.sum([])
    Tensor(0.0)

    Examples
    --------
    >>> import mygrad as mg
    >>> import numpy as np
    >>> mg.sum([0.5, 1.5])
    Tensor(2.0)
    >>> mg.sum([[0, 1], [0, 5]])
    Tensor(6)
    >>> mg.sum([[0, 1], [0, 5]], axis=0)
    Tensor([0, 6])
    >>> mg.sum([[0, 1], [0, 5]], axis=1)
    Tensor([1, 5])

    If the accumulator is too small, overflow occurs (demonstrated here with
    NumPy's ``sum``, since this function does not accept a ``dtype`` argument):

    >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8)
    -128
    """
    return Tensor._op(Sum, x, op_args=(axis, keepdims), constant=constant)
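
# A minimal sketch (a hypothetical helper, not part of the library) showing why
# ``keepdims=True`` is useful: the reduced axis is retained with size one, so the
# result broadcasts against the original tensor. Assumes standard Tensor broadcasting.
def _example_sum_keepdims():
    import numpy as np

    x = Tensor(np.arange(6.0).reshape(2, 3) + 1.0)  # shape-(2, 3)
    row_sums = sum(x, axis=1, keepdims=True)        # shape-(2, 1) rather than shape-(2,)
    assert row_sums.shape == (2, 1)
    assert (x / row_sums).shape == (2, 3)           # broadcasts row-wise without reshaping
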
def gru(
    X,
    Uz,
    Wz,
    bz,
    Ur,
    Wr,
    br,
    Uh,
    Wh,
    bh,
    s0=None,
    bp_lim=None,
    dropout=0.0,
    constant=False,
):
    r""" Performs a forward pass of sequential data through a Gated Recurrent Unit layer,
    returning the 'hidden-descriptors' arrived at by utilizing the trainable parameters
    as follows::

                Z_{t} = sigmoid(X_{t} Uz + S_{t-1} Wz + bz)
                R_{t} = sigmoid(X_{t} Ur + S_{t-1} Wr + br)
                H_{t} = tanh(X_{t} Uh + (R_{t} * S_{t-1}) Wh + bh)
                S_{t} = (1 - Z_{t}) * H_{t} + Z_{t} * S_{t-1}

    Parameters
    ----------
    X : array_like, shape=(T, N, C)
        The sequential data to be passed forward.

    Uz : array_like, shape=(C, D)
        The weights used to map the sequential data to the update gate, Z.

    Wz : array_like, shape=(D, D)
        The weights used to map the previous hidden-descriptor to the update gate, Z.

    bz : array_like, shape=(D,)
        The biases used in computing the update gate, Z.

    Ur : array_like, shape=(C, D)
        The weights used to map the sequential data to the reset gate, R.

    Wr : array_like, shape=(D, D)
        The weights used to map the previous hidden-descriptor to the reset gate, R.

    br : array_like, shape=(D,)
        The biases used in computing the reset gate, R.

    Uh : array_like, shape=(C, D)
        The weights used to map the sequential data to the candidate hidden-descriptor, H.

    Wh : array_like, shape=(D, D)
        The weights used to map the (reset-gated) previous hidden-descriptor to the
        candidate hidden-descriptor, H.

    bh : array_like, shape=(D,)
        The biases used in computing the candidate hidden-descriptor, H.

    s0 : Optional[array_like], shape=(N, D)
        The 'seed' hidden descriptors to feed into the RNN. If None, a Tensor
        of zeros of shape (N, D) is created.

    bp_lim : Optional[int]
        *This feature is experimental and is currently untested*.
        The (non-zero) limit of the depth of back-propagation through time to be
        performed. If ``None``, back-propagation is performed through the entire
        sequence.

        E.g. `bp_lim=3` will propagate gradients only up to 3 steps backward through
        the recursive sequence.

    dropout : float (default=0.), 0 <= dropout < 1
        If non-zero, the dropout scheme described in [1]_ is applied. See Notes
        for more details.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    mygrad.Tensor, shape=(T+1, N, D)
        The sequence of 'hidden-descriptors' produced by the forward pass of the RNN.

    Notes
    -----
    - :math:`T` : Sequence length
    - :math:`N` : Batch size
    - :math:`C` : Length of single datum
    - :math:`D` : Length of 'hidden' descriptor

    The GRU system of equations is given by:

    .. math::

                Z_{t} = \sigma (X_{t} U_z + S_{t-1} W_z + b_z)

                R_{t} = \sigma (X_{t} U_r + S_{t-1} W_r + b_r)

                H_{t} = \tanh (X_{t} U_h + (R_{t} * S_{t-1}) W_h + b_h)

                S_{t} = (1 - Z_{t}) * H_{t} + Z_{t} * S_{t-1}

    Following the dropout scheme specified in [1]_, the hidden-hidden weights (Wz/Wr/Wh)
    randomly have their weights dropped prior to forward/back-prop. The input connections
    (via Uz/Ur/Uh) have variational dropout ([2]_) applied to them with a common dropout
    mask across all t. That is, three static dropout masks, each with shape-(N, D), are
    applied to

    .. math::

                X_{t} U_z

                X_{t} U_r

                X_{t} U_h

    respectively, for all :math:`t`.

    References
    ----------
    .. [1] S. Merity, et al. "Regularizing and Optimizing LSTM Language Models",
           arXiv:1708.02182v1, 2017.

    .. [2] Y. Gal, Z. Ghahramani "A Theoretically Grounded Application of Dropout
           in Recurrent Neural Networks" arXiv:1512.05287v5, 2016.
""" if s0 is not None: if not isinstance(s0, np.ndarray) and not (isinstance(s0, Tensor) and (constant or s0.constant)): raise ValueError( "GRU does not support non-constant tensors for the initial hidden" "state value, `s0`") s = Tensor._op( GRUnit, X, Uz, Wz, bz, Ur, Wr, br, Uh, Wh, bh, op_kwargs=dict(s0=s0, bp_lim=bp_lim, dropout=dropout), constant=constant, ) s.creator._hidden_seq = s return s
def conv_nd(x, filter_bank, *, stride, padding=0, dilation=1, constant=False):
    """ Use `filter_bank` to perform strided N-dimensional neural network-style
    convolutions (see Notes) over `x`::

                f(x, w) -> x ⋆ w

            shapes:
            (N, C, X0, ...) ⋆ (F, C, W0, ...) -> (N, F, G0, ...)

    ``x`` represents a batch of data over which the filters are convolved.
    Specifically, it must be a tensor of shape :math:`(N, C, X_0, ...)`, where
    :math:`N` is the number of samples in the batch, :math:`C` is the channel-depth
    of each datum, and :math:`(X_0, ...)` are the dimensions over which the filters
    are convolved. Accordingly, each filter must have a channel depth of :math:`C`.

    Thus convolving :math:`F` filters, each with a shape :math:`(C, W_0, ...)`,
    over the data batch will produce a tensor of shape :math:`(N, F, G_0, ...)`,
    where :math:`(G_0, ...)` is the shape of the grid commensurate with the filter
    placements.

    Parameters
    ----------
    x : Union[Tensor, array_like], shape=(N, C, X0, ...)
        The data batch to be convolved over.

    filter_bank : Union[Tensor, array_like], shape=(F, C, W0, ...)
        The filters used to perform the convolutions.

    stride : Union[int, Tuple[int, ...]]
        (keyword-only argument) The step-size with which each filter is placed
        along each convolved axis during the convolution. The tuple indicates
        (stride-0, ...). If a single integer is provided, this stride is used
        for all convolved dimensions

    padding : Union[int, Tuple[int, ...]]
        (keyword-only argument) The number of zeros to be padded to both ends
        of each convolved dimension, respectively. If a single integer is
        provided, this padding is used for all of the convolved axes

    dilation : Union[int, Tuple[int, ...]], optional (default=1)
        (keyword-only argument) The spacing used when placing kernel elements
        along the data. E.g. for a 1D convolution, the i-th placement of the
        kernel is multiplied against the dilated window
        ``x[:, :, i*s:(i*s + w*d):d]``, where ``s`` is the stride, ``w`` is the
        kernel-size, and ``d`` is the dilation factor.

        If a single integer is provided, that dilation value is used for all of
        the convolved axes

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    Tensor, shape=(N, F, G0, ...)
        The result of each filter being convolved over each datum in the batch.

    Notes
    -----
    - The filters are *not* flipped by this operation, meaning that an
      auto-correlation is being performed rather than a true convolution.

    - Only 'valid' filter placements are permitted - where the filters overlap
      completely with the (padded) data.

    - This is a "scalar-only" operation, meaning that back propagation through
      this layer assumes that a scalar (i.e. a 0-dimensional tensor) will invoke
      ``tensor.backward()`` for the computational graph. This is standard for a
      neural network, which terminates in a scalar loss.

    Examples
    --------
    Here we perform a 1D convolution of a constant-valued kernel, ``k``, with a
    'square-wave' signal, ``x``, using stride-1. Note that, because we are
    constrained to doing deep learning-style convolutions, we prepend the
    dimensions :math:`(N=1, C=1)` to ``x``, and :math:`(F=1, C=1)` to ``k``.
    That is, we are performing a convolution on one, single-channeled signal
    using one kernel.

    See that this convolution produces the expected triangle-shaped response.
    The shape of the resulting tensor is :math:`(N=1, F=1, G_0=12)`. That is,
    the length-5 kernel can be placed in 12 valid positions, using a stride of 1.

    >>> import mygrad as mg
    >>> from mygrad.nnet import conv_nd
    >>> x = mg.zeros((1, 1, 16))  # a square-wave signal
    >>> x[..., 5:11] = 1
    >>> k = mg.ones((1, 1, 5))    # a constant-valued kernel
    >>> conv_nd(x, k, stride=1)   # performing a stride-1, 1D convolution
    Tensor([[[0., 1., 2., 3., 4., 5., 5., 4., 3., 2., 1., 0.]]], dtype=float32)

    Back-propagating through the (summed) convolution:

    >>> conv_nd(x, k, stride=1).sum().backward()  # sum to a scalar to perform back-prop
    >>> x.grad  # d(summed_conv)/dx
    array([[[1., 2., 3., 4., 5., 5., 5., 5., 5., 5., 5., 5., 4., 3., 2., 1.]]], dtype=float32)
    >>> k.grad  # d(summed_conv)/dk
    array([[[6., 6., 6., 6., 6.]]])

    Now, let's demonstrate a more typical usage for ``conv_nd`` in the context of
    neural networks. ``x`` will represent 10, 32x32 RGB images, and we will use
    5 distinct 2x2 kernels to convolve over each of these images. Note that each
    kernel must possess 3 channels - one for each RGB channel. That is, we will
    be performing NxF channel-wise 2D convolutions.

    Supposing that we don't want the kernel placements to overlap, we can use a
    stride of 2. In total, this will produce a shape-:math:`(N=10, F=5, G_0=16, G_1=16)`
    tensor as a result.

    >>> import numpy as np
    >>> x = mg.Tensor(np.random.rand(10, 3, 32, 32))  # creating 10 random 32x32 RGB images
    >>> k = mg.Tensor(np.random.rand(5, 3, 2, 2))     # creating 5 random 3-channel 2x2 kernels

    Given the shapes of ``x`` and ``k``, ``conv_nd`` automatically executes a 2D convolution:

    >>> conv_nd(x, k, stride=2).shape
    (10, 5, 16, 16)

    Extrapolating further, ``conv_nd`` is capable of performing ND convolutions!
    """
    return Tensor._op(
        ConvND,
        x,
        filter_bank,
        op_kwargs=dict(stride=stride, padding=padding, dilation=dilation),
        constant=constant,
    )
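
# A minimal sketch (a hypothetical helper, not part of the library) of the standard
# output-grid-size formula for a "valid" strided/dilated convolution,
# G = (X + 2*padding - dilation*(W - 1) - 1) // stride + 1, checked against the shapes
# quoted in the examples above (this assumes ``conv_nd`` follows that convention).
def _example_conv_grid_size():
    import numpy as np

    def grid_size(X, W, stride, padding=0, dilation=1):
        return (X + 2 * padding - dilation * (W - 1) - 1) // stride + 1

    assert grid_size(X=16, W=5, stride=1) == 12  # the 1D square-wave example
    assert grid_size(X=32, W=2, stride=2) == 16  # the batch-of-images example
    x = np.random.rand(10, 3, 32, 32)
    k = np.random.rand(5, 3, 2, 2)
    assert conv_nd(x, k, stride=2).shape == (10, 5, 16, 16)
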
def softmax_crossentropy(x, y_true, constant=False):
    r""" Given the classification scores of C classes for N pieces of data,
    computes the NxC softmax classification probabilities. The cross entropy
    is then computed by using the true classification labels. A log-softmax
    formulation is used for improved numerical stability.

    Parameters
    ----------
    x : array_like, shape=(N, C)
        The C class scores for each of the N pieces of data.

    y_true : array_like, shape=(N,)
        The correct class-indices, in [0, C), for each datum.

    constant : bool, optional (default=False)
        If ``True``, the returned tensor is a constant (it does not
        back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor, shape=()
        The average softmax cross-entropy loss.

    Raises
    ------
    ValueError
        Bad dimensionalities for ``x`` or ``y_true``

    Notes
    -----
    - :math:`N` is the number of samples in the batch.
    - :math:`C` is the number of possible classes for which scores are provided.

    Given the shape-:math:`(N, C)` tensor of scores, ``x``, the softmax classification
    probabilities are computed. That is, the score for class-:math:`k` of a given datum
    (:math:`s_{k}`) is normalized using the 'softmax' transformation:

    .. math::
        p_{k} = \frac{e^{s_k}}{\sum_{i=1}^{C}{e^{s_i}}}

    This produces the "prediction probability distribution", :math:`p`, for each datum.
    The cross-entropy loss for that datum is then computed according to the true class-index
    for that datum, as reported in ``y_true``. That is, the "true probability distribution",
    :math:`t`, for the datum is :math:`1` for the correct class-index and :math:`0` elsewhere.

    The cross-entropy loss for that datum is thus:

    .. math::
        l = - \sum_{k=1}^{C}{t_{k} \log{p_{k}}}

    Having computed each per-datum cross entropy loss, this function then returns the loss
    averaged over all :math:`N` pieces of data:

    .. math::
        L = \frac{1}{N}\sum_{i=1}^{N}{l_{i}}

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet import softmax_crossentropy

    Let's take a simple case where N=1, and C=3. We'll thus make up classification
    scores for a single datum. Suppose the scores are identical for the three classes
    and that the true class is class-0:

    >>> x = mg.Tensor([[2., 2., 2.]])  # a shape-(1, 3) tensor of scores
    >>> y_true = mg.Tensor([0])  # the correct class for this datum is class-0

    Because the scores are identical for all three classes, the softmax normalization
    will simply produce :math:`p = [\frac{1}{3}, \frac{1}{3}, \frac{1}{3}]`. Because
    class-0 is the "true" class, :math:`t = [1., 0., 0.]`. Thus our softmax cross-entropy
    loss should be:

    .. math::
        -(1 \times \log{\frac{1}{3}} + 0 \times \log{\frac{1}{3}} + 0 \times \log{\frac{1}{3}})
        = \log(3) \approx 1.099

    Let's see that this is what ``softmax_crossentropy`` returns:

    >>> softmax_crossentropy(x, y_true)
    Tensor(1.09861229)

    Similarly, suppose a datum's scores are :math:`[0, 0, 10^6]`, then the softmax
    normalization will return :math:`p \approx [0., 0., 1.]`. If the true class for
    this datum is class-2, then the loss should be nearly 0, since :math:`p` and
    :math:`t` are essentially identical:

    .. math::
        -(0 \times \log{0} + 0 \times \log{0} + 1 \times \log{1})
        = -\log(1) = 0

    Now, let's construct ``x`` and ``y_true`` so that they incorporate the scores/labels
    for both of the data that we have considered:

    >>> x = mg.Tensor([[2., 2., 2.],  # a shape-(2, 3) tensor of scores
    ...                [0., 0., 1E6]])
    >>> y_true = mg.Tensor([0, 2])  # the class IDs for the two data

    ``softmax_crossentropy(x, y_true)`` will return the average loss of these two data,
    :math:`\frac{1}{2}(1.099 + 0) \approx 0.55`:

    >>> softmax_crossentropy(x, y_true)
    Tensor(0.54930614)
    """
    return Tensor._op(SoftmaxCrossEntropy, x, op_args=(y_true,), constant=constant)
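
# A minimal sketch (a hypothetical helper, not part of the library) that reproduces the
# averaged loss with plain NumPy via a numerically-stable log-softmax, and compares it
# against ``softmax_crossentropy`` on the two-datum example from the docstring above.
def _example_softmax_crossentropy_check():
    import numpy as np

    scores = np.array([[2.0, 2.0, 2.0],
                       [0.0, 0.0, 1e6]])
    labels = np.array([0, 2])
    shifted = scores - scores.max(axis=1, keepdims=True)  # subtract row-max for stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    manual_loss = -log_probs[np.arange(len(labels)), labels].mean()
    assert np.isclose(softmax_crossentropy(scores, labels).data, manual_loss)
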
# Simple passthrough that applies the ``Dummy`` operation to ``a`` and ``b``
# (presumably a placeholder/testing utility).
def dummy(a, b, constant=False):
    return Tensor._op(Dummy, a, b, constant=constant)
def max_pool(x, pool, stride, constant=False):
    """ Perform max-pooling over the last N dimensions of a data batch.

    Extended Summary
    ----------------
    The data consists of N trailing axes to be pooled over, denoted by ``C0, ...``.
    These can be preceded, optionally, by un-pooled axes, denoted by ``(N0, ...)``.
    The dimensions of the window over which pooling is performed are denoted by
    ``P0, ...``. The window is placed with stride values ``S0, ...``.

    Ultimately the pooled channels have a shape ``G0, ...``.

    Parameters
    ----------
    x : mygrad.Tensor, shape=([...], C0, ...)
        The data batch; to be pooled along the trailing axes denoted by ``C0, ...``.

    pool : Tuple[Integral, ...], (P0, ...)
        The extent of the pooling window along the ``(C0, ...)`` axes, respectively. The
        length of `pool` determines ``N`` - the number of trailing dimensions to pool over.

    stride : Union[Integral, Tuple[Integral, ...]], (S0, ...)
        The spacing used to place the pooling window along the ``(C0, ...)`` axes,
        respectively. If a single value is provided, it is used for all ``N`` pooling axes.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    mygrad.Tensor, shape=([...], G0, ...)
        The pooled data batch.

    Notes
    -----
    Only "valid" placements of the pooling window are permitted - the pooling
    window cannot extend past the "boundaries" of the data dimensions.

    Examples
    --------
    Simple 2D pooling on a 2D tensor. Tiling a 2x2 max-pool window with
    stride-1 over a shape-(3, 3) tensor ``x``:

    >>> import mygrad as mg
    >>> from mygrad.nnet import max_pool
    >>> x = mg.Tensor([[0., 10.,  8.],
    ...                [2.,  7.,  3.],
    ...                [5.,  7., 20.]])
    >>> out = max_pool(x, pool=(2, 2), stride=1)
    >>> out
    Tensor([[ 10., 10.],
            [  7., 20.]])
    >>> out.sum().backward()  # sum to reduce to scalar for back-prop
    >>> x.grad  # dout/dx
    array([[0., 2., 0.],
           [0., 1., 0.],
           [0., 0., 1.]])

    Let's perform 1D pooling on a 2D tensor. Each row of the tensor
    will be pooled over independently. Let's apply a size-2 max-pool
    window to each row of ``x``, using a stride of 1:

    >>> x = mg.Tensor([[0., 10.,  8.],
    ...                [9.,  7.,  3.],
    ...                [5.,  0., 20.]])
    >>> max_pool(x, pool=(2,), stride=1)
    Tensor([[10., 10.],
            [ 9.,  7.],
            [ 5., 20.]])

    Here we perform pooling over the trailing two dimensions of a
    4D tensor, ``x``. By specifying ``pool = (2, 2)``, we instruct
    ``max_pool`` to tile a 2x2 pooling window along these last two
    axes. Let's apply the window every two rows, and for each column;
    i.e. we specify ``stride = (2, 1)``:

    >>> import numpy as np
    >>> x = mg.Tensor(np.random.rand(10, 3, 12, 12))
    >>> pool = (2, 2)    # 2x2 pooling over the last axes
    >>> stride = (2, 1)  # Apply 2x1 stride
    >>> out = max_pool(x, pool, stride)  # max-pooled Tensor
    >>> out.shape
    (10, 3, 6, 11)

    Had we specified, say, ``pool = (3, 2, 2)``, then a 3x2x2 pooling
    window would have been tiled along the last *three* axes of ``x``.
    """
    return Tensor._op(MaxPoolND, x, op_args=(pool, stride), constant=constant)
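
# A minimal sketch (a hypothetical helper, not part of the library) of the grid-size
# arithmetic implied by "valid" window placements: a pooled axis of length C shrinks to
# G = (C - P) // S + 1 for pool-extent P and stride S (this assumes ``max_pool`` follows
# that convention), checked against the shapes quoted in the examples above.
def _example_max_pool_grid_size():
    import numpy as np

    def pooled_size(C, P, S):
        return (C - P) // S + 1

    assert pooled_size(C=3, P=2, S=1) == 2    # the shape-(3, 3) examples
    assert pooled_size(C=12, P=2, S=2) == 6   # rows of the 4D example
    assert pooled_size(C=12, P=2, S=1) == 11  # columns of the 4D example
    x = Tensor(np.random.rand(10, 3, 12, 12))
    assert max_pool(x, pool=(2, 2), stride=(2, 1)).shape == (10, 3, 6, 11)
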