Example #1
    def backward(self, grad, *, graph, **kwargs):
        """ Back-propagates the gradient through all of the operation's inputs.
            Constant tensors do not propagate a gradient.

            grad : numpy.ndarray
                The back-propagated total derivative with respect to the present
                operation (`f`): d(out)/df

            graph : Set[Operation]"""
        for index, var in enumerate(self.variables):
            if not var.constant:
                if not var._ops:
                    raise Exception(
                        "Invalid Backprop: part of the computational graph containing "
                        "this tensor was cleared prior to backprop")
                if var.grad is None:
                    tmp_grad = reduce_broadcast(
                        self.backward_var(grad, index, **kwargs), var.shape)
                    var.grad = np.copy(tmp_grad) if np.shares_memory(
                        tmp_grad, grad) else tmp_grad
                else:
                    var.grad += reduce_broadcast(
                        self.backward_var(grad, index, **kwargs), var.shape)

        for var in {
                i
                for i in self.variables
                if not i.constant and i.creator is not None
        }:
            var._accum_ops.add(self)
            var._backward(graph=graph)
Example #2
def test_bad_gradient_dimensionality(shapes: Tuple[Tuple[int, ...],
                                                   Tuple[int, ...]]):
    """ test that grad.dim < len(var_shape) raises ValueError"""
    var_shape = shapes[0]
    grad = np.empty(shapes[1])
    with raises(ValueError):
        reduce_broadcast(grad=grad, var_shape=var_shape)
Example #3
def test_reduce_broadcast_shape_consistency(shapes: hnp.BroadcastableShapes):
    grad = np.zeros(shapes.result_shape)

    assert (reduce_broadcast(
        grad,
        var_shape=shapes.input_shapes[0]).shape == shapes.input_shapes[0])
    assert (reduce_broadcast(
        grad,
        var_shape=shapes.input_shapes[1]).shape == shapes.input_shapes[1])
Example #4
def test_hybrid_broadcasting(grad):
    """ tests new-dim and keep-dim broadcasting
         (3, 1, 2) -> (5, 3, 4, 2)"""
    var_shape = (3, 1, 2)
    reduced = reduce_broadcast(grad=grad, var_shape=var_shape)
    answer = grad.sum(axis=0).sum(axis=-2, keepdims=True)
    assert_allclose(actual=reduced, desired=answer)
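
The semantics these tests exercise can be reproduced with a short NumPy-only function. The sketch below (the name reduce_broadcast_sketch is invented for illustration) sums over the leading axes that broadcasting introduced and then sums, keeping dims, over the size-1 axes that were stretched; it mirrors the behavior shown in these examples and is not the library's actual reduce_broadcast.

import numpy as np


def reduce_broadcast_sketch(grad, var_shape):
    """ Illustrative only: sum `grad` over the axes along which it was
        broadcast from an array of shape `var_shape`."""
    if grad.ndim < len(var_shape):
        raise ValueError("`grad` has fewer dimensions than `var_shape`")

    # sum away the leading axes that broadcasting prepended,
    # e.g. (5, 3, 4, 2) -> (3, 4, 2)
    num_new_axes = grad.ndim - len(var_shape)
    if num_new_axes:
        grad = grad.sum(axis=tuple(range(num_new_axes)))

    # sum (keeping dims) over the axes that were stretched from size 1,
    # e.g. (3, 4, 2) -> (3, 1, 2)
    keepdim_axes = tuple(n for n, size in enumerate(var_shape)
                         if size == 1 and grad.shape[n] != 1)
    if keepdim_axes:
        grad = grad.sum(axis=keepdim_axes, keepdims=True)
    return grad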
Example #5
def test_reduce_broadcast_nokeepdim(var_shape, data):
    """ example broadcasting: (2, 3) -> (5, 2, 3)"""
    grad = data.draw(hnp.arrays(dtype=float,
                                shape=broadcastable_shape(
                                    shape=var_shape,
                                    min_dim=len(var_shape) + 1,
                                    max_dim=len(var_shape) + 3),
                                elements=st.just(1.)),
                     label='grad')
    assume(1 not in grad.shape[-len(var_shape):])
    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    reduced_grad *= np.prod(
        var_shape) / grad.size  # scale reduced-grad so all elements are 1
    assert_allclose(actual=reduced_grad, desired=np.ones(var_shape))
Example #6
def test_reduce_broadcast_nokeepdim(var_shape, data):
    """ example broadcasting: (2, 3) -> (5, 2, 3)"""
    grad_shape = data.draw(
        broadcastable_shapes(
            shape=var_shape,
            min_dims=len(var_shape) + 1,
            max_dims=len(var_shape) + 3,
            min_side=2,
        ),
        label="grad_shape",
    )
    grad = np.ones(grad_shape, dtype=float)

    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    reduced_grad *= (np.prod(var_shape) / grad.size
                     )  # scale reduced-grad so all elements are 1
    assert_allclose(actual=reduced_grad, desired=np.ones(var_shape))
Example #7
def test_reduce_broadcast_keepdim(var_shape, data):
    """ example broadcasting: (2, 1, 4) -> (2, 5, 4)"""
    grad = data.draw(hnp.arrays(dtype=float,
                                shape=broadcastable_shape(
                                    shape=var_shape,
                                    min_dim=len(var_shape),
                                    max_dim=len(var_shape)),
                                elements=st.just(1.)),
                     label='grad')

    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    assert reduced_grad.shape == tuple(i if i < j else j
                                       for i, j in zip(var_shape, grad.shape))
    assert all(i == 1 for i, j in zip(var_shape, grad.shape) if i < j)
    sum_axes = tuple(n for n, (i, j) in enumerate(zip(var_shape, grad.shape))
                     if i != j)
    assert_allclose(actual=reduced_grad,
                    desired=grad.sum(axis=sum_axes, keepdims=True))
Example #8
    def backward_var(self, grad, index, **kwargs):
        """
        example
        -------
        fwd:          "ijk, k -> ji", x, y
        bkwd (var: 0): "ji, k -> ijk", grad, y
        bkwd (var: 1): "ji, ijk -> k", grad, x
        """

        # ijk, k
        in_lbls = copy(self.in_lbls)
        original_var_lbl = in_lbls.pop(index)
        var = self.variables[index]

        factor = self.cache[(var, original_var_lbl)]
        if factor == 0:
            # the gradient for the current tensor-label pair
            # has already been computed, scaled, and back-propped,
            # skip gradient calculation.
            raise SkipGradient()

        numpy_arrays = tuple(i.data for i in self.variables)
        self.cache[(var, original_var_lbl)] = 0

        var_lbl = _unique_from_end(original_var_lbl)
        repeat_lbls = len(var_lbl) != len(original_var_lbl)

        if repeat_lbls:
            # example fwd-prop: einsum("iji -> ij", x)
            # "iji" becomes "ji", later we will write along
            # the diagonal of an array to reinstate this axis that
            # we just removed
            mapping_gen = ({k: v
                            for k, v in zip(lbl, arr.shape)}
                           for lbl, arr in zip(self.in_lbls, numpy_arrays))
            lbl_to_size = _merge_max_mappings(*mapping_gen)
            var_shape = tuple(lbl_to_size[lbl] for lbl in var_lbl)
        else:
            var_shape = self.variables[index].shape

        # ji
        grad_lbl = self.out_lbls

        # Catch indices over which un-contracted sum was performed
        # for the given variable: e.g for var-0 in "ijk, jk -> k"
        # i is summed over without contraction with another tensor
        #
        # Backpropping through this is illegal, as it requires the creation
        # of an axis; e.g. k, jk -> ijk
        # Broadcast the gradient along all such dimensions; e.g. k -> ik
        # then proceed as usual; e.g. ik, jk -> ijk
        unique_in_lbls = set(chain.from_iterable(in_lbls)) | set(grad_lbl)
        if len(set(var_lbl) - unique_in_lbls) > 0:
            exp_dims = [slice(None) for i in range(grad.ndim)]
            grad_shape = list(grad.shape)
            for n, lbl in enumerate(var_lbl):
                if lbl not in unique_in_lbls:
                    grad_lbl = grad_lbl[:n] + lbl + grad_lbl[n:]
                    exp_dims.insert(n, np.newaxis)
                    grad_shape.insert(n, var_shape[n])

            grad = np.broadcast_to(
                grad if not grad.ndim else grad[tuple(exp_dims)], grad_shape)

        # "ji, k -> ijk"
        back_prop_lbls = ",".join([grad_lbl] + in_lbls) + "->" + var_lbl

        # (grad, y)
        operands = (grad, ) + numpy_arrays[:index] + numpy_arrays[index + 1:]

        if not repeat_lbls:
            # dfdx: einsum("ji, k -> ijk", grad, y)
            outshape = self.variables[index].shape
            dfdx = reduce_broadcast(
                np.einsum(back_prop_lbls, *operands, optimize=self.optimize),
                outshape)
            if var_shape != dfdx.shape:
                # if y was broadcast over x, the gradient needs to
                # be broadcast to x's shape: dfdx-shape (i,j,1) -> (i,j,k)
                dfdx = np.broadcast_to(dfdx, var_shape)
            if factor > 1:
                # This tensor-label pair appears several times as
                # input to einsum. Scale the gradient accordingly
                # such that the full contribution of the tensor-label
                # pair is accounted for.
                dfdx *= factor
            return dfdx

        # Accommodate trace by writing to strided view on array of zeros
        # For example:
        #
        # fwd:  einsum('ijkji, k -> jk', x, y)
        # dfdx: einsum('jk, k -> kji', grad, y, out=view_of_x)
        #
        # writing to `view_of_x`, which is a view along the appropriate
        # diagonals of x, is equivalent to:
        #
        # dfdx: einsum('jk, k -> ijkji', grad, y)
        #
        # which is formally correct but not supported by einsum.
        dfdx = np.zeros(tuple(lbl_to_size[i] for i in original_var_lbl))
        out_view_shape = tuple(lbl_to_size[i] for i in var_lbl)

        # compute strides required to traverse the appropriate diagonals of
        # the output tensor.
        strides = tuple(
            sum(dfdx.strides[ind]
                for ind in _get_indices(lbl, original_var_lbl))
            for lbl in var_lbl)
        out_view = as_strided(dfdx, shape=out_view_shape, strides=strides)
        np.einsum(back_prop_lbls,
                  *operands,
                  out=out_view,
                  optimize=self.optimize)
        if factor > 1:
            # This tensor-label pair appears several times as
            # input to einsum. Scale the gradient accordingly
            # such that the full contribution of the tensor-label
            # pair is accounted for.
            dfdx *= factor
        return dfdx
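
The label bookkeeping described in the docstring above can be checked in isolation with plain np.einsum. The sketch below only verifies that the swapped signatures produce gradients with the inputs' shapes for the simple, non-repeated-label case; it does not reproduce the method's caching, broadcasting, or trace handling.

import numpy as np

# forward pass: einsum("ijk,k->ji", x, y)
x = np.random.rand(2, 3, 4)   # labels: ijk
y = np.random.rand(4)         # labels: k
out = np.einsum("ijk,k->ji", x, y)
grad = np.ones_like(out)      # incoming gradient, labels: ji

# backward for var 0: "ji,k->ijk" gives the gradient w.r.t. x
dfdx = np.einsum("ji,k->ijk", grad, y)
# backward for var 1: "ji,ijk->k" gives the gradient w.r.t. y
dfdy = np.einsum("ji,ijk->k", grad, x)

assert dfdx.shape == x.shape and dfdy.shape == y.shape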
Example #9
def test_reduce_broadcast_same_shape(grad):
    """ test when no broadcasting occurred"""
    var_shape = grad.shape
    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    assert_allclose(actual=reduced_grad, desired=grad)
Example #10
def test_broadcast_scalar(grad):
    """ test when grad was broadcasted from a scalar"""
    assert_allclose(reduce_broadcast(grad, tuple()), grad.sum())
Example #11
def finite_difference(f,
                      *args,
                      back_grad,
                      vary_ind=None,
                      h=Decimal(1) / Decimal(int(1e8)),
                      as_decimal=False,
                      kwargs=None):
    """ Computes numerical partial derivatives of f(x0, x1, ...) in each
        of its variables, using the central difference method.
        This is a "fast" method - it varies entire arrays at once. Thus
        this is only appropriate for trivial vectorized functions that
        map across entries of arrays (like add or multiply). E.g.
        matrix multiplication is *not* suited for this style of gradient.

        Parameters
        ----------
        f : Callable[[numpy.ndarray, ...], numpy.ndarray]
            f(x, ...) -> numpy.ndarray
        *args : Tuple[numpy.ndarray, ...]
            The input arguments to be fed to f.

        back_grad : numpy.ndarray
            The gradient being back-propagated to x and y, via f

        vary_ind : Optional[Tuple[int, ...]]
            If `None`, the partials of f with respect to all the inputs are
            computed. Otherwise you can specify a sequence of the indices
            of the variables whose partials are to be computed
               0 -> w.r.t x only, 1 -> w.r.t y only, etc.

        h : float, optional (default=Decimal(1E-8))
            Approximating infinitesimal.

        as_decimal : bool, optional (default=False)
            If True, f's arguments are passed as Decimal-type arrays. This
            improves numerical precision, but is not permitted by some functions.

        kwargs : Optional[Dict]

        Returns
        -------
        Tuple[Union[NoneType, numpy.ndarray], ...]
            df/dx0, df/dx1, ... - evaluated at (`x0`, `x1`, ... ).
        """
    def to_decimal_array(arr):
        """ Convert numpy ND-array to Decimal-type object array of the same shape.
            Used for facilitating high-precision arithmetic.

            Parameters
            ----------
            arr : Union[float, numpy.ndarray]

            Returns
            -------
            numpy.ndarray
                Decimal-type object array"""
        arr = np.asarray(arr)

        if arr.dtype.kind == "O":
            return arr
        return np.array(tuple(Decimal(float(i)) for i in arr.flat),
                        dtype=Decimal).reshape(arr.shape)

    if kwargs is None:
        kwargs = {}

    if not args:
        raise ValueError("At least one value must be passed to `args`")

    h = Decimal(h) if as_decimal else float(h)
    two_h = Decimal(2) * h if as_decimal else 2 * h

    args = tuple(to_decimal_array(i) if as_decimal else i for i in args)

    grads = [None] * len(args)

    def gen_fwd_diff(i):
        # x1, ..., x_i + h, ..., xn
        return ((var if j != i else var + h) for j, var in enumerate(args))

    def gen_bkwd_diff(i):
        # x1, ..., x_i - h, ..., xn
        return ((var if j != i else var - h) for j, var in enumerate(args))

    for n in range(len(args)):
        if vary_ind is not None and n not in vary_ind:
            continue
        # central difference in variable n
        dvar = (f(*gen_fwd_diff(n), **kwargs) -
                f(*gen_bkwd_diff(n), **kwargs)) / (two_h)
        grads[n] = reduce_broadcast(back_grad * dvar.astype(float),
                                    args[n].shape)

    return grads
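
In isolation, the idea is: for an elementwise f, the central difference (f(x + h) - f(x - h)) / (2h) approximates df/dx entrywise, the chain rule multiplies it by the back-propagated gradient, and reduce_broadcast collapses the result back to each input's shape. A minimal NumPy-only illustration of that recipe (not calling the helper itself; the array names are arbitrary):

import numpy as np

f = np.multiply
x = np.random.rand(3, 4)
y = np.random.rand(4)          # broadcasts against x
back_grad = np.ones((3, 4))    # gradient flowing back into f(x, y)
h = 1e-6

# central difference in x: df/dx ~ (f(x+h, y) - f(x-h, y)) / (2h), i.e. ~y elementwise
dfdx = (f(x + h, y) - f(x - h, y)) / (2 * h)
grad_x = back_grad * dfdx                      # chain rule; already has x's shape

# central difference in y, then sum over the broadcast axis to recover y's shape
dfdy = (f(x, y + h) - f(x, y - h)) / (2 * h)
grad_y = (back_grad * dfdy).sum(axis=0)        # the reduce_broadcast step: (3, 4) -> (4,)

np.testing.assert_allclose(grad_x, np.broadcast_to(y, x.shape), rtol=1e-5, atol=1e-8)
np.testing.assert_allclose(grad_y, x.sum(axis=0), rtol=1e-5, atol=1e-8)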
Example #12
def numerical_gradient(f,
                       *args,
                       back_grad,
                       vary_ind=None,
                       h=1e-20,
                       kwargs=None):
    """ Computes numerical partial derivatives of f(x0, x1, ...) in each
        of its variables, using the complex-step method.
        This is a "fast" method - it varies entire arrays at once. Thus
        this is only appropriate for trivial vectorized functions that
        map across entries of arrays (like add or multiply). E.g.
        matrix multiplication is *not* suited for this style of gradient.

        Parameters
        ----------
        f : Callable[[numpy.ndarray, ...], numpy.ndarray]
            f(x, ...) -> numpy.ndarray
        *args : Tuple[numpy.ndarray, ...]
            The input arguments to be fed to f.

        back_grad : numpy.ndarray
            The gradient being back-propagated to x and y, via f

        vary_ind : Optional[Tuple[int, ...]]
            If `None`, the partials of f with respect to all the inputs are
            computed. Otherwise you can specify a sequence of the indices
            of the variables whose partials are to be computed
               0 -> w.r.t x only, 1 -> w.r.t y only, etc.

        h : float, optional (default=1e-20)
            Approximating infinitesimal.

        kwargs : Optional[Dict]

        Returns
        -------
        Tuple[Union[NoneType, numpy.ndarray], ...]
            df/dx0, df/dx1, ... - evaluated at (`x0`, `x1`, ... ).
        """

    if kwargs is None:
        kwargs = {}

    if not args:
        raise ValueError("At least one value must be passed to `args`")

    args = tuple(i.astype(np.complex128) for i in args)
    grads = [None] * len(args)

    def gen_fwd_diff(i):
        # x1, ..., x_i + h, ..., xn
        return ((var if j != i else var + h * 1j)
                for j, var in enumerate(args))

    for n in range(len(args)):
        if vary_ind is not None and n not in vary_ind:
            continue
        # complex-step derivative in variable n
        dvar = f(*gen_fwd_diff(n), **kwargs).imag / h
        grads[n] = reduce_broadcast(back_grad * dvar, args[n].shape)

    return grads
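
This helper relies on the complex-step derivative: for an analytic, real-valued f, Im[f(x + ih)] / h estimates df/dx with no subtractive cancellation, which is why h can be as small as 1e-20. A standalone sketch of the trick (the name complex_step is invented for illustration):

import numpy as np


def complex_step(f, x, h=1e-20):
    """ Illustrative only: derivative of an elementwise, analytic `f`
        at `x` via Im[f(x + ih)] / h."""
    return f(x.astype(np.complex128) + h * 1j).imag / h


x = np.linspace(0.1, 2.0, 5)
# d/dx exp(x) = exp(x); the complex-step estimate agrees to machine precision
np.testing.assert_allclose(complex_step(np.exp, x), np.exp(x), rtol=1e-12)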
Example #13
def test_bad_gradient_dimensionality():
    """ test that grad.dim < len(var_shape) raises ValueError"""
    var_shape = (1, 2, 3)
    grad = np.empty((1, 2))
    with raises(ValueError):
        reduce_broadcast(grad=grad, var_shape=var_shape)