Exemplo n.º 1
0
def test_reject_bad_filt_number():
    """
    Check that FilterActs and ImageActs reject an unsupported filter count.

    Each op is applied to filters built with ``num_filters = 6`` and the
    compiled function is expected to raise ``ValueError`` when it is run.
    """
    for cls in (FilterActs, ImageActs):
        # Tests that running the op with a # of filters per
        # group that is not 16 is an error
        rng = np.random.RandomState([2012, 10, 9])
        batch_size = 5
        rows = 10
        cols = 9
        channels = 3
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 6

        images = shared(rng.uniform(
            -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                        name='images')
        filters = shared(rng.uniform(-1., 1.,
                                     (channels, filter_rows, filter_cols,
                                      num_filters)).astype('float32'),
                         name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        if cls is ImageActs:
            # ImageActs takes a third input; here it is the (rows, cols) pair.
            output = cls()(gpu_images, gpu_filters,
                           as_tensor_variable((rows, cols)))
        else:
            output = cls()(gpu_images, gpu_filters)
        f = function([], output)
        try:
            f()
        except ValueError:
            # Expected outcome: the op refused the bad filter count.
            continue
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let this test pass silently.
        raise AssertionError(
            '{0} accepted num_filters={1} without raising ValueError'.format(
                cls.__name__, num_filters))
Exemplo n.º 2
0
def test_match_valid_conv():
    """
    Check FilterActs (no padding) against theano's conv2d in valid mode.

    The same random images and filters are fed to both implementations and
    the test fails when the largest absolute difference between the two
    outputs exceeds 2.4e-6.
    """

    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    # The images are drawn before the filters; keep this order so the seeded
    # generator produces the same values.
    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    # conv2d is given the last axis moved to the front, and the filters
    # flipped along both spatial axes.
    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    # Move the leading axis back to the end to match the FilterActs layout.
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn(
        """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )

    # Compare shapes before subtracting: arrays of different shapes would
    # either raise inside numpy or silently broadcast, hiding the real cause.
    if output.shape != output_conv2d.shape:
        raise AssertionError(
            'shape mismatch: cuda-convnet {0} vs theano {1}'.format(
                output.shape, output_conv2d.shape))

    err = np.abs(output - output_conv2d)
    if err.max() > 2.4e-6:
        # An explicit raise survives `python -O` and carries the diagnostics
        # in the failure message itself.
        raise AssertionError('\n'.join([
            'FilterActs output differs from conv2d by more than 2.4e-6',
            'types: {0} vs {1}'.format(type(output), type(output_conv2d)),
            'dtypes: {0} vs {1}'.format(output.dtype, output_conv2d.dtype),
            'absolute error range: {0}'.format((err.min(), err.max())),
            'mean absolute error: {0}'.format(err.mean()),
            'cuda-convnet value range: {0}'.format(
                (output.min(), output.max())),
            'theano value range: {0}'.format(
                (output_conv2d.min(), output_conv2d.max())),
        ]))
Exemplo n.º 3
0
def insert_gpu_weight_acts(node):
    """
    Propose a `GpuWeightActs` replacement for a `WeightActs` node.

    Parameters
    ----------
    node : object
        Graph node to inspect. Only its `op`, `inputs` and `outputs`
        attributes are used; `inputs` must unpack into
        ``(images, hidacts, frows, fcols)``.

    Returns
    -------
    list or None
        A one-element list holding the `host_from_gpu` result of the GPU
        op when `node.op` is a `WeightActs` and either
        ``any_from_gpu(images, hidacts)`` or
        ``any_gpu_client(*node.outputs)`` is true; otherwise None.
    """
    if not isinstance(node.op, WeightActs):
        return None

    images, hidacts, frows, fcols = node.inputs
    if not (any_from_gpu(images, hidacts) or any_gpu_client(*node.outputs)):
        return None

    # The stride is copied from the original op; partial_sum is fixed to 1.
    gpu_weight_acts = GpuWeightActs(
        module_stride=node.op.module_stride, partial_sum=1)
    replacement = gpu_weight_acts(
        gpu_from_host(images),
        gpu_contiguous(hidacts),
        frows,
        fcols,
    )
    return [host_from_gpu(replacement)]
Exemplo n.º 4
0
def insert_gpu_filter_acts(node):
    """
    Propose a `GpuFilterActs` replacement for a `FilterActs` node.

    Parameters
    ----------
    node : object
        Graph node to inspect. Only its `op`, `inputs` and `outputs`
        attributes are used; `inputs` must unpack into
        ``(images, filters)``.

    Returns
    -------
    list or None
        A one-element list holding the `host_from_gpu` result of the GPU
        op when `node.op` is a `FilterActs` and either
        ``any_from_gpu(images, filters)`` or
        ``any_gpu_client(*node.outputs)`` is true; otherwise None.
    """
    if not isinstance(node.op, FilterActs):
        return None

    images, filters = node.inputs
    wants_gpu = (any_from_gpu(images, filters)
                 or any_gpu_client(*node.outputs))
    if not wants_gpu:
        return None

    # The stride is copied from the original op; partial_sum is fixed to 1.
    gpu_op = GpuFilterActs(module_stride=node.op.module_stride,
                           partial_sum=1)
    images_on_gpu = gpu_from_host(images)
    filters_on_gpu = gpu_from_host(filters)
    return [host_from_gpu(gpu_op(images_on_gpu, filters_on_gpu))]
Exemplo n.º 5
0
def test_cross_map_norm_grad_simple():
    """
    Yield one `verify_grad` check of `CrossMapNorm` per input array.

    This is a generator: each yielded ``(callable, array)`` pair is meant
    to be invoked by the test runner as ``callable(array)``.
    """
    rng = numpy.random.RandomState([2013, 2, 10])
    op = CrossMapNorm(16, 15/16., 1, True)

    # Named functions (rather than lambdas bound to names) give the yielded
    # callable a readable name in test reports and tracebacks.
    def make_graph(inp):
        # Only the first output of the op is checked.
        return op(gpu_from_host(inp))[0]

    def verify(array):
        return verify_grad(make_graph, [array])

    inputs = [numpy.ones((16, 1, 1, 1), dtype='float32'),
              rng.normal(size=(32, 5, 5, 10)).astype('float32')]
    for arr in inputs:
        yield verify, arr
Exemplo n.º 6
0
    def lmul(self, x):
        """
        Apply `FilterActs` with this object's filters to `x`.

        dot(x, A)
        aka, do convolution with input image x

        Parameters
        ----------
        x : 4-D variable
            Input whose axes are laid out as described by
            `self.input_axes`.

        Returns
        -------
        rval : variable
            Result with its axes arranged according to `self.output_axes`.
            It is passed through `host_from_gpu` when the type name of the
            incoming `x` did not contain 'Cuda'.
        """

        check_cuda(str(type(self)) + ".lmul")

        # The input counts as a host variable unless its type name
        # mentions 'Cuda'; host inputs are transferred first.
        on_host = 'Cuda' not in str(type(x))
        if on_host:
            x = gpu_from_host(x)

        # FilterActs wants channel, topo dim 0, topo dim 1, batch_index.
        assert x.ndim == 4
        in_axes = self.input_axes
        assert len(in_axes) == 4

        conv_axes = ('c', 0, 1, 'b')

        if tuple(in_axes) != conv_axes:
            to_conv_order = [in_axes.index(name) for name in conv_axes]
            x = x.dimshuffle(*to_conv_order)

        x = gpu_contiguous(x)

        # Objects restored from old pickle files lack this attribute.
        if not hasattr(self, 'kernel_stride'):
            self.kernel_stride = (1, 1)
        conv_op = FilterActs(self.pad, self.partial_sum,
                             self.kernel_stride[0])
        rval = conv_op(x, self._filters)

        # Rearrange the result to the layout requested for the output.
        out_axes = self.output_axes
        assert len(out_axes) == 4

        if on_host:
            rval = host_from_gpu(rval)

        if tuple(out_axes) != conv_axes:
            from_conv_order = [conv_axes.index(name) for name in out_axes]
            rval = rval.dimshuffle(*from_conv_order)

        return rval
Exemplo n.º 7
0
def insert_gpu_img_acts(node):
    """
    Propose a `GpuImgActs` replacement for an `ImgActs` node.

    Parameters
    ----------
    node : object
        Graph node to inspect. Only its `op`, `inputs` and `outputs`
        attributes are used; `inputs` must unpack into
        ``(filters, hidacts, irows, icols)``.

    Returns
    -------
    list or None
        A one-element list holding the `host_from_gpu` result of the GPU
        op when `node.op` is an `ImgActs` and either
        ``any_from_gpu(filters, hidacts)`` or
        ``any_gpu_client(*node.outputs)`` is true; otherwise None.
    """
    if not isinstance(node.op, ImgActs):
        return None

    filters, hidacts, irows, icols = node.inputs
    wants_gpu = (any_from_gpu(filters, hidacts)
                 or any_gpu_client(*node.outputs))
    if not wants_gpu:
        return None

    # The stride is copied from the original op; partial_sum is fixed to 1.
    gpu_op = GpuImgActs(module_stride=node.op.module_stride, partial_sum=1)
    filters_on_gpu = gpu_from_host(filters)
    hidacts_contiguous = gpu_contiguous(hidacts)
    replacement = gpu_op(filters_on_gpu, hidacts_contiguous, irows, icols)
    return [host_from_gpu(replacement)]
Exemplo n.º 8
0
def test_match_full_conv_grad():
    """
    Check the gradients of ImageActs against those of conv2d in full mode.

    Tests that the gradient of ImageActs with no padding is the same as the
    gradient of theano's conv2D in full mode after flipping the kernel and
    transposing the output and input channels. Both outputs are projected
    onto the same random tensor and the gradients with respect to the
    hidden activations and the filters are compared.
    """

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(
        -1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1,
                  batch_size)).astype('float32'),
                      name='hidacts')

    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_hid_acts = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # Pass the image size through the named constants instead of repeating
    # the literal (6, 7).
    output = ImageActs()(gpu_hid_acts, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)

    random = theano_rng.normal(size=output_conv2d.shape,
                               dtype=output_conv2d.dtype)

    # The same random projection is applied to both outputs so that their
    # gradients are directly comparable.
    projected = (output * random).sum()
    projected_conv_2d = (output_conv2d * random).sum()

    grads = T.grad(projected, [hid_acts, filters]) + T.grad(
        projected_conv_2d, [hid_acts, filters])

    f = function([], grads)

    gi, gf, gi_th, gf_th = f()

    # Explicit raises are used throughout: they survive `python -O` and
    # report the offending numbers.
    if gi.shape != gi_th.shape:
        raise AssertionError(
            'hid_acts gradient shape mismatch: {0} vs {1}'.format(
                gi.shape, gi_th.shape))
    diff = np.abs(gi - gi_th).max()
    if diff > 2.9e-6:
        raise AssertionError(
            'hid_acts gradient max abs error {0} exceeds 2.9e-6'.format(diff))

    if gf.shape != gf_th.shape:
        raise AssertionError(
            'filters gradient shape mismatch: {0} vs {1}'.format(
                gf.shape, gf_th.shape))
    diff = np.abs(gf - gf_th).max()
    if diff > 1.5e-6:
        raise AssertionError(
            'filters gradient max abs error {0} exceeds 1.5e-6'.format(diff))
Exemplo n.º 9
0
def test_match_full_conv():
    """
    Check ImageActs (no padding) against theano's conv2d in full mode.

    Tests that running ImageActs with no padding is the same as running
    theano's conv2D in full mode after flipping the kernel and transposing
    the output and input channels. In other words, if convolution computes
    H=XK, we now compute R=HK^T. The test fails when the largest absolute
    difference between the two outputs exceeds 2.4e-6.
    """

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(
        -1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1,
                  batch_size)).astype('float32'),
                      name='hidacts')

    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_hid_acts = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    # Pass the image size through the named constants instead of repeating
    # the literal (6, 7).
    output = ImageActs()(gpu_hid_acts, gpu_filters,
                         as_tensor_variable((rows, cols)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn(
        """test_match_full_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )

    # Compare shapes before subtracting: arrays of different shapes would
    # either raise inside numpy or silently broadcast, hiding the real cause.
    if output.shape != output_conv2d.shape:
        raise AssertionError(
            'shape mismatch: cuda-convnet {0} vs theano {1}'.format(
                output.shape, output_conv2d.shape))

    err = np.abs(output - output_conv2d)
    if err.max() > 2.4e-6:
        # An explicit raise survives `python -O` and carries the diagnostics
        # in the failure message itself.
        raise AssertionError('\n'.join([
            'ImageActs output differs from conv2d by more than 2.4e-6',
            'types: {0} vs {1}'.format(type(output), type(output_conv2d)),
            'dtypes: {0} vs {1}'.format(output.dtype, output_conv2d.dtype),
            'absolute error range: {0}'.format((err.min(), err.max())),
            'mean absolute error: {0}'.format(err.mean()),
            'cuda-convnet value range: {0}'.format(
                (output.min(), output.max())),
            'theano value range: {0}'.format(
                (output_conv2d.min(), output_conv2d.max())),
        ]))
Exemplo n.º 10
0
def main():
    """
    Randomly perturb inputs to enlarge the FilterActs / conv2d mismatch.

    Builds the same comparison between FilterActs (no padding) and theano's
    conv2D in valid mode that the tests use, then loops forever: each
    iteration perturbs either the filters or the images and keeps the
    change only if the largest absolute difference between the two outputs
    grew. Progress is reported through ``logger.debug``. This function
    never returns.
    """
    logger = logging.getLogger(__name__)

    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 128
    rows = cols = 32
    channels = 3
    filter_rows = filter_cols = 7
    num_filters = 16

    # Images are drawn before filters so the seeded stream is unchanged.
    image_shape = (channels, rows, cols, batch_size)
    images = shared(rng.uniform(-1., 1., image_shape).astype('float32'),
                    name='images')
    filter_shape = (channels, filter_rows, filter_cols, num_filters)
    filters = shared(rng.uniform(-1., 1., filter_shape).astype('float32'),
                     name='filters')

    output = host_from_gpu(
        FilterActs()(gpu_from_host(images), gpu_from_host(filters)))

    # conv2d is given the last axis moved to the front, and the filters
    # flipped along both spatial axes.
    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)[:, :, ::-1, ::-1]
    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    def max_abs_diff():
        # Largest absolute disagreement for the current shared values.
        ours, reference = f()
        return np.abs(ours - reference).max()

    best_err = max_abs_diff()
    accepted_steps = 0

    while True:
        logger.debug('Current error: {0}'.format(best_err))

        # The draws below happen in a fixed order (target, kind of move,
        # then the move itself) so a given seed replays the same search.
        target = filters if rng.randint(2) else images
        saved = target.get_value()

        if rng.randint(2) == 0:
            # Small perturbation of every entry.
            candidate = saved + rng.uniform(-.1, .1, saved.shape)
        else:
            # Larger perturbation of one randomly chosen entry.
            position = tuple(rng.randint(extent) for extent in saved.shape)
            candidate = saved.copy()
            candidate[position] += rng.uniform(-1., 1.)
        candidate = candidate.astype(saved.dtype)

        target.set_value(candidate)
        candidate_err = max_abs_diff()

        if candidate_err <= best_err:
            # No improvement: undo the perturbation.
            logger.debug(
                'Failed to move beyond step {0}'.format(accepted_steps))
            target.set_value(saved)
        else:
            best_err = candidate_err
            accepted_steps += 1
Exemplo n.º 11
0
def test_grad_strided():
    """
    Check the gradients of strided FilterActs against strided conv2d.

    Both outputs are projected onto the same random tensor and the
    gradients with respect to the images and the filters are compared.
    The test fails when the largest absolute difference exceeds 1.15e-5
    for the images or 1e-5 for the filters.
    """

    def check_close(label, ours, reference, tol):
        # Raise AssertionError with a full report unless `ours` matches
        # `reference` to within `tol` (largest absolute difference).
        # Shapes are compared first because subtracting arrays of
        # different shapes would either raise or silently broadcast.
        if ours.shape != reference.shape:
            raise AssertionError(
                '{0}: shape mismatch, cuda-convnet {1} vs theano {2}'.format(
                    label, ours.shape, reference.shape))
        err = np.abs(ours - reference)
        if err.max() > tol:
            raise AssertionError('\n'.join([
                '{0}: max abs error exceeds {1}'.format(label, tol),
                'types: {0} vs {1}'.format(type(ours), type(reference)),
                'dtypes: {0} vs {1}'.format(ours.dtype, reference.dtype),
                'absolute error range: {0}'.format((err.min(), err.max())),
                'mean absolute error: {0}'.format(err.mean()),
                'cuda-convnet value range: {0}'.format(
                    (ours.min(), ours.max())),
                'theano value range: {0}'.format(
                    (reference.min(), reference.max())),
            ]))

    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 9
    cols = 9
    channels = 3
    filter_rows = 3
    filter_cols = filter_rows
    num_filters = 16
    stride = 3

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs(stride=stride)(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01,
                           filters_bc01,
                           border_mode='valid',
                           subsample=(stride, stride))
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    # The gradient comparison is only meaningful if both implementations
    # produce outputs of the same shape for this input size.
    checker = function([], [output, output_conv2d])
    output_numpy, output_conv2d_numpy = checker()
    if output_numpy.shape != output_conv2d_numpy.shape:
        raise AssertionError(
            "theano and cuda convnet follow different conventions for this input size, so we can't test cuda convnet by matching it against theano for these inputs"
        )

    # Proper random projection, like verify_grad does.
    theano_rng = MRG_RandomStreams(2013 * 5 * 4)
    cost_weights = theano_rng.normal(size=output_conv2d.shape,
                                     dtype=output_conv2d.dtype)
    cost = (cost_weights * output).sum()

    # XXX: use verify_grad
    images_grad, filters_grad = grad(cost, [images, filters])
    reference_cost = (cost_weights * output_conv2d).sum()
    images_conv2d_grad, filters_conv2d_grad = grad(reference_cost,
                                                   [images, filters])

    f = function(
        [],
        [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad])

    images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f()

    warnings.warn(
        """test_grad_strided success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )

    check_close('IMAGES GRADIENT', images_grad, images_conv2d_grad, 1.15e-5)
    check_close('FILTERS GRADIENT', filters_grad, filters_conv2d_grad, 1e-5)
Exemplo n.º 12
0
def test_grad():
    """
    Check the gradients of FilterActs against those of conv2d (valid mode).

    Both outputs are projected onto the same random weights and the
    gradients with respect to the images and the filters are compared.
    The test fails when the largest absolute difference of either gradient
    exceeds 1.15e-5.
    """

    def check_close(label, ours, reference, tol):
        # Raise AssertionError with a full report unless `ours` matches
        # `reference` to within `tol` (largest absolute difference).
        # Shapes are compared first because subtracting arrays of
        # different shapes would either raise or silently broadcast.
        if ours.shape != reference.shape:
            raise AssertionError(
                '{0}: shape mismatch, cuda-convnet {1} vs theano {2}'.format(
                    label, ours.shape, reference.shape))
        err = np.abs(ours - reference)
        if err.max() > tol:
            raise AssertionError('\n'.join([
                '{0}: max abs error exceeds {1}'.format(label, tol),
                'types: {0} vs {1}'.format(type(ours), type(reference)),
                'dtypes: {0} vs {1}'.format(ours.dtype, reference.dtype),
                'absolute error range: {0}'.format((err.min(), err.max())),
                'mean absolute error: {0}'.format(err.mean()),
                'cuda-convnet value range: {0}'.format(
                    (ours.min(), ours.max())),
                'theano value range: {0}'.format(
                    (reference.min(), reference.max())),
            ]))

    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    # Proper random projection, like verify_grad does.
    cost_weights = rng.normal(size=(num_filters, rows - filter_rows + 1,
                                    cols - filter_cols + 1, batch_size))
    cost = (constant(cost_weights) * output).sum()

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)
    # XXX: use verify_grad
    # `cost` is already a scalar sum, so it is differentiated directly.
    images_grad, filters_grad = grad(cost, [images, filters])
    reference_cost = (constant(cost_weights) * output_conv2d).sum()
    images_conv2d_grad, filters_conv2d_grad = grad(reference_cost,
                                                   [images, filters])
    f = function(
        [],
        [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad])

    images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f()

    warnings.warn(
        """test_grad success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )

    check_close('IMAGES GRADIENT', images_grad, images_conv2d_grad, 1.15e-5)
    check_close('FILTERS GRADIENT', filters_grad, filters_conv2d_grad,
                1.15e-5)
Exemplo n.º 13
0
def test_match_grad_valid_conv():
    """
    Check that WeightActs is the gradient of FilterActs w.r.t. the weights.

    For each `partial_sum` in (0, 1, 4) the FilterActs output is compared
    with theano's conv2d in valid mode (tolerance 8e-6), and the WeightActs
    result is compared with the conv2d gradient with respect to the filters
    (tolerance 8.6e-6).
    """

    def check_close(label, ours, reference, tol):
        # Raise AssertionError with a full report unless `ours` matches
        # `reference` to within `tol` (largest absolute difference).
        # Shapes are compared first because subtracting arrays of
        # different shapes would either raise or silently broadcast.
        if ours.shape != reference.shape:
            raise AssertionError(
                '{0}: shape mismatch, cuda-convnet {1} vs theano {2}'.format(
                    label, ours.shape, reference.shape))
        err = np.abs(ours - reference)
        if err.max() > tol:
            raise AssertionError('\n'.join([
                '{0}: max abs error exceeds {1}'.format(label, tol),
                'types: {0} vs {1}'.format(type(ours), type(reference)),
                'dtypes: {0} vs {1}'.format(ours.dtype, reference.dtype),
                'absolute error range: {0}'.format((err.min(), err.max())),
                'mean absolute error: {0}'.format(err.mean()),
                'cuda-convnet value range: {0}'.format(
                    (ours.min(), ours.max())),
                'theano value range: {0}'.format(
                    (reference.min(), reference.max())),
            ]))

    for partial_sum in [0, 1, 4]:
        # Re-seeded on every iteration so each setting sees the same data.
        rng = np.random.RandomState([2012, 10, 9])

        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        images = shared(rng.uniform(
            -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                        name='images')
        filters = rng.uniform(-1., 1., (channels, filter_rows, filter_cols,
                                        num_filters)).astype('float32')
        filters = shared(filters, name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters)
        output = host_from_gpu(output)

        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

        output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

        output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

        theano_rng = MRG_RandomStreams(2013 + 1 + 31)

        coeffs = theano_rng.normal(avg=0.,
                                   std=1.,
                                   size=output_conv2d.shape,
                                   dtype='float32')

        cost_conv2d = (coeffs * output_conv2d).sum()

        weights_grad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * output).sum()
        hid_acts_grad = T.grad(cost, output)

        # Pass the filter size through the named constants instead of
        # repeating the literal (4, 4).
        weights_grad = WeightActs(partial_sum=partial_sum)(
            gpu_images, gpu_from_host(hid_acts_grad), as_tensor_variable(
                (filter_rows, filter_cols)))[0]
        weights_grad = host_from_gpu(weights_grad)

        f = function(
            [], [output, output_conv2d, weights_grad, weights_grad_conv2d])

        (output_val, output_conv2d_val,
         weights_grad_val, weights_grad_conv2d_val) = f()

        check_close('output (partial_sum={0})'.format(partial_sum),
                    output_val, output_conv2d_val, 8e-6)

        warnings.warn(
            "test_match_grad_valid_conv success criterion is not very strict."
            " Can we verify that this is OK? One possibility is that theano"
            " is numerically unstable and Alex's code is better. Probably"
            " theano CPU 64 bit is OK but it's worth checking the others.")

        check_close('weights gradient (partial_sum={0})'.format(partial_sum),
                    weights_grad_val, weights_grad_conv2d_val, 8.6e-6)