Exemplo n.º 1
0
    def impl_test_binaryop_2d(self, dtype):
        """Check misc.add/subtract/multiply/divide against NumPy for `dtype`.

        Random scalar (0-d), vector (3,) and matrix (3, 2) operand pairs are
        generated on the host, copied to the GPU with `gpuarray.to_gpu`, and
        every GPU result is compared with the matching NumPy expression.

        Parameters
        ----------
        dtype : type
            NumPy scalar type of the operands. Subclasses of
            `numbers.Integral` take the integer path.
        """
        is_integral = issubclass(dtype, numbers.Integral)
        if is_integral:
            # Values are drawn from [1, 10), so the divisor is never zero.
            a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            a_vec = np.random.randint(1, 10, 3).astype(dtype)
            b_vec = np.random.randint(1, 10, 3).astype(dtype)
            a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        else:
            a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            a_vec = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
            b_vec = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
            a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)

        a_sca_gpu = gpuarray.to_gpu(a_sca)
        b_sca_gpu = gpuarray.to_gpu(b_sca)
        a_vec_gpu = gpuarray.to_gpu(a_vec)
        b_vec_gpu = gpuarray.to_gpu(b_vec)
        a_mat_gpu = gpuarray.to_gpu(a_mat)
        b_mat_gpu = gpuarray.to_gpu(b_mat)

        # addition
        assert np.allclose(misc.add(a_sca_gpu, b_sca_gpu).get(), a_sca + b_sca)
        assert np.allclose(misc.add(a_vec_gpu, b_vec_gpu).get(), a_vec + b_vec)
        assert np.allclose(misc.add(a_mat_gpu, b_mat_gpu).get(), a_mat + b_mat)

        # subtract
        assert np.allclose(
            misc.subtract(a_sca_gpu, b_sca_gpu).get(), a_sca - b_sca)
        assert np.allclose(
            misc.subtract(a_vec_gpu, b_vec_gpu).get(), a_vec - b_vec)
        assert np.allclose(
            misc.subtract(a_mat_gpu, b_mat_gpu).get(), a_mat - b_mat)

        # multiplication
        assert np.allclose(
            misc.multiply(a_sca_gpu, b_sca_gpu).get(), a_sca * b_sca)
        assert np.allclose(
            misc.multiply(a_vec_gpu, b_vec_gpu).get(), a_vec * b_vec)
        assert np.allclose(
            misc.multiply(a_mat_gpu, b_mat_gpu).get(), a_mat * b_mat)

        # division
        if is_integral:
            # `/` on integer arrays is true division under Python 3 (or with
            # `from __future__ import division`) and yields floats. The GPU
            # quotient of integral operands is expected to stay integral, so
            # the host reference must use floor division.
            assert np.allclose(
                misc.divide(a_sca_gpu, b_sca_gpu).get(), a_sca // b_sca)
            assert np.allclose(
                misc.divide(a_vec_gpu, b_vec_gpu).get(), a_vec // b_vec)
            assert np.allclose(
                misc.divide(a_mat_gpu, b_mat_gpu).get(), a_mat // b_mat)
        else:
            assert np.allclose(
                misc.divide(a_sca_gpu, b_sca_gpu).get(), a_sca / b_sca)
            assert np.allclose(
                misc.divide(a_vec_gpu, b_vec_gpu).get(), a_vec / b_vec)
            assert np.allclose(
                misc.divide(a_mat_gpu, b_mat_gpu).get(), a_mat / b_mat)
Exemplo n.º 2
0
        def thunk():
            """Evaluate the node on the GPU and store three host arrays.

            Reads `inputs[0..2]`, and uses `A`, `b`, `x`, `depth`, `self`,
            `node`, `outputs` and `compute_map` from the enclosing scope
            (none of them are defined in this function). Writes `dalpha`,
            `dLq_t` and `dLq_f` into `outputs[0..2][0]` and marks every node
            output as computed.
            """
            # Upload the inputs: inputs[0] is squeezed and given a trailing
            # axis; only the first slice along axis 0 of inputs[1] and
            # inputs[2] is used.
            alpha = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
            x_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
            x_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])
            # exp(x . A + b), then divide each row by its sum over axis 1
            # (a softmax over axis 1).
            Xt = cumath.exp(misc.add(linalg.dot(x_t, A), b))
            Xf = cumath.exp(misc.add(linalg.dot(x_f, A), b))
            Xtn = misc.sum(Xt, axis=1, keepdims=True)
            Xfn = misc.sum(Xf, axis=1, keepdims=True)
            Xt = misc.divide(Xt, Xtn)
            Xf = misc.divide(Xf, Xfn)
            # Mixture of the two normalised arrays, weighted by alpha and
            # (1 - alpha).
            w = misc.multiply(Xt, alpha) + misc.multiply(Xf, 1 - alpha)
            dq = Xt - Xf
            qdw = dq / w
            # The three terms combined into dalpha below.
            t1 = misc.sum(x * qdw, axis=1)
            f = 2 * depth + self.base.n
            t2 = f * misc.sum(dq, axis=1) / misc.sum(w, axis=1)
            t3 = misc.sum(x, axis=1) * misc.sum(qdw, axis=1)
            dalpha = t1 - t2 + t3
            # Drop references to intermediates that are no longer needed.
            del dq, t1, f, t2, t3

            # Factor F that is reused for every row in the loop below.
            iw = 1 / w
            S1 = misc.multiply(
                depth[:, None] * (self.base.n - 1) / self.base.n, iw)
            S2 = (self.base.n + depth[:, None]) / cumath.log(
                misc.sum(w, axis=1, keepdims=True))
            # NOTE(review): F is scaled by alpha and then used for both the
            # x_t and the x_f result -- confirm the x_f branch should not be
            # weighted by (1 - alpha) instead.
            F = misc.multiply(misc.subtract((x * iw) - S1, S2), alpha)
            del w, iw, S1, S2

            # Zero matrix added to a single row below; presumably there to
            # expand the (1, k) row to the full (x_t.shape[1], k) shape --
            # TODO confirm misc.add broadcasts this way.
            cast = gpuarray.zeros((x_t.shape[1], Xt.shape[1]),
                                  dtype=theano.config.floatX)
            dLq_t = gpuarray.zeros(x_t.shape, dtype=theano.config.floatX)
            dLq_f = gpuarray.zeros(x_f.shape, dtype=theano.config.floatX)
            # Fill the two results one row at a time; the Xt and Xf halves
            # of the loop body are the same computation on different inputs.
            for i in range(Xt.shape[0]):
                S1 = misc.multiply(Xt[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xt[None, i, :], cast))
                dLq_t[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
                S1 = misc.multiply(Xf[None, i, :], A)
                S2 = misc.sum(S1, axis=1, keepdims=True)
                S2 = misc.multiply(S2, misc.add(Xf[None, i, :], cast))
                dLq_f[i, :] = misc.sum(misc.multiply(F[None, i, :], S1 - S2),
                                       axis=1)
            # Copy the results back to the host and publish them.
            outputs[0][0] = dalpha.get()
            outputs[1][0] = dLq_t.get()
            outputs[2][0] = dLq_f.get()
            # Flag every output of the node as computed.
            for v in node.outputs:
                compute_map[v][0] = True
Exemplo n.º 3
0
    def impl_test_binaryop_2d(self, dtype):
        """Compare the elementwise GPU binary operations with NumPy.

        Builds random scalar (0-d), vector (3,) and matrix (3, 2) operand
        pairs of the requested `dtype`, uploads them with `gpuarray.to_gpu`
        and asserts that `misc.add`, `misc.subtract`, `misc.multiply` and
        `misc.divide` match the corresponding NumPy results.

        Parameters
        ----------
        dtype : type
            NumPy scalar type of the operands. Subclasses of
            `numbers.Integral` take the integer path.
        """
        integral = issubclass(dtype, numbers.Integral)
        if integral:
            # Values come from [1, 10): no zero divisors.
            a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            a_vec = np.random.randint(1, 10, 3).astype(dtype)
            b_vec = np.random.randint(1, 10, 3).astype(dtype)
            a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
        else:
            a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
            b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
            a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)

        # Host operands and their device copies, in scalar/vector/matrix
        # order.
        host_pairs = [(a_sca, b_sca), (a_vec, b_vec), (a_mat, b_mat)]
        gpu_pairs = [(gpuarray.to_gpu(a), gpuarray.to_gpu(b))
                     for a, b in host_pairs]

        # addition
        for (a, b), (a_gpu, b_gpu) in zip(host_pairs, gpu_pairs):
            assert np.allclose(misc.add(a_gpu, b_gpu).get(), a + b)

        # subtract
        for (a, b), (a_gpu, b_gpu) in zip(host_pairs, gpu_pairs):
            assert np.allclose(misc.subtract(a_gpu, b_gpu).get(), a - b)

        # multiplication
        for (a, b), (a_gpu, b_gpu) in zip(host_pairs, gpu_pairs):
            assert np.allclose(misc.multiply(a_gpu, b_gpu).get(), a * b)

        # division
        for (a, b), (a_gpu, b_gpu) in zip(host_pairs, gpu_pairs):
            # `/` on integer arrays is true division under Python 3 and
            # yields floats. The GPU quotient of integral operands is
            # expected to stay integral, so use floor division as the host
            # reference on that path.
            expected = a // b if integral else a / b
            assert np.allclose(misc.divide(a_gpu, b_gpu).get(), expected)
Exemplo n.º 4
0
 def thunk():
     """Evaluate the node on the GPU and store one scalar result.

     Uses `inputs`, `outputs`, `node`, `compute_map`, `A`, `b`, `x`,
     `depth` and `self` from the enclosing scope. The summed value is
     copied back to the host into `outputs[0][0]`, and every node output
     is then marked as computed.
     """
     mix = gpuarray.to_gpu(np.squeeze(np.asarray(inputs[0]))[:, None])
     feats_t = gpuarray.to_gpu(np.asarray(inputs[1])[0, :, :])
     feats_f = gpuarray.to_gpu(np.asarray(inputs[2])[0, :, :])

     # exp(x . A + b) followed by division by the row sums (axis 1).
     probs_t = cumath.exp(misc.add(linalg.dot(feats_t, A), b))
     probs_f = cumath.exp(misc.add(linalg.dot(feats_f, A), b))
     probs_t = misc.divide(probs_t, misc.sum(probs_t, axis=1, keepdims=True))
     probs_f = misc.divide(probs_f, misc.sum(probs_f, axis=1, keepdims=True))

     # Mixture weighted by mix and (1 - mix).
     blend = misc.multiply(probs_t, mix) + misc.multiply(probs_f, 1 - mix)

     # Log of the mixture, centred by its row sum divided by self.n.
     log_blend = cumath.log(blend)
     log_offset = misc.sum(log_blend, axis=1, keepdims=True) / self.n
     centred = misc.subtract(log_blend, log_offset)

     first = misc.sum(x * centred, axis=1)
     second = (self.n + depth) * cumath.log(misc.sum(blend, axis=1))
     third = depth * log_offset
     outputs[0][0] = misc.sum(first - second + third).get()

     for out_var in node.outputs:
         compute_map[out_var][0] = True
Exemplo n.º 5
0
 def __rmul__(self, other):
     """Reflected multiplication: return ``cumisc.multiply(other, self)``."""
     product = cumisc.multiply(other, self)
     return product
 def __rdiv__(self, other):
     """Reflected division: return ``cumisc.divide(other, self)``."""
     return cumisc.divide(other, self)

 # Python 3 (and Python 2 under ``from __future__ import division``) looks up
 # ``__rtruediv__`` for ``other / self`` and never consults ``__rdiv__``, so
 # expose the same implementation under both names.
 __rtruediv__ = __rdiv__
Exemplo n.º 6
0
 def __mul__(self, other):
     """Multiplication: return ``cumisc.multiply(self, other)``."""
     product = cumisc.multiply(self, other)
     return product
 def __div__(self, other):
     """Division: return ``cumisc.divide(self, other)``."""
     return cumisc.divide(self, other)

 # Python 3 (and Python 2 under ``from __future__ import division``) looks up
 # ``__truediv__`` for ``self / other`` and never consults ``__div__``, so
 # expose the same implementation under both names.
 __truediv__ = __div__
Exemplo n.º 7
0
    def _impl_test_binaryop_2d(self, dtype):
        """Check the elementwise GPU binary operations against NumPy.

        Random scalar (0-d), vector (3,) and matrix (3, 2) operands of
        `dtype` are uploaded with `gpuarray.to_gpu`; the results of
        `misc.add`, `misc.subtract`, `misc.multiply` and `misc.divide` are
        compared with NumPy using the tolerances in `dtype_to_rtol` and
        `dtype_to_atol`. Integral dtypes are compared against floor
        division. Finally, adding a C-ordered and a Fortran-ordered matrix
        must raise `ValueError`.
        """
        integral = issubclass(dtype, numbers.Integral)

        if integral:
            a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            a_vec = np.random.randint(1, 10, 3).astype(dtype)
            b_vec = np.random.randint(1, 10, 3).astype(dtype)
            a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat_f = np.random.randint(1, 10, 6).reshape((3, 2))
            b_mat_f = b_mat_f.astype(dtype, order='F')
        else:
            a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            a_vec = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
            b_vec = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
            a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat_f = np.random.normal(scale=5.0, size=(3, 2))
            b_mat_f = b_mat_f.astype(dtype, order='F')

        a_sca_gpu = gpuarray.to_gpu(a_sca)
        b_sca_gpu = gpuarray.to_gpu(b_sca)
        a_vec_gpu = gpuarray.to_gpu(a_vec)
        b_vec_gpu = gpuarray.to_gpu(b_vec)
        a_mat_gpu = gpuarray.to_gpu(a_mat)
        b_mat_gpu = gpuarray.to_gpu(b_mat)
        b_mat_f_gpu = gpuarray.to_gpu(b_mat_f)

        def expect_close(device_result, host_result):
            # One comparison with the dtype-specific tolerances.
            assert_allclose(device_result.get(),
                            host_result,
                            rtol=dtype_to_rtol[dtype],
                            atol=dtype_to_atol[dtype])

        # addition
        expect_close(misc.add(a_sca_gpu, b_sca_gpu), a_sca + b_sca)
        expect_close(misc.add(a_vec_gpu, b_vec_gpu), a_vec + b_vec)
        expect_close(misc.add(a_mat_gpu, b_mat_gpu), a_mat + b_mat)

        # subtract
        expect_close(misc.subtract(a_sca_gpu, b_sca_gpu), a_sca - b_sca)
        expect_close(misc.subtract(a_vec_gpu, b_vec_gpu), a_vec - b_vec)
        expect_close(misc.subtract(a_mat_gpu, b_mat_gpu), a_mat - b_mat)

        # multiplication
        expect_close(misc.multiply(a_sca_gpu, b_sca_gpu), a_sca * b_sca)
        expect_close(misc.multiply(a_vec_gpu, b_vec_gpu), a_vec * b_vec)
        expect_close(misc.multiply(a_mat_gpu, b_mat_gpu), a_mat * b_mat)

        # division: floor division is the reference for integral dtypes
        if integral:
            expect_close(misc.divide(a_sca_gpu, b_sca_gpu), a_sca // b_sca)
            expect_close(misc.divide(a_vec_gpu, b_vec_gpu), a_vec // b_vec)
            expect_close(misc.divide(a_mat_gpu, b_mat_gpu), a_mat // b_mat)
        else:
            expect_close(misc.divide(a_sca_gpu, b_sca_gpu), a_sca / b_sca)
            expect_close(misc.divide(a_vec_gpu, b_vec_gpu), a_vec / b_vec)
            expect_close(misc.divide(a_mat_gpu, b_mat_gpu), a_mat / b_mat)

        # mismatched order
        assert_raises(ValueError, misc.add, a_mat_gpu, b_mat_f_gpu)
Exemplo n.º 8
0
    for step in xrange(N_TIMESTEPS):
        # print step
       # Implementing split-step method
       # Update wavefunction and reservoir, record density
        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        psi_gpu *= kineticFactorHalf_gpu
        cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

        # currentDensity_gpu = abs(psi_gpu) ** 2
        # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
        currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
        # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
        #                          currentDensity_gpu.gpudata, 1024)
        # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
        n_gpu *= cumath.exp(misc.add(- gammaRdt_gpu,
                                     - misc.multiply(Rdt_gpu, currentDensity_gpu)))
        n_gpu += Pdt_gpu
        psi_gpu *= cumath.exp(
            misc.add(
                misc.add(misc.multiply(expFactorPolFirst_gpu, n_gpu),
                         misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
                expFactorPolThird_gpu))

        #  psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
        #                             expFactorPolSecond, expFactorPolThird,
        #                             psi_gpu.gpudata, n_gpu.gpudata,
        #                             currentDensity_gpu.gpudata, 1024)

        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        # record spectrum
        drv.memcpy_dtod(spectrum[step, :].gpudata, psi_gpu[N//2, :].gpudata,
Exemplo n.º 9
0
        # print step
        # Implementing split-step method
        # Update wavefunction and reservoir, record density
        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        psi_gpu *= kineticFactorHalf_gpu
        cu_fft.ifft(psi_gpu, psi_gpu, plan_inverse, scale=True)

        # currentDensity_gpu = abs(psi_gpu) ** 2
        # currentDensity_gpu = psi_gpu.real **2 + psi_gpu.imag ** 2
        currentDensity_gpu = (psi_gpu * psi_gpu.conj()).real
        # modSquared.prepared_call(grid, block, psi_gpu.gpudata,
        #                          currentDensity_gpu.gpudata, 1024)
        # n_gpu *= cumath.exp(-gammaRdt_gpu + Rdt_gpu * currentDensity_gpu)
        n_gpu *= cumath.exp(
            misc.add(-gammaRdt_gpu,
                     -misc.multiply(Rdt_gpu, currentDensity_gpu)))
        n_gpu += Pdt_gpu
        psi_gpu *= cumath.exp(
            misc.add(
                misc.add(
                    misc.multiply(expFactorPolFirst_gpu, n_gpu),
                    misc.multiply(expFactorPolSecond_gpu, currentDensity_gpu)),
                expFactorPolThird_gpu))

        #  psiNonlinear.prepared_call(grid, block, expFactorPolFirst,
        #                             expFactorPolSecond, expFactorPolThird,
        #                             psi_gpu.gpudata, n_gpu.gpudata,
        #                             currentDensity_gpu.gpudata, 1024)

        cu_fft.fft(psi_gpu, psi_gpu, plan_forward)
        # record spectrum
Exemplo n.º 10
0
def fft_gpu(window_a, search_area):
    """
    Cross-correlate two stacks of windows with batched FFTs on the GPU.

    For every window pair this computes the inverse FFT of
    FFT(search_area) * conj(FFT(window_a)) and shifts the zero-frequency
    component to the centre of each 2D result.

    Inputs:
        window_a: 3D numpy array
            Stack of interrogation windows of the first frame,
            output from the window slice function.
        search_area: 3D numpy array
            Stack of interrogation windows of the second frame,
            output from the window slice function.
    Outputs:
        corr_gpu: 3D numpy array
            Stack of correlation functions, one per window pair.
    """
    n_windows, height, width = np.array(window_a.shape).astype(np.int32)
    frame_a = window_a.astype(np.float32)
    frame_b = search_area.astype(np.float32)

    # Upload both stacks to the device.
    d_frame_a = gpuarray.to_gpu(frame_a)
    d_frame_b = gpuarray.to_gpu(frame_b)

    # Output buffers: one real result and two half-spectrum arrays
    # (last axis of length width // 2 + 1).
    spectrum_shape = (n_windows, height, width // 2 + 1)
    d_result = gpuarray.empty_like(d_frame_a)
    d_spec_a = gpuarray.empty(spectrum_shape, np.complex64)
    d_spec_b = gpuarray.empty(spectrum_shape, np.complex64)

    # Forward transforms of both frames share a single plan.
    forward_plan = cu_fft.Plan((height, width),
                               np.float32,
                               np.complex64,
                               batch=n_windows)
    cu_fft.fft(d_frame_a, d_spec_a, forward_plan)
    cu_fft.fft(d_frame_b, d_spec_b, forward_plan)

    # Multiply the second spectrum by the conjugate of the first.
    d_spec_a = d_spec_a.conj()
    d_product = cu_misc.multiply(d_spec_b, d_spec_a)

    # Inverse transform; the trailing True is passed positionally
    # (presumably the `scale` flag of cu_fft.ifft -- TODO confirm).
    inverse_plan = cu_fft.Plan((height, width),
                               np.complex64,
                               np.float32,
                               batch=n_windows)
    cu_fft.ifft(d_product, d_result, inverse_plan, True)

    # Copy the result to the host and centre it along axes 1 and 2.
    corr_gpu = d_result.get().real
    corr_gpu = fftshift(corr_gpu, axes=(1, 2))

    # Explicitly free the device buffers.
    for d_buffer in (d_frame_a, d_frame_b, d_spec_a, d_result, d_spec_b,
                     d_product):
        d_buffer.gpudata.free()

    return corr_gpu
Exemplo n.º 11
0
    def _impl_test_binaryop_2d(self, dtype):
        """Verify misc.add/subtract/multiply/divide against NumPy.

        Scalar (0-d), vector (3,) and matrix (3, 2) operands of `dtype` are
        generated at random, uploaded with `gpuarray.to_gpu`, and each GPU
        result is compared with its NumPy counterpart using the tolerances
        from `dtype_to_rtol` and `dtype_to_atol`. Integral dtypes use floor
        division as the reference. The last check asserts that adding
        matrices with mismatched memory order raises `ValueError`.
        """
        is_integral = issubclass(dtype, numbers.Integral)

        if is_integral:
            a_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            b_sca = np.array(np.random.randint(1, 10), dtype=dtype)
            a_vec = np.random.randint(1, 10, 3).astype(dtype)
            b_vec = np.random.randint(1, 10, 3).astype(dtype)
            a_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat = np.random.randint(1, 10, 6).reshape((3, 2)).astype(dtype)
            b_mat_f = np.random.randint(1, 10, 6).reshape((3, 2)).astype(
                dtype, order='F')

            def host_divide(lhs, rhs):
                return lhs // rhs
        else:
            a_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            b_sca = np.random.normal(scale=5.0, size=()).astype(dtype)
            a_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
            b_vec = np.random.normal(scale=5.0, size=(3,)).astype(dtype)
            a_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat = np.random.normal(scale=5.0, size=(3, 2)).astype(dtype)
            b_mat_f = np.random.normal(scale=5.0, size=(3, 2)).astype(
                dtype, order='F')

            def host_divide(lhs, rhs):
                return lhs / rhs

        a_sca_gpu = gpuarray.to_gpu(a_sca)
        b_sca_gpu = gpuarray.to_gpu(b_sca)
        a_vec_gpu = gpuarray.to_gpu(a_vec)
        b_vec_gpu = gpuarray.to_gpu(b_vec)
        a_mat_gpu = gpuarray.to_gpu(a_mat)
        b_mat_gpu = gpuarray.to_gpu(b_mat)
        b_mat_f_gpu = gpuarray.to_gpu(b_mat_f)

        # (device lhs, device rhs, host lhs, host rhs) per operand shape.
        operands = (
            (a_sca_gpu, b_sca_gpu, a_sca, b_sca),
            (a_vec_gpu, b_vec_gpu, a_vec, b_vec),
            (a_mat_gpu, b_mat_gpu, a_mat, b_mat),
        )

        # (GPU operation, host reference), checked in this order:
        # addition, subtraction, multiplication, division.
        operations = (
            (misc.add, lambda lhs, rhs: lhs + rhs),
            (misc.subtract, lambda lhs, rhs: lhs - rhs),
            (misc.multiply, lambda lhs, rhs: lhs * rhs),
            (misc.divide, host_divide),
        )

        for gpu_op, host_op in operations:
            for lhs_gpu, rhs_gpu, lhs, rhs in operands:
                assert_allclose(gpu_op(lhs_gpu, rhs_gpu).get(),
                                host_op(lhs, rhs),
                                rtol=dtype_to_rtol[dtype],
                                atol=dtype_to_atol[dtype])

        # mismatched order
        assert_raises(ValueError, misc.add, a_mat_gpu, b_mat_f_gpu)