示例#1
0
def zoom_numbaThread(data,  chunkIndices, zoomArray):
    """
    Bilinear 2-D zoom of ``data`` into a band of rows of ``zoomArray``.

    Written as plain Python loops so it can be compiled with numba. The zoom
    level is not passed in; it follows from the shapes of the two arrays.
    Only output rows ``chunkIndices[0]`` up to (excluding) ``chunkIndices[1]``
    are written; every column of those rows is filled.

    Parameters:
        data (ndarray): The 2-D array to zoom
        chunkIndices: Indexable pair ``(first, last)`` of output row indices,
            ``last`` exclusive
        zoomArray (ndarray): The 2-D array that receives the result

    Returns:
        zoomArray (ndarray): The ``zoomArray`` object that was passed in
    """
    firstRow = chunkIndices[0]
    lastRow = chunkIndices[1]

    for row in range(firstRow, lastRow):
        # Fractional source row. The divisor is slightly larger than
        # (rows - 1), presumably so the last output row lands just below the
        # last source row and ``rowLo + 1`` stays in range.
        srcRow = row*numba.float32(data.shape[0]-1)/(zoomArray.shape[0]-0.99999999)
        rowLo = numba.int32(srcRow)
        for col in range(zoomArray.shape[1]):
            srcCol = col*numba.float32(data.shape[1]-1)/(zoomArray.shape[1]-0.99999999)
            colLo = numba.int32(srcCol)

            # Interpolate along the first axis on the two bracketing columns
            lower = data[rowLo, colLo] + (data[rowLo+1, colLo] - data[rowLo, colLo])*(srcRow-rowLo)
            upper = data[rowLo, colLo+1] + (data[rowLo+1, colLo+1] - data[rowLo, colLo+1])*(srcRow-rowLo)

            # ...then along the second axis between those two results
            zoomArray[row, col] = lower + (upper - lower)*(srcCol-colLo)

    return zoomArray
示例#2
0
        def preCalc(y, yA, yB, numDataPoints):
            """Write ``float32(1.001 * i)`` to ``y``, ``yA`` and ``yB`` at this thread's index.

            ``yA`` receives the value multiplied by 1.0 and ``yB`` the value
            divided by 1.0. ``i`` is the 1-D CUDA grid index of the thread.
            """
            tid = cuda.grid(1)
            # Computed as in the original kernel; the result is never read.
            k = tid % numDataPoints

            value = float32(1.001 * float32(tid))

            y[tid] = value
            yA[tid] = value * 1.0
            yB[tid] = value / 1.0
示例#3
0
        def cu_square_matrix_mul(A, B, C):
            """Tiled matrix product kernel: accumulate ``A`` times ``B`` into ``C``.

            ``n``, ``tpb``, ``bpg`` and ``SM_SIZE`` come from the enclosing
            scope. Each thread owns element ``C[y, x]`` and sums, over ``bpg``
            tiles, the products of the tiles staged in shared memory.
            """
            tileA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            tileB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y

            col = tx + cuda.blockIdx.x * cuda.blockDim.x
            row = ty + cuda.blockIdx.y * cuda.blockDim.y
            # The bounds test does not change during the kernel, evaluate once.
            inside = col < n and row < n

            total = float32(0)  # forces all the math to be f32
            for tile in range(bpg):
                if inside:
                    tileA[ty, tx] = A[row, tx + tile * tpb]
                    tileB[ty, tx] = B[ty + tile * tpb, col]

                # Every thread reaches both barriers, in or out of bounds.
                cuda.syncthreads()

                if inside:
                    for k in range(tpb):
                        total += tileA[ty, k] * tileB[k, tx]

                cuda.syncthreads()

            if inside:
                C[row, col] = total
示例#4
0
文件: test_ufunc.py 项目: esc/numba
    def _test_broadcasting(self, cls, a, b, c, d):
        "Build a four-argument float32 ufunc with ``cls`` and compare it to ``a + b + c + d``."
        builder = cls(add_multiple_args)
        signature = float32(float32, float32, float32, float32)
        builder.add(signature)
        compiled = builder.build_ufunc()

        context = (cls, a.shape)
        actual = compiled(a, b, c, d)
        expected = a + b + c + d
        self.assertPreciseEqual(actual, expected, msg=context)
示例#5
0
    def _test_broadcasting(self, cls, a, b, c, d):
        "Build a four-argument float32 ufunc with ``cls`` and check it equals ``a + b + c + d`` elementwise."
        builder = cls(add_multiple_args)
        signature = float32(float32, float32, float32, float32)
        builder.add(signature)
        compiled = builder.build_ufunc()

        context = (cls, a.shape)
        matches = compiled(a, b, c, d) == a + b + c + d
        self.assertTrue(np.all(matches), context)
示例#6
0
    def test_implicit_broadcasting(self):
        """For every vectorizer, a compiled ``add`` must broadcast like ``+``.

        ``b`` is reshaped with three extra length-1 axes before the call; the
        inputs ``a``, ``b`` and ``vectorizers`` come from module scope.
        """
        for builder_cls in vectorizers:
            builder = builder_cls(add)
            builder.add(float32(float32, float32))
            compiled = builder.build_ufunc()

            expanded_b = b[np.newaxis, :, np.newaxis, np.newaxis, :]
            matches = compiled(a, expanded_b) == a + expanded_b
            self.assertTrue(np.all(matches))
示例#7
0
文件: test_ufunc.py 项目: esc/numba
    def _test_ufunc_attributes(self, cls, a, b, *args):
        "Check call, reduce, accumulate and outer of a compiled ``add`` against NumPy."
        builder = cls(add, *args)
        builder.add(float32(float32, float32))
        compiled = builder.build_ufunc()

        context = (cls, a.ndim)

        called = compiled(a, b)
        self.assertPreciseEqual(called, a + b, msg=context)

        reduced = ufunc_reduce(compiled, a)
        self.assertPreciseEqual(reduced, np.sum(a), msg=context)

        accumulated = compiled.accumulate(a)
        self.assertPreciseEqual(accumulated, np.add.accumulate(a), msg=context)

        outer = compiled.outer(a, b)
        self.assertPreciseEqual(outer, np.add.outer(a, b), msg=context)
示例#8
0
    def _test_ufunc_attributes(self, cls, a, b, *args):
        "Check call, reduce, accumulate and outer of a compiled ``add`` for equality with NumPy."
        builder = cls(add, *args)
        builder.add(float32(float32, float32))
        compiled = builder.build_ufunc()

        context = (cls, a.ndim)

        call_ok = np.all(compiled(a, b) == a + b)
        self.assertTrue(call_ok, context)

        reduce_ok = ufunc_reduce(compiled, a) == np.sum(a)
        self.assertTrue(reduce_ok, context)

        accumulate_ok = np.all(compiled.accumulate(a) == np.add.accumulate(a))
        self.assertTrue(accumulate_ok, context)

        outer_ok = np.all(compiled.outer(a, b) == np.add.outer(a, b))
        self.assertTrue(outer_ok, context)
示例#9
0
def raycast(sx, sy, camera, world):
    """Step a ray along x until it leaves ``world``; always returns ``(0, 0, 0)``.

    ``sx`` and ``sy`` are scaled by ``* 2 - 1`` and combined with the camera
    plane vectors to form the ray direction. The loop below only advances the
    x tile index; the y quantities are computed but not consumed.
    """
    fx = nb.float32(sx * 2 - 1)
    fy = nb.float32(sy * 2 - 1)
    dx = nb.float32(camera.plane_offset.x + camera.plane_x_size.x * fx + camera.plane_y_size.x * fy)
    dy = nb.float32(camera.plane_offset.y + camera.plane_x_size.y * fx + camera.plane_y_size.y * fy)

    # A zero direction component gets an infinite step length.
    if dx != 0:
        ddx = nb.float32(abs(1 / dx))
    else:
        ddx = nb.float32(np.inf)
    if dy != 0:
        ddy = nb.float32(abs(1 / dy))
    else:
        ddy = nb.float32(np.inf)

    # Integer tile and fractional offset of the camera position.
    tx = int(camera.pos.x // 1)
    ty = int(camera.pos.y // 1)
    ox = camera.pos.x % 1
    oy = camera.pos.y % 1

    # Step direction per axis and scaled initial offset.
    sx = nb.cuda.selp(dx < 0, -1, 1)
    ox = nb.cuda.selp(dx < 0, ox, (1 - ox)) * ddx
    sy = nb.cuda.selp(dy < 0, -1, 1)
    oy = nb.cuda.selp(dy < 0, oy, (1 - oy)) * ddy

    while True:
        ox += ddx
        tx += sx
        if not (0 <= tx < world.shape[0]):
            break
    return 0, 0, 0
示例#10
0
 def as_soft_penalty(self):
     """Build three numba-compiled closures for this constraint.

     Returns:
         tuple: ``(penalty, dpenalty, dpenalty_money)``. Each closure takes a
         1-D ``float32`` or ``float64`` array ``x`` plus scalar parameters;
         ``penalty`` returns a scalar, the other two return an array shaped
         like ``x`` that is zero except at ``i_num`` and ``i_den``.
     """
     from numba import njit, float32, float64
     from ..numba.model import softplus, d_softplus
     # Copy the attributes into locals; the closures below capture these
     # names rather than ``self``.
     i_num = self.i_num
     i_den = self.i_den
     cmin_num = self.cmin_num
     cmin_den = self.cmin_den
     cmax_num = self.cmax_num
     cmax_den = self.cmax_den
     scale = self.scale
     @njit([
         float32(float32[:], float32, float32),
         float64(float64[:], float64, float64),
     ])
     def penalty(x, intensity, sharpness=1.0):
         # Two linear forms in x[i_num] and x[i_den]; the smaller one is
         # scaled by ``scale * intensity`` and passed through ``softplus``.
         _min = x[i_num] * cmin_num + x[i_den] * cmin_den
         _max = x[i_num] * cmax_num + x[i_den] * cmax_den
         return -softplus(-np.minimum(_min, _max) * scale * intensity, sharpness)
     @njit([
         float32[:](float32[:], float32, float32),
         float64[:](float64[:], float64, float64),
     ])
     def dpenalty(x, intensity, sharpness=1.0):
         # Array of zeros with two entries filled from whichever linear form
         # is currently the smaller one.
         j = np.zeros_like(x)
         _min = x[i_num] * cmin_num + x[i_den] * cmin_den
         _max = x[i_num] * cmax_num + x[i_den] * cmax_den
         # NOTE(review): here ``scale * intensity`` multiplies the second
         # argument of ``d_softplus``, whereas ``penalty`` multiplies the
         # first argument of ``softplus`` - confirm against those helpers.
         partial = d_softplus(-np.minimum(_min, _max), sharpness * scale * intensity) * scale * intensity
         if _min < _max:
             j[i_num] = cmin_num * partial
             j[i_den] = cmin_den * partial
         else:
             j[i_num] = cmax_num * partial
             j[i_den] = cmax_den * partial
         return j
     @njit([
         float32[:](float32[:], float32),
         float64[:](float64[:], float64),
     ])
     def dpenalty_money(x, intensity):
         # Non-zero only when one of the linear forms is within 1e-5 of zero;
         # ``_min`` is tested first. The factor is then fixed at
         # ``0.5 * scale * intensity``.
         j = np.zeros_like(x)
         _min = x[i_num] * cmin_num + x[i_den] * cmin_den
         _max = x[i_num] * cmax_num + x[i_den] * cmax_den
         if np.absolute(_min) < 1e-5:
             partial = 0.5 * scale * intensity
             j[i_num] = cmin_num * partial
             j[i_den] = cmin_den * partial
         elif np.absolute(_max) < 1e-5:
             partial = 0.5 * scale * intensity
             j[i_num] = cmax_num * partial
             j[i_den] = cmax_den * partial
         return j
     return penalty, dpenalty, dpenalty_money
示例#11
0
    def execute(self, input):
        """Apply automatic gain control to ``input`` and return a new array.

        A running average amplitude is kept in ``self.pa`` across calls. Each
        sample is scaled towards a fixed target amplitude and then clipped if
        its squared magnitude still exceeds the clip threshold.
        """
        attack = nb.float32(0.002)
        release = nb.float32(0.0002)
        clipthreshold = nb.float32(0.9)
        amplitude = nb.float32(0.25)

        average = self.pa
        result = np.zeros_like(input)

        for position, sample in enumerate(input):
            # Track the amplitude rather than the power (amplitude^2), so
            # that short, high amplitude peaks affect the AGC less.
            deviation = np.abs(sample) - average
            # Rising levels use the attack rate, falling levels the release rate.
            average += deviation * (attack if deviation >= 0 else release)

            # Normalize the amplitude
            if average > 0:
                sample *= amplitude / average
            else:
                # this shouldn't happen often
                sample = 0

            # Some samples may still be above 1, so clip them
            power = sample.real ** 2 + sample.imag ** 2
            if power > clipthreshold:
                sample *= np.sqrt(clipthreshold / power)

            result[position] = sample

        self.pa = average
        return result
示例#12
0
def cu_sigm_cfe_post(cmin, cmax, midpoint, a, sigma):
    "Construct CUDA device function for Sigmoidal coupling function."
    # Cast every parameter to float32 before the closure captures it.
    cmin = float32(cmin)
    cmax = float32(cmax)
    midpoint = float32(midpoint)
    a = float32(a)
    sigma = float32(sigma)
    from math import exp

    @cuda.jit(device=True)
    def cfe(gx):
        # cmin + (cmax - cmin) / (1 + exp(-a * (gx - midpoint) / sigma))
        scaled = (gx - midpoint) / sigma
        return cmin + ((cmax - cmin) / (1.0 + exp(-a * scaled)))

    return cfe
示例#13
0
    def test_cuda_vectorize_device_call(self):
        """A CUDA-target ufunc may call a CUDA device function.

        The ufunc computes ``x**y / z`` on random float32 inputs and is
        compared with the same expression evaluated by NumPy.
        """
        @cuda.jit(float32(float32, float32, float32), device=True)
        def cu_device_fn(x, y, z):
            return x**y / z

        def cu_ufunc(x, y, z):
            return cu_device_fn(x, y, z)

        build = vectorize([float32(float32, float32, float32)],
                          target='cuda')
        ufunc = build(cu_ufunc)

        N = 100

        X = np.random.sample(N).astype(np.float32)
        Y = np.random.sample(N).astype(np.float32)
        # Offset keeps the divisor away from zero.
        Z = np.random.sample(N).astype(np.float32) + 0.1

        result = ufunc(X, Y, Z)
        expected = (X**Y) / Z

        self.assertTrue(np.allclose(result, expected))
示例#14
0
def cu_simple_cfun(offset, cvar):
    "Construct CUDA device function for simple summation coupling."

    offset = float32(offset)

    @cuda.jit(device=True)
    def cfun(weights, state, i_post, i_thread):  # 2*n reads
        # Weighted sum over the first axis of ``state`` of the selected
        # variable plus ``offset``, using row ``i_post`` of ``weights``.
        n_source = state.shape[0]
        total = float32(0.0)
        for source in range(n_source):
            total += weights[i_post, source] * (state[source, cvar, i_thread] + offset)
        return total

    return cfun
示例#15
0
def _savi_gpu(nir_data, red_data, soil_factor, out):
    """CUDA kernel: per-pixel index from near-infrared and red bands.

    With ``L = soil_factor[0]`` each in-range pixel gets
    ``(nir - red) / ((nir + red + L) * (1 + L))``, or NaN when that
    denominator is zero.

    NOTE(review): the commonly cited SAVI formula multiplies by ``(1 + L)``
    instead of dividing by it - confirm this form is intended.
    """
    y, x = cuda.grid(2)
    # Threads outside the output raster have nothing to do.
    if y >= out.shape[0] or x >= out.shape[1]:
        return

    nir = nir_data[y, x]
    red = red_data[y, x]
    soil = soil_factor[0]
    denominator = (nir + red + soil) * (nb.float32(1.0) + soil)

    if denominator == 0.0:
        out[y, x] = np.nan
    else:
        out[y, x] = (nir - red) / denominator
示例#16
0
def conv2d(arr, il, ir, ao, fin, fl, fr, dlin, dli):
    """CUDA kernel: accumulate a 3x3 dilated filter response into ``arr[ao]``.

    The input planes are ``arr[il:ir]``, the filter weights ``fin[fl:fr]``
    (nine consecutive weights per input plane) and the per-plane dilations
    ``dlin[dli]``. The result is added to the existing contents of the output
    plane. Neighbour indices that fall outside the plane are mirrored back
    inside using the formulas below.
    """
    inp = arr[il:ir]
    out = arr[ao]
    f = fin[fl:fr]
    dl = dlin[dli]
    # shape=0 requests dynamically sized shared memory; its size is supplied
    # at kernel launch and must hold at least ``f.size`` float32 values.
    fshared = cuda.shared.array(shape=0, dtype=float32)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bdx = cuda.blockDim.x
    bdy = cuda.blockDim.y
    # Linear thread id inside the block and number of threads per block.
    tid = ty*bdx+tx
    nth = bdx*bdy
    # The threads of the block copy the filter into shared memory together,
    # each handling every ``nth``-th element.
    for i in range(tid,f.size,nth):
        fshared[i] = f[i]
    cuda.syncthreads()
    # Last dilation seen; the neighbour offsets are recomputed only when the
    # dilation changes between planes.
    # NOTE(review): assumes dl[0] != -1, otherwise xl/xr/yl/yr are unset.
    do=-1
    xc,yc = cuda.grid(2)
    if xc<out.shape[0] and yc<out.shape[1]:
        tmp = float32(0)
        idx = int32(0)
        for j in range(inp.shape[0]):
            if do!=dl[j]:
                do=dl[j]
                d=dl[j]
                # Lower neighbour along axis 0, mirrored when below 0.
                if xc>=d:
                    xl = xc-d
                else:
                    xl = d-xc
                # Upper neighbour along axis 0, mirrored when past the end.
                if xc<out.shape[0]-d:
                    xr = xc+d
                else:
                    xr = 2*out.shape[0] - (xc+d + 2)
                # Same for axis 1.
                if yc>=d:
                    yl = yc-d
                else:
                    yl = d-yc
                if yc<out.shape[1]-d:
                    yr = yc+d
                else:
                    yr = 2*out.shape[1] - (yc+d + 2)
            # Nine taps of plane j; weights are consumed in row-major order.
            tmp = cuda.fma(inp[j,xl,yl],fshared[idx], tmp)
            tmp = cuda.fma(inp[j,xl,yc],fshared[idx+1], tmp)
            tmp = cuda.fma(inp[j,xl,yr],fshared[idx+2], tmp)
            tmp = cuda.fma(inp[j,xc,yl],fshared[idx+3], tmp)
            tmp = cuda.fma(inp[j,xc,yc],fshared[idx+4], tmp)
            tmp = cuda.fma(inp[j,xc,yr],fshared[idx+5], tmp)
            tmp = cuda.fma(inp[j,xr,yl],fshared[idx+6], tmp)
            tmp = cuda.fma(inp[j,xr,yc],fshared[idx+7], tmp)
            tmp = cuda.fma(inp[j,xr,yr],fshared[idx+8], tmp)
            idx+=9
        # Added to, not assigned over, the current output value.
        out[xc,yc] += tmp
class Smart2FluxDelimiter(FluxDelimiter):
    """Piecewise flux delimiter evaluated elementwise through ``vectorize``."""

    @staticmethod
    @vectorize([float32(float32, float32,float32),
                float64(float64, float64,float64)])
    def __call__(phi_p, tetha_f, tetha_p):
        # The pieces are tried from the smallest threshold of phi_p upwards.
        if phi_p<=tetha_p/3:
            ratio = tetha_f/tetha_p
            return ratio*phi_p*(1-3*tetha_p+2*tetha_f)/(1-tetha_p)
        if phi_p<=tetha_p/tetha_f*(1+tetha_f-tetha_p):
            slope = (tetha_f/tetha_p)*((1-tetha_f)/(1-tetha_p))
            intercept = (tetha_f/(1-tetha_p))*(tetha_f-tetha_p)
            return slope*phi_p + intercept
        if phi_p<=1:
            return 1.
        # Above 1 the input is passed through unchanged.
        return phi_p
示例#18
0
class MinMod2FluxDelimiter(FluxDelimiter):
    """Piecewise-linear flux delimiter evaluated elementwise through ``vectorize``."""

    @staticmethod
    @vectorize([
        float32(float32, float32, float32),
        float64(float64, float64, float64)
    ])
    def __call__(phi_p, tetha_f, tetha_p):
        # First piece: line through the origin with slope tetha_f / tetha_p.
        if phi_p <= tetha_p:
            return tetha_f / tetha_p * phi_p
        # Second piece, used up to phi_p == 1.
        if phi_p <= 1:
            numerator = (1 - tetha_f) * phi_p + (tetha_f - tetha_p)
            return numerator / (1 - tetha_p)
        # Above 1 the input is passed through unchanged.
        return phi_p
示例#19
0
    def test_wrapper_address_protocol_libm(self):
        """Call cos and sinf from standard math library.

        A small ``WrapperAddressProtocol`` implementation exposes the address
        and signature of a C math function so that jitted code can call it as
        a first-class function.
        """
        import os
        import ctypes.util

        class LibM(types.WrapperAddressProtocol):
            def __init__(self, fname):
                # msvcrt supplies the math functions on Windows; elsewhere
                # the library is located by name.
                if os.name == 'nt':
                    lib = ctypes.cdll.msvcrt
                else:
                    libpath = ctypes.util.find_library('m')
                    lib = ctypes.cdll.LoadLibrary(libpath)
                self.lib = lib
                self._name = fname
                if fname == 'cos':
                    addr = ctypes.cast(self.lib.cos, ctypes.c_voidp).value
                    signature = float64(float64)
                elif fname == 'sinf':
                    addr = ctypes.cast(self.lib.sinf, ctypes.c_voidp).value
                    signature = float32(float32)
                else:
                    # No signature is known for an unsupported name, so the
                    # message must not reference one (doing so raised
                    # UnboundLocalError instead of NotImplementedError).
                    raise NotImplementedError(
                        f'wrapper address of `{fname}`')
                self._signature = signature
                self._address = addr

            def __repr__(self):
                return f'{type(self).__name__}({self._name!r})'

            def __wrapper_address__(self):
                return self._address

            def signature(self):
                return self._signature

        mycos = LibM('cos')
        mysin = LibM('sinf')

        def myeval(f, x):
            return f(x)

        # Not testing forceobj=True as it requires implementing
        # LibM.__call__ using ctypes which would be out-of-scope here.
        for jit_opts in [dict(nopython=True)]:
            jit_ = jit(**jit_opts)
            with self.subTest(jit=jit_opts):
                self.assertEqual(jit_(myeval)(mycos, 0.0), 1.0)
                self.assertEqual(jit_(myeval)(mysin, float32(0.0)), 0.0)
示例#20
0
 def rk4_rV(it, nrV, rti, Vti, o_tau, pi, tau, Delta, eta, J, I, cr, rc, cv,
            Vc, r_sigma, V_sigma, z0, z1):
     """Advance ``(r, V)`` by one fourth-order Runge-Kutta step with added noise.

     The derivatives come from ``dr_`` and ``dV_``; ``dt``, ``o_6`` and
     ``sqrt_dt`` are read from the enclosing scope. The new values are written
     to ``nrV[0, it]`` and ``nrV[1, it]``; the first is zeroed when it is not
     positive.
     """
     half = nb.float32(0.5)
     full = nb.float32(1.0)

     # Stage 0: slope at the current state.
     dr_0 = dr_(rti, Vti, o_tau, pi, tau, Delta)
     dV_0 = dV_(rti, Vti, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

     # Stage 1: slope at the half step taken with the stage-0 slope.
     r_1 = rti + dt * half * dr_0
     V_1 = Vti + dt * half * dV_0
     dr_1 = dr_(r_1, V_1, o_tau, pi, tau, Delta)
     dV_1 = dV_(r_1, V_1, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

     # Stage 2: slope at the half step taken with the stage-1 slope.
     r_2 = rti + dt * half * dr_1
     V_2 = Vti + dt * half * dV_1
     dr_2 = dr_(r_2, V_2, o_tau, pi, tau, Delta)
     dV_2 = dV_(r_2, V_2, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

     # Stage 3: slope at the full step taken with the stage-2 slope.
     r_3 = rti + dt * full * dr_2
     V_3 = Vti + dt * full * dV_2
     dr_3 = dr_(r_3, V_3, o_tau, pi, tau, Delta)
     dV_3 = dV_(r_3, V_3, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

     r_slope = dr_0 + 2 * (dr_1 + dr_2) + dr_3
     nrV[0, it] = rti + o_6 * dt * r_slope + sqrt_dt * r_sigma * z0
     # Multiplying by the comparison result zeroes non-positive values.
     nrV[0, it] *= nrV[0, it] > 0

     V_slope = dV_0 + 2 * (dV_1 + dV_2) + dV_3
     nrV[1, it] = Vti + o_6 * dt * V_slope + sqrt_dt * V_sigma * z1
示例#21
0
def ax2hoL(axIn,p=P):
  """Convert each 4-component row of the input into a 3-component row.

  The input is passed through ``prepIn`` and ``axnormL``. For every row the
  first three components are multiplied by
  ``(0.75 * (w - sin(w))) ** (1/3)``, where ``w`` is the fourth component.
  Presumably this is an axis-angle to homochoric conversion, going by the
  function name - confirm.
  """
  # Sign factor derived from ``p``; computed but not used below.
  pf = numba.float32(p > 0) * 2.0 - 1.0
  ax,m,n,intype = prepIn(axIn)
  axn = axnormL(ax)
  ho = np.zeros((n,3),dtype=intype)
  for row in numba.prange(n):
    w = axn[row,3]
    factor = (0.75 * (w - np.sin(w))) ** (1.0 / 3.0)
    for col in range(3):
      ho[row,col] = factor * axn[row,col]
  return ho
示例#22
0
    def test_cuda_vectorize_device_call(self):
        """Check that a CUDA-target ufunc can call a CUDA device function.

        Random float32 inputs are pushed through ``x ** y / z`` on the device
        and compared with NumPy's result for the same expression.
        """

        @cuda.jit(float32(float32, float32, float32), device=True)
        def cu_device_fn(x, y, z):
            return x ** y / z

        def cu_ufunc(x, y, z):
            return cu_device_fn(x, y, z)

        signatures = [float32(float32, float32, float32)]
        ufunc = vectorize(signatures, target='cuda')(cu_ufunc)

        N = 100

        X = np.random.sample(N).astype(np.float32)
        Y = np.random.sample(N).astype(np.float32)
        # Offset keeps the divisor away from zero.
        Z = np.random.sample(N).astype(np.float32) + 0.1

        device_result = ufunc(X, Y, Z)
        host_result = (X ** Y) / Z

        self.assertTrue(np.allclose(device_result, host_result))
示例#23
0
def cu_sums4(nme, member, vel, virial_potential, coll, nblocks):
    """CUDA kernel: per-block sums of six per-particle quantities.

    For each of the first ``nme`` entries of ``member`` the kernel reads the
    particle's row of ``vel`` and ``virial_potential`` and accumulates
    ``m * |v|^2``, the two ``virial_potential`` components and the three
    components of ``m * v`` (``m`` being ``vel[idx][3]``). Each block writes
    its six partial sums to ``coll`` at ``blockIdx.x + k * nblocks``.

    NOTE(review): the shared buffer and segment offsets are hard-coded for
    256 threads; the halving loop presumably expects ``blockDim.x`` to be a
    power of two no larger than 256 - confirm against the launch code.
    """
    # Offsets of the six 256-element segments inside the shared buffer.
    block_size = 256
    block_size2 = block_size * 2
    block_size3 = block_size * 3
    block_size4 = block_size * 4
    block_size5 = block_size * 5

    sm = cuda.shared.array(256 * 6, nb.float32)
    i = cuda.grid(1)
    tx = cuda.threadIdx.x

    # Threads past ``nme`` keep these zeros and so contribute nothing.
    temp = nb.float32(0.0)
    virial = nb.float32(0.0)
    potential = nb.float32(0.0)
    mx = nb.float32(0.0)
    my = nb.float32(0.0)
    mz = nb.float32(0.0)

    if i < nme:
        idx = member[i]
        vi = vel[idx]
        mi = vi[3]
        vp = virial_potential[idx]
        temp = mi * (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2])
        virial = vp[0]
        potential = vp[1]
        mx = vi[0] * mi
        my = vi[1] * mi
        mz = vi[2] * mi

    # Stage every thread's values before the reduction starts.
    sm[tx] = temp
    sm[tx + block_size] = virial
    sm[tx + block_size2] = potential
    sm[tx + block_size3] = mx
    sm[tx + block_size4] = my
    sm[tx + block_size5] = mz
    cuda.syncthreads()

    # Pairwise reduction: the active range halves on every pass and all
    # threads hit the barrier at the end of each pass.
    offs = cuda.blockDim.x >> 1
    while offs > 0:
        if tx < offs:
            sm[tx] += sm[tx + offs]
            sm[tx + block_size] += sm[tx + block_size + offs]
            sm[tx + block_size2] += sm[tx + block_size2 + offs]
            sm[tx + block_size3] += sm[tx + block_size3 + offs]
            sm[tx + block_size4] += sm[tx + block_size4 + offs]
            sm[tx + block_size5] += sm[tx + block_size5 + offs]
        offs >>= 1
        cuda.syncthreads()

    # Thread 0 publishes the block totals, one stride of ``nblocks`` apart.
    if tx == 0:
        coll[cuda.blockIdx.x] = sm[0]
        coll[cuda.blockIdx.x + nblocks] = sm[block_size]
        coll[cuda.blockIdx.x + nblocks * 2] = sm[block_size2]
        coll[cuda.blockIdx.x + nblocks * 3] = sm[block_size3]
        coll[cuda.blockIdx.x + nblocks * 4] = sm[block_size4]
        coll[cuda.blockIdx.x + nblocks * 5] = sm[block_size5]
示例#24
0
    def test_cuda_vectorize_device_call(self):
        """Vectorize ``cu_ufunc`` for CUDA and compare it with ``X ** Y / Z``.

        ``cu_ufunc`` is defined outside this method.
        """
        signatures = [float32(float32, float32, float32)]
        ufunc = vectorize(signatures, target='cuda')(cu_ufunc)

        N = 100

        X = np.random.sample(N).astype(np.float32)
        Y = np.random.sample(N).astype(np.float32)
        # Offset keeps the divisor away from zero.
        Z = np.random.sample(N).astype(np.float32) + 0.1

        device_result = ufunc(X, Y, Z)
        host_result = (X ** Y) / Z

        self.assertTrue(np.allclose(device_result, host_result))
示例#25
0
def make_euler(dt, f, n_svar, n_step):
    """Construct CUDA device function for Euler scheme.

    Parameters:
        dt: Step size; cast to float32 and captured by the device function.
        f: Device function called as ``f(dX, X, I)`` that writes the
            derivatives of ``X`` into ``dX``.
        n_svar: Number of state variables (length of ``X`` and ``dX``).
        n_step: Number of Euler steps taken per call of the scheme.

    Returns:
        Device function ``scheme(X, I)`` that advances ``X`` in place.
    """

    n_step = int32(n_step)
    dt = float32(dt)

    @cuda.jit(device=True)
    def scheme(X, I):
        dX = cuda.local.array((n_svar, ), float32)
        for i in range(n_step):
            f(dX, X, I)
            for j in range(n_svar):
                # Forward Euler update: the derivative is scaled by the step
                # size (``dt`` was previously cast but never applied).
                X[j] += dt * dX[j]

    return scheme
示例#26
0
def prepare_legendre(order, numba=True):
    """Return a callable evaluating the Legendre polynomial of ``order``.

    Orders 1 and 2 are available; any other order raises
    ``NotImplementedError``. With ``numba`` true the polynomial is wrapped by
    ``vectorize`` for float32 and float64, otherwise the plain Python
    function is returned.
    """
    if order == 1:
        def P(x):
            return x
    elif order == 2:
        def P(x):
            return 0.5 * (3.0*x**2 - 1.0)
    else:
        message = "Order {:d} of Legendre polynomial has not been implemented".format(order)
        raise NotImplementedError(message)

    if not numba:
        return P

    signatures = [float32(float32), float64(float64)]
    return vectorize(signatures, nopython=True)(P)
示例#27
0
def calculate_forces(positions, weights, accelerations):
    """
    Calculate accelerations produced on all bodies by mutual gravitational
    forces.

    The bodies are processed tile by tile: each thread loads one body of the
    current tile into shared memory, then ``tile_calculation`` adds that
    tile's contribution to this thread's running acceleration.
    """
    sh_positions = cuda.shared.array((tile_size, 2), float32)
    sh_weights = cuda.shared.array(tile_size, float32)

    body = cuda.grid(1)
    lane = cuda.threadIdx.x

    acc_x = float32(0.0)
    acc_y = float32(0.0)
    pos_x = positions[body,0]
    pos_y = positions[body,1]

    for start in range(0, len(weights), tile_size):
        # Body that this thread stages for the current tile.
        source = (start // tile_size) * cuda.blockDim.x + lane
        sh_positions[lane,0] = positions[source,0]
        sh_positions[lane,1] = positions[source,1]
        sh_weights[lane] = weights[source]
        # The tile must be fully loaded before anyone reads it...
        cuda.syncthreads()
        acc_x, acc_y = tile_calculation(pos_x, pos_y, acc_x, acc_y,
                                        sh_positions, sh_weights)
        # ...and fully consumed before the next load overwrites it.
        cuda.syncthreads()

    accelerations[body,0] = acc_x
    accelerations[body,1] = acc_y
示例#28
0
    def _test_ufunc_attributes(self, cls, a, b, *args):
        "Compare call, reduce, accumulate and outer of a compiled ``add`` with NumPy."
        builder = cls(add, *args)
        builder.add(float32(float32, float32))
        compiled = builder.build_ufunc()

        context = (cls, a.ndim)

        called = compiled(a, b)
        self.assertPreciseEqual(called, a + b, msg=context)

        reduced = ufunc_reduce(compiled, a)
        self.assertPreciseEqual(reduced, np.sum(a), msg=context)

        accumulated = compiled.accumulate(a)
        self.assertPreciseEqual(accumulated, np.add.accumulate(a), msg=context)

        outer = compiled.outer(a, b)
        self.assertPreciseEqual(outer, np.add.outer(a, b), msg=context)
示例#29
0
def _sinewave(num, den):
    """Generate a complex sine wave of frequency sample_rate*num/den.
    Length is chosen such that a continuous sine wave
    can be made by repeating the returned signal."""

    # The general path below fails for num=0, so that case is handled first
    if num == 0:
        return nb.complex64([1.0])

    # "% den" is not absolutely necessary here, but wrapping the phase
    # using integers may avoid loss of floating point precision.
    wrapped = np.arange(0, num*den, num, dtype = np.int64) % den
    radians_per_step = nb.float32(2.0 * np.pi / den)
    phase = wrapped.astype(np.float32) * radians_per_step

    real_part = np.cos(phase)
    imag_part = np.sin(phase) * nb.complex64(1j)
    return real_part + imag_part
示例#30
0
 def test_4(self):
     """Run ``func3`` with four signatures on float64, float32, int32 and uint32 input.

     The input starts as ``arange(100)`` in float64 and is converted from one
     dtype to the next in that order.
     """
     sig = [
         int32(int32, int32),
         uint32(uint32, uint32),
         float32(float32, float32),
         float64(float64, float64),
     ]
     func = self.funcs['func3']

     A = np.arange(100, dtype=np.float64)
     self._run_and_compare(func, sig, A, A)
     # Each conversion starts from the previously converted array.
     for dtype in (np.float32, np.int32, np.uint32):
         A = A.astype(dtype)
         self._run_and_compare(func, sig, A, A)
示例#31
0
 def test_4(self):
     """Run ``func3`` with four signatures on float64, float32, int32 and uint32 input.

     The input starts as ``arange(100)`` in float64 and is converted from one
     dtype to the next in that order.
     """
     sig = [
         int32(int32, int32),
         uint32(uint32, uint32),
         float32(float32, float32),
         float64(float64, float64),
     ]
     func = self.funcs['func3']

     A = np.arange(100, dtype=np.float64)
     self._run_and_compare(func, sig, A, A)
     # Each conversion starts from the previously converted array.
     for dtype in (np.float32, np.int32, np.uint32):
         A = A.astype(dtype)
         self._run_and_compare(func, sig, A, A)
示例#32
0
    def fit(self, X, y, train_indices, valid_indices, sample_weights):
        """Grow the regression tree on the given data and return ``self``.

        The tree, its context and the train/validation indices are stored on
        the instance as ``_tree``, ``_tree_context``, ``_train_indices`` and
        ``_valid_indices``.
        """
        max_bins = self.n_bins - 1
        random_state = self.random_state
        # TODO: this information will come from the binner held by the forest
        n_samples, n_features = X.shape
        # Every feature is given the same number of bins.
        n_bins_per_feature = (max_bins * np.ones(n_features)).astype(np.intp)

        # The tree object is mostly a data container for the nodes
        tree = _TreeRegressor(n_features, random_state)

        # The tree context holds global information about the data, in
        # particular how samples are organised into contiguous node indexes
        # for both training and validation samples
        tree_context = TreeRegressorContext(
            X, y, sample_weights,
            train_indices, valid_indices,
            self.n_bins - 1, n_bins_per_feature,
            uintp(self.max_features),
            self.aggregation,
            float32(self.step),
        )

        node_context = NodeRegressorContext(tree_context)
        best_split = SplitRegressor()
        candidate_split = SplitRegressor()
        compute_node_context = compute_node_regressor_context

        grow(
            tree, tree_context, node_context, compute_node_context,
            find_best_split_regressor_along_feature,
            copy_split_regressor,
            best_split, candidate_split,
        )

        self._train_indices = train_indices
        self._valid_indices = valid_indices
        self._tree = tree
        self._tree_context = tree_context
        return self
示例#33
0
    def __init__(self,
                 X_binned,
                 max_bins,
                 n_bins_per_feature,
                 gradients,
                 hessians,
                 l2_regularization,
                 min_hessian_to_split=1e-3,
                 min_samples_leaf=20,
                 min_gain_to_split=0.):
        """Store the binned data, gradients/hessians and split parameters.

        Also sets up the ordered copies of gradients and hessians, their
        sums, and the partition buffers used when splitting nodes.
        """
        # Data and histogram layout. All histograms have <max_bins> bins, but
        # some of the last bins may be unused if
        # n_bins_per_feature[f] < max_bins.
        self.X_binned = X_binned
        self.n_features = X_binned.shape[1]
        self.max_bins = max_bins
        self.n_bins_per_feature = n_bins_per_feature

        # Gradients and hessians; for the root node they are already ordered,
        # so the ordered versions start as plain copies.
        self.gradients = gradients
        self.hessians = hessians
        self.ordered_gradients = gradients.copy()
        self.ordered_hessians = hessians.copy()
        self.sum_gradients = self.gradients.sum()
        self.sum_hessians = self.hessians.sum()

        # A single hessian entry means the hessian is constant.
        self.constant_hessian = hessians.shape[0] == 1

        # Split parameters.
        self.l2_regularization = l2_regularization
        self.min_hessian_to_split = min_hessian_to_split
        self.min_samples_leaf = min_samples_leaf
        self.min_gain_to_split = min_gain_to_split

        # The placeholder 1.0 is not used when the hessian is not constant.
        self.constant_hessian_value = (
            self.hessians[0] if self.constant_hessian else float32(1.))

        # The partition array maps each sample index into the leaves of the
        # tree (a leaf here is a node that has not been split yet, not
        # necessarily a 'finalized' leaf). Initially the root contains all
        # the indices, e.g.:
        # partition = [abcdefghijkl]
        # After a call to split_indices it may look like this:
        # partition = [cef|abdghijkl]
        # i.e. 2 leaves, the left one at position 0 and the right one at
        # position 3. The order of the samples is irrelevant.
        n_rows = X_binned.shape[0]
        self.partition = np.arange(0, n_rows, 1, np.uint32)
        # buffers used in split_indices to support parallel splitting.
        self.left_indices_buffer = np.empty_like(self.partition)
        self.right_indices_buffer = np.empty_like(self.partition)
示例#34
0
def raycast(sx, sy, camera, world, texture_map, textures):
    fx = nb.float32(sx * 2 - 1)
    fy = nb.float32(sy * 2 - 1)
    dx = nb.float32(camera.plane_offset.x + camera.plane_x_size.x * fx +
                    camera.plane_y_size.x * fy)
    dy = nb.float32(camera.plane_offset.y + camera.plane_x_size.y * fx +
                    camera.plane_y_size.y * fy)
    dz = nb.float32(camera.plane_offset.z + camera.plane_x_size.z * fx +
                    camera.plane_y_size.z * fy)

    ddx = nb.float32(abs(1 / dx))
    ddy = nb.float32(abs(1 / dy))
    ddz = nb.float32(abs(1 / dz))
    tx = int(camera.pos.x // 1)
    ty = int(camera.pos.y // 1)
    tz = int(camera.pos.z // 1)
    ox = camera.pos.x % 1
    oy = camera.pos.y % 1
    oz = camera.pos.z % 1
    sx = nb.cuda.selp(dx < 0, -1, 1)
    ox = nb.cuda.selp(dx < 0, ox, (1 - ox)) * ddx
    sy = nb.cuda.selp(dy < 0, -1, 1)
    oy = nb.cuda.selp(dy < 0, oy, (1 - oy)) * ddy
    sz = nb.cuda.selp(dz < 0, -1, 1)
    oz = nb.cuda.selp(dz < 0, oz, (1 - oz)) * ddz

    finished = False
    while not finished:
        if oz > ox < oy:
            ox += ddx
            tx += sx
            side = int(0 + (sx + 1) // 2)
        elif oz > oy < ox:
            oy += ddy
            ty += sy
            side = int(2 + (sy + 1) // 2)
        else:
            oz += ddz
            tz += sz
            side = int(4 + (sz + 1) // 2)
        if not ((fx := (0 <= tx < world.shape[0])) and
                (fy :=
                 (0 <= ty < world.shape[1])) and (fz :=
                                                  (0 <= tz < world.shape[2]))):
示例#35
0
def find_node_split_subtraction(context, sample_indices, parent_histograms,
                                sibling_histograms):
    """For each feature, find the best bin to split by histogram subtraction

    This calls _find_histogram_split_subtraction for every feature, which
    works from the parent and sibling histograms instead of scanning the
    samples of this node and can therefore be significantly faster than
    computing the histograms from data.

    Returns the best SplitInfo among all features, along with all the feature
    histograms, which can later be used to compute the sibling or children
    histograms by subtraction.
    """

    # Any feature (here the first) can be used to compute the gradient sums:
    # they must be the same across all features anyway, and tests ensure
    # this. Averaging would be more robust but is probably not worth it.
    parent_first = parent_histograms[0]
    sibling_first = sibling_histograms[0]
    context.sum_gradients = (parent_first['sum_gradients'].sum() -
                             sibling_first['sum_gradients'].sum())

    n_samples = sample_indices.shape[0]
    if not context.constant_hessian:
        context.sum_hessians = (parent_first['sum_hessians'].sum() -
                                sibling_first['sum_hessians'].sum())
    else:
        # With a constant hessian the sum is the value times the sample count.
        context.sum_hessians = \
            context.constant_hessian_value * float32(n_samples)

    # Pre-allocate the results datastructure to be able to use prange
    split_infos = [
        SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
        for _ in range(context.n_features)
    ]
    histogram_shape = (np.int64(context.n_features), np.int64(context.n_bins))
    histograms = np.empty(shape=histogram_shape, dtype=HISTOGRAM_DTYPE)

    for feature_idx in prange(context.n_features):
        feature_split, feature_histogram = _find_histogram_split_subtraction(
            context, feature_idx, parent_histograms, sibling_histograms,
            n_samples)
        split_infos[feature_idx] = feature_split
        histograms[feature_idx, :] = feature_histogram

    best_split = _find_best_feature_to_split_helper(split_infos)
    return best_split, histograms
示例#36
0
def cu_cell_build(npa, pos, dim, box_low_boundary, inv_width, cell_size,
                  cell_list, situation):
    """Bin every particle into its spatial cell and append it to that cell.

    Each thread handles the particle whose index is ``cuda.grid(1)``; threads
    whose index is not below ``npa`` do nothing.

    Parameters:
        npa: number of particles to process.
        pos: per-particle positions; components 0, 1 and 2 are read.
        dim: number of cells along each of the three axes.
        box_low_boundary: lower corner of the box. The upper limit used in
            the range check is the negated lower corner, i.e. the box is
            treated as symmetric about the origin.
        inv_width: per-axis factor turning an offset from the lower corner
            into a cell index (presumably 1 / cell width -- confirm).
        cell_size: per-cell counter, atomically incremented for every
            particle assigned to the cell, including those that do not fit.
        cell_list: output, indexed ``[cell_id][slot][0..3]``; receives x, y,
            z and the particle index converted to float32.
        situation: status array. ``[0]`` receives ``i + 1`` for a particle
            with a NaN coordinate, ``[1]`` receives ``i + 1`` for a particle
            outside the box or with an out-of-range cell id, and ``[2]`` is
            raised (atomic max) to the occupancy a cell would need when its
            list overflows.
    """
    i = cuda.grid(1)
    if i < npa:
        # pi = pos[i]
        pix = pos[i][0]
        piy = pos[i][1]
        piz = pos[i][2]
        # A NaN coordinate cannot be binned: report the particle (1-based) and stop.
        if math.isnan(pix) or math.isnan(piy) or math.isnan(piz):
            situation[0] = i + nb.int32(1)
            return

        # Reject particles outside [low, -low) on any axis.
        if pix < box_low_boundary[0] or pix >= -box_low_boundary[
                0] or piy < box_low_boundary[1] or piy >= -box_low_boundary[
                    1] or piz < box_low_boundary[
                        2] or piz >= -box_low_boundary[2]:
            situation[1] = i + nb.int32(1)
            return

        # Offset of the particle from the lower corner of the box.
        dpix = pix - box_low_boundary[0]
        dpiy = piy - box_low_boundary[1]
        dpiz = piz - box_low_boundary[2]

        # Integer cell coordinates; an index equal to dim (which rounding at
        # the upper edge could produce) is wrapped back to cell 0.
        ix = nb.int32(dpix * inv_width[0])
        iy = nb.int32(dpiy * inv_width[1])
        iz = nb.int32(dpiz * inv_width[2])
        if ix == dim[0]:
            ix = nb.int32(0)
        if iy == dim[1]:
            iy = nb.int32(0)
        if iz == dim[2]:
            iz = nb.int32(0)

        # Flatten (ix, iy, iz) into one cell id, with z varying fastest.
        cell_id = iz + dim[2] * (iy + ix * dim[1])
        if cell_id >= cell_list.shape[0]:
            situation[1] = i + nb.int32(1)
            return
        # The atomic add returns the previous count, which is this
        # particle's slot inside the cell.
        size = cuda.atomic.add(cell_size, cell_id, nb.int32(1))
        if size < cell_list.shape[1]:
            cell_list[cell_id][size][0] = pix
            cell_list[cell_id][size][1] = piy
            cell_list[cell_id][size][2] = piz
            cell_list[cell_id][size][3] = nb.float32(i)
        else:
            # The cell is full: record the largest occupancy requested so far.
            cuda.atomic.max(situation, nb.int32(2), size + nb.int32(1))
示例#37
0
def om2axL(om, p=P):# deprecated: use qu2ax(om2qu()) instead -- kept for historical reasons.
  """Convert orientation matrices to axis-angle quadruples.

  Parameters:
    om: array indexed as ``om[i, row, col]``; the number of matrices is
      taken as ``om.size / 9``.
    p: sign convention selector; ``pf`` is +1 when ``p > 0`` and -1 otherwise.

  Returns:
    Array of shape ``(n, 4)`` with the dtype of ``om``: columns 0-2 hold the
    axis and column 3 the angle, after a final pass through ``axnormL``.
  """
  pf = numba.float32(p > 0) * 2.0 - 1.0
  intype = om.dtype
  n = np.int64(om.size / 9)
  ax = np.zeros((n, 4), dtype=intype)
  #  help for translating out of C version
  # [0,0], [0,1], [0, 2], [1,0], [1,1], [1,2], [2,0], [2,1], [2,2] 
  #    0      1      2      3      4      5      6      7      8
  for i in numba.prange(n):

    # Angle from the trace, with the cosine clamped to [-1, 1] before arccos.
    tr = om[i,0,0] + om[i,1,1] + om[i,2,2]
    t = 0.50 * (tr - 1.0)
    t = 1.0 if (t > 1.0) else t
    t = -1.0 if (t < -1.0) else t
    ax[i,3] = np.arccos(t)
    #if ((1.0 - np.abs(t)) > eps):
    # Axis candidate from the antisymmetric part of the matrix; mag
    # accumulates its squared length.
    mag = numba.float64(0.0)
    ax[i,2] = pf * (om[i,1,0] - om[i,0,1])
    mag += ax[i,2]*ax[i,2]
    ax[i,1] = pf * (om[i,0,2] - om[i,2,0])
    mag += ax[i,1] * ax[i,1]
    ax[i,0] = pf * (om[i,2,1] - om[i,1,2])
    mag += ax[i,0] * ax[i,0]
    mag = np.sqrt(mag)
    if mag > eps:
      # Regular case: normalise the axis to unit length.
      for j in range(3):
        ax[i,j] *= 1.0/mag
    else:
      # Antisymmetric part vanishes, so the axis cannot be read from it.
      if t > 0.0:
        # Fixed default axis along z, signed by the convention factor.
        ax[i, 0] = 0.0
        ax[i, 1] = 0.0
        ax[i, 2] = -1.0*pf
      else:
        # Rebuild the axis from the diagonal: the largest component is kept
        # as computed and the two smaller ones are derived from the
        # symmetric off-diagonal sums relative to it.
        d = np.zeros(3,dtype=intype)
        for j in range(3):
          d[j] = np.sqrt(0.5*(om[i,j,j]+1.0))
        dargsrt = np.argsort(d)
        d[dargsrt[1]] = (om[i, dargsrt[2], dargsrt[1]] + om[i, dargsrt[1], dargsrt[2] ]) / (4.0 * d[dargsrt[2]])
        d[dargsrt[0]] = (om[i, dargsrt[2], dargsrt[0]] + om[i, dargsrt[0], dargsrt[2]]) / (4.0 * d[dargsrt[2]])

        for j in range(3):
          ax[i,j] = pf*d[j]

  # Final pass through the module's axis-angle normalisation helper.
  ax = axnormL(ax)
  return ax
示例#38
0
    def _test_template_4(self, target):
        """Compare the vectorized ``vector_add`` with ``np.add`` on four dtypes.

        The ufunc is built for ``target`` with int32, uint32, float32 and
        float64 signatures and checked with ``np.testing.assert_allclose``.
        """
        signatures = [int32(int32, int32),
                      uint32(uint32, uint32),
                      float32(float32, float32),
                      float64(float64, float64)]
        basic_ufunc = vectorize(signatures, target=target)(vector_add)

        def test(ty):
            # 500 evenly spaced samples in [0, 100], cast to the dtype under test.
            sample = np.linspace(0., 100., 500).astype(ty)
            actual = basic_ufunc(sample, sample)
            expected = np.add(sample, sample)
            np.testing.assert_allclose(expected, actual)

        for ty in (np.double, np.float32, np.int32, np.uint32):
            test(ty)
示例#39
0
    def _test_template_4(self, target):
        """Compare the vectorized ``vector_add`` with ``np.add`` on four dtypes.

        The ufunc is built for ``target`` with int32, uint32, float32 and
        float64 signatures; results must satisfy ``np.allclose``.
        """
        signatures = [int32(int32, int32),
                      uint32(uint32, uint32),
                      float32(float32, float32),
                      float64(float64, float64)]
        basic_ufunc = vectorize(signatures, target=target)(vector_add)

        def test(ty):
            # 500 evenly spaced samples in [0, 100], cast to the dtype under test.
            sample = np.linspace(0., 100., 500).astype(ty)
            actual = basic_ufunc(sample, sample)
            expected = np.add(sample, sample)
            self.assertTrue(np.allclose(expected, actual))

        for ty in (np.double, np.float32, np.int32, np.uint32):
            test(ty)
示例#40
0
def ho2axL(hoIn,p=P):
  """Convert vectors (three components per row) to axis-angle quadruples.

  Parameters:
    hoIn: input handed to ``prepIn``, which returns the working array
      ``ho`` (indexed ``ho[i, j]``), two counts and the input dtype.
    p: sign convention selector; ``pf`` is +1 when ``p > 0`` and -1 otherwise.

  Returns:
    Array of shape ``(n, 4)``: columns 0-2 hold the unit axis and column 3
    the angle.
  """
  pf = numba.float32(p > 0) * 2.0 - 1.0
  ho,m,n,intype = prepIn(hoIn)
  ax = np.zeros((n,4),dtype=intype)

  # Coefficients of a degree-15 polynomial in the squared vector length;
  # after clamping, the polynomial value is fed to arccos below.
  tfit = np.array([1.0000000000018852,-0.5000000002194847,-0.024999992127593126,
                   -0.003928701544781374,-0.0008152701535450438,-0.0002009500426119712,
                   -0.00002397986776071756,-0.00008202868926605841,0.00012448715042090092,
                   -0.0001749114214822577,0.0001703481934140054,-0.00012062065004116828,
                   0.000059719705868660826,-0.00001980756723965647,0.000003953714684212874,
                   -0.00000036555001439719544],dtype=np.float64)

  for i in numba.prange(n):
    # Squared length of the input vector.
    hmag = np.float64(0.0)
    for j in range(3):
      hmag += ho[i,j] * ho[i,j]
    if hmag < eps:
      # (Near-)zero vector: zero angle about a fixed z axis signed by pf.
      ax[i,0] = 0.0
      ax[i,1] = 0.0
      ax[i,2] = -1.0 * pf
      ax[i,3] = 0.0
    else:
      hm = hmag
      sqrthm = np.sqrt(hm)
      # Unit vector along the input; this becomes the rotation axis.
      hn = np.zeros(3,dtype=intype)
      for j in range(3):
        hn[j] = ho[i,j] / sqrthm
      # hn = ho[i,:]/sqrthm
      # Evaluate the polynomial term by term; hm holds hmag**j at step j.
      s = tfit[0] + tfit[1] * hmag
      for j in range(2,16):
        hm *= hmag
        s += tfit[j] * hm
      # Clamp to the arccos domain, then turn the value into the angle.
      s = 1.0 if (s > 1.0) else s
      s = -1.0 if (s < -1.0) else s
      s = 2.0 * np.arccos(s)

      for j in range(3):
        ax[i,j] = hn[j]
      # Snap angles within eps of PI to exactly PI.
      if np.abs(s - PI) < eps:
        ax[i,3] = PI
      else:
        ax[i,3] = s

  return ax
示例#41
0
 def __init__(self, fname):
     """Load the C math library and record the address of ``fname``.

     Parameters:
         fname: name of the math function to wrap; only ``'cos'``
             (float64 -> float64) and ``'sinf'`` (float32 -> float32)
             are supported.

     Raises:
         NotImplementedError: if ``fname`` is not a supported function.
     """
     # msvcrt provides the math functions on Windows; elsewhere locate libm.
     if os.name == 'nt':
         lib = ctypes.cdll.msvcrt
     else:
         libpath = ctypes.util.find_library('m')
         lib = ctypes.cdll.LoadLibrary(libpath)
     self.lib = lib
     self._name = fname
     if fname == 'cos':
         addr = ctypes.cast(self.lib.cos, ctypes.c_voidp).value
         signature = float64(float64)
     elif fname == 'sinf':
         addr = ctypes.cast(self.lib.sinf, ctypes.c_voidp).value
         signature = float32(float32)
     else:
         # No signature exists for an unsupported name, so the message must
         # not reference one: doing so raised UnboundLocalError instead of
         # the intended NotImplementedError.
         raise NotImplementedError(f'wrapper address of `{fname}`')
     self._signature = signature
     self._address = addr
示例#42
0
    def _compile(cls, formula):
        """Return a callable for ``formula``, cached per formula string.

        Formulas containing ``'x'`` become a numba-vectorized function of
        ``x`` (float32 and float64 signatures). Any other formula is
        evaluated once, and the returned callable writes that constant into
        ``out[where]``. The cache lookup and insertion happen while holding
        ``BTagScaleFactor._formulaLock``.
        """
        with BTagScaleFactor._formulaLock:
            try:
                cached = BTagScaleFactor._formulaCache[formula]
            except KeyError:
                pass
            else:
                return cached

            # NOTE(review): eval() executes the formula text as Python code;
            # confirm that formulas only ever come from trusted files.
            namespace = {'log': numpy.log, 'sqrt': numpy.sqrt}
            if 'x' not in formula:
                val = eval(formula, namespace)

                def duck(_, out, where):
                    out[where] = val
                out = duck
            else:
                feval = eval('lambda x: ' + formula, namespace)
                signatures = [
                    numba.float32(numba.float32),
                    numba.float64(numba.float64),
                ]
                out = numba.vectorize(signatures)(feval)
            BTagScaleFactor._formulaCache[formula] = out
            return out
示例#43
0
文件: random.py 项目: sklam/numba
def xoroshiro128p_normal_float32(states, index):
    '''Draw one standard-normal float32 and advance ``states[index]``.

    The sample (mean 0, sigma 1) is produced with the Box-Muller transform
    from two uniform draws, so the RNG sequence advances by two steps. Only
    the cosine branch of the transform is used; the second normal value is
    discarded.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)

    u1 = xoroshiro128p_uniform_float32(states, index)
    u2 = xoroshiro128p_uniform_float32(states, index)

    # Box-Muller: radius from the first draw, angle from the second.
    radius = math.sqrt(-float32(2.0) * math.log(u1))
    angle = TWO_PI_FLOAT32 * u2
    return radius * math.cos(angle)
示例#44
0
def template_vectorize(self, target):
    """Check a natively compiled ``vector_add`` ufunc against ``np.add``.

    The ufunc is built for ``target`` with int32, uint32, float32 and
    float64 signatures and compared on 500 samples per dtype.
    """
    signatures = [int32(int32, int32),
                  uint32(uint32, uint32),
                  float32(float32, float32),
                  float64(float64, float64)]
    # native-code ufunc under test
    basic_ufunc = vectorize(signatures, target=target)(vector_add)

    def test(ty):
        sample = np.linspace(0., 100., 500).astype(ty)
        actual = basic_ufunc(sample, sample)
        # NumPy's own add serves as the reference implementation
        expected = np.add(sample, sample)
        self.assertTrue(np.allclose(expected, actual))

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
示例#45
0
文件: random.py 项目: sklam/numba
def uint64_to_unit_float32(x):
    '''Convert uint64 to a float32 value nominally in the range [0.0, 1.0)

    The value is first mapped by ``uint64_to_unit_float64`` and the result
    is then narrowed to float32.
    '''
    x = uint64(x)
    # NOTE(review): narrowing a float64 just below 1.0 to float32 may round
    # up to exactly 1.0 -- confirm whether callers rely on the open bound.
    return float32(uint64_to_unit_float64(x))
示例#46
0
    if exposure_indices is None:
        exposure_indices = np.empty( shape=(0, 2) , dtype=np.int32)

    for i in range(x.shape[0]):
        if np.sqrt((x[i]-x0)**2+(y[i]-y0)**2) < r:
                exposure_indices = np.vstack((exposure_indices,np.array([x[i],y[i]],dtype=np.int32)))

    return exposure_indices



# Name of the output text file; nothing in the code visible here reads it.
outfilename = 'test.txt'



@jit(float32(float32,float32,float32,float32),nopython=True)
def dist(x0,y0,x,y):
    """Return the Euclidean distance between (x0, y0) and (x, y)."""
    dx = x0-x
    dy = y0-y
    return math.sqrt(dx*dx + dy*dy)

@jit(void(float32[:,:],int32[:,:],float32[:]),nopython=True,parallel= True)
def set_doses_field(field, exposure_indices, doses):
    """Write ``doses[k]`` into ``field`` at the k-th (row, column) index pair.

    The loop runs over the length of ``doses`` and uses ``prange``.
    """
    n_doses = doses.shape[0]
    for k in prange(n_doses):
        row = exposure_indices[k,0]
        col = exposure_indices[k,1]
        field[row,col] = doses[k]

@jit(void(float32[:,:],int32[:,:],float32),nopython=True)
def set_target(target, exposure_indices, dose):
    """Write the single value ``dose`` into ``target`` at every index pair."""
    n_points = exposure_indices.shape[0]
    for k in range(n_points):
        row = exposure_indices[k,0]
        col = exposure_indices[k,1]
        target[row,col] = dose

@njit(void(float32[:,:],float32[:,:],float32[:],float32[:]),parallel=True)
def convolve_with_vector(field,exposure,v,h):
示例#47
0
 def test_2(self):
     """Run ``func1`` with float64/float32 signatures on ``arange(100)``."""
     signatures = [float64(float64), float32(float32)]
     func = self.funcs['func1']
     inputs = np.arange(100, dtype=np.float64)
     self._run_and_compare(func, signatures, inputs)
示例#48
0
    i = cuda.grid(1)
    # Map i to array elements
    if i >= out.size:
        # Out of range?
        return
        # Do actual work
    out[i] = a * x[i] + y[i]


"""
Vectorize turns a scalar function into an
elementwise operation over the input arrays.
"""


@vectorize([float32(float32, float32, float32)], target="cuda")
def vec_saxpy(a, x, y):
    """Scalar SAXPY kernel: return ``a * x + y`` for one element.

    ``vectorize`` applies this float32 scalar function element-wise over
    the input arrays.
    """
    scaled = a * x
    return scaled + y


# CPU code
# ---------

# Problem size: NELEM is the product of the two counts below (1 * 32 = 32).
# Presumably these are the CUDA launch dimensions -- confirm at the call site.
NUM_BLOCKS = 1
NUM_THREADS = 32
NELEM = NUM_BLOCKS * NUM_THREADS
示例#49
0
            for j in xrange(des_ngb):
                q = neighbor_dists[i, j]/h
                if q <= 0.5:
                    n_ngb += (1 - 6*q**2 + 6*q**3)
                elif q <= 1.0:
                    n_ngb += 2*(1-q)**3
            n_ngb *= norm
            if n_ngb > des_ngb:
                upper = h
            else:
                lower = h
            error = np.fabs(n_ngb-des_ngb)
        hsml[i] = h
    return hsml
        
@vectorize([float32(float32), float64(float64)])
def Kernel(q):
    """Piecewise-cubic kernel of ``q``.

    Returns ``1 - 6q^2 + 6q^3`` for ``q <= 0.5``, ``2(1 - q)^3`` for
    ``q <= 1.0`` and ``0.0`` for anything else.
    """
    if q <= 0.5:
        return 1 - 6*q**2 + 6*q**3
    if q <= 1.0:
        return 2 * (1-q)**3
    return 0.0
        
@jit
def DF(f, ngb):
    """Return ``f[ngb[i, j]] - f[i]`` for every entry of the index table.

    ``ngb`` is a 2-D table of indices into ``f``; the result has the same
    shape as ``ngb``.
    """
    n_rows = ngb.shape[0]
    n_cols = ngb.shape[1]
    df = np.empty(ngb.shape)
    for row in xrange(n_rows):
        for col in xrange(n_cols):
            neighbour = ngb[row,col]
            df[row,col] = f[neighbour] - f[row]
    return df
    
示例#50
0
def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    does some basic particle set prunning and sets up the image
    tiles. It launches cuda kernels for rendering the individual sections of the image
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = float32((xmax-xmin)/nx)
    dy = float32((ymax-ymin)/ny)
    
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    # ------------------------------------------------------------------------------------------------
    # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels
    # ------------------------------------------------------------------------------------------------

    start = time.clock()
  #  gpu_bool = 2*ps['h'] < 15.*dx
    
    ps_gpu = ps#[gpu_bool]
   # ps_cpu = ps[~gpu_bool]
    #del(ps)
    if timing: '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start)

    # -----------------------------------------------------------------
    # set up the image slices -- max. size is 100x100 pixels 
    # in this step only process particles that need kernels < 40 pixels
    # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k
    # kernels are 31x31 pixels max = 3844 bytes
    # max shared memory size is 48k
    # -----------------------------------------------------------------
    
    start = time.clock()
    tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size)
    if timing: print '<<< Tiles made in %f s'%(time.clock()-start)

    Ntiles = tiles_pix.shape[0]

     
    
    # ------------------
    # set up the kernels
    # ------------------
    code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read()
    mod = SourceModule(code,options=["--ptxas-options=-v"])
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")


    # -------------------------------------------------------------
    # set up streams and figure out particle distributions per tile 
    # -------------------------------------------------------------
   

    # allocate histogram array
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    # -----------------------------
    # calculate pixels per particle
    # -----------------------------

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*len(s)))
    calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), 
                   block=(nthreads,1,1),grid=(1024,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    start_g.record()
    radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s)))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g))

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(1024,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    

  
    tile_start = time.clock()
    
    streams = [drv.Stream() for i in range(16)]    
    
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            my_stream = streams[i%(16)]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            if n_per_tile > nthreads*256: ngrid=128
            else : ngrid = 64
            
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image
示例#51
0
from __future__ import print_function, absolute_import

import numpy as np

from numba import vectorize
from numba import cuda, int32, float32, float64
from numba import unittest_support as unittest
from numba.cuda.testing import skip_on_cudasim
from numba.cuda.testing import CUDATestCase
from numba import config

# Signatures the test ufuncs are compiled for.
sig = [int32(int32, int32),
       float32(float32, float32),
       float64(float64, float64)]


# Compile for CUDA, falling back to the CPU target when the CUDA simulator
# is enabled.
target='cuda'
if config.ENABLE_CUDASIM:
    target='cpu'


# NumPy dtypes for the test cases (their use is outside the visible code).
test_dtypes = np.float32, np.int32


@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCUDAVectorize(CUDATestCase):
    N = 1000001

    def test_scalar(self):

        @vectorize(sig, target=target)
示例#52
0
 def foo(arr, val):
     """Store ``float32(i) / val`` in ``arr[i]`` for this thread's index ``i``.

     Threads whose index falls outside ``arr`` write nothing.
     """
     tid = cuda.grid(1)
     if tid >= arr.size:
         return
     arr[tid] = float32(tid) / val
示例#53
0
 def _test_template_2(self, target):
     """Compare a numba-vectorized ``sinc`` with ``np.vectorize(sinc)``."""
     signatures = [float64(float64), float32(float32)]
     compile_for_target = vectorize(signatures, target=target)
     numba_sinc = compile_for_target(sinc)
     numpy_sinc = np.vectorize(sinc)
     self._run_and_compare(numba_sinc, numpy_sinc)
"""
Demonstrate broadcasting when a scalar is provided as an argument to a 
vectorize function.

Please read NumPy Broadcasting documentation for details about broadcasting:
http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html
"""

from __future__ import print_function
import numpy as np
from numba import vectorize, float32


@vectorize([float32(float32, float32, float32)], target="parallel")
def truncate(x, xmin, xmax):
    """ Clamp x to the [xmin, xmax] interval (the lower bound is tested first) """
    if x < xmin:
        return xmin
    if x > xmax:
        return xmax
    return x


def main():
    x = np.arange(100, dtype=np.float32)
    print("x = %s" % x)
    xmin = np.float32(20)  # as float32 type scalar
    xmax = np.float32(70)  # as float32 type scalar

    # The scalar arguments are broadcasted into an array.
    # This process creates arrays of zero strides.
示例#55
0
from __future__ import print_function, absolute_import
import numpy as np
from numba import vectorize
from numba import cuda, int32, float32, float64
from timeit import default_timer as time
from numba import unittest_support as unittest
from numba.cuda.testing import skip_on_cudasim
from numba.cuda.testing import CUDATestCase
from numba import config

# Signatures the test ufuncs are compiled for.
sig = [int32(int32, int32), float32(float32, float32), float64(float64, float64)]


# Compile for CUDA, falling back to the CPU target when the CUDA simulator
# is enabled.
target = "cuda"
if config.ENABLE_CUDASIM:
    target = "cpu"


# NumPy dtypes for the test cases (their use is outside the visible code).
test_dtypes = np.float32, np.int32


@skip_on_cudasim("ufunc API unsupported in the simulator")
class TestCUDAVectorize(CUDATestCase):
    def test_scalar(self):
        @vectorize(sig, target=target)
        def vector_add(a, b):
            return a + b

        a = 1.2
        b = 2.3
        c = vector_add(a, b)
示例#56
0
文件: indexing.py 项目: vmarchen/exa
        xy + \\text{trunc}\\left(\\frac{\\left(\\left|x - y\\right| -
            1\\right)^{2}}{4}\\right)

    Args:
        x (array): First value array
        y (array): Second value array

    Returns:
        p (array): Pairing function result

    Note:
        This function has a vectorized version that is imported as
        :func:`~exa.algorithms.indexing.unordered_pairing`; use that
        function when working with array data.

    .. _pairing function: http://www.mattdipasquale.com/blog/2014/03/09/unique-unordered-pairing-function/
    '''
    return np.int64(x * y + np.trunc((np.abs(x - y) - 1)**2 / 4))


# When numba is enabled in the package configuration, rebind the pure-Python
# helpers to compiled versions under their original names.
if global_config['pkg_numba']:
    from numba import jit, vectorize, int32, int64, float32, float64
    # Index helpers: nopython JIT with on-disk caching.
    arange1 = jit(nopython=True, cache=True)(arange1)
    arange2 = jit(nopython=True, cache=True)(arange2)
    indexes_sc1 = jit(nopython=True, cache=True)(indexes_sc1)
    indexes_sc2 = jit(nopython=True, cache=True)(indexes_sc2)
    # The pairing function becomes a ufunc with one signature per dtype.
    unordered_pairing = vectorize([int32(int32, int32), int64(int64, int64),
                                   float32(float32, float32), float64(float64, float64)],
                                   nopython=True)(unordered_pairing)
示例#57
0
文件: util.py 项目: jag1g13/pycgtool
def vector_cross(u, v):
    """
    Compute the cross product ``u x v`` of two 3d vectors.

    The result is allocated with ``np.empty_like(u)``, so it takes its shape
    and dtype from the first argument.

    :param u: First 3d vector
    :param v: Second 3d vector
    :return: Cross product of the two vectors as numpy.array
    """
    ux, uy, uz = u[0], u[1], u[2]
    vx, vy, vz = v[0], v[1], v[2]
    res = np.empty_like(u)
    res[0] = uy * vz - uz * vy
    res[1] = uz * vx - ux * vz
    res[2] = ux * vy - uy * vx
    return res


@numba.jit(numba.float32(numba.float32[3], numba.float32[3]))
def vector_dot(u, v):
    """
    Compute the dot product of two 3d vectors.

    :param u: First 3d vector
    :param v: Second 3d vector
    :return: Dot product of the two vectors
    """
    # Accumulate in the same left-to-right order as a single summed expression.
    total = u[0]*v[0]
    total = total + u[1]*v[1]
    total = total + u[2]*v[2]
    return total


@numba.jit(numba.float32(numba.float32[3]))
def vector_len(v):
    """
    Compute the Euclidean length of a 3d vector.

    :param v: 3d vector
    :return: Length of the vector
    """
    squared = v[0]*v[0] + v[1]*v[1] + v[2]*v[2]
    return math.sqrt(squared)
示例#58
0
def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    Does some basic particle set pruning and sets up the image tiles, then
    launches CUDA kernels for rendering the individual sections of the image.
    Here the particles are first distributed to tiles and then radix-sorted
    tile by tile before rendering.

    The parameters ``s``, ``nx``, ``ny``, ``xmin``, ``xmax``, ``timing`` and
    ``nthreads`` are used below; ``qty`` is not. The return value is the
    float32 ``image`` array of shape ``(nx, ny)``.

    NOTE(review): as written this function cannot run to completion.
    ``Ntiles`` is read before its only assignment inside the function, and
    ``ps_gpu``, ``streams``, ``tiles_pix`` and ``tiles_physical`` are never
    assigned here -- confirm whether they were meant to be set up as in the
    tiled renderer.
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    # the y axis reuses the x limits
    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = float32((xmax-xmin)/nx)
    dy = float32((ymax-ymin)/ny)
    
    # NOTE(review): dy, x_start, y_start and zplane are not read again below.
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    start = time.clock()

    # ------------------
    # set up the kernels
    # ------------------
    # NOTE(review): hard-coded absolute path to the kernel source, and the
    # file object is never closed explicitly.
    code = file('/home/itp/roskar/homegrown/template_kernel.cu').read()
    mod = SourceModule(code)
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")

    # allocate histogram array
    # NOTE(review): Ntiles is a local (assigned further down) and is still
    # unbound at this point.
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    # NOTE(review): ps_gpu is not assigned in this function; only ps is.
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(32,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    # tile_offsets[i]..tile_offsets[i+1] delimit tile i inside ps_tiles
    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    # NOTE(review): np.int is an alias that newer NumPy releases removed.
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    # 4 bytes per key, one key per particle that landed in a tile
    keys_gpu = drv.mem_alloc(int(4*hist.sum()))
    calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx), 
                   block=(nthreads,1,1),grid=(32,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    keys = np.empty(hist.sum(), dtype=np.int32)


    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    # each non-empty tile is sorted separately, in place
    for i in xrange(Ntiles) : 
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile)

    drv.memcpy_dtoh(keys,keys_gpu)
    drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu)
#    return keys,ps_tiles,tile_offsets,dx
        
    drv.Context.synchronize()

    tile_start = time.clock()
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            # NOTE(review): streams, tiles_physical and tiles_pix are not
            # assigned anywhere in this function.
            my_stream = streams[i%16]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            # NOTE(review): nx_tile and ny_tile are not read again below.
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            # NOTE(review): no grid= argument is passed here, unlike the
            # other kernel launches in this function -- confirm this is intended.
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    # copy the finished image back to the host
    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image
示例#59
0
'''
Demonstrate the significant performance difference between transferring
regular host memory and pinned (pagelocked) host memory.
'''
from __future__ import print_function
from timeit import default_timer as timer
import numpy as np
from numba import vectorize, float32, cuda

# Ten million float32 values to copy, and an uninitialised destination
# array with the same shape and dtype.
src = np.arange(10 ** 7, dtype=np.float32)
dst = np.empty_like(src)

@vectorize([float32(float32)], target='cuda')
def copy_kernel(src):
    """Return the input element unchanged, i.e. an element-wise copy."""
    return src

# Regular memory transfer: time host -> device, the copy kernel, and
# device -> host for the plain arrays created above.

ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

# The kernel writes straight into the preallocated device buffer.
copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print('regular', te - ts)

# Drop the references to the device arrays before the script continues.
del d_src, d_dst
示例#60
0
from __future__ import absolute_import, print_function, division
from numba import vectorize
from numba import cuda, float32
import numpy as np
from numba import unittest_support as unittest
from numba.cuda.testing import skip_on_cudasim


@cuda.jit(float32(float32, float32, float32), device=True)
def cu_device_fn(x, y, z):
    """Device function returning ``x`` raised to ``y``, divided by ``z``."""
    raised = x ** y
    return raised / z


def cu_ufunc(x, y, z):
    """Scalar function that forwards its arguments to ``cu_device_fn``."""
    value = cu_device_fn(x, y, z)
    return value


@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCudaVectorizeDeviceCall(unittest.TestCase):
    def test_cuda_vectorize_device_call(self):
        ufunc = vectorize([float32(float32, float32, float32)], target='cuda')(
            cu_ufunc)

        N = 100

        X = np.array(np.random.sample(N), dtype=np.float32)
        Y = np.array(np.random.sample(N), dtype=np.float32)
        Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

        out = ufunc(X, Y, Z)