Exemplo n.º 1
def get_by_index(src_gpu, ind):
    """
    Get values in a GPUArray by index.

    Parameters
    ----------
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to extract values.
    ind : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices to get. Must have an integer dtype.

    Returns
    -------
    res_gpu : pycuda.gpuarray.GPUArray
        GPUArray with length of `ind` and dtype of `src_gpu` containing
        selected values.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> src = np.random.rand(5).astype(np.float32)
    >>> src_gpu = gpuarray.to_gpu(src)
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> res_gpu = misc.get_by_index(src_gpu, ind)
    >>> np.allclose(res_gpu.get(), src[[0, 2, 4]])
    True

    Notes
    -----
    Only supports 1D index arrays.

    May not be efficient for certain index patterns because the memory
    operations cannot be coalesced.

    Compiled kernels are memoized in the dict `get_by_index.cache`, keyed by
    (data dtype, index dtype). This function only reads and fills that dict;
    it does not create it.
    """

    # Only support 1D index arrays:
    assert len(np.shape(ind)) == 1
    assert issubclass(ind.dtype.type, numbers.Integral)
    N = len(ind)
    if not isinstance(ind, gpuarray.GPUArray):
        ind = gpuarray.to_gpu(ind)
    dest_gpu = gpuarray.empty(N, dtype=src_gpu.dtype)

    # Manually handle empty index array because it will cause the kernel to
    # fail if processed:
    if N == 0:
        return dest_gpu

    cache_key = (src_gpu.dtype, ind.dtype)
    try:
        func = get_by_index.cache[cache_key]
    except KeyError:
        data_ctype = tools.dtype_to_ctype(src_gpu.dtype)
        ind_ctype = tools.dtype_to_ctype(ind.dtype)
        v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]")
        get_by_index.cache[cache_key] = func
    func(dest_gpu, ind, src_gpu, range=slice(0, N, 1))
    return dest_gpu
Exemplo n.º 2
def get_inds_kernel(inds_ctype, src_ctype):
    """
    Build an elementwise kernel computing `dest[i] = src[src_shift+inds[i]]`.

    Parameters
    ----------
    inds_ctype : str
        C type name of the index array elements.
    src_ctype : str
        C type name used for both the source and destination arrays.

    Returns
    -------
    func : ElementwiseKernel
        Kernel taking `(dest, src_shift, inds, src)`, where `src_shift` is
        passed as a C `int`.
    """
    v = ("{data_ctype} *dest, int src_shift, "
         "{inds_ctype} *inds, {data_ctype} *src").format(
             data_ctype=src_ctype, inds_ctype=inds_ctype)
    func = elementwise.ElementwiseKernel(
        v, "dest[i] = src[src_shift+inds[i]]")
    return func
Exemplo n.º 3
def gen_trapz_mult(N, dtype):
    """
    Generate multiplication array for 1D trapezoidal integration.

    Generates an array whose dot product with some array of equal
    length is equivalent to the definite integral of the latter
    computed using trapezoidal integration.

    Parameters
    ----------
    N : int
        Length of array.
    dtype : float type
        Floating point type to use when generating the array.

    Returns
    -------
    result : pycuda.gpuarray.GPUArray
        Generated array: 0.5 at the first and last positions, 1 elsewhere.

    Raises
    ------
    ValueError
        If `dtype` is not one of float32, float64, complex64 or complex128.
    """

    if dtype not in [np.float32, np.float64, np.complex64, np.complex128]:
        raise ValueError('unrecognized type')

    ctype = tools.dtype_to_ctype(dtype)
    func = elementwise.ElementwiseKernel(
        "{ctype} *x".format(ctype=ctype),
        "x[i] = ((i == 0) || (i == {M})) ? 0.5 : 1".format(M=N - 1))
    x_gpu = gpuarray.empty(N, dtype)
    # The array is allocated uninitialised; the kernel must actually run to
    # fill in the weights before the array is handed back.
    func(x_gpu)
    return x_gpu
Exemplo n.º 4
def sici(x_gpu):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUarrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.

    Raises
    ------
    ValueError
        If the dtype of `x_gpu` is neither float32 nor float64.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = special.sici(x_gpu)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    """

    if x_gpu.dtype == np.float32:
        args = 'float *x, float *si, float *ci'
        op = 'sicif(x[i], &si[i], &ci[i])'
    elif x_gpu.dtype == np.float64:
        args = 'double *x, double *si, double *ci'
        op = 'sici(x[i], &si[i], &ci[i])'
    else:
        raise ValueError('unsupported type')

    # Reuse a previously compiled kernel for this dtype when available:
    try:
        func = sici.cache[x_gpu.dtype]
    except KeyError:
        func = elementwise.ElementwiseKernel(
            args, op,
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        sici.cache[x_gpu.dtype] = func

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    func(x_gpu, si_gpu, ci_gpu)

    return (si_gpu, ci_gpu)
Exemplo n.º 5
def clipupper_GPU(x_gpu, ub):
    """
    Clamp the elements of `x_gpu` from above, in place.

    Every element greater than `ub` is replaced by `ub`; other elements are
    left unchanged. The kernel is declared with `float` arguments, so
    `x_gpu` is expected to hold single-precision values.

    Parameters
    ----------
    x_gpu : GPU array
        Array to clip; modified in place.
    ub : float
        Upper bound.
    """

    clipupper = cuelement.ElementwiseKernel(
        "float *x, float ub",
        "x[i] = x[i] > ub ? ub : x[i]",
    )

    clipupper(x_gpu, ub)
Exemplo n.º 6
def cliplower_GPU(x_gpu, lb):
    """
    Clamp the elements of `x_gpu` from below, in place.

    Every element smaller than `lb` is replaced by `lb`; other elements are
    left unchanged. The kernel is declared with `float` arguments, so
    `x_gpu` is expected to hold single-precision values.

    Parameters
    ----------
    x_gpu : GPU array
        Array to clip; modified in place.
    lb : float
        Lower bound.
    """

    cliplower = cuelement.ElementwiseKernel(
        "float *x, float lb",
        "x[i] = x[i] < lb ? lb : x[i]",
    )

    cliplower(x_gpu, lb)
Exemplo n.º 7
def _scale_inplace(a, x_gpu):
    """
    Scale an array by a specified value in-place.

    Every element of `x_gpu` is divided by `a` (the kernel performs
    `x[i] /= a`).

    Parameters
    ----------
    a : scalar
        Divisor; converted to the dtype of `x_gpu` before the launch.
    x_gpu : GPU array
        Array to scale; modified in place.
    """

    ctype = tools.dtype_to_ctype(x_gpu.dtype)
    inplace = el.ElementwiseKernel("{ctype} a, {ctype} *x".format(ctype=ctype),
                                   "x[i] /= a")
    # `np.cast` no longer exists in NumPy 2.x; build the scalar through the
    # dtype's own scalar type instead.
    inplace(x_gpu.dtype.type(a), x_gpu)
Exemplo n.º 8
def conj(x_gpu, overwrite=True):
    """
    Complex conjugate.

    Compute the complex conjugate of the array in device memory.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(m, n)`.
    overwrite : bool
        If true (default), save the result in the specified array.
        If false, return the result in a newly allocated array.

    Returns
    -------
    xc_gpu : pycuda.gpuarray.GPUArray
        Conjugate of the input array. If `overwrite` is true, the
        returned matrix is the same as the input array. Arrays of dtype
        float32 or float64 are returned as-is regardless of `overwrite`.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = linalg.conj(x_gpu)
    >>> np.all(x == np.conj(y_gpu.get()))
    True
    """

    # Don't attempt to process non-complex matrix types:
    if x_gpu.dtype in [np.float32, np.float64]:
        return x_gpu

    # Reuse a previously compiled kernel for this dtype when available:
    try:
        func = conj.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
                "{ctype} *x, {ctype} *y".format(ctype=ctype),
                "y[i] = conj(x[i])")
        conj.cache[x_gpu.dtype] = func

    if overwrite:
        func(x_gpu, x_gpu)
        return x_gpu
    else:
        y_gpu = gpuarray.empty_like(x_gpu)
        func(x_gpu, y_gpu)
        return y_gpu
Exemplo n.º 9
def exp1(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    e_gpu : GPUArray
        GPUarrays containing the exponential integrals of
        the entries of `z_gpu`.

    Raises
    ------
    ValueError
        If the dtype of `z_gpu` is neither complex64 nor complex128.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = special.exp1(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """

    if z_gpu.dtype == np.complex64:
        args = 'pycuda::complex<float> *z, pycuda::complex<float> *e'
    elif z_gpu.dtype == np.complex128:
        args = 'pycuda::complex<double> *z, pycuda::complex<double> *e'
    else:
        raise ValueError('unsupported type')
    op = 'e[i] = exp1(z[i])'

    # Reuse a previously compiled kernel for this dtype when available:
    try:
        func = exp1.cache[z_gpu.dtype]
    except KeyError:
        func = elementwise.ElementwiseKernel(
            args, op,
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        exp1.cache[z_gpu.dtype] = func

    e_gpu = gpuarray.empty_like(z_gpu)
    func(z_gpu, e_gpu)

    return e_gpu
Exemplo n.º 10
def eye(N, dtype=np.float32):
    """
    Construct a 2D matrix with ones on the diagonal and zeros elsewhere.

    Constructs a matrix in device memory whose diagonal elements
    are set to 1 and non-diagonal elements are set to 0.

    Parameters
    ----------
    N : int
        Number of rows or columns in the output matrix.
    dtype : type
        Matrix data type.

    Returns
    -------
    e_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of dimensions `[N, N]` with diagonal values
        set to 1.

    Raises
    ------
    ValueError
        If `dtype` is not one of float32, float64, complex64 or complex128,
        or if `N` is not positive.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> N = 5
    >>> e_gpu = linalg.eye(N)
    >>> np.all(e_gpu.get() == np.eye(N))
    True
    >>> e_gpu = linalg.eye(N, np.complex64)
    >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64))
    True
    """

    if dtype not in [np.float32, np.float64, np.complex64,
                     np.complex128]:
        raise ValueError('unrecognized type')
    if N <= 0:
        raise ValueError('N must be greater than 0')
    alloc = misc._global_cublas_allocator

    e_gpu = misc.zeros((N, N), dtype, allocator=alloc)
    func = el.ElementwiseKernel("{ctype} *e".format(ctype=tools.dtype_to_ctype(dtype)),
                                "e[i] = 1")
    # Visiting every (N+1)-th element of the N*N flattened buffer touches
    # exactly the diagonal entries.
    func(e_gpu, slice=slice(0, N*N, N+1))
    return e_gpu
Exemplo n.º 11
 def set_inds(self, src, dest, inds, dest_shift=0):
     """
     Set `dest[i+dest_shift] = src[inds[i]] for i in range(len(inds))`.

     The shift is baked into the generated kernel source, so it is part of
     the kernel cache key together with the index and data dtypes.

     Parameters
     ----------
     src : GPU array
         Array to read from.
     dest : GPU array
         Array to write to; the kernel declares it with the C type of
         `src`.
     inds : GPU array
         Integer indices into `src`.
     dest_shift : int
         Offset added to the destination position.
     """
     assert isinstance(dest_shift, numbers.Integral)

     # Nothing to copy; an empty launch range would make the kernel fail.
     if len(inds) == 0:
         return

     # The generated code depends on the data type as well as on the index
     # type and shift, so all three must take part in the cache lookup.
     cache_key = (inds.dtype, src.dtype, dest_shift)
     try:
         func = self.set_inds.cache[cache_key]
     except KeyError:
         inds_ctype = dtype_to_ctype(inds.dtype)
         data_ctype = dtype_to_ctype(src.dtype)
         v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src"\
             .format(data_ctype=data_ctype, inds_ctype=inds_ctype)
         func = elementwise.ElementwiseKernel(
             v, "dest[i+%i] = src[inds[i]]" % dest_shift)
         self.set_inds.cache[cache_key] = func
     func(dest, inds, src, range=slice(0, len(inds), 1))
Exemplo n.º 12
def _get_exp1_kernel(dtype):
    if dtype == np.complex64:
        args = 'pycuda::complex<float> *z, pycuda::complex<float> *e'
    elif dtype == np.complex128:
        args = 'pycuda::complex<double> *z, pycuda::complex<double> *e'
        raise ValueError('unsupported type')
    op = 'e[i] = exp1(z[i])'

    return elementwise.ElementwiseKernel(
        options=["-I", install_headers],
        preamble='#include "cuSpecialFuncs.h"')
Exemplo n.º 13
def _scale_inplace(a, x_gpu):
    """
    Scale an array by a specified value in-place.

    Every element of `x_gpu` is divided by `a` (the kernel performs
    `x[i] /= a`).

    Parameters
    ----------
    a : scalar
        Divisor; converted to the dtype of `x_gpu` before the launch.
    x_gpu : GPU array
        Array to scale; modified in place.
    """

    # Cache the kernel to avoid invoking the compiler if the array type has
    # already been encountered. The scale factor is a runtime argument of
    # the kernel, not part of its source, so it does not belong in the key.
    cache_key = x_gpu.dtype
    try:
        func = _scale_inplace.cache[cache_key]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
            "{ctype} a, {ctype} *x".format(ctype=ctype), "x[i] /= a")
        _scale_inplace.cache[cache_key] = func
    func(x_gpu.dtype.type(a), x_gpu)
Exemplo n.º 14
def _get_sici_kernel(dtype):
    if dtype == np.float32:
        args = 'float *x, float *si, float *ci'
        op = 'sicif(x[i], &si[i], &ci[i])'
    elif dtype == np.float64:
        args = 'double *x, double *si, double *ci'
        op = 'sici(x[i], &si[i], &ci[i])'
        raise ValueError('unsupported type')

    return elementwise.ElementwiseKernel(
        options=["-I", install_headers],
        preamble='#include "cuSpecialFuncs.h"')
Exemplo n.º 15
def init_gpu(dev=0):
    """
    Import the GPU libraries and publish them as module-level globals.

    Binds the aliases `gp`, `cm`, `lg` and `msc` at module scope, builds the
    gather kernel `slf`, and sets the module flag `_gpu` to True.

    Parameters
    ----------
    dev : int
        Not used by the code visible here.
        NOTE(review): presumably a device index; confirm whether device
        selection was meant to happen in this function.
    """
    global gp, lg, cm, msc, slf, _gpu

    # Imported lazily so that importing this module does not require the
    # GPU packages; the names become module globals via the `global`
    # statement above (except `ew`, which stays local).
    from pycuda import gpuarray as gp
    from pycuda import elementwise as ew
    from pycuda import cumath as cm
    from skcuda import linalg as lg
    from skcuda import misc as msc


    # Gather kernel: y[i] = x[ind[i]] for float arrays and unsigned indices.
    slf = ew.ElementwiseKernel("float * y, float * x, unsigned * ind",
                               "y[i] = x[ind[i]]")

    _gpu = True
Exemplo n.º 16
    def _fill_zeros_kernel(self, dest, inds):
        Set `dest[inds[i]] = 0 for i in range(len(inds))`

            func = self._fill_zeros_kernel.cache[(inds.dtype, dest.dtype)]
        except KeyError:
            inds_ctype = dtype_to_ctype(inds.dtype)
            data_ctype = dtype_to_ctype(dest.dtype)
            v = ("{data_ctype} *dest," +\
                 "{inds_ctype} *inds").format(\
            func = elementwise.ElementwiseKernel(v,\
            "dest[inds[i]] =0")
            self._fill_zeros_kernel.cache[(inds.dtype, dest.dtype)] = func
        func(dest, inds, range=slice(0, len(inds), 1) )
Exemplo n.º 17
    def set_by_inds_array(self, inds, data):
        """
        Set mapped data with array by integer indices.

        Parameters
        ----------
        inds : array-like
            Integer indices of data elements to update.
        data : numpy.ndarray
            Data to assign.

        Raises
        ------
        ValueError
            If `data` is a scalar, `inds` is not 1D, `inds` does not hold
            integers, or `inds` and `data` differ in length.
        """

        if np.isscalar(data):
            raise ValueError('data must be array-like')
        if len(np.shape(inds)) > 1:
            raise ValueError('index array must be 1D')

        # Nothing to assign; an empty launch range would make the kernel
        # fail.
        N = len(inds)
        if N == 0:
            return

        if not isinstance(inds, gpuarray.GPUArray):
            inds = gpuarray.to_gpu(inds)
        if not issubclass(inds.dtype.type, numbers.Integral):
            raise ValueError('index array must contain integers')
        if N != len(data):
            raise ValueError('len(inds) = %s != %s = len(data)' %
                             (N, len(data)))

        if not isinstance(data, gpuarray.GPUArray):
            data = gpuarray.to_gpu(data)

        # Allocate data array if it doesn't exist:
        # NOTE(review): the new array has only N elements, so any index >= N
        # would be out of range for it -- confirm callers.
        if not self.data:
            self.data = gpuarray.empty(N, data.dtype)
        else:
            assert self.data.dtype == data.dtype

        cache_key = (inds.dtype, self.data.dtype)
        try:
            func = self.set_by_inds_array.cache[cache_key]
        except KeyError:
            inds_ctype = tools.dtype_to_ctype(inds.dtype)
            v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
                data_ctype=self.data_ctype, inds_ctype=inds_ctype)
            func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
            self.set_by_inds_array.cache[cache_key] = func
        func(self.data, inds, data, range=slice(0, N, 1))
Exemplo n.º 18
    def add_inds(self, src, dest, inds, dest_shift=0):
        """
        Add `src[i]` to `dest[inds[i]+dest_shift] for i in range(len(inds))`.

        The update is a plain read-modify-write per index.
        NOTE(review): if `inds` contains duplicates the additions to the
        same destination element may not all be applied -- confirm callers
        pass unique indices.

        Parameters
        ----------
        src : GPU array
            Values to add; must have the same dtype as `dest`.
        dest : GPU array
            Array accumulated into, in place.
        inds : GPU array
            Integer indices into `dest` (before shifting).
        dest_shift : int
            Offset added to every index; passed to the kernel as a C `int`.
        """

        assert src.dtype == dest.dtype

        # Nothing to add; an empty launch range would make the kernel fail.
        if len(inds) == 0:
            return

        cache_key = (inds.dtype, src.dtype)
        try:
            func = self.add_inds.cache[cache_key]
        except KeyError:
            inds_ctype = dtype_to_ctype(inds.dtype)
            data_ctype = dtype_to_ctype(src.dtype)
            v = ("{data_ctype} *dest, int dest_shift,"
                 "{inds_ctype} *inds, {data_ctype} *src").format(
                     data_ctype=data_ctype, inds_ctype=inds_ctype)
            func = elementwise.ElementwiseKernel(
                v,
                "dest[inds[i]+dest_shift] = dest[inds[i]+dest_shift] + src[i]")
            self.add_inds.cache[cache_key] = func
        func(dest, int(dest_shift), inds, src, range=slice(0, len(inds), 1))
Exemplo n.º 19
def set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src):
    """
    Set values in a GPUArray by index from indexed values in another GPUArray.

    Performs `dest_gpu[ind_dest[i]] = src_gpu[ind_src[i]]` for every `i`.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind_dest : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `dest_gpu` to set. Must have an integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values.
    ind_src : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `src_gpu` to copy. Must have an integer
        dtype, the same dtype as `ind_dest`, and the same length.
    """

    assert len(np.shape(ind_dest)) == 1
    assert len(np.shape(ind_src)) == 1
    assert dest_gpu.dtype == src_gpu.dtype
    assert ind_dest.dtype == ind_src.dtype
    assert issubclass(ind_dest.dtype.type, numbers.Integral)
    assert issubclass(ind_src.dtype.type, numbers.Integral)
    N = len(ind_src)

    # Manually handle empty index array because it will cause the kernel to
    # fail if processed:
    if N == 0:
        return
    assert N == len(ind_dest)
    if not isinstance(ind_dest, gpuarray.GPUArray):
        ind_dest = gpuarray.to_gpu(ind_dest)
    if not isinstance(ind_src, gpuarray.GPUArray):
        ind_src = gpuarray.to_gpu(ind_src)

    cache_key = (dest_gpu.dtype, ind_dest.dtype)
    try:
        func = set_by_inds_from_inds.cache[cache_key]
    except KeyError:
        data_ctype = dtype_to_ctype(dest_gpu.dtype)
        ind_ctype = dtype_to_ctype(ind_dest.dtype)
        v = "{data_ctype} *dest, {ind_ctype} *ind_dest,"\
            "{data_ctype} *src, {ind_ctype} *ind_src".format(data_ctype=data_ctype, ind_ctype=ind_ctype)
        func = elementwise.ElementwiseKernel(
            v, "dest[ind_dest[i]] = src[ind_src[i]]")
        set_by_inds_from_inds.cache[cache_key] = func
    func(dest_gpu, ind_dest, src_gpu, ind_src, range=slice(0, N, 1))
Exemplo n.º 20
def diff(x_gpu):
    """
    Calculate the discrete difference.

    Calculates the first order difference between the successive
    entries of a vector.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input vector.

    Returns
    -------
    y_gpu : pycuda.gpuarray.GPUArray
        Discrete difference; one element shorter than `x_gpu`.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> x = np.asarray(np.random.rand(5), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = misc.diff(x_gpu)
    >>> np.allclose(np.diff(x), y_gpu.get())
    True
    """

    n_out = len(x_gpu) - 1
    y_gpu = gpuarray.empty(n_out, x_gpu.dtype)

    # A single-element input has no differences; an empty launch range
    # would make the kernel fail.
    if n_out == 0:
        return y_gpu

    try:
        func = diff.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = elementwise.ElementwiseKernel(
            "{ctype} *a, {ctype} *b".format(ctype=ctype), "b[i] = a[i+1]-a[i]")
        diff.cache[x_gpu.dtype] = func

    # Without an explicit range the launch size follows the first array
    # argument (len(x_gpu)), which would read a[len] and write b[len-1],
    # both one element past the end. Restrict it to the output length.
    func(x_gpu, y_gpu, range=slice(0, n_out, 1))
    return y_gpu
Exemplo n.º 21
    def get_by_inds(self, inds):
        """
        Retrieve mapped data specified by integer index.

        Parameters
        ----------
        inds : sequence of int
            Integer indices of data elements to return.

        Returns
        -------
        result : numpy.ndarray
            Selected data, copied back to the host.

        Raises
        ------
        ValueError
            If the mapper holds no data.
        """

        # NOTE(review): relies on the truth value of `self.data`; confirm
        # that this behaves as intended once `self.data` is a GPU array.
        if not self.data:
            raise ValueError('port mapper contains no data')
        assert len(np.shape(inds)) == 1
        assert issubclass(inds.dtype.type, numbers.Integral)

        N = len(inds)
        assert N <= len(self.data)
        # An empty launch range would make the kernel fail:
        if N == 0:
            return np.empty(N, dtype=self.data.dtype)

        result = gpuarray.empty(N, dtype=self.data.dtype)
        if not isinstance(inds, gpuarray.GPUArray):
            inds = gpuarray.to_gpu(inds)

        # The kernel source depends on the data type as well as the index
        # type, so both take part in the cache lookup.
        cache_key = (inds.dtype, self.data.dtype)
        try:
            func = self.get_by_inds.cache[cache_key]
        except KeyError:
            inds_ctype = tools.dtype_to_ctype(inds.dtype)
            v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
                data_ctype=self.data_ctype, inds_ctype=inds_ctype)
            func = elementwise.ElementwiseKernel(v, "dest[i] = src[inds[i]]")
            self.get_by_inds.cache[cache_key] = func
        func(result, inds, self.data, range=slice(0, N, 1))
        return result.get()
Exemplo n.º 22
    def set_by_inds(self, inds, data):
        """
        Set mapped data by integer indices.

        Parameters
        ----------
        inds : sequence of int
            Integer indices of data elements to update.
        data : numpy.ndarray
            Data to assign; must have the same length as `inds`.
        """

        assert len(np.shape(inds)) == 1
        assert issubclass(inds.dtype.type, numbers.Integral)
        N = len(inds)
        assert N == len(data)

        if not isinstance(inds, gpuarray.GPUArray):
            inds = gpuarray.to_gpu(inds)
        if not isinstance(data, gpuarray.GPUArray):
            data = gpuarray.to_gpu(data)

        # Allocate data array if it doesn't exist:
        if not self.data:
            self.data = gpuarray.empty(N, data.dtype)
        else:
            assert self.data.dtype == data.dtype

        # The kernel source depends on the data type as well as the index
        # type, so both take part in the cache lookup.
        cache_key = (inds.dtype, self.data.dtype)
        try:
            func = self.set_by_inds.cache[cache_key]
        except KeyError:
            inds_ctype = tools.dtype_to_ctype(inds.dtype)
            v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
                data_ctype=self.data_ctype, inds_ctype=inds_ctype)
            func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
            self.set_by_inds.cache[cache_key] = func
        func(self.data, inds, data, range=slice(0, N, 1))
Exemplo n.º 23
        self.verbose = 0
        # initially
        self.tau = 1e-5
        self.compute_obj = 1
        self.compute_both = 0
        self.asgui = 0
        self.max_func_evals = 10  # might save additional
        # time, is used in line search
        self.pbb_gradient_norm = 1e-9
        self.beta = 0.0498
        self.sigma = 0.298
        self.unconstrained = False

# Elementwise kernel named "clip2bound": sets dx[i] to 0 wherever x[i] is
# exactly 0 and g[i] is positive; every other entry of dx is left unchanged.
clip2bound = cuelement.ElementwiseKernel(
    "float *dx, float *x, float *g",
    "dx[i] = ((x[i] == 0.f) && (g[i] > 0)) ? 0.f : dx[i]", "clip2bound")

class PBB:
    PBB   --  Optimizes f(x) s.t., x >= 0
    This function solves the following optimization problem
        min f(x) subject to x >= 0
    The implementation follows a 'reverse-communication' interface wherein
    the function f(x) and its gradient f'(x) are computed via function
Exemplo n.º 24
def multiply(x_gpu, y_gpu, overwrite=True):
    """
    Multiply arguments element-wise.

    Parameters
    ----------
    x_gpu, y_gpu : pycuda.gpuarray.GPUArray
        Input arrays to be multiplied.
    overwrite : bool
        If true (default), return the result in `y_gpu`.
        If false, return the result in a newly allocated array.

    Returns
    -------
    z_gpu : pycuda.gpuarray.GPUArray
        The element-wise product of the input arrays.

    Raises
    ------
    ValueError
        If the inputs differ in shape, or if the dtype of `x_gpu` is not
        one of float32, float64, complex64 or complex128.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.asarray(np.random.rand(4, 4), np.float32)
    >>> y = np.asarray(np.random.rand(4, 4), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.to_gpu(y)
    >>> z_gpu = linalg.multiply(x_gpu, y_gpu)
    >>> np.allclose(x*y, z_gpu.get())
    True
    """

    if x_gpu.shape != y_gpu.shape:
        raise ValueError('input arrays must have the same shape')

    if x_gpu.dtype not in [
            np.float32, np.float64, np.complex64, np.complex128
    ]:
        raise ValueError('unrecognized type')

    x_ctype = tools.dtype_to_ctype(x_gpu.dtype)
    y_ctype = tools.dtype_to_ctype(y_gpu.dtype)

    if overwrite:
        func = el.ElementwiseKernel(
            "{x_ctype} *x, {y_ctype} *y".format(x_ctype=x_ctype,
                                                y_ctype=y_ctype),
            "y[i] *= x[i]")
        func(x_gpu, y_gpu)
        return y_gpu
    else:
        # The output takes the promoted type of the two inputs.
        result_type = np.result_type(x_gpu.dtype, y_gpu.dtype)
        alloc = misc._global_cublas_allocator
        z_gpu = gpuarray.empty(x_gpu.shape, result_type, allocator=alloc)
        func = el.ElementwiseKernel(
            "{x_ctype} *x, {y_ctype} *y, {z_type} *z".format(
                x_ctype=x_ctype,
                y_ctype=y_ctype,
                z_type=tools.dtype_to_ctype(result_type)),
            "z[i] = x[i]*y[i]")
        func(x_gpu, y_gpu, z_gpu)
        return z_gpu
Exemplo n.º 25
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Raises
    ------
    NotImplementedError
        If CULA is not installed.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Suppress very small singular values; the rest are inverted in place:
    cutoff_gpu = gpuarray.max(s_gpu) * rcond
    ctype = tools.dtype_to_ctype(s_gpu.dtype)
    cutoff_func = el.ElementwiseKernel(
        "{ctype} *s, {ctype} *cutoff".format(ctype=ctype),
        "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
    cutoff_func(s_gpu, cutoff_gpu)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')
Exemplo n.º 26
def set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src):
    """
    Set values in a GPUArray by index from indexed values in another GPUArray.

    Performs `dest_gpu[ind_dest[i]] = src_gpu[ind_src[i]]` for every `i`.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind_dest : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `dest_gpu` to set. Must have an integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values.
    ind_src : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices in `src_gpu` to copy. Must have an integer
        dtype and be the same length as `ind_dest`.

    Raises
    ------
    ValueError
        If either index array has more than one dimension.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> from nk.tools.gpu import set_by_inds_from_inds
    >>> dest_gpu = gpuarray.to_gpu(np.zeros(5, dtype=np.float32))
    >>> ind_dest = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu =  gpuarray.to_gpu(np.arange(5, 10, dtype=np.float32))
    >>> ind_src =  gpuarray.to_gpu(np.array([2, 3, 4]))
    >>> set_by_inds_from_inds(dest_gpu, ind_dest, src_gpu, ind_src)
    >>> assert np.allclose(dest_gpu.get(), np.array([7, 0, 8, 0, 9], dtype=np.float32))
    """

    if len(np.shape(ind_dest)) > 1:
        raise ValueError('destination index array must be 1D')
    if len(np.shape(ind_src)) > 1:
        raise ValueError('source index array must be 1D')
    assert dest_gpu.dtype == src_gpu.dtype
    assert ind_dest.dtype == ind_src.dtype
    assert issubclass(ind_dest.dtype.type, numbers.Integral)
    assert issubclass(ind_src.dtype.type, numbers.Integral)

    # Manually handle empty index array because it will cause the kernel to
    # fail if processed:
    N = len(ind_src)
    if N == 0:
        return
    assert N == len(ind_dest)
    if not isinstance(ind_dest, gpuarray.GPUArray):
        ind_dest = gpuarray.to_gpu(ind_dest)
    if not isinstance(ind_src, gpuarray.GPUArray):
        ind_src = gpuarray.to_gpu(ind_src)

    cache_key = (dest_gpu.dtype, ind_dest.dtype)
    try:
        func = set_by_inds_from_inds.cache[cache_key]
    except KeyError:
        data_ctype = dtype_to_ctype(dest_gpu.dtype)
        ind_ctype = dtype_to_ctype(ind_dest.dtype)
        v = "{data_ctype} *dest, {ind_ctype} *ind_dest,"\
            "{data_ctype} *src, {ind_ctype} *ind_src".format(data_ctype=data_ctype, ind_ctype=ind_ctype)
        func = elementwise.ElementwiseKernel(
            v, "dest[ind_dest[i]] = src[ind_src[i]]")
        set_by_inds_from_inds.cache[cache_key] = func
    func(dest_gpu, ind_dest, src_gpu, ind_src, range=slice(0, N, 1))
Exemplo n.º 27
def set_by_inds(dest_gpu, ind, src_gpu, ind_which='dest'):
    """
    Set values in a GPUArray by index.

    Parameters
    ----------
    dest_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    ind : pycuda.gpuarray.GPUArray or numpy.ndarray
        1D array of element indices to set. Must have an integer dtype.
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to set values.
    ind_which : str
        If set to 'dest', set the elements in `dest_gpu` with indices `ind`
        to the successive values in `src_gpu`; the lengths of `ind` and
        `src_gpu` must be equal. If set to 'src', set the
        successive values in `dest_gpu` to the values in `src_gpu` with indices
        `ind`; the lengths of `ind` and `dest_gpu` must be equal.

    Raises
    ------
    ValueError
        If either array argument is a scalar, `ind` is not 1D or not of
        integer dtype, the data dtypes differ, or `ind_which` is neither
        'dest' nor 'src'.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> from nk.tools.gpu import set_by_inds
    >>> dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32))
    >>> set_by_inds(dest_gpu, ind, src_gpu, 'dest')
    >>> np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.float32))
    True
    >>> dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.float32))
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    >>> set_by_inds(dest_gpu, ind, src_gpu, 'src')
    >>> np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.float32))
    True

    Notes
    -----
    Only supports 1D index arrays.

    May not be efficient for certain index patterns because the memory
    operations cannot be coalesced.
    """

    if np.isscalar(src_gpu) or np.isscalar(dest_gpu):
        raise ValueError('data must be array-like')
    if len(np.shape(ind)) > 1:
        raise ValueError('index array must be 1D')

    # Manually handle empty index array because it will cause the kernel to
    # fail if processed:
    N = len(ind)
    if N == 0:
        return

    if not issubclass(ind.dtype.type, numbers.Integral):
        raise ValueError('index array must contain integers')
    if not dest_gpu.dtype == src_gpu.dtype:
        raise ValueError('dest_gpu.dtype != src_gpu.dtype')

    if ind_which == 'dest':
        assert N == len(src_gpu)
    elif ind_which == 'src':
        assert N == len(dest_gpu)
    else:
        raise ValueError('invalid value for `ind_which`')
    if not isinstance(ind, gpuarray.GPUArray):
        ind = gpuarray.to_gpu(ind)

    # The two modes use different kernel bodies, so the mode is part of the
    # cache key.
    cache_key = (dest_gpu.dtype, ind.dtype, ind_which)
    try:
        func = set_by_inds.cache[cache_key]
    except KeyError:
        data_ctype = dtype_to_ctype(dest_gpu.dtype)
        ind_ctype = dtype_to_ctype(ind.dtype)
        v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(
            data_ctype=data_ctype, ind_ctype=ind_ctype)

        if ind_which == 'dest':
            func = elementwise.ElementwiseKernel(v, "dest[ind[i]] = src[i]")
        else:
            func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]")
        set_by_inds.cache[cache_key] = func
    func(dest_gpu, ind, src_gpu, range=slice(0, N, 1))
Exemplo n.º 28
def gen_simps_mult(N, dtype, even='avg'):
    """
    Generate multiplication array for composite Simpson's rule.

    Generates an array whose dot product with some array of equal
    length is equivalent to the definite integral of the latter
    computed using composite Simpson's rule.

    If there are an even number of samples, N, then there are an odd
    number of intervals (N-1), but Simpson's rule requires an even number
    of intervals. The parameter 'even' controls how this is handled.

    Parameters
    ----------
    N : int
        Length of array.
    dtype : float type
        Floating point type to use when generating the array.
    even : str {'avg', 'first', 'last'}, optional
        'avg' : Average two results: 1) use the first N-2 intervals with
                  a trapezoidal rule on the last interval and 2) use the last
                  N-2 intervals with a trapezoidal rule on the first interval.
        'first' : Use Simpson's rule for the first N-2 intervals with
                a trapezoidal rule on the last interval.
        'last' : Use Simpson's rule for the last N-2 intervals with a
               trapezoidal rule on the first interval.

    Returns
    -------
    result : pycuda.gpuarray.GPUArray
        Generated array.

    Raises
    ------
    ValueError
        If `dtype` is not one of float32, float64, complex64 or complex128,
        or if `N` is even and `even` is not a recognised option.
    """

    if dtype not in [np.float32, np.float64, np.complex64,
                     np.complex128]:
        raise ValueError('unrecognized type')

    ctype = tools.dtype_to_ctype(dtype)
    x_gpu = gpuarray.zeros(N, dtype)

    if N % 2:
        # Odd sample count: weights 1, 4, 2, 4, ..., 2, 4, 1 (then / 3).
        func = elementwise.ElementwiseKernel("{ctype} *x".format(ctype=ctype),
                                             "x[i] = (i%2 == 0) ? ((i != 0 && i != {M}) ? 2. : 1.) : 4.".format(M=N-1))
        # The kernel must actually run; otherwise the zero array is returned.
        func(x_gpu)
        return x_gpu/3.
    else:
        if even not in ['avg', 'last', 'first']:
            raise ValueError("Parameter 'even' must be "
                             "'avg', 'last', or 'first'.")
        # Simpson weights for the N-1 samples covered by an even number of
        # intervals; the remaining interval is handled by the trapezoid rule.
        basic_simps = gen_simps_mult(N-1, dtype)
        if even in ['avg', 'first']:
            x_gpu[:-1] += basic_simps
            x_gpu[-2:] += 0.5 # trapz on last interval
        if even in ['avg', 'last']:
            x_gpu[1:] += basic_simps
            x_gpu[:2] += 0.5 # trapz on first interval
        if even == 'avg':
            x_gpu /= 2.
        return x_gpu
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray, elementwise

# Square every element of a host array on the GPU with an elementwise kernel.
x = np.arange(0, 1001, dtype=np.uint32)
y = np.zeros(1001, np.uint32)
# Both arrays are uint32, so both kernel arguments are declared unsigned.
kernel = elementwise.ElementwiseKernel(
    arguments="unsigned int* x, unsigned int* y",
    operation="y[i] = x[i] * x[i]",
)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)
kernel(x_gpu, y_gpu)

Exemplo n.º 30
def _get_scale_kernel(dtype):
    """
    Build an elementwise kernel that divides an array by a scalar in place.

    Parameters
    ----------
    dtype : numpy dtype
        Element type of the array; the scalar argument uses the same C type.

    Returns
    -------
    ElementwiseKernel
        Kernel taking `(scale, x)` and performing `x[i] /= scale`.
    """
    c_name = tools.dtype_to_ctype(dtype)
    signature = "{ctype} scale, {ctype} *x".format(ctype=c_name)
    body = "x[i] /= scale"
    return el.ElementwiseKernel(signature, body)