def __init__(self, dtype):
    from pycuda.curandom import md5_code
    from pycuda.elementwise import get_elwise_kernel

    if dtype == numpy.complex64:
        # md5_code leaves four uniform 32-bit words a, b, c, d per thread;
        # scaling by 2**-32 maps each word into [0, 1).
        self._func = get_elwise_kernel(
                "float2 *dest, unsigned int seed",
                md5_code + """
                #define POW_2_M32 (1/4294967296.0f)
                dest[i] = make_float2(a*POW_2_M32, b*POW_2_M32);
                if ((i += total_threads) < n)
                    dest[i] = make_float2(c*POW_2_M32, d*POW_2_M32);
                """,
                "md5_rng_float")
    elif dtype == numpy.complex128:
        # Two 32-bit words are combined per component so that the full
        # double-precision mantissa receives random bits.
        self._func = get_elwise_kernel(
                "pycuda::complex<double> *dest, unsigned int seed",
                md5_code + """
                #define POW_2_M32 (1/4294967296.0)
                #define POW_2_M64 (1/18446744073709551616.)
                dest[i] = pycuda::complex<double>(
                    a*POW_2_M32 + b*POW_2_M64,
                    c*POW_2_M32 + d*POW_2_M64);
                """,
                "md5_rng_float")
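# A minimal host-side sketch (a hypothetical helper, not part of the original
# module) of the scaling used in the kernels above: one 32-bit word scaled by
# 2**-32 gives a uniform float in [0, 1), and two words combined as
# a*2**-32 + b*2**-64 supply the extra mantissa bits a double needs.
def _demo_md5_scaling():
    rng = numpy.random.default_rng(0)
    a, b = rng.integers(0, 2**32, size=2, dtype=numpy.uint64)
    single = numpy.float32(a * 2.0**-32)                 # float32 in [0, 1)
    double = numpy.float64(a * 2.0**-32 + b * 2.0**-64)  # float64 in [0, 1)
    return single, double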
def rand(shape, dtype=numpy.float32, stream=None):
    from pycuda.gpuarray import GPUArray
    from pycuda.curandom import md5_code
    from pycuda.elementwise import get_elwise_kernel

    result = GPUArray(shape, dtype)

    # Each thread fills up to four elements, one per 32-bit MD5 word.
    if dtype == numpy.float32:
        func = get_elwise_kernel(
                "float *dest, unsigned int seed",
                md5_code + """
                #define POW_2_M32 (1/4294967296.0f)
                dest[i] = a*POW_2_M32;
                if ((i += total_threads) < n)
                    dest[i] = b*POW_2_M32;
                if ((i += total_threads) < n)
                    dest[i] = c*POW_2_M32;
                if ((i += total_threads) < n)
                    dest[i] = d*POW_2_M32;
                """,
                "md5_rng_float")
    elif dtype == numpy.float64:
        func = get_elwise_kernel(
                "double *dest, unsigned int seed",
                md5_code + """
                #define POW_2_M32 (1/4294967296.0)
                #define POW_2_M64 (1/18446744073709551616.)
                dest[i] = a*POW_2_M32 + b*POW_2_M64;
                if ((i += total_threads) < n)
                {
                    dest[i] = c*POW_2_M32 + d*POW_2_M64;
                }
                """,
                "md5_rng_float")
    elif dtype in [numpy.int32, numpy.uint32]:
        func = get_elwise_kernel(
                "unsigned int *dest, unsigned int seed",
                md5_code + """
                dest[i] = a;
                if ((i += total_threads) < n)
                    dest[i] = b;
                if ((i += total_threads) < n)
                    dest[i] = c;
                if ((i += total_threads) < n)
                    dest[i] = d;
                """,
                "md5_rng_int")
    else:
        raise NotImplementedError

    func.set_block_shape(*result._block)
    func.prepared_async_call(result._grid, stream,
            result.gpudata, numpy.random.randint(2**31-1), result.size)

    return result
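# Hypothetical usage sketch for rand() above, assuming a live CUDA context
# (e.g. created via pycuda.autoinit); shape and dtype are arbitrary.
def _demo_rand():
    samples = rand((1000,), dtype=numpy.float32)  # uniform [0, 1) on the GPU
    host = samples.get()                          # copy back to the host
    return host.min(), host.max()                 # both should lie in [0, 1)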
def guarded_div_kernel(self, dtype_x, dtype_y, dtype_z):
    from pycuda.elementwise import get_elwise_kernel
    from pycuda.tools import dtype_to_ctype

    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = y[i] == 0 ? 0 : (x[i] / y[i])",
            "divide")
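# Hypothetical launch sketch for the guarded division kernel, mirroring the
# prepared-call convention used in rand() above; `obj` stands for an instance
# of the (unnamed) class that defines guarded_div_kernel.
def _demo_guarded_div(obj):
    from pycuda import gpuarray
    x = gpuarray.to_gpu(numpy.arange(8, dtype=numpy.float32))
    y = gpuarray.to_gpu(numpy.array([0, 1, 2, 0, 4, 5, 0, 7],
                                    dtype=numpy.float32))
    z = gpuarray.empty_like(x)

    func = obj.guarded_div_kernel(x.dtype, y.dtype, z.dtype)
    func.set_block_shape(*z._block)
    func.prepared_async_call(z._grid, None,
            x.gpudata, y.gpudata, z.gpudata, z.size)
    return z.get()  # zeros in y yield 0 instead of inf/nan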
def make_kernel_internal(self, args, instructions):
    from pycuda.elementwise import get_elwise_kernel
    return get_elwise_kernel(args, instructions, name="vector_expression")
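# Illustrative sketch of the strings this wrapper forwards to
# get_elwise_kernel: a C parameter list and a per-element statement over
# index i. The saxpy expression is hypothetical; `obj` is an assumed instance.
def _demo_make_kernel(obj):
    return obj.make_kernel_internal(
            "float a, float *x, float *y, float *z",
            "z[i] = a*x[i] + y[i]")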
    else:
        raise ValueError("Incompatible dtype")

    return df


def linear(x):
    pass


def df_linear(x):
    return x


sample_dropout_mask_kernel = get_elwise_kernel(
    "float *mat, float *dropout, float dropout_probability",
    """
    if (dropout[i] <= dropout_probability) {
        dropout[i] = 0.;
        mat[i] = 0.;
    } else {
        dropout[i] = 1.;
    }
    """,
    "sample_dropout_mask")


def sample_dropout_mask(x, dropout_probability=.5, columns=None, stream=None):
    """Samples a dropout mask and applies it in place."""
    assert x.flags.c_contiguous

    if columns is not None:
        assert len(columns) == 2
        x_tmp = x
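# Hypothetical end-to-end sketch for the dropout kernel above: draw uniform
# noise, then launch sample_dropout_mask_kernel to mask activations in place.
# It assumes rand() from earlier in this section is in scope, a live CUDA
# context, and the same prepared-call launch convention used by rand().
def _demo_dropout(shape=(4, 256), dropout_probability=.5):
    mat = rand(shape, dtype=numpy.float32)      # stand-in activations
    dropout = rand(shape, dtype=numpy.float32)  # uniform noise in [0, 1)

    sample_dropout_mask_kernel.set_block_shape(*mat._block)
    sample_dropout_mask_kernel.prepared_async_call(mat._grid, None,
            mat.gpudata, dropout.gpudata,
            numpy.float32(dropout_probability), mat.size)
    return mat, dropout  # dropout now holds the 0/1 mask; mat is masked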