def elemwise_layouts_mixed(shape, offseted_outer, offseted_inner, sliced, order):
    """Check ``c[i] = a[i] + b`` through every ElemwiseKernel entry point.

    A float32 array is generated with the requested layout options
    (``sliced``, ``order``, ``offseted_outer``, ``offseted_inner``), a
    float32 scalar ``2.0`` is added to it on the device, and the result of
    each launch path is compared against the same sum computed with NumPy.
    """
    ac, ag = gen_gpuarray(shape, dtype='float32', sliced=sliced, order=order,
                          offseted_outer=offseted_outer,
                          offseted_inner=offseted_inner, ctx=context)
    b = numpy.asarray(2.0, dtype='float32')
    k = ElemwiseKernel(context, "float *a, float b, float *c",
                       "c[i] = a[i] + b")

    # Host-side reference shared by all launch paths.
    outc = ac + b

    launchers = (
        k,                    # will use contig or basic
        k.call_basic,         # test basic
        k.call_dimspec,       # test dimspec
        k.call_specialized,   # test specialized
    )
    for launch in launchers:
        # A fresh output buffer per path so no launch can pass on leftovers
        # written by a previous one.
        outg = gpuarray.empty(shape, dtype='float32', context=context)
        launch(ag, b, outg)
        assert numpy.allclose(numpy.asarray(outg), outc)
def perform(self, node, inputs, outs):
    """Fill the output storage with ``inputs[0]`` at the requested shape.

    ``inputs[1:]`` are the dimensions of the result. The buffer held in the
    single output cell is reused when it already has that shape; otherwise
    a new uninitialised GPU array with the dtype of the value is allocated.
    """
    (storage,) = outs
    value = inputs[0]
    target_shape = tuple(int(dim) for dim in inputs[1:])

    current = storage[0]
    if current is None or current.shape != target_shape:
        storage[0] = gpuarray.empty(target_shape, dtype=value.dtype)

    # Assign the value over the whole buffer (fresh or reused).
    storage[0][...] = value
def test_reduction_wrong_type():
    """``sum(out=...)`` must raise TypeError for an incompatible ``out``.

    Two bad output buffers are tried against a (2, 3) float32 array: one
    with a different dtype (int32) and one with a different shape (3, 2).
    """
    # Only the GPU array is needed; the host copy is discarded.
    _, g = gen_gpuarray((2, 3), dtype='float32', ctx=context, cls=elemary)

    out1 = gpuarray.empty((2, 3), dtype='int32', context=context)
    out2 = gpuarray.empty((3, 2), dtype='float32', context=context)

    try:
        # The return value is irrelevant: reaching the next line is the
        # failure condition.
        g.sum(out=out1)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass

    try:
        g.sum(out=out2)
        assert False, "Expected a TypeError out of the sum"
    except TypeError:
        pass
def test_reduction_wrong_type():
    """Verify that ``sum`` rejects unsuitable ``out`` arrays with TypeError.

    The source is a (2, 3) float32 array; the rejected outputs are an int32
    array and a float32 array of shape (3, 2).
    """
    c, g = gen_gpuarray((2, 3), dtype='float32', ctx=context, cls=elemary)

    bad_outputs = (
        gpuarray.empty((2, 3), dtype='int32', context=context),
        gpuarray.empty((3, 2), dtype='float32', context=context),
    )
    for bad_out in bad_outputs:
        try:
            g.sum(out=bad_out)
        except TypeError:
            # Expected outcome; move on to the next output.
            continue
        assert False, "Expected a TypeError out of the sum"
def test_hash():
    """Check that hashing a GPU array raises TypeError.

    The ``except ... as`` form is used so the test parses on both
    Python 2.6+ and Python 3 (the comma form is Python-2-only syntax).
    """
    g = gpu_ndarray.empty((2, 3), context=ctx)
    exc = None
    try:
        hash(g)
    except TypeError as e:
        exc = e
    # Without this check the test could never fail: the captured exception
    # was recorded but not inspected.
    assert exc is not None
def test_elemwise_bool():
    """Check that ``bool()`` of a two-element GPU array raises ValueError.

    The ``except ... as`` form is used so the test parses on both
    Python 2.6+ and Python 3 (the comma form is Python-2-only syntax).
    """
    a = gpuarray.empty((2,), context=context)
    exc = None
    try:
        bool(a)
    except ValueError as e:
        exc = e
    # Without this check the test could never fail: the captured exception
    # was recorded but not inspected.
    assert exc is not None
def test_elemwise_bool():
    """Truth-value checks for GPU arrays.

    * ``bool()`` of a two-element array must raise ValueError.
    * A zero-filled array of shape ``(1, )`` and one of shape ``()`` must
      both be falsy.
    """
    a = gpuarray.empty((2, ), context=context)
    raised = False
    try:
        bool(a)
    except ValueError:
        raised = True
    assert raised

    for shp in ((1, ), ()):
        a = gpuarray.zeros(shp, context=context)
        assert not bool(a)
def test_elemwise_bool():
    """Check truthiness rules of GPU arrays.

    Converting a two-element array to ``bool`` has to fail with ValueError,
    while zero-filled arrays holding a single element (shape ``(1,)`` and
    shape ``()``) have to evaluate as false.
    """
    multi = gpuarray.empty((2,), context=context)
    try:
        bool(multi)
    except ValueError:
        pass
    else:
        # No exception means the conversion was wrongly accepted.
        assert False

    one_elem = gpuarray.zeros((1,), context=context)
    assert not one_elem

    zero_dim = gpuarray.zeros((), context=context)
    assert not zero_dim
def perform(self, node, inputs, outs):
    """Fill the output storage with ``inputs[0]`` at the requested shape.

    ``inputs[1:]`` are the dimensions of the result. When a new buffer has
    to be allocated and the value is a single element equal to zero, the
    buffer is created directly with ``gpuarray.zeros`` instead of
    allocating and then assigning. An existing buffer of the right shape is
    reused and overwritten. If ``config.gpuarray.sync`` is set, the output
    is synchronised before returning.
    """
    out, = outs
    v = inputs[0]
    sh = tuple(map(int, inputs[1:]))
    if out[0] is None or out[0].shape != sh:
        # ``.item()`` on the whole array works for every size-1 array,
        # including 0-d ones; indexing with ``[0]`` first would raise
        # IndexError for a 0-d value.
        if v.size == 1 and numpy.asarray(v).item() == 0:
            out[0] = gpuarray.zeros(sh, dtype=v.dtype)
        else:
            out[0] = gpuarray.empty(sh, dtype=v.dtype)
            out[0][...] = v
    else:
        out[0][...] = v
    if config.gpuarray.sync:
        out[0].sync()
def perform(self, node, inputs, outs):
    """Fill the output storage with ``inputs[0]`` at the requested shape.

    ``inputs[1:]`` give the dimensions of the result. A buffer is allocated
    only when the output cell is empty or holds an array of another shape.
    When allocating and ``self.memset_0`` is true, ``gpuarray.zeros`` is
    used and no assignment is done; in all other cases the value is
    assigned over the (new or reused) buffer. The output is synchronised
    afterwards if ``config.gpuarray.sync`` is set.
    """
    (cell,) = outs
    value = inputs[0]
    shape = tuple(int(dim) for dim in inputs[1:])

    must_allocate = cell[0] is None or cell[0].shape != shape
    if must_allocate and self.memset_0:
        cell[0] = gpuarray.zeros(shape, dtype=value.dtype)
    else:
        if must_allocate:
            cell[0] = gpuarray.empty(shape, dtype=value.dtype)
        cell[0][...] = value

    if config.gpuarray.sync:
        cell[0].sync()
def reduction_op(op, dtype, axis):
    """Compare the reduction method named ``op`` between host and GPU.

    The method is looked up by name on a (2, 3) host array and on its GPU
    counterpart. It is run once returning a new array and once writing into
    a preallocated ``out`` array; both times the GPU result is compared
    with the host result via ``check_meta_content``.
    """
    c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary)
    host_reduce = getattr(c, op)
    gpu_reduce = getattr(g, op)

    # Variant 1: the reduction allocates its own result.
    rc = host_reduce(axis=axis)
    rg = gpu_reduce(axis=axis)
    check_meta_content(rg, rc)

    # Variant 2: the reduction writes into caller-provided buffers that
    # have the shape and dtype of the first results.
    outc = numpy.empty(rc.shape, dtype=rc.dtype)
    outg = gpuarray.empty(rg.shape, dtype=rg.dtype, context=context)
    host_reduce(axis=axis, out=outc)
    gpu_reduce(axis=axis, out=outg)
    check_meta_content(outg, outc)
def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True):
    """Add two generated arrays with a hand-built GpuElemwise kernel.

    This is a rewritten version of elemwise2 that skips the scaling of the
    output done in the official elemwise2 function. The GPU sum is compared
    against the NumPy sum of the host copies with ``atol=1e-6``.
    """
    na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
    nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)

    odtype = get_common_dtype(ga, gb, True)
    res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context,
                         cls=ga.__class__)

    a_arg = as_argument(ga, 'a', read=True)
    b_arg = as_argument(gb, 'b', read=True)
    res_arg = as_argument(res, 'res', write=True)

    # Both operands are cast to the common output C type before adding.
    substitutions = {'op': '+', 'out_t': dtype_to_ctype(odtype)}
    oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % substitutions

    k = GpuElemwise(ga.context, oper, [res_arg, a_arg, b_arg],
                    convert_f16=True)
    k(res, ga, gb, broadcast=broadcast)

    assert numpy.allclose(na + nb, numpy.asarray(res), atol=1e-6)
def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True):
    """Run ``a + b`` through GpuElemwise and compare with NumPy.

    We rewrite this version of elemwise2 to skip the scaling of output that
    is done in the official elemwise2 function. The output array is
    allocated with the common dtype of the two GPU inputs and the class and
    context of the first one.
    """
    na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
    nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)
    odtype = get_common_dtype(ga, gb, True)
    res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context,
                         cls=ga.__class__)

    # Kernel arguments; the output comes first in the argument list.
    in_a = as_argument(ga, 'a', read=True)
    in_b = as_argument(gb, 'b', read=True)
    out_res = as_argument(res, 'res', write=True)
    args = [out_res, in_a, in_b]

    out_ctype = dtype_to_ctype(odtype)
    oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % dict(op='+',
                                                           out_t=out_ctype)

    kernel = GpuElemwise(ga.context, oper, args, convert_f16=True)
    kernel(res, ga, gb, broadcast=broadcast)

    expected = na + nb
    actual = numpy.asarray(res)
    assert numpy.allclose(expected, actual, atol=1e-6)
def test_reduce_scatter(self):
    """Exercise ``self.gpucomm.reduce_scatter`` with the ``'sum'`` operation.

    Every participant contributes ``arange(k) + self.rank``. The expected
    full reduction ``texp`` is ``self.size * arange(k) + sum(range(self.size))``
    (this presumes ranks are ``0 .. self.size - 1`` -- confirm against the
    communicator setup), and ``exp`` is the slice of it belonging to
    ``self.rank``.

    Scenarios, in order: explicit destination array (C and F order),
    library-allocated result for C and F layouts, inputs that are expected
    to raise TypeError because they cannot be split (only when
    ``self.size != 1``), and the 15-element-per-rank variants.

    NOTE(review): ``np.reshape`` returns the reshaped array and does not
    modify its argument. Every ``np.reshape(...)`` call below discards its
    result, so ``cpu`` and ``exp`` stay one-dimensional and the layouts
    named in the comments are not actually produced. Confirm whether
    ``cpu = np.reshape(...)`` / ``exp = np.reshape(...)`` was intended
    before relying on this test for multi-dimensional coverage.
    """
    texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
    exp = texp[self.rank * 5:self.rank * 5 + 5]

    # order c
    cpu = np.arange(5 * self.size) + self.rank
    # NOTE(review): result discarded, `cpu` is still 1-D (see docstring).
    np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = gpuarray.empty((5, ), dtype='int64', order='C', context=self.ctx)

    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # order f
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = gpuarray.empty((5, ), dtype='int64', order='F', context=self.ctx)

    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # make result order c (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    # No destination given: the communicator returns the result array.
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # c-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        np.reshape(cpu, (self.size + 1, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order f (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True

    # f-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        np.reshape(cpu, (5, self.size + 1), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order c (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    np.reshape(exp, (3, 5), order='C')
    cpu = np.arange(5 * self.size * 3) + self.rank
    np.reshape(cpu, (self.size * 3, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # make result order f (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    np.reshape(exp, (5, 3), order='F')
    cpu = np.arange(5 * self.size * 3) + self.rank
    np.reshape(cpu, (5, self.size * 3), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True
def gpu_alloc_expected(x, *shp):
    """Build the reference result of allocating ``x`` at shape ``shp``.

    An uninitialised GPU array with the dtype of ``x`` is created in the
    test context and then filled by assigning ``x`` over all of it.
    """
    target_ctx = get_context(test_ctx_name)
    expected = gpuarray.empty(shp, dtype=x.dtype, context=target_ctx)
    expected[:] = x
    return expected
def empty(shp, order, dtype):
    """Check that a GPU ``empty`` array has the same metadata as NumPy's.

    Both arrays are created with the same shape, dtype and memory order and
    are then handed to ``check_meta``; contents are not compared.
    """
    on_gpu = gpu_ndarray.empty(shp, dtype, order, context=ctx)
    on_host = numpy.empty(shp, dtype=dtype, order=order)
    check_meta(on_gpu, on_host)
def ufunc21(name, a, b, out=None, context=None):
    """Call a ufunc with 2 inputs and 1 output.

    Parameters
    ----------
    name : str
        Name of the NumPy ufunc.
    a, b : `array-like`
        Input arrays to which the ufunc should be applied.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array in which to store the result.
    context : `pygpu.gpuarray.GpuContext`, optional
        Use this GPU context to evaluate the GPU kernel. For ``None``,
        if no GPU array is among the provided parameters, a default
        GPU context must have been set.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Result of the computation. If ``out`` was given, the returned
        object is a reference to it.
        The type of the returned array is `pygpu._array.ndgpuarray` if

        - no GPU array was among the parameters or
        - one of the parameters had type `pygpu._array.ndgpuarray`.

    Raises
    ------
    ValueError
        If a GPU array among ``a``, ``b``, ``out`` lives in a context
        different from the one already selected.
    TypeError
        If no valid signature is found for the inputs, or for ``-`` on
        boolean arrays when the installed NumPy is at least 1.13.
    NotImplementedError
        If the metadata of the ufunc has none of ``c_op``, ``c_func``,
        ``oper_fmt`` set.

    Notes
    -----
    NOTE(review): apart from the context/class detection loop, the body
    never reads ``b``. The signature lookup, the generated ``oper`` string
    and the kernel launch only involve ``a``, and ``meta['nin'] == 1`` is
    asserted. This looks like an unfinished adaptation of a one-input
    implementation -- confirm before using it for binary ufuncs.
    """
    # Lazy import to avoid circular dependency
    from pygpu._array import ndgpuarray

    # --- Prepare input array --- #

    # Determine GPU context and class. Use the "highest" class present in the
    # inputs, defaulting to `ndgpuarray`
    need_context = True
    cls = None
    for ary in (a, b, out):
        if isinstance(ary, GpuArray):
            if context is not None and ary.context != context:
                raise ValueError('cannot mix contexts')
            context = ary.context
            # Keep a subclass once one has been seen; only replace `None`
            # or the plain `GpuArray` base class.
            if cls is None or cls == GpuArray:
                cls = ary.__class__
            need_context = False

    if need_context and context is None:
        context = get_default_context()
        cls = ndgpuarray

    # Cast input to `GpuArray` of the right dtype if necessary
    # TODO: figure out what to do here exactly (scalars and such)
    if isinstance(a, (GpuArray, numpy.ndarray)):
        # Preserve Fortran order only for arrays that are exclusively
        # F-contiguous.
        if a.flags.f_contiguous and not a.flags.c_contiguous:
            order = 'F'
        else:
            order = 'C'

        # Determine signature here to avoid creating an intermediate GPU array
        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

        tc_in, _ = sig.split('->')
        a = array(a, dtype=tc_in, copy=False, order=order,
                  context=context, cls=cls)
    else:
        # Not an array yet: convert first, then look up the signature for
        # the converted array.
        a = array(a, context=context, cls=cls)

        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

    # Upcast input if necessary
    tc_in, tc_out = sig.split('->')
    # NOTE(review): compares a dtype with a type-code string; presumably
    # relies on NumPy's dtype ordering (castability) -- confirm.
    if a.dtype < tc_in:
        a = a.astype(tc_in)

    # Create output array if not provided
    if out is None:
        out = empty(a.shape, dtype=tc_out, context=context, cls=cls)

    # --- Generate code strings for GpuElemwise --- #

    # C dtypes for casting
    c_dtype_in = dtype_to_ctype(tc_in)
    c_dtype_out = dtype_to_ctype(tc_out)

    meta = ufunc_metadata[name]
    # NOTE(review): these assertions restrict the function to one-input
    # ufuncs, in conflict with the two-input signature.
    assert meta['nin'] == 1
    assert meta['nout'] == 1

    # Create `oper` string
    if meta['c_op'] is not None:
        # Case 1: unary operator
        unop = meta['c_op']
        # NOTE(review): the `numpy.bool` alias is not available in every
        # NumPy release (removed in 1.24) -- consider `numpy.bool_`.
        if a.dtype == numpy.bool and unop == '-':
            if parse_version(numpy.__version__) >= parse_version('1.13'):
                # Numpy >= 1.13 raises a TypeError
                raise TypeError(
                    'negation of boolean arrays is not supported, use '
                    '`logical_not` instead')
            else:
                # Warn and remap to logical not
                warnings.warn(
                    'using negation (`-`) with boolean arrays is '
                    'deprecated, use `logical_not` (`~`) instead; '
                    'the current behavior will be changed along '
                    "with NumPy's", FutureWarning)
                unop = '!'
        oper = 'out = ({odt}) {}a'.format(unop, odt=c_dtype_out)
        preamble = ''

    elif meta['c_func'] is not None:
        # Case 2: C function
        c_func = meta['c_func']

        if name in ('abs', 'absolute'):
            # Special case
            if numpy.dtype(tc_out).kind == 'u':
                # Shortcut for abs() with unsigned int. This also fixes a CUDA
                # quirk that makes abs() crash with unsigned int input.
                out[:] = a
                return out
            elif numpy.dtype(tc_out).kind == 'f':
                c_func = 'fabs'
            else:
                c_func = 'abs'

        oper = 'out = ({odt}) {}(a)'.format(c_func, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    elif meta['oper_fmt'] is not None:
        # Case 3: custom implementation with `oper` template
        oper = meta['oper_fmt'].format(idt=c_dtype_in, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    else:
        # Case 4: not implemented
        raise NotImplementedError('ufunc {!r} not implemented'.format(name))

    # --- Generate and run GpuElemwise kernel --- #
    a_arg = as_argument(a, 'a', read=True)
    args = [arg('out', out.dtype, write=True), a_arg]

    ker = GpuElemwise(context, oper, args, preamble=preamble)
    ker(out, a)
    return out
def test_empty_no_dtype():
    """``empty`` without dtype/order must match NumPy's default metadata.

    A zero-dimensional array is created on the GPU and on the host, both
    relying on the default dtype and order, and compared with
    ``check_meta``.
    """
    scalar_shape = ()
    # no dtype and order param
    x = gpu_ndarray.empty(scalar_shape, context=ctx)
    y = numpy.empty(scalar_shape)
    check_meta(x, y)
def test_empty_no_params():
    """Calling ``empty`` with no arguments at all must raise TypeError."""
    try:
        gpu_ndarray.empty()
    except TypeError:
        # Expected: the call was rejected.
        return
    assert False
def test_empty_no_dtype():
    """Compare default-dtype ``empty`` arrays between GPU and NumPy.

    Neither call passes a dtype or an order, so ``check_meta`` compares the
    metadata produced by the defaults of both libraries for shape ``()``.
    """
    check_meta(
        gpu_ndarray.empty((), context=ctx),  # no dtype and order param
        numpy.empty(()),
    )
def elemwise_collapse(dtype1, dtype2, shape1, shape2, expected):
    """Check dimension collapsing for an element-wise sum of two arrays.

    ``check_args`` is asked to collapse (with broadcasting) the arguments
    and must report ``expected`` dimensions, both for ``(a, b)`` and for
    ``(a, b, scalar)``. The kernels ``o = a + b`` and ``o = a + b + s`` are
    run with and without collapsing and compared against NumPy.

    For 4-d shapes whose first dimension of ``shape1`` is not 1, a sliced
    version of the first array is checked as well: with any length-1
    dimension present in either shape the collapsed count must be at least
    ``expected``, otherwise exactly ``expected`` (or 2 when ``expected``
    is 1).
    """
    assert len(shape1) == len(shape2)

    # int8 does not cause problematic upcasts
    scalar = numpy.asarray(1, dtype='int8')

    a_cpu, a_gpu = gen_gpuarray(shape1, dtype1, ctx=context)
    b_cpu, b_gpu = gen_gpuarray(shape2, dtype2, ctx=context)

    # Broadcast output shape: the larger extent along each dimension.
    o_shape = [max(d1, d2) for d1, d2 in zip(shape1, shape2)]
    o = gpuarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype, context=context)

    # --- two array operands ---
    _, nd, dims, _, _, _ = check_args((a_gpu, b_gpu),
                                      collapse=True, broadcast=True)
    assert nd == expected, (shape1, shape2, dims, nd, expected)

    k = ElemwiseKernel(context,
                       [ArrayArg(numpy.dtype(dtype1), 'a'),
                        ArrayArg(numpy.dtype(dtype2), 'b'),
                        ArrayArg(o.dtype, 'o')],
                       "o[i] = a[i] + b[i]")
    out_cpu = a_cpu + b_cpu
    for collapse in (True, False):
        k(a_gpu, b_gpu, o, collapse=collapse, broadcast=True)
        assert numpy.allclose(numpy.asarray(o), out_cpu)

    # True as soon as either shape holds a dimension of length 1.
    broadcast = any(dim == 1 for dim in shape1 + shape2)

    # --- two array operands plus a scalar ---
    _, nd, _, _, _, _ = check_args((a_gpu, b_gpu, scalar),
                                   collapse=True, broadcast=True)
    assert nd == expected

    k = ElemwiseKernel(context,
                       [ArrayArg(numpy.dtype(dtype1), 'a'),
                        ArrayArg(numpy.dtype(dtype2), 'b'),
                        ScalarArg(scalar.dtype, 's'),
                        ArrayArg(o.dtype, 'o')],
                       "o[i] = a[i] + b[i] + s")
    out_cpu = a_cpu + b_cpu + scalar
    for collapse in (True, False):
        k(a_gpu, b_gpu, scalar, o, collapse=collapse, broadcast=True)
        assert numpy.allclose(numpy.asarray(o), out_cpu)

    expected2 = 2 if expected == 1 else expected

    # The sliced check only applies to 4-d shapes with a non-unit first
    # dimension in `shape1`.
    if len(shape1) != 4 or shape1[0] == 1:
        return

    c_cpu, c_gpu = gen_gpuarray(shape1, dtype=dtype1, sliced=2, ctx=context)
    _, nd, _, _, _, _ = check_args((c_gpu, b_gpu),
                                   collapse=True, broadcast=True)
    if broadcast:
        assert nd >= expected
    else:
        assert nd == expected2
def test_reduce_scatter(self):
    """Test ``self.gpucomm.reduce_scatter`` using the ``'sum'`` reduction.

    Each participant builds ``arange(k) + self.rank`` as input. ``texp`` is
    the expected element-wise sum over all participants, computed as
    ``self.size * arange(k) + sum(range(self.size))`` (presumably ranks run
    from 0 to ``self.size - 1`` -- verify), and ``exp`` is the chunk of it
    at the offset of ``self.rank``.

    The test first passes an explicit destination array, then lets the
    communicator allocate the result and checks its contiguity flag, and,
    when ``self.size != 1``, expects TypeError for inputs that cannot be
    split.

    NOTE(review): the return values of all ``np.reshape`` calls in this
    test are thrown away. ``np.reshape`` does not reshape in place, so
    ``cpu`` and ``exp`` remain one-dimensional and the "order c/f" comments
    describe layouts that are never created. Confirm whether assigning the
    result back was intended.
    """
    texp = self.size * np.arange(5 * self.size) + sum(range(self.size))
    exp = texp[self.rank * 5:self.rank * 5 + 5]

    # order c
    cpu = np.arange(5 * self.size) + self.rank
    # NOTE(review): no-op as written; the reshaped array is not kept.
    np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = gpuarray.empty((5,), dtype='int64', order='C', context=self.ctx)

    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # order f
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = gpuarray.empty((5,), dtype='int64', order='F', context=self.ctx)

    self.gpucomm.reduce_scatter(gpu, 'sum', resgpu)
    assert np.allclose(resgpu, exp)

    # make result order c (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (self.size, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    # Without a destination argument the result array is returned.
    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # c-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        np.reshape(cpu, (self.size + 1, 5), order='C')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order f (one less dim)
    cpu = np.arange(5 * self.size) + self.rank
    np.reshape(cpu, (5, self.size), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True

    # f-contiguous split problem (for size == 1, it can always be split)
    if self.size != 1:
        cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank
        np.reshape(cpu, (5, self.size + 1), order='F')
        gpu = gpuarray.asarray(cpu, context=self.ctx)
        with self.assertRaises(TypeError):
            resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')

    # make result order c (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    np.reshape(exp, (3, 5), order='C')
    cpu = np.arange(5 * self.size * 3) + self.rank
    np.reshape(cpu, (self.size * 3, 5), order='C')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['C_CONTIGUOUS'] is True

    # make result order f (same dim - less size)
    texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size))
    exp = texp[self.rank * 15:self.rank * 15 + 15]
    np.reshape(exp, (5, 3), order='F')
    cpu = np.arange(5 * self.size * 3) + self.rank
    np.reshape(cpu, (5, self.size * 3), order='F')
    gpu = gpuarray.asarray(cpu, context=self.ctx)

    resgpu = self.gpucomm.reduce_scatter(gpu, 'sum')
    check_all(resgpu, exp)
    assert resgpu.flags['F_CONTIGUOUS'] is True