def __call__(self, *args):
    """Run the wrapped Python function elementwise over ``args``.

    A type signature string is assembled from ``_get_input_type`` of every
    argument and used as the key into ``self._kernel_cache``.  On a cache
    miss the Python function is translated to device code, wrapped in an
    ``ElementwiseKernel`` and stored under that signature; the kernel is
    then invoked with the original arguments.

    Args:
        *args: Positional inputs forwarded unchanged to the kernel.

    Returns:
        Whatever the compiled elementwise kernel returns for ``args``.

    Raises:
        NotImplementedError: If ``self.otypes`` is not ``None`` when a new
            kernel has to be built (explicit output types are not
            supported yet).
    """
    signature = ''.join([_get_input_type(arg) for arg in args])

    # Fast path: a kernel for this signature has already been built.
    kernel = self._kernel_cache.get(signature)
    if kernel is not None:
        return kernel(*args)

    # Each character of the signature becomes one scalar input type.
    scalar_types = [_cuda_types.Scalar(code) for code in signature]
    if self.otypes is not None:
        # TODO(asi1024): Implement
        raise NotImplementedError
    return_type = None

    device_func = _interface._CudaFunction(
        self.pyfunc, 'numpy', device=True)
    emitted = device_func._emit_code_from_types(scalar_types, return_type)

    # Inputs are declared as ``<dtype> in<i>`` and passed as ``in<i>``.
    declarations = []
    call_args = []
    for index, scalar in enumerate(scalar_types):
        declarations.append(f'{scalar.dtype} in{index}')
        call_args.append(f'in{index}')
    in_params = ', '.join(declarations)
    in_args = ', '.join(call_args)

    out_params, out_lval = self._parse_out_param(emitted.return_type)
    operation = f'{out_lval} = {device_func.name}({in_args})'

    # NOTE: there is no need to worry about -D not working on ROCm here:
    # all headers are unrolled for HIP, so thrust::tuple et al. are
    # defined regardless of whether CUPY_JIT_MODE is defined or not.
    kernel = _core.ElementwiseKernel(
        in_params,
        out_params,
        operation,
        preamble=emitted.code,
        options=('-DCUPY_JIT_MODE', ),
    )
    self._kernel_cache[signature] = kernel
    return kernel(*args)
def __call__(self, *args):
    """Run the wrapped Python function elementwise over ``args``.

    The arguments' type codes (from ``_get_input_type``) are concatenated
    into a signature string that keys ``self._kernel_cache``.  When no
    kernel exists for the signature, the Python function is translated to
    device code and wrapped in an ``ElementwiseKernel`` with a single
    output named ``out0``; the kernel is cached and then called.

    Args:
        *args: Positional inputs forwarded unchanged to the kernel.

    Returns:
        Whatever the compiled elementwise kernel returns for ``args``.

    Raises:
        NotImplementedError: If ``self.otypes`` is not ``None`` when a new
            kernel has to be built (explicit output types are not
            supported yet).
    """
    signature = ''.join([_get_input_type(arg) for arg in args])

    # Fast path: a kernel for this signature has already been built.
    kernel = self._kernel_cache.get(signature)
    if kernel is not None:
        return kernel(*args)

    # Each character of the signature becomes one scalar input type.
    scalar_types = [_types.Scalar(code) for code in signature]
    if self.otypes is not None:
        # TODO(asi1024): Implement
        raise NotImplementedError
    return_type = None

    device_func = _interface._CudaFunction(
        self.pyfunc, 'numpy', device=True)
    emitted = device_func._emit_code_from_types(scalar_types, return_type)

    # Inputs are declared as ``<dtype> in<i>``; the single output is out0.
    in_params = ', '.join(
        f'{scalar.dtype} in{index}'
        for index, scalar in enumerate(scalar_types))
    out_params = str(emitted.return_type.dtype) + ' out0'

    call_args = ', '.join(f'in{index}' for index, _ in enumerate(scalar_types))
    operation = f'out0 = {device_func.name}({call_args})'

    kernel = core.ElementwiseKernel(
        in_params, out_params, operation, preamble=emitted.code)
    self._kernel_cache[signature] = kernel
    return kernel(*args)