def _assign(self, value):

    stream = self.backend.stream

    if isinstance(value, (int, float)):

        # if we have a contiguous array, then use the speedy driver kernel
        if self.is_contiguous:

            value = self.dtype.type(value)

            if self.dtype.itemsize == 1:
                drv.memset_d8_async(
                    self.gpudata, unpack_from('B', value)[0], self.size, stream)
            elif self.dtype.itemsize == 2:
                drv.memset_d16_async(
                    self.gpudata, unpack_from('H', value)[0], self.size, stream)
            else:
                drv.memset_d32_async(
                    self.gpudata, unpack_from('I', value)[0], self.size, stream)

        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):

        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod_async(
                self.gpudata, value.gpudata, self.nbytes, stream)
        else:
            OpTreeNode.build("assign", self, value)

    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)

    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value, device=None)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
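# Hedged sketch (not part of this module): how the contiguous scalar fast
# path in _assign behaves, using only the pycuda driver calls it relies on.
# Assumes pycuda and a CUDA device are available; the tensor machinery
# (backend, OpTreeNode, GPUTensor) is deliberately left out. The design
# point it illustrates: for a contiguous buffer, a scalar fill reduces to
# a single bit-pattern memset instead of launching a generic copy kernel.
import numpy as np
import pycuda.autoinit  # noqa: F401  creates a context on the default device
import pycuda.driver as drv
from struct import unpack_from

shape = (4, 4)
dtype = np.dtype('float32')
size = int(np.prod(shape))
gpudata = drv.mem_alloc(size * dtype.itemsize)
stream = drv.Stream()

# Reinterpret the scalar's bits as a 32-bit word, exactly as _assign does
# for 4-byte dtypes, then fill the whole buffer in one async driver call.
value = dtype.type(0.5)
drv.memset_d32_async(gpudata, unpack_from('I', value)[0], size, stream)

host = np.empty(shape, dtype)
drv.memcpy_dtoh_async(host, gpudata, stream)
stream.synchronize()
assert (host == 0.5).all()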
def execute(self, repeat=1, unbind=True):

    shuffle_kernel = _get_transpose_kernel(self.dtype_str)
    kernel = kernel_specs.get_kernel(self.kernel[0])

    for r in range(repeat):

        # zero the output first unless beta is set, in which case the
        # kernel's atomic adds accumulate on top of the existing values
        if not self.beta:
            drv.memset_d8_async(*self.zero_args)

        shuffle_kernel.prepared_async_call(*self.shuffle_args)
        kernel.prepared_async_call(*self.kernel[1:])

    if unbind:
        # release references to the bound device buffers
        self.zero_args = None
        self.shuffle_args[2:5] = (None,) * 3
        self.kernel[3:8] = (None,) * 5
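# Hedged sketch (illustrative, not from this module): the prepared-call
# pattern execute() uses, with a trivial fill kernel standing in for the
# real kernels from kernel_specs, which are not reproduced here. Preparing
# a kernel once and replaying a fully bound argument list is what lets
# execute() loop cheaply over `repeat` iterations.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void fill(float *out, float val, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = val;
}
""")
kernel = mod.get_function("fill")
kernel.prepare("Pfi")  # pointer, float, int: the kernel's argument layout

n = 1024
out = drv.mem_alloc(n * 4)
stream = drv.Stream()

# Like self.kernel[1:] above, a prepared async call is a fully bound
# argument list: grid, block, stream, then the kernel arguments themselves.
params = (((n + 127) // 128, 1), (128, 1, 1), stream,
          int(out), np.float32(3.0), np.int32(n))
kernel.prepared_async_call(*params)

host = np.empty(n, np.float32)
drv.memcpy_dtoh_async(host, out, stream)
stream.synchronize()
assert (host == 3.0).all()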