def execute(self, repeat=1, unbind=True):
    """Launch every kernel listed in ``self.kernels``, ``repeat`` times over.

    On each pass, ``drv.memset_d32_async(*self.bsum_zero)`` is issued first
    when ``self.bsum_zero`` is truthy. Every entry of ``self.kernels`` is then
    resolved through ``kernel_specs.get_kernel`` using its first element and
    launched asynchronously with its remaining elements, passing
    ``shared_size=self.shared``.

    Args:
        repeat: Number of launch passes to issue.
        unbind: When true, ``self.bsum_zero`` is set to ``None`` and slots
            3 through 10 of every kernel parameter list are overwritten with
            ``None`` once all passes have been issued.
    """
    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)

        for params in self.kernels:
            launcher = kernel_specs.get_kernel(params[0])
            launcher.prepared_async_call(*params[1:], shared_size=self.shared)

    if not unbind:
        return

    # NOTE(review): slots 3..10 presumably hold the values attached by a
    # separate bind step -- confirm against the code that fills them in.
    self.bsum_zero = None
    blanks = (None,) * 8
    for params in self.kernels:
        params[3:11] = blanks
def execute(self, repeat=1, unbind=True):
    """Issue the kernels in ``self.kernels`` for ``repeat`` passes.

    Each pass performs, in order:

    1. ``drv.memset_d32_async(*self.zero_args)`` when ``self.determ`` is
       falsy;
    2. an asynchronous launch of every entry in ``self.kernels`` (kernel
       looked up from the entry's first element, called with the rest);
    3. ``_fp_convert(*self.convert_args)`` when ``self.convert_args`` is
       truthy.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, ``self.zero_args`` and ``self.convert_args`` are
            reset to ``None`` and slots 3 through 7 of every kernel
            parameter list are overwritten with ``None`` afterwards.
    """
    for _ in range(repeat):
        if not self.determ:
            drv.memset_d32_async(*self.zero_args)

        for params in self.kernels:
            launcher = kernel_specs.get_kernel(params[0])
            launcher.prepared_async_call(*params[1:])

        if self.convert_args:
            _fp_convert(*self.convert_args)

    if not unbind:
        return

    # NOTE(review): slots 3..7 presumably carry the values attached by a
    # separate bind step -- confirm against the code that fills them in.
    self.zero_args = None
    self.convert_args = None
    for params in self.kernels:
        params[3:8] = (None,) * 5
def execute(self, repeat=1, unbind=True):
    """Run the configured kernel sequence ``repeat`` times.

    A single pass calls ``drv.memset_d32_async`` with ``self.zero_args``
    unless ``self.determ`` is truthy, launches each entry of
    ``self.kernels`` asynchronously (the entry's first element selects the
    kernel via ``kernel_specs.get_kernel``; the remaining elements are the
    call arguments), and finally calls ``_fp_convert`` with
    ``self.convert_args`` when that attribute is truthy.

    Args:
        repeat: How many passes to issue.
        unbind: If true, clear ``self.zero_args`` and ``self.convert_args``
            and blank out slots 3 through 7 of each kernel parameter list
            after the final pass.
    """
    passes_left = repeat
    while passes_left > 0:
        passes_left -= 1

        if not self.determ:
            drv.memset_d32_async(*self.zero_args)

        for entry in self.kernels:
            kernel_name = entry[0]
            kernel_specs.get_kernel(kernel_name).prepared_async_call(*entry[1:])

        if self.convert_args:
            _fp_convert(*self.convert_args)

    if unbind:
        self.zero_args = None
        self.convert_args = None
        # NOTE(review): which values live in slots 3..7 is not visible
        # here -- presumably whatever a separate bind step attached.
        for entry in self.kernels:
            entry[3:8] = (None,) * 5
def execute(self, repeat=1, unbind=True):
    """Launch the transpose kernel followed by the main kernel, ``repeat`` times.

    Both kernels are resolved once up front: the transpose kernel from
    ``_get_transpose_kernel(self.dtype_str)`` and the main kernel from
    ``kernel_specs.get_kernel(self.kernel[0])``. Each pass optionally calls
    ``drv.memset_d8_async(*self.zero_args)`` (only when ``self.beta`` is
    falsy), then launches the transpose kernel with ``self.shuffle_args``
    and the main kernel with ``self.kernel[1:]``.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, ``self.zero_args`` is set to ``None``, slots 2
            through 4 of ``self.shuffle_args`` and slots 3 through 7 of
            ``self.kernel`` are overwritten with ``None`` afterwards.
    """
    transpose = _get_transpose_kernel(self.dtype_str)
    main_kernel = kernel_specs.get_kernel(self.kernel[0])

    for _ in range(repeat):
        # let atomic adds accumulate on top
        # NOTE(review): presumably a non-zero beta means the existing
        # contents must be kept, hence no memset -- confirm.
        if not self.beta:
            drv.memset_d8_async(*self.zero_args)

        transpose.prepared_async_call(*self.shuffle_args)
        main_kernel.prepared_async_call(*self.kernel[1:])

    if not unbind:
        return

    self.zero_args = None
    self.shuffle_args[2:5] = (None,) * 3
    self.kernel[3:8] = (None,) * 5
def execute(self, repeat=1, unbind=True):
    """Issue ``repeat`` passes of the transpose kernel and the main kernel.

    The transpose kernel comes from ``_get_transpose_kernel(self.dtype_str)``
    and the main kernel from ``kernel_specs.get_kernel`` applied to the first
    element of ``self.kernel``; both lookups happen before the first pass.
    Within a pass, ``drv.memset_d8_async`` is called with ``self.zero_args``
    when ``self.beta`` is falsy, after which the transpose kernel is launched
    with ``self.shuffle_args`` and the main kernel with the remaining
    elements of ``self.kernel``.

    Args:
        repeat: How many passes to issue.
        unbind: If true, reset ``self.zero_args`` to ``None`` and blank out
            ``self.shuffle_args[2:5]`` and ``self.kernel[3:8]`` after the
            last pass.
    """
    transpose_fn = _get_transpose_kernel(self.dtype_str)
    compute_fn = kernel_specs.get_kernel(self.kernel[0])

    pending = repeat
    while pending > 0:
        pending -= 1

        # let atomic adds accumulate on top
        needs_zeroing = not self.beta
        if needs_zeroing:
            drv.memset_d8_async(*self.zero_args)

        transpose_fn.prepared_async_call(*self.shuffle_args)
        compute_fn.prepared_async_call(*self.kernel[1:])

    if unbind:
        self.zero_args = None
        # NOTE(review): what these slots hold is not visible here --
        # presumably values attached by a separate bind step.
        self.shuffle_args[2:5] = (None,) * 3
        self.kernel[3:8] = (None,) * 5
def execute(self, repeat=1, unbind=True):
    """Launch the shuffle kernel and then every kernel in ``self.kernels``.

    The shuffle kernel is obtained once from
    ``_get_shuffle_kernel(self.dtype_str)``. Each of the ``repeat`` passes
    calls ``drv.memset_d32_async(*self.bsum_zero)`` when ``self.bsum_zero``
    is truthy, launches the shuffle kernel with ``self.shuffle_args``, and
    then launches each entry of ``self.kernels`` asynchronously (kernel
    selected by the entry's first element, called with the remaining
    elements and ``shared_size=self.shared``).

    Args:
        repeat: Number of passes to issue.
        unbind: When true, ``self.bsum_zero`` is set to ``None``, slots 2
            through 4 of ``self.shuffle_args`` and slots 3 through 10 of
            every kernel parameter list are overwritten with ``None``
            once all passes have been issued.
    """
    shuffler = _get_shuffle_kernel(self.dtype_str)

    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)

        shuffler.prepared_async_call(*self.shuffle_args)

        for params in self.kernels:
            launcher = kernel_specs.get_kernel(params[0])
            launcher.prepared_async_call(*params[1:], shared_size=self.shared)

    if not unbind:
        return

    # NOTE(review): the cleared slots presumably hold values attached by a
    # separate bind step -- confirm against the code that fills them in.
    self.bsum_zero = None
    self.shuffle_args[2:5] = (None,) * 3
    for params in self.kernels:
        params[3:11] = (None,) * 8