def _gpu_init(self):
    """Build the "fullbatch_loader" program and bind its kernel arguments.

    The preprocessor definitions start with this loader's own values
    (label flag, sample size, maximal minibatch size and the OpenCL names
    of the two data dtypes) and are then extended with whatever
    ``get_ocl_defines()`` returns. After the build, the
    "fill_minibatch_data_labels" kernel is assigned and receives its
    arguments; the two label buffers are included only when
    ``has_labels`` is true.
    """
    macros = {}
    macros["LABELS"] = int(self.has_labels)
    macros["SAMPLE_SIZE"] = self.original_data.sample_size
    macros["MAX_MINIBATCH_SIZE"] = self.max_minibatch_size
    macros["original_data_dtype"] = numpy_dtype_to_opencl(
        self.original_data.dtype)
    macros["minibatch_data_dtype"] = numpy_dtype_to_opencl(
        self.minibatch_data.dtype)
    macros.update(self.get_ocl_defines())

    self.build_program(macros, "fullbatch_loader",
                       dtype=self.minibatch_data.dtype)
    self.assign_kernel("fill_minibatch_data_labels")

    with_labels = bool(self.has_labels)
    # NOTE(review): device.skip(2) presumably leaves two kernel argument
    # slots untouched here - confirm against the kernel signature.
    kernel_args = [self.original_data, self.minibatch_data,
                   self.device.skip(2)]
    if with_labels:
        kernel_args.extend(
            (self._mapped_original_labels_, self.minibatch_labels))
    kernel_args.extend((self.shuffled_indices, self.minibatch_indices))
    self.set_args(*kernel_args)
def get_ocl_defines(self):
    """Return the target-related preprocessor definitions.

    Returns:
        dict with the "TARGET" flag set to 1, the sample size of
        ``original_targets`` and the OpenCL type names derived from the
        dtypes of ``original_targets`` and ``minibatch_targets``.
    """
    defines = {"TARGET": 1}
    defines["TARGET_SIZE"] = self.original_targets.sample_size
    defines["original_target_dtype"] = numpy_dtype_to_opencl(
        self.original_targets.dtype)
    defines["minibatch_target_dtype"] = numpy_dtype_to_opencl(
        self.minibatch_targets.dtype)
    return defines
def _gpu_init(self):
    """Compile the "fullbatch_loader" program and set the kernel arguments.

    Definitions passed to the build are the label flag, the sample size,
    the maximal minibatch size and the OpenCL type names of the original
    and minibatch data, updated with the result of ``get_ocl_defines()``.
    The "fill_minibatch_data_labels" kernel gets the label buffers only
    if ``has_labels`` is true.
    """
    labels_flag = int(self.has_labels)
    sample_size = self.original_data.sample_size
    minibatch_limit = self.max_minibatch_size
    source_type = numpy_dtype_to_opencl(self.original_data.dtype)
    batch_type = numpy_dtype_to_opencl(self.minibatch_data.dtype)

    defines = {
        "LABELS": labels_flag,
        "SAMPLE_SIZE": sample_size,
        "MAX_MINIBATCH_SIZE": minibatch_limit,
        "original_data_dtype": source_type,
        "minibatch_data_dtype": batch_type,
    }
    defines.update(self.get_ocl_defines())

    self.build_program(
        defines, "fullbatch_loader", dtype=self.minibatch_data.dtype)
    self.assign_kernel("fill_minibatch_data_labels")

    if self.has_labels:
        # Labelled variant: label buffers sit between the skipped slots
        # and the index buffers.
        self.set_args(
            self.original_data,
            self.minibatch_data,
            self.device.skip(2),
            self._mapped_original_labels_,
            self.minibatch_labels,
            self.shuffled_indices,
            self.minibatch_indices,
        )
        return

    self.set_args(
        self.original_data,
        self.minibatch_data,
        self.device.skip(2),
        self.shuffled_indices,
        self.minibatch_indices,
    )
def get_ocl_defines(self):
    """Describe the targets for the OpenCL program build.

    Returns:
        dict of preprocessor definitions: "TARGET" (always 1),
        "TARGET_SIZE" (sample size of ``original_targets``) and the
        OpenCL type names of the original and minibatch target dtypes.
    """
    target_size = self.original_targets.sample_size
    source_type = numpy_dtype_to_opencl(self.original_targets.dtype)
    batch_type = numpy_dtype_to_opencl(self.minibatch_targets.dtype)
    return {
        "TARGET": 1,
        "TARGET_SIZE": target_size,
        "original_target_dtype": source_type,
        "minibatch_target_dtype": batch_type,
    }
def _gpu_init(self):
    """Build the program named after this class and bind its kernel.

    The build receives the OpenCL type names of ``input`` and ``mean``
    plus the number of elements in ``mean`` as "SAMPLE_SIZE"; the dtype of
    ``rdisp`` is passed as the program dtype. The "normalize_mean_disp"
    kernel is then assigned with input, mean, rdisp and output as its
    arguments.
    """
    program_dtype = self.rdisp.dtype
    elements_per_sample = self.mean.size
    input_type = numpy_dtype_to_opencl(self.input.dtype)
    mean_type = numpy_dtype_to_opencl(self.mean.dtype)

    self.build_program(
        {
            "input_type": input_type,
            "mean_type": mean_type,
            "SAMPLE_SIZE": elements_per_sample,
        },
        self.__class__.__name__,
        dtype=program_dtype,
    )
    self.assign_kernel("normalize_mean_disp")
    self.set_args(self.input, self.mean, self.rdisp, self.output)
def _gpu_init(self):
    """Build the evaluator program and bind its kernels.

    When ``class_targets`` is truthy, an extra "mse_find_closest" source
    is registered with the OpenCL type name of the class targets. The
    "evaluate_mse" kernel is always assigned; the "mse_find_closest"
    kernel is fetched and given its arguments only when both ``labels``
    and ``class_targets`` are truthy.

    Returns:
        The block size used for the build: the first dimension of
        ``err_output`` capped at 128.
    """
    dtype = self.output.dtype
    block_size = min(self.err_output.shape[0], 128)

    if self.class_targets:
        target_type = numpy_dtype_to_opencl(self.class_targets.dtype)
        self.sources_["mse_find_closest"] = {"target_dtype": target_type}

    build_options = {
        "cache_file_name": "%s_%d_%d" % (
            self.__class__.__name__, self.output.shape[0],
            self.output.sample_size),
        "dtype": dtype,
        "max_batch_size": self.err_output.shape[0],
        "block_size": block_size,
        "output_size": self.err_output.sample_size,
        "root": self.root,
        "normalization": self.normalizer.MAPPING,
        "targets_number": (
            self.class_targets.shape[0] if self.class_targets else None),
        "coeffs": self.normalizer.coefficients,
    }
    self.build_program(**build_options)

    self.assign_kernel("evaluate_mse")
    # NOTE(review): skip_args(2) presumably leaves two argument slots to
    # be filled elsewhere - confirm against the kernel signature.
    self.set_args(self.output, self.target, self.skip_args(2),
                  self.metrics, self.mse.devmem, self.err_output)

    if not (self.labels and self.class_targets):
        return block_size

    assert self.labels.dtype == self.n_err.dtype == numpy.int32
    self.krn_find_closest_ = self.get_kernel("mse_find_closest")
    self.krn_find_closest_.set_args(
        self.output.devmem, self.class_targets.devmem,
        self.labels.devmem, self.n_err.devmem)
    return block_size
def _gpu_init(self):
    """Compile the MSE evaluation program and prepare its kernels.

    Registers the "mse_find_closest" source when ``class_targets`` is
    truthy, builds the program with batch, block and normalization
    parameters, assigns the "evaluate_mse" kernel and, if both ``labels``
    and ``class_targets`` are truthy, also sets up the "mse_find_closest"
    kernel.

    Returns:
        min(first dimension of ``err_output``, 128), which is the block
        size handed to the build.
    """
    dtype = self.output.dtype
    block_size = min(self.err_output.shape[0], 128)

    if self.class_targets:
        self.sources_["mse_find_closest"] = {
            "target_dtype": numpy_dtype_to_opencl(self.class_targets.dtype),
        }

    cache_name = "%s_%d_%d" % (
        self.__class__.__name__,
        self.output.shape[0],
        self.output.sample_size,
    )
    self.build_program(
        cache_file_name=cache_name,
        dtype=dtype,
        max_batch_size=self.err_output.shape[0],
        block_size=block_size,
        output_size=self.err_output.sample_size,
        root=self.root,
        normalization=self.normalizer.MAPPING,
        targets_number=(
            self.class_targets.shape[0] if self.class_targets else None),
        coeffs=self.normalizer.coefficients,
    )

    self.assign_kernel("evaluate_mse")
    evaluate_args = (
        self.output,
        self.target,
        self.skip_args(2),
        self.metrics,
        self.mse.devmem,
        self.err_output,
    )
    self.set_args(*evaluate_args)

    if self.labels:
        if self.class_targets:
            # The closest-target kernel is only set up for int32 labels
            # and error counters.
            assert self.labels.dtype == self.n_err.dtype == numpy.int32
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem,
                self.class_targets.devmem,
                self.labels.devmem,
                self.n_err.devmem,
            )
    return block_size
def get_kernel_bs_vo(self, **kwargs):
    """Gets optimal block size and vector_opt flag for matrix multiplication.

    Parameters:
        dtype: numeric data type as string (float or double); any other
               object is converted with
               opencl_types.numpy_dtype_to_opencl().
        kernel: hint for the name of the kernel for which the optimal
                block sizes will be returned:
                conv: convolutional forward propagation,
                deconv: convolutional back propagation,
                all other: simple matrix multiplication.
        precision: precision level for summation (0, 1, 2)
                   (defaults to root.common.engine.precision_level).

    Returns:
        BLOCK_SIZE, VECTOR_OPT

    When the kernel, the dtype or every precision level from the
    requested one down to 0 is missing in self.device_info, a warning is
    logged and (8, False) is returned.
    """
    default_bs = 8

    dtype = kwargs["dtype"]
    # isinstance() also accepts str subclasses, which must not be fed to
    # the numpy -> OpenCL converter.
    if not isinstance(dtype, str):
        dtype = opencl_types.numpy_dtype_to_opencl(dtype)

    krnnme = kwargs.get("kernel", "matrix_multiplication")

    # Read the global default only when the caller gave no precision.
    if "precision" in kwargs:
        precision = kwargs["precision"]
    else:
        precision = root.common.engine.precision_level

    krninfo = self.device_info.get(krnnme)
    if krninfo is None:
        # Benchmark for other kernel types is not implemented,
        # so only debug level here
        self.debug(
            "Kernel \"%s\" was not found, "
            "rolling back to block size for matrix_multiplication",
            krnnme)
        krnnme = "matrix_multiplication"
        krninfo = self.device_info.get(krnnme)
    if krninfo is None:
        self.warning(
            "krnnme = %s was not found, "
            "will use block size %d", krnnme, default_bs)
        return default_bs, False

    typeinfo = krninfo.get(dtype)
    if typeinfo is None:
        self.warning(
            "dtype = %s was not found with krnnme = %s, "
            "will use block size %d", dtype, krnnme, default_bs)
        return default_bs, False

    # Fall back to lower precision levels until an entry is found.
    bs_dt = typeinfo.get(str(precision))
    while bs_dt is None and precision > 0:
        precision -= 1
        bs_dt = typeinfo.get(str(precision))
    if bs_dt is None:
        self.warning(
            "precision = 0 was not found with krnnme = %s and dtype = %s, "
            "will use block size %d", krnnme, dtype, default_bs)
        return default_bs, False
    return bs_dt[0], bs_dt[1]
def _gpu_init(self):
    """Build the program for the current output shape and bind "join".

    The cache name is made of the class name, the first output dimension
    and the remaining output dimensions joined with underscores. The
    OpenCL type name of the output dtype is passed as the 'etype'
    definition and ``inputs`` is forwarded to the build. The "join"
    kernel then receives the output followed by every input.
    """
    element_type = opencl_types.numpy_dtype_to_opencl(self.output.dtype)
    shape = self.output.shape
    trailing_dims = "_".join(str(dim) for dim in shape[1:])
    cache_name = "%s_%d_%s" % (type(self).__name__, shape[0], trailing_dims)

    self.build_program({'etype': element_type}, cache_name,
                       inputs=self.inputs)
    self.assign_kernel("join")
    self.set_args(self.output, *self.inputs)
def build_program(self, defines=None, cache_file_name=None, dtype=None,
                  **kwargs):
    """Normalize the build parameters and delegate to the backend build.

    Parameters:
        defines: passed through to the backend unchanged.
        cache_file_name: name for the build cache; ``self.name`` is used
                         when it is None.
        dtype: type name as a string; None selects
               root.common.engine.precision_type, any other non-string
               is converted with opencl_types.numpy_dtype_to_opencl().
        kwargs: extra options, handed to the backend as a single dict.

    Returns:
        Whatever ``_backend_build_program_`` returns.

    Raises:
        ValueError: if the resulting cache file name is not a string.
    """
    cache_name = self.name if cache_file_name is None else cache_file_name
    if not isinstance(cache_name, str):
        raise ValueError("cache_file_name must be a string")

    if dtype is None:
        type_name = root.common.engine.precision_type
    elif isinstance(dtype, str):
        type_name = dtype
    else:
        type_name = opencl_types.numpy_dtype_to_opencl(dtype)

    return self._backend_build_program_(
        defines, cache_name, type_name, kwargs)
def ocl_init(self):
    """Initialize the buffers, build the program and configure its kernels.

    Steps performed, in order: every array used here is initialized on
    ``self.device``; the chunk size and ``argmin_group_size`` are derived
    from the neuron count and the device's maximal group size; the
    program is built with those values as definitions; four kernels
    ("calculate_distances", "calculate_argmin", "compute_gravity",
    "apply_gradient") are fetched and given their buffer arguments; the
    global and local sizes for the distance kernel are stored.
    """
    for name in ("input", "weights", "winners", "argmins",
                 "_distances", "_coords"):
        getattr(self, name).initialize(self.device)

    batch_size = self.input.mem.shape[0]
    neurons = self._neurons_number
    chunk_size = neurons // self.device.max_group_size
    if chunk_size < 2:
        chunk_size = neurons // 2 + 1
    self.argmin_group_size = int(numpy.ceil(float(neurons) / chunk_size))

    block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
        kernel="matrix_multiplication", dtype=self.input.dtype)

    coord_type = "%s%d" % (
        opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
        self._coords.mem.shape[-1])
    defines = {
        'BLOCK_SIZE': block_size,
        'VECTOR_OPT': int(bool(vector_opt)),
        'BATCH': batch_size,
        'SAMPLE_LENGTH': self._sample_length,
        'NEURONS_NUMBER': self._neurons_number,
        'CHUNK_SIZE': chunk_size,
        'GRADIENT_CHUNK_SIZE': self.device.max_group_size,
        'coord_type': coord_type,
    }
    if self.weights_transposed:
        defines['WEIGHTS_TRANSPOSED'] = 1

    cache_name = "%s_%d_%d_%d" % (
        self.__class__.__name__, batch_size,
        self._sample_length, self._neurons_number)
    self.build_program(defines, cache_name, dtype=self.weights.mem.dtype)

    self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

    input_mem = self.input.devmem
    weights_mem = self.weights.devmem
    distances_mem = self._distances.devmem
    argmins_mem = self.argmins.devmem

    self._krn_distances_ = self.get_kernel("calculate_distances")
    self._krn_distances_.set_args(input_mem, weights_mem, distances_mem)

    self._krn_argmin_ = self.get_kernel("calculate_argmin")
    self._krn_argmin_.set_args(
        distances_mem, argmins_mem, self.winners.devmem)

    # NOTE(review): argument 2 of the next two kernels is not set in this
    # method - presumably it is supplied before each run; confirm.
    self._krn_gravity_ = self.get_kernel("compute_gravity")
    self._krn_gravity_.set_args(argmins_mem, self._coords.devmem)
    self._krn_gravity_.set_arg(3, distances_mem)

    self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
    self._krn_apply_gradient_.set_args(input_mem, distances_mem)
    self._krn_apply_gradient_.set_arg(3, weights_mem)

    self._gs_distance = [
        roundup(self._neurons_number, block_size),
        roundup(batch_size, block_size),
    ]
    self._ls_distance = [block_size, block_size]
def ocl_init(self):
    """Prepare device buffers, the compiled program and all kernels.

    The arrays are initialized on the device first. The neuron count and
    the device's maximal group size give the chunk size (recomputed as
    ``neurons // 2 + 1`` when the quotient is below 2) and
    ``argmin_group_size``. After the build, the "calculate_distances",
    "calculate_argmin", "compute_gravity" and "apply_gradient" kernels
    get their buffer arguments, and the work sizes of the distance kernel
    are rounded up to multiples of the block size.
    """
    device = self.device
    self.input.initialize(device)
    self.weights.initialize(device)
    self.winners.initialize(device)
    self.argmins.initialize(device)
    self._distances.initialize(device)
    self._coords.initialize(device)

    batch_size = self.input.mem.shape[0]

    chunk_size = self._neurons_number // device.max_group_size
    chunk_size = (
        chunk_size if chunk_size >= 2 else self._neurons_number // 2 + 1)
    groups = numpy.ceil(float(self._neurons_number) / chunk_size)
    self.argmin_group_size = int(groups)

    block_size, vector_opt = device.device_info.get_kernel_bs_vo(
        kernel="matrix_multiplication", dtype=self.input.dtype)

    defines = {
        'BLOCK_SIZE': block_size,
        'VECTOR_OPT': int(bool(vector_opt)),
        'BATCH': batch_size,
        'SAMPLE_LENGTH': self._sample_length,
        'NEURONS_NUMBER': self._neurons_number,
        'CHUNK_SIZE': chunk_size,
        'GRADIENT_CHUNK_SIZE': device.max_group_size,
        'coord_type': "%s%d" % (
            opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
            self._coords.mem.shape[-1]),
    }
    if self.weights_transposed:
        defines['WEIGHTS_TRANSPOSED'] = 1

    self.build_program(
        defines,
        "%s_%d_%d_%d" % (self.__class__.__name__, batch_size,
                         self._sample_length, self._neurons_number),
        dtype=self.weights.mem.dtype)

    self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

    # Distances between the input samples and the weights.
    self._krn_distances_ = distances_krn = self.get_kernel(
        "calculate_distances")
    distances_krn.set_args(
        self.input.devmem, self.weights.devmem, self._distances.devmem)

    # Argmin over the distances.
    self._krn_argmin_ = argmin_krn = self.get_kernel("calculate_argmin")
    argmin_krn.set_args(
        self._distances.devmem, self.argmins.devmem, self.winners.devmem)

    # Gravity: arguments 0, 1 and 3 are bound here.
    # NOTE(review): argument 2 is presumably set elsewhere - confirm.
    self._krn_gravity_ = gravity_krn = self.get_kernel("compute_gravity")
    gravity_krn.set_args(self.argmins.devmem, self._coords.devmem)
    gravity_krn.set_arg(3, self._distances.devmem)

    # Gradient application: arguments 0, 1 and 3 are bound here.
    # NOTE(review): argument 2 is presumably set elsewhere - confirm.
    self._krn_apply_gradient_ = gradient_krn = self.get_kernel(
        "apply_gradient")
    gradient_krn.set_args(self.input.devmem, self._distances.devmem)
    gradient_krn.set_arg(3, self.weights.devmem)

    self._gs_distance = [
        roundup(extent, block_size)
        for extent in (self._neurons_number, batch_size)
    ]
    self._ls_distance = [block_size] * 2