def __init__(self, workflow, **kwargs):
    """Constructor: allocates gradient holders and declares required links."""
    super(GradientsCalculator, self).__init__(workflow, **kwargs)
    # Output arrays for the computed gradients.
    self.vbias_grad = Array()
    self.hbias_grad = Array()
    self.weights_grad = Array()
    # Attributes that must be linked from other units before initialize().
    self.demand("hbias1", "vbias1", "hbias0", "vbias0", "weights0",
                "weights1")
def __init__(self, workflow, **kwargs):
    """Constructor: reads the cutter coefficients and offset from kwargs."""
    super(Cutter1D, self).__init__(workflow, **kwargs)
    self.output_offset = kwargs.get("output_offset", 0)
    self.alpha = kwargs.get("alpha")
    self.beta = kwargs.get("beta")
    self.output = Array()
    self.demand("alpha", "beta", "input")
def clone(self):
    """Copies attribute values from the real units into this shadow unit.

    For each (unit, attrs) pair registered in ``self.reals``, transfers
    every listed attribute:
      * immutable values are assigned directly;
      * mutable non-Array values are updated *in place* when a clone
        already exists (so references held elsewhere stay valid),
        otherwise deep-copied;
      * Array attributes are mirrored through ``self.vectors``, copying
        the backing memory only while the clone is still empty.
    """
    for unit, attrs in self.reals.items():
        for attr in attrs:
            value = getattr(unit, attr)
            if self.is_immutable(value):
                setattr(self, attr, value)
                continue
            if not isinstance(value, Array):
                cloned = getattr(self, attr, None)
                if cloned is None:
                    # First sight of this attribute: deep-copy it.
                    setattr(self, attr, deepcopy(value))
                    continue
                # Update the existing clone in place, by container kind.
                if isinstance(value, list):
                    del cloned[:]
                    cloned.extend(value)
                elif isinstance(value, (dict, set)):
                    cloned.clear()
                    cloned.update(value)
                elif isinstance(value, Bool):
                    cloned <<= value
                elif isinstance(value, numpy.ndarray):
                    cloned[:] = value
                else:
                    # Unknown mutable type: fall back to a fresh deep copy.
                    setattr(self, attr, deepcopy(value))
                continue
            vec = getattr(self, attr, None)
            if vec is None:
                # Register a fresh Array mirror for this real vector.
                vec = Array()
                self.vectors[value] = vec
                setattr(self, attr, vec)
            else:
                assert isinstance(vec, Array)
            # Copy the backing memory only if the clone is still empty.
            if not vec and value:
                vec.reset(value.mem.copy())
def __init__(self, workflow, **kwargs):
    """Constructor: configures the multi-histogram geometry."""
    super(MultiHistogram, self).__init__(workflow, **kwargs)
    self.hist_number = kwargs.get("hist_number", 16)
    self.n_bars = kwargs.get("n_bars", 25)
    self.limit = kwargs.get("limit", 64)
    self.value = Array()
    self.demand("input")
def __init__(self, workflow, **kwargs):
    """Constructor: defaults the view group and prepares the output array."""
    kwargs["view_group"] = kwargs.get("view_group", "WORKER")
    super(MeanDispNormalizer, self).__init__(workflow, **kwargs)
    self.output = Array()
    # Kernel launch geometry; set by the backend-specific init.
    self.global_size = None
    self.local_size = None
    self.demand("input", "mean", "rdisp")
def __init__(self, workflow, **kwargs):
    """Constructor: allocates result holders for the softmax evaluator."""
    super(EvaluatorSoftmax, self).__init__(workflow, **kwargs)
    self.compute_confusion_matrix = kwargs.get(
        "compute_confusion_matrix", True)
    self.n_err = Array()
    self.confusion_matrix = Array()
    self.max_err_output_sum = Array()
    self.demand("labels", "max_idx")
def __init__(self, workflow, **kwargs):
    """Constructor: configures the uniform random generator unit."""
    super(Uniform, self).__init__(workflow, **kwargs)
    self.num_states = kwargs.get("num_states", 256)
    self.states = Array()
    self.output_bytes = kwargs.get("output_bytes", 0)
    self.output = Array()
    self.prng = kwargs.get("prng", get())
    # Scalar constant handed to the kernel.
    self.cl_const = numpy.zeros(1, dtype=numpy.int32)
class Summator(AcceleratedUnit):
    """Sums two vectors pointwise: output = x + y.

    (The old docstring said "multiplies"; numpy_run uses numpy.add and
    the GPU kernel is "add_forward", so this unit adds.)
    """
    def __init__(self, workflow, **kwargs):
        super(Summator, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("x", "y")

    def initialize(self, device, **kwargs):
        super(Summator, self).initialize(device, **kwargs)
        # Allocate the output lazily; otherwise the shape must match x.
        if not self.output:
            self.output.reset(numpy.zeros_like(self.x.mem))
        else:
            assert self.output.shape == self.x.shape
        self.init_vectors(self.x, self.y, self.output)

    def init_unpickled(self):
        super(Summator, self).init_unpickled()
        self.sources_["summator"] = {}

    def _gpu_init(self):
        # Build a kernel specialized for this output size and dtype.
        self.build_program({"OUTPUT_SIZE": self.output.size},
                           "%s_%d" % (self.__class__.__name__,
                                      self.output.size),
                           dtype=self.x.dtype)
        self.assign_kernel("add_forward")
        self.set_args(self.x, self.y, self.output)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        # Exact integer ceiling division: the previous
        # numpy.ceil(size / block_size) silently floored under Python 2's
        # integer division and round-tripped through float needlessly.
        self._global_size = (
            (self.output.size + block_size - 1) // block_size, 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        self.unmap_vectors(self.x, self.y, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        self.x.map_read()
        self.y.map_read()
        self.output.map_invalidate()
        numpy.add(self.x.mem, self.y.mem, self.output.mem)
class Summator(AcceleratedUnit):
    """Sums two vectors pointwise: output = x + y.

    (The old docstring said "multiplies"; numpy_run uses numpy.add and
    the GPU kernel is "add_forward", so this unit adds.)
    """
    def __init__(self, workflow, **kwargs):
        super(Summator, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("x", "y")

    def initialize(self, device, **kwargs):
        super(Summator, self).initialize(device, **kwargs)
        # Allocate the output lazily; otherwise the shape must match x.
        if not self.output:
            self.output.reset(numpy.zeros_like(self.x.mem))
        else:
            assert self.output.shape == self.x.shape
        self.init_vectors(self.x, self.y, self.output)

    def init_unpickled(self):
        super(Summator, self).init_unpickled()
        self.sources_["summator"] = {}

    def _gpu_init(self):
        # Build a kernel specialized for this output size and dtype.
        self.build_program({"OUTPUT_SIZE": self.output.size},
                           "%s_%d" % (self.__class__.__name__,
                                      self.output.size),
                           dtype=self.x.dtype)
        self.assign_kernel("add_forward")
        self.set_args(self.x, self.y, self.output)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        # Exact integer ceiling division: the previous
        # numpy.ceil(size / block_size) silently floored under Python 2's
        # integer division and round-tripped through float needlessly.
        self._global_size = (
            (self.output.size + block_size - 1) // block_size, 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        self.unmap_vectors(self.x, self.y, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        self.x.map_read()
        self.y.map_read()
        self.output.map_invalidate()
        numpy.add(self.x.mem, self.y.mem, self.output.mem)
def __init__(self, workflow, **kwargs):
    """Constructor: allocates metric holders for the MSE evaluator."""
    super(EvaluatorMSE, self).__init__(workflow, **kwargs)
    self.root = kwargs.get("root", True)
    self.metrics = Array()
    self.mse = Array()
    self.n_err = Array()
    self.labels = None
    self.class_targets = None
    self.demand("target", "normalizer")
def __init__(self, workflow, **kwargs):
    """Constructor: configures the range accumulator.

    Fix: **kwargs is now forwarded to the parent constructor, so
    standard unit options such as ``name`` are no longer silently
    dropped (the other units in this file all forward them).
    """
    super(FixAccumulator, self).__init__(workflow, **kwargs)
    self.bars = kwargs.get("bars", 200)
    self.type = kwargs.get("type", "relu")
    self.input = None
    self.output = Array()
    self.reset_flag = Bool(True)
    self.n_bars = [0]
    self.max = 100
    self.min = 0
def __init__(self, workflow, **kwargs):
    """Constructor: sets up the deconvolution unit."""
    super(Deconv, self).__init__(workflow, **kwargs)
    self.unsafe_padding = kwargs.get("unsafe_padding", False)
    self.hits = Array()
    self.krn_clear_output_ = None
    self._global_size = None
    self._local_size = None
    # Deconvolution carries no bias: drop the inherited attribute.
    del self.bias
    self.demand("n_kernels", "kx", "ky", "padding", "sliding",
                "input", "weights", "output_shape_source")
def __init__(self, workflow, **kwargs):
    """Constructor: common state shared by all evaluators."""
    kwargs["view_group"] = kwargs.get("view_group", "EVALUATOR")
    super(EvaluatorBase, self).__init__(workflow, **kwargs)
    self.mean = kwargs.get("mean", True)
    self.err_output = Array()
    self._merged_output = Array()
    # Kernel constant buffers, created during backend init.
    self.krn_constants_i_ = None
    self.krn_constants_f_ = None
    self.demand("output", "batch_size")
    # Test mode additionally needs the dataset layout.
    if self.testing:
        self.demand("class_lengths", "offset")
def __init__(self, workflow, **kwargs):
    """Constructor: state for the Kohonen forward pass."""
    super(KohonenForward, self).__init__(workflow, **kwargs)
    self.demand("input", "weights")
    self.argmins = None
    self._distances = Array()
    self.output = Array()
    self._chunk_size_ = 0
    self.weights_transposed = False
    # Accumulating a running total is optional.
    self.total = Array() if kwargs.get("total", False) else None
    if self.total is not None:
        self.minibatch_offset = None
        self.minibatch_size = None
        self.batch_size = None
def __init__(self, workflow, **kwargs):
    """Constructor: common state for forward-propagation units."""
    kwargs["view_group"] = kwargs.get("view_group", "WORKER")
    super(Forward, self).__init__(workflow, **kwargs)
    self.weights_stddev = kwargs.get("weights_stddev")
    # The bias spread defaults to the weights spread.
    self.bias_stddev = kwargs.get("bias_stddev", self.weights_stddev)
    self.weights_filling = kwargs.get("weights_filling", "uniform")
    self.bias_filling = kwargs.get("bias_filling", "uniform")
    self.rand = kwargs.get("rand", prng.get())
    self.weights_transposed = kwargs.get("weights_transposed", False)
    self.include_bias = kwargs.get("include_bias", True)
    self.demand("input")
    self.output = Array(shallow_pickle=True)
    self.weights = Array()
    self.bias = Array()
    self.forward_mode = False
    self.exports = ["weights", "bias", "include_bias",
                    "weights_transposed"]
def __init__(self, workflow, **kwargs):
    """Constructor: table plotter reporting per-unit max/min values."""
    kwargs["name"] = kwargs.get("name", "Table")
    super(TableMaxMin, self).__init__(workflow, **kwargs)
    self.row_labels = ["max", "min"]
    self.col_labels = []
    self.y = []
    self.values = Array()
class FixAccumulator(Unit):
    """
    Range accumulator: histograms the input values over a fixed range.

    output[0] counts values below self.min, output[self.bars + 1] counts
    values above self.max, the buckets in between accumulate in-range
    values.
    """
    def __init__(self, workflow, **kwargs):
        # Fix: forward **kwargs so standard unit options (e.g. "name")
        # are not silently dropped.
        super(FixAccumulator, self).__init__(workflow, **kwargs)
        self.bars = kwargs.get("bars", 200)
        self.type = kwargs.get("type", "relu")
        self.input = None
        self.output = Array()
        self.reset_flag = Bool(True)
        self.n_bars = [0]
        self.max = 100
        self.min = 0

    def initialize(self, **kwargs):
        # bars in-range buckets plus the two out-of-range buckets.
        self.output.mem = numpy.zeros([self.bars + 2], dtype=numpy.int64)

    def run(self):
        if self.type == "relu":
            self.max = 10000
            self.min = 0
        elif self.type == "tanh":
            self.max = 1.7159
            self.min = -1.7159
        else:
            raise error.BadFormatError("Unsupported type %s" % self.type)
        d = self.max - self.min
        if not d:
            return
        self.output.map_write()
        self.input.map_read()
        d = (self.bars - 1) / d
        if self.reset_flag:
            self.output.mem[:] = 0
            self.n_bars[0] = self.bars + 2
        for y in self.input.mem.ravel():
            if y < self.min:
                self.output[0] += 1
                continue
            # Fix: values exactly equal to self.min previously fell
            # through to the overflow bucket; after the guard above any
            # remaining y is >= self.min, so only the upper bound needs
            # checking and y == self.min lands in the first bin.
            # NOTE(review): bin 0 is shared with the underflow bucket --
            # confirm the intended bucket layout.
            if y <= self.max:
                i = int(numpy.floor((y - self.min) * d))
                self.output[i] += 1
                continue
            self.output[self.bars + 1] += 1
class GDSummator(AcceleratedUnit):
    """Gradient descent for Summator.

    Backpropagates the error through the addition: err_output is copied
    verbatim into both err_x and err_y.
    (The previous docstring said "Multiplier"; the class name and the
    copy semantics below show it belongs to the summing unit.)
    """
    def __init__(self, workflow, **kwargs):
        super(GDSummator, self).__init__(workflow, **kwargs)
        self.err_x = Array()
        self.err_y = Array()
        self.demand("err_output")

    def initialize(self, device, **kwargs):
        super(GDSummator, self).initialize(device, **kwargs)
        # Allocate the error arrays lazily, otherwise assert the shapes.
        if not self.err_x:
            self.err_x.reset(numpy.zeros_like(self.err_output.mem))
        else:
            assert self.err_x.shape == self.err_output.shape
        if not self.err_y:
            self.err_y.reset(numpy.zeros_like(self.err_output.mem))
        else:
            assert self.err_y.shape == self.err_output.shape
        self.init_vectors(self.err_x, self.err_y, self.err_output)

    def cuda_init(self):
        pass  # nothing to init

    def ocl_init(self):
        pass  # nothing to init

    def numpy_init(self):
        pass  # nothing to init

    def cuda_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        self.err_x.devmem.from_device_async(self.err_output.devmem)
        self.err_y.devmem.from_device_async(self.err_output.devmem)

    def ocl_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        self.device.queue_.copy_buffer(self.err_output.devmem,
                                       self.err_x.devmem, 0, 0,
                                       self.err_output.nbytes,
                                       need_event=False)
        self.device.queue_.copy_buffer(self.err_output.devmem,
                                       self.err_y.devmem, 0, 0,
                                       self.err_output.nbytes,
                                       need_event=False)

    def numpy_run(self):
        self.err_output.map_read()
        self.err_x.map_invalidate()
        self.err_y.map_invalidate()
        self.err_x.mem[:] = self.err_output.mem[:]
        self.err_y.mem[:] = self.err_output.mem[:]
def __init__(self, workflow, **kwargs):
    """Constructor: state for Kohonen self-organizing map training."""
    super(KohonenTrainer, self).__init__(workflow, **kwargs)
    self._distances = Array()
    self.argmins = Array()
    self._coords = Array()
    self.weights = Array()
    self.winners = Array()
    self.weights_filling = kwargs.get("weights_filling", "uniform")
    self.weights_stddev = kwargs.get("weights_stddev", None)
    self.weights_transposed = kwargs.get("weights_transposed", False)
    self.time = 0
    self._sigma = 0
    # Decaying schedules for the learning rate and neighborhood radius.
    self.gradient_decay = kwargs.get(
        "gradient_decay", lambda t: 0.1 / (1.0 + t * 0.05))
    self.radius_decay = kwargs.get(
        "radius_decay", lambda t: 1.0 / (1.0 + t * 0.05))
    self.demand("input", "shape")
    self._shape = kwargs.get("shape")
def __init__(self, workflow, **kwargs):
    """Constructor: shared state for gradient-descent units.

    Reads learning-rate/regularization hyper-parameters from kwargs and
    allocates the gradient arrays.
    """
    kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
    super(GradientDescentBase, self).__init__(workflow, **kwargs)
    self.err_input = Array(shallow_pickle=True)
    self.ocl_set_const_args = True
    self.weights = None
    self.bias = None
    self.demand("input", "err_output")
    self.learning_rate = kwargs.get("learning_rate", 0.01)
    self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                         self.learning_rate)
    self.weights_decay = kwargs.get("weights_decay", 0.00005)
    self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
    self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
    self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
    self.gradient_moment = kwargs.get("gradient_moment", 0)
    self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                           self.gradient_moment)
    self.weights_transposed = kwargs.get("weights_transposed", False)
    self.need_err_input = kwargs.get("need_err_input", True)
    self.include_bias = kwargs.get("include_bias", True)
    self.factor_ortho = kwargs.get("factor_ortho", 0)
    self.col_sums = Array()  # for orthogonalization
    # Current gradient as it is without applying learning_rate etc.
    self.gradient_weights = Array()
    self.gradient_bias = Array()
    # Gradient with applied learning_rate etc.
    # optionally accumulated from the previous run
    self.accumulate_gradient = kwargs.get("accumulate_gradient", False)
    # When accumulate_gradient set to True:
    # 1. Calculate gd
    # 2. acc = acc_alpha * gd + acc_beta * acc
    # 3. gd = gd_alpha * acc + gd_beta * gd
    # 4. Apply moments to gd
    # 5. weights += gd if apply_gradient set to True
    self.acc_alpha = kwargs.get("acc_alpha", 0.0)
    self.acc_beta = kwargs.get("acc_beta", 0.0)
    self.gd_alpha = kwargs.get("gd_alpha", 0.0)
    self.gd_beta = kwargs.get("gd_beta", 1.0)
    self.accumulated_gradient_weights = Array()
    self.accumulated_gradient_bias = Array()
    # Gradient with accumulated moments
    self.gradient_weights_with_moment = Array()
    self.gradient_bias_with_moment = Array()
    # Sets to True when gradient changes
    self.gradient_changed = False
    # Gradient will be applied to weights immediately just after computing
    self.apply_gradient = kwargs.get("apply_gradient",
                                     not workflow.is_slave)
def __init__(self, workflow, **kwargs):
    """Constructor: prepares random matrices for the benchmark run."""
    super(DeviceBenchmark, self).__init__(workflow, **kwargs)
    self.precision = kwargs.get("dtype", root.common.engine.precision_type)
    self.dtype = opencl_types.dtypes[self.precision]
    self.size = kwargs.get("size", 1500)
    self.repeats = kwargs.get("repeats", 10)
    self._input_A_ = Array()
    self._input_B_ = Array()
    msize = self.size * self.size
    from veles.prng.random_generator import RandomGenerator
    rnd = RandomGenerator(None)

    def genmem():
        # Random values in [-0.5, 0.5) of the benchmark dtype.
        return rnd.rand(msize).astype(self.dtype) - 0.5

    self._input_A_.mem = genmem()
    self._input_B_.mem = genmem()
    self.block_size = kwargs.get("block_size")
    self.vector_opt = kwargs.get("vector_opt")
    self.precision_level = kwargs.get("precision_level",
                                      root.common.engine.precision_level)
    self.return_time = kwargs.get("return_time", False)
    self.dry_run_first = kwargs.get("dry_run_first", False)
def __init__(self, workflow, **kwargs):
    """Constructor: configures the ImageNet sample loader."""
    super(ImagenetLoader, self).__init__(workflow, **kwargs)
    self.mean = Array()
    self.rdisp = Array()
    self.file_samples = ""
    # Geometry of the stored and cropped samples.
    self.sx = kwargs.get("sx", 256)
    self.sy = kwargs.get("sy", 256)
    self.crop_size_sx = kwargs.get("crop_size_sx", 227)
    self.crop_size_sy = kwargs.get("crop_size_sy", 227)
    self.shuffle_limit = kwargs.get("shuffle_limit", 2000000000)
    # Paths to the preprocessed dataset files.
    self.original_labels_filename = kwargs.get(
        "original_labels_filename", None)
    self.count_samples_filename = kwargs.get(
        "count_samples_filename", None)
    self.matrixes_filename = kwargs.get("matrixes_filename", None)
    self.samples_filename = kwargs.get("samples_filename", None)
    self.has_mean_file = False
    self.do_mirror = False
    self.mirror = kwargs.get("mirror", False)
    self.channels = kwargs.get("channels", 3)
class GDSummator(AcceleratedUnit):
    """Gradient descent for Summator.

    Backpropagates the error through the addition: err_output is copied
    verbatim into both err_x and err_y.
    """
    def __init__(self, workflow, **kwargs):
        super(GDSummator, self).__init__(workflow, **kwargs)
        self.err_x = Array()
        self.err_y = Array()
        self.demand("err_output")

    def initialize(self, device, **kwargs):
        super(GDSummator, self).initialize(device, **kwargs)
        # Reallocate err_x/err_y when missing or when the batch dimension
        # changed; the per-sample shape must always match err_output.
        if self.err_x:
            assert self.err_x.shape[1:] == self.err_output.shape[1:]
        if not self.err_x or self.err_x.shape[0] != self.err_output.shape[0]:
            self.err_x.reset(numpy.zeros_like(self.err_output.mem))
        if self.err_y:
            assert self.err_y.shape[1:] == self.err_output.shape[1:]
        if not self.err_y or self.err_y.shape[0] != self.err_output.shape[0]:
            self.err_y.reset(numpy.zeros_like(self.err_output.mem))
        self.init_vectors(self.err_x, self.err_y, self.err_output)

    def cuda_init(self):
        pass  # nothing to init

    def ocl_init(self):
        pass  # nothing to init

    def numpy_init(self):
        pass  # nothing to init

    def cuda_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        self.err_x.devmem.from_device_async(self.err_output.devmem)
        self.err_y.devmem.from_device_async(self.err_output.devmem)

    def ocl_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        self.device.queue_.copy_buffer(
            self.err_output.devmem, self.err_x.devmem, 0, 0,
            self.err_output.nbytes, need_event=False)
        self.device.queue_.copy_buffer(
            self.err_output.devmem, self.err_y.devmem, 0, 0,
            self.err_output.nbytes, need_event=False)

    def numpy_run(self):
        self.err_output.map_read()
        self.err_x.map_invalidate()
        self.err_y.map_invalidate()
        self.err_x.mem[:] = self.err_output.mem[:]
        self.err_y.mem[:] = self.err_output.mem[:]
def __init__(self, workflow, **kwargs):
    """Constructor: shared state for gradient-descent units.

    Reads learning-rate/regularization hyper-parameters from kwargs and
    allocates the gradient arrays.
    """
    kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
    super(GradientDescentBase, self).__init__(workflow, **kwargs)
    self.err_input = Array(shallow_pickle=True)
    self.ocl_set_const_args = True
    self.weights = None
    self.bias = None
    self.demand("input", "err_output")
    self.learning_rate = kwargs.get("learning_rate", 0.01)
    self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                         self.learning_rate)
    self.weights_decay = kwargs.get("weights_decay", 0.00005)
    self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
    self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
    self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
    self.gradient_moment = kwargs.get("gradient_moment", 0)
    self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                           self.gradient_moment)
    self.weights_transposed = kwargs.get("weights_transposed", False)
    self.need_err_input = kwargs.get("need_err_input", True)
    self.include_bias = kwargs.get("include_bias", True)
    self.factor_ortho = kwargs.get("factor_ortho", 0)
    self.col_sums = Array()  # for orthogonalization
    # Current gradient as it is without applying learning_rate etc.
    self.gradient_weights = Array()
    self.gradient_bias = Array()
    # Gradient with applied learning_rate etc.
    # optionally accumulated from the previous run
    self.accumulated_gradient_weights = Array()
    self.accumulated_gradient_bias = Array()
    # Gradient with accumulated moments
    self.gradient_weights_with_moment = Array()
    self.gradient_bias_with_moment = Array()
    # Sets to True when gradient changes
    self.gradient_changed = False
    # Gradient will be applied to weights immediately just after computing
    self.apply_gradient = kwargs.get("apply_gradient",
                                     not workflow.is_slave)
    # Accumulates gradient from the previous run:
    # OP_NONE: do not allocate array at all
    # OP_STORE: stores gradient with an applied learning_rate etc.
    # OP_ADD: adds current gradient to the array
    # OP_FLUSH: applies accumulated gradient, then resets it to zero
    self.accumulate_gradient = kwargs.get("accumulate_gradient",
                                          self.OP_NONE)
def __init__(self, workflow, **kwargs):
    """Constructor: base configuration for ImageNet-style loaders."""
    super(ImagenetLoaderBase, self).__init__(workflow, **kwargs)
    self.mean = Array()
    self.rdisp = Array()
    self._file_samples_ = ""
    self.sx = kwargs.get("sx", 256)
    self.sy = kwargs.get("sy", 256)
    self.channels = kwargs.get("channels", 3)
    # Paths to the preprocessed dataset files.
    self.original_labels_filename = kwargs.get("original_labels_filename")
    self.count_samples_filename = kwargs.get("count_samples_filename")
    self.matrixes_filename = kwargs.get("matrixes_filename")
    self.samples_filename = kwargs.get("samples_filename")
    self.class_keys_path = kwargs.get("class_keys_path")
    # No cropping here: the final size equals the stored size.
    self.final_sy = self.sy
    self.final_sx = self.sx
    self._train_different_labels_ = defaultdict(int)
    self.class_keys = None
    if self.class_keys_path is not None:
        with open(self.class_keys_path, "r") as fin:
            self.class_keys = json.load(fin)
        self.info("Class keys was loaded: len %s" % len(self.class_keys))
def __init__(self, workflow, **kwargs):
    """Constructor: common configuration for image-based loaders."""
    super(ImageLoader, self).__init__(workflow, **kwargs)
    self.color_space = kwargs.get("color_space", "RGB")
    self._source_dtype = numpy.float32
    self._original_shape = tuple()
    self.class_keys = [[], [], []]
    self.verify_interface(IImageLoader)
    self.path_to_mean = kwargs.get("path_to_mean", None)
    self.add_sobel = kwargs.get("add_sobel", False)
    # True, False or "random".
    self.mirror = kwargs.get("mirror", False)
    self.scale = kwargs.get("scale", 1.0)
    self.scale_maintain_aspect_ratio = kwargs.get(
        "scale_maintain_aspect_ratio", True)
    self.rotations = kwargs.get("rotations", (0.0,))  # radians
    self.crop = kwargs.get("crop", None)
    self.crop_number = kwargs.get("crop_number", 1)
    self._background = None
    self.background_image = kwargs.get("background_image", None)
    self.background_color = kwargs.get("background_color",
                                       (0xff, 0x14, 0x93))
    self.smart_crop = kwargs.get("smart_crop", True)
    self.minibatch_label_values = Array()
def __init__(self, workflow, **kwargs):
    """Constructor: allocates result holders for the softmax evaluator."""
    super(EvaluatorSoftmax, self).__init__(workflow, **kwargs)
    self.compute_confusion_matrix = kwargs.get(
        "compute_confusion_matrix", True)
    self.max_err_output_sum = Array()
    self.confusion_matrix = Array()
    self.n_err = Array()
    self.demand("labels", "max_idx")
class MyOCL(IOpenCLUnit):
    """Minimal example OpenCL unit operating on two matrices.

    NOTE(review): __init__ does not call a parent constructor, and
    ``ocl_init`` is declared inside initialize() without a ``self``
    parameter, working as a closure over ``self`` and invoked
    immediately -- presumably demo/snippet code; confirm before reuse.
    """
    def __init__(self):
        # a: half-height matrix created eagerly; b: allocated via .mem.
        self.a = Array(zeros([kibi >> 1, kibi], dtype=float32))
        self.b = Array()
        self.b.mem = zeros([kibi, kibi], dtype=float32)

    def initialize(self, device, **kwargs):
        self.a.initialize(self)
        self.b.initialize(self)

        def ocl_init():
            # Bind the device buffers to the kernel arguments.
            self.krn_.set_arg(0, self.a.devmem)
            self.krn_.set_arg(1, self.b.devmem)

        ocl_init()

    def __call__(self, *args, **kwargs):
        self.a.unmap()
        self.b.unmap()
        self.execute_kernel(global_size, local_size, self.krn_)
        # Map the result back for host-side reading.
        a = self.a.ocl_map_read()
def __init__(self, workflow, **kwargs):
    """Constructor: configures the gradient-descent unit and its
    optional solver extensions ("fast", "adadelta", "adagrad")."""
    self._solvers = set()
    super(GradientDescent, self).__init__(workflow, **kwargs)
    s = kwargs.get("solvers", set())
    self.solvers = s
    self.reduce_size = self.REDUCE_SIZE
    # Kernel handles, assigned during backend initialization.
    self.krn_err_input_ = None
    self.krn_weights_ = None
    self.krn_err_output_ = None
    self.krn_bias_ = None
    self.krn_compute_col_sums_ = None
    self.krn_err_output_name = None
    self.demand("weights")
    if self.include_bias:
        self.demand("bias")
    # NOTE(review): this assignment is dead -- last_minibatch is
    # unconditionally reassigned from kwargs at the end of this method.
    self.last_minibatch = None
    self.variant_gradient = kwargs.get("variant_gradient", True)
    self.variant_moment_gradient = (
        kwargs.get("variant_moment_gradient", True))
    if "fast" in self.solvers:
        self.fast = FastGDObjects(kwargs.get("fast_learning_rate", 0.02),
                                  Array(), Array())
    if "adadelta" in self.solvers:
        self.adadelta = AdaDeltaGDObjects(
            kwargs.get("adadelta_momentum", 0.9),
            Array(), Array(), Array(), Array(),
            kwargs.get("adadelta_adom", 0.3),
            kwargs.get("adadelta_epsilon", 1e-8))
        self.adadelta_adom = self.adadelta.adom
    if "adagrad" in self.solvers:
        self.adagrad = AdaGradGDObjects(
            kwargs.get("adagrad_epsilon", 1e-8),
            Array(), Array())
    self.last_minibatch = kwargs.get("last_minibatch", False)
class MemCpy(AcceleratedUnit):
    """Copies the input array into the output array on any backend."""
    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(MemCpy, self).initialize(device, **kwargs)
        # (Re)allocate the output when missing or sized differently.
        if (self.output.mem is None or
                self.output.mem.size != self.input.mem.size):
            self.output.reset()
            self.output.mem = numpy.zeros(self.input.mem.shape,
                                          dtype=self.input.mem.dtype)
        self.input.initialize(self.device)
        self.output.initialize(self.device)

    def cuda_init(self):
        pass

    def ocl_init(self):
        pass

    def _gpu_run(self):
        # Ensure both buffers live on the device before the copy.
        self.input.unmap()
        self.output.unmap()

    def ocl_run(self):
        self._gpu_run()
        self.device.queue_.copy_buffer(self.input.devmem,
                                       self.output.devmem, 0, 0,
                                       self.input.nbytes)

    def cuda_run(self):
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)
def __init__(self, workflow, **kwargs):
    """Constructor: common configuration for image-based loaders."""
    super(ImageLoader, self).__init__(workflow, **kwargs)
    self.color_space = kwargs.get("color_space", "RGB")
    self._source_dtype = numpy.float32
    self._original_shape = tuple()
    self.class_keys = [[], [], []]
    self.verify_interface(IImageLoader)
    self.path_to_mean = kwargs.get("path_to_mean", None)
    self.add_sobel = kwargs.get("add_sobel", False)
    self.mirror = kwargs.get("mirror", False)  # True, False, "random"
    self.scale = kwargs.get("scale", 1.0)
    self.scale_maintain_aspect_ratio = kwargs.get(
        "scale_maintain_aspect_ratio", True)
    self.rotations = kwargs.get("rotations", (0.0,))  # radians
    self.crop = kwargs.get("crop", None)
    self.crop_number = kwargs.get("crop_number", 1)
    self._background = None
    self.background_image = kwargs.get("background_image", None)
    self.background_color = kwargs.get(
        "background_color", (0xff, 0x14, 0x93))
    self.smart_crop = kwargs.get("smart_crop", True)
    self.minibatch_label_values = Array()
class EvaluatorSoftmax(EvaluatorBase):
    MAPPING = "evaluator_softmax"
    LOSS = "softmax"

    """Evaluator for nn softmax output from the batch labels.

    Must be assigned before initialize():
        output
        labels
        batch_size
        max_idx

    Updates after run():
        err_output
        n_err
        confusion_matrix
        max_err_output_sum

    Creates within initialize():
        err_output
        n_err
        confusion_matrix
        max_err_output_sum

    Attributes:
        labels: labels for Batch.
        output: output of the network_common as Batch.
        err_output: backpropagation errors based on labels.
        batch_size: number of elements in output to evaluate.
        confusion_matrix: confusion matrix for the output.
        compute_confusion_matrix: compute confusion matrix or not.
        max_idx: indexes of element with maximum real value for each sample.
        max_err_output_sum: maximum of backpropagated error sum by sample.
    """

    def __init__(self, workflow, **kwargs):
        super(EvaluatorSoftmax, self).__init__(workflow, **kwargs)
        self.compute_confusion_matrix = kwargs.get("compute_confusion_matrix",
                                                   True)
        self.confusion_matrix = Array()
        self.n_err = Array()
        self.max_err_output_sum = Array()
        self.class_keys = None
        self.demand("labels", "max_idx")
        if self.testing:
            self.demand("labels_mapping")

    def initialize(self, device, **kwargs):
        super(EvaluatorSoftmax, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        self.sources_["evaluator"] = {}
        dtype = self.output.dtype
        # n_err holds two counters (see numpy_run: errors and total).
        if not self.n_err:
            self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        else:
            assert self.n_err.size == 2
        out_size = self.output.sample_size
        if self.compute_confusion_matrix:
            if not self.confusion_matrix:
                self.confusion_matrix.reset(
                    numpy.zeros([out_size, out_size], numpy.int32))
            else:
                assert self.confusion_matrix.size == out_size * out_size
        else:
            self.confusion_matrix.reset()
        if not self.max_err_output_sum:
            self.max_err_output_sum.reset(numpy.zeros(1, dtype))
        else:
            assert self.max_err_output_sum.size == 1
        self.init_vectors(self.confusion_matrix, self.n_err, self.max_idx,
                          self.labels, self.max_err_output_sum)

    def _gpu_init(self):
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 256)
        self.build_program(cache_file_name="%s_%d_%d" %
                           (self.__class__.__name__, self.output.shape[0],
                            self.output.sample_size),
                           dtype=dtype, block_size=block_size,
                           max_batch_size=self.err_output.shape[0],
                           output_size=self.err_output.sample_size)
        self.assign_kernel("evaluate_softmax")
        # Args 3 and 4 are skipped here; they are set per run in _gpu_run.
        self.set_args(self.output, self.max_idx, self.labels,
                      self.skip_args(2), self.n_err, self.confusion_matrix,
                      self.max_err_output_sum, self.err_output)
        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._global_size = [block_size]
        self._local_size = [block_size]

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._global_size = (1, 1, 1)
        self._local_size = (block_size, 1, 1)

    def _gpu_run(self):
        self.unmap_vectors(self.err_output, self.output, self.max_idx,
                           self.labels, self.n_err, self.confusion_matrix,
                           self.max_err_output_sum)
        # Per-run kernel constants: current batch size, error multiplier.
        self.krn_constants_i_[0] = self.batch_size
        self.set_arg(3, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(4, self.krn_constants_f_[0:1])
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        self.err_output.map_invalidate()
        for vec in self.output, self.max_idx, self.labels:
            vec.map_read()
        for vec in self.n_err, self.confusion_matrix, self.max_err_output_sum:
            vec.map_write()
        batch_size = self.batch_size
        labels = self.labels.mem
        confusion_matrix = self.confusion_matrix.mem

        n_ok = 0
        n_total = 0
        multiplier = 1.0 / batch_size if self.mean else 1.0
        for i in range(batch_size):  # loop by batch
            if labels[i] < 0:
                # A negative label marks a sample to skip.
                self.err_output.mem[i] = 0.0
                continue
            output = ravel(self.output[i])
            err_output = ravel(self.err_output[i])

            max_idx = self.max_idx[i]
            confusion_matrix[max_idx, labels[i]] += 1
            if max_idx == labels[i]:
                n_ok += 1
            n_total += 1

            # Compute softmax output error gradient
            err_output[:] = output[:]
            err_output[labels[i]] -= 1.0
            err_output *= multiplier
            if err_output.dtype in (numpy.complex64, numpy.complex128):
                self.max_err_output_sum[0] = max(
                    self.max_err_output_sum[0],
                    numpy.linalg.norm(err_output))
            else:
                self.max_err_output_sum[0] = max(
                    self.max_err_output_sum[0],
                    (numpy.fabs(err_output)).sum())
        # Set errors for excessive samples to zero
        if batch_size < self.err_output.mem.shape[0]:
            self.err_output.mem[batch_size:] = 0.0
        self.n_err[0] += batch_size - n_ok
        self.n_err[1] += n_total

    def get_metric_values(self):
        # In testing mode, map each merged output sample to the label
        # of its maximum component.
        if self.testing:
            output_labels = {}
            class_keys = getattr(self, "class_keys", None)
            for index, labels in enumerate(self.merged_output[:]):
                max_value = 0
                for label_index, value in enumerate(labels):
                    if value >= max_value:
                        max_value = value
                        max_index = label_index
                if class_keys is not None:
                    output_labels[self.class_keys[TEST]
                                  [index]] = self.labels_mapping[max_index]
                else:
                    output_labels[index] = self.labels_mapping[max_index]
            return {"Output": output_labels}
        return {}
def __init__(self, workflow, **kwargs):
    """Constructor: allocates max_idx and sets the reduction width."""
    super(All2AllSoftmax, self).__init__(workflow, **kwargs)
    self.reduce_size = 256
    self.max_idx = Array()
class All2AllSoftmax(All2All):
    """All2All with linear activation and softmax normalization.

    Must be assigned before initialize():

    Updates after run():
        max_idx

    Creates within initialize():
        max_idx

    Attributes:
        krn_sm_: kernel for softmax activation calculation.
        max_idx: indexes of element with maximum value for each sample.
    """
    __id__ = "420219fc-3e1a-45b1-87f8-aaa0c1540de4"

    MAPPING = {"softmax"}

    def __init__(self, workflow, **kwargs):
        super(All2AllSoftmax, self).__init__(workflow, **kwargs)
        self.max_idx = Array()
        self.reduce_size = 256

    def init_unpickled(self):
        super(All2AllSoftmax, self).init_unpickled()
        self.krn_sm_ = None
        self._force_gpu_apply_exp = False

    def initialize(self, device, **kwargs):
        # Clamp the reduction width to the sample size.
        self.reduce_size = min(self.reduce_size,
                               int(numpy.prod(self.output_sample_shape)))
        self.sources_["all2all/softmax"] = {
            "REDUCE_SIZE": self.reduce_size
        }
        retval = super(All2AllSoftmax, self).initialize(
            device=device, **kwargs)
        if retval:
            return retval
        if self.output.mem.size // self.output.mem.shape[0] <= 1:
            raise error.BadFormatError(
                "Output sample size should be greater than 1 for SoftMax.")
        if not self.max_idx:
            self.max_idx.reset(numpy.zeros(self.output.shape[0],
                                           dtype=numpy.int32))
        self.max_idx.initialize(self.device)
        return retval

    def numpy_apply_exp(self):
        self.output.map_write()
        self.max_idx.map_invalidate()
        out = self.output.mem
        out = reshape(out, (out.shape[0], out.size // out.shape[0]))
        for i, sample in enumerate(out):
            im = sample.argmax()
            self.max_idx[i] = im
            m = sample[im]
            # Subtract the maximum before exponentiating (stability).
            sample -= m
            numpy.exp(sample, sample)
            smm = sample.sum()
            sample /= smm

    def ocl_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        global_size = (self.output.shape[0] * self.reduce_size,)
        local_size = (self.reduce_size,)
        self.execute_kernel(global_size, local_size, self.krn_sm_)

    def cuda_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        global_size = (self.output.shape[0], 1, 1)
        local_size = (self.reduce_size, 1, 1)
        self.execute_kernel(global_size, local_size, self.krn_sm_)

    def numpy_run(self):
        """Forward propagation from batch on CPU only.
        """
        super(All2AllSoftmax, self).numpy_run()
        if not self._force_gpu_apply_exp:
            self.numpy_apply_exp()

    def ocl_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).ocl_run()
        self.ocl_apply_exp()

    def cuda_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).cuda_run()
        self.cuda_apply_exp()

    def ocl_init(self):
        super(All2AllSoftmax, self).ocl_init()
        self.krn_sm_ = self.get_kernel("apply_exp")
        self.krn_sm_.set_args(self.output.devmem, self.max_idx.devmem)

    def cuda_init(self):
        super(All2AllSoftmax, self).cuda_init()
        self.krn_sm_ = self.get_kernel("apply_exp")
        self.krn_sm_.set_args(self.output.devmem, self.max_idx.devmem)
def __init__(self, workflow, **kwargs):
    """Constructs the unit and declares its data dependencies."""
    super(OffsetPooling, self).__init__(workflow, **kwargs)
    # "input" must be linked by the workflow before initialize().
    self.demand("input")
    # Device-backed storage for the offsets of pooled elements.
    self.input_offset = Array()
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(OffsetPooling, self).initialize(device=device, **kwargs)
        if self._no_output:
            return
        if self.input_offset:
            assert self.input_offset.shape[1:] == self.output.shape[1:]
        # (Re)allocate when missing or when the batch dimension changed.
        if (not self.input_offset or
                self.input_offset.shape[0] != self.output.shape[0]):
            self.input_offset.reset(numpy.zeros(self.output.shape,
                                                dtype=numpy.int32))
        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        # Prepends the offset buffer to the base kernel argument list.
        super(OffsetPooling, self).set_args(self.input, self.output,
                                            self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        """Selects the winning element of one pooling window and records
        its flat index in the input into input_offset.
        """
        batch, y1, x1, ch, out_y, out_x = coords
        cut_index = self.numpy_run_cut_offset(
            cut, numpy.ravel_multi_index((batch, out_y, out_x, ch),
                                         self.output.shape))
        i, j = numpy.unravel_index(cut_index, cut.shape)
        idx = numpy.ravel_multi_index((batch, y1 + i, x1 + j, ch),
                                      self.input.shape)
        val = numpy.ravel(self.input.mem)[idx]
        self.input_offset.mem[batch, out_y, out_x, ch] = idx
        return val
class MeanDispNormalizer(AcceleratedUnit, TriviallyDistributable):
    """Normalizes multichannel byte images according to dataset mean and
    dispersion.

    Attributes:
        input: minibatch of images (dtype=numpy.uint8,
               shape[0]=minibatch_size).
        mean: mean image over the dataset (dtype=numpy.uint8).
        rdisp: 1.0 / dispersion over the dataset (float datatype).
        output: normalized float images of the same dtype as rdisp.
    """
    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "WORKER")
        super(MeanDispNormalizer, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.global_size = None
        self.local_size = None
        self.demand("input", "mean", "rdisp")

    def init_unpickled(self):
        super(MeanDispNormalizer, self).init_unpickled()
        self.sources_["mean_disp_normalizer"] = {}

    def initialize(self, device, **kwargs):
        super(MeanDispNormalizer, self).initialize(device, **kwargs)
        # Validate that all demanded inputs are live Arrays.
        for arr in self.input, self.mean, self.rdisp:
            if not isinstance(arr, Array):
                raise TypeError(
                    "veles.memory.Array type expected (got %s)" % type(arr))
            if not arr:
                raise ValueError("Invalid Array state")
        if len(self.input.shape) < 2:
            raise ValueError("input should be at least 2D")
        sample_size = self.mean.size
        if (self.input.sample_size != sample_size or
                self.rdisp.size != sample_size):
            raise ValueError(
                "Sample size of input differs from mean-rdisp size")
        if not self.output:
            # Output dtype follows rdisp (the float side of the formula).
            self.output.reset(numpy.zeros(self.input.shape,
                                          self.rdisp.dtype))
        else:
            assert self.output.shape == self.input.shape
        self.init_vectors(self.input, self.mean, self.rdisp, self.output)

    def _gpu_init(self):
        """Builds the program and binds kernel args (shared by OCL/CUDA)."""
        dtype = self.rdisp.dtype
        sample_size = self.mean.size
        defines = {
            "input_type": numpy_dtype_to_opencl(self.input.dtype),
            "mean_type": numpy_dtype_to_opencl(self.mean.dtype),
            "SAMPLE_SIZE": sample_size
        }
        self.build_program(defines, self.__class__.__name__, dtype=dtype)
        self.assign_kernel("normalize_mean_disp")
        self.set_args(self.input, self.mean, self.rdisp, self.output)

    def ocl_init(self):
        self._gpu_init()
        self.global_size = [self.mean.size, self.input.shape[0]]

    def cuda_init(self):
        self._gpu_init()
        self.local_size = 1, 1, 1
        self.global_size = self.mean.size, self.input.shape[0], 1

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.mean, self.rdisp, self.output)
        self.execute_kernel(self.global_size, self.local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()

    def numpy_run(self):
        """CPU path: output = (input - mean) * rdisp, element-wise."""
        self.input.map_read()
        self.mean.map_read()
        self.rdisp.map_read()
        self.output.map_invalidate()
        dtype = self.output.dtype
        self.output.matrix[:] = (
            self.input.matrix.astype(dtype)[:] -
            self.mean.plain.astype(dtype)) * self.rdisp.plain
class EvaluatorBase(AcceleratedUnit, TriviallyDistributable):
    hide_from_registry = True
    # NOTE: the string below follows an assignment, so it is a plain
    # statement rather than the class docstring.
    """Base class for evaluators.
    """

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "EVALUATOR")
        super(EvaluatorBase, self).__init__(workflow, **kwargs)
        self.mean = kwargs.get("mean", True)
        self.err_output = Array()
        self._merged_output = Array()
        self.krn_constants_i_ = None
        self.krn_constants_f_ = None
        self.demand("output", "batch_size")
        if self.testing:
            # In testing mode the outputs of every batch are merged
            # into one table, which requires these extra links.
            self.demand("class_lengths", "offset")

    @property
    def mean(self):
        """
        :return: True if the error function averages values.
        Default is True.
        """
        return self._mean

    @mean.setter
    def mean(self, value):
        if not isinstance(value, bool):
            raise TypeError("mean must be boolean (got %s)" % type(value))
        self._mean = value

    @property
    def merged_output(self):
        # Only meaningful in testing mode (see initialize()).
        assert self.testing
        return self._merged_output.mem

    def initialize(self, device, **kwargs):
        super(EvaluatorBase, self).initialize(device, **kwargs)
        dtype = self.output.dtype
        if self.testing:
            self._merged_output.reset(numpy.zeros(
                (self.class_lengths[TEST],) + self.output.shape[1:], dtype))
            return

        self.krn_constants_i_ = numpy.zeros(1, numpy.int32)
        self.krn_constants_f_ = numpy.zeros(1, dtype)
        self.err_output.reset(numpy.zeros_like(self.output.mem, dtype))

        for vec in self.output, self.err_output:
            vec.initialize(self.device)

    def run(self):
        if self.testing:
            # No error computation in testing mode; just collect outputs.
            self.output.map_read()
            self.merge_output()
            return
        return super(EvaluatorBase, self).run()

    def merge_output(self):
        """Copies the current minibatch output into the merged table."""
        self.merged_output[self.offset - self.batch_size:self.offset] = \
            self.output[:self.batch_size]

    def get_metric_names(self):
        if self.testing:
            return {"Output"}
        return set()

    def get_metric_values(self):
        if self.testing:
            return {"Output": self.merged_output}
        return {}
class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding
                 to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.
        """
        return (kx - sliding[1], ky - sliding[0],
                kx - sx % sliding[1] if sx % sliding[1] != 0
                else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0
                else ky - sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        # NOTE(review): both operands below test kx; given the pairing of
        # sliding[0] with ky in the check above, the first term probably
        # should be "ky % sliding[0]" — confirm before changing.
        if kx % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError("Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        # Per-element counters used only in unsafe padding mode.
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        # Deconv has no bias by design; remove the attribute so that
        # initialize() can assert it was not re-assigned.
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding",
                    "input", "weights", "output_shape_source")

    def init_unpickled(self):
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        super(Deconv, self).initialize(device, **kwargs)
        self._dtype = self.input.dtype
        self.weights_shape = (tuple(reversed(self.weights.shape))
                              if self.weights_transposed
                              else self.weights.shape)
        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4 or
                self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2 or
                self.weights_shape[0] != self.n_kernels or
                self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError("output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(
            output_shape[2], output_shape[1], self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            if not self.unsafe_padding:
                raise ValueError(
                    "Expected padding %s but got %s" % (padding,
                                                        self.padding))
            self._create_hits(output_shape)

        if not self.output:
            self.output.reset(numpy.zeros(output_shape, dtype=self._dtype))
        else:
            assert self.output.shape == output_shape

        self._output_shape = output_shape
        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels
        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        # Allocate the hits counter lazily; reuse a matching existing one.
        if not self.hits:
            self.hits.reset(numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        """Common OpenCL/CUDA setup: program build, kernels, GEMM, consts."""
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }
        self.build_program(
            defines, "%s/%s_%d_%dx%dx%d_%dx%d_%d" % (
                root.common.dirs.cache, self.__class__.__name__,
                self.input.shape[0],
                self._output_shape[2], self._output_shape[1],
                self._output_shape[3],
                self.kx, self.ky, self.n_kernels), dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        # Scratch buffer for GEMM results of one sub-batch of images.
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem,
                                          self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size,)
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size,)
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (
            self.execute_kernel((self.output.size,), None,
                                self.krn_clear_output_))
        self._clear_hits = lambda: (
            self.execute_kernel((self.hits.size,), None,
                                self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        self._global_size_pack = (
            lambda size: (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (
                int(numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        # CUDA can clear buffers with plain memsets instead of kernels.
        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        self.gpu_run()

    def cuda_run(self):
        self.gpu_run()

    def gpu_run(self):
        """Runs deconvolution by sub-batches of unpack_size images."""
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()
        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()

        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)

        if self.hits:
            # Average the accumulated values by their hit counts.
            self.execute_kernel(self._global_size_hits,
                                self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        # GEMM into the scratch buffer, then scatter ("pack") into output.
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs,
            self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        self.krn_pack_.set_arg(
            1, int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        # Same as the CUDA path, but offsets are passed as kernel/GEMM
        # arguments since OpenCL buffers are not raw pointers.
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem, self.input.devmem,
            self.np_zero, unpack_data, offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        raise NotImplementedError()
class MultiHistogram(Plotter):
    """Plotter drawing a grid of histograms, one per row of ``input``.

    Must be assigned before initialize():
        input
        input_field
    """
    def __init__(self, workflow, **kwargs):
        super(MultiHistogram, self).__init__(workflow, **kwargs)
        # Hard cap on how many histograms may be computed/drawn.
        self.limit = kwargs.get("limit", 64)
        # value[i, b] holds the height of bar b of histogram i.
        self.value = Array()
        self.n_bars = kwargs.get("n_bars", 25)
        self.hist_number = kwargs.get("hist_number", 16)
        self.demand("input")

    def initialize(self, **kwargs):
        super(MultiHistogram, self).initialize(**kwargs)
        if self.hist_number > self.limit:
            self.hist_number = self.limit
        self.value.mem = numpy.zeros(
            [self.hist_number, self.n_bars], dtype=numpy.int64)

    def redraw(self):
        """Draws all histograms on a roughly square grid of subplots."""
        fig = self.pp.figure(self.name)
        fig.clf()
        fig.patch.set_facecolor('#E8D6BB')
        # fig.patch.set_alpha(0.45)
        n_cols = int(numpy.round(numpy.sqrt(self.value.shape[0])))
        n_rows = int(numpy.ceil(self.value.shape[0] / n_cols))
        i = 0
        for _ in range(0, n_rows):
            for _ in range(0, n_cols):
                ax = fig.add_subplot(n_rows, n_cols, i + 1)
                ax.cla()
                # ax.axis('off')
                ax.patch.set_facecolor('#ffe6ca')
                # ax.set_xlabel("Input Data", fontsize=10)
                # ax.set_ylabel("Number", fontsize=10)
                ymin = self.value[i].min()
                ymax = self.value[i].max()
                xmin = self.input[i].min()
                xmax = self.input[i].max()
                ax.axis([xmin, xmax + ((xmax - xmin) / self.n_bars),
                         ymin, ymax])
                ax.grid(True)
                ax.set_title(self.name.replace("Histogram ", ""))
                nbars = self.n_bars
                width = ((xmax - xmin) / nbars) * 0.8
                X = numpy.linspace(xmin, xmax, num=nbars, endpoint=True)
                Y = self.value[i]
                # Thin red edges read better once the grid gets dense.
                if (n_rows > 5) or (n_cols > 5):
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='red')
                else:
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='lavender')
                # Drop tick labels on crowded grids to avoid overlap.
                if n_rows > 4:
                    ax.set_yticklabels([])
                if n_cols > 3:
                    ax.set_xticklabels([])
                i += 1
                if i >= self.value.shape[0]:
                    break
            if i >= self.value.shape[0]:
                break
        self.show_figure(fig)
        fig.canvas.draw()
        return fig

    def fill(self):
        """Computes the bar heights for every histogram row.

        A constant input row (zero range) is skipped, leaving its
        previous counts intact, while the remaining rows are still
        processed.
        """
        # Map once for the whole pass; these were loop-invariant.
        self.value.map_write()
        self.input.map_read()
        for i in range(self.hist_number):
            mx = self.input.mem[i].max()
            mi = self.input.mem[i].min()
            d = mx - mi
            if not d:
                # Fix: was "return", which silently aborted all the
                # remaining histograms when one row was constant.
                continue
            d = (self.n_bars - 1) / d
            self.value[i] = 0
            for x in self.input.mem[i]:
                i_bar = int(numpy.floor((x - mi) * d))
                self.value[i, i_bar] += 1
class EvaluatorMSE(EvaluatorBase):

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    # NOTE: the string below follows assignments, so it is documentation
    # text rather than the class docstring.
    """Evaluator of the MSE/RMSE error between network output and target.

    Must be assigned before initialize():
        output
        target
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        metrics
        n_err (only if labels and class_targets is not None)

    Creates within initialize():
        err_output
        n_err (only if labels and class_targets is not None)
        metrics

    Attributes:
        output: output of the network_common as Batch.
        target: target for the current Batch.
        err_output: backpropagation errors.
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse, [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        krn_constants_i_: numpy array for constant arguments to kernel.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: number of wrongly recognized samples
               (if labels and class_targets is not None).
    """

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE
                 (mean sum of squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))

        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}

        dtype = self.output.dtype

        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))

        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds programs/kernels shared by the OCL and CUDA paths.

        :return: the chosen block (work-group) size.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype": numpy_dtype_to_opencl(
                    self.class_targets.dtype)
            }

        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0]
            if self.class_targets else None,
            coeffs=self.normalizer.coefficients)

        self.assign_kernel("evaluate_mse")
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)

        if self.labels and self.class_targets:
            assert (self.labels.dtype == self.n_err.dtype == numpy.int32)
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem,
                self.class_targets.devmem,
                self.labels.devmem,
                self.n_err.devmem)

        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)

        batch_size = self.batch_size
        # Scalar kernel arguments: batch size and the averaging multiplier.
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])

        self.execute_kernel(self._global_size, self._local_size)

        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            # n_err[1] counts the total number of evaluated samples.
            self.n_err.map_write()
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert (self.output.size == self.target.size == self.err_output.size)
        batch_size = self.batch_size
        # The slices below must alias the underlying arrays, not copy them.
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # MSE is reported in denormalized units while err_output stays
            # in normalized units for backpropagation.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            # Classify each sample by the nearest class target and count
            # mismatches against the provided labels.
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            output = self.output.mem
        self.merged_output[self.offset - self.batch_size:self.offset] = output
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins  speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall
               winners table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        # Number of neurons == rows of the weights matrix.
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        # Input feature length == columns of the weights matrix.
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            # Without precomputed argmins we need the full distance matrix.
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size,
                                         dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # argmins are provided and no totals requested: nothing to do
            # on the device.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem,
                                      self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem,
                                   self.output.devmem, None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Compute distances then reduce to per-sample argmin.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Winners already known: copy argmins into output via host.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class DropoutForward(Forward, Dropout):
    """
    Forward propagation of dropout layer.
    """
    # Bounds for seeding the per-element RNG states on the device.
    MIN_RANDOM_STATE = 0
    MAX_RANDOM_STATE = 0x100000000

    MAPPING = {"dropout"}

    def __init__(self, workflow, **kwargs):
        super(DropoutForward, self).__init__(workflow, **kwargs)
        self.mask = Array()  # dropout mask
        self.states = Array()  # RNG states used by the device kernel
        self.rand = random_generator.get()

    @Dropout.dropout_ratio.setter
    def dropout_ratio(self, value):
        Dropout.dropout_ratio.fset(self, value)
        # Recompute the mask whenever the ratio changes after linkage.
        if hasattr(self, "input") and self.input is not None:
            self.calc_mask()

    def initialize(self, device, **kwargs):
        super(DropoutForward, self).initialize(device=device, **kwargs)
        self.mask.mem = numpy.empty_like(self.input.mem)
        # 4 uint32 state words per input element.
        self.states.mem = self.rand.randint(
            low=DropoutForward.MIN_RANDOM_STATE,
            high=DropoutForward.MAX_RANDOM_STATE,
            size=self.input.size * 4).astype(numpy.uint32)
        if not self.output:
            self.output.reset(numpy.zeros_like(self.input.mem))
        else:
            assert self.output.shape == self.input.shape
        self.init_vectors(self.input, self.output, self.states, self.mask)

    def _gpu_init(self):
        self._threshold_arg_ = numpy.empty(1, dtype=numpy.uint64)
        self._pass_arg_ = numpy.empty(1, dtype=self.input.dtype)
        self.build_program({"OUTPUT_SIZE": self.input.size},
                           "%s_%s" % (self.__class__.__name__,
                                      "x".join(str(x)
                                               for x in self.input.shape)),
                           dtype=self.input.dtype)
        self.assign_kernel("dropout_forward")
        # Args 1 and 2 (threshold, pass ratio) are filled in _gpu_run().
        self.set_args(self.input, self.device.skip(2), self.states,
                      self.mask, self.output)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.input.size,)
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        self._global_size = (
            int(numpy.ceil(self.input.size / block_size)), 1, 1)
        self._local_size = (block_size, 1, 1)

    def calc_mask(self):
        """Fills mask with 0 (dropped) or 1/leave_ratio (kept) entries."""
        leave_ratio = 1.0 - self.dropout_ratio
        # Uniform in [-dropout_ratio, leave_ratio); positives survive the
        # max/ceil combination below, negatives become zero.
        self.rand.fill(self.mask.mem, -self.dropout_ratio, leave_ratio)
        numpy.maximum(self.mask.mem, 0, self.mask.mem)
        numpy.ceil(self.mask.mem, self.mask.mem)
        # Scale kept units so the expected activation stays unchanged.
        self.mask.mem[:] = (self.mask.mem.astype(self.input.dtype) /
                            leave_ratio)

    def numpy_run(self):
        self.output.map_invalidate()
        self.input.map_read()
        if not self.forward_mode:
            self.mask.map_invalidate()
            self.calc_mask()
            numpy.multiply(self.input.mem.ravel(), self.mask.mem.ravel(),
                           ravel(self.output.mem))
        else:
            # Inference: dropout is a no-op pass-through.
            self.output.mem[:] = self.input.mem

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.output)
        if self.forward_mode:
            # Will copy input to output from outside (in cuda_run/ocl_run).
            return True
        self.unmap_vectors(self.states, self.mask)
        # NOTE(review): (1 << 64) - 1.0 rounds to 2**64 in float — the
        # threshold may be off by ~1 ulp of 2**64; presumably intentional
        # scaling of the ratio to the uint64 range — confirm.
        self._threshold_arg_[0] = ((1 << 64) - 1.0) * self.dropout_ratio
        self._pass_arg_[0] = 1.0 / (1.0 - self.dropout_ratio)
        self.set_arg(1, self._threshold_arg_)
        self.set_arg(2, self._pass_arg_)
        self.execute_kernel(self._global_size, self._local_size)
        return False

    def ocl_run(self):
        if self._gpu_run():
            self.device.queue_.copy_buffer(
                self.input.devmem, self.output.devmem, 0, 0,
                self.output.nbytes, need_event=False)

    def cuda_run(self):
        if self._gpu_run():
            self.output.devmem.from_device_async(self.input.devmem)
def __init__(self, workflow, **kwargs): super(DropoutForward, self).__init__(workflow, **kwargs) self.mask = Array() # dropout mask self.states = Array() self.rand = random_generator.get()
class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward): # TriviallyDistributable overrides nn_units.Forward IDistributable """Deconvolutional layer for simple convolutional layer with linear activation and without bias. Must be assigned before initialize(): input weights output_shape_source Updates after run(): output Creates within initialize(): output Attributes: input: input as batch of multichannel interleaved images. output: output as batch of multichannel interleaved images. weights: matrix of weights. output_shape_source: Array to get output shape from. n_kernels: number of convolutional kernels in the corresponding convolutional layer. kx: kernel width. ky: kernel height. sliding: tuple of kernel sliding (by x-axis, by y-axis), kx, ky MUST be a multiple of sliding to avoid irregularities. padding: tuple of virtual sample padding (left, top, right, bottom), will be computed automatically based on sliding. weights_transposed: assume weights matrix as a transposed one. unsafe_padding: flag to enable unsafe padding and/or sliding. """ MAPPING = {"deconv"} @staticmethod def compute_padding(sx, sy, kx, ky, sliding): """Computes required padding. 
""" return (kx - sliding[1], ky - sliding[0], kx - sx % sliding[1] if sx % sliding[1] != 0 else kx - sliding[1], ky - sy % sliding[0] if sy % sliding[0] != 0 else ky - sliding[0]) @staticmethod def check_padding_is_safe(kx, ky, sliding): if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1): raise ValueError( "sliding should not be greater than half of the kernel size") if kx % sliding[0] != 0 or kx % sliding[1] != 0: raise ValueError( "Kernel size should be multiple of sliding") def __init__(self, workflow, **kwargs): super(Deconv, self).__init__(workflow, **kwargs) self.unsafe_padding = kwargs.get("unsafe_padding", False) self.hits = Array() self.krn_clear_output_ = None self._global_size = None self._local_size = None del self.bias self.demand("n_kernels", "kx", "ky", "padding", "sliding", "input", "weights", "output_shape_source") def init_unpickled(self): super(Deconv, self).init_unpickled() self.sources_["deconv/forward"] = {} def initialize(self, device, **kwargs): super(Deconv, self).initialize(device, **kwargs) self._dtype = self.input.dtype self.weights_shape = (tuple(reversed(self.weights.shape)) if self.weights_transposed else self.weights.shape) if hasattr(self, "bias"): raise ValueError("bias should not be set") if (len(self.input.shape) != 4 or self.input.shape[3] != self.n_kernels): raise ValueError("Incorrectly shaped input encountered") if (len(self.weights_shape) != 2 or self.weights_shape[0] != self.n_kernels or self.weights_shape[1] % (self.kx * self.ky) != 0): raise ValueError("Incorrectly shaped weights encountered") output_shape = tuple(self.output_shape_source.shape) if len(output_shape) != 4: raise ValueError("Incorrect output_shape_source shape") if output_shape[0] != self.input.shape[0]: raise ValueError( "output_shape_source.shape[0] != input.shape[0]") try: self.check_padding_is_safe(self.kx, self.ky, self.sliding) except ValueError as e: if not self.unsafe_padding: raise from_none(e) self.warning("The padding will be unsafe") 
self._create_hits(output_shape) padding = Deconv.compute_padding( output_shape[2], output_shape[1], self.kx, self.ky, self.sliding) if self.padding is None: # pylint: disable=E0203 self.padding = padding elif self.padding != padding: if not self.unsafe_padding: raise ValueError( "Expected padding %s but got %s" % (padding, self.padding)) self._create_hits(output_shape) if self.output: assert self.output.shape[1:] == output_shape[1:] if not self.output or self.output.shape[0] != output_shape[0]: self.output.reset(numpy.zeros(output_shape, dtype=self._dtype)) self._output_shape = output_shape self._sy, self._sx, self._n_channels = self._output_shape[1:] self._kernel_size = self.kx * self.ky * self._n_channels self._kernel_app_per_image = self.input.sample_size // self.n_kernels self._kernel_app_total = (self._kernel_app_per_image * self.input.shape[0]) self.init_vectors(self.input, self.weights, self.output, self.hits) def _create_hits(self, output_shape): if not self.hits: self.hits.reset( numpy.zeros(output_shape, dtype=numpy.int32)) else: assert self.hits.size == int(numpy.prod(output_shape)) def _gpu_init(self, blas_class): defines = { "USE_ATOMICS": 1, "WEIGHTS_TRANSPOSED": int(self.weights_transposed), "BATCH": self._output_shape[0], "SX": self._sx, "SY": self._sy, "N_CHANNELS": self._n_channels, "KX": self.kx, "KY": self.ky, "N_KERNELS": self.n_kernels, "PAD_LEFT": self.padding[0], "PAD_TOP": self.padding[1], "PAD_RIGHT": self.padding[2], "PAD_BOTTOM": self.padding[3], "SLIDE_X": self.sliding[0], "SLIDE_Y": self.sliding[1], "USE_HITS": int(bool(self.hits)), "DECONV_MODE": int(bool(self.hits)) + 1, "OUTPUT_SIZE": self.output.size } self.build_program( defines, "%s/%s_%d_%dx%dx%d_%dx%d_%d" % ( root.common.dirs.cache, self.__class__.__name__, self.input.shape[0], self._output_shape[2], self._output_shape[1], self._output_shape[3], self.kx, self.ky, self.n_kernels), dtype=self._dtype) self.krn_pack_ = self.get_kernel("DirectPack") unpack_bytes = 
(self._kernel_app_per_image * self.unpack_size * self._kernel_size * self.input.itemsize) self.device.request_temp_buffer(unpack_bytes) if self.hits: self.krn_pack_.set_arg(3, self.hits.devmem) self.krn_apply_hits_ = self.get_kernel("apply_hits") self.krn_apply_hits_.set_args(self.output.devmem, self.hits.devmem) self.gemm_ = blas_class.gemm(self._dtype) self.np_one = numpy.ones(1, dtype=self._dtype) self.np_zero = numpy.zeros(1, dtype=self._dtype) self._const_i = numpy.zeros(1, dtype=numpy.int64) def ocl_init(self): ocl_blas.OCLBLAS.attach_to_device(self.device) self._gpu_init(ocl_blas.OCLBLAS) self._global_size_pack = lambda size: (size,) self._local_size_pack = None if self.hits: self.krn_clear_hits_ = self.get_kernel("clear_hits") self.krn_clear_hits_.set_arg(0, self.hits.devmem) self._global_size_hits = (self.output.size,) self._local_size_hits = None self.krn_clear_output_ = self.get_kernel("clear_output") self.krn_clear_output_.set_arg(0, self.output.devmem) self._clear_output = lambda: ( self.execute_kernel((self.output.size,), None, self.krn_clear_output_)) self._clear_hits = lambda: ( self.execute_kernel((self.hits.size,), None, self.krn_clear_hits_)) self._process_subblock = self._ocl_process_subblock self.krn_pack_.set_arg(1, self.output.devmem) def cuda_init(self): self._gpu_init(cublas.CUBLAS) block_size = self.device.suggest_block_size(self.krn_pack_) self._global_size_pack = ( lambda size: (int(numpy.ceil(size / block_size)), 1, 1)) self._local_size_pack = (block_size, 1, 1) if self.hits: block_size = self.device.suggest_block_size(self.krn_apply_hits_) self._global_size_hits = ( int(numpy.ceil(self.output.size / block_size)), 1, 1) self._local_size_hits = (block_size, 1, 1) self._clear_output = lambda: self.output.devmem.memset32_async() self._clear_hits = lambda: self.hits.devmem.memset32_async() self._process_subblock = self._cuda_process_subblock def ocl_run(self): self.gpu_run() def cuda_run(self): self.gpu_run() def gpu_run(self): 
self.unmap_vectors(self.output, self.input, self.weights) unpack_data = self.device.get_temp_buffer() self._clear_output() if self.hits: self.hits.unmap() self._clear_hits() batch_size = self.output.shape[0] for i in range(0, batch_size, self.unpack_size): self._process_subblock(i, min(batch_size - i, self.unpack_size), unpack_data) if self.hits: self.execute_kernel(self._global_size_hits, self._local_size_hits, self.krn_apply_hits_) def _cuda_process_subblock(self, start_image, image_count, unpack_data): output_offs = (start_image * self.input.sample_size * self.input.itemsize) unpack_side = self._kernel_app_per_image * image_count self.gemm_( self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, self._kernel_size, unpack_side, self.weights_shape[0], self.np_one, self.weights.devmem, int(self.input.devmem) + output_offs, self.np_zero, unpack_data) self.krn_pack_.set_arg(0, unpack_data) self.krn_pack_.set_arg( 1, int(self.output.devmem) + start_image * self.output.sample_size * self.output.itemsize) limit = unpack_side * self._kernel_size self._const_i[0] = limit self.krn_pack_.set_arg(2, self._const_i) self.execute_kernel(self._global_size_pack(limit), self._local_size_pack, self.krn_pack_) def _ocl_process_subblock(self, start_image, image_count, unpack_data): output_offs = start_image * self.input.sample_size unpack_side = self._kernel_app_per_image * image_count self.gemm_( self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, self._kernel_size, unpack_side, self.weights_shape[0], self.np_one, self.weights.devmem, self.input.devmem, self.np_zero, unpack_data, offsetB=output_offs) self.krn_pack_.set_arg(0, unpack_data) self._const_i[0] = start_image * self.output.sample_size self.krn_pack_.set_arg(2, self._const_i) limit = unpack_side * self._kernel_size self.execute_kernel(self._global_size_pack(limit), self._local_size_pack, self.krn_pack_) def 
numpy_run(self): raise NotImplementedError()