class Summator(AcceleratedUnit):
    """Adds two vectors pointwise: ``output = x + y``.

    Must be assigned before initialize():
        x
        y

    Updates after run():
        output

    Creates within initialize():
        output (only if it has not been allocated yet)
    """

    def __init__(self, workflow, **kwargs):
        super(Summator, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("x", "y")

    def initialize(self, device, **kwargs):
        """Allocates ``output`` like ``x`` (or validates an existing one)."""
        super(Summator, self).initialize(device, **kwargs)
        if not self.output:
            self.output.reset(numpy.zeros_like(self.x.mem))
        else:
            # A pre-allocated output must already match the first operand.
            assert self.output.shape == self.x.shape
        self.init_vectors(self.x, self.y, self.output)

    def init_unpickled(self):
        super(Summator, self).init_unpickled()
        self.sources_["summator"] = {}

    def _gpu_init(self):
        """Builds the program and binds the "add_forward" kernel."""
        self.build_program(
            {"OUTPUT_SIZE": self.output.size},
            "%s_%d" % (self.__class__.__name__, self.output.size),
            dtype=self.x.dtype)
        self.assign_kernel("add_forward")
        self.set_args(self.x, self.y, self.output)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        # Enough blocks to cover every output element.
        self._global_size = (
            int(numpy.ceil(self.output.size / block_size)), 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        """Unmaps the operands and the result, then runs the kernel."""
        self.unmap_vectors(self.x, self.y, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        """Computes the sum on the host, writing into ``output`` in place."""
        self.x.map_read()
        self.y.map_read()
        self.output.map_invalidate()
        numpy.add(self.x.mem, self.y.mem, self.output.mem)
class GDSummator(AcceleratedUnit):
    """Backward pass for Summator.

    The incoming error ``err_output`` is copied unchanged into both
    ``err_x`` and ``err_y``.
    """

    def __init__(self, workflow, **kwargs):
        super(GDSummator, self).__init__(workflow, **kwargs)
        self.err_x = Array()
        self.err_y = Array()
        self.demand("err_output")

    def initialize(self, device, **kwargs):
        super(GDSummator, self).initialize(device, **kwargs)
        reference = self.err_output
        for err in (self.err_x, self.err_y):
            if err:
                # Only the leading (batch) dimension is allowed to differ.
                assert err.shape[1:] == reference.shape[1:]
            if not err or err.shape[0] != reference.shape[0]:
                err.reset(numpy.zeros_like(reference.mem))
        self.init_vectors(self.err_x, self.err_y, self.err_output)

    def cuda_init(self):
        pass  # nothing to init

    def ocl_init(self):
        pass  # nothing to init

    def numpy_init(self):
        pass  # nothing to init

    def cuda_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        source = self.err_output.devmem
        self.err_x.devmem.from_device_async(source)
        self.err_y.devmem.from_device_async(source)

    def ocl_run(self):
        self.unmap_vectors(self.err_output, self.err_x, self.err_y)
        for destination in (self.err_x, self.err_y):
            self.device.queue_.copy_buffer(
                self.err_output.devmem, destination.devmem, 0, 0,
                self.err_output.nbytes, need_event=False)

    def numpy_run(self):
        self.err_output.map_read()
        self.err_x.map_invalidate()
        self.err_y.map_invalidate()
        for destination in (self.err_x, self.err_y):
            destination.mem[:] = self.err_output.mem[:]
class MemCpy(AcceleratedUnit):
    """Copies ``input`` into ``output`` on the active backend.

    ``output`` is (re)allocated in initialize() when it is missing or its
    element count differs from that of ``input``.
    """

    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(MemCpy, self).initialize(device, **kwargs)
        current = self.output.mem
        if current is None or current.size != self.input.mem.size:
            self.output.reset()
            source = self.input.mem
            self.output.mem = numpy.zeros(source.shape, dtype=source.dtype)
        for arr in (self.input, self.output):
            arr.initialize(self.device)

    def cuda_init(self):
        pass

    def ocl_init(self):
        pass

    def _gpu_run(self):
        # Both buffers have to be on the device side before copying.
        for arr in (self.input, self.output):
            arr.unmap()

    def ocl_run(self):
        self._gpu_run()
        queue = self.device.queue_
        queue.copy_buffer(self.input.devmem, self.output.devmem,
                          0, 0, self.input.nbytes)

    def cuda_run(self):
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)
class DropoutForward(Forward, Dropout):
    """Forward pass of the dropout layer.

    In training mode the input is multiplied by a freshly generated mask
    whose kept entries equal ``1 / (1 - dropout_ratio)``; in forward mode
    the input is copied to the output unchanged.
    """

    MIN_RANDOM_STATE = 0
    MAX_RANDOM_STATE = 0x100000000
    MAPPING = {"dropout"}

    def __init__(self, workflow, **kwargs):
        super(DropoutForward, self).__init__(workflow, **kwargs)
        self.mask = Array()  # dropout mask
        self.states = Array()
        self.rand = random_generator.get()

    @Dropout.dropout_ratio.setter
    def dropout_ratio(self, value):
        Dropout.dropout_ratio.fset(self, value)
        if not hasattr(self, "input") or self.input is None:
            return
        # The ratio changed while an input is attached: rebuild the mask.
        self.calc_mask()

    def initialize(self, device, **kwargs):
        super(DropoutForward, self).initialize(device=device, **kwargs)
        self.mask.mem = numpy.empty_like(self.input.mem)
        seeds = self.rand.randint(
            low=DropoutForward.MIN_RANDOM_STATE,
            high=DropoutForward.MAX_RANDOM_STATE,
            size=self.input.size * 4)
        self.states.mem = seeds.astype(numpy.uint32)
        if not self.output:
            self.output.reset(numpy.zeros_like(self.input.mem))
        else:
            assert self.output.shape == self.input.shape
        self.init_vectors(self.input, self.output, self.states, self.mask)

    def _gpu_init(self):
        # Host-side holders for the two scalar kernel arguments that are
        # filled in on every run (argument slots 1 and 2).
        self._threshold_arg_ = numpy.empty(1, dtype=numpy.uint64)
        self._pass_arg_ = numpy.empty(1, dtype=self.input.dtype)
        shape_tag = "x".join(str(dim) for dim in self.input.shape)
        cache_name = "%s_%s" % (self.__class__.__name__, shape_tag)
        self.build_program({"OUTPUT_SIZE": self.input.size}, cache_name,
                           dtype=self.input.dtype)
        self.assign_kernel("dropout_forward")
        self.set_args(self.input, self.device.skip(2), self.states,
                      self.mask, self.output)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.input.size,)
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        blocks = int(numpy.ceil(self.input.size / block_size))
        self._global_size = (blocks, 1, 1)
        self._local_size = (block_size, 1, 1)

    def calc_mask(self):
        """Regenerates the host-side dropout mask in place."""
        leave_ratio = 1.0 - self.dropout_ratio
        self.rand.fill(self.mask.mem, -self.dropout_ratio, leave_ratio)
        # Clamp negatives to 0 and round positives up to 1 -> binary mask.
        numpy.maximum(self.mask.mem, 0, self.mask.mem)
        numpy.ceil(self.mask.mem, self.mask.mem)
        scaled = self.mask.mem.astype(self.input.dtype) / leave_ratio
        self.mask.mem[:] = scaled

    def numpy_run(self):
        self.output.map_invalidate()
        self.input.map_read()
        if self.forward_mode:
            self.output.mem[:] = self.input.mem
            return
        self.mask.map_invalidate()
        self.calc_mask()
        numpy.multiply(self.input.mem.ravel(), self.mask.mem.ravel(),
                       ravel(self.output.mem))

    def _gpu_run(self):
        """Runs the kernel unless in forward mode.

        Returns:
            True if the caller still has to copy input to output
            (forward mode), False if the kernel produced the output.
        """
        self.unmap_vectors(self.input, self.output)
        if self.forward_mode:
            # Will copy input to output from outside (in cuda_run/ocl_run).
            return True
        self.unmap_vectors(self.states, self.mask)
        self._threshold_arg_[0] = ((1 << 64) - 1.0) * self.dropout_ratio
        self._pass_arg_[0] = 1.0 / (1.0 - self.dropout_ratio)
        self.set_arg(1, self._threshold_arg_)
        self.set_arg(2, self._pass_arg_)
        self.execute_kernel(self._global_size, self._local_size)
        return False

    def ocl_run(self):
        if not self._gpu_run():
            return
        self.device.queue_.copy_buffer(
            self.input.devmem, self.output.devmem, 0, 0,
            self.output.nbytes, need_event=False)

    def cuda_run(self):
        if not self._gpu_run():
            return
        self.output.devmem.from_device_async(self.input.devmem)
class MeanDispNormalizer(AcceleratedUnit, TriviallyDistributable):
    """Normalizes multichannel byte images with dataset mean and dispersion.

    Each sample is transformed as ``(input - mean) * rdisp``.

    Attributes:
        input: minibatch of images (dtype=numpy.uint8,
               shape[0]=minibatch_size).
        mean: mean image over the dataset (dtype=numpy.uint8).
        rdisp: 1.0 / dispersion over the dataset (float datatype).
        output: normalized float images of the same dtype as rdisp.
    """

    def __init__(self, workflow, **kwargs):
        kwargs.setdefault("view_group", "WORKER")
        super(MeanDispNormalizer, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.global_size = None
        self.local_size = None
        self.demand("input", "mean", "rdisp")

    def init_unpickled(self):
        super(MeanDispNormalizer, self).init_unpickled()
        self.sources_["mean_disp_normalizer"] = {}

    def initialize(self, device, **kwargs):
        super(MeanDispNormalizer, self).initialize(device, **kwargs)
        for arr in (self.input, self.mean, self.rdisp):
            if not isinstance(arr, Array):
                raise TypeError(
                    "veles.memory.Array type expected (got %s)" % type(arr))
            if not arr:
                raise ValueError("Invalid Array state")
        if len(self.input.shape) < 2:
            raise ValueError("input should be at least 2D")
        sample_size = self.mean.size
        sizes_agree = (self.input.sample_size == sample_size and
                       self.rdisp.size == sample_size)
        if not sizes_agree:
            raise ValueError(
                "Sample size of input differs from mean-rdisp size")
        if self.output:
            assert self.output.shape == self.input.shape
        else:
            self.output.reset(numpy.zeros(self.input.shape, self.rdisp.dtype))
        self.init_vectors(self.input, self.mean, self.rdisp, self.output)

    def _gpu_init(self):
        defines = {
            "input_type": numpy_dtype_to_opencl(self.input.dtype),
            "mean_type": numpy_dtype_to_opencl(self.mean.dtype),
            "SAMPLE_SIZE": self.mean.size
        }
        self.build_program(defines, self.__class__.__name__,
                           dtype=self.rdisp.dtype)
        self.assign_kernel("normalize_mean_disp")
        self.set_args(self.input, self.mean, self.rdisp, self.output)

    def ocl_init(self):
        self._gpu_init()
        # One work item per (sample element, sample) pair.
        self.global_size = [self.mean.size, self.input.shape[0]]

    def cuda_init(self):
        self._gpu_init()
        self.local_size = (1, 1, 1)
        self.global_size = (self.mean.size, self.input.shape[0], 1)

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.mean, self.rdisp, self.output)
        self.execute_kernel(self.global_size, self.local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()

    def numpy_run(self):
        self.input.map_read()
        self.mean.map_read()
        self.rdisp.map_read()
        self.output.map_invalidate()
        dtype = self.output.dtype
        # Convert to the output dtype before subtracting.
        centered = (self.input.matrix.astype(dtype)[:] -
                    self.mean.plain.astype(dtype))
        self.output.matrix[:] = centered * self.rdisp.plain
class EvaluatorMSE(EvaluatorBase):
    """Evaluator which computes the (root) mean squared error between the
    network output and the target.

    Must be assigned before initialize():
        output
        target
        normalizer
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        metrics
        mse
        n_err (only if labels and class_targets are set)

    Creates within initialize():
        metrics
        mse
        n_err

    Attributes:
        output: output of the network as a batch.
        target: target for the current batch.
        err_output: backpropagation errors (output - target).
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse,
                 [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: [0] - number of wrongly recognized samples,
               [1] - number of evaluated samples
               (only if labels and class_targets are set).
        root: if True the per-sample metric is RMSE, otherwise MSE.
    """

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE (mean sum of
                 squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return

        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))

        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}

        dtype = self.output.dtype

        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds the program and binds the kernels.

        Returns:
            The block size the "evaluate_mse" kernel was built for.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype": numpy_dtype_to_opencl(
                    self.class_targets.dtype)
            }

        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0] if self.class_targets
            else None, coeffs=self.normalizer.coefficients)

        self.assign_kernel("evaluate_mse")
        # Arguments 2 and 3 are set on every run (see _gpu_run).
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)

        if self.labels and self.class_targets:
            assert self.labels.dtype == self.n_err.dtype == numpy.int32
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem,
                self.class_targets.devmem,
                self.labels.devmem,
                self.n_err.devmem)

        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        # batch_size may change between runs, hence the deferred lookup.
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        # batch_size may change between runs, hence the deferred lookup.
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)

        batch_size = self.batch_size
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])

        self.execute_kernel(self._global_size, self._local_size)

        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            self.n_err.map_write()
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert self.output.size == self.target.size == self.err_output.size
        batch_size = self.batch_size
        # assert_addr is called on each slice against its parent buffer.
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # The metric uses denormalized copies of output and target;
            # err_output itself is left as output - target.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                # The predicted label is the nearest class target.
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        """Stores the current minibatch output into ``merged_output``.

        Only the first ``batch_size`` rows are written. The output is
        denormalized first unless the normalizer is a NoneNormalizer.
        """
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            # Slice to batch_size: the destination slice below holds exactly
            # batch_size rows, so a partially filled minibatch would not fit
            # if the whole buffer were assigned.
            output = self.output.mem[:self.batch_size]
        self.merged_output[self.offset - self.batch_size:self.offset] = output
class Forward(ForwardBase):
    """Base class for forward propagation units.

    Attributes:
        input: input layer values.
        output: output layer values.
        weights: weights.
        bias: bias.
        weights_stddev: magnitude of the random distribution for weights.
        bias_stddev: magnitude of the random distribution for bias
                     (defaults to weights_stddev).
        rand: prng.Rand() object for initial weights generation.
        forward_mode: boolean flag, re-read from kwargs in initialize().
        exports: names of the attributes returned by package_export().
    """

    hide_from_registry = True
    MAPPING = set()

    def __init__(self, workflow, **kwargs):
        kwargs.setdefault("view_group", "WORKER")
        super(Forward, self).__init__(workflow, **kwargs)
        option = kwargs.get
        self.weights_stddev = option("weights_stddev")
        self.bias_stddev = option("bias_stddev", self.weights_stddev)
        self.weights_filling = option("weights_filling", "uniform")
        self.bias_filling = option("bias_filling", "uniform")
        self.rand = option("rand", prng.get())
        self.weights_transposed = option("weights_transposed", False)
        self.include_bias = option("include_bias", True)
        self.demand("input")
        self.output = Array(shallow_pickle=True)
        self.weights = Array()
        self.bias = Array()
        self.forward_mode = False
        self.exports = ["weights", "bias",
                        "include_bias", "weights_transposed"]

    def package_export(self):
        """Collects the exported attributes into a dictionary.

        Attributes equal to None are skipped; Array values are replaced
        with their host memory.
        """
        data = {}
        for name in self.exports:
            value = getattr(self, name)
            if value is None:
                continue
            if isinstance(value, Array):
                value.map_read()
                value = value.mem
            data[name] = value
        return data

    @property
    def forward_mode(self):
        return self._forward_mode

    @forward_mode.setter
    def forward_mode(self, value):
        if isinstance(value, bool):
            self._forward_mode = value
            return
        raise TypeError(
            "forward_mode must be boolean (got %s)" % type(value))

    def initialize(self, device, **kwargs):
        self.forward_mode = kwargs.get("forward_mode", False)
        super(Forward, self).initialize(device=device, **kwargs)

    def generate_data_for_slave(self, slave):
        """Returns [weights, bias] host arrays (None for empty ones).

        Nothing is sent in forward mode.
        """
        if self.forward_mode:
            return None
        data = [None, None]
        for index, arr in enumerate((self.weights, self.bias)):
            if arr:
                arr.map_read()
                data[index] = arr.mem
        return data

    def generate_data_for_master(self):
        return None

    def apply_data_from_master(self, data):
        """Overwrites weights and bias with data received from master."""
        if self.forward_mode:
            return
        for arr, received in ((self.weights, data[0]),
                              (self.bias, data[1])):
            if arr:
                arr.map_invalidate()
                numpy.copyto(arr.mem, received)
            else:
                arr.reset(received)

    def apply_data_from_slave(self, data, slave):
        pass

    def drop_slave(self, slave):
        pass
class Binarization(AcceleratedUnit, EmptyDeviceMethodsMixin):
    """Input binarization.

    Input and output are 2D arrays of the same size. Each element A(i,j)
    (in row i and column j) of input is a float number between 0 and 1.
    Each element B(i,j) of output is equal to 1 with probability A(i,j)
    and to 0 with probability 1 - A(i,j).

    Must be assigned before initialize():
    * input
    * batch_size

    Updates after run():
    * output

    Creates within initialize():
    * output

    Attributes:
        input: input as batch of samples.
        output: output as batch of samples.
        rand: random generator used to draw the samples.
    """

    def __init__(self, workflow, **kwargs):
        super(Binarization, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.rand = kwargs.get("rand", prng.get())
        self.demand("input", "batch_size")

    def run(self):
        """Batch binarization on CPU only.

        Rows past ``batch_size`` are copied from the input unchanged.
        """
        self.output.map_invalidate()
        self.input.map_read()
        self.output.mem[:] = self.input.mem[:]
        self.output.mem[:self.batch_size, :] = self.matlab_binornd(
            1, self.input.mem[:self.batch_size, :])

    def initialize(self, device, **kwargs):
        super(Binarization, self).initialize(device=device, **kwargs)
        if not self.output or self.output.size != self.input.size:
            self.output.reset()
            self.output.mem = numpy.zeros_like(self.input.mem)
        self.output.initialize(self.device)

    def matlab_binornd(self, n, p_in):
        """Analogue of binornd in Matlab, but n must be scalar.

        Generates an array of random variables where each element is drawn
        from the binomial distribution with the number of trials n and the
        success probability given by the matching element of p_in.

        Args:
            n (int): number of trials.
            p_in (1 or 2 dimension numpy.array): success probabilities.

        Returns:
            numpy.array of the same shape as p_in with the numbers of
            successes.

        Raises:
            ValueError: if p_in is neither 1- nor 2-dimensional.
        """
        p = numpy.asarray(p_in)
        if len(p.shape) == 2:
            nrow, ncol = p.shape
            # Flatten column by column (Matlab element order) so that the
            # random numbers are consumed in the same order as before.
            flat = numpy.transpose(p).flatten()
            f = self.rand.rand(n, flat.shape[0])
            # Broadcasting compares every trial row against all
            # probabilities; a success is a draw below the probability.
            res = numpy.sum(f < flat, axis=0)
            return numpy.transpose(res.reshape(ncol, nrow))
        if len(p.shape) == 1:
            # One independent draw per element and per trial. The number of
            # columns must be the number of probabilities, not the number
            # of trials.
            f = self.rand.rand(n, p.shape[0])
            return numpy.sum(f < p, axis=0)
        raise ValueError("shape of input Binarization class "
                         "must be 1 or 2 dimensions")
class Multiplier(AcceleratedUnit):
    """Pointwise product of two vectors: ``output = x * y``.

    initialize() returns True while either operand is still empty.
    """

    def __init__(self, workflow, **kwargs):
        super(Multiplier, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("x", "y")

    def _output_needs_reset(self):
        """Tells whether output is missing or has a stale batch dimension."""
        if not self.output and (self.x or self.y):
            return True
        for operand in (self.x, self.y):
            if operand and self.output.shape[0] != operand.shape[0]:
                return True
        return False

    def initialize(self, device, **kwargs):
        if self._output_needs_reset():
            template = self.x.mem if self.x else self.y.mem
            self.output.reset(numpy.zeros_like(template))
        if not (self.x and self.y):
            # At least one operand is still empty.
            return True
        super(Multiplier, self).initialize(device, **kwargs)
        assert self.output.shape == self.x.shape == self.y.shape
        self.init_vectors(self.x, self.y, self.output)

    def init_unpickled(self):
        super(Multiplier, self).init_unpickled()
        self.sources_["multiplier"] = {}

    def _gpu_init(self):
        size = self.output.size
        cache_name = "%s_%d" % (self.__class__.__name__, size)
        self.build_program({"OUTPUT_SIZE": size}, cache_name,
                           dtype=self.x.dtype)
        self.assign_kernel("multiply_forward")
        self.set_args(self.x, self.y, self.output)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        blocks = int(numpy.ceil(self.output.size / block_size))
        self._global_size = (blocks, 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        self.unmap_vectors(self.x, self.y, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        self.x.map_read()
        self.y.map_read()
        self.output.map_invalidate()
        numpy.multiply(self.x.mem, self.y.mem, self.output.mem)
class InputJoiner(AcceleratedUnit):
    """Joins several minibatch inputs into one continuous minibatch output.

    Attributes:
        input_0, input_1, ...: inputs of type Array(), created via
                               link_inputs or by assigning ``inputs``.
        offset_0, offset_1, ...: offsets of each input in elements,
                                 have valid values after initialize().
        length_0, length_1, ...: lengths of each input in elements,
                                 have valid values after initialize().
        output: Array() with one row per sample; the number of rows is the
                minimum of the first dimensions of the inputs.
    """

    def __init__(self, workflow, **kwargs):
        super(InputJoiner, self).__init__(workflow, **kwargs)
        self.output = Array()
        self._num_inputs = 0
        self.inputs = kwargs.get("inputs")

    def init_unpickled(self):
        super(InputJoiner, self).init_unpickled()
        self.sources_["join"] = {}

    @property
    def num_inputs(self):
        return self._num_inputs

    @num_inputs.setter
    def num_inputs(self, value):
        """Grows or shrinks the set of input_N/offset_N/length_N attributes.

        Raises:
            ValueError: if value cannot be converted to int.
        """
        try:
            value = int(value)
        except (ValueError, TypeError):
            raise ValueError("num_inputs must be convertible to int")
        # Drop the attributes of the inputs which no longer exist.
        for x in range(value, self._num_inputs):
            try:
                delattr(self, "input_%d" % x)
                delattr(self, "offset_%d" % x)
                delattr(self, "length_%d" % x)
            except AttributeError:
                pass
        # Create placeholders for the newly added inputs.
        for x in range(self._num_inputs, value):
            setattr(self, "input_%d" % x, None)
            setattr(self, "offset_%d" % x, None)
            setattr(self, "length_%d" % x, None)
        self._num_inputs = value

    @property
    def inputs(self):
        return list(getattr(self, "input_%d" % x)
                    for x in range(self._num_inputs))

    @inputs.setter
    def inputs(self, value):
        """Replaces all inputs; None removes every input.

        Raises:
            TypeError: if value is neither None nor iterable.
        """
        if value is None:
            self.num_inputs = 0
            return
        if not hasattr(value, "__iter__"):
            raise TypeError("inputs must be iterable")
        self.num_inputs = len(value)
        for i, inp in enumerate(value):
            setattr(self, "input_%d" % i, inp)

    @property
    def offsets(self):
        return list(getattr(self, "offset_%d" % x)
                    for x in range(self._num_inputs))

    @property
    def lengths(self):
        return list(getattr(self, "length_%d" % x)
                    for x in range(self._num_inputs))

    def link_inputs(self, other, *args):
        """Adds more inputs and links them.

        It will link args to attributes named
        "input_0", "input_1", etc.

        Parameters:
            other: unit from which to link attributes.
            args: attribute names to link.

        Raises:
            ValueError: if args is empty.
        """
        if not len(args):
            raise ValueError("args may not be empty")
        num_inputs = self.num_inputs
        self.num_inputs = num_inputs + len(args)
        for arg in args:
            self.link_attrs(other, ("input_%d" % num_inputs, arg))
            num_inputs += 1

    def _init_offset_length_attributes(self):
        """Initializes offset_0, offset_1, ... length_0, length_1, ...
        """
        offset = 0
        for i in range(self.num_inputs):
            inp = getattr(self, "input_%d" % i)
            setattr(self, "offset_%d" % i, offset)
            setattr(self, "length_%d" % i, inp.sample_size)
            offset += inp.sample_size

    def initialize(self, device, **kwargs):
        """Allocates (or validates) the output.

        Returns:
            True if at least one input has no memory yet, so the
            initialization could not be done.
        """
        if any(i.mem is None for i in self.inputs):
            # Not yet ready to initialize
            return True

        self._init_offset_length_attributes()

        super(InputJoiner, self).initialize(device=device, **kwargs)

        minibatch_size = min(i.shape[0] for i in self.inputs)
        if any(i.shape[0] > minibatch_size for i in self.inputs):
            self.warning("Detected inputs of different sizes. Sizes will be "
                         "cut to the lowest value (%d)", minibatch_size)

        output_shape = (minibatch_size,
                        sum(i.size // i.shape[0] for i in self.inputs))
        if not self.output:
            self.output.reset(numpy.zeros(output_shape, self.inputs[0].dtype))
        else:
            assert self.output.shape == output_shape

        self.init_vectors(self.output, *self.inputs)

    def _gpu_init(self):
        defines = {
            'etype': opencl_types.numpy_dtype_to_opencl(self.output.dtype),
        }
        self.build_program(
            defines, "%s_%d_%s" %
            (type(self).__name__, self.output.shape[0],
             "_".join(map(str, self.output.shape[1:]))), inputs=self.inputs)
        self.assign_kernel("join")
        self.set_args(self.output, *self.inputs)

    def ocl_init(self):
        self._gpu_init()

    def cuda_init(self):
        self._gpu_init()

    def numpy_run(self):
        self.output.map_invalidate()  # we will update output on CPU
        minibatch_size = self.output.shape[0]
        low = 0
        for inp in self.inputs:
            inp.map_read()
            high = low + inp.size // inp.shape[0]
            if low >= high:
                break
            self.output.mem[:, low:high] = inp[:minibatch_size]
            low = high

    def _unmap_all(self):
        """Unmaps output and all the inputs before a kernel launch."""
        # The kernel writes into output, so it must be unmapped as well.
        self.output.unmap()
        for inp in self.inputs:
            inp.unmap()

    def ocl_run(self):
        self._unmap_all()
        self.execute_kernel(*((self.output.shape[0],),) * 2)

    def cuda_run(self):
        self._unmap_all()
        # TODO(a.kazantsev): rewrite CUDA kernel for proper grid size
        self.execute_kernel((1, 1, 1), (self.output.shape[0], 1, 1))
class GDMultiplier(AcceleratedUnit):
    """Backward pass for Multiplier.

    Computes ``err_x = err_output * y`` and ``err_y = err_output * x``.
    """

    def __init__(self, workflow, **kwargs):
        super(GDMultiplier, self).__init__(workflow, **kwargs)
        self.err_x = Array()
        self.err_y = Array()
        self.demand("x", "y", "err_output")

    def initialize(self, device, **kwargs):
        super(GDMultiplier, self).initialize(device, **kwargs)
        # Each gradient buffer mirrors the shape of its own operand.
        if self.err_x:
            assert self.err_x.shape == self.x.shape
        else:
            self.err_x.reset(numpy.zeros_like(self.x.mem))
        if self.err_y:
            assert self.err_y.shape == self.y.shape
        else:
            self.err_y.reset(numpy.zeros_like(self.y.mem))
        self.init_vectors(self.err_x, self.err_y,
                          self.x, self.y, self.err_output)

    def init_unpickled(self):
        super(GDMultiplier, self).init_unpickled()
        self.sources_["multiplier"] = {}

    def _gpu_init(self):
        size = self.err_output.size
        cache_name = "%s_%d" % (self.__class__.__name__, size)
        self.build_program({"OUTPUT_SIZE": size}, cache_name,
                           dtype=self.x.dtype)
        self.assign_kernel("multiply_backward")
        self.set_args(self.x, self.y, self.err_output,
                      self.err_x, self.err_y)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        blocks = int(numpy.ceil(self.err_output.size / block_size))
        self._global_size = (blocks, 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.err_output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        self.unmap_vectors(self.x, self.y, self.err_output,
                           self.err_x, self.err_y)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        self.x.map_read()
        self.y.map_read()
        self.err_output.map_read()
        self.err_x.map_invalidate()
        self.err_y.map_invalidate()
        incoming = self.err_output.mem
        # Product rule: each operand's gradient uses the other operand.
        numpy.multiply(incoming, self.y.mem, self.err_x.mem)
        numpy.multiply(incoming, self.x.mem, self.err_y.mem)
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    For every input sample finds the winner neuron. If ``argmins`` is
    linked, the winners are taken from it instead of being recomputed.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output
        total (if total == True)

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
               table (otherwise None).
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            # These must be linked by the user when the total table is on.
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        """First dimension of the weights array."""
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        """Second dimension of the weights array."""
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            # Distances are needed only when winners are computed here.
            self._distances.reset(numpy.zeros(
                [batch_size, self.neurons_number],
                dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            # One-element host buffer passed to the "set_total" kernel.
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come from argmins and there is no total table:
            # no kernel is required at all.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Argument 1 (the minibatch offset) is set on every run.
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            # Distance and argmin kernels are not used with linked argmins.
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Copy the precomputed winners on the host side.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # Offset where the current minibatch starts.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                # The winner is the neuron with the smallest norm of the
                # difference between its weights and the sample.
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class KohonenTrainer(KohonenBase, AcceleratedUnit):
    """KohonenForward train pass.

    Must be assigned before initialize():
        input
        shape

    Creates within initialize():
        weights
        winners
        argmins
        _distances
        _coords

    Updates after run():
        weights

    Attributes:
        weights: weights of the current layer.
        input: input of the current layer as batch of 1D samples.
        winners: per-neuron counter incremented each time the neuron wins.
        time: number of completed run iterations; drives both decays.
        gradient_decay: callable(time) -> gradient multiplier.
        radius_decay: callable(time) -> gravity radius factor.
        _krn_distances_: computes distances between input and neuron
                         weights.
        _krn_argmin_: finds indexes of minimal computed distances.
        _krn_gravity_: computes gravity to the winner neuron.
        _krn_apply_gradient_: applies gradient to weights.
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenTrainer, self).__init__(workflow, **kwargs)
        self._distances = Array()
        self.argmins = Array()
        self._coords = Array()
        self.weights = Array()
        self.winners = Array()
        self.weights_filling = kwargs.get("weights_filling", "uniform")
        self.weights_stddev = kwargs.get("weights_stddev", None)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.time = 0
        self._sigma = 0
        self.gradient_decay = kwargs.get(
            "gradient_decay", lambda t: 0.1 / (1.0 + t * 0.05))
        self.radius_decay = kwargs.get(
            "radius_decay", lambda t: 1.0 / (1.0 + t * 0.05))
        self.demand("input", "shape")
        self._shape = kwargs.get("shape")

    def init_unpickled(self):
        super(KohonenTrainer, self).init_unpickled()
        self.sources_["kohonen"] = {"TRAIN": 1}
        self._krn_distances_ = None
        self._krn_argmin_ = None
        self._krn_gravity_ = None
        self._krn_compute_gradients_ = None
        self._krn_apply_gradients_ = None
        # ocl_init() stores the kernel under this (singular) name, so make
        # sure the attribute always exists after unpickling as well.
        self._krn_apply_gradient_ = None

    @property
    def gravity_radius(self):
        """Current neighbourhood radius: radius_decay(time) * _sigma."""
        return self.radius_decay(self.time) * self._sigma

    @property
    def gradient_multiplier(self):
        """Current learning rate: gradient_decay(time)."""
        return self.gradient_decay(self.time)

    @property
    def shape(self):
        return self._shape

    @shape.setter
    def shape(self, value):
        self._shape = value

    def initialize(self, device, **kwargs):
        """Allocates weights, winners, distances, argmins and the neuron
        coordinate grid, and derives _sigma from the grid extent.
        """
        super(KohonenTrainer, self).initialize(device=device, **kwargs)

        self._neurons_number = self.shape[0] * self.shape[1]
        self._sample_length = self.input.mem.size // self.input.mem.shape[0]

        # Initialize weights
        if self.weights_stddev is None:
            # Get weights magnitude and cap it to 0.05
            self.weights_stddev = min(self._get_weights_magnitude(), 0.05)
        weights_size = (self._sample_length * self._neurons_number)
        if not self.weights:
            self.weights.reset(numpy.zeros(weights_size,
                                           dtype=self.input.mem.dtype))
            filling = {
                "uniform": lambda rand: rand.fill(
                    self.weights.mem, -self.weights_stddev,
                    self.weights_stddev),
                "gaussian": lambda rand: rand.fill_normal_real(
                    self.weights.mem, 0, self.weights_stddev)
            }
            filling[self.weights_filling](prng.get())
            self.weights.mem = self.weights.mem.reshape((
                self._neurons_number, self._sample_length))
        else:
            assert self.weights.shape == (self._neurons_number,
                                          self._sample_length)
        if self.weights_transposed:
            # Reshape weights as a matrix:
            wtrncopy = self.weights.mem.transpose().copy()
            self.weights.mem.shape = wtrncopy.shape
            self.weights.mem[:] = wtrncopy[:]
        self._sample_length = \
            self.weights.mem.shape[0 if self.weights_transposed else 1]

        # Initialize winners
        self.winners.reset(numpy.zeros(self._neurons_number, numpy.int32))

        # Initialize distances
        batch_size = self.input.mem.shape[0]
        self._distances.reset(numpy.zeros(
            [batch_size, self._neurons_number],
            dtype=self.weights.mem.dtype))
        self.argmins.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        self._coords.reset(numpy.zeros([self._neurons_number, 2],
                                       dtype=self.weights.mem.dtype))

        # Lay the neurons out on a near-square grid inside [-1, 1] x [-1, 1];
        # odd rows are shifted by half a step.
        sz = self._neurons_number
        rows = int(numpy.round(numpy.sqrt(sz)))
        cols = sz // rows
        if sz % rows != 0:
            cols += 1
        x_min = -1.0
        x_max = 1.0
        y_min = -1.0
        y_max = 1.0
        x_step = (x_max - x_min) / (cols - 1) if cols > 1 else 0
        y = y_min
        y_step = (y_max - y_min) / (rows - 1) if rows > 1 else 0
        offs = 0
        mem = self._coords.mem
        for _row in range(rows):
            x = x_min + (x_step * 0.5 if _row & 1 else 0)
            # rows * cols may exceed the number of neurons when sz is not a
            # multiple of rows; never write past the last neuron.
            for _col in range(min(cols, sz - offs)):
                mem[offs, 0] = x
                mem[offs, 1] = y
                offs += 1
                x += x_step
            y += y_step

        self._sigma = (self._coords.mem.ravel().max() -
                       self._coords.mem.ravel().min()) * 1.42

    def ocl_init(self):
        """Uploads the arrays, builds the program and binds the kernels."""
        self.input.initialize(self.device)
        self.weights.initialize(self.device)
        self.winners.initialize(self.device)
        self.argmins.initialize(self.device)
        self._distances.initialize(self.device)
        self._coords.initialize(self.device)

        batch_size = self.input.mem.shape[0]
        chunk_size = self._neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self._neurons_number // 2 + 1
        self.argmin_group_size = int(numpy.ceil(float(self._neurons_number) /
                                                chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self._sample_length,
            'NEURONS_NUMBER': self._neurons_number,
            'CHUNK_SIZE': chunk_size,
            'GRADIENT_CHUNK_SIZE': self.device.max_group_size,
            'coord_type': "%s%d" %
            (opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
             self._coords.mem.shape[-1])
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self._sample_length,
                            self._neurons_number),
                           dtype=self.weights.mem.dtype)

        # One-element buffer used to pass scalar kernel arguments.
        self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem,
                                   self.argmins.devmem,
                                   self.winners.devmem)

        # Argument 2 of the next two kernels is set on every ocl_run().
        self._krn_gravity_ = self.get_kernel("compute_gravity")
        self._krn_gravity_.set_args(self.argmins.devmem, self._coords.devmem)
        self._krn_gravity_.set_arg(3, self._distances.devmem)

        self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
        self._krn_apply_gradient_.set_args(self.input.devmem,
                                           self._distances.devmem)
        self._krn_apply_gradient_.set_arg(3, self.weights.devmem)

        self._gs_distance = [
            roundup(self._neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def iteration(fn):
        """Decorator: increments self.time after the wrapped run method."""
        def wrapped(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            self.time += 1
            return result

        name = getattr(fn, '__name__',
                       getattr(fn, 'func', wrapped).__name__)
        wrapped.__name__ = name + '_iteration'
        return wrapped

    @iteration
    def numpy_run(self):
        """Does one training step over the whole batch on the host."""
        batch_size = self.input.mem.shape[0]
        neurons_number = self._neurons_number
        dists = numpy.empty(neurons_number)
        gradients = numpy.zeros(self.weights.mem.shape)
        sigma = self.gravity_radius
        gmult = self.gradient_multiplier
        self.input.map_read()
        # Both arrays are read and then updated in place, so they must be
        # mapped for read-write rather than invalidated.
        self.weights.map_write()
        self.winners.map_write()

        for sindex in range(batch_size):
            dist = self.weights.mem - self.input[sindex]
            winner = numpy.argmin(self.numpy_linalg_norm(dist))
            self.winners[winner] += 1
            winner_coords = self._coords.mem[winner]
            # Squared grid distance from every neuron to the winner; stored
            # into the preallocated buffer to keep its dtype.
            offsets = self._coords.mem - winner_coords
            dists[:] = numpy.sum(offsets * offsets, axis=1)
            gravity = numpy.exp(dists / (-2 * sigma * sigma))
            gradients += gravity.reshape((1, neurons_number)).transpose() * \
                (self.input[sindex] - self.weights.mem) * gmult
        self.weights.mem += gradients

    @iteration
    def ocl_run(self):
        """Does one training step over the whole batch with OpenCL kernels."""
        self.unmap_vectors(self.input, self.weights, self.winners,
                           self._distances, self.argmins, self._coords)

        batch_size = self.input.mem.shape[0]
        self.execute_kernel(self._gs_distance, self._ls_distance,
                            self._krn_distances_)
        self.execute_kernel([self.argmin_group_size],
                            [self.argmin_group_size],
                            self._krn_argmin_)
        self.ocl_consts_[0] = self.gravity_radius
        self._krn_gravity_.set_arg(2, self.ocl_consts_[0:1])
        self.execute_kernel([batch_size, self._neurons_number], None,
                            self._krn_gravity_)
        self.ocl_consts_[0] = self.gradient_multiplier
        self._krn_apply_gradient_.set_arg(2, self.ocl_consts_[0:1])
        self.execute_kernel(
            [int(numpy.ceil(self._sample_length /
                            self.device.max_group_size)),
             self.device.max_group_size],
            None, self._krn_apply_gradient_)

    # The decorator was needed as a plain function while the class body was
    # being executed; expose it as a static method afterwards.
    iteration = staticmethod(iteration)

    def _get_weights_magnitude(self):
        """
        Returns: weights magnitude for initial random distribution,
                 such that activation function will be near maximum
                 if all input values are at their supposed max value.

        Doesn't matter for classic Kohonen networks,
        get values as in All2AllTanh.
        """
        d = self.input.max_supposed * self._sample_length
        if self.input.mem.dtype in (numpy.complex64, numpy.complex128):
            return 1.0 / d
        return 9.0 / d
class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step.

    The weights are multiplied element-wise by a 0/1 mask. The mask is 0
    where (row index % grouping) equals (column index % grouping) of the
    weights viewed as a 2D matrix (see effective_shape), and 1 elsewhere.

    Must be assigned before initialize():
        weights

    Creates within initialize():
        mask

    Updates after run():
        weights

    Attributes:
        mask: Array with the multiplicative mask, shaped as effective_shape.
        grouping: integer >= 2 (validated by the property setter).
    """

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)
        self.mask = Array()
        # NOTE(review): the default of 1 is rejected by the grouping setter
        # (ValueError), so "grouping" effectively has to be passed
        # explicitly - confirm whether that is intended.
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        """Weights shape as 2D: (first dimension, all the rest flattened)."""
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        if not isinstance(value, int):
            raise TypeError(
                "grouping value must be an integer (got %s)" % type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        """Builds the mask (if not supplied) and initializes both vectors.

        Returns:
            True if weights are not set yet, otherwise None.

        Raises:
            ValueError: the second effective dimension is not a multiple of
                        grouping.
        """
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            rows, cols = self.effective_shape
            if cols % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" %
                    (self.weights.shape, self.grouping))
            self.mask.reset(numpy.zeros(self.effective_shape,
                                        dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            # Broadcast comparison of row groups against column groups
            # replaces the element-by-element Python loop.
            row_groups = numpy.arange(rows).reshape(rows, 1) % self.grouping
            col_groups = numpy.arange(cols).reshape(1, cols) % self.grouping
            self.mask.mem[:] = row_groups != col_groups
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        self.mask.map_read()
        self.weights.map_write()
        # The mask is always 2D (effective_shape) while the weights may have
        # any rank with the same number of elements; align the shapes so the
        # product is element-wise instead of failing to broadcast.
        self.weights.mem *= self.mask.mem.reshape(self.weights.mem.shape)

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()
class Uniform(AcceleratedUnit):
    """Generates random numbers from uniform distribution.

    Attributes:
        num_states: number of random states for parallel generation.
        states: Array of random states.
        prng: veles.prng.RandomGenerator for initial states generation.
        output_bytes: number of output bytes to generate.
        output: Array of bytes which receives the generated numbers.
        cl_const: one-element int32 buffer with the number of generation
                  rounds, passed to the kernel as argument 1.
    """

    backend_methods = AcceleratedUnit.backend_methods + ("fill",)

    def __init__(self, workflow, **kwargs):
        super(Uniform, self).__init__(workflow, **kwargs)
        self.num_states = kwargs.get("num_states", 256)
        self.states = Array()
        self.prng = kwargs.get("prng", get())
        self.output_bytes = kwargs.get("output_bytes", 0)
        self.output = Array()
        self.cl_const = numpy.zeros(1, dtype=numpy.int32)

    def init_unpickled(self):
        super(Uniform, self).init_unpickled()
        self.sources_["random"] = {}

    def initialize(self, device, **kwargs):
        """Seeds the states (when absent or of a wrong size) and allocates
        the output buffer rounded up to a whole number of rounds.
        """
        super(Uniform, self).initialize(device, **kwargs)

        # Every state is 16 64-bit words stored as pairs of uint32, so the
        # expected element count has to include the factor of 2; comparing
        # against num_states * 16 never matched the allocated array and
        # re-seeded the states on every initialize().
        states_size = self.num_states * 16 * 2
        if not self.states or self.states.size != states_size:
            self.states.reset(numpy.empty(states_size, dtype=numpy.uint32))
            self.states.mem[:] = self.prng.randint(0, (1 << 32) + 1,
                                                   self.states.size)

        if not self.output or self.output.nbytes < self.output_bytes:
            self.output_bytes = roundup(self.output_bytes,
                                        self.num_states * 16 * 8)
            self.output.reset(numpy.zeros(self.output_bytes, numpy.uint8))
        else:
            self.output_bytes = self.output.nbytes

        self.init_vectors(self.states, self.output)

    def _gpu_init(self):
        self.build_program({}, "uniform_%d" % self.num_states)

        self.assign_kernel("random_xorshift1024star")
        self.set_args(self.states, self.cl_const, self.output)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.num_states]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        # Split num_states into n blocks of l threads: l is the largest
        # power of two (at most 32) which divides num_states.
        n = self.num_states
        l = 1
        while not (n & 1) and l < 32:
            n >>= 1
            l <<= 1
        self._global_size = (n, 1, 1)
        self._local_size = (l, 1, 1)

    def _gpu_fill(self, nbytes):
        """Runs the kernel to generate nbytes (rounded up to whole rounds).

        Raises:
            error.Bug: the rounded size does not fit into output.
        """
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.unmap_vectors(self.states, self.output)
        self.cl_const[0] = nbytes // bytes_per_round
        self.set_arg(1, self.cl_const)
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def cuda_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def numpy_fill(self, nbytes):
        """Host implementation of the generator.

        Raises:
            error.Bug: the rounded size does not fit into output.
        """
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.states.map_write()
        self.output.map_invalidate()
        n_rounds = nbytes // bytes_per_round

        u64 = numpy.array([1181783497276652981], dtype=numpy.uint64)
        # One-element arrays are used as scratch so that the arithmetic is
        # done on numpy uint64 values.
        s0 = numpy.zeros(1, dtype=numpy.uint64)
        s1 = numpy.zeros(1, dtype=numpy.uint64)

        states = self.states.mem.view(dtype=numpy.uint64)
        states = states.reshape(states.size // 16, 16)
        output = self.output.mem.view(dtype=numpy.uint64)
        for i in range(self.num_states):
            # Numbers produced by state i are interleaved in the output with
            # a stride of num_states.
            offs = i
            s = states[i]  # view: updates go straight into self.states
            self.p = 0
            for _round in range(n_rounds):
                for _iter in range(16):
                    output[offs] = self._next_rand(s, s0, s1, u64)
                    offs += self.num_states

    def _next_rand(self, s, s0, s1, u64):
        """Advances state s (position self.p) and returns the next number."""
        s0[0] = s[self.p]
        self.p = (self.p + 1) & 15
        s1[0] = s[self.p]
        s1 ^= s1 << 31
        s1 ^= s1 >> 11
        s0 ^= s0 >> 30
        s0 ^= s1
        s[self.p] = s0[0]
        return (s0 * u64)[0]

    def fill(self, nbytes):
        self._backend_fill_(nbytes)

    def ocl_run(self):
        self.ocl_fill(self.output.nbytes)

    def cuda_run(self):
        self.cuda_fill(self.output.nbytes)

    def numpy_run(self):
        self.numpy_fill(self.output.nbytes)
class OffsetPooling(Pooling):
    """Forward pooling which also records where each output came from.

    Must be assigned before initialize():
        input

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: for every output element, the flat index into the
                      input of the element which was passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return

        offsets = self.input_offset
        if offsets:
            # An existing array may only differ in the first dimension.
            assert offsets.shape[1:] == self.output.shape[1:]
            needs_reset = offsets.shape[0] != self.output.shape[0]
        else:
            needs_reset = True
        if needs_reset:
            fresh = numpy.zeros(self.output.shape, dtype=numpy.int32)
            offsets.reset(fresh)

        offsets.initialize(self.device)

    def set_args(self, *args):
        # The three standard vectors always come first, extras follow.
        base = super(OffsetPooling, self)
        base.set_args(self.input, self.output, self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        """Picks one element of the cut and stores its flat input index.

        Returns:
            the value of the picked input element.
        """
        batch, y1, x1, ch, out_y, out_x = coords
        out_pos = (batch, out_y, out_x, ch)

        flat_out = numpy.ravel_multi_index(out_pos, self.output.shape)
        picked = self.numpy_run_cut_offset(cut, flat_out)
        dy, dx = numpy.unravel_index(picked, cut.shape)

        in_pos = (batch, y1 + dy, x1 + dx, ch)
        flat_in = numpy.ravel_multi_index(in_pos, self.input.shape)
        value = numpy.ravel(self.input.mem)[flat_in]
        self.input_offset.mem[batch, out_y, out_x, ch] = flat_in
        return value
class All2AllSoftmax(All2All):
    """Fully connected layer followed by softmax normalization.

    Must be assigned before initialize():

    Updates after run():
        max_idx

    Creates within initialize():
        max_idx

    Attributes:
        krn_sm_: kernel which applies the softmax ("apply_exp").
        max_idx: index of the maximum element for each sample.
        reduce_size: REDUCE_SIZE define for the kernel, capped by the
                     output sample size in initialize().
    """

    __id__ = "420219fc-3e1a-45b1-87f8-aaa0c1540de4"
    MAPPING = {"softmax"}

    def __init__(self, workflow, **kwargs):
        super(All2AllSoftmax, self).__init__(workflow, **kwargs)
        self.max_idx = Array()
        self.reduce_size = 256

    def init_unpickled(self):
        super(All2AllSoftmax, self).init_unpickled()
        self.krn_sm_ = None
        self._force_gpu_apply_exp = False

    def initialize(self, device, **kwargs):
        sample_size = int(numpy.prod(self.output_sample_shape))
        if sample_size < self.reduce_size:
            self.reduce_size = sample_size
        softmax_defines = {"REDUCE_SIZE": self.reduce_size}
        self.sources_["all2all/softmax"] = softmax_defines

        retval = super(All2AllSoftmax, self).initialize(
            device=device, **kwargs)
        if retval:
            # The parent is not ready yet; pass its result through.
            return retval

        out_mem = self.output.mem
        if out_mem.size // out_mem.shape[0] <= 1:
            raise error.BadFormatError(
                "Output sample size should be greater than 1 for SoftMax.")

        if not self.max_idx:
            n_samples = self.output.shape[0]
            self.max_idx.reset(numpy.zeros(n_samples, dtype=numpy.int32))
        self.max_idx.initialize(self.device)
        return retval

    def numpy_apply_exp(self):
        """Applies softmax in place to every sample and stores argmax."""
        self.output.map_write()
        self.max_idx.map_invalidate()
        flat = self.output.mem
        flat = reshape(flat, (flat.shape[0], flat.size // flat.shape[0]))
        for row_number, row in enumerate(flat):
            peak = row.argmax()
            self.max_idx[row_number] = peak
            # Shift by the maximum before exponentiating.
            row -= row[peak]
            numpy.exp(row, out=row)
            row /= row.sum()

    def ocl_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        n_samples = self.output.shape[0]
        self.execute_kernel((n_samples * self.reduce_size,),
                            (self.reduce_size,), self.krn_sm_)

    def cuda_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        n_samples = self.output.shape[0]
        self.execute_kernel((n_samples, 1, 1),
                            (self.reduce_size, 1, 1), self.krn_sm_)

    def numpy_run(self):
        """Forward propagation from batch on CPU only.
        """
        super(All2AllSoftmax, self).numpy_run()
        if self._force_gpu_apply_exp:
            # A GPU run method is in charge of applying the softmax.
            return
        self.numpy_apply_exp()

    def ocl_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).ocl_run()
        self.ocl_apply_exp()

    def cuda_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).cuda_run()
        self.cuda_apply_exp()

    def ocl_init(self):
        super(All2AllSoftmax, self).ocl_init()
        kernel = self.get_kernel("apply_exp")
        self.krn_sm_ = kernel
        kernel.set_args(self.output.devmem, self.max_idx.devmem)

    def cuda_init(self):
        super(All2AllSoftmax, self).cuda_init()
        kernel = self.get_kernel("apply_exp")
        self.krn_sm_ = kernel
        kernel.set_args(self.output.devmem, self.max_idx.devmem)
class GDMultiplier(AcceleratedUnit):
    """Backward pass for the pointwise product of x and y.

    Must be assigned before initialize():
        x
        y
        err_output

    Creates within initialize():
        err_x
        err_y

    Updates after run():
        err_x: err_output * y
        err_y: err_output * x
    """

    def __init__(self, workflow, **kwargs):
        super(GDMultiplier, self).__init__(workflow, **kwargs)
        self.err_x = Array()
        self.err_y = Array()
        self.demand("x", "y", "err_output")

    def initialize(self, device, **kwargs):
        super(GDMultiplier, self).initialize(device, **kwargs)

        # Each error array mirrors the shape of its own operand.
        for err, operand in ((self.err_x, self.x), (self.err_y, self.y)):
            if err:
                assert err.shape[1:] == operand.shape[1:]
            if not err or err.shape[0] != operand.shape[0]:
                err.reset(numpy.zeros_like(operand.mem))

        self.init_vectors(self.err_x, self.err_y,
                          self.x, self.y, self.err_output)

    def init_unpickled(self):
        super(GDMultiplier, self).init_unpickled()
        self.sources_["multiplier"] = {}

    def _gpu_init(self):
        n_items = self.err_output.size
        cache_name = "%s_%d" % (self.__class__.__name__, n_items)
        self.build_program({"OUTPUT_SIZE": n_items}, cache_name,
                           dtype=self.x.dtype)
        self.assign_kernel("multiply_backward")
        self.set_args(self.x, self.y, self.err_output,
                      self.err_x, self.err_y)

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        n_blocks = int(numpy.ceil(self.err_output.size / block_size))
        self._global_size = (n_blocks, 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.err_output.size, 1, 1)
        self._local_size = None

    def numpy_init(self):
        pass  # nothing to init

    def _gpu_run(self):
        self.unmap_vectors(self.x, self.y, self.err_output,
                           self.err_x, self.err_y)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        self._gpu_run()

    def ocl_run(self):
        self._gpu_run()

    def numpy_run(self):
        for source in (self.x, self.y, self.err_output):
            source.map_read()
        for target in (self.err_x, self.err_y):
            target.map_invalidate()

        upstream = self.err_output.mem
        numpy.multiply(upstream, self.y.mem, out=self.err_x.mem)
        numpy.multiply(upstream, self.x.mem, out=self.err_y.mem)