class MemCpy(AcceleratedUnit):
    """Unit which copies ``input`` into ``output``.

    Must be assigned before initialize():
        input

    Creates within initialize():
        output (allocated with the shape and dtype of ``input`` when it is
        missing or its element count differs from the input's)

    Updates after run():
        output
    """

    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Allocate ``output`` if needed and initialize both arrays."""
        super(MemCpy, self).initialize(device, **kwargs)
        target = self.output.mem
        # Only the element count is compared, not the shape.
        stale = target is None or target.size != self.input.mem.size
        if stale:
            source = self.input.mem
            self.output.reset()
            self.output.mem = numpy.zeros(source.shape, dtype=source.dtype)
        for vector in (self.input, self.output):
            vector.initialize(self.device)

    def ocl_init(self):
        """Nothing to prepare for the OpenCL backend."""

    def cuda_init(self):
        """Nothing to prepare for the CUDA backend."""

    def _gpu_run(self):
        # Both arrays are unmapped before any device-side copy is issued.
        self.input.unmap()
        self.output.unmap()

    def numpy_run(self):
        """Copy on the host with numpy."""
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)

    def ocl_run(self):
        """Copy device buffer to device buffer through the OpenCL queue."""
        self._gpu_run()
        src, dst = self.input.devmem, self.output.devmem
        self.device.queue_.copy_buffer(src, dst, 0, 0, self.input.nbytes)

    def cuda_run(self):
        """Copy device buffer to device buffer with CUDA."""
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)
class MyOCL(IOpenCLUnit):
    """Usage sample: two Arrays passed to an OpenCL kernel.

    NOTE(review): this reads like an illustrative snippet rather than
    production code -- ``kibi``, ``zeros``, ``float32``, ``global_size``,
    ``local_size`` and ``self.krn_`` are not defined in the visible source.
    """

    def __init__(self):
        # ``a`` receives its host memory through the constructor, ``b``
        # through direct assignment of ``mem``.
        self.a = Array(zeros([kibi >> 1, kibi], dtype=float32))
        self.b = Array()
        self.b.mem = zeros([kibi, kibi], dtype=float32)

    def initialize(self, device, **kwargs):
        # NOTE(review): ``self`` (not ``device``) is passed to
        # Array.initialize here -- confirm this is intended.
        self.a.initialize(self)
        self.b.initialize(self)

        def ocl_init():
            # Bind the device buffers as kernel arguments 0 and 1.
            self.krn_.set_arg(0, self.a.devmem)
            self.krn_.set_arg(1, self.b.devmem)

        ocl_init()

    def __call__(self, *args, **kwargs):
        # Arrays are unmapped before the kernel is executed.
        self.a.unmap()
        self.b.unmap()
        self.execute_kernel(global_size, local_size, self.krn_)
        # The result of the read mapping is bound to a local and not used
        # further in this snippet.
        a = self.a.ocl_map_read()
class All2AllSoftmax(All2All):
    """All2All with linear activation and softmax normalization.

    Must be assigned before initialize():

    Updates after run():
        max_idx

    Creates within initialize():
        max_idx

    Attributes:
        krn_sm_: kernel for softmax activation calculation.
        max_idx: indexes of element with maximum value for each sample.
    """

    __id__ = "420219fc-3e1a-45b1-87f8-aaa0c1540de4"
    MAPPING = {"softmax"}

    def __init__(self, workflow, **kwargs):
        super(All2AllSoftmax, self).__init__(workflow, **kwargs)
        self.max_idx = Array()
        self.reduce_size = 256

    def init_unpickled(self):
        super(All2AllSoftmax, self).init_unpickled()
        self.krn_sm_ = None
        self._force_gpu_apply_exp = False

    def initialize(self, device, **kwargs):
        """Register kernel defines, run the parent initialize, then create
        ``max_idx``.

        Returns whatever the parent ``initialize`` returned; a truthy value
        is passed through immediately.

        Raises:
            error.BadFormatError: if a single output sample has at most
                one element.
        """
        sample_size = int(numpy.prod(self.output_sample_shape))
        self.reduce_size = min(self.reduce_size, sample_size)
        self.sources_["all2all/softmax"] = {"REDUCE_SIZE": self.reduce_size}

        retval = super(All2AllSoftmax, self).initialize(
            device=device, **kwargs)
        if retval:
            return retval

        out_mem = self.output.mem
        if out_mem.size // out_mem.shape[0] <= 1:
            raise error.BadFormatError(
                "Output sample size should be greater than 1 for SoftMax.")

        if not self.max_idx:
            indexes = numpy.zeros(self.output.shape[0], dtype=numpy.int32)
            self.max_idx.reset(indexes)
        self.max_idx.initialize(self.device)
        return retval

    def numpy_apply_exp(self):
        """Apply softmax in place to every output sample on the host and
        record the position of each sample's maximum in ``max_idx``.
        """
        self.output.map_write()
        self.max_idx.map_invalidate()
        flat = self.output.mem
        n_samples = flat.shape[0]
        flat = reshape(flat, (n_samples, flat.size // n_samples))
        for pos, row in enumerate(flat):
            peak = row.argmax()
            self.max_idx[pos] = peak
            # The maximum is subtracted before exponentiation.
            row -= row[peak]
            numpy.exp(row, row)
            row /= row.sum()

    def ocl_apply_exp(self):
        """Run the softmax kernel with OpenCL launch geometry."""
        self.unmap_vectors(self.output, self.max_idx)
        n_samples = self.output.shape[0]
        self.execute_kernel((n_samples * self.reduce_size,),
                            (self.reduce_size,),
                            self.krn_sm_)

    def cuda_apply_exp(self):
        """Run the softmax kernel with CUDA launch geometry."""
        self.unmap_vectors(self.output, self.max_idx)
        grid = (self.output.shape[0], 1, 1)
        block = (self.reduce_size, 1, 1)
        self.execute_kernel(grid, block, self.krn_sm_)

    def numpy_run(self):
        """Forward propagation from batch on CPU only.
        """
        super(All2AllSoftmax, self).numpy_run()
        if self._force_gpu_apply_exp:
            # A GPU run method has set the flag; the host softmax is skipped.
            return
        self.numpy_apply_exp()

    def ocl_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).ocl_run()
        self.ocl_apply_exp()

    def cuda_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).cuda_run()
        self.cuda_apply_exp()

    def ocl_init(self):
        super(All2AllSoftmax, self).ocl_init()
        kernel = self.krn_sm_ = self.get_kernel("apply_exp")
        kernel.set_args(self.output.devmem, self.max_idx.devmem)

    def cuda_init(self):
        super(All2AllSoftmax, self).cuda_init()
        kernel = self.krn_sm_ = self.get_kernel("apply_exp")
        kernel.set_args(self.output.devmem, self.max_idx.devmem)
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Create (or validate) ``input_offset`` to match ``output``."""
        super(OffsetPooling, self).initialize(device=device, **kwargs)
        if self._no_output:
            return

        offsets = self.input_offset
        if offsets:
            assert offsets.shape[1:] == self.output.shape[1:]
        # Reallocate unless an existing array already has the right
        # first dimension.
        if not (offsets and offsets.shape[0] == self.output.shape[0]):
            offsets.reset(numpy.zeros(self.output.shape, dtype=numpy.int32))
        offsets.initialize(self.device)

    def set_args(self, *args):
        super(OffsetPooling, self).set_args(
            self.input, self.output, self.input_offset, *args)

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run_cut(self, cut, coords):
        """Select one element of ``cut`` and remember where it came from.

        The position inside ``cut`` is chosen by ``numpy_run_cut_offset``;
        its flat index in ``input`` is stored into ``input_offset`` and the
        corresponding input value is returned.
        """
        batch, y1, x1, ch, out_y, out_x = coords
        out_flat = numpy.ravel_multi_index(
            (batch, out_y, out_x, ch), self.output.shape)
        picked = self.numpy_run_cut_offset(cut, out_flat)
        dy, dx = numpy.unravel_index(picked, cut.shape)
        in_flat = numpy.ravel_multi_index(
            (batch, y1 + dy, x1 + dx, ch), self.input.shape)
        value = numpy.ravel(self.input.mem)[in_flat]
        self.input_offset.mem[batch, out_y, out_x, ch] = in_flat
        return value
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners table
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocate ``output`` and, as needed, the distance matrix and the
        ``total`` winners table.
        """
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            # Distances are only needed when winners are computed here.
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Initialize device arrays, build the program and bind kernels."""
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come from argmins and no total table is kept:
            # no kernel is required at all.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        # float() keeps the ceil() correct regardless of whether "/" is
        # true or floor division for integers in this module.
        copy_chunk_size = int(numpy.ceil(
            float(batch_size) / self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = int(numpy.ceil(
            float(self.neurons_number) / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = [int(numpy.ceil(
                float(batch_size) / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Argument 1 (the minibatch offset) is set on every run.
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        """Compute winners on the device, or copy them from ``argmins``."""
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Winners were already computed elsewhere: copy them on the host.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        """Compute (or copy) winners on the host and update ``total``."""
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class KohonenTrainer(KohonenBase, AcceleratedUnit):
    """KohonenForward train pass.

    Must be assigned before initialize():
        input
        shape

    Creates within initialize():
        weights
        winners
        argmins
        _distances
        _coords

    Updates after run():
        weights

    Attributes:
        weights: weights of the current layer.
        input: input of the current layer as batch of 1D samples.
        _krn_distances_: computes distances between input and neuron weights.
        _krn_argmin_: finds indexes of minimal computed distances.
        _krn_gravity_: computes gravity to the winner neuron.
        _krn_apply_gradient_: applies gradient to weights.
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenTrainer, self).__init__(workflow, **kwargs)
        self._distances = Array()
        self.argmins = Array()
        self._coords = Array()
        self.weights = Array()
        self.winners = Array()
        self.weights_filling = kwargs.get("weights_filling", "uniform")
        self.weights_stddev = kwargs.get("weights_stddev", None)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.time = 0
        self._sigma = 0
        self.gradient_decay = kwargs.get("gradient_decay",
                                         lambda t: 0.1 / (1.0 + t * 0.05))
        self.radius_decay = kwargs.get("radius_decay",
                                       lambda t: 1.0 / (1.0 + t * 0.05))
        self.demand("input", "shape")
        self._shape = kwargs.get("shape")

    def init_unpickled(self):
        super(KohonenTrainer, self).init_unpickled()
        self.sources_["kohonen"] = {"TRAIN": 1}
        self._krn_distances_ = None
        self._krn_argmin_ = None
        self._krn_gravity_ = None
        self._krn_compute_gradients_ = None
        # Historic attribute name, kept so existing readers do not break.
        self._krn_apply_gradients_ = None
        # The name ocl_init() / ocl_run() actually assign and use.
        self._krn_apply_gradient_ = None

    @property
    def gravity_radius(self):
        return self.radius_decay(self.time) * self._sigma

    @property
    def gradient_multiplier(self):
        return self.gradient_decay(self.time)

    @property
    def shape(self):
        return self._shape

    @shape.setter
    def shape(self, value):
        self._shape = value

    def initialize(self, device, **kwargs):
        """Create weights, winners, distances, argmins and neuron
        coordinates.
        """
        super(KohonenTrainer, self).initialize(device=device, **kwargs)

        self._neurons_number = self.shape[0] * self.shape[1]
        self._sample_length = self.input.mem.size // self.input.mem.shape[0]

        # Initialize weights
        if self.weights_stddev is None:
            # Get weights magnitude and cap it to 0.05
            self.weights_stddev = min(self._get_weights_magnitude(), 0.05)
        weights_size = (self._sample_length * self._neurons_number)
        if not self.weights:
            self.weights.reset(
                numpy.zeros(weights_size, dtype=self.input.mem.dtype))
            filling = {
                "uniform": lambda rand: rand.fill(
                    self.weights.mem, -self.weights_stddev,
                    self.weights_stddev),
                "gaussian": lambda rand: rand.fill_normal_real(
                    self.weights.mem, 0, self.weights_stddev)
            }
            filling[self.weights_filling](prng.get())
            self.weights.mem = self.weights.mem.reshape(
                (self._neurons_number, self._sample_length))
        else:
            assert self.weights.shape == (self._neurons_number,
                                          self._sample_length)
        if self.weights_transposed:
            # Reshape weights as a matrix:
            wtrncopy = self.weights.mem.transpose().copy()
            self.weights.mem.shape = wtrncopy.shape
            self.weights.mem[:] = wtrncopy[:]
        self._sample_length = \
            self.weights.mem.shape[0 if self.weights_transposed else 1]

        # Initialize winners
        self.winners.reset(numpy.zeros(self._neurons_number, numpy.int32))

        # Initialize distances
        batch_size = self.input.mem.shape[0]
        self._distances.reset(
            numpy.zeros([batch_size, self._neurons_number],
                        dtype=self.weights.mem.dtype))
        self.argmins.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        self._coords.reset(
            numpy.zeros([self._neurons_number, 2],
                        dtype=self.weights.mem.dtype))

        # Lay the neurons out on a grid inside [-1, 1] x [-1, 1]; odd rows
        # are shifted by half a step.
        sz = self._neurons_number
        rows = int(numpy.round(numpy.sqrt(sz)))
        cols = sz // rows
        if sz % rows != 0:
            cols += 1

        x_min = -1.0
        x_max = 1.0
        y_min = -1.0
        y_max = 1.0
        x_step = (x_max - x_min) / (cols - 1) if cols > 1 else 0
        y = y_min
        y_step = (y_max - y_min) / (rows - 1) if rows > 1 else 0

        offs = 0
        mem = self._coords.mem
        for _row in range(rows):
            x = x_min + (x_step * 0.5 if _row & 1 else 0)
            for _col in range(cols):
                if offs >= sz:
                    # rows * cols may exceed the number of neurons (e.g.
                    # for 3, 5 or 7 neurons); never write past the array.
                    break
                mem[offs, 0] = x
                mem[offs, 1] = y
                offs += 1
                x += x_step
            y += y_step

        self._sigma = (self._coords.mem.ravel().max() -
                       self._coords.mem.ravel().min()) * 1.42

    def ocl_init(self):
        """Initialize device arrays, build the program and bind kernels."""
        self.input.initialize(self.device)
        self.weights.initialize(self.device)
        self.winners.initialize(self.device)
        self.argmins.initialize(self.device)
        self._distances.initialize(self.device)
        self._coords.initialize(self.device)

        batch_size = self.input.mem.shape[0]
        chunk_size = self._neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self._neurons_number // 2 + 1
        self.argmin_group_size = int(
            numpy.ceil(float(self._neurons_number) / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self._sample_length,
            'NEURONS_NUMBER': self._neurons_number,
            'CHUNK_SIZE': chunk_size,
            'GRADIENT_CHUNK_SIZE': self.device.max_group_size,
            'coord_type': "%s%d" %
            (opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
             self._coords.mem.shape[-1])
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self._sample_length,
                            self._neurons_number),
                           dtype=self.weights.mem.dtype)

        self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem,
                                   self.argmins.devmem,
                                   self.winners.devmem)

        # Argument 2 of the next two kernels is a scalar set on every run.
        self._krn_gravity_ = self.get_kernel("compute_gravity")
        self._krn_gravity_.set_args(self.argmins.devmem, self._coords.devmem)
        self._krn_gravity_.set_arg(3, self._distances.devmem)

        self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
        self._krn_apply_gradient_.set_args(self.input.devmem,
                                           self._distances.devmem)
        self._krn_apply_gradient_.set_arg(3, self.weights.devmem)

        self._gs_distance = [
            roundup(self._neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def iteration(fn):
        """Decorator: increment ``self.time`` after each call of ``fn``."""
        def wrapped(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            self.time += 1
            return result

        name = getattr(fn, '__name__', getattr(fn, 'func', wrapped).__name__)
        wrapped.__name__ = name + '_iteration'
        return wrapped

    @iteration
    def numpy_run(self):
        """One training pass over the batch on the host."""
        batch_size = self.input.mem.shape[0]
        neurons_number = self._neurons_number
        dists = numpy.empty(neurons_number)
        gradients = numpy.zeros(self.weights.mem.shape)
        sigma = self.gravity_radius
        gmult = self.gradient_multiplier
        self.input.map_read()
        # NOTE(review): weights and winners are read below (and updated in
        # place) although they are mapped with map_invalidate(); other
        # read-modify-write paths in this file use map_write() -- confirm.
        self.weights.map_invalidate()
        self.winners.map_invalidate()

        coords = self._coords.mem
        for sindex in range(batch_size):
            dist = self.weights.mem - self.input[sindex]
            winner = numpy.argmin(self.numpy_linalg_norm(dist))
            self.winners[winner] += 1
            # Squared grid distance from every neuron to the winner.
            delta = coords - coords[winner]
            dists[:] = numpy.sum(delta * delta, axis=1)
            gravity = numpy.exp(dists / (-2 * sigma * sigma))
            gradients += gravity.reshape((1, neurons_number)).transpose() * \
                (self.input[sindex] - self.weights.mem) * gmult
        self.weights.mem += gradients

    @iteration
    def ocl_run(self):
        """One training pass over the batch on the device."""
        self.unmap_vectors(self.input, self.weights, self.winners,
                           self._distances, self.argmins, self._coords)

        batch_size = self.input.mem.shape[0]
        self.execute_kernel(self._gs_distance, self._ls_distance,
                            self._krn_distances_)
        self.execute_kernel([self.argmin_group_size],
                            [self.argmin_group_size],
                            self._krn_argmin_)
        self.ocl_consts_[0] = self.gravity_radius
        self._krn_gravity_.set_arg(2, self.ocl_consts_[0:1])
        self.execute_kernel([batch_size, self._neurons_number], None,
                            self._krn_gravity_)
        self.ocl_consts_[0] = self.gradient_multiplier
        self._krn_apply_gradient_.set_arg(2, self.ocl_consts_[0:1])
        # float() keeps the ceil() correct regardless of whether "/" is
        # true or floor division here; ocl_init() guards the same way.
        self.execute_kernel(
            [int(numpy.ceil(float(self._sample_length) /
                            self.device.max_group_size)),
             self.device.max_group_size],
            None, self._krn_apply_gradient_)

    iteration = staticmethod(iteration)

    def _get_weights_magnitude(self):
        """
        Returns: weights magnitude for initial random distribution,
                 such that activation function will be near maximum
                 if all input values are at their supposed max value.

        Doesn't matter for classic Kohonen networks,
        get values as in All2AllTanh.
        """
        d = self.input.max_supposed * self._sample_length
        if self.input.mem.dtype in (numpy.complex64, numpy.complex128):
            return 1.0 / d
        return 9.0 / d
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: effective minibatch size (if None or missing, the first
                    dimension of err_output is used, see current_batch_size).
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
                            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        """Return ``batch_size`` as int, or the first dimension of
        ``err_output`` when ``batch_size`` is missing or None.
        """
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        """Allocate (or validate) all gradient arrays and initialize the
        vectors on the device.

        Hyperparameters passed in ``kwargs`` override the stored ones.
        """
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed
                                  else self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        if (self.include_bias and self.bias and
            (not self.gradient_bias or
             self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
            (not self.accumulated_gradient_bias or
             self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and
                (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # This class never assigns reduce_size itself, so fall back to
            # the class-level REDUCE_SIZE when nothing else has set it.
            current_reduce = getattr(self, "reduce_size", self.REDUCE_SIZE)
            self.reduce_size = roundup(min(current_reduce, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights, self.gradient_bias,
            self.accumulated_gradient_weights, self.accumulated_gradient_bias,
            self.gradient_weights_with_moment, self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        """Set the scalar kernel arguments and run the weights kernel."""
        self.unmap_vectors(
            self.input, self.err_output, self.weights,
            self.gradient_weights, self.accumulated_gradient_weights,
            self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(
                self._global_size_ortho, self._local_size_ortho,
                self.krn_compute_col_sums_)
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Scalars live in slots 4..11 of _weights_const and are passed as
        # one-element slices, after skipping the first four arguments.
        self._weights_const[4:12] = (
            self.learning_rate, self.weights_decay, self.l1_vs_l2,
            self.gradient_moment, self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4),
            self._weights_const[4:5], self._weights_const[5:6],
            self._weights_const[6:7], self._weights_const[7:8],
            self._weights_const[8:9], self._weights_const[9:10],
            self._weights_const[10:11], self._weights_const[11:12])

        self.execute_kernel(
            self._global_size_weights, self._local_size_weights,
            self.krn_weights_)

    def gpu_bias_update(self):
        """Set the scalar kernel arguments and run the bias kernel."""
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output, self.bias, self.gradient_bias,
            self.accumulated_gradient_bias, self.gradient_bias_with_moment)

        # Same scheme as for weights, but in slots 5..12 of _bias_const.
        self._bias_const[5:13] = (
            self.learning_rate_bias, self.weights_decay_bias,
            self.l1_vs_l2_bias, self.gradient_moment_bias,
            self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(
            self.device.skip(5),
            self._bias_const[5:6], self._bias_const[6:7],
            self._bias_const[7:8], self._bias_const[8:9],
            self._bias_const[9:10], self._bias_const[10:11],
            self._bias_const[11:12], self._bias_const[12:13])

        self.execute_kernel(
            self._global_size_bias, self._local_size_bias, self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(
            self._global_size_err_output, self._local_size_err_output,
            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        # weights and bias are None until something is linked to them;
        # the table below already copes with missing arrays.
        for vec in (self.weights, self.bias,
                    self.gradient_bias, self.gradient_weights):
            if vec is not None:
                vec.map_read()

        weights = None if self.weights is None else self.weights.mem
        bias = None if self.bias is None else self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        """Return the hyperparameters, in the order read back by
        apply_data_from_master().
        """
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        """Zero the host memory of ``vector``; falsy vectors are skipped."""
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        """Store the received hyperparameters and zero all gradient
        arrays.
        """
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        """Return (weights gradient, bias gradient) with moments, or None
        when the gradient did not change since the last call.
        """
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        """Fold the received gradients into the moments and apply them."""
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Update the accumulator and blend it into ``gradient`` in place.

        acc = acc_alpha * gradient + acc_beta * acc
        gradient = gd_beta * gradient + gd_alpha * acc

        Nothing happens when accumulation is off or the accumulator is
        falsy. ``gradient`` is returned in every case.
        """
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient if self.acc_beta else 0))

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        """Return a scaled copy of ``gradient`` with regularization added.

        The input ``gradient`` is not modified. The L1/L2 term and the
        optional orthogonalization term are added first, then the sum is
        multiplied by ``lr``.
        """
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        """Mark the gradient as changed, run the unit, then clear
        ``ocl_set_const_args``.
        """
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class Binarization(AcceleratedUnit, EmptyDeviceMethodsMixin):
    """Input binarization.

    Input and output are 2D arrays of the same size. Each element A(i, j)
    of the input is a float number between 0 and 1. Each element B(i, j)
    of the output equals 1 with probability A(i, j) and 0 with
    probability 1 - A(i, j).

    Must be assigned before initialize():
        * input

    Updates after run():
        * output

    Creates within initialize():
        * output

    Attributes:
        input: input as batch of samples.
        output: output as batch of samples.
        rand: random generator taken from the "rand" keyword argument
            (prng.get() when it is not supplied).
    """

    def __init__(self, workflow, **kwargs):
        super(Binarization, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.rand = kwargs.get("rand", prng.get())
        self.demand("input", "batch_size")

    def run(self):
        """Batch binarization on CPU only.

        The whole input is copied to the output first; then the first
        batch_size rows are overwritten with their binarized values.
        """
        self.output.map_invalidate()
        self.input.map_read()
        self.output.mem[:] = self.input.mem[:]
        self.output.mem[:self.batch_size, :] = self.matlab_binornd(
            1, self.input.mem[:self.batch_size, :])

    def initialize(self, device, **kwargs):
        """Allocates the output with the same shape and dtype as the input
        when it is missing or its size differs.
        """
        super(Binarization, self).initialize(device=device, **kwargs)
        if not self.output or self.output.size != self.input.size:
            self.output.reset()
            self.output.mem = numpy.zeros_like(self.input.mem)
        self.output.initialize(self.device)

    def matlab_binornd(self, n, p_in):
        """Analogue of binornd in Matlab, but n must be scalar.

        Generates an array of random variables where each element is drawn
        from the binomial distribution with n trials and the success
        probability taken from the matching element of p_in.

        Args:
            n (int): number of trials.
            p_in (1 or 2 dimension numpy.array): success probabilities.

        Returns:
            res (numpy.array of the same shape as p_in): number of
                successes for every element.

        Raises:
            ValueError: if p_in is neither 1- nor 2-dimensional.
        """
        p = numpy.asarray(p_in)
        if p.ndim == 2:
            nrow, ncol = p.shape
            # Column-major flattening keeps the mapping between random
            # draws and matrix elements used by the original code.
            p_flat = numpy.transpose(p).flatten()
            draws = self.rand.rand(n, p_flat.shape[0])
            # (n, dim) < (dim,) broadcasts over the trials axis, so no
            # explicit tiling of the probabilities is required.
            successes = numpy.sum(draws < p_flat, axis=0)
            return numpy.transpose(successes.reshape(ncol, nrow))
        if p.ndim == 1:
            # One independent draw per trial AND per element: the draw
            # matrix must be sized by the vector length, not by n.
            draws = self.rand.rand(n, p.shape[0])
            return numpy.sum(draws < p, axis=0)
        raise ValueError("shape of input Binarization class "
                         "must be 1 or 2 dimensions")
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: effective minibatch size (if it is absent or None,
            the first dimension of err_output is used instead).
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set
            to False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs.setdefault("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")

        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after
        # computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        """Returns int(batch_size) if that attribute exists and is not
        None, otherwise the first dimension of err_output.
        """
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    @staticmethod
    def _reset_or_check(vector, template):
        """Allocates vector as zeros shaped like template when it is
        empty, otherwise asserts that the sizes agree.
        """
        if not vector:
            vector.reset(numpy.zeros_like(template.mem))
        else:
            assert vector.size == template.size

    def initialize(self, device, **kwargs):
        """Refreshes hyperparameters from kwargs and allocates the
        gradient buffers that the current configuration requires.
        """
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (
                tuple(reversed(self.weights.shape))
                if self.weights_transposed else self.weights.shape)
        else:
            self.weights_shape = None

        # Hyperparameters may be overridden at initialize() time.
        for name in ("learning_rate", "weights_decay", "gradient_moment",
                     "learning_rate_bias", "weights_decay_bias",
                     "gradient_moment_bias"):
            setattr(self, name, kwargs.get(name, getattr(self, name)))

        if self.weights:
            self._reset_or_check(self.gradient_weights, self.weights)
            if self.accumulate_gradient:
                self._reset_or_check(self.accumulated_gradient_weights,
                                     self.weights)
            if self.gradient_moment or not self.is_standalone:
                self._reset_or_check(self.gradient_weights_with_moment,
                                     self.weights)

        if self.include_bias and self.bias:
            if (not self.gradient_bias or
                    self.gradient_bias.size != self.bias.size):
                self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))
            if self.accumulate_gradient and (
                    not self.accumulated_gradient_bias or
                    self.accumulated_gradient_bias.size != self.bias.size):
                self.accumulated_gradient_bias.reset(
                    numpy.zeros_like(self.bias.mem))
            if self.gradient_moment_bias or not self.is_standalone:
                self._reset_or_check(self.gradient_bias_with_moment,
                                     self.bias)

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # This class never assigns reduce_size itself; fall back to
            # the class-level default when no subclass has set it.
            current_reduce = getattr(self, "reduce_size", self.REDUCE_SIZE)
            self.reduce_size = roundup(min(current_reduce, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights, self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        """Sets the scalar kernel arguments and runs the weights update
        kernel (preceded by the column sums kernel when factor_ortho is
        set).
        """
        self.unmap_vectors(
            self.input, self.err_output, self.weights,
            self.gradient_weights, self.accumulated_gradient_weights,
            self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(
                self._global_size_ortho, self._local_size_ortho,
                self.krn_compute_col_sums_)
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        self._weights_const[4:12] = (
            self.learning_rate, self.weights_decay, self.l1_vs_l2,
            self.gradient_moment, self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        # Every scalar is passed as a one-element slice of the constants
        # array, occupying kernel argument slots 4..11.
        self.krn_weights_.set_args(
            self.device.skip(4),
            *[self._weights_const[i:i + 1] for i in range(4, 12)])
        self.execute_kernel(
            self._global_size_weights, self._local_size_weights,
            self.krn_weights_)

    def gpu_bias_update(self):
        """Sets the scalar kernel arguments and runs the bias update
        kernel; does nothing when include_bias is False.
        """
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output, self.bias, self.gradient_bias,
            self.accumulated_gradient_bias,
            self.gradient_bias_with_moment)

        self._bias_const[5:13] = (
            self.learning_rate_bias, self.weights_decay_bias,
            self.l1_vs_l2_bias, self.gradient_moment_bias,
            self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        # One-element slices for kernel argument slots 5..12.
        self.krn_bias_.set_args(
            self.device.skip(5),
            *[self._bias_const[i:i + 1] for i in range(5, 13)])
        self.execute_kernel(
            self._global_size_bias, self._local_size_bias,
            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(
            self._global_size_err_output, self._local_size_err_output,
            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.

        Does nothing in this base class.
        """
        pass

    def print_debug_data(self):
        """Show weights statistics.

        Logs mean, standard deviation, minimum and maximum of the
        weights, bias and their gradients at DEBUG level.
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()

        rows = (("Weight", self.weights.mem),
                ("Bias", self.bias.mem),
                ("Grad Weight", self.gradient_weights.mem),
                ("Grad Bias", self.gradient_bias.mem))

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for w_name, w_array in rows:
            # Missing or empty arrays are reported as a row of None.
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        """Returns the hyperparameters tuple in the order expected by
        apply_data_from_master().
        """
        return (self.learning_rate, self.weights_decay,
                self.gradient_moment, self.learning_rate_bias,
                self.weights_decay_bias, self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        """Zeroes the contents of vector; empty vectors are skipped."""
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        """Takes the hyperparameters from data and zeroes all gradient
        buffers.
        """
        (self.learning_rate, self.weights_decay, self.gradient_moment,
         self.learning_rate_bias, self.weights_decay_bias,
         self.gradient_moment_bias) = (
            data[0], data[1], data[2], data[3], data[4], data[5])
        for vector in (self.gradient_weights_with_moment,
                       self.gradient_bias_with_moment,
                       self.gradient_weights,
                       self.gradient_bias,
                       self.accumulated_gradient_weights,
                       self.accumulated_gradient_bias):
            self.fill_zeros(vector)

    def generate_data_for_master(self):
        """Returns the (weights, bias) gradients with moment, or None if
        the gradient has not changed since the previous call.
        """
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        """Applies the gradients received from a slave.

        For weights and for bias: the stored gradient with moment is
        scaled by the moment coefficient, the received gradient is added
        to it and the result is added to the parameters.
        """
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        # gradient_bias_with_moment is only allocated by initialize() when
        # include_bias is set, so the same condition is required here.
        if self.include_bias and self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Updates the accumulator and mixes it into gradient.

        When accumulation is enabled and accumulated_gradient is not
        empty:
            acc = acc_alpha * gradient + acc_beta * acc
            gradient = gd_beta * gradient + gd_alpha * acc
        Both arguments are modified in place.

        Returns:
            gradient (the same object that was passed in).
        """
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient
                 if self.acc_beta else 0))
            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        """Returns a regularized copy of gradient multiplied by lr.

        The L1/L2 term factor_l12 * ((1 - l1_vs_l2) * weight +
        0.5 * l1_vs_l2 * sign(weight)) is added first; when factor_ortho
        is set, each row additionally receives
        (col_sums - weight[i]) * factor_ortho / weight.shape[0].
        The gradient argument itself is not modified.
        """
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        """Marks the gradient as changed, runs the unit and clears
        ocl_set_const_args afterwards.
        """
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class All2AllSoftmax(All2All):
    """All2All with linear activation and softmax normalization.

    Must be assigned before initialize():

    Updates after run():
        max_idx

    Creates within initialize():
        max_idx

    Attributes:
        krn_sm_: kernel for softmax activation calculation.
        max_idx: indexes of element with maximum value for each sample.
    """
    __id__ = "420219fc-3e1a-45b1-87f8-aaa0c1540de4"
    MAPPING = {"softmax"}

    def __init__(self, workflow, **kwargs):
        super(All2AllSoftmax, self).__init__(workflow, **kwargs)
        self.max_idx = Array()
        self.reduce_size = 256

    def init_unpickled(self):
        super(All2AllSoftmax, self).init_unpickled()
        self.krn_sm_ = None
        self._force_gpu_apply_exp = False

    def initialize(self, device, **kwargs):
        """Caps reduce_size by the output sample size, registers the
        softmax kernel source and allocates max_idx (one int32 per
        sample).
        """
        sample_size = int(numpy.prod(self.output_sample_shape))
        self.reduce_size = min(self.reduce_size, sample_size)
        self.sources_["all2all/softmax"] = {
            "REDUCE_SIZE": self.reduce_size,
        }

        retval = super(All2AllSoftmax, self).initialize(
            device=device, **kwargs)
        if retval:
            # The parent reported a truthy result: pass it through.
            return retval

        per_sample = self.output.mem.size // self.output.mem.shape[0]
        if per_sample <= 1:
            raise error.BadFormatError(
                "Output sample size should be greater than 1 for SoftMax.")

        if not self.max_idx:
            n_samples = self.output.shape[0]
            self.max_idx.reset(numpy.zeros(n_samples, dtype=numpy.int32))
        self.max_idx.initialize(self.device)

        return retval

    def numpy_apply_exp(self):
        """Applies softmax to every output sample in place and stores the
        index of each sample's maximum in max_idx.
        """
        self.output.map_write()
        self.max_idx.map_invalidate()
        data = self.output.mem
        rows = reshape(data, (data.shape[0], data.size // data.shape[0]))
        for row_index, row in enumerate(rows):
            peak_at = row.argmax()
            self.max_idx[row_index] = peak_at
            # Shift by the maximum before exponentiation.
            peak = row[peak_at]
            row -= peak
            numpy.exp(row, row)
            total = row.sum()
            row /= total

    def ocl_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        work_items = self.output.shape[0] * self.reduce_size
        self.execute_kernel((work_items,), (self.reduce_size,),
                            self.krn_sm_)

    def cuda_apply_exp(self):
        self.unmap_vectors(self.output, self.max_idx)
        grid = (self.output.shape[0], 1, 1)
        block = (self.reduce_size, 1, 1)
        self.execute_kernel(grid, block, self.krn_sm_)

    def numpy_run(self):
        """Forward propagation from batch on CPU only.
        """
        super(All2AllSoftmax, self).numpy_run()
        if self._force_gpu_apply_exp:
            return
        self.numpy_apply_exp()

    def ocl_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).ocl_run()
        self.ocl_apply_exp()

    def cuda_run(self):
        """Forward propagation from batch on GPU.
        """
        self._force_gpu_apply_exp = True
        super(All2AllSoftmax, self).cuda_run()
        self.cuda_apply_exp()

    def _setup_softmax_kernel(self):
        """Fetches the "apply_exp" kernel and binds output and max_idx."""
        self.krn_sm_ = self.get_kernel("apply_exp")
        self.krn_sm_.set_args(self.output.devmem, self.max_idx.devmem)

    def ocl_init(self):
        super(All2AllSoftmax, self).ocl_init()
        self._setup_softmax_kernel()

    def cuda_init(self):
        super(All2AllSoftmax, self).cuda_init()
        self._setup_softmax_kernel()
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed
            through.
    """
    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Allocates input_offset (int32, shaped like the output) unless
        the unit has no output.
        """
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return

        if self.input_offset:
            assert self.input_offset.shape == self.output.shape
        else:
            offsets = numpy.zeros(self.output.shape, dtype=numpy.int32)
            self.input_offset.reset(offsets)

        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        """Prepends input, output and input_offset to the arguments."""
        leading = (self.input, self.output, self.input_offset)
        super(OffsetPooling, self).set_args(*(leading + args))

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        """Selects one element of cut, records its flat index within the
        input in input_offset and returns the input value at that index.

        The element is chosen by numpy_run_cut_offset(), which receives
        cut and the flat index of the output element.
        """
        batch, y1, x1, ch, out_y, out_x = coords
        out_position = (batch, out_y, out_x, ch)

        flat_out = numpy.ravel_multi_index(out_position, self.output.shape)
        chosen = self.numpy_run_cut_offset(cut, flat_out)

        # Translate the position inside the cut into input coordinates.
        dy, dx = numpy.unravel_index(chosen, cut.shape)
        flat_in = numpy.ravel_multi_index(
            (batch, y1 + dy, x1 + dx, ch), self.input.shape)

        picked = numpy.ravel(self.input.mem)[flat_in]
        self.input_offset.mem[out_position] = flat_in
        return picked
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
            table
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        if kwargs.get("total", False):
            self.total = Array()
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None
        else:
            self.total = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocates output, the distances buffer (only when argmins is
        not linked) and the total winners table (only when requested).
        """
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        n_samples = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(n_samples, dtype=numpy.int32))
        if self.argmins is None:
            distances = numpy.zeros([n_samples, self.neurons_number],
                                    dtype=self.weights.mem.dtype)
            self._distances.reset(distances)

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size,
                                         dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Initializes device buffers, builds the program and binds the
        kernels which the current configuration needs.
        """
        own_argmin = self.argmins is None
        with_total = self.total is not None
        n_samples = self.input.mem.shape[0]

        self.output.initialize(self.device)
        if not own_argmin and not with_total:
            # Winners come from the linked argmins and there is no total
            # table: no kernels are required.
            return
        if own_argmin:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        if with_total:
            self.total.initialize(self.device)

        max_group = self.device.max_group_size
        copy_chunk_size = int(numpy.ceil(n_samples / max_group))
        chunk_size = self.neurons_number // max_group
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = int(
            numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': n_samples,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1

        cache_name = "%s_%d_%d_%d" % (
            self.__class__.__name__, n_samples,
            self.sample_length, self.neurons_number)
        self.build_program(defines, cache_name,
                           dtype=self.weights.mem.dtype)

        if with_total:
            self._set_total_global_size_ = [
                int(numpy.ceil(n_samples / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if not own_argmin:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem,
                                      self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem,
                                   self.output.devmem, None)

        self._gs_distance = [roundup(self.neurons_number, block_size),
                             roundup(n_samples, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        """Computes the winners on the device, or copies them from the
        linked argmins; then updates the total table if it is enabled.
        """
        with_total = self.total is not None

        self.output.unmap()
        if with_total:
            self.total.unmap()

        if self.argmins is not None:
            # Winners are already known: copy them on the host side.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()
        else:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            group = [self.argmin_group_size]
            self.execute_kernel(group, [self.argmin_group_size],
                                self._krn_argmin_)

        if with_total:
            self._minibatch_offset_[0] = (
                self.minibatch_offset - self.minibatch_size)
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        """Computes the winners on the host, or copies them from the
        linked argmins; then updates the total table if it is enabled.
        """
        own_argmin = self.argmins is None
        with_total = self.total is not None

        self.output.map_invalidate()
        if own_argmin:
            self.input.map_read()
            self.weights.map_read()
        else:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem

        if with_total:
            self.total.map_invalidate()
            n_samples = self.minibatch_size
        else:
            n_samples = self.input.mem.shape[0]

        for sample_no in range(n_samples):
            if own_argmin:
                diff = self.weights.mem - self.input[sample_no]
                winner = numpy.argmin(self.numpy_linalg_norm(diff))
                self.output[sample_no] = winner
            else:
                winner = self.argmins[sample_no]
            if with_total:
                slot = (sample_no + self.minibatch_offset -
                        self.minibatch_size)
                self.total[slot] = winner
class KohonenTrainer(KohonenBase, AcceleratedUnit):
    """KohonenForward train pass.

    Must be assigned before initialize():
        input
        shape

    Creates within initialize():
        weights
        winners
        argmins
        _distances
        _coords

    Updates after run():
        weights

    Attributes:
        weights: weights of the current layer.
        input: input of the current layer as batch of 1D samples.
        _krn_distances_: computes distances between input and neuron
            weights.
        _krn_argmin_: finds indexes of minimal computed distances.
        _krn_gravity_: computes gravity to the winner neuron.
        _krn_apply_gradient_: applies gradient to weights.
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenTrainer, self).__init__(workflow, **kwargs)
        self._distances = Array()
        self.argmins = Array()
        self._coords = Array()
        self.weights = Array()
        self.winners = Array()
        self.weights_filling = kwargs.get("weights_filling", "uniform")
        self.weights_stddev = kwargs.get("weights_stddev", None)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.time = 0
        self._sigma = 0
        self.gradient_decay = kwargs.get(
            "gradient_decay", lambda t: 0.1 / (1.0 + t * 0.05))
        self.radius_decay = kwargs.get(
            "radius_decay", lambda t: 1.0 / (1.0 + t * 0.05))
        self.demand("input", "shape")
        self._shape = kwargs.get("shape")

    def init_unpickled(self):
        super(KohonenTrainer, self).init_unpickled()
        self.sources_["kohonen"] = {"TRAIN": 1}
        self._krn_distances_ = None
        self._krn_argmin_ = None
        self._krn_gravity_ = None
        self._krn_compute_gradients_ = None
        self._krn_apply_gradients_ = None
        # The attribute which ocl_init()/ocl_run() actually use has no
        # trailing "s"; reset it too so it always exists.
        self._krn_apply_gradient_ = None

    @property
    def gravity_radius(self):
        return self.radius_decay(self.time) * self._sigma

    @property
    def gradient_multiplier(self):
        return self.gradient_decay(self.time)

    @property
    def shape(self):
        return self._shape

    @shape.setter
    def shape(self, value):
        self._shape = value

    def initialize(self, device, **kwargs):
        """Creates the weights (unless already set), the auxiliary
        buffers and the neuron coordinates grid.
        """
        super(KohonenTrainer, self).initialize(device=device, **kwargs)

        self._neurons_number = self.shape[0] * self.shape[1]
        self._sample_length = self.input.mem.size // self.input.mem.shape[0]

        # Initialize weights
        if self.weights_stddev is None:
            # Get weights magnitude and cap it to 0.05
            self.weights_stddev = min(self._get_weights_magnitude(), 0.05)
        weights_size = (self._sample_length * self._neurons_number)
        if not self.weights:
            self.weights.reset(numpy.zeros(weights_size,
                                           dtype=self.input.mem.dtype))
            filling = {
                "uniform": lambda rand: rand.fill(
                    self.weights.mem, -self.weights_stddev,
                    self.weights_stddev),
                "gaussian": lambda rand: rand.fill_normal_real(
                    self.weights.mem, 0, self.weights_stddev)
            }
            filling[self.weights_filling](prng.get())
            self.weights.mem = self.weights.mem.reshape((
                self._neurons_number, self._sample_length))
        else:
            assert self.weights.shape == (self._neurons_number,
                                          self._sample_length)
        if self.weights_transposed:
            # Reshape weights as a matrix:
            wtrncopy = self.weights.mem.transpose().copy()
            self.weights.mem.shape = wtrncopy.shape
            self.weights.mem[:] = wtrncopy[:]
        self._sample_length = \
            self.weights.mem.shape[0 if self.weights_transposed else 1]

        # Initialize winners
        self.winners.reset(numpy.zeros(self._neurons_number, numpy.int32))

        # Initialize distances
        batch_size = self.input.mem.shape[0]
        self._distances.reset(numpy.zeros(
            [batch_size, self._neurons_number],
            dtype=self.weights.mem.dtype))
        self.argmins.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        self._coords.reset(numpy.zeros([self._neurons_number, 2],
                                       dtype=self.weights.mem.dtype))

        # Lay the neurons out on a rows x cols grid spanning
        # [-1, 1] x [-1, 1]; odd rows are shifted by half a step.
        sz = self._neurons_number
        rows = int(numpy.round(numpy.sqrt(sz)))
        cols = sz // rows
        if sz % rows != 0:
            cols += 1
        x_min = -1.0
        x_max = 1.0
        y_min = -1.0
        y_max = 1.0
        x_step = (x_max - x_min) / (cols - 1) if cols > 1 else 0
        y = y_min
        y_step = (y_max - y_min) / (rows - 1) if rows > 1 else 0
        offs = 0
        mem = self._coords.mem
        for _row in range(rows):
            x = x_min + (x_step * 0.5 if _row & 1 else 0)
            for _col in range(cols):
                # rows * cols may exceed the neurons count (the last row
                # is then only partially filled); never write past the
                # end of the coordinates buffer.
                if offs >= sz:
                    break
                mem[offs, 0] = x
                mem[offs, 1] = y
                offs += 1
                x += x_step
            y += y_step

        self._sigma = (self._coords.mem.ravel().max() -
                       self._coords.mem.ravel().min()) * 1.42

    def ocl_init(self):
        """Initializes device buffers, builds the program and binds the
        kernel arguments which do not change between runs.
        """
        self.input.initialize(self.device)
        self.weights.initialize(self.device)
        self.winners.initialize(self.device)
        self.argmins.initialize(self.device)
        self._distances.initialize(self.device)
        self._coords.initialize(self.device)

        batch_size = self.input.mem.shape[0]
        chunk_size = self._neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self._neurons_number // 2 + 1
        self.argmin_group_size = int(numpy.ceil(
            float(self._neurons_number) / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self._sample_length,
            'NEURONS_NUMBER': self._neurons_number,
            'CHUNK_SIZE': chunk_size,
            'GRADIENT_CHUNK_SIZE': self.device.max_group_size,
            'coord_type': "%s%d" %
            (opencl_types.numpy_dtype_to_opencl(self._coords.mem.dtype),
             self._coords.mem.shape[-1])
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__, batch_size,
                            self._sample_length, self._neurons_number),
                           dtype=self.weights.mem.dtype)

        # Holder for the scalar argument (slot 2) of the gravity and
        # apply_gradient kernels; it is filled in ocl_run().
        self.ocl_consts_ = numpy.zeros(1, dtype=self.weights.mem.dtype)

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem,
                                      self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem,
                                   self.argmins.devmem,
                                   self.winners.devmem)

        self._krn_gravity_ = self.get_kernel("compute_gravity")
        self._krn_gravity_.set_args(self.argmins.devmem,
                                    self._coords.devmem)
        self._krn_gravity_.set_arg(3, self._distances.devmem)

        self._krn_apply_gradient_ = self.get_kernel("apply_gradient")
        self._krn_apply_gradient_.set_args(self.input.devmem,
                                           self._distances.devmem)
        self._krn_apply_gradient_.set_arg(3, self.weights.devmem)

        self._gs_distance = [
            roundup(self._neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def iteration(fn):
        """Decorator: increments self.time after every call of fn."""
        def wrapped(self, *args, **kwargs):
            result = fn(self, *args, **kwargs)
            self.time += 1
            return result

        name = getattr(fn, '__name__',
                       getattr(fn, 'func', wrapped).__name__)
        wrapped.__name__ = name + '_iteration'
        return wrapped

    @iteration
    def numpy_run(self):
        """Performs one training pass over the batch on the host.

        For every sample the winner neuron is found, its counter in
        winners is incremented and the gravity-weighted difference
        between the sample and the weights is accumulated; the sum is
        added to the weights once the whole batch has been processed.
        """
        batch_size = self.input.mem.shape[0]
        neurons_number = self._neurons_number
        dists = numpy.empty(neurons_number)
        gradients = numpy.zeros(self.weights.mem.shape)
        sigma = self.gravity_radius
        gmult = self.gradient_multiplier
        self.input.map_read()
        # weights and winners are read and then updated, so their
        # current contents must be kept: map for read-write rather than
        # invalidating them.
        self.weights.map_write()
        self.winners.map_write()

        coords = self._coords.mem
        for sindex in range(batch_size):
            dist = self.weights.mem - self.input[sindex]
            winner = numpy.argmin(self.numpy_linalg_norm(dist))
            self.winners[winner] += 1
            # Squared grid distance from every neuron to the winner,
            # stored into the preallocated default-dtype buffer.
            delta = coords - coords[winner]
            dists[:] = numpy.sum(delta * delta, axis=1)
            gravity = numpy.exp(dists / (-2 * sigma * sigma))
            gradients += gravity.reshape((1, neurons_number)).transpose() * \
                (self.input[sindex] - self.weights.mem) * gmult
        self.weights.mem += gradients

    @iteration
    def ocl_run(self):
        """Performs one training pass over the batch on the device."""
        self.unmap_vectors(self.input, self.weights, self.winners,
                           self._distances, self.argmins, self._coords)

        batch_size = self.input.mem.shape[0]
        self.execute_kernel(self._gs_distance, self._ls_distance,
                            self._krn_distances_)
        self.execute_kernel([self.argmin_group_size],
                            [self.argmin_group_size],
                            self._krn_argmin_)
        self.ocl_consts_[0] = self.gravity_radius
        self._krn_gravity_.set_arg(2, self.ocl_consts_[0:1])
        self.execute_kernel([batch_size, self._neurons_number], None,
                            self._krn_gravity_)
        self.ocl_consts_[0] = self.gradient_multiplier
        self._krn_apply_gradient_.set_arg(2, self.ocl_consts_[0:1])
        self.execute_kernel(
            [int(numpy.ceil(self._sample_length /
                            self.device.max_group_size)),
             self.device.max_group_size],
            None, self._krn_apply_gradient_)

    # Rebound after its last use as a decorator inside the class body.
    iteration = staticmethod(iteration)

    def _get_weights_magnitude(self):
        """
        Returns: weights magnitude for initial random distribution,
                 such that activation function will be near maximum
                 if all input values are at their supposed max value.

        Doesn't matter for classic Kohonen networks,
        get values as in All2AllTanh.
        """
        d = self.input.max_supposed * self._sample_length
        if self.input.mem.dtype in (numpy.complex64, numpy.complex128):
            return 1.0 / d
        return 9.0 / d