class MemCpy(AcceleratedUnit):
    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(MemCpy, self).initialize(device, **kwargs)
        if (self.output.mem is None or
                self.output.mem.size != self.input.mem.size):
            self.output.reset()
            self.output.mem = numpy.zeros(self.input.mem.shape,
                                          dtype=self.input.mem.dtype)
        self.input.initialize(self.device)
        self.output.initialize(self.device)

    def cuda_init(self):
        pass

    def ocl_init(self):
        pass

    def _gpu_run(self):
        self.input.unmap()
        self.output.unmap()

    def ocl_run(self):
        self._gpu_run()
        self.device.queue_.copy_buffer(self.input.devmem, self.output.devmem,
                                       0, 0, self.input.nbytes)

    def cuda_run(self):
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)
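# Illustration (editor's sketch, not part of the unit above): the host-side
# path of MemCpy reduces to lazy output allocation plus numpy.copyto. The
# helper name below is hypothetical; it only restates that logic on plain
# NumPy arrays, outside the Array/device machinery.
import numpy


def memcpy_numpy(input_mem, output_mem=None):
    """Reallocate the output only when its size changed, then copy."""
    if output_mem is None or output_mem.size != input_mem.size:
        output_mem = numpy.zeros(input_mem.shape, dtype=input_mem.dtype)
    numpy.copyto(output_mem, input_mem)
    return output_mem


src = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)
dst = memcpy_numpy(src)
assert (dst == src).all()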
class MyOCL(IOpenCLUnit):
    def __init__(self):
        # `kibi` is 1024, so `a` is a 512x1024 buffer created from an existing
        # array, while `b` gets its backing array assigned afterwards.
        self.a = Array(zeros([kibi >> 1, kibi], dtype=float32))
        self.b = Array()
        self.b.mem = zeros([kibi, kibi], dtype=float32)

    def initialize(self, device, **kwargs):
        self.a.initialize(self)
        self.b.initialize(self)

        def ocl_init():
            # Bind the device buffers to the kernel once they exist.
            self.krn_.set_arg(0, self.a.devmem)
            self.krn_.set_arg(1, self.b.devmem)

        ocl_init()

    def __call__(self, *args, **kwargs):
        self.a.unmap()
        self.b.unmap()
        # global_size and local_size stand for the real work sizes here.
        self.execute_kernel(global_size, local_size, self.krn_)
        # Map the result back for host-side reading.
        a = self.a.ocl_map_read()
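# Illustration (editor's sketch): MyOCL relies on the map/unmap discipline of
# the memory Array: unmap() before a kernel touches the buffer, map back
# before reading on the host. The ToyArray class below is purely conceptual
# (it is not the Veles Array) and only models that contract.
import numpy


class ToyArray:
    """Tracks whether the host copy of a buffer is currently valid."""

    def __init__(self, mem):
        self.mem = mem
        self._mapped = True  # the host copy is valid right after creation

    def unmap(self):
        # Hand the buffer to the device; host reads are invalid until remapped.
        self._mapped = False

    def map_read(self):
        # Bring device results back before reading on the host.
        self._mapped = True
        return self.mem

    def read(self):
        assert self._mapped, "call map_read() before touching mem on the host"
        return self.mem


buf = ToyArray(numpy.zeros((512, 1024), dtype=numpy.float32))
buf.unmap()           # a kernel would run here
_ = buf.map_read()    # host access is safe again
assert buf.read() is buf.mem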
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return
        if self.input_offset:
            assert self.input_offset.shape[1:] == self.output.shape[1:]
        if (not self.input_offset or
                self.input_offset.shape[0] != self.output.shape[0]):
            self.input_offset.reset(numpy.zeros(self.output.shape,
                                                dtype=numpy.int32))
        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        super(OffsetPooling, self).set_args(self.input, self.output,
                                            self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        batch, y1, x1, ch, out_y, out_x = coords
        cut_index = self.numpy_run_cut_offset(
            cut, numpy.ravel_multi_index((batch, out_y, out_x, ch),
                                         self.output.shape))
        i, j = numpy.unravel_index(cut_index, cut.shape)
        idx = numpy.ravel_multi_index((batch, y1 + i, x1 + j, ch),
                                      self.input.shape)
        val = numpy.ravel(self.input.mem)[idx]
        self.input_offset.mem[batch, out_y, out_x, ch] = idx
        return val
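# Illustration (editor's sketch): numpy_run_cut() stores the flat index of the
# winning input element via ravel_multi_index / unravel_index. The standalone
# example below shows that index bookkeeping for one window of a max-pooling
# variant; the shapes and the window origin are made up for the demonstration.
import numpy

inp = numpy.arange(16, dtype=numpy.float32).reshape(1, 4, 4, 1)  # (batch, y, x, channels)
batch, y1, x1, ch = 0, 2, 0, 0                  # window origin in the input
cut = inp[batch, y1:y1 + 2, x1:x1 + 2, ch]      # 2x2 pooling window

# Position of the maximum inside the window, translated to a flat input offset.
i, j = numpy.unravel_index(numpy.argmax(cut), cut.shape)
idx = numpy.ravel_multi_index((batch, y1 + i, x1 + j, ch), inp.shape)

assert numpy.ravel(inp)[idx] == cut.max()       # the stored offset points at the winner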
class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.
        """
        return (kx - sliding[1], ky - sliding[0],
                kx - sx % sliding[1] if sx % sliding[1] != 0
                else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0
                else ky - sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        if kx % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError(
                "Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding",
                    "input", "weights", "output_shape_source")

    def init_unpickled(self):
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        super(Deconv, self).initialize(device, **kwargs)

        self._dtype = self.input.dtype

        self.weights_shape = (tuple(reversed(self.weights.shape))
                              if self.weights_transposed
                              else self.weights.shape)

        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4 or
                self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2 or
                self.weights_shape[0] != self.n_kernels or
                self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError(
                "output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(
            output_shape[2], output_shape[1], self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            if not self.unsafe_padding:
                raise ValueError(
                    "Expected padding %s but got %s" %
                    (padding, self.padding))
            self._create_hits(output_shape)

        if self.output:
            assert self.output.shape[1:] == output_shape[1:]
        if not self.output or self.output.shape[0] != output_shape[0]:
            self.output.reset(numpy.zeros(output_shape, dtype=self._dtype))

        self._output_shape = output_shape

        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels
        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        if not self.hits:
            self.hits.reset(
                numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }

        self.build_program(
            defines, "%s/%s_%d_%dx%dx%d_%dx%d_%d" % (
                root.common.dirs.cache, self.__class__.__name__,
                self.input.shape[0],
                self._output_shape[2], self._output_shape[1],
                self._output_shape[3],
                self.kx, self.ky, self.n_kernels), dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem,
                                          self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size,)
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size,)
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (
            self.execute_kernel((self.output.size,), None,
                                self.krn_clear_output_))
        self._clear_hits = lambda: (
            self.execute_kernel((self.hits.size,), None,
                                self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        self._global_size_pack = (
            lambda size: (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (
                int(numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        self.gpu_run()

    def cuda_run(self):
        self.gpu_run()

    def gpu_run(self):
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()

        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()

        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)

        if self.hits:
            self.execute_kernel(self._global_size_hits,
                                self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs,
            self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        self.krn_pack_.set_arg(
            1, int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem, self.input.devmem,
            self.np_zero, unpack_data, offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        raise NotImplementedError()
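# Illustration (editor's sketch): compute_padding() pairs the deconvolution
# with the convolution that produced its input and returns the
# (left, top, right, bottom) padding for a target plane of sx by sy. The
# helper below simply restates the static method so its arithmetic can be
# checked standalone; the concrete sizes are made up.
def compute_padding(sx, sy, kx, ky, sliding):
    return (kx - sliding[1], ky - sliding[0],
            kx - sx % sliding[1] if sx % sliding[1] != 0 else kx - sliding[1],
            ky - sy % sliding[0] if sy % sliding[0] != 0 else ky - sliding[0])


# 9x9 target plane, 4x4 kernel, symmetric sliding of 2:
assert compute_padding(9, 9, 4, 4, (2, 2)) == (2, 2, 3, 3)
# When the plane size is a multiple of the sliding, the padding is symmetric:
assert compute_padding(8, 8, 4, 4, (2, 2)) == (2, 2, 2, 2)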
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins  speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
               table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(
            numpy.ceil(batch_size / self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()
        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()
        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
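# Illustration (editor's sketch): the host path of KohonenForward picks, for
# every sample, the neuron whose weight vector is nearest to it. The loop
# below is a self-contained NumPy rendering of that argmin-over-distances
# step, assuming a plain Euclidean norm (in the unit the norm actually comes
# from KohonenBase.numpy_linalg_norm); the sizes are made up.
import numpy

numpy.random.seed(0)
weights = numpy.random.rand(5, 3).astype(numpy.float32)   # 5 neurons, 3 features
batch = numpy.random.rand(4, 3).astype(numpy.float32)     # 4 input samples

winners = numpy.empty(len(batch), dtype=numpy.int32)
for sindex, sample in enumerate(batch):
    dist = weights - sample                                # difference to every neuron
    winners[sindex] = numpy.argmin(numpy.linalg.norm(dist, axis=1))

assert winners.shape == (4,) and winners.max() < len(weights)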
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
            had been changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed
                                  else self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        if (self.include_bias and self.bias and
                (not self.gradient_bias or
                 self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
                (not self.accumulated_gradient_bias or
                 self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and
                (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(self.err_output,
                          self.gradient_weights, self.gradient_bias,
                          self.accumulated_gradient_weights,
                          self.accumulated_gradient_bias,
                          self.gradient_weights_with_moment,
                          self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        self.unmap_vectors(self.input, self.err_output, self.weights,
                           self.gradient_weights,
                           self.accumulated_gradient_weights,
                           self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)

            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        self._weights_const[4:12] = (self.learning_rate, self.weights_decay,
                                     self.l1_vs_l2, self.gradient_moment,
                                     self.acc_alpha, self.acc_beta,
                                     self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(self.err_output, self.bias, self.gradient_bias,
                           self.accumulated_gradient_bias,
                           self.gradient_bias_with_moment)

        self._bias_const[5:13] = (self.learning_rate_bias,
                                  self.weights_decay_bias,
                                  self.l1_vs_l2_bias,
                                  self.gradient_moment_bias,
                                  self.acc_alpha, self.acc_beta,
                                  self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(
            self.device.skip(5), self._bias_const[5:6],
            self._bias_const[6:7], self._bias_const[7:8],
            self._bias_const[8:9], self._bias_const[9:10],
            self._bias_const[10:11], self._bias_const[11:12],
            self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient
                 if self.acc_beta else 0))

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step"""

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        if not isinstance(value, int):
            raise TypeError(
                "grouping value must be an integer (got %s)" % type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            if self.effective_shape[1] % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" % (self.weights.shape, self.grouping))
            self.mask.reset(numpy.zeros(self.effective_shape,
                                        dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            for kernel in range(self.effective_shape[0]):
                for chan in range(self.effective_shape[1]):
                    self.mask[kernel, chan] = not (kernel % self.grouping ==
                                                   chan % self.grouping)
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        self.mask.map_read()
        self.weights.map_write()
        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()
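# Illustration (editor's sketch): ZeroFiller zeroes every weight whose kernel
# index and channel index agree modulo `grouping`; numpy_run() is then just an
# element-wise multiply by that mask. The helper below rebuilds the same mask
# on a bare NumPy array; the shapes are made up.
import numpy


def build_zero_mask(n_kernels, n_channels, grouping):
    """1 keeps a weight, 0 suppresses it (same rule as ZeroFiller.initialize)."""
    mask = numpy.zeros((n_kernels, n_channels), dtype=numpy.float32)
    for kernel in range(n_kernels):
        for chan in range(n_channels):
            mask[kernel, chan] = not (kernel % grouping == chan % grouping)
    return mask


weights = numpy.ones((4, 6), dtype=numpy.float32)
weights *= build_zero_mask(4, 6, grouping=2)     # what numpy_run() does
assert weights[0, 0] == 0 and weights[0, 1] == 1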