Пример #1
0
class MemCpy(AcceleratedUnit):
    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(MemCpy, self).initialize(device, **kwargs)
        if (self.output.mem is None
                or self.output.mem.size != self.input.mem.size):
            self.output.reset()
            self.output.mem = numpy.zeros(self.input.mem.shape,
                                          dtype=self.input.mem.dtype)
        self.input.initialize(self.device)
        self.output.initialize(self.device)

    def cuda_init(self):
        pass

    def ocl_init(self):
        pass

    def _gpu_run(self):
        self.input.unmap()
        self.output.unmap()

    def ocl_run(self):
        self._gpu_run()
        self.device.queue_.copy_buffer(self.input.devmem, self.output.devmem,
                                       0, 0, self.input.nbytes)

    def cuda_run(self):
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)
Пример #2
0
class MemCpy(AcceleratedUnit):
    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(MemCpy, self).initialize(device, **kwargs)
        if (self.output.mem is None or
                self.output.mem.size != self.input.mem.size):
            self.output.reset()
            self.output.mem = numpy.zeros(self.input.mem.shape,
                                          dtype=self.input.mem.dtype)
        self.input.initialize(self.device)
        self.output.initialize(self.device)

    def cuda_init(self):
        pass

    def ocl_init(self):
        pass

    def _gpu_run(self):
        self.input.unmap()
        self.output.unmap()

    def ocl_run(self):
        self._gpu_run()
        self.device.queue_.copy_buffer(self.input.devmem, self.output.devmem,
                                       0, 0, self.input.nbytes)

    def cuda_run(self):
        self._gpu_run()
        self.output.devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        self.input.map_read()
        self.output.map_invalidate()
        numpy.copyto(self.output.mem, self.input.mem)
Пример #3
0
class MyOCL(IOpenCLUnit):
    def __init__(self):
        self.a = Array(zeros([kibi >> 1, kibi], dtype=float32))
        self.b = Array()
        self.b.mem = zeros([kibi, kibi], dtype=float32)

    def initialize(self, device, **kwargs):
        self.a.initialize(self)
        self.b.initialize(self)

        def ocl_init():
            self.krn_.set_arg(0, self.a.devmem)
            self.krn_.set_arg(1, self.b.devmem)

        ocl_init()

    def __call__(self, *args, **kwargs):
        self.a.unmap()
        self.b.unmap()
        self.execute_kernel(global_size, local_size, self.krn_)

        a = self.a.ocl_map_read()
Пример #4
0
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return
        if self.input_offset:
            assert self.input_offset.shape[1:] == self.output.shape[1:]
        if (not self.input_offset or
                self.input_offset.shape[0] != self.output.shape[0]):
            self.input_offset.reset(numpy.zeros(self.output.shape,
                                                dtype=numpy.int32))
        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        super(OffsetPooling, self).set_args(self.input, self.output,
                                            self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        batch, y1, x1, ch, out_y, out_x = coords
        cut_index = self.numpy_run_cut_offset(
            cut, numpy.ravel_multi_index((batch, out_y, out_x, ch),
                                         self.output.shape))
        i, j = numpy.unravel_index(cut_index, cut.shape)
        idx = numpy.ravel_multi_index((batch, y1 + i, x1 + j, ch),
                                      self.input.shape)
        val = numpy.ravel(self.input.mem)[idx]
        self.input_offset.mem[batch, out_y, out_x, ch] = idx
        return val
Пример #5
0
class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.
        """
        return (kx - sliding[1], ky - sliding[0],
                kx - sx % sliding[1] if sx % sliding[1] != 0
                else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0
                else ky - sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        if kx % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError(
                "Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding",
                    "input", "weights", "output_shape_source")

    def init_unpickled(self):
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        super(Deconv, self).initialize(device, **kwargs)

        self._dtype = self.input.dtype

        self.weights_shape = (tuple(reversed(self.weights.shape))
                              if self.weights_transposed
                              else self.weights.shape)

        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4 or
                self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2 or
                self.weights_shape[0] != self.n_kernels or
                self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError(
                "output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(
            output_shape[2], output_shape[1], self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            if not self.unsafe_padding:
                raise ValueError(
                    "Expected padding %s but got %s" % (padding, self.padding))
            self._create_hits(output_shape)

        if self.output:
            assert self.output.shape[1:] == output_shape[1:]
        if not self.output or self.output.shape[0] != output_shape[0]:
            self.output.reset(numpy.zeros(output_shape,
                                          dtype=self._dtype))

        self._output_shape = output_shape

        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels

        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        if not self.hits:
            self.hits.reset(
                numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }

        self.build_program(
            defines, "%s/%s_%d_%dx%dx%d_%dx%d_%d" % (
                root.common.dirs.cache, self.__class__.__name__,
                self.input.shape[0],
                self._output_shape[2], self._output_shape[1],
                self._output_shape[3],
                self.kx, self.ky, self.n_kernels), dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem, self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size,)
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size,)
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (
            self.execute_kernel((self.output.size,), None,
                                self.krn_clear_output_))
        self._clear_hits = lambda: (
            self.execute_kernel((self.hits.size,), None, self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        self._global_size_pack = (
            lambda size: (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (
                int(numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        self.gpu_run()

    def cuda_run(self):
        self.gpu_run()

    def gpu_run(self):
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()
        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()
        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)
        if self.hits:
            self.execute_kernel(self._global_size_hits, self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs,
            self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        self.krn_pack_.set_arg(
            1, int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            self.input.devmem,
            self.np_zero, unpack_data, offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        raise NotImplementedError()
Пример #6
0
class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.
        """
        return (kx - sliding[1], ky - sliding[0], kx -
                sx % sliding[1] if sx % sliding[1] != 0 else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0 else ky -
                sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        if kx % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError("Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding", "input",
                    "weights", "output_shape_source")

    def init_unpickled(self):
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        super(Deconv, self).initialize(device, **kwargs)

        self._dtype = self.input.dtype

        self.weights_shape = (tuple(reversed(self.weights.shape)) if
                              self.weights_transposed else self.weights.shape)

        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4
                or self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2
                or self.weights_shape[0] != self.n_kernels
                or self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError("output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(output_shape[2], output_shape[1],
                                         self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            if not self.unsafe_padding:
                raise ValueError("Expected padding %s but got %s" %
                                 (padding, self.padding))
            self._create_hits(output_shape)

        if not self.output:
            self.output.reset(numpy.zeros(output_shape, dtype=self._dtype))
        else:
            assert self.output.shape == output_shape

        self._output_shape = output_shape

        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels

        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        if not self.hits:
            self.hits.reset(numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }

        self.build_program(
            defines,
            "%s/%s_%d_%dx%dx%d_%dx%d_%d" %
            (root.common.dirs.cache, self.__class__.__name__,
             self.input.shape[0], self._output_shape[2], self._output_shape[1],
             self._output_shape[3], self.kx, self.ky, self.n_kernels),
            dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem, self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size, )
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size, )
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (self.execute_kernel(
            (self.output.size, ), None, self.krn_clear_output_))
        self._clear_hits = lambda: (self.execute_kernel(
            (self.hits.size, ), None, self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        self._global_size_pack = (lambda size:
                                  (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (int(
                numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        self.gpu_run()

    def cuda_run(self):
        self.gpu_run()

    def gpu_run(self):
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()
        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()
        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)
        if self.hits:
            self.execute_kernel(self._global_size_hits, self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T
            if self.weights_transposed else cublas.CUBLAS_OP_N,
            cublas.CUBLAS_OP_N, self._kernel_size, unpack_side,
            self.weights_shape[0], self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs, self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        self.krn_pack_.set_arg(
            1,
            int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(self.device.blas,
                   cublas.CUBLAS_OP_T
                   if self.weights_transposed else cublas.CUBLAS_OP_N,
                   cublas.CUBLAS_OP_N,
                   self._kernel_size,
                   unpack_side,
                   self.weights_shape[0],
                   self.np_one,
                   self.weights.devmem,
                   self.input.devmem,
                   self.np_zero,
                   unpack_data,
                   offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        raise NotImplementedError()
Пример #7
0
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(
            numpy.ceil(batch_size / self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines,
                           "%s_%d_%d_%d" %
                           (self.__class__.__name__, batch_size,
                            self.sample_length, self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size], self._krn_argmin_)
        else:
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
Пример #8
0
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
                            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed else
                                  self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        if (self.include_bias and self.bias
                and (not self.gradient_bias
                     or self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
            (not self.accumulated_gradient_bias
             or self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias
                and (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(self.err_output, self.gradient_weights,
                          self.gradient_bias,
                          self.accumulated_gradient_weights,
                          self.accumulated_gradient_bias,
                          self.gradient_weights_with_moment,
                          self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        self.unmap_vectors(self.input, self.err_output, self.weights,
                           self.gradient_weights,
                           self.accumulated_gradient_weights,
                           self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)

            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        self._weights_const[4:12] = (self.learning_rate, self.weights_decay,
                                     self.l1_vs_l2, self.gradient_moment,
                                     self.acc_alpha, self.acc_beta,
                                     self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(self.err_output, self.bias, self.gradient_bias,
                           self.accumulated_gradient_bias,
                           self.gradient_bias_with_moment)

        self._bias_const[5:13] = (self.learning_rate_bias,
                                  self.weights_decay_bias, self.l1_vs_l2_bias,
                                  self.gradient_moment_bias, self.acc_alpha,
                                  self.acc_beta, self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(self.device.skip(5), self._bias_const[5:6],
                                self._bias_const[6:7], self._bias_const[7:8],
                                self._bias_const[8:9], self._bias_const[9:10],
                                self._bias_const[10:11],
                                self._bias_const[11:12],
                                self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output, self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient if self.acc_beta else 0))

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight,
                            gradient,
                            lr,
                            factor_l12,
                            l1_vs_l2,
                            factor_ortho=0,
                            weights_transposed=False):
        gradient = gradient.copy()
        gradient += factor_l12 * (
            (1.0 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(
                axis=1) if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
Пример #9
0
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
                            had been changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient", not workflow.is_slave)

    @property
    def current_batch_size(self):
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = tuple(reversed(self.weights.shape)) if self.weights_transposed else self.weights.shape
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment", self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.accumulated_gradient_weights.size == self.weights.size

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == self.weights.size

        if self.include_bias and self.bias and (not self.gradient_bias or self.gradient_bias.size != self.bias.size):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (
            self.include_bias
            and self.bias
            and self.accumulate_gradient
            and (not self.accumulated_gradient_bias or self.accumulated_gradient_bias.size != self.bias.size)
        ):
            self.accumulated_gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if self.include_bias and self.bias and (self.gradient_moment_bias or not self.is_standalone):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights,
            self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment,
        )

    def gpu_weights_update(self):
        self.unmap_vectors(
            self.input,
            self.err_output,
            self.weights,
            self.gradient_weights,
            self.accumulated_gradient_weights,
            self.gradient_weights_with_moment,
        )

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho, self._local_size_ortho, self.krn_compute_col_sums_)

            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        self._weights_const[4:12] = (
            self.learning_rate,
            self.weights_decay,
            self.l1_vs_l2,
            self.gradient_moment,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_weights_.set_args(
            self.device.skip(4),
            self._weights_const[4:5],
            self._weights_const[5:6],
            self._weights_const[6:7],
            self._weights_const[7:8],
            self._weights_const[8:9],
            self._weights_const[9:10],
            self._weights_const[10:11],
            self._weights_const[11:12],
        )

        self.execute_kernel(self._global_size_weights, self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output,
            self.bias,
            self.gradient_bias,
            self.accumulated_gradient_bias,
            self.gradient_bias_with_moment,
        )

        self._bias_const[5:13] = (
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.l1_vs_l2_bias,
            self.gradient_moment_bias,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_bias_.set_args(
            self.device.skip(5),
            self._bias_const[5:6],
            self._bias_const[6:7],
            self._bias_const[7:8],
            self._bias_const[8:9],
            self._bias_const[9:10],
            self._bias_const[10:11],
            self._bias_const[11:12],
            self._bias_const[12:13],
        )

        self.execute_kernel(self._global_size_bias, self._local_size_bias, self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output, self._local_size_err_output, self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [
            ("Weight", weights),
            ("Bias", bias),
            ("Grad Weight", grad_weights),
            ("Grad Bias", grad_bias),
        ]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        return (
            self.learning_rate,
            self.weights_decay,
            self.gradient_moment,
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.gradient_moment_bias,
        )

    @staticmethod
    def fill_zeros(vector):
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem, self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = gradient * self.acc_alpha + (
                self.acc_beta * accumulated_gradient if self.acc_beta else 0
            )

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2, factor_ortho=0, weights_transposed=False):
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = reshape_transposed(weight).sum(axis=1) if weights_transposed else weight.sum(axis=0)
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
Пример #10
0
class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step"""

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        if not isinstance(value, int):
            raise TypeError("grouping value must be an integer (got %s)" %
                            type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            if self.effective_shape[1] % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" % (self.weights.shape, self.grouping))
            self.mask.reset(
                numpy.zeros(self.effective_shape, dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            for kernel in range(self.effective_shape[0]):
                for chan in range(self.effective_shape[1]):
                    self.mask[kernel, chan] = not (kernel % self.grouping
                                                   == chan % self.grouping)
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        self.mask.map_read()
        self.weights.map_write()

        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()
Пример #11
0
class OffsetPooling(Pooling):
    """Pooling by offset forward propagation.

    Must be assigned before initialize():

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: offsets in the input where elements are passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return
        if not self.input_offset:
            self.input_offset.reset(numpy.zeros(self.output.shape,
                                                dtype=numpy.int32))
        else:
            assert self.input_offset.shape == self.output.shape
        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        super(OffsetPooling, self).set_args(self.input, self.output,
                                            self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        batch, y1, x1, ch, out_y, out_x = coords
        cut_index = self.numpy_run_cut_offset(
            cut, numpy.ravel_multi_index((batch, out_y, out_x, ch),
                                         self.output.shape))
        i, j = numpy.unravel_index(cut_index, cut.shape)
        idx = numpy.ravel_multi_index((batch, y1 + i, x1 + j, ch),
                                      self.input.shape)
        val = numpy.ravel(self.input.mem)[idx]
        self.input_offset.mem[batch, out_y, out_x, ch] = idx
        return val
Пример #12
0
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(numpy.zeros(
                [batch_size, self.neurons_number],
                dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
Пример #13
0
class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step"""

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        if not isinstance(value, int):
            raise TypeError(
                "grouping value must be an integer (got %s)" % type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            if self.effective_shape[1] % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" %
                    (self.weights.shape, self.grouping))
            self.mask.reset(numpy.zeros(self.effective_shape,
                                        dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            for kernel in range(self.effective_shape[0]):
                for chan in range(self.effective_shape[1]):
                    self.mask[kernel, chan] = not (
                        kernel % self.grouping == chan % self.grouping)
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        self.mask.map_read()
        self.weights.map_write()

        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()