Python Array.unmap примеры использования

Язык программирования: Python

Пространство имен/Пакет: veles.memory

Класс/Тип: Array

Метод/Функция: unmap

Примеров на hotexamples.com: 13

Python Array.unmap — найдено 13 примеров. Это лучшие примеры кода на Python для veles.memory.Array.unmap, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Array(30)

reset(25)

initialize(8)

map_write(7)

unmap(7)

map_read(3)

map_invalidate(2)

ocl_map_read(1)

Пример #1

Показать файл

Файл: rbm_units.py Проект: vmarkovtsev/veles.znicz

class MemCpy(AcceleratedUnit):
    """Unit that copies the contents of ``input`` into ``output``.

    ``input`` is demanded from the workflow; ``output`` is created here and
    allocated in initialize() with the input's shape and dtype.
    """

    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Allocate ``output`` when it is missing or mis-sized, then
        initialize both arrays on ``self.device``."""
        super(MemCpy, self).initialize(device, **kwargs)
        source, target = self.input, self.output
        size_matches = (target.mem is not None and
                        target.mem.size == source.mem.size)
        if not size_matches:
            # Reset and attach a zero-filled buffer mirroring the input.
            target.reset()
            target.mem = numpy.zeros(source.mem.shape,
                                     dtype=source.mem.dtype)
        for array in (source, target):
            array.initialize(self.device)

    def cuda_init(self):
        """No CUDA-specific setup is performed."""

    def ocl_init(self):
        """No OpenCL-specific setup is performed."""

    def _gpu_run(self):
        """Unmap both arrays before a device-side copy is issued."""
        for array in (self.input, self.output):
            array.unmap()

    def ocl_run(self):
        """Copy input to output through the OpenCL queue."""
        self._gpu_run()
        queue = self.device.queue_
        queue.copy_buffer(self.input.devmem, self.output.devmem,
                          0, 0, self.input.nbytes)

    def cuda_run(self):
        """Copy input to output with an asynchronous device-to-device call."""
        self._gpu_run()
        destination = self.output.devmem
        destination.from_device_async(self.input.devmem)

    def numpy_run(self):
        """Copy input to output on the host with numpy."""
        source, target = self.input, self.output
        source.map_read()
        target.map_invalidate()
        numpy.copyto(target.mem, source.mem)

Пример #2

Показать файл

Файл: rbm_units.py Проект: Samsung/veles.znicz

class MemCpy(AcceleratedUnit):
    """Copies ``input`` into ``output`` on whichever backend is active.

    ``input`` has to be provided by the workflow (it is demanded in the
    constructor); ``output`` is owned by this unit.
    """

    def __init__(self, workflow, **kwargs):
        super(MemCpy, self).__init__(workflow, **kwargs)
        self.output = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Prepare ``output`` to receive a copy of ``input``."""
        super(MemCpy, self).initialize(device, **kwargs)
        src = self.input
        dst = self.output
        if dst.mem is None:
            needs_alloc = True
        else:
            needs_alloc = dst.mem.size != src.mem.size
        if needs_alloc:
            # Fresh zero-filled buffer with the input's shape and dtype.
            dst.reset()
            dst.mem = numpy.zeros(src.mem.shape, dtype=src.mem.dtype)
        src.initialize(self.device)
        dst.initialize(self.device)

    def cuda_init(self):
        """Nothing to set up for CUDA."""

    def ocl_init(self):
        """Nothing to set up for OpenCL."""

    def _gpu_run(self):
        """Unmap input and output so the device copy can be issued."""
        src, dst = self.input, self.output
        src.unmap()
        dst.unmap()

    def ocl_run(self):
        """Device copy via the OpenCL queue's copy_buffer()."""
        self._gpu_run()
        copy_buffer = self.device.queue_.copy_buffer
        copy_buffer(self.input.devmem, self.output.devmem,
                    0, 0, self.input.nbytes)

    def cuda_run(self):
        """Device copy via from_device_async() on the output buffer."""
        self._gpu_run()
        out_devmem = self.output.devmem
        out_devmem.from_device_async(self.input.devmem)

    def numpy_run(self):
        """Host copy: map input for reading, invalidate output, copy."""
        src, dst = self.input, self.output
        src.map_read()
        dst.map_invalidate()
        numpy.copyto(dst.mem, src.mem)

Пример #3

Показать файл

Файл: test_veles.py Проект: InonS/Open-GPGPU-ANN

class MyOCL(IOpenCLUnit):
    """Test unit holding two float32 arrays that are passed to a kernel."""

    def __init__(self):
        half_shape = [kibi >> 1, kibi]
        self.a = Array(zeros(half_shape, dtype=float32))
        self.b = Array()
        self.b.mem = zeros([kibi, kibi], dtype=float32)

    def initialize(self, device, **kwargs):
        # NOTE(review): the arrays are initialized with this unit (``self``)
        # and ``device`` is not used in this method -- confirm this is
        # what Array.initialize() expects.
        for array in (self.a, self.b):
            array.initialize(self)

        # Bind the device buffers as kernel arguments 0 and 1.
        for position, array in enumerate((self.a, self.b)):
            self.krn_.set_arg(position, array.devmem)

    def __call__(self, *args, **kwargs):
        # Unmap both arrays, then launch the kernel.
        for array in (self.a, self.b):
            array.unmap()
        self.execute_kernel(global_size, local_size, self.krn_)

        # The mapped result is bound to a local and not returned.
        mapped_a = self.a.ocl_map_read()

Пример #4

Показать файл

Файл: pooling.py Проект: Samsung/veles.znicz

class OffsetPooling(Pooling):
    """Forward pooling that also records the input offsets it selected.

    Must be assigned before initialize():
        input

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: int32 Array shaped like the output; each element holds
                      the flat index into the input of the value that was
                      passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Allocate ``input_offset`` to match the output and initialize it.

        Does nothing beyond the parent's initialize() when the unit has no
        output (``self._no_output``).
        """
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return

        offsets = self.input_offset
        out_shape = self.output.shape
        if offsets:
            # An existing buffer may only differ in the first dimension.
            assert offsets.shape[1:] == out_shape[1:]
            needs_alloc = offsets.shape[0] != out_shape[0]
        else:
            needs_alloc = True
        if needs_alloc:
            offsets.reset(numpy.zeros(out_shape, dtype=numpy.int32))
        offsets.initialize(self.device)

    def set_args(self, *args):
        """Pass input, output and input_offset ahead of the extra args."""
        leading = (self.input, self.output, self.input_offset)
        super(OffsetPooling, self).set_args(*(leading + args))

    def ocl_run(self):
        """Unmap ``input_offset`` and delegate to the parent's OpenCL run."""
        offsets = self.input_offset
        offsets.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        """Unmap ``input_offset`` and delegate to the parent's CUDA run."""
        offsets = self.input_offset
        offsets.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        """Invalidate ``input_offset`` on the host and delegate to the
        parent's numpy run."""
        offsets = self.input_offset
        offsets.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        """Select one value from ``cut`` and record its input offset.

        ``coords`` unpacks into (batch, y1, x1, ch, out_y, out_x). The
        position inside ``cut`` is chosen by numpy_run_cut_offset(), which
        this class does not define itself.

        Returns:
            The input value found at the selected position.
        """
        batch, y1, x1, ch, out_y, out_x = coords
        # Flat index of the output element being produced.
        out_flat = numpy.ravel_multi_index((batch, out_y, out_x, ch),
                                           self.output.shape)
        picked = self.numpy_run_cut_offset(cut, out_flat)
        row, col = numpy.unravel_index(picked, cut.shape)
        # Translate the position inside the cut to a flat input index.
        in_flat = numpy.ravel_multi_index((batch, y1 + row, x1 + col, ch),
                                          self.input.shape)
        selected = numpy.ravel(self.input.mem)[in_flat]
        self.input_offset.mem[batch, out_y, out_x, ch] = in_flat
        return selected

Пример #5

Показать файл

Файл: deconv.py Проект: Samsung/veles.znicz

class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
        hits: int32 Array shaped like the output; allocated only when the
              padding/sliding is unsafe and consumed by the "apply_hits"
              kernel.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.

        Returns:
            A 4-tuple derived from the kernel size, the sliding and the
            remainders of sx/sy by the sliding; initialize() compares it
            with (or assigns it to) ``self.padding``.
        """
        return (kx - sliding[1], ky - sliding[0],
                kx - sx % sliding[1] if sx % sliding[1] != 0
                else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0
                else ky - sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        """Validates the kernel size against the sliding.

        Raises:
            ValueError: if a sliding step is greater than half of the
                matching kernel side, or if a kernel side is not a multiple
                of its sliding step.
        """
        # sliding[0] is paired with ky and sliding[1] with kx, the same way
        # compute_padding() pairs them.
        # NOTE(review): the class docstring and the SLIDE_X/SLIDE_Y defines
        # treat sliding as (x, y) -- confirm which convention is intended.
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        # Each kernel side is checked against its own step; previously kx
        # was tested against both steps and ky was never checked.
        if ky % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError(
                "Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        """Creates the unit.

        Recognized keyword arguments (besides those of the base classes):
            unsafe_padding: allow unsafe padding/sliding (default False).
        """
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        # This layer works without bias; initialize() rejects a bias
        # attribute if one is set again later.
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding",
                    "input", "weights", "output_shape_source")

    def init_unpickled(self):
        """Registers the "deconv/forward" source for the program build."""
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        """Validates the configuration and allocates the output.

        Raises:
            ValueError: on a set bias, wrongly shaped input, weights or
                output_shape_source, or on unsafe/unexpected padding when
                ``unsafe_padding`` is not enabled.
        """
        super(Deconv, self).initialize(device, **kwargs)

        self._dtype = self.input.dtype

        self.weights_shape = (tuple(reversed(self.weights.shape))
                              if self.weights_transposed
                              else self.weights.shape)

        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4 or
                self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2 or
                self.weights_shape[0] != self.n_kernels or
                self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError(
                "output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            # Unsafe mode: carry on with a warning and make sure the hits
            # array exists.
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(
            output_shape[2], output_shape[1], self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            # A user-supplied padding that differs from the computed one is
            # only accepted in unsafe mode.
            if not self.unsafe_padding:
                raise ValueError(
                    "Expected padding %s but got %s" % (padding, self.padding))
            self._create_hits(output_shape)

        # An existing output may only differ in the first dimension; it is
        # reallocated when empty or when that dimension changed.
        if self.output:
            assert self.output.shape[1:] == output_shape[1:]
        if not self.output or self.output.shape[0] != output_shape[0]:
            self.output.reset(numpy.zeros(output_shape,
                                          dtype=self._dtype))

        self._output_shape = output_shape

        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels

        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        """Allocates ``hits`` as int32 zeros of ``output_shape`` if it is
        empty; otherwise asserts that its size matches."""
        if not self.hits:
            self.hits.reset(
                numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        """Backend-independent device setup.

        Builds the program, fetches the "DirectPack" kernel (and
        "apply_hits" when hits are used), requests the temporary unpack
        buffer and creates the gemm callable from ``blas_class``.
        """
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            # 1 without hits, 2 with hits.
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }

        self.build_program(
            defines, "%s/%s_%d_%dx%dx%d_%dx%d_%d" % (
                root.common.dirs.cache, self.__class__.__name__,
                self.input.shape[0],
                self._output_shape[2], self._output_shape[1],
                self._output_shape[3],
                self.kx, self.ky, self.n_kernels), dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        # Temporary buffer sized for one sub-block of ``unpack_size`` images.
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem, self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        """OpenCL setup: kernels for clearing output/hits and launch sizes."""
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size,)
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size,)
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (
            self.execute_kernel((self.output.size,), None,
                                self.krn_clear_output_))
        # krn_clear_hits_ only exists when hits are used; gpu_run() calls
        # this lambda only in that case.
        self._clear_hits = lambda: (
            self.execute_kernel((self.hits.size,), None, self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        """CUDA setup: launch sizes from suggested block sizes and
        memset-based clearing of output/hits."""
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        # ceil(size / block_size) in the first dimension.
        self._global_size_pack = (
            lambda size: (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (
                int(numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        """Runs the layer on OpenCL (shared implementation in gpu_run)."""
        self.gpu_run()

    def cuda_run(self):
        """Runs the layer on CUDA (shared implementation in gpu_run)."""
        self.gpu_run()

    def gpu_run(self):
        """Clears the output (and hits), processes the batch in sub-blocks
        of at most ``unpack_size`` images and finally applies hits."""
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()
        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()
        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            # The last sub-block may hold fewer than unpack_size images.
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)
        if self.hits:
            self.execute_kernel(self._global_size_hits, self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        """Processes ``image_count`` images starting at ``start_image``:
        gemm into ``unpack_data``, then the pack kernel into the output."""
        # Despite the name, this is the byte offset of the sub-block inside
        # the *input* buffer (it is added to input.devmem below).
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs,
            self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        # The output pointer is advanced to the first image of the sub-block.
        self.krn_pack_.set_arg(
            1, int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        """OpenCL counterpart of _cuda_process_subblock(); offsets are
        passed as arguments instead of being added to buffer addresses."""
        # Offset into the input (not multiplied by itemsize), passed to
        # gemm as offsetB.
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T if self.weights_transposed
            else cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N,
            self._kernel_size, unpack_side, self.weights_shape[0],
            self.np_one, self.weights.devmem,
            self.input.devmem,
            self.np_zero, unpack_data, offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        # Here argument 2 carries the output offset of the sub-block (the
        # CUDA path passes the element limit instead).
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        """Not available: there is no numpy implementation of this layer."""
        raise NotImplementedError()

Пример #6

Показать файл

Файл: deconv.py Проект: vmarkovtsev/veles.znicz

class Deconv(TriviallyDistributable, ConvolutionalBase, nn_units.Forward):
    # TriviallyDistributable overrides nn_units.Forward IDistributable
    """Deconvolutional layer for simple convolutional layer
    with linear activation and without bias.

    Must be assigned before initialize():
        input
        weights
        output_shape_source

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of multichannel interleaved images.
        output: output as batch of multichannel interleaved images.
        weights: matrix of weights.
        output_shape_source: Array to get output shape from.
        n_kernels: number of convolutional kernels
                   in the corresponding convolutional layer.
        kx: kernel width.
        ky: kernel height.
        sliding: tuple of kernel sliding (by x-axis, by y-axis),
                 kx, ky MUST be a multiple of sliding to avoid irregularities.
        padding: tuple of virtual sample padding (left, top, right, bottom),
                 will be computed automatically based on sliding.
        weights_transposed: assume weights matrix as a transposed one.
        unsafe_padding: flag to enable unsafe padding and/or sliding.
        hits: int32 Array shaped like the output; allocated only when the
              padding/sliding is unsafe and consumed by the "apply_hits"
              kernel.
    """

    MAPPING = {"deconv"}

    @staticmethod
    def compute_padding(sx, sy, kx, ky, sliding):
        """Computes required padding.

        Returns:
            A 4-tuple derived from the kernel size, the sliding and the
            remainders of sx/sy by the sliding; initialize() compares it
            with (or assigns it to) ``self.padding``.
        """
        return (kx - sliding[1], ky - sliding[0], kx -
                sx % sliding[1] if sx % sliding[1] != 0 else kx - sliding[1],
                ky - sy % sliding[0] if sy % sliding[0] != 0 else ky -
                sliding[0])

    @staticmethod
    def check_padding_is_safe(kx, ky, sliding):
        """Validates the kernel size against the sliding.

        Raises:
            ValueError: if a sliding step is greater than half of the
                matching kernel side, or if a kernel side is not a multiple
                of its sliding step.
        """
        # sliding[0] is paired with ky and sliding[1] with kx, the same way
        # compute_padding() pairs them.
        # NOTE(review): the class docstring and the SLIDE_X/SLIDE_Y defines
        # treat sliding as (x, y) -- confirm which convention is intended.
        if sliding[0] > (ky >> 1) or sliding[1] > (kx >> 1):
            raise ValueError(
                "sliding should not be greater than half of the kernel size")
        # Each kernel side is checked against its own step; previously kx
        # was tested against both steps and ky was never checked.
        if ky % sliding[0] != 0 or kx % sliding[1] != 0:
            raise ValueError("Kernel size should be multiple of sliding")

    def __init__(self, workflow, **kwargs):
        """Creates the unit.

        Recognized keyword arguments (besides those of the base classes):
            unsafe_padding: allow unsafe padding/sliding (default False).
        """
        super(Deconv, self).__init__(workflow, **kwargs)
        self.unsafe_padding = kwargs.get("unsafe_padding", False)
        self.hits = Array()
        self.krn_clear_output_ = None
        self._global_size = None
        self._local_size = None
        # This layer works without bias; initialize() rejects a bias
        # attribute if one is set again later.
        del self.bias
        self.demand("n_kernels", "kx", "ky", "padding", "sliding", "input",
                    "weights", "output_shape_source")

    def init_unpickled(self):
        """Registers the "deconv/forward" source for the program build."""
        super(Deconv, self).init_unpickled()
        self.sources_["deconv/forward"] = {}

    def initialize(self, device, **kwargs):
        """Validates the configuration and allocates the output.

        Raises:
            ValueError: on a set bias, wrongly shaped input, weights or
                output_shape_source, or on unsafe/unexpected padding when
                ``unsafe_padding`` is not enabled.
        """
        super(Deconv, self).initialize(device, **kwargs)

        self._dtype = self.input.dtype

        self.weights_shape = (tuple(reversed(self.weights.shape)) if
                              self.weights_transposed else self.weights.shape)

        if hasattr(self, "bias"):
            raise ValueError("bias should not be set")
        if (len(self.input.shape) != 4
                or self.input.shape[3] != self.n_kernels):
            raise ValueError("Incorrectly shaped input encountered")
        if (len(self.weights_shape) != 2
                or self.weights_shape[0] != self.n_kernels
                or self.weights_shape[1] % (self.kx * self.ky) != 0):
            raise ValueError("Incorrectly shaped weights encountered")

        output_shape = tuple(self.output_shape_source.shape)
        if len(output_shape) != 4:
            raise ValueError("Incorrect output_shape_source shape")
        if output_shape[0] != self.input.shape[0]:
            raise ValueError("output_shape_source.shape[0] != input.shape[0]")

        try:
            self.check_padding_is_safe(self.kx, self.ky, self.sliding)
        except ValueError as e:
            if not self.unsafe_padding:
                raise from_none(e)
            # Unsafe mode: carry on with a warning and make sure the hits
            # array exists.
            self.warning("The padding will be unsafe")
            self._create_hits(output_shape)

        padding = Deconv.compute_padding(output_shape[2], output_shape[1],
                                         self.kx, self.ky, self.sliding)
        if self.padding is None:  # pylint: disable=E0203
            self.padding = padding
        elif self.padding != padding:
            # A user-supplied padding that differs from the computed one is
            # only accepted in unsafe mode.
            if not self.unsafe_padding:
                raise ValueError("Expected padding %s but got %s" %
                                 (padding, self.padding))
            self._create_hits(output_shape)

        # Allocate the output when it is empty; an existing one must
        # already have exactly the expected shape.
        if not self.output:
            self.output.reset(numpy.zeros(output_shape, dtype=self._dtype))
        else:
            assert self.output.shape == output_shape

        self._output_shape = output_shape

        self._sy, self._sx, self._n_channels = self._output_shape[1:]
        self._kernel_size = self.kx * self.ky * self._n_channels

        self._kernel_app_per_image = self.input.sample_size // self.n_kernels
        self._kernel_app_total = (self._kernel_app_per_image *
                                  self.input.shape[0])

        self.init_vectors(self.input, self.weights, self.output, self.hits)

    def _create_hits(self, output_shape):
        """Allocates ``hits`` as int32 zeros of ``output_shape`` if it is
        empty; otherwise asserts that its size matches."""
        if not self.hits:
            self.hits.reset(numpy.zeros(output_shape, dtype=numpy.int32))
        else:
            assert self.hits.size == int(numpy.prod(output_shape))

    def _gpu_init(self, blas_class):
        """Backend-independent device setup.

        Builds the program, fetches the "DirectPack" kernel (and
        "apply_hits" when hits are used), requests the temporary unpack
        buffer and creates the gemm callable from ``blas_class``.
        """
        defines = {
            "USE_ATOMICS": 1,
            "WEIGHTS_TRANSPOSED": int(self.weights_transposed),
            "BATCH": self._output_shape[0],
            "SX": self._sx,
            "SY": self._sy,
            "N_CHANNELS": self._n_channels,
            "KX": self.kx,
            "KY": self.ky,
            "N_KERNELS": self.n_kernels,
            "PAD_LEFT": self.padding[0],
            "PAD_TOP": self.padding[1],
            "PAD_RIGHT": self.padding[2],
            "PAD_BOTTOM": self.padding[3],
            "SLIDE_X": self.sliding[0],
            "SLIDE_Y": self.sliding[1],
            "USE_HITS": int(bool(self.hits)),
            # 1 without hits, 2 with hits.
            "DECONV_MODE": int(bool(self.hits)) + 1,
            "OUTPUT_SIZE": self.output.size
        }

        self.build_program(
            defines,
            "%s/%s_%d_%dx%dx%d_%dx%d_%d" %
            (root.common.dirs.cache, self.__class__.__name__,
             self.input.shape[0], self._output_shape[2], self._output_shape[1],
             self._output_shape[3], self.kx, self.ky, self.n_kernels),
            dtype=self._dtype)

        self.krn_pack_ = self.get_kernel("DirectPack")
        # Temporary buffer sized for one sub-block of ``unpack_size`` images.
        unpack_bytes = (self._kernel_app_per_image * self.unpack_size *
                        self._kernel_size * self.input.itemsize)
        self.device.request_temp_buffer(unpack_bytes)

        if self.hits:
            self.krn_pack_.set_arg(3, self.hits.devmem)

            self.krn_apply_hits_ = self.get_kernel("apply_hits")
            self.krn_apply_hits_.set_args(self.output.devmem, self.hits.devmem)

        self.gemm_ = blas_class.gemm(self._dtype)
        self.np_one = numpy.ones(1, dtype=self._dtype)
        self.np_zero = numpy.zeros(1, dtype=self._dtype)
        self._const_i = numpy.zeros(1, dtype=numpy.int64)

    def ocl_init(self):
        """OpenCL setup: kernels for clearing output/hits and launch sizes."""
        ocl_blas.OCLBLAS.attach_to_device(self.device)
        self._gpu_init(ocl_blas.OCLBLAS)

        self._global_size_pack = lambda size: (size, )
        self._local_size_pack = None

        if self.hits:
            self.krn_clear_hits_ = self.get_kernel("clear_hits")
            self.krn_clear_hits_.set_arg(0, self.hits.devmem)

            self._global_size_hits = (self.output.size, )
            self._local_size_hits = None

        self.krn_clear_output_ = self.get_kernel("clear_output")
        self.krn_clear_output_.set_arg(0, self.output.devmem)

        self._clear_output = lambda: (self.execute_kernel(
            (self.output.size, ), None, self.krn_clear_output_))
        # krn_clear_hits_ only exists when hits are used; gpu_run() calls
        # this lambda only in that case.
        self._clear_hits = lambda: (self.execute_kernel(
            (self.hits.size, ), None, self.krn_clear_hits_))

        self._process_subblock = self._ocl_process_subblock

        self.krn_pack_.set_arg(1, self.output.devmem)

    def cuda_init(self):
        """CUDA setup: launch sizes from suggested block sizes and
        memset-based clearing of output/hits."""
        self._gpu_init(cublas.CUBLAS)

        block_size = self.device.suggest_block_size(self.krn_pack_)
        # ceil(size / block_size) in the first dimension.
        self._global_size_pack = (lambda size:
                                  (int(numpy.ceil(size / block_size)), 1, 1))
        self._local_size_pack = (block_size, 1, 1)

        if self.hits:
            block_size = self.device.suggest_block_size(self.krn_apply_hits_)
            self._global_size_hits = (int(
                numpy.ceil(self.output.size / block_size)), 1, 1)
            self._local_size_hits = (block_size, 1, 1)

        self._clear_output = lambda: self.output.devmem.memset32_async()
        self._clear_hits = lambda: self.hits.devmem.memset32_async()

        self._process_subblock = self._cuda_process_subblock

    def ocl_run(self):
        """Runs the layer on OpenCL (shared implementation in gpu_run)."""
        self.gpu_run()

    def cuda_run(self):
        """Runs the layer on CUDA (shared implementation in gpu_run)."""
        self.gpu_run()

    def gpu_run(self):
        """Clears the output (and hits), processes the batch in sub-blocks
        of at most ``unpack_size`` images and finally applies hits."""
        self.unmap_vectors(self.output, self.input, self.weights)
        unpack_data = self.device.get_temp_buffer()
        self._clear_output()
        if self.hits:
            self.hits.unmap()
            self._clear_hits()
        batch_size = self.output.shape[0]
        for i in range(0, batch_size, self.unpack_size):
            # The last sub-block may hold fewer than unpack_size images.
            self._process_subblock(i, min(batch_size - i, self.unpack_size),
                                   unpack_data)
        if self.hits:
            self.execute_kernel(self._global_size_hits, self._local_size_hits,
                                self.krn_apply_hits_)

    def _cuda_process_subblock(self, start_image, image_count, unpack_data):
        """Processes ``image_count`` images starting at ``start_image``:
        gemm into ``unpack_data``, then the pack kernel into the output."""
        # Despite the name, this is the byte offset of the sub-block inside
        # the *input* buffer (it is added to input.devmem below).
        output_offs = (start_image * self.input.sample_size *
                       self.input.itemsize)
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(
            self.device.blas, cublas.CUBLAS_OP_T
            if self.weights_transposed else cublas.CUBLAS_OP_N,
            cublas.CUBLAS_OP_N, self._kernel_size, unpack_side,
            self.weights_shape[0], self.np_one, self.weights.devmem,
            int(self.input.devmem) + output_offs, self.np_zero, unpack_data)

        self.krn_pack_.set_arg(0, unpack_data)
        # The output pointer is advanced to the first image of the sub-block.
        self.krn_pack_.set_arg(
            1,
            int(self.output.devmem) +
            start_image * self.output.sample_size * self.output.itemsize)
        limit = unpack_side * self._kernel_size
        self._const_i[0] = limit
        self.krn_pack_.set_arg(2, self._const_i)
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def _ocl_process_subblock(self, start_image, image_count, unpack_data):
        """OpenCL counterpart of _cuda_process_subblock(); offsets are
        passed as arguments instead of being added to buffer addresses."""
        # Offset into the input (not multiplied by itemsize), passed to
        # gemm as offsetB.
        output_offs = start_image * self.input.sample_size
        unpack_side = self._kernel_app_per_image * image_count

        self.gemm_(self.device.blas,
                   cublas.CUBLAS_OP_T
                   if self.weights_transposed else cublas.CUBLAS_OP_N,
                   cublas.CUBLAS_OP_N,
                   self._kernel_size,
                   unpack_side,
                   self.weights_shape[0],
                   self.np_one,
                   self.weights.devmem,
                   self.input.devmem,
                   self.np_zero,
                   unpack_data,
                   offsetB=output_offs)

        self.krn_pack_.set_arg(0, unpack_data)
        # Here argument 2 carries the output offset of the sub-block (the
        # CUDA path passes the element limit instead).
        self._const_i[0] = start_image * self.output.sample_size
        self.krn_pack_.set_arg(2, self._const_i)
        limit = unpack_side * self._kernel_size
        self.execute_kernel(self._global_size_pack(limit),
                            self._local_size_pack, self.krn_pack_)

    def numpy_run(self):
        """Not available: there is no numpy implementation of this layer."""
        raise NotImplementedError()

Пример #7

Показать файл

Файл: kohonen.py Проект: vmarkovtsev/veles.znicz

class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer: stores the winning neuron for every sample.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins (optional) speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: the overall winners table; an Array only when total=True is
            passed in __init__(), otherwise None.
    """

    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        if kwargs.get("total", False):
            self.total = Array()
            # Expected to be assigned from outside before initialize()
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None
        else:
            self.total = None

    def init_unpickled(self):
        """Register the "kohonen" source with the FORWARD define."""
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        """First dimension of the weights array."""
        weights_shape = self.weights.mem.shape
        return weights_shape[0]

    @property
    def sample_length(self):
        """Second dimension of the weights array."""
        weights_shape = self.weights.mem.shape
        return weights_shape[1]

    @property
    def chunk_size(self):
        """Value of the private ``_chunk_size_`` attribute."""
        # NOTE(review): nothing in this class changes _chunk_size_ after
        # __init__ sets it to 0 - confirm that this is intended.
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocate output and, when needed, distances and total arrays."""
        super(KohonenForward, self).initialize(device=device, **kwargs)

        input_shape = self.input.mem.shape
        assert input_shape[1] == self.sample_length
        samples = input_shape[0]

        self.output.reset(numpy.zeros(samples, dtype=numpy.int32))
        if self.argmins is None:
            # Winners have to be computed here, so distances are required
            distances = numpy.zeros([samples, self.neurons_number],
                                    dtype=self.weights.mem.dtype)
            self._distances.reset(distances)

        if self.total is None:
            return
        self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
        self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Initialize device buffers, build the program and set up kernels."""
        samples = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            for vec in (self.input, self.weights, self._distances):
                vec.initialize(self.device)
        elif self.total is None:
            # Winners are copied from argmins and there is no total table,
            # so ocl_run() will not launch any kernel.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        # NOTE(review): the ceil() expressions below rely on true division;
        # under Python 2 this needs "from __future__ import division" -
        # confirm against the module header.
        max_group = self.device.max_group_size
        copy_chunk = int(numpy.ceil(samples / max_group))
        argmin_chunk = self.neurons_number // max_group
        if argmin_chunk < 2:
            argmin_chunk = self.neurons_number // 2 + 1
        self.argmin_group_size = int(
            numpy.ceil(self.neurons_number / argmin_chunk))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': samples,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': argmin_chunk,
            'COPY_CHUNK_SIZE': copy_chunk,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        program_tag = "%s_%d_%d_%d" % (
            self.__class__.__name__, samples, self.sample_length,
            self.neurons_number)
        self.build_program(defines, program_tag,
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = [
                int(numpy.ceil(samples / copy_chunk))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Argument 1 is skipped here; ocl_run() sets it on every run
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(
            self.input.devmem, self.weights.devmem, self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(
            self._distances.devmem, self.output.devmem, None)

        self._gs_distance = [roundup(self.neurons_number, block_size),
                             roundup(samples, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        """Fill output (and total, if enabled) on the OpenCL path."""
        keep_total = self.total is not None

        self.output.unmap()
        if keep_total:
            self.total.unmap()

        if self.argmins is not None:
            # Winners are already known: copy them through host mappings
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()
        else:
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            group = self.argmin_group_size
            self.execute_kernel([group], [group], self._krn_argmin_)

        if not keep_total:
            return
        self._minibatch_offset_[0] = \
            self.minibatch_offset - self.minibatch_size
        self._krn_set_total_.set_arg(1, self._minibatch_offset_)
        self.execute_kernel(self._set_total_global_size_, None,
                            self._krn_set_total_)

    def numpy_run(self):
        """Fill output (and total, if enabled) using numpy.

        Without argmins the winner of a sample is the argmin of
        ``numpy_linalg_norm(weights - sample)``.
        """
        keep_total = self.total is not None
        have_argmins = self.argmins is not None

        self.output.map_invalidate()
        if have_argmins:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if keep_total:
            self.total.map_invalidate()
            count = self.minibatch_size
        else:
            count = self.input.mem.shape[0]

        for row in range(count):
            if have_argmins:
                best = self.argmins[row]
            else:
                delta = self.weights.mem - self.input[row]
                best = numpy.argmin(self.numpy_linalg_norm(delta))
                self.output[row] = best
            if keep_total:
                position = row + self.minibatch_offset - self.minibatch_size
                self.total[position] = best

Пример #8

Показать файл

Файл: nn_units.py Проект: vmarkovtsev/veles.znicz

class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights (None until assigned).
        bias: bias (None until assigned).
        batch_size: effective minibatch size (if missing or None, the first
            dimension of err_output is used, see current_batch_size).
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        accumulate_gradient: enables the acc_alpha / acc_beta / gd_alpha /
            gd_beta accumulation described in __init__.
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
                            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        """Read hyperparameters from kwargs and create the helper arrays.

        Recognized kwargs (with defaults): view_group ("TRAINER"),
        learning_rate (0.01), learning_rate_bias (learning_rate),
        weights_decay (0.00005), weights_decay_bias (0.0), l1_vs_l2 (0),
        l1_vs_l2_bias (l1_vs_l2), gradient_moment (0),
        gradient_moment_bias (gradient_moment), weights_transposed (False),
        need_err_input (True), include_bias (True), factor_ortho (0),
        accumulate_gradient (False), acc_alpha (0.0), acc_beta (0.0),
        gd_alpha (0.0), gd_beta (1.0),
        apply_gradient (not workflow.is_slave).
        """
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        """Return int(batch_size), or err_output's first dimension.

        The fallback is used when the ``batch_size`` attribute is missing
        or is None.
        """
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        """Refresh hyperparameters and allocate/initialize all arrays.

        Hyperparameters present in kwargs (learning_rate, weights_decay,
        gradient_moment and their ``_bias`` variants) override the current
        values. Arrays which already hold data are not reallocated; most of
        them are only checked for a matching size by assertion.
        """
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            # Logical shape of the weights, independent of the storage layout
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed else
                                  self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        # The moment buffer is also allocated when the unit is not
        # standalone, regardless of gradient_moment.
        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        # Unlike the weights gradient, the bias gradient is silently
        # reallocated on a size mismatch instead of asserting.
        if (self.include_bias and self.bias
                and (not self.gradient_bias
                     or self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
            (not self.accumulated_gradient_bias
             or self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias
                and (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # NOTE(review): reduce_size is not assigned anywhere in this
            # class (only the REDUCE_SIZE constant is defined); presumably
            # a subclass sets it before calling initialize() - confirm.
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(self.err_output, self.gradient_weights,
                          self.gradient_bias,
                          self.accumulated_gradient_weights,
                          self.accumulated_gradient_bias,
                          self.gradient_weights_with_moment,
                          self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        """Launch the weights update kernel with current hyperparameters.

        When factor_ortho is set, the column sums kernel is executed first
        and factor_ortho is passed as kernel argument 12.
        """
        self.unmap_vectors(self.input, self.err_output, self.weights,
                           self.gradient_weights,
                           self.accumulated_gradient_weights,
                           self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)

            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Slots 4..11 hold the scalar hyperparameters; each one is handed
        # to the kernel as a one-element slice.
        self._weights_const[4:12] = (self.learning_rate, self.weights_decay,
                                     self.l1_vs_l2, self.gradient_moment,
                                     self.acc_alpha, self.acc_beta,
                                     self.gd_alpha, self.gd_beta)
        # device.skip(4) presumably leaves kernel arguments 0..3 untouched
        # - confirm against the device API.
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        """Launch the bias update kernel; does nothing if bias is excluded."""
        if not self.include_bias:
            return

        self.unmap_vectors(self.err_output, self.bias, self.gradient_bias,
                           self.accumulated_gradient_bias,
                           self.gradient_bias_with_moment)

        # Slots 5..12 hold the scalar hyperparameters for the bias kernel
        self._bias_const[5:13] = (self.learning_rate_bias,
                                  self.weights_decay_bias, self.l1_vs_l2_bias,
                                  self.gradient_moment_bias, self.acc_alpha,
                                  self.acc_beta, self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(self.device.skip(5), self._bias_const[5:6],
                                self._bias_const[6:7], self._bias_const[7:8],
                                self._bias_const[8:9], self._bias_const[9:10],
                                self._bias_const[10:11],
                                self._bias_const[11:12],
                                self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.

        Does nothing when krn_err_output_ is None.
        """
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output, self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.

        The base implementation is a no-op.
        """
        pass

    def print_debug_data(self):
        """Log statistics of weights, bias and their gradients.

        Emits a table with mean, standard deviation, minimum and maximum of
        each array through self.debug(). Does nothing unless DEBUG logging
        is enabled. Arrays which are absent (weights / bias left as None,
        or an Array without data) are reported with None statistics.
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return

        def mapped_mem(vector):
            # weights and bias are None until assigned (see __init__);
            # calling map_read() on None would raise AttributeError before
            # the "is not None" check below gets a chance to run.
            if vector is None:
                return None
            vector.map_read()
            return vector.mem

        weights = mapped_mem(self.weights)
        bias = mapped_mem(self.bias)
        grad_bias = mapped_mem(self.gradient_bias)
        grad_weights = mapped_mem(self.gradient_weights)

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        """Return the six hyperparameters sent to a slave.

        The tuple order matches what apply_data_from_master() unpacks.
        """
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        """Zero the contents of ``vector``; falsy vectors are ignored."""
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        """Adopt the master's hyperparameters and zero all gradient buffers.

        Args:
            data: sequence laid out as in generate_data_for_slave().
        """
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        """Return (weights gradient, bias gradient) with moments, or None.

        None is returned when gradient_changed is False; otherwise the flag
        is cleared and the mapped ``mem`` of both moment arrays is returned.
        """
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        """Fold a slave's gradients into the moments and apply them.

        For weights (and likewise for bias):
        moment = moment * gradient_moment + data[i]; weights += moment.

        Args:
            data: pair laid out as in generate_data_for_master().
            slave: not used here.
        """
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        """No per-slave state is kept here, so nothing is done."""
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Apply the gradient accumulation scheme in place.

        When accumulate_gradient is enabled and ``accumulated_gradient`` is
        truthy:
            acc = acc_alpha * gradient + acc_beta * acc
            gradient = gd_beta * gradient + gd_alpha * acc
        Both arguments are modified in place.

        Returns:
            ``gradient`` (the same object that was passed in).
        """
        if accumulated_gradient and self.accumulate_gradient:
            # The acc_beta term is skipped entirely when acc_beta is zero
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient if self.acc_beta else 0))

            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient

        return gradient

    @staticmethod
    def numpy_gradient_step(weight,
                            gradient,
                            lr,
                            factor_l12,
                            l1_vs_l2,
                            factor_ortho=0,
                            weights_transposed=False):
        """Return the regularized gradient scaled by the learning rate.

        Works on a copy, so the ``gradient`` argument is left untouched.

        Args:
            weight: current weights.
            gradient: raw gradient.
            lr: learning rate the result is multiplied by.
            factor_l12: factor of the L1/L2 regularization term.
            l1_vs_l2: mixes the terms; the ``weight`` term is scaled by
                (1 - l1_vs_l2) and the ``sign(weight)`` term by
                0.5 * l1_vs_l2.
            factor_ortho: when non-zero, adds
                (col_sums - weight[i]) * factor_ortho / weight.shape[0]
                to every row i of the gradient.
            weights_transposed: selects how col_sums is computed.
        """
        gradient = gradient.copy()
        gradient += factor_l12 * (
            (1.0 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(
                axis=1) if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        """Run the unit, bracketing the call with the state flags.

        gradient_changed is raised before the parent's run() and
        ocl_set_const_args is cleared afterwards.
        """
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False

Пример #9

Показать файл

Файл: nn_units.py Проект: vmarkovtsev/veles.znicz

class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
                            had been changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient", not workflow.is_slave)

    @property
    def current_batch_size(self):
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = tuple(reversed(self.weights.shape)) if self.weights_transposed else self.weights.shape
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment", self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.accumulated_gradient_weights.size == self.weights.size

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == self.weights.size

        if self.include_bias and self.bias and (not self.gradient_bias or self.gradient_bias.size != self.bias.size):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (
            self.include_bias
            and self.bias
            and self.accumulate_gradient
            and (not self.accumulated_gradient_bias or self.accumulated_gradient_bias.size != self.bias.size)
        ):
            self.accumulated_gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if self.include_bias and self.bias and (self.gradient_moment_bias or not self.is_standalone):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights,
            self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment,
        )

    def gpu_weights_update(self):
        """Launch the weights update kernel.

        Unmaps every vector involved, optionally runs the column sums kernel
        (only when ``factor_ortho`` is truthy), copies the current
        hyperparameters into ``_weights_const`` and finally executes
        ``krn_weights_``.
        """
        self.unmap_vectors(
            self.input, self.err_output, self.weights,
            self.gradient_weights, self.accumulated_gradient_weights,
            self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(
                self._global_size_ortho, self._local_size_ortho,
                self.krn_compute_col_sums_)

            # Slot 12 of the constants carries the orthogonalization factor.
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Slots 4..11 are refreshed on every call so that hyperparameter
        # changes made between runs reach the kernel.
        self._weights_const[4:12] = (
            self.learning_rate, self.weights_decay, self.l1_vs_l2,
            self.gradient_moment, self.acc_alpha, self.acc_beta,
            self.gd_alpha, self.gd_beta)
        # Each constant is handed over as a one-element slice; skip(4)
        # presumably leaves the first four kernel arguments untouched.
        self.krn_weights_.set_args(
            self.device.skip(4),
            *[self._weights_const[slot:slot + 1] for slot in range(4, 12)])

        self.execute_kernel(
            self._global_size_weights, self._local_size_weights,
            self.krn_weights_)

    def gpu_bias_update(self):
        """Launch the bias update kernel; a no-op when bias is not included.

        Unmaps the bias related vectors, copies the current bias
        hyperparameters into ``_bias_const`` and executes ``krn_bias_``.
        """
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output, self.bias, self.gradient_bias,
            self.accumulated_gradient_bias, self.gradient_bias_with_moment)

        # Slots 5..12 are refreshed on every call so that hyperparameter
        # changes made between runs reach the kernel.
        self._bias_const[5:13] = (
            self.learning_rate_bias, self.weights_decay_bias,
            self.l1_vs_l2_bias, self.gradient_moment_bias,
            self.acc_alpha, self.acc_beta, self.gd_alpha, self.gd_beta)
        # Each constant is handed over as a one-element slice; skip(5)
        # presumably leaves the first five kernel arguments untouched.
        self.krn_bias_.set_args(
            self.device.skip(5),
            *[self._bias_const[slot:slot + 1] for slot in range(5, 13)])

        self.execute_kernel(
            self._global_size_bias, self._local_size_bias, self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.

        Does nothing when no ``krn_err_output_`` kernel has been set up.
        """
        if self.krn_err_output_ is not None:
            # Both buffers are unmapped before the kernel is launched.
            for vector in (self.err_output, self.output):
                vector.unmap()
            self.execute_kernel(
                self._global_size_err_output, self._local_size_err_output,
                self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.

        This implementation is intentionally a no-op.
        """
        return None

    def print_debug_data(self):
        """Log mean/stddev/min/max of the weights, bias and their gradients.

        Returns immediately unless the logger is enabled for DEBUG.
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return

        # map_read() order: weights, bias, bias gradient, weights gradient.
        for vector in (self.weights, self.bias,
                       self.gradient_bias, self.gradient_weights):
            vector.map_read()

        rows = (
            ("Weight", self.weights.mem),
            ("Bias", self.bias.mem),
            ("Grad Weight", self.gradient_weights.mem),
            ("Grad Bias", self.gradient_bias.mem),
        )

        table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        table.float_format = ".10"
        for label, values in rows:
            if values is None or values.size <= 0:
                # Nothing to measure: the row is filled with None.
                stats = (None, None, None, None)
            else:
                stats = (numpy.mean(values), numpy.std(values),
                         numpy.min(values), numpy.max(values))
            table.add_row(label, *stats)
        self.debug("\n" + table.get_string())

    def generate_data_for_slave(self, slave):
        """Return the hyperparameters to be sent to a slave.

        The result is a 6-tuple: learning rate, weights decay and gradient
        moment for the weights, followed by the same three for the bias.
        The ``slave`` argument is not used.
        """
        for_weights = (
            self.learning_rate, self.weights_decay, self.gradient_moment)
        for_bias = (
            self.learning_rate_bias, self.weights_decay_bias,
            self.gradient_moment_bias)
        return for_weights + for_bias

    @staticmethod
    def fill_zeros(vector):
        """Overwrite the host memory of ``vector`` with zeros.

        Falsy vectors (e.g. ``None``) are silently skipped.
        """
        if vector:
            # The old contents are discarded anyway, hence map_invalidate().
            vector.map_invalidate()
            vector.mem[:] = 0

    def apply_data_from_master(self, data):
        """Take over the hyperparameters sent by the master and reset
        all the gradient buffers to zero.

        ``data`` is indexed from 0 to 5 in this order: learning rate,
        weights decay, gradient moment, then the same three for the bias.
        """
        received = (
            "learning_rate", "weights_decay", "gradient_moment",
            "learning_rate_bias", "weights_decay_bias",
            "gradient_moment_bias")
        for position, attribute in enumerate(received):
            setattr(self, attribute, data[position])

        stale = (
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment,
            self.gradient_weights,
            self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
        )
        for vector in stale:
            self.fill_zeros(vector)

    def generate_data_for_master(self):
        """Return the gradients (with moment) to be sent to the master.

        Returns None when ``gradient_changed`` is not set; otherwise clears
        that flag and returns the pair
        ``(weights gradient memory, bias gradient memory)``.
        """
        if self.gradient_changed:
            self.gradient_changed = False
            outgoing = (self.gradient_weights_with_moment,
                        self.gradient_bias_with_moment)
            # Map both vectors for reading before their memory is handed out.
            for vector in outgoing:
                vector.map_read()
            return tuple(vector.mem for vector in outgoing)
        return None

    def apply_data_from_slave(self, data, slave):
        """Merge the gradients received from a slave into the parameters.

        For the weights (and likewise for the bias) the stored gradient with
        moment is scaled by the corresponding moment, the received gradient
        is added to it, and the result is added to the parameter itself.
        ``data[0]`` holds the weights gradient, ``data[1]`` the bias one.
        Falsy ``weights`` / ``bias`` are skipped. ``slave`` is not used.
        """
        def absorb(parameter, velocity, moment, slot):
            parameter.map_write()
            velocity.map_write()
            velocity.mem *= moment
            # data is indexed only when the parameter is actually updated.
            velocity.mem += data[slot]
            parameter.mem += velocity.mem

        if self.weights:
            absorb(self.weights, self.gradient_weights_with_moment,
                   self.gradient_moment, 0)
        if self.bias:
            absorb(self.bias, self.gradient_bias_with_moment,
                   self.gradient_moment_bias, 1)

    def drop_slave(self, slave):
        """Do nothing: there is no per-slave state to clean up here."""
        return None

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Blend ``gradient`` with the accumulated gradient, in place.

        When ``accumulated_gradient`` is truthy and ``accumulate_gradient``
        is set:

            accumulated = gradient * acc_alpha + acc_beta * accumulated
            gradient = gradient * gd_beta + gd_alpha * accumulated

        Both arrays are modified in place. Otherwise ``gradient`` is
        returned untouched.

        Returns:
            The ``gradient`` object that was passed in.
        """
        if not (accumulated_gradient and self.accumulate_gradient):
            return gradient

        fresh = gradient * self.acc_alpha
        if self.acc_beta:
            carried = self.acc_beta * accumulated_gradient
        else:
            # A zero acc_beta drops the previous accumulator entirely.
            carried = 0
        accumulated_gradient[:] = fresh + carried

        gradient *= self.gd_beta
        gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2, factor_ortho=0, weights_transposed=False):
        """Return the regularized gradient scaled by the learning rate.

        Works on a copy, so the ``gradient`` argument is left intact.

        Args:
            weight: current parameter values.
            gradient: raw gradient for ``weight``.
            lr: learning rate the result is multiplied by.
            factor_l12: overall strength of the L1/L2 penalty.
            l1_vs_l2: balance between L1 (1.0) and L2 (0.0) penalties.
            factor_ortho: strength of the extra per-row term built from the
                column sums; a falsy value disables it.
            weights_transposed: when set, the column sums are taken from
                ``reshape_transposed(weight)`` along axis 1.
        """
        step = gradient.copy()

        l2_term = (1.0 - l1_vs_l2) * weight
        l1_term = 0.5 * l1_vs_l2 * numpy.sign(weight)
        step += factor_l12 * (l2_term + l1_term)

        if factor_ortho:
            if weights_transposed:
                col_sums = reshape_transposed(weight).sum(axis=1)
            else:
                col_sums = weight.sum(axis=0)
            n_rows = weight.shape[0]
            # Each row of step is updated in place.
            for index, step_row in enumerate(step):
                step_row += (col_sums - weight[index]) * factor_ortho / n_rows

        step *= lr
        return step

    def run(self):
        """Run the unit via the parent class.

        ``gradient_changed`` is raised before the parent's run() and
        ``ocl_set_const_args`` is cleared after it.
        """
        self.gradient_changed = True
        parent = super(GradientDescentBase, self)
        parent.run()
        self.ocl_set_const_args = False

Пример #10

Показать файл

Файл: weights_zerofilling.py Проект: vmarkovtsev/veles.znicz

class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step.

    On every run the weights are multiplied element-wise by ``mask``.
    The mask element at (kernel, chan) is 0 when
    ``kernel % grouping == chan % grouping`` and 1 otherwise.

    Attributes:
        mask: Array holding the mask, shaped as ``effective_shape``.
        grouping: integer >= 2 which defines the mask pattern.
        weights: demanded; the Array which gets masked.
    """

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        # NOTE(review): the fallback value 1 is rejected by the grouping
        # setter below (it requires >= 2), so "grouping" is effectively a
        # mandatory keyword argument - confirm that this is intended.
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        """The weights shape flattened to 2D: (shape[0], size // shape[0])."""
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        """Validate and store grouping.

        Raises:
            TypeError: value is not an int.
            ValueError: value is less than 2.
        """
        if not isinstance(value, int):
            raise TypeError("grouping value must be an integer (got %s)" %
                            type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        """Create the mask (unless it already exists) and initialize the
        mask and the weights on the device.

        Returns:
            True when the weights are not available yet (presumably a
            request to retry the initialization later - confirm), otherwise
            None.

        Raises:
            ValueError: the second dimension of effective_shape is not
                a multiple of grouping.
        """
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            kernels, chans = self.effective_shape
            if chans % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" % (self.weights.shape, self.grouping))
            self.mask.reset(
                numpy.zeros((kernels, chans), dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            # Build the whole mask with one broadcast comparison instead of
            # a Python-level loop over every (kernel, chan) pair.
            kernel_group = \
                numpy.arange(kernels)[:, numpy.newaxis] % self.grouping
            chan_group = numpy.arange(chans)[numpy.newaxis, :] % self.grouping
            self.mask.mem[...] = kernel_group != chan_group
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        """Multiply the weights by the mask on the host."""
        self.mask.map_read()
        self.weights.map_write()

        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()

Пример #11

Показать файл

Файл: pooling.py Проект: vmarkovtsev/veles.znicz

class OffsetPooling(Pooling):
    """Forward pooling which records where every output element came from.

    Must be assigned before initialize():
        input

    Updates after run():
        input_offset

    Creates within initialize():
        input_offset

    Attributes:
        input_offset: for each output element, the flat index into the input
                      of the element that was passed through.
    """

    MAPPING = set()
    hide_from_registry = True

    def __init__(self, workflow, **kwargs):
        super(OffsetPooling, self).__init__(workflow, **kwargs)
        self.input_offset = Array()
        self.demand("input")

    def initialize(self, device, **kwargs):
        """Allocate input_offset (int32, shaped like output) if it is still
        empty and initialize it on the device.

        Nothing is allocated when ``_no_output`` is set.
        """
        super(OffsetPooling, self).initialize(device=device, **kwargs)

        if self._no_output:
            return
        if self.input_offset:
            # A previously created buffer must match the output shape.
            assert self.input_offset.shape == self.output.shape
        else:
            offsets = numpy.zeros(self.output.shape, dtype=numpy.int32)
            self.input_offset.reset(offsets)
        self.input_offset.initialize(self.device)

    def set_args(self, *args):
        """Bind input, output and input_offset ahead of the extra args."""
        super(OffsetPooling, self).set_args(
            self.input, self.output, self.input_offset, *args)

    def ocl_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).ocl_run()

    def cuda_run(self):
        self.input_offset.unmap()
        super(OffsetPooling, self).cuda_run()

    def numpy_run(self):
        self.input_offset.map_invalidate()
        super(OffsetPooling, self).numpy_run()

    def numpy_run_cut(self, cut, coords):
        """Pick one element of ``cut`` and remember its input offset.

        ``coords`` unpacks to (batch, y1, x1, ch, out_y, out_x). The element
        is chosen by numpy_run_cut_offset(); its flat index within the input
        is stored into input_offset and its value is returned.
        """
        sample, top, left, channel, out_row, out_col = coords
        out_pos = (sample, out_row, out_col, channel)
        flat_out = numpy.ravel_multi_index(out_pos, self.output.shape)
        chosen = self.numpy_run_cut_offset(cut, flat_out)
        # Position of the chosen element inside the cut window.
        d_row, d_col = numpy.unravel_index(chosen, cut.shape)
        flat_in = numpy.ravel_multi_index(
            (sample, top + d_row, left + d_col, channel), self.input.shape)
        picked = numpy.ravel(self.input.mem)[flat_in]
        self.input_offset.mem[out_pos] = flat_in
        return picked

Пример #12

Показать файл

Файл: kohonen.py Проект: Samsung/veles.znicz

class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        # When argmins is set, the run methods copy the winners from it
        # instead of computing the distances.
        self.argmins = None
        # Reset to a (batch, neurons) buffer in initialize() when argmins
        # is None.
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        # The winners table exists only if the "total" kwarg is truthy.
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        """First dimension of the weights memory."""
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        """Second dimension of the weights memory."""
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocate the host side buffers: output, the distances scratch
        (only when argmins is None) and the total table (only if enabled).
        """
        super(KohonenForward, self).initialize(device=device, **kwargs)

        # Each input sample must be as long as a neuron's weights vector.
        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(numpy.zeros(
                [batch_size, self.neurons_number],
                dtype=self.weights.mem.dtype))

        if self.total is not None:
            # Sized by the assigned self.batch_size attribute, not by the
            # local batch_size taken from the input shape above.
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            # One-element buffer passed to the set_total kernel in ocl_run().
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Initialize the device buffers, build the program and set up the
        kernels which are needed for the current configuration.
        """
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # argmins is supplied and there is no total table: no kernels
            # are used by ocl_run() in this configuration.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        # Preprocessor definitions passed to build_program().
        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Argument 1 is skipped here; it is set on every ocl_run().
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            # The winners come from argmins, so the distance and argmin
            # kernels are not set up.
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        # NOTE(review): the purpose of the trailing None argument is not
        # visible here - check it against the kernel source.
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        # Global size is rounded up to a multiple of block_size per axis.
        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        """Fill output with the winners and, if enabled, update total."""
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Compute the distances, then reduce them to the winners.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Copy the ready winners on the host, then unmap both arrays
            # again.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # The kernel receives the starting offset of the current
            # minibatch: minibatch_offset - minibatch_size.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        """Host implementation: fill output with the winners and, if
        enabled, store them into total at the current minibatch position.
        """
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        # With total enabled only minibatch_size samples are processed,
        # otherwise every row of the input.
        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                # The winner is the argmin of numpy_linalg_norm() applied to
                # the differences between the weights and the sample.
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner

Пример #13

Показать файл

Файл: weights_zerofilling.py Проект: Samsung/veles.znicz

class ZeroFiller(ForwardBase, TriviallyDistributable):
    """Fills weights of given unit with zero on every step.

    On every run the weights are multiplied element-wise by ``mask``.
    The mask element at (kernel, chan) is 0 when
    ``kernel % grouping == chan % grouping`` and 1 otherwise.

    Attributes:
        mask: Array holding the mask, shaped as ``effective_shape``.
        grouping: integer >= 2 which defines the mask pattern.
        weights: demanded; the Array which gets masked.
    """

    MAPPING = {"zero_filter"}

    def __init__(self, workflow, **kwargs):
        super(ZeroFiller, self).__init__(workflow, **kwargs)

        self.mask = Array()
        # NOTE(review): the fallback value 1 is rejected by the grouping
        # setter below (it requires >= 2), so "grouping" is effectively a
        # mandatory keyword argument - confirm that this is intended.
        self.grouping = kwargs.get("grouping", 1)
        self.demand("weights")

    def init_unpickled(self):
        super(ZeroFiller, self).init_unpickled()
        self.sources_["weights_zerofilling"] = {}

    @property
    def effective_shape(self):
        """The weights shape flattened to 2D: (shape[0], size // shape[0])."""
        return (self.weights.shape[0],
                self.weights.size // self.weights.shape[0])

    @property
    def grouping(self):
        return self._grouping

    @grouping.setter
    def grouping(self, value):
        """Validate and store grouping.

        Raises:
            TypeError: value is not an int.
            ValueError: value is less than 2.
        """
        if not isinstance(value, int):
            raise TypeError(
                "grouping value must be an integer (got %s)" % type(value))
        if value < 2:
            raise ValueError("grouping value %d is invalid" % value)
        self._grouping = value

    def initialize(self, device=None, **kwargs):
        """Create the mask (unless it already exists) and initialize the
        mask and the weights on the device.

        Returns:
            True when the weights are not available yet (presumably a
            request to retry the initialization later - confirm), otherwise
            None.

        Raises:
            ValueError: the second dimension of effective_shape is not
                a multiple of grouping.
        """
        super(ZeroFiller, self).initialize(device, **kwargs)
        if not self.weights:
            return True

        if not self.mask:
            kernels, chans = self.effective_shape
            if chans % self.grouping != 0:
                raise ValueError(
                    "Non-multiple of grouping weights shape detected: "
                    "%s, grouping=%d" %
                    (self.weights.shape, self.grouping))
            self.mask.reset(numpy.zeros((kernels, chans),
                                        dtype=self.weights.dtype))
            self.mask.map_invalidate()
            # TODO(a.kazantsev): add check for transposed weights.
            # Build the whole mask with one broadcast comparison instead of
            # a Python-level loop over every (kernel, chan) pair.
            kernel_group = \
                numpy.arange(kernels)[:, numpy.newaxis] % self.grouping
            chan_group = numpy.arange(chans)[numpy.newaxis, :] % self.grouping
            self.mask.mem[...] = kernel_group != chan_group
        else:
            assert self.mask.shape == self.effective_shape

        for vec in self.mask, self.weights:
            vec.initialize(device)

    def _gpu_init(self):
        self.build_program(cache_file_name="zero_filling_%d" % self.grouping,
                           dtype=self.weights.dtype)

        self.assign_kernel("multiply_by_mask")
        self.set_args(self.mask, self.weights)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.weights.size]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        self._global_size = (self.weights.size, 1, 1)
        self._local_size = (1, 1, 1)

    def numpy_run(self):
        """Multiply the weights by the mask on the host."""
        self.mask.map_read()
        self.weights.map_write()

        self.weights.mem *= self.mask.mem

    def _gpu_run(self):
        self.weights.unmap()
        self.mask.unmap()
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_run(self):
        self._gpu_run()

    def cuda_run(self):
        self._gpu_run()