class FixAccumulator(Unit):
    """
    Range accumulator.

    Builds a fixed-range histogram of ``input`` into ``output``
    (``bars`` in-range bins plus one underflow and one overflow slot).
    The range is chosen by ``type``: "relu" -> [0, 10000],
    "tanh" -> [-1.7159, 1.7159].
    """
    def __init__(self, workflow, **kwargs):
        super(FixAccumulator, self).__init__(workflow)
        # Number of in-range histogram bins.
        self.bars = kwargs.get("bars", 200)
        # Activation type selecting the accumulation range ("relu"/"tanh").
        self.type = kwargs.get("type", "relu")
        self.input = None
        self.output = Array()
        # When True, the histogram is zeroed at the start of run().
        self.reset_flag = Bool(True)
        # Total slot count, published for consumers (set in run()).
        self.n_bars = [0]
        # Placeholder bounds; run() overwrites them based on self.type.
        self.max = 100
        self.min = 0

    def initialize(self, **kwargs):
        # bars in-range bins + 1 underflow + 1 overflow slot.
        self.output.mem = numpy.zeros([self.bars + 2], dtype=numpy.int64)

    def run(self):
        # Select the accumulation range from the activation type.
        if self.type == "relu":
            self.max = 10000
            self.min = 0
        elif self.type == "tanh":
            self.max = 1.7159
            self.min = -1.7159
        else:
            raise error.BadFormatError("Unsupported type %s" % self.type)
        d = self.max - self.min
        if not d:
            # Degenerate (zero-width) range: nothing to accumulate.
            return
        self.output.map_write()
        self.input.map_read()
        # Scale factor mapping [min, max] onto bin indices 0..bars-1.
        d = (self.bars - 1) / d
        if self.reset_flag:
            self.output.mem[:] = 0
        self.n_bars[0] = self.bars + 2
        for y in self.input.mem.ravel():
            if y < self.min:
                # Underflow counter lives in slot 0.
                self.output[0] += 1
                continue
            if y <= self.max and y > self.min:
                # NOTE(review): an in-range value near self.min maps to
                # index 0, which is the same slot as the underflow counter;
                # and y == self.min falls through to the overflow slot
                # below. Looks suspicious — confirm intended binning.
                i = int(numpy.floor((y - self.min) * d))
                self.output[i] += 1
                continue
            # Overflow counter lives in the last slot.
            self.output[self.bars + 1] += 1
class EvaluatorMSE(EvaluatorBase):
    """Evaluator of MSE (or RMSE, see ``root``) loss between network output
    and a continuous target.

    Must be assigned before initialize():
        output
        target
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        confusion_matrix
        max_err_output_sum
        n_err (only if labels and class_targets is not None)

    Creates within initialize():
        err_output
        n_err (only if labels and class_targets is not None)
        max_err_output_sum

    Attributes:
        output: output of the network_common as Batch.
        target: target for the current Batch.
        err_output: backpropagation errors.
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse, [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        krn_constants_i_: numpy array for constant arguments to kernel.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: number of wrongly recognized samples
            (if labels and class_targets is not None).
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, so it was a discarded string expression instead of
    # __doc__; it must be the first statement in the class body.

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        # Goes through the validating property setter below.
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE (mean sum of
        squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        """Validates output/target sizes and allocates metric buffers."""
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))
        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}
        dtype = self.output.dtype
        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds the OpenCL/CUDA program and binds kernel arguments.

        Returns the block size used for the evaluation kernel.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype":
                numpy_dtype_to_opencl(self.class_targets.dtype)
            }
        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0]
            if self.class_targets else None,
            coeffs=self.normalizer.coefficients)
        self.assign_kernel("evaluate_mse")
        # Args 2 and 3 (batch size / scale) are set per-run in _gpu_run().
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)
        if self.labels and self.class_targets:
            assert (self.labels.dtype == self.n_err.dtype == numpy.int32)
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem, self.class_targets.devmem,
                self.labels.devmem, self.n_err.devmem)
        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        """Shared OpenCL/CUDA run: evaluates MSE and (optionally) n_err."""
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)
        batch_size = self.batch_size
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])
        self.execute_kernel(self._global_size, self._local_size)
        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            self.n_err.map_write()
            # n_err[1] accumulates the number of evaluated samples.
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        """CPU reference implementation of the MSE evaluation."""
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert (self.output.size == self.target.size == self.err_output.size)
        batch_size = self.batch_size
        # All slices below must alias the backing arrays (checked).
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # Report MSE in denormalized (original) units, while the
            # backpropagated error stays in normalized units.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            # Count samples whose nearest class target is not their label.
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        """Writes (denormalized) minibatch output into merged_output."""
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            output = self.output.mem
        self.merged_output[self.offset - self.batch_size:self.offset] = \
            output
class MultiHistogram(Plotter):
    """Plotter for drawing weights as 2D.

    Must be assigned before initialize():
        input
        input_field
    """
    def __init__(self, workflow, **kwargs):
        super(MultiHistogram, self).__init__(workflow, **kwargs)
        # Hard cap on the number of histograms drawn.
        self.limit = kwargs.get("limit", 64)
        self.value = Array()
        # Bins per histogram.
        self.n_bars = kwargs.get("n_bars", 25)
        # Requested number of histograms (clamped to limit in initialize).
        self.hist_number = kwargs.get("hist_number", 16)
        self.demand("input")

    def initialize(self, **kwargs):
        super(MultiHistogram, self).initialize(**kwargs)
        if self.hist_number > self.limit:
            self.hist_number = self.limit
        self.value.mem = numpy.zeros([self.hist_number, self.n_bars],
                                     dtype=numpy.int64)

    def redraw(self):
        """Draws one bar chart per histogram row on a near-square grid."""
        fig = self.pp.figure(self.name)
        fig.clf()
        fig.patch.set_facecolor('#E8D6BB')
        # fig.patch.set_alpha(0.45)
        n_cols = int(numpy.round(numpy.sqrt(self.value.shape[0])))
        n_rows = int(numpy.ceil(self.value.shape[0] / n_cols))
        i = 0
        for _ in range(0, n_rows):
            for _ in range(0, n_cols):
                ax = fig.add_subplot(n_rows, n_cols, i + 1)
                ax.cla()
                # ax.axis('off')
                ax.patch.set_facecolor('#ffe6ca')
                # ax.set_xlabel("Input Data", fontsize=10)
                # ax.set_ylabel("Number", fontsize=10)
                ymin = self.value[i].min()
                ymax = self.value[i].max()
                xmin = self.input[i].min()
                xmax = self.input[i].max()
                ax.axis([xmin, xmax + ((xmax - xmin) / self.n_bars),
                         ymin, ymax])
                ax.grid(True)
                ax.set_title(self.name.replace("Histogram ", ""))
                nbars = self.n_bars
                width = ((xmax - xmin) / nbars) * 0.8
                X = numpy.linspace(xmin, xmax, num=nbars, endpoint=True)
                Y = self.value[i]
                # Thinner-looking edges when the grid is dense.
                if (n_rows > 5) or (n_cols > 5):
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='red')
                else:
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='lavender')
                if n_rows > 4:
                    ax.set_yticklabels([])
                if n_cols > 3:
                    ax.set_xticklabels([])
                i += 1
                if i >= self.value.shape[0]:
                    break
            if i >= self.value.shape[0]:
                break
        self.show_figure(fig)
        fig.canvas.draw()
        return fig

    def fill(self):
        """Recomputes the histogram counts in value from input.

        Rows with a degenerate (constant) input are skipped, leaving
        their previous counts untouched.
        """
        # Map once: loop-invariant, no need to re-map per row.
        self.value.map_write()
        self.input.map_read()
        for i in range(self.hist_number):
            mx = self.input.mem[i].max()
            mi = self.input.mem[i].min()
            d = mx - mi
            if not d:
                # BUGFIX: was `return`, which aborted filling of ALL
                # remaining histograms when a single row was constant.
                continue
            d = (self.n_bars - 1) / d
            self.value[i] = 0
            for x in self.input.mem[i]:
                i_bar = int(numpy.floor((x - mi) * d))
                self.value[i, i_bar] += 1
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
               table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        # Optional precomputed winners (e.g. linked from KohonenTrainer);
        # when present, the distance computation is skipped entirely.
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        # Number of neurons = rows of the weights matrix.
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        # Input sample length = columns of the weights matrix.
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        """Allocates output (and distances, when argmins is absent)."""
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        if self.argmins is None:
            self._distances.reset(
                numpy.zeros([batch_size, self.neurons_number],
                            dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        """Builds the OpenCL program and binds the three kernels
        (calculate_distances, calculate_argmin, set_total).
        """
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come precomputed and no overall table is kept:
            # nothing to do on the device.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(
            numpy.ceil(batch_size / self.device.max_group_size))
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Arg 1 (minibatch offset) is filled in per-run in ocl_run().
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)
        ]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Full path: compute distances, then per-sample argmin.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Shortcut: copy precomputed winners on the host.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # minibatch_offset points past the minibatch end, hence the
            # subtraction to get its start.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        """CPU path: nearest-neuron search (or copy of argmins)."""
        self.output.map_invalidate()

        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()

        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]

        for sindex in range(length):
            if self.argmins is None:
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]

            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class EvaluatorMSE(EvaluatorBase):
    """Evaluator of MSE (or RMSE, see ``root``) loss between network output
    and a continuous target.

    Must be assigned before initialize():
        output
        target
        batch_size
        labels (may be None)
        class_targets (may be None)

    Updates after run():
        err_output
        confusion_matrix
        max_err_output_sum
        n_err (only if labels and class_targets is not None)

    Creates within initialize():
        err_output
        n_err (only if labels and class_targets is not None)
        max_err_output_sum

    Attributes:
        output: output of the network_common as Batch.
        target: target for the current Batch.
        err_output: backpropagation errors.
        batch_size: number of elements in output to evaluate.
        metrics: [0] - sum of sample's mse, [1] - max of sample's mse,
                 [2] - min of sample's mse.
        mse: array of mse for each sample in minibatch.
        krn_constants_i_: numpy array for constant arguments to kernel.
        labels: labels for a batch (may be None).
        class_targets: target for each class (may be None).
        n_err: number of wrongly recognized samples
            (if labels and class_targets is not None).
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, so it was a discarded string expression instead of
    # __doc__; it must be the first statement in the class body.

    MAPPING = "evaluator_mse"
    LOSS = "mse"

    def __init__(self, workflow, **kwargs):
        super(EvaluatorMSE, self).__init__(workflow, **kwargs)
        self.metrics = Array()
        self.mse = Array()
        self.labels = None
        self.class_targets = None
        self.n_err = Array()
        # Goes through the validating property setter below.
        self.root = kwargs.get("root", True)
        self.demand("target", "normalizer")

    @property
    def root(self):
        """
        :return: True if error metric is RMSE, otherwise, MSE (mean sum of
        squares). Default is True.
        """
        return self._root

    @root.setter
    def root(self, value):
        if not isinstance(value, bool):
            raise TypeError("root must be boolean (got %s)" % type(value))
        self._root = value

    def initialize(self, device, **kwargs):
        """Validates output/target sizes and allocates metric buffers."""
        super(EvaluatorMSE, self).initialize(device=device, **kwargs)
        if self.testing:
            return
        if self.target.size != self.output.size:
            raise error.BadFormatError(
                "target.size != output.size (%s != %s)" %
                (self.target.size, self.output.size))
        self.sources_["evaluator_mse"] = {}
        self.sources_["denormalization"] = {}
        dtype = self.output.dtype
        self.metrics.reset(numpy.zeros(3, dtype=dtype))
        self.metrics[2] = 1.0e30  # mse_min
        self.mse.reset(numpy.zeros(self.err_output.mem.shape[0], dtype))
        self.n_err.reset(numpy.zeros(2, dtype=numpy.int32))
        self.init_vectors(self.n_err, self.target, self.metrics, self.mse)
        if self.class_targets:
            self.class_targets.initialize(self.device)

    def _gpu_init(self):
        """Builds the OpenCL/CUDA program and binds kernel arguments.

        Returns the block size used for the evaluation kernel.
        """
        dtype = self.output.dtype
        block_size = min(self.err_output.shape[0], 128)
        if self.class_targets:
            self.sources_["mse_find_closest"] = {
                "target_dtype":
                numpy_dtype_to_opencl(self.class_targets.dtype)
            }
        self.build_program(
            cache_file_name="%s_%d_%d" % (self.__class__.__name__,
                                          self.output.shape[0],
                                          self.output.sample_size),
            dtype=dtype, max_batch_size=self.err_output.shape[0],
            block_size=block_size, output_size=self.err_output.sample_size,
            root=self.root, normalization=self.normalizer.MAPPING,
            targets_number=self.class_targets.shape[0]
            if self.class_targets else None,
            coeffs=self.normalizer.coefficients)
        self.assign_kernel("evaluate_mse")
        # Args 2 and 3 (batch size / scale) are set per-run in _gpu_run().
        self.set_args(self.output, self.target, self.skip_args(2),
                      self.metrics, self.mse.devmem, self.err_output)
        if self.labels and self.class_targets:
            assert (self.labels.dtype == self.n_err.dtype == numpy.int32)
            self.krn_find_closest_ = self.get_kernel("mse_find_closest")
            self.krn_find_closest_.set_args(
                self.output.devmem, self.class_targets.devmem,
                self.labels.devmem, self.n_err.devmem)
        return block_size

    def ocl_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = [block_size]
        self._global_size = self._local_size
        self._global_size_find_closest_ = lambda: (self.batch_size,)
        self._local_size_find_closest = None

    def cuda_init(self):
        if self.testing:
            return
        block_size = self._gpu_init()
        self._local_size = (block_size, 1, 1)
        self._global_size = (1, 1, 1)
        self._global_size_find_closest_ = lambda: (self.batch_size, 1, 1)
        self._local_size_find_closest = (1, 1, 1)

    def _gpu_run(self):
        """Shared OpenCL/CUDA run: evaluates MSE and (optionally) n_err."""
        self.unmap_vectors(self.err_output, self.output, self.target,
                           self.metrics, self.mse)
        batch_size = self.batch_size
        self.krn_constants_i_[0] = batch_size
        self.set_arg(2, self.krn_constants_i_[0:1])
        self.krn_constants_f_[0] = 1.0 / self.batch_size if self.mean else 1.0
        self.set_arg(3, self.krn_constants_f_[0:1])
        self.execute_kernel(self._global_size, self._local_size)
        if self.labels and self.class_targets:
            self.unmap_vectors(self.class_targets, self.labels, self.n_err)
            self.execute_kernel(self._global_size_find_closest_(),
                                self._local_size_find_closest,
                                self.krn_find_closest_)
            self.n_err.map_write()
            # n_err[1] accumulates the number of evaluated samples.
            self.n_err.mem[1] += batch_size

    def ocl_run(self):
        return self._gpu_run()

    def cuda_run(self):
        return self._gpu_run()

    def numpy_run(self):
        """CPU reference implementation of the MSE evaluation."""
        self.output.map_read()
        self.target.map_read()
        self.metrics.map_write()
        self.err_output.map_invalidate()
        self.mse.map_invalidate()

        assert (self.output.size == self.target.size == self.err_output.size)
        batch_size = self.batch_size
        # All slices below must alias the backing arrays (checked).
        err_output = self.err_output.matrix[:batch_size]
        assert_addr(err_output, self.err_output.mem)
        output = self.output.matrix[:batch_size]
        assert_addr(output, self.output.mem)
        target = self.target.matrix[:batch_size]
        assert_addr(target, self.target.mem)
        mse = self.mse.mem[:batch_size]
        assert_addr(mse, self.mse.mem)

        err_output[:] = output - target
        if not isinstance(self.normalizer, NoneNormalizer):
            # Report MSE in denormalized (original) units, while the
            # backpropagated error stays in normalized units.
            output_copy = output.copy()
            target_copy = target.copy()
            self.normalizer.denormalize(output_copy)
            self.normalizer.denormalize(target_copy)
            denormed_err_output = output_copy - target_copy
        else:
            denormed_err_output = err_output
        self.err_output.mem[batch_size:] = 0
        mse[:] = numpy.square(denormed_err_output).sum(axis=1) / \
            denormed_err_output.shape[1]
        if self.mean:
            err_output /= batch_size
        if self.root:
            numpy.sqrt(mse, mse)
        self.mse.mem[batch_size:] = 0

        self.metrics.mem[0] += mse.sum()
        self.metrics.mem[1] = max(self.metrics.mem[1], mse.max())
        self.metrics.mem[2] = min(self.metrics.mem[2], mse.min())

        if self.labels and self.class_targets:
            # Count samples whose nearest class target is not their label.
            self.class_targets.map_read()
            self.labels.map_read()
            self.n_err.map_write()
            class_targets = self.class_targets.matrix
            labels = self.labels.mem
            for i, sample in enumerate(output):
                lbl = numpy.linalg.norm(class_targets - sample,
                                        axis=1).argmin()
                if lbl != labels[i]:
                    self.n_err.mem[0] += 1
                self.n_err.mem[1] += 1

    def merge_output(self):
        """Writes (denormalized) minibatch output into merged_output."""
        if not isinstance(self.normalizer, NoneNormalizer):
            output = self.output[:self.batch_size].copy()
            self.normalizer.denormalize(output)
        else:
            output = self.output.mem
        self.merged_output[self.offset - self.batch_size:self.offset] = \
            output
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel
            had been changed and need to be set again.
    """
    hide_from_registry = True
    MAPPING = set()

    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization

        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()

        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)

        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)

        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()

        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()

        # Sets to True when gradient changes
        self.gradient_changed = False

        # Gradient will be applied to weights immediately just after
        # computing
        self.apply_gradient = kwargs.get("apply_gradient",
                                         not workflow.is_slave)

    @property
    def current_batch_size(self):
        # Falls back to err_output's leading dimension when batch_size
        # was never linked/assigned.
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        """Allocates all gradient/accumulator buffers on host and device.

        kwargs may override the hyper-parameters given to __init__().
        """
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            self.weights_shape = (tuple(reversed(self.weights.shape))
                                  if self.weights_transposed
                                  else self.weights.shape)
        else:
            self.weights_shape = None

        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment",
                                          self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias",
                                             self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias",
                                             self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias",
                                               self.gradient_moment_bias)

        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert (self.accumulated_gradient_weights.size ==
                        self.weights.size)

        # Moment buffer is also needed in distributed (non-standalone) mode
        # even with zero momentum, since it carries gradients to the master.
        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(
                    numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == \
                    self.weights.size

        if (self.include_bias and self.bias and
                (not self.gradient_bias or
                 self.gradient_bias.size != self.bias.size)):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and self.accumulate_gradient and
                (not self.accumulated_gradient_bias or
                 self.accumulated_gradient_bias.size != self.bias.size)):
            self.accumulated_gradient_bias.reset(
                numpy.zeros_like(self.bias.mem))

        if (self.include_bias and self.bias and
                (self.gradient_moment_bias or not self.is_standalone)):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(
                    numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # NOTE(review): self.reduce_size is read before any visible
            # assignment here — presumably set by a base class
            # (cf. REDUCE_SIZE above); confirm.
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(self.err_output, self.gradient_weights,
                          self.gradient_bias,
                          self.accumulated_gradient_weights,
                          self.accumulated_gradient_bias,
                          self.gradient_weights_with_moment,
                          self.gradient_bias_with_moment)

    def gpu_weights_update(self):
        """Runs the weights-update kernel (plus optional column sums for
        the orthogonalization penalty).
        """
        self.unmap_vectors(self.input, self.err_output, self.weights,
                           self.gradient_weights,
                           self.accumulated_gradient_weights,
                           self.gradient_weights_with_moment)

        if self.factor_ortho:
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)
            # Kernel constant slot 12 holds factor_ortho.
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Slots 4..11: lr, decay, l1_vs_l2, moment, acc_alpha, acc_beta,
        # gd_alpha, gd_beta — passed to the kernel one slice at a time.
        self._weights_const[4:12] = (self.learning_rate, self.weights_decay,
                                     self.l1_vs_l2, self.gradient_moment,
                                     self.acc_alpha, self.acc_beta,
                                     self.gd_alpha, self.gd_beta)
        self.krn_weights_.set_args(
            self.device.skip(4), self._weights_const[4:5],
            self._weights_const[5:6], self._weights_const[6:7],
            self._weights_const[7:8], self._weights_const[8:9],
            self._weights_const[9:10], self._weights_const[10:11],
            self._weights_const[11:12])

        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        """Runs the bias-update kernel (no-op when include_bias is False)."""
        if not self.include_bias:
            return

        self.unmap_vectors(self.err_output, self.bias, self.gradient_bias,
                           self.accumulated_gradient_bias,
                           self.gradient_bias_with_moment)

        # Slots 5..12: bias lr, decay, l1_vs_l2, moment, acc_alpha,
        # acc_beta, gd_alpha, gd_beta.
        self._bias_const[5:13] = (self.learning_rate_bias,
                                  self.weights_decay_bias, self.l1_vs_l2_bias,
                                  self.gradient_moment_bias, self.acc_alpha,
                                  self.acc_beta, self.gd_alpha, self.gd_beta)
        self.krn_bias_.set_args(
            self.device.skip(5), self._bias_const[5:6], self._bias_const[6:7],
            self._bias_const[7:8], self._bias_const[8:9],
            self._bias_const[9:10], self._bias_const[10:11],
            self._bias_const[11:12], self._bias_const[12:13])

        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        if self.krn_err_output_ is None:
            # No activation derivative kernel assigned (linear activation).
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        # NOTE(review): __init__ sets self.bias to None; this would raise
        # for bias-less layers at DEBUG level — confirm callers always
        # have weights and bias assigned.
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [("Weight", weights), ("Bias", bias),
                                  ("Grad Weight", grad_weights),
                                  ("Grad Bias", grad_bias)]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        # Master -> slave: current hyper-parameters.
        return (self.learning_rate, self.weights_decay, self.gradient_moment,
                self.learning_rate_bias, self.weights_decay_bias,
                self.gradient_moment_bias)

    @staticmethod
    def fill_zeros(vector):
        # Zeroes a device-backed vector in place (no-op for empty ones).
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        # Slave side: adopt master's hyper-parameters and clear all local
        # gradient state before the next pass.
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        # Slave -> master: send momentum gradients, but only when run()
        # actually produced new ones.
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        # Master side: fold slave gradients into local momentum buffers
        # and apply them to weights/bias.
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        """Applies the acc/gd accumulation scheme (steps 2-3 in the
        __init__ comment) in place; returns the (possibly mixed) gradient.
        """
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = (
                gradient * self.acc_alpha +
                (self.acc_beta * accumulated_gradient if self.acc_beta
                 else 0))
            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        """Returns lr-scaled gradient with L1/L2 decay and optional
        orthogonalization penalty added (CPU reference path).
        """
        gradient = gradient.copy()
        # Mix of L2 (1 - l1_vs_l2) and L1 (l1_vs_l2) regularization.
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight +
                                  0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = (reshape_transposed(weight).sum(axis=1)
                        if weights_transposed else weight.sum(axis=0))
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        # Mark gradients as fresh for the master/slave protocol, then
        # clear the const-args flag after the kernels have consumed them.
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class MultiHistogram(Plotter):
    """Plotter for drawing several histograms as a grid of 2D subplots.

    Must be assigned before initialize():
        input
        input_field
    """
    def __init__(self, workflow, **kwargs):
        super(MultiHistogram, self).__init__(workflow, **kwargs)
        self.limit = kwargs.get("limit", 64)       # max histograms drawn
        self.value = Array()                       # per-row bin counts
        self.n_bars = kwargs.get("n_bars", 25)
        self.hist_number = kwargs.get("hist_number", 16)
        self.demand("input")

    def initialize(self, **kwargs):
        super(MultiHistogram, self).initialize(**kwargs)
        if self.hist_number > self.limit:
            self.hist_number = self.limit
        self.value.mem = numpy.zeros(
            [self.hist_number, self.n_bars], dtype=numpy.int64)

    def redraw(self):
        """Draw one bar chart per histogram row on a near-square grid."""
        fig = self.pp.figure(self.name)
        fig.clf()
        fig.patch.set_facecolor('#E8D6BB')
        # fig.patch.set_alpha(0.45)

        n_cols = int(numpy.round(numpy.sqrt(self.value.shape[0])))
        n_rows = int(numpy.ceil(self.value.shape[0] / n_cols))
        i = 0
        for _ in range(0, n_rows):
            for _ in range(0, n_cols):
                ax = fig.add_subplot(n_rows, n_cols, i + 1)
                ax.cla()
                # ax.axis('off')
                ax.patch.set_facecolor('#ffe6ca')
                # ax.set_xlabel("Input Data", fontsize=10)
                # ax.set_ylabel("Number", fontsize=10)
                ymin = self.value[i].min()
                ymax = self.value[i].max()
                xmin = self.input[i].min()
                xmax = self.input[i].max()
                ax.axis([xmin, xmax + ((xmax - xmin) / self.n_bars),
                         ymin, ymax])
                ax.grid(True)
                ax.set_title(self.name.replace("Histogram ", ""))
                nbars = self.n_bars
                width = ((xmax - xmin) / nbars) * 0.8
                X = numpy.linspace(xmin, xmax, num=nbars, endpoint=True)
                Y = self.value[i]
                # Thinner edge color distinction for dense grids.
                if (n_rows > 5) or (n_cols > 5):
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='red')
                else:
                    ax.bar(X, Y, color='#ffa0ef', width=width,
                           edgecolor='lavender')
                if n_rows > 4:
                    ax.set_yticklabels([])
                if n_cols > 3:
                    ax.set_xticklabels([])
                i += 1
                if i >= self.value.shape[0]:
                    break
            if i >= self.value.shape[0]:
                break
        self.show_figure(fig)
        fig.canvas.draw()
        return fig

    def fill(self):
        """Recompute the histogram counts from the current input.

        Row i of ``value`` receives an ``n_bars``-bin histogram of
        ``input[i]``.  Rows whose input has zero range (min == max) are
        cleared and skipped instead of aborting the whole refill.
        """
        # Map once outside the loop: both vectors are used for every row.
        self.value.map_write()
        self.input.map_read()
        for i in range(self.hist_number):
            mx = self.input.mem[i].max()
            mi = self.input.mem[i].min()
            d = mx - mi
            self.value[i] = 0
            if not d:
                # BUGFIX: previously this was ``return``, so a single
                # constant-valued input row left every remaining
                # histogram stale; now that row stays zeroed and the
                # rest are still recomputed.
                continue
            d = (self.n_bars - 1) / d
            for x in self.input.mem[i]:
                i_bar = int(numpy.floor((x - mi) * d))
                self.value[i, i_bar] += 1
class GradientDescentBase(AcceleratedUnit):
    """Base class for gradient descent units.

    Attributes:
        input: input layer values.
        output: output layer values.
        err_output: error to backpropagate.
        err_input: backpropagated error.
        weights: weights.
        bias: bias.
        batch_size: current minibatch size.
        learning_rate: gradient descent speed (positive).
        learning_rate_bias
        weights_decay: regularization for weights (see l1_vs_l2).
        weights_decay_bias
        gradient_moment: moment coefficient for weights.
        gradient_moment_bias
        gradient_weights_with_moment: accumulated moment.
        gradient_bias_with_moment
        batch_size: effective batch size (if None, get it from y).
        weights_transposed: assume weights matrix as a transposed one.
        apply_gradient: will apply gradient.
        gradient_changed: when True, slave will send gradients to master
            (assigned to True just before the run call, so it can be set to
            False inside ocl_run, numpy_run if necessary).
        ocl_set_const_args: True when constant arguments for the kernel had
            been changed and need to be set again.
    """

    hide_from_registry = True
    MAPPING = set()
    REDUCE_SIZE = 64  # used for updating bias

    def __init__(self, workflow, **kwargs):
        kwargs["view_group"] = kwargs.get("view_group", "TRAINER")
        super(GradientDescentBase, self).__init__(workflow, **kwargs)
        self.err_input = Array(shallow_pickle=True)
        self.ocl_set_const_args = True
        self.weights = None
        self.bias = None
        self.demand("input", "err_output")
        self.learning_rate = kwargs.get("learning_rate", 0.01)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", 0.00005)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", 0.0)
        self.l1_vs_l2 = kwargs.get("l1_vs_l2", 0)
        self.l1_vs_l2_bias = kwargs.get("l1_vs_l2_bias", self.l1_vs_l2)
        self.gradient_moment = kwargs.get("gradient_moment", 0)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment)
        self.weights_transposed = kwargs.get("weights_transposed", False)
        self.need_err_input = kwargs.get("need_err_input", True)
        self.include_bias = kwargs.get("include_bias", True)
        self.factor_ortho = kwargs.get("factor_ortho", 0)
        self.col_sums = Array()  # for orthogonalization
        # Current gradient as it is without applying learning_rate etc.
        self.gradient_weights = Array()
        self.gradient_bias = Array()
        # Gradient with applied learning_rate etc.
        # optionally accumulated from the previous run
        self.accumulate_gradient = kwargs.get("accumulate_gradient", False)
        # When accumulate_gradient set to True:
        # 1. Calculate gd
        # 2. acc = acc_alpha * gd + acc_beta * acc
        # 3. gd = gd_alpha * acc + gd_beta * gd
        # 4. Apply moments to gd
        # 5. weights += gd if apply_gradient set to True
        self.acc_alpha = kwargs.get("acc_alpha", 0.0)
        self.acc_beta = kwargs.get("acc_beta", 0.0)
        self.gd_alpha = kwargs.get("gd_alpha", 0.0)
        self.gd_beta = kwargs.get("gd_beta", 1.0)
        self.accumulated_gradient_weights = Array()
        self.accumulated_gradient_bias = Array()
        # Gradient with accumulated moments
        self.gradient_weights_with_moment = Array()
        self.gradient_bias_with_moment = Array()
        # Sets to True when gradient changes
        self.gradient_changed = False
        # Gradient will be applied to weights immediately just after computing
        self.apply_gradient = kwargs.get("apply_gradient", not workflow.is_slave)

    @property
    def current_batch_size(self):
        # Fall back to the minibatch dimension of err_output when the
        # batch_size attribute was never linked externally.
        batch_size = getattr(self, "batch_size", None)
        if batch_size is None:
            return self.err_output.mem.shape[0]
        return int(batch_size)

    def initialize(self, device, **kwargs):
        super(GradientDescentBase, self).initialize(device, **kwargs)

        if self.weights:
            assert len(self.weights.shape) == 2
            # Logical (rows, cols) after honoring weights_transposed.
            self.weights_shape = tuple(reversed(self.weights.shape)) if self.weights_transposed else self.weights.shape
        else:
            self.weights_shape = None

        # kwargs passed to initialize() may override the ctor values.
        self.learning_rate = kwargs.get("learning_rate", self.learning_rate)
        self.weights_decay = kwargs.get("weights_decay", self.weights_decay)
        self.gradient_moment = kwargs.get("gradient_moment", self.gradient_moment)
        self.learning_rate_bias = kwargs.get("learning_rate_bias", self.learning_rate_bias)
        self.weights_decay_bias = kwargs.get("weights_decay_bias", self.weights_decay_bias)
        self.gradient_moment_bias = kwargs.get("gradient_moment_bias", self.gradient_moment_bias)

        # Allocate (or validate) each gradient buffer lazily, matching the
        # corresponding parameter's size.
        if self.weights:
            if not self.gradient_weights:
                self.gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights.size == self.weights.size

        if self.weights and self.accumulate_gradient:
            if not self.accumulated_gradient_weights:
                self.accumulated_gradient_weights.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.accumulated_gradient_weights.size == self.weights.size

        if self.weights and (self.gradient_moment or not self.is_standalone):
            if not self.gradient_weights_with_moment:
                self.gradient_weights_with_moment.reset(numpy.zeros_like(self.weights.mem))
            else:
                assert self.gradient_weights_with_moment.size == self.weights.size

        if self.include_bias and self.bias and (not self.gradient_bias or self.gradient_bias.size != self.bias.size):
            self.gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if (
            self.include_bias and self.bias and self.accumulate_gradient and
            (not self.accumulated_gradient_bias or
             self.accumulated_gradient_bias.size != self.bias.size)
        ):
            self.accumulated_gradient_bias.reset(numpy.zeros_like(self.bias.mem))

        if self.include_bias and self.bias and (self.gradient_moment_bias or not self.is_standalone):
            if not self.gradient_bias_with_moment:
                self.gradient_bias_with_moment.reset(numpy.zeros_like(self.bias.mem))
            else:
                assert self.gradient_bias_with_moment.size == self.bias.size

        dtype = self.err_output.dtype
        if self.need_err_input:
            if not self.err_input:
                self.err_input.reset(numpy.zeros(self.input.shape, dtype))
            else:
                assert self.err_input.shape == self.input.shape

        if self.weights:
            side = self.weights_shape[0]
            other = self.weights.size // side
            if self.factor_ortho:
                if not self.col_sums:
                    self.col_sums.reset(numpy.zeros(other, dtype=dtype))
                else:
                    assert self.col_sums.size == other
                self.col_sums.initialize(self.device)
            # NOTE(review): self.reduce_size is read here but no earlier
            # assignment is visible in this chunk (only the class attribute
            # REDUCE_SIZE); presumably it is set elsewhere -- verify.
            self.reduce_size = roundup(min(self.reduce_size, other), 32)
            self.weights.initialize(self.device)

        for vec in self.bias, self.input, self.err_input:
            if vec:
                vec.initialize(self.device)
        self.init_vectors(
            self.err_output,
            self.gradient_weights,
            self.gradient_bias,
            self.accumulated_gradient_weights,
            self.accumulated_gradient_bias,
            self.gradient_weights_with_moment,
            self.gradient_bias_with_moment,
        )

    def gpu_weights_update(self):
        self.unmap_vectors(
            self.input,
            self.err_output,
            self.weights,
            self.gradient_weights,
            self.accumulated_gradient_weights,
            self.gradient_weights_with_moment,
        )

        if self.factor_ortho:
            # Column sums feed the orthogonalization term; arg slot 12
            # of the weights kernel carries factor_ortho.
            self.col_sums.unmap()
            self.execute_kernel(self._global_size_ortho,
                                self._local_size_ortho,
                                self.krn_compute_col_sums_)
            self._weights_const[12] = self.factor_ortho
            self.krn_weights_.set_arg(12, self._weights_const[12:13])

        # Arg slots 4..11: lr, decay, l1_vs_l2, moment, acc/gd mix factors.
        self._weights_const[4:12] = (
            self.learning_rate,
            self.weights_decay,
            self.l1_vs_l2,
            self.gradient_moment,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_weights_.set_args(
            self.device.skip(4),
            self._weights_const[4:5],
            self._weights_const[5:6],
            self._weights_const[6:7],
            self._weights_const[7:8],
            self._weights_const[8:9],
            self._weights_const[9:10],
            self._weights_const[10:11],
            self._weights_const[11:12],
        )
        self.execute_kernel(self._global_size_weights,
                            self._local_size_weights, self.krn_weights_)

    def gpu_bias_update(self):
        if not self.include_bias:
            return

        self.unmap_vectors(
            self.err_output,
            self.bias,
            self.gradient_bias,
            self.accumulated_gradient_bias,
            self.gradient_bias_with_moment,
        )

        # Arg slots 5..12 mirror the weights kernel but with bias params.
        self._bias_const[5:13] = (
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.l1_vs_l2_bias,
            self.gradient_moment_bias,
            self.acc_alpha,
            self.acc_beta,
            self.gd_alpha,
            self.gd_beta,
        )
        self.krn_bias_.set_args(
            self.device.skip(5),
            self._bias_const[5:6],
            self._bias_const[6:7],
            self._bias_const[7:8],
            self._bias_const[8:9],
            self._bias_const[9:10],
            self._bias_const[10:11],
            self._bias_const[11:12],
            self._bias_const[12:13],
        )
        self.execute_kernel(self._global_size_bias, self._local_size_bias,
                            self.krn_bias_)

    def gpu_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        # No kernel means a linear activation: derivative is 1, nothing to do.
        if self.krn_err_output_ is None:
            return
        self.err_output.unmap()
        self.output.unmap()
        self.execute_kernel(self._global_size_err_output,
                            self._local_size_err_output,
                            self.krn_err_output_)

    def numpy_err_output_update(self):
        """Multiply err_output by activation derivative by output.
        """
        pass

    def print_debug_data(self):
        """
        Show weights statistics
        """
        # Gathering stats maps four vectors; skip unless DEBUG logging is on.
        if not self.logger.isEnabledFor(logging.DEBUG):
            return
        self.weights.map_read()
        self.bias.map_read()
        self.gradient_bias.map_read()
        self.gradient_weights.map_read()
        weights = self.weights.mem
        bias = self.bias.mem
        grad_weights = self.gradient_weights.mem
        grad_bias = self.gradient_bias.mem

        weight_table = PrettyTable("TYPE", "Mean", "StdDev", "Min", "Max")
        weight_table.float_format = ".10"
        for (w_name, w_array) in [
                ("Weight", weights),
                ("Bias", bias),
                ("Grad Weight", grad_weights),
                ("Grad Bias", grad_bias),
        ]:
            w_mean = w_stddev = w_min = w_max = None
            if w_array is not None and w_array.size > 0:
                w_mean = numpy.mean(w_array)
                w_stddev = numpy.std(w_array)
                w_min = numpy.min(w_array)
                w_max = numpy.max(w_array)
            weight_table.add_row(w_name, w_mean, w_stddev, w_min, w_max)
        self.debug("\n" + weight_table.get_string())

    def generate_data_for_slave(self, slave):
        # Master -> slave: broadcast the current hyperparameters
        # (unpacked positionally in apply_data_from_master).
        return (
            self.learning_rate,
            self.weights_decay,
            self.gradient_moment,
            self.learning_rate_bias,
            self.weights_decay_bias,
            self.gradient_moment_bias,
        )

    @staticmethod
    def fill_zeros(vector):
        # No-op for unallocated vectors (an empty Array is falsy).
        if not vector:
            return
        vector.map_invalidate()
        vector.mem[:] = 0

    def apply_data_from_master(self, data):
        # Tuple layout must match generate_data_for_slave above.
        self.learning_rate = data[0]
        self.weights_decay = data[1]
        self.gradient_moment = data[2]
        self.learning_rate_bias = data[3]
        self.weights_decay_bias = data[4]
        self.gradient_moment_bias = data[5]
        # Start every job from a clean gradient state.
        self.fill_zeros(self.gradient_weights_with_moment)
        self.fill_zeros(self.gradient_bias_with_moment)
        self.fill_zeros(self.gradient_weights)
        self.fill_zeros(self.gradient_bias)
        self.fill_zeros(self.accumulated_gradient_weights)
        self.fill_zeros(self.accumulated_gradient_bias)

    def generate_data_for_master(self):
        # Only ship gradients when run() actually produced new ones.
        if not self.gradient_changed:
            return None
        self.gradient_changed = False
        self.gradient_weights_with_moment.map_read()
        self.gradient_bias_with_moment.map_read()
        return (self.gradient_weights_with_moment.mem,
                self.gradient_bias_with_moment.mem)

    def apply_data_from_slave(self, data, slave):
        # Fold a slave's gradients into the master's parameters with the
        # momentum rule: m = moment * m + grad; w += m.
        if self.weights:
            self.weights.map_write()
            self.gradient_weights_with_moment.map_write()
            self.gradient_weights_with_moment.mem *= self.gradient_moment
            self.gradient_weights_with_moment.mem += data[0]
            self.weights.mem += self.gradient_weights_with_moment.mem
        if self.bias:
            self.bias.map_write()
            self.gradient_bias_with_moment.map_write()
            self.gradient_bias_with_moment.mem *= self.gradient_moment_bias
            self.gradient_bias_with_moment.mem += data[1]
            self.bias.mem += self.gradient_bias_with_moment.mem

    def drop_slave(self, slave):
        pass

    def accumulate_gradient_f(self, accumulated_gradient, gradient):
        # acc = acc_alpha * gd + acc_beta * acc;
        # gd = gd_alpha * acc + gd_beta * gd  (gradient is updated in place).
        if accumulated_gradient and self.accumulate_gradient:
            accumulated_gradient[:] = gradient * self.acc_alpha + (
                self.acc_beta * accumulated_gradient if self.acc_beta else 0
            )
            gradient *= self.gd_beta
            gradient += self.gd_alpha * accumulated_gradient
        return gradient

    @staticmethod
    def numpy_gradient_step(weight, gradient, lr, factor_l12, l1_vs_l2,
                            factor_ortho=0, weights_transposed=False):
        # Blend of L2 and L1 regularization controlled by l1_vs_l2;
        # optional orthogonalization pressure via per-column sums.
        gradient = gradient.copy()
        gradient += factor_l12 * ((1.0 - l1_vs_l2) * weight + 0.5 * l1_vs_l2 * numpy.sign(weight))
        if factor_ortho:
            col_sums = reshape_transposed(weight).sum(axis=1) if weights_transposed else weight.sum(axis=0)
            for i, row in enumerate(gradient):
                row += (col_sums - weight[i]) * factor_ortho / weight.shape[0]
        gradient *= lr
        return gradient

    def run(self):
        # Flag fresh gradients for the master; after the backend run the
        # kernel constants are known to be set, so drop the flag.
        self.gradient_changed = True
        super(GradientDescentBase, self).run()
        self.ocl_set_const_args = False
class Cutter1D(AcceleratedUnit):
    """Cuts the specified interval from each 1D sample of input batch
    into output.

    y = alpha * x + beta * y
    """
    def __init__(self, workflow, **kwargs):
        super(Cutter1D, self).__init__(workflow, **kwargs)
        self.alpha = kwargs.get("alpha")
        self.beta = kwargs.get("beta")
        self.output_offset = kwargs.get("output_offset", 0)
        self.output = Array()
        self.demand("alpha", "beta", "input")
        # TODO: add input_offset and length to demand and not to crash lstm
        # TODO: unit test

    def init_unpickled(self):
        super(Cutter1D, self).init_unpickled()
        self.sources_["cutter"] = {}

    def initialize(self, device, **kwargs):
        super(Cutter1D, self).initialize(device, **kwargs)
        # NOTE(review): self.length and self.input_offset are read by this
        # class but never assigned here; presumably they are linked from
        # outside (see the TODO above) -- verify.
        if not self.output or self.output.shape[0] != self.input.shape[0]:
            self.output.reset(
                numpy.zeros(
                    (self.input.shape[0], self.output_offset + self.length),
                    dtype=self.input.dtype))
        else:
            assert self.output.sample_size >= self.output_offset + self.length
        self.init_vectors(self.input, self.output)

    def cuda_init(self):
        dtype = self.input.dtype
        itemsize = self.input.itemsize
        limit = self.input.shape[0] * self.length
        self.build_program({}, "%s" % self.__class__.__name__, dtype=dtype)
        self.assign_kernel("cutter_1d_forward")
        # CUDA variant pre-offsets the device pointers by the cut offsets,
        # so the kernel only receives lengths and strides.
        self.set_args(
            int(self.input.devmem) + self.input_offset * itemsize,
            numpy.array([self.alpha], dtype=dtype),
            numpy.array([self.input.sample_size], dtype=numpy.int32),
            int(self.output.devmem) + self.output_offset * itemsize,
            numpy.array([self.beta], dtype=dtype),
            numpy.array([self.output.sample_size], dtype=numpy.int32),
            numpy.array([self.length], dtype=numpy.int32),
            numpy.array([limit], dtype=numpy.int32))
        block_size = self.device.suggest_block_size(self._kernel_)
        self._global_size = (int(numpy.ceil(limit / block_size)), 1, 1)
        self._local_size = (block_size, 1, 1)

    def ocl_init(self):
        dtype = self.input.dtype
        self.build_program({}, "%s" % self.__class__.__name__, dtype=dtype)
        self.assign_kernel("cutter_1d_forward")
        # OpenCL variant passes the offsets as explicit kernel arguments
        # instead of pre-offsetting device pointers.
        self.set_args(
            self.input.devmem,
            numpy.array([self.input_offset], dtype=numpy.int32),
            numpy.array([self.alpha], dtype=dtype),
            numpy.array([self.input.sample_size], dtype=numpy.int32),
            self.output.devmem,
            numpy.array([self.output_offset], dtype=numpy.int32),
            numpy.array([self.beta], dtype=dtype),
            numpy.array([self.output.sample_size], dtype=numpy.int32))
        self._global_size = (self.input.shape[0], self.length)
        self._local_size = None

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        return self._gpu_run()

    def ocl_run(self):
        return self._gpu_run()

    def numpy_run(self):
        # CPU reference path: y = alpha * x + beta * y over the interval.
        self.input.map_read()
        self.output.map_write()
        out = self.output.matrix[:, self.output_offset:self.output_offset +
                                 self.length]
        if self.beta:
            out *= self.beta
        else:
            out[:] = 0
        out += (self.input.matrix[:, self.input_offset:self.input_offset +
                                  self.length] * self.alpha)
class Cutter1D(AcceleratedUnit):
    """Copies a 1D slice out of every sample of the input batch.

    Computes y = alpha * x + beta * y over the selected interval.
    """
    def __init__(self, workflow, **kwargs):
        super(Cutter1D, self).__init__(workflow, **kwargs)
        self.alpha = kwargs.get("alpha")
        self.beta = kwargs.get("beta")
        self.output_offset = kwargs.get("output_offset", 0)
        self.output = Array()
        self.demand("alpha", "beta", "input")
        # TODO: add input_offset and length to demand and not to crash lstm
        # TODO: unit test

    def init_unpickled(self):
        super(Cutter1D, self).init_unpickled()
        self.sources_["cutter"] = {}

    def initialize(self, device, **kwargs):
        super(Cutter1D, self).initialize(device, **kwargs)
        rows = self.input.shape[0]
        needed = self.output_offset + self.length
        if not self.output or self.output.shape[0] != rows:
            self.output.reset(
                numpy.zeros((rows, needed), dtype=self.input.dtype))
        else:
            assert self.output.sample_size >= needed
        self.init_vectors(self.input, self.output)

    def cuda_init(self):
        dt = self.input.dtype
        isize = self.input.itemsize
        total = self.input.shape[0] * self.length

        def i32(value):
            # Kernel scalar arguments travel as 1-element int32 arrays.
            return numpy.array([value], dtype=numpy.int32)

        self.build_program({}, "%s" % self.__class__.__name__, dtype=dt)
        self.assign_kernel("cutter_1d_forward")
        # Device pointers are pre-offset so the kernel starts at the cut.
        self.set_args(
            int(self.input.devmem) + self.input_offset * isize,
            numpy.array([self.alpha], dtype=dt),
            i32(self.input.sample_size),
            int(self.output.devmem) + self.output_offset * isize,
            numpy.array([self.beta], dtype=dt),
            i32(self.output.sample_size),
            i32(self.length),
            i32(total))
        bs = self.device.suggest_block_size(self._kernel_)
        self._global_size = (int(numpy.ceil(total / bs)), 1, 1)
        self._local_size = (bs, 1, 1)

    def ocl_init(self):
        dt = self.input.dtype

        def i32(value):
            # Kernel scalar arguments travel as 1-element int32 arrays.
            return numpy.array([value], dtype=numpy.int32)

        self.build_program({}, "%s" % self.__class__.__name__, dtype=dt)
        self.assign_kernel("cutter_1d_forward")
        # Offsets are passed to the kernel explicitly on this backend.
        self.set_args(
            self.input.devmem,
            i32(self.input_offset),
            numpy.array([self.alpha], dtype=dt),
            i32(self.input.sample_size),
            self.output.devmem,
            i32(self.output_offset),
            numpy.array([self.beta], dtype=dt),
            i32(self.output.sample_size))
        self._global_size = (self.input.shape[0], self.length)
        self._local_size = None

    def _gpu_run(self):
        self.unmap_vectors(self.input, self.output)
        self.execute_kernel(self._global_size, self._local_size)

    def cuda_run(self):
        return self._gpu_run()

    def ocl_run(self):
        return self._gpu_run()

    def numpy_run(self):
        # CPU reference path: scale the destination slab by beta (or clear
        # it), then add alpha times the source slab.
        self.input.map_read()
        self.output.map_write()
        dst_span = slice(self.output_offset, self.output_offset + self.length)
        src_span = slice(self.input_offset, self.input_offset + self.length)
        dst = self.output.matrix[:, dst_span]
        if self.beta:
            dst *= self.beta
        else:
            dst[:] = 0
        dst += self.input.matrix[:, src_span] * self.alpha
class KohonenForward(KohonenBase, AcceleratedUnit):
    """Kohonen forward layer.

    Must be assigned before initialize():
        input
        weights
        minibatch_offset (if total == True)
        minibatch_size (if total == True)
        batch_size (if total == True)
        argmins  speeds up run() if linked from KohonenTrainer

    Updates after run():
        output

    Creates within initialize():
        output

    Attributes:
        input: input as batch of samples.
        weights: the weights of the neurons in Kohonen layer.
        output: the list of winners.
        total: if total=True is passed in __init__(), the overall winners
            table
    """
    def __init__(self, workflow, **kwargs):
        super(KohonenForward, self).__init__(workflow, **kwargs)
        self.demand("input", "weights")
        self.argmins = None
        self._distances = Array()
        self.output = Array()
        self._chunk_size_ = 0
        self.weights_transposed = False
        self.total = Array() if kwargs.get("total", False) else None
        if self.total is not None:
            # These must be linked externally before initialize().
            self.minibatch_offset = None
            self.minibatch_size = None
            self.batch_size = None

    def init_unpickled(self):
        super(KohonenForward, self).init_unpickled()
        self.sources_["kohonen"] = {"FORWARD": 1}

    @property
    def neurons_number(self):
        return self.weights.mem.shape[0]

    @property
    def sample_length(self):
        return self.weights.mem.shape[1]

    @property
    def chunk_size(self):
        return self._chunk_size_

    def initialize(self, device, **kwargs):
        super(KohonenForward, self).initialize(device=device, **kwargs)

        assert self.input.mem.shape[1] == self.sample_length
        batch_size = self.input.mem.shape[0]

        self.output.reset(numpy.zeros(batch_size, dtype=numpy.int32))
        # Distances are only needed when winners are computed here rather
        # than taken from a linked KohonenTrainer.
        if self.argmins is None:
            self._distances.reset(numpy.zeros(
                [batch_size, self.neurons_number],
                dtype=self.weights.mem.dtype))

        if self.total is not None:
            self.total.reset(numpy.zeros(self.batch_size, dtype=numpy.int32))
            self._minibatch_offset_ = numpy.zeros(1, dtype=numpy.int32)

    def ocl_init(self):
        batch_size = self.input.mem.shape[0]
        self.output.initialize(self.device)
        if self.argmins is None:
            self.input.initialize(self.device)
            self.weights.initialize(self.device)
            self._distances.initialize(self.device)
        elif self.total is None:
            # Winners come from argmins and no overall table is kept:
            # nothing to set up on the device.
            return
        if self.total is not None:
            self.total.initialize(self.device)

        copy_chunk_size = int(numpy.ceil(batch_size /
                                         self.device.max_group_size))
        # Split the neurons across work items; keep at least 2 per item.
        chunk_size = self.neurons_number // self.device.max_group_size
        if chunk_size < 2:
            chunk_size = self.neurons_number // 2 + 1
        self.argmin_group_size = \
            int(numpy.ceil(self.neurons_number / chunk_size))

        block_size, vector_opt = self.device.device_info.get_kernel_bs_vo(
            kernel="matrix_multiplication", dtype=self.input.dtype)

        defines = {
            'BLOCK_SIZE': block_size,
            'VECTOR_OPT': int(bool(vector_opt)),
            'BATCH': batch_size,
            'SAMPLE_LENGTH': self.sample_length,
            'NEURONS_NUMBER': self.neurons_number,
            'CHUNK_SIZE': chunk_size,
            'COPY_CHUNK_SIZE': copy_chunk_size,
        }
        if self.weights_transposed:
            defines['WEIGHTS_TRANSPOSED'] = 1
        self.build_program(defines, "%s_%d_%d_%d" %
                           (self.__class__.__name__,
                            batch_size, self.sample_length,
                            self.neurons_number),
                           dtype=self.weights.mem.dtype)

        if self.total is not None:
            self._set_total_global_size_ = \
                [int(numpy.ceil(batch_size / copy_chunk_size))]
            self._krn_set_total_ = self.get_kernel("set_total")
            # Arg 1 (the minibatch offset) is set per-run in ocl_run.
            self._krn_set_total_.set_args(self.output.devmem, cl.skip,
                                          self.total.devmem)
        if self.argmins is not None:
            return

        self._krn_distances_ = self.get_kernel("calculate_distances")
        self._krn_distances_.set_args(self.input.devmem, self.weights.devmem,
                                      self._distances.devmem)

        self._krn_argmin_ = self.get_kernel("calculate_argmin")
        self._krn_argmin_.set_args(self._distances.devmem, self.output.devmem,
                                   None)

        self._gs_distance = [
            roundup(self.neurons_number, block_size),
            roundup(batch_size, block_size)]
        self._ls_distance = [block_size, block_size]

    def ocl_run(self):
        self.output.unmap()
        if self.total is not None:
            self.total.unmap()

        if self.argmins is None:
            # Compute distances to every neuron, then pick the winners.
            self.input.unmap()
            self.weights.unmap()
            self.execute_kernel(self._gs_distance, self._ls_distance,
                                self._krn_distances_)
            self.execute_kernel([self.argmin_group_size],
                                [self.argmin_group_size],
                                self._krn_argmin_)
        else:
            # Winners were already computed by the trainer: copy them over.
            self.argmins.unmap()
            self.argmins.map_read()
            self.output.map_write()
            self.output.mem[:] = self.argmins.mem
            self.output.unmap()
            self.argmins.unmap()

        if self.total is not None:
            # The minibatch ends at minibatch_offset, so it starts at
            # minibatch_offset - minibatch_size.
            self._minibatch_offset_[0] = \
                self.minibatch_offset - self.minibatch_size
            self._krn_set_total_.set_arg(1, self._minibatch_offset_)
            self.execute_kernel(self._set_total_global_size_, None,
                                self._krn_set_total_)

    def numpy_run(self):
        self.output.map_invalidate()
        if self.argmins is not None:
            self.argmins.map_read()
            self.output.mem[:] = self.argmins.mem
        else:
            self.input.map_read()
            self.weights.map_read()
        if self.total is not None:
            self.total.map_invalidate()

        length = self.minibatch_size if self.total is not None \
            else self.input.mem.shape[0]
        for sindex in range(length):
            if self.argmins is None:
                # Winner = neuron with minimal distance to the sample.
                dist = self.weights.mem - self.input[sindex]
                winner = numpy.argmin(self.numpy_linalg_norm(dist))
                self.output[sindex] = winner
            else:
                winner = self.argmins[sindex]
            if self.total is not None:
                index = sindex + self.minibatch_offset - self.minibatch_size
                self.total[index] = winner
class Uniform(AcceleratedUnit):
    """Generates random numbers from uniform distribution.

    Attributes:
        num_states: number of random states for parallel generation.
        states: Array of random states.
        prng: veles.prng.RandomGenerator for initial states generation.
        output_bytes: number of output bytes to generate.
    """
    backend_methods = AcceleratedUnit.backend_methods + ("fill",)

    def __init__(self, workflow, **kwargs):
        super(Uniform, self).__init__(workflow, **kwargs)
        self.num_states = kwargs.get("num_states", 256)
        self.states = Array()
        self.prng = kwargs.get("prng", get())
        self.output_bytes = kwargs.get("output_bytes", 0)
        self.output = Array()
        self.cl_const = numpy.zeros(1, dtype=numpy.int32)

    def init_unpickled(self):
        super(Uniform, self).init_unpickled()
        self.sources_["random"] = {}

    def initialize(self, device, **kwargs):
        super(Uniform, self).initialize(device, **kwargs)

        # Each state is 16 x 64-bit words, stored as 32 x uint32.
        # NOTE(review): the size check compares against num_states * 16
        # while the buffer is allocated with num_states * 16 * 2 elements,
        # so the condition looks always true and the states get re-seeded
        # on every initialize -- verify intent.
        if not self.states or self.states.size != self.num_states * 16:
            self.states.reset(numpy.empty(self.num_states * 16 * 2,
                                          dtype=numpy.uint32))
            self.states.mem[:] = self.prng.randint(0, (1 << 32) + 1,
                                                   self.states.size)

        if not self.output or self.output.nbytes < self.output_bytes:
            # One generation round yields num_states * 16 uint64 values,
            # i.e. num_states * 16 * 8 bytes; round the request up to that.
            self.output_bytes = roundup(self.output_bytes,
                                        self.num_states * 16 * 8)
            self.output.reset(numpy.zeros(self.output_bytes, numpy.uint8))
        else:
            self.output_bytes = self.output.nbytes

        self.init_vectors(self.states, self.output)

    def _gpu_init(self):
        self.build_program({}, "uniform_%d" % self.num_states)
        self.assign_kernel("random_xorshift1024star")
        self.set_args(self.states, self.cl_const, self.output)

    def ocl_init(self):
        self._gpu_init()
        self._global_size = [self.num_states]
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        # Factor num_states into grid x block with a power-of-two block
        # size capped at 32.
        n = self.num_states
        l = 1
        while not (n & 1) and l < 32:
            n >>= 1
            l <<= 1
        self._global_size = (n, 1, 1)
        self._local_size = (l, 1, 1)

    def _gpu_fill(self, nbytes):
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.unmap_vectors(self.states, self.output)
        # The kernel's second argument is the number of rounds to run.
        self.cl_const[0] = nbytes // bytes_per_round
        self.set_arg(1, self.cl_const)
        self.execute_kernel(self._global_size, self._local_size)

    def ocl_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def cuda_fill(self, nbytes):
        self._gpu_fill(nbytes)

    def numpy_fill(self, nbytes):
        # CPU reference implementation of the generator; mirrors the
        # per-state round structure of the GPU kernel.
        bytes_per_round = self.num_states * 16 * 8
        nbytes = roundup(nbytes, bytes_per_round)
        if nbytes > self.output.nbytes:
            raise error.Bug("nbytes > self.output.nbytes")
        self.states.map_write()
        self.output.map_invalidate()
        n_rounds = nbytes // bytes_per_round

        # Scratch uint64 singletons reused by _next_rand to keep all
        # arithmetic in wrapping 64-bit space.
        u64 = numpy.array([1181783497276652981], dtype=numpy.uint64)
        s0 = numpy.zeros(1, dtype=numpy.uint64)
        s1 = numpy.zeros(1, dtype=numpy.uint64)

        states = self.states.mem.view(dtype=numpy.uint64)
        states = states.reshape(states.size // 16, 16)
        output = self.output.mem.view(dtype=numpy.uint64)
        for i in range(self.num_states):
            # State i writes the strided slots i, i + num_states, ...
            offs = i
            s = states[i]
            self.p = 0
            for _round in range(n_rounds):
                for _iter in range(16):
                    output[offs] = self._next_rand(s, s0, s1, u64)
                    offs += self.num_states

    def _next_rand(self, s, s0, s1, u64):
        # One generator step operating in place on state s; self.p is the
        # rotating index into the 16-word state.
        s0[0] = s[self.p]
        self.p = (self.p + 1) & 15
        s1[0] = s[self.p]
        s1 ^= s1 << 31
        s1 ^= s1 >> 11
        s0 ^= s0 >> 30
        s0 ^= s1
        s[self.p] = s0[0]
        # Final multiplication by the constant decorrelates the output.
        return (s0 * u64)[0]

    def fill(self, nbytes):
        # Dispatch to the backend-specific *_fill implementation.
        self._backend_fill_(nbytes)

    def ocl_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.ocl_fill(self.output.nbytes)

    def cuda_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.cuda_fill(self.output.nbytes)

    def numpy_run(self):
        """Generates random numbers to fill the whole output buffer."""
        self.numpy_fill(self.output.nbytes)