Example #1
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization and AdaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            state (dict): The full contents of the backend class instance,
                          except for the mem_pool which is on device and
                          cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator. Random numbers
        are generated on the host using numpy, then transferred to the device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
           decorate_fc (list): list of function names for the fully connected
                               layer forward/backward/update calls to time.
           decorate_conv (list): list of function names for the convolutional
                                 layer forward/backward/update calls to time.
           decorate_ew (list): list of function names for the element-wise
                               calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer.
        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in milliseconds (as reported by the CUDA event
                   timers) since the prior flop_timing_start call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Generate uniform random numbers with numpy on the host and convert
        them to a GPUTensor.  dtype must not be None.
        """
        ary = np.random.uniform(low, high, shape)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)
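
    # Shape conventions implied by the three ng.dot calls above (illustrative,
    # assuming an [nout x nin] weight matrix and [nin x batch] activations):
    #   fprop_fc:  out[nout, batch] = weights[nout, nin] . inputs[nin, batch]
    #   bprop_fc:  out[nin, batch]  = weights.T . deltas[nout, batch]
    #   update_fc: out[nout, nin]   = deltas[nout, batch] . inputs.T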

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                             layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                  in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """

        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
        """
        self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                            alpha=1.0, repeat=1)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  pooled outputs for a single receptive field.
        """
        op = op.lower()
        if op == "max":
            self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  We support
                         "max", "avg", "l2" currently.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                               repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.sig(x, out=out)

        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x*slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance.  If set to None,
                                        the variance is taken over all
                                        dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr-mean),  axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the specified
        axis.  If multiple elements contain the maximum, only the elements of
        the first are returned.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to find the maximum.  If set
                        to None, find the overall maximum index of a flattened
                        representation of a.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity)
        and ps_item (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
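        """
        RMSProp-style update rule, optionally with momentum.

        Arguments:
            params (GPUTensor): parameter tensor, updated in place.
            updates (GPUTensor): gradient with respect to the parameters.
            run_squares (GPUTensor): running average of the squared gradients,
                                     updated in place.
            velocity (GPUTensor): velocity tensor, used and updated only when
                                  momentum_coef is nonzero.
            scratch_space (GPUTensor): temporary buffer (not used in the body
                                       shown here).
            gamma (float): decay constant for the running squares.
            epsilon (float): small constant for numerical stability.
            learning_rate (float): learning rate.
            momentum_coef (float): momentum coefficient.
        """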

        # Update running squares
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: buffer receiving 1 / sqrt(var + eps) (updated in place)
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(self.ng.var(inputs, axis=1) +
                                                  eps))
        xhat[:] = xvar * (inputs - self.ng.mean(inputs, axis=1))
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
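
A minimal usage sketch for the backend above (the import path and the shapes
are illustrative assumptions, not part of the original code):

    import numpy as np
    from gpu_backend import GPU  # hypothetical module holding the class above

    be = GPU(rng_seed=0)

    # elementwise add: out = x + y
    x = be.array(np.arange(6, dtype=np.float32).reshape(2, 3))
    y = be.ones((2, 3))
    out = be.empty((2, 3))
    be.add(x, y, out)

    # fully connected forward pass: pre_act = weights . inputs
    weights = be.array(np.random.uniform(-0.1, 0.1, (4, 2)).astype(np.float32))
    inputs = be.array(np.random.uniform(0.0, 1.0, (2, 8)).astype(np.float32))
    pre_act = be.empty((4, 8))
    be.fprop_fc(pre_act, inputs, weights)
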
Example #2
                # otherwise copy
                else:
                    devA2 = ng.empty(dimA, dtype=np.float32)
                    devB2 = ng.empty(dimB, dtype=np.float32)
                    devA2[:] = devA1
                    devB2[:] = devB1

                devC2 = ng.empty(dimC, dtype=np.float32)
                devC2[:] = devC1

                if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
                if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

                ng.dot(devA1,
                       devB1,
                       devC1,
                       alpha=alpha,
                       beta=beta,
                       repeat=repeat)

                cublas_dot(devA2,
                           devB2,
                           devC2,
                           alpha=alpha,
                           beta=beta,
                           repeat=repeat)

                partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
                partial2 = partial1[0:1, 0:1]

                diff = ng.max(abs(devC2 - devC1),
                              partial=partial1,
Example #3
                    cpuA = np.random.uniform(-1.0, 1.0,
                                             dimA).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0,
                                             dimB).astype(np.float32)
                    #cpuB = np.identity(n, dtype=dtype)

                    devA = ng.array(cpuA, dtype=dtype)
                    devB = ng.array(cpuB, dtype=dtype)
                    devC = ng.empty(dimC, dtype=dtype)

                    #repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

                    if op[0] == 't': cpuA, devA = cpuA.T, devA.T
                    if op[1] == 't': cpuB, devB = cpuB.T, devB.T

                    ng.dot(devA, devB, devC, repeat=1)

                    #context.synchronize()

                    cpuC = np.dot(cpuA, cpuB)

                    cpuD = devC.get()
                    diff = np.absolute(cpuC - cpuD)
                    max_diff = diff.max()
                    print(max_diff, cpuD.max())
                    if max_diff > 0.1 or max_diff != max_diff:
                        #print(m, n, k, max_diff)
                        print(cpuD[::max(m // 16, 1), ::max(n // 16, 1)])
                        print(cpuC[::max(m // 16, 1), ::max(n // 16, 1)])
                        print(diff[::max(m // 16, 1), ::max(n // 16, 1)])
                        exit()
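
The test above compares the device GEMM result against a NumPy reference and
treats NaN as a failure (max_diff != max_diff is true only for NaN). A
host-only sketch of the same check, using the same 0.1 tolerance:

    import numpy as np

    def gemm_matches_reference(dev_result, cpuA, cpuB, tol=0.1):
        """Return True if the device result matches np.dot(cpuA, cpuB)."""
        max_diff = np.absolute(np.dot(cpuA, cpuB) - dev_result).max()
        # a NaN max_diff compares unequal to itself and so fails the check
        return max_diff <= tol and max_diff == max_diff
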
Example #4
                devA2 = ng.empty(dimA, dtype=np.float32)
                devB2 = ng.empty(dimB, dtype=np.float32)
                devA2[:] = devA1
                devB2[:] = devB1

            devC2 = ng.empty(dimC, dtype=np.float32)

            if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
            if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

            glops16 = 0
            glops32 = 0
            glops64 = 0
            if op == "tn" and dtype is np.float16:
                # Experimental 128x16 gemm kernel
                glops16 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=16)
            if op != 'nt':
                glops32 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=32)
                glops64 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=64)
            glops128 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=128)

            glops = max(glops16, glops32, glops64, glops128)

            if glops16 == glops:
                fastest = 16
            elif glops32 == glops:
                fastest = 32
            elif glops64 == glops:
                fastest = 64
            else:
                fastest = 128
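
The cascade of comparisons at the end picks the smallest tile size that
reaches the best measured throughput. The same selection can be sketched with
a dict (the GFLOPS numbers below are made up for illustration; in the
benchmark they come from the repeated ng.dot calls):

    results = {16: 0.0, 32: 2140.0, 64: 2980.0, 128: 2980.0}
    # max() returns the first maximal key, so ties resolve to the smaller tile
    fastest = max(results, key=results.get)
    glops = results[fastest]
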
Example #5
            if data_type == "All Ones":
                cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
                cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
                #cpuB = np.identity(n, dtype=np.float32)
            else:
                cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

            devA = ng.array(cpuA, dtype=dtype)
            devB = ng.array(cpuB, dtype=dtype)
            devC = ng.empty(dimC, dtype=dtype)

            if op[0] == 't': cpuA, devA = cpuA.T, devA.T
            if op[1] == 't': cpuB, devB = cpuB.T, devB.T

            ng.dot(devA, devB, devC, repeat=repeat)

            if cpu:

                cpuC = np.dot(cpuA, cpuB)

                cpuD = devC.get()
                diff = np.absolute(cpuC - cpuD)

                print(diff.max())
                print(cpuD[::max(m//4, 1), ::max(n//4, 1)])
                print(cpuC[::max(m//4, 1), ::max(n//4, 1)])
                print(diff[::max(m//4, 1), ::max(n//4, 1)])

                # print cpuD
                # exit()
Example #6
                                    devA2[:] = devA1
                                    devB2[:] = devB1

                                devC2    = ng.empty(dimC, dtype=np.float32)
                                # devC2    = devC2s.share(dimC, dtype=np.float32)
                                devC2[:] = devC1

                                if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
                                if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

                                for tile in (32,64,128):
                                    if op == 'nt' and tile != 128:
                                        continue
                                    try: 

                                        ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, size=tile)
                                        context.synchronize()

                                        cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta)

                                        partial1 = ng.empty((devC1.shape[0],1), dtype=np.float32)
                                        partial2 = partial1[0:1,0:1]

                                        if ng.min(ng.finite(devC1), partial=partial1, out=partial2).get()[0,0] == 0.0:
                                            print("Error: NaN KCN: (%d,%d,%d) ab: (%f,%f) dtype: %d" %
                                                  (K,C,N, alpha,beta, itemsize))
                                            exit()

                                        diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
                                        mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]
                                        pctErr = 100 * diff / mean
Example #7
                    cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)
                    # cpuB = np.identity(n, dtype=dtype)

                    devA = ng.array(cpuA, dtype=dtype)
                    devB = ng.array(cpuB, dtype=dtype)
                    devC = ng.empty(dimC, dtype=dtype)

                    # repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

                    if op[0] == "t":
                        cpuA, devA = cpuA.T, devA.T
                    if op[1] == "t":
                        cpuB, devB = cpuB.T, devB.T

                    ng.dot(devA, devB, devC, repeat=1)

                    # context.synchronize()

                    cpuC = np.dot(cpuA, cpuB)

                    cpuD = devC.get()
                    diff = np.absolute(cpuC - cpuD)
                    max_diff = diff.max()
                    print(max_diff, cpuD.max())
                    if max_diff > 0.1 or max_diff != max_diff:
                        # print(m, n, k, max_diff)
                        print(cpuD[:: max(m // 16, 1), :: max(n // 16, 1)])
                        print(cpuC[:: max(m // 16, 1), :: max(n // 16, 1)])
                        print(diff[:: max(m // 16, 1), :: max(n // 16, 1)])
                        exit()
Example #8
                    devA2 = devA1
                    devB2 = devB1
                # otherwise copy
                else:
                    devA2 = ng.empty(dimA, dtype=np.float32)
                    devB2 = ng.empty(dimB, dtype=np.float32)
                    devA2[:] = devA1
                    devB2[:] = devB1

                devC2    = ng.empty(dimC, dtype=np.float32)
                devC2[:] = devC1

                if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
                if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

                ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, repeat=repeat)

                cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta, repeat=repeat)

                partial1 = ng.empty((devC1.shape[0],1), dtype=np.float32)
                partial2 = partial1[0:1,0:1]

                diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
                mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]

                #if diff > .1:
                print("Error: %.3f%%" % (100 * diff / mean))

                print("--------------------------------------------------------------------------------")

cublas.cublasDestroy(handle)
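
The figure printed above is the maximum absolute difference between the
NervanaGPU and cuBLAS results, normalised by the mean magnitude of the cuBLAS
result. The same percentage error in host-side NumPy terms (a sketch, assuming
both results have already been copied back to host arrays):

    import numpy as np

    def pct_error(c_ref, c_test):
        """Max absolute difference as a percentage of the mean |reference|."""
        return 100.0 * np.absolute(c_ref - c_test).max() / np.absolute(c_ref).mean()
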
Example #9
                devA2 = ng.empty(dimA, dtype=np.float32)
                devB2 = ng.empty(dimB, dtype=np.float32)
                devA2[:] = devA1
                devB2[:] = devB1

            devC2 = ng.empty(dimC, dtype=np.float32)

            if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
            if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

            glops16 = 0
            glops32 = 0
            glops64 = 0
            if op == "tn" and dtype is np.float16:
                # Experimental 128x16 gemm kernel
                glops16 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=16)
            if op != 'nt':
                glops32 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=32)
                glops64 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=64)
            glops128 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=128)

            glops = max(glops16, glops32, glops64, glops128)

            if glops16 == glops:
                fastest = 16
            elif glops32 == glops:
                fastest = 32
            elif glops64 == glops:
                fastest = 64
            else:
                fastest = 128
Example #10
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.
    Note that some functions defined in the generic Backend class, such as
    cross-map pooling and normalization and AdaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None

        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            state (dict): The full contents of the backend class instance,
                          except for the mem_pool which is on device and
                          cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator. Random numbers
        are generated on the host using numpy, then transferred to the device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
            logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
           decorate_fc (list): list of function names for the fully connected
                               layer forward/backward/update calls to time.
           decorate_conv (list): list of function names for the convolutional
                                 layer forward/backward/update calls to time.
           decorate_ew (list): list of function names for the element-wise
                               calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer.
        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in milliseconds (as reported by the CUDA event
                   timers) since the prior flop_timing_start call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Generate uniform random numbers with numpy on the host and convert
        them to a GPUTensor.  dtype must not be None.
        """
        ary = np.random.uniform(low, high, shape)
        return self.ng.array(ary, dtype=dtype, name=name)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return self.ng.array(ary, dtype=dtype, name=name)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                             layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                  in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """

        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
        """
        self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out,
                            alpha=1.0, repeat=1)

    def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape,
                   ifmshape, links, nifm, padding, stride, fpropbuf):
        """
        Forward propagate the inputs of a Pooling network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            op (string): The type of pooling operation to apply.  Only "max"
                         is currently supported by this backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                                 each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  pooled outputs for a single receptive field.
        """
        op = op.lower()
        if op == "max":
            self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize,
                   ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding,
                   stride, bpropbuf):
        """
        Backward propagate the error through a pooling network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            fouts (GPUTensor): Forward propagated outputs from the previous
                               layer.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            op (string): The type of pooling operation to apply.  Only "max"
                         is currently supported by this backend.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element in
                              each output feature map stored in out.
            fshape (tuple): Dimensions of each filter (typically height and
                            width).
            fpsize (int): The size of each filter.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           pooling operation.
            stride (int): Number of neurons to shift the filter at each step.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out,
                               repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s" % op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.sig(x, out=out)

        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x*slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr  (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None,  the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance.  If set to None,
                                        the variance is computed over all
                                        dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr-mean),  axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default), the
                                             values assigned to this Tensor
                                             will persist across multiple begin
                                             and end calls.  Setting to False
                                             may provide a performance increase
                                             if values do not need to be
                                             maintained across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return self.ng.array(ary, dtype=dtype, name=name)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis.  If multiple elements contain the maximum, only the
        index of the first is returned.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape
            axis (int): The dimension along which to find the maximum.  If set
                        to None, find the overall maximum index of a flattened
                        representation of a.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity)
        and ps_item (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Calculate the exponential moving average

        Arguments:
            mavg:  The running value of the moving average
            newval:  New sample to be added to the moving average
            rho:  Interpolation value
        """

        mavg[:] = rho * mavg + (1.0 - rho) * newval

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item, ss_item,
                   rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item
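
    # Illustrative restatement of the AdaDelta update implemented above,
    # assuming us_item holds the raw gradient g_t and ps_item the parameters:
    #
    #   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
    #   delta_t   = -sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
    #   x_t       = x_{t-1} + delta_t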

    def rms_update(self, params, updates, run_squares, velocity, scratch_space,
                   gamma, epsilon, learning_rate, momentum_coef):
        """
        RMSProp update: scale the gradient by learning_rate / rms(gradient),
        with an epsilon term for numerical stability, and optionally apply
        momentum.  params, run_squares and velocity are updated in place.
        """
        # Update running squares
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: minibatch mean (updated)
            xvar: minibatch variance (updated)
            gmean: running global mean (updated)
            gvar: running global variance (updated)
            rho: decay constant for the running global statistics
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)
        gmean[:] = gmean * rho + (1.0 - rho) * xmean
        gvar[:] = gvar * rho + (1.0 - rho) * xvar

        xvar[:] = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out
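
    # Illustrative summary of the compound batch-norm forward pass above
    # (statistics are taken per feature over the minibatch axis, axis=1):
    #
    #   xmean = mean(inputs, axis=1);  xvar = var(inputs, axis=1)
    #   gmean, gvar <- rho * running + (1 - rho) * minibatch
    #   xhat  = (inputs - xmean) / sqrt(xvar + eps)
    #   out   = gamma * xhat + beta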

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
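
# A minimal usage sketch of the GPU backend above (not part of the original
# listing).  It assumes a CUDA device, the nervanagpu package, and that the
# class is importable from the surrounding neon package; the import path and
# shapes below are assumptions for illustration only.
import numpy as np
from neon.backends.gpu import GPU  # assumed import path

be = GPU(rng_seed=0)
nin, nout, batch = 512, 128, 64

weights = be.array(np.random.randn(nout, nin).astype(np.float32))
inputs = be.array(np.random.rand(nin, batch).astype(np.float32))
deltas = be.array(np.random.randn(nout, batch).astype(np.float32))

acts = be.empty((nout, batch))
be.fprop_fc(acts, inputs, weights)      # acts = weights . inputs

grad_in = be.empty((nin, batch))
be.bprop_fc(grad_in, weights, deltas)   # grad_in = weights.T . deltas

grad_w = be.empty((nout, nin))
be.update_fc(grad_w, inputs, deltas)    # grad_w = deltas . inputs.T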
Example #11
0
            if data_type == "All Ones":
                cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
                cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
                #cpuB = np.identity(n, dtype=np.float32)
            else:
                cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
                cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

            devA = ng.array(cpuA, dtype=dtype)
            devB = ng.array(cpuB, dtype=dtype)
            devC = ng.empty(dimC, dtype=dtype)

            if op[0] == 't': cpuA, devA = cpuA.T, devA.T
            if op[1] == 't': cpuB, devB = cpuB.T, devB.T

            ng.dot(devA, devB, devC, repeat=repeat)

            if cpu:

                cpuC = np.dot(cpuA, cpuB)

                cpuD = devC.get()
                diff = np.absolute(cpuC - cpuD)

                print(diff.max())
                print(cpuD[::max(m // 4, 1), ::max(n // 4, 1)])
                print(cpuC[::max(m // 4, 1), ::max(n // 4, 1)])
                print(diff[::max(m // 4, 1), ::max(n // 4, 1)])

                # print cpuD
                # exit()
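
# The fragment above comes from a larger GEMM test loop whose setup is not
# shown.  A hypothetical sketch of the assumed surrounding context (names and
# values are inferred from the fragment, not taken from the original source):
import numpy as np
from nervanagpu import NervanaGPU  # assumed import for the ng object

ng = NervanaGPU()
dtype = np.float16                  # hypothetical element type
m, n, k = 512, 512, 512             # hypothetical GEMM sizes
op = 'nn'                           # one of 'nn', 'nt', 'tn', 'tt'
dimA = (m, k) if op[0] == 'n' else (k, m)
dimB = (k, n) if op[1] == 'n' else (n, k)
dimC = (m, n)
repeat, cpu, data_type = 1, True, "All Ones"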
Example #12
0
                                # devC2    = devC2s.share(dimC, dtype=np.float32)
                                devC2[:] = devC1

                                if op[0] == 't':
                                    devA1, devA2 = devA1.T, devA2.T
                                if op[1] == 't':
                                    devB1, devB2 = devB1.T, devB2.T

                                for tile in (32, 64, 128):
                                    if op == 'nt' and tile != 128:
                                        continue
                                    try:

                                        ng.dot(devA1,
                                               devB1,
                                               devC1,
                                               alpha=alpha,
                                               beta=beta,
                                               size=tile)
                                        context.synchronize()

                                        cublas_dot(devA2,
                                                   devB2,
                                                   devC2,
                                                   alpha=alpha,
                                                   beta=beta)

                                        partial1 = ng.empty(
                                            (devC1.shape[0], 1),
                                            dtype=np.float32)
                                        partial2 = partial1[0:1, 0:1]
Example #13
0
class MGPU(GPU):
    default_dtype = np.float32
    num_dev = 1
    is_dist = True

    def __init__(self, rng_seed, stochastic_round=False, device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{} and {}'.format(dev1, dev2))
            ctx1.pop()

    def make_events(self):
        evtlist = []
        for ctx in self.ctxs:
            ctx.push()
            evtlist.append(drv.Event())
            ctx.pop()
        return evtlist

    # These definitions are for performing grouped context commands.
    # This is experimental; the _stack mechanism should be removed for
    # actual usage.
    def begin_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = Block.update
            self.call_stack = []
        else:
            pass

    def end_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = None
            for idx, ctx in enumerate(self.ctxs):
                ctx.push()
                self.ng.stream = self.strms[idx]
                for method, args, kwargs in self.call_stack:
                    myargs = [a._tensorlist[idx] if isinstance(
                        a, MGPUTensor) else a for a in args]
                    mykwargs = {k: v._tensorlist[idx] if isinstance(
                        v, MGPUTensor) else v for k, v in kwargs.iteritems()}
                    getattr(super(MGPU, self), method)(*myargs, **mykwargs)
                self.ng.stream = None
                ctx.pop()
            self.call_stack = None
        else:
            pass

    @property
    def strms(self):
        return self._strms if self.async else self._nostrms

    @property
    def redstrms(self):
        return self._redstrms if self.async else self._nostrms

    def uniform(self, low=0.0, high=1.0, size=1, dtype=default_dtype,
                name=None, persist_values=True, ptype='replica'):
        """
        Generate uniform random numbers on the host with numpy and convert
        them to a GPUTensor.  A concrete dtype must be supplied (dtype=None
        is not supported).
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               name=None, persist_values=True, ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self.async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self.async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self, shape, dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case ??
        return self.empty((shape[0], shape[1] / self.num_dev), dtype,
                          persist_values=persist_values)

    def zeros_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.zeros(ary.shape, dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self, ary, dtype=default_dtype, persist_values=True,
                   name=None):
        result = self.empty(ary.shape, dtype=dtype,
                            persist_values=persist_values, name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        Scatters the array data in hbuf to the multi-GPU tensor dbuf.
        Assumes that dbuf holds an M x N shard per device and that hbuf is
        num_dev times larger, split into one contiguous chunk per device.
        Also assumes that hbuf and dbuf share the same dtype.
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards, the acts are replicas
        ubuf should be of size nout/num_dev x mbsz
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)
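
    # Illustrative layout note (an assumption based on the shapes asserted in
    # fprop_fc and bprop_fc above): the weights are sharded row-wise, so each
    # of the num_dev devices holds a (nout / num_dev, nin) slice, while the
    # input activations are replicated on every device.  Each device therefore
    # produces a (nout / num_dev, batch) fragment, and fragment_to_replica()
    # gathers those fragments into the full (nout, batch) output replica.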

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()

        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): backpropagated error
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist, self.strms,
                                              self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        This is the case for the scalar tensor
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf, strm) in enumerate(zip(
                self.ctxs, ary.tlist, self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
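
# A minimal usage sketch of the multi-GPU backend above (not part of the
# original listing).  It assumes two CUDA devices, pycuda, and an importable
# MGPU class; the import path, the shapes and the 'fragment' ptype string
# are assumptions for illustration only.
import numpy as np
from neon.backends.mgpu import MGPU  # assumed import path

be = MGPU(rng_seed=0, num_dev=2)

# 'replica' tensors hold identical (256, 64) data on every device.
acts = be.uniform(0.0, 1.0, (256, 64), ptype='replica')

# Any other ptype takes the sharded path: the host sample is num_dev times
# larger along the first axis and scatter() hands each device its own chunk.
wts = be.uniform(-0.1, 0.1, (128, 256), ptype='fragment')

be.synchronize()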
Example #14
0
class MGPU(GPU):
    default_dtype = np.float32
    num_dev = 1
    is_dist = True

    def __init__(self,
                 rng_seed,
                 stochastic_round=False,
                 device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{} and {}'.format(dev1, dev2))
            ctx1.pop()

    def make_events(self):
        evtlist = []
        for ctx in self.ctxs:
            ctx.push()
            evtlist.append(drv.Event())
            ctx.pop()
        return evtlist

    # These definitions are for performing grouped context commands.
    # This is experimental; the _stack mechanism should be removed for
    # actual usage.
    def begin_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = Block.update
            self.call_stack = []
        else:
            pass

    def end_stack(self, block, identifier):
        if block == Block.update:
            self.ng.block = None
            for idx, ctx in enumerate(self.ctxs):
                ctx.push()
                self.ng.stream = self.strms[idx]
                for method, args, kwargs in self.call_stack:
                    myargs = [
                        a._tensorlist[idx] if isinstance(a, MGPUTensor) else a
                        for a in args
                    ]
                    mykwargs = {
                        k:
                        v._tensorlist[idx] if isinstance(v, MGPUTensor) else v
                        for k, v in kwargs.iteritems()
                    }
                    getattr(super(MGPU, self), method)(*myargs, **mykwargs)
                self.ng.stream = None
                ctx.pop()
            self.call_stack = None
        else:
            pass

    @property
    def strms(self):
        return self._strms if self.async else self._nostrms

    @property
    def redstrms(self):
        return self._redstrms if self.async else self._nostrms

    def uniform(self,
                low=0.0,
                high=1.0,
                size=1,
                dtype=default_dtype,
                name=None,
                persist_values=True,
                ptype='replica'):
        """
        Generate uniform random numbers on the host with numpy and convert
        them to a GPUTensor.  A concrete dtype must be supplied (dtype=None
        is not supported).
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.uniform(low, high, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def normal(self,
               loc=0.0,
               scale=1.0,
               size=1,
               dtype=default_dtype,
               name=None,
               persist_values=True,
               ptype='replica'):
        """
        Gaussian/Normal random number sample generation
        """
        assert len(size) == 2
        result = self.empty(size, dtype=dtype, persist_values=persist_values)
        result.ptype = ptype
        beshape = size if ptype == 'replica' else (self.num_dev * size[0],
                                                   size[1])
        ary = np.random.normal(loc, scale, beshape).astype(dtype)
        self.set(result, ary)
        return result

    def synchronize(self):
        if not self.async:
            return
        for s in self.strms:
            s.synchronize()

    def redsynchronize(self):
        if not self.async:
            return
        for s in self.redstrms:
            s.synchronize()

    def allocate_fragment(self,
                          shape,
                          dtype=default_dtype,
                          persist_values=True):
        # TODO: set ptype to be fragment in this case ??
        return self.empty((shape[0], shape[1] / self.num_dev),
                          dtype,
                          persist_values=persist_values)

    def zeros_like(self,
                   ary,
                   dtype=default_dtype,
                   persist_values=True,
                   name=None):
        result = self.zeros(ary.shape,
                            dtype=dtype,
                            persist_values=persist_values)
        result.ptype = ary.ptype
        return result

    def empty_like(self,
                   ary,
                   dtype=default_dtype,
                   persist_values=True,
                   name=None):
        result = self.empty(ary.shape,
                            dtype=dtype,
                            persist_values=persist_values,
                            name=name)
        result.ptype = ary.ptype
        return result

    def set(self, tensor, data):
        assert isinstance(tensor, MGPUTensor)
        if tensor.ptype == 'replica':
            for dest, strm, ctx in zip(tensor.tlist, self.strms, self.ctxs):
                ctx.push()
                drv.memcpy_htod_async(dest.ptr, data, strm)
                ctx.pop()
            # tensor.copy_from(data)
        else:
            self.scatter(data, tensor)

    def scatter(self, hbuf, dbuf):
        '''
        Scatters the array data in hbuf to the multi-GPU tensor dbuf.
        Assumes that dbuf holds an M x N shard per device and that hbuf is
        num_dev times larger, split into one contiguous chunk per device.
        Also assumes that hbuf and dbuf share the same dtype.
        '''
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype
        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()

        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards, the acts are replicas
        ubuf should be of size nout/num_dev x mbsz
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()

        # Note, should be safe not to sync because each fragment is computed
        # on the same stream that originates the copy
        self.synchronize()
        self.reduce(out, ubuf)

    def update_fc(self, out, inputs, deltas, layer=None):
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for obuf, dbuf, ibuf, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                    inputs.tlist, self.strms,
                                                    self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(dbuf[off:(off + wtsz)], ibuf.T, obuf)
            ctx.pop()

        # self.synchronize()

    def update_fc_bias(self, err, out):
        """
        Compute the updated bias gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            err (GPUTensor): backpropagated error
        """
        wtsz = out.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]

        for ebuf, obuf, strm, ctx, off in zip(err.tlist, out.tlist, self.strms,
                                              self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.sum(ebuf[off:(off + wtsz)], axis=1, out=obuf)
            ctx.pop()

    def add_fc_bias(self, inputs, bias):
        """
        This is a no-op since we absorb the bias add into the fprop_fc call
        """
        pass

    def reduce_tensor(self, ary, async=True):
        '''
        This is the case for the scalar tensor
        '''
        assert ary.size == 1
        if ary.ptype == 'replica':
            self.ctxs[0].push()
            result = ary.tlist[0].get()
            self.ctxs[0].pop()
            return result

        result = np.zeros((self.num_dev, 1), ary.dtype)
        for i, (ctx, src_buf,
                strm) in enumerate(zip(self.ctxs, ary.tlist, self.strms)):
            ctx.push()
            drv.memcpy_dtoh_async(result[i], src_buf.ptr, strm)
            ctx.pop()
        self.synchronize()
        return result.sum()
Example #15
0
                devA2[:] = devA1
                devB2[:] = devB1

            devC2 = ng.empty(dimC, dtype=np.float32)

            if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
            if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

            glops32x128 = 0
            glops128x32 = 0
            glops128x64 = 0

            if op != 'tn':
                glops32x128 = ng.dot(devA1,
                                     devB1,
                                     devC1,
                                     repeat=repeat,
                                     size='32x128')
            if op != 'nt':
                glops128x32 = ng.dot(devA1,
                                     devB1,
                                     devC1,
                                     repeat=repeat,
                                     size='128x32')
                glops128x64 = ng.dot(devA1,
                                     devB1,
                                     devC1,
                                     repeat=repeat,
                                     size='128x64')
            glops128x128 = ng.dot(devA1,
                                  devB1,