for K, C, N in ((3072,3072*1,32),(3072,3072*1,64),(3072,3072*1,96),(3072,3072*1,128),
                (3072,3072*2,32),(3072,3072*2,64),(3072,3072*2,96),(3072,3072*2,128),
                (3072,3072*3,32),(3072,3072*3,64),(3072,3072*3,96),(3072,3072*3,128),
                (3072,3072*4,32),(3072,3072*4,64),(3072,3072*4,96),(3072,3072*4,128),):
              #(3072,3072,32+128*0),(3072,3072,64+128*0),(3072,3072,96+128*0),(3072,3072,128+128*0),
              #(3072,3072,32+128*1),(3072,3072,64+128*1),(3072,3072,96+128*1),(3072,3072,128+128*1),
              #(3072,3072,32+128*2),(3072,3072,64+128*2),(3072,3072,96+128*2),(3072,3072,128+128*2),
              #(3072,3072,32+128*3),(3072,3072,64+128*3),(3072,3072,96+128*3),(3072,3072,128+128*3),):

    for op, dimA, dimB, dimC in (
            ("nn", (K,C), (C,N), (K,N)),   # fprop
            ("tn", (K,C), (K,N), (C,N)),   # bprop
            ("nt", (K,N), (C,N), (K,C))):  # update

        repeat = 5000 if C <= 3072 else 500

        devA1 = ng.empty(dimA, dtype=dtype)
        devB1 = ng.empty(dimB, dtype=dtype)
        devC1 = ng.empty(dimC, dtype=dtype)

        # fill with uniform randoms from -1 to 1
        devA1[:] = 2 * (.5 - ng.rand())
        devB1[:] = 2 * (.5 - ng.rand())

        # just alias if same dtype
        if dtype is np.float32:
            devA2 = devA1
            devB2 = devB1
        # otherwise copy
        else:
            devA2 = ng.empty(dimA, dtype=np.float32)
            devB2 = ng.empty(dimB, dtype=np.float32)
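# --- Added illustration (not part of the original benchmark) ---
# The three ("nn", "tn", "nt") cases above are the linear-layer passes.
# A minimal NumPy sketch of the same shape algebra with toy sizes:
import numpy as np

def linear_passes_reference(K=4, C=4, N=2):
    W = np.random.uniform(-1, 1, (K, C))   # weights
    A = np.random.uniform(-1, 1, (C, N))   # activations
    E = np.random.uniform(-1, 1, (K, N))   # output errors
    O = W.dot(A)       # fprop  "nn": (K,C) x (C,N) -> (K,N)
    D = W.T.dot(E)     # bprop  "tn": (C,K) x (K,N) -> (C,N)
    U = E.dot(A.T)     # update "nt": (K,N) x (N,C) -> (K,C)
    return O, D, U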
for k in size:
    print("op,M,N,K: ", op, m, n, k)

    dimA = (m, k) if op[0] == 'n' else (k, m)
    dimB = (k, n) if op[1] == 'n' else (n, k)
    dimC = (m, n)

    cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
    cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)
    #cpuB = np.identity(n, dtype=dtype)

    devA = ng.array(cpuA, dtype=dtype)
    devB = ng.array(cpuB, dtype=dtype)
    devC = ng.empty(dimC, dtype=dtype)

    #repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

    if op[0] == 't':
        cpuA, devA = cpuA.T, devA.T
    if op[1] == 't':
        cpuB, devB = cpuB.T, devB.T

    ng.dot(devA, devB, devC, repeat=1)
    #context.synchronize()

    cpuC = np.dot(cpuA, cpuB)
    cpuD = devC.get()

    diff = np.absolute(cpuC - cpuD)
    max_diff = diff.max()
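# --- Added note (illustrative, not in the original script) ---
# max_diff is an absolute error, which is scale dependent; a relative check
# such as np.allclose is often more informative, with a looser tolerance for
# reduced-precision dtypes:
def gemm_close(ref, test, dt):
    rtol = 1e-2 if dt == np.float16 else 1e-5   # assumed tolerances
    return np.allclose(ref, test, rtol=rtol, atol=rtol)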
    layers.append(layer)

    # find the size of the largest buffers so they can be shared
    if layer.sizeF > max_weights:
        max_weights = layer.sizeF
        max_weight_layer = layer

    if layer.sizeO > max_deltas:
        max_deltas = layer.sizeO
        max_delta_layer = layer

# for layer in sorted(layers, key=lambda l: l.sizeO, reverse=True):
#     print "%d %s" % (layer.sizeO, layer)

# Init shared buffers (assumes consistent dtype for now)
shared_deltas[0] = ng.empty(max_delta_layer.dimO2, dtype=max_delta_layer.dtype)
shared_deltas[1] = ng.empty(max_delta_layer.dimO2, dtype=max_delta_layer.dtype)
shared_weights = ng.empty(max_weight_layer.dimF2, dtype=max_weight_layer.dtype)

prev_layer = None
delta = False
for layer in layers:
    print layer

    # Initialize buffers. Alternate shared delta buffer.
    # One layer can't have the same buffer for both error in and error out.
    layer.init_activations()
    layer.init_weights(shared=shared_weights)
# bprop(nn): NK   x KC = NC
# updat(tn): NK^T x NC = KC

repeat = 2000

for K, C, N in ((3072,3072,32),):

    total = 0
    for op, dimA, dimB, dimC in (
            ("nn", (K,C), (C,N), (K,N)),    # fprop
            ("tn", (K,C), (K,N), (C,N)),    # bprop
            ("nt", (K,N), (C,N), (K,C)),):  # update

        devA = ng.empty(dimA, dtype=np.float32)
        devB = ng.empty(dimB, dtype=np.float32)
        devC = ng.empty(dimC, dtype=np.float32)

        # fill with uniform randoms from -1 to 1
        devA[:] = 2 * (.5 - ng.rand())
        devB[:] = 2 * (.5 - ng.rand())

        total += cublas_dot(op, devA, devB, devC, repeat=repeat, warmup=True)

    print "N2 Total: ", total

    total = 0
    for op, dimA, dimB, dimC in (
            ("nt", (N,C), (K,C), (N,K)),    # fprop
            ("nn", (N,K), (K,C), (N,C)),    # bprop
dimA = (m,k) if op[0] == 'n' else (k,m)
dimB = (k,n) if op[1] == 'n' else (n,k)
dimC = (m,n)

if data_type == "All Ones":
    cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
    cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
    #cpuB = np.identity(n, dtype=np.float32)
else:
    cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
    cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

devA = ng.array(cpuA, dtype=dtype)
devB = ng.array(cpuB, dtype=dtype)
devC = ng.empty(dimC, dtype=dtype)

if op[0] == 't':
    cpuA, devA = cpuA.T, devA.T
if op[1] == 't':
    cpuB, devB = cpuB.T, devB.T

ng.dot(devA, devB, devC, repeat=repeat)

if cpu:
    cpuC = np.dot(cpuA, cpuB)
    cpuD = devC.get()
    diff = np.absolute(cpuC - cpuD)
    print diff.max()
    print cpuD[::max(m//4,1), ::max(n//4,1)]
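# --- Added illustration (assumed formula, not from the script) ---
# Useful when eyeballing these timings: an (M,K) x (K,N) GEMM performs about
# 2*M*N*K flops, so for `repeat` iterations timed in milliseconds:
def gemm_gflops(m, n, k, repeat, msecs):
    return 2.0 * m * n * k * repeat / (msecs * 1e6)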
class GPU(Backend):
    """
    Sets up a NervanaGPU based backend for matrix operations.

    Note that some functions defined in the generic Backend class, such as
    cross-map pooling, normalization, and AdaDelta, are not implemented for
    this backend.
    """
    default_dtype = np.float32

    def __init__(self, rng_seed, stochastic_round=False, device_id=0):
        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized NervanaGPU with stochastic_round=%s",
                    stochastic_round)
        self.rng_seed = rng_seed
        self.rng_init()
        self.device_id = device_id if device_id is not None else 0

    def __getstate__(self):
        """
        Defines what and how we go about serializing an instance of this
        class.

        Returns:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        if hasattr(self, 'mem_pool') and self.mem_pool is not None:
            self.mem_pool_pickle = {'shape': self.mem_pool.shape,
                                    'dtype': np.float32}
            self.mem_pool = None
        return self.__dict__

    def __setstate__(self, state):
        """
        Defines how we go about deserializing into an instance of this class.

        Arguments:
            self.__dict__: The full contents of the backend class instance,
                           except for the mem_pool which is on device and
                           cannot be serialized.
        """
        self.__dict__.update(state)
        self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'],
                                      dtype=self.mem_pool_pickle['dtype'])

    def init_mempool(self, shape, dtype=default_dtype):
        """
        Allocates a memory pool for temporary storage
        """
        self.mem_pool = self.ng.empty(shape, dtype=dtype)

    def alloc_host_mem(self, shape, dtype):
        return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)

    def create_stream(self):
        return drv.Stream()

    def async_copy(self, dest, src, stream=None):
        drv.memcpy_htod_async(dest.gpudata, src, stream)

    def rng_init(self):
        """
        Initialize and seed the pseudo random number generator. Random numbers
        are generated on the host using numpy, then transferred to device.
        """
        seed = None
        if 'rng_seed' in self.__dict__:
            seed = self.rng_seed
        logger.info("Seeding random number generator with: %s", str(seed))
        np.random.seed(seed)

    def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew):
        """
        Initialize FLOP timing.  Wraps the specified MOP calls via a decorator
        to record elapsed time and number of operations.

        Arguments:
            decorate_fc (list): string giving the function names of fully
                                connected layer forward/backward/update calls
                                to time.
            decorate_conv (list): string giving the function names of
                                  convolutional layer forward/backward/update
                                  calls to time.
            decorate_ew (list): string giving the function names of
                                element-wise calls to time.

        Notes:
            Must be called prior to first flop_timing_start call
        """
        self.start = drv.Event()
        self.end = drv.Event()
        self.flop_timer = FlopsDecorator(self)
        self.flop_timer.decorate(decorate_fc=decorate_fc,
                                 decorate_conv=decorate_conv,
                                 decorate_ew=decorate_ew)

    def flop_timing_start(self):
        """
        Start a new FLOP timer.

        Returns:
            None: dummy value (not used)
        """
        return self.start.record()

    def flop_timing_finish(self, start_time):
        """
        Complete current FLOP timing.

        Arguments:
            start_time (unused): ignored.

        Returns:
            float: elapsed time in milliseconds since prior flop_timing_start
                   call.
        """
        self.end.record()
        self.end.synchronize()
        return self.end.time_since(self.start)

    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Generate a numpy random number array and convert it to a GPUTensor.
        If called with dtype=None it will probably explode
        """
        ary = np.random.uniform(low, high, shape)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation
        """
        ary = np.random.normal(loc, scale, size)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            ofmshape (tuple): Dimensions of each output feature map (typically
                              number of height and width neurons).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              number of height and width neurons).  For this
                              backend we expect these values to be square.
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  convolved outputs for a single receptive
                                  field.  Not used for this backend.
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        '''
        N: Number of images in mini-batch
        C: Number of input feature maps
        K: Number of output feature maps

        D: Depth  of input image
        H: Height of input image
        W: Width  of input image

        T: Depth  of filter kernel
        R: Height of filter kernel
        S: Width  of filter kernel
        '''
        self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out,
                           alpha=1.0, repeat=1)

    def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, padding, stride, nifm, ngroups, bpropbuf,
                   local=False):
        """
        Backward propagate the error through a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
        """
        self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out,
                           alpha=1.0, repeat=1)

    def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize,
                    ofmlocs, ifmshape, links, nifm, padding, stride, ngroups,
                    fwidth, updatebuf, local=False, layer=None):
        """
        Compute the updated gradient for a convolutional network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer), or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer
            ofmshape (tuple): Dimensions of each output feature map (typically
                              height and width).
            ofmsize (int): Total size of each output feature map.
            ofmlocs (GPUTensor): Indices giving the location of each element
                                 in each output feature map stored in out.
            ifmshape (tuple): Dimensions of each input feature map (typically
                              height and width).
            links (GPUTensor): Input receptive field indices.
            nifm (int): Total number of input feature maps.
            padding (int): Number of additional elements to include along each
                           dimension of each local receptive field during the
                           convolution operation.
            stride (int): Number of neurons to shift the filter at each step.
            ngroups (int): Number of groups.
            fwidth (int): Filter width.
            updatebuf (GPUTensor): Temporary storage buffer used to hold the
                                   updated gradient for a single receptive
                                   field
            local (bool, optional): Whether to do local filtering (True) or
                                    convolution (False, the default)
            layer (Layer): The layer object.
""" self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out, alpha=1.0, repeat=1) def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape, ifmshape, links, nifm, padding, stride, fpropbuf): """ Forward propagate the inputs of a Pooling network layer to produce output pre-activations (ready for transformation by an activation function). Arguments: out (GPUTensor): Where to store the forward propagated results. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically number of height and width neurons). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). ifmshape (tuple): Dimensions of each input feature map (typically number of height and width neurons). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. fpropbuf (GPUTensor): Temporary storage buffer used to hold the pooled outputs for a single receptive field. """ op = op.lower() if op == "max": self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1) else: raise AttributeError("unexpected pooling op type: %s", op) def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize, ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding, stride, bpropbuf): """ Backward propagate the error through a pooling network layer. Arguments: out (GPUTensor): Where to store the backward propagated errors. fouts (GPUTensor): Forward propagated outputs from the previous layer. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. deltas (GPUTensor): The error values for this layer op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically height and width). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). fpsize (int): The size of each filter. ifmshape (tuple): Dimensions of each input feature map (typically height and width). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. 
            bpropbuf (GPUTensor): Temporary storage buffer used to hold the
                                  backpropagated error for a single receptive
                                  field
        """
        op = op.lower()
        if op == "max":
            self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas,
                               grad_I=out, repeat=1)
        else:
            raise AttributeError("unexpected pooling op type: %s", op)

    def logistic(self, x, out):
        """
        Logistic sigmoid nonlinearity, 1/(1+exp(-x))

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.sig(x, out=out)
        return out

    def rectlin(self, x, out):
        """
        Rectified Linear nonlinearity

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor
        """
        self.ng.maximum(x, 0., out=out)
        return out

    def rectleaky(self, x, slope, out):
        out[:] = self.ng.maximum(x, x*slope)

    def rectleaky_derivative(self, x, slope, out):
        out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope

    def sum(self, tsr, axes, out):
        """
        Sum

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.sum(tsr, axis=axes, out=out)
        return out

    def mean(self, tsr, axes, out):
        """
        Calculates the arithmetic mean of the elements along the specified
        axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.mean(tsr, axis=axes, out=out)
        return out

    def min(self, tsr, axes, out):
        """
        Calculates the minimum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.min(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.min(tsr, axis=axes, out=out)
        return out

    def max(self, tsr, axes, out):
        """
        Calculates the maximum of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): Input tensor
            axes (int): Axis along which the reduction is performed. If axes
                        is None, the tensor is flattened and reduced over
                        both dimensions.
            out (GPUTensor): Output tensor
        """
        if axes is None:
            sze = tsr.shape[0]*tsr.shape[1]
            self.ng.max(tsr.reshape(sze, 1), axis=0, out=out)
        else:
            self.ng.max(tsr, axis=axes, out=out)
        return out

    def variance(self, tsr, axes, out, mean=None):
        """
        Calculates the variance of the elements along the specified axes.

        Arguments:
            tsr (GPUTensor): the tensor on which to compute the variance
            axes (int, list, optional): the dimension(s) along which to
                                        compute the variance.  If set to None,
                                        we compute the variance over all
                                        dimensions.
            out (GPUTensor): where the result will be stored.
            mean (GPUTensor): the tensor containing the mean of tsr

        Returns:
            GPUTensor: reference to out
        """
        if mean is None:
            logger.error("GPUTensor requires mean to be specified.")
            raise ValueError("mean not specified")
        self.ng.mean(self.ng.square(tsr-mean), axis=axes, out=out)
        return out

    def fabs(self, x, out):
        """
        Calculates absolute value of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.fabs(x, out=out)
        return out

    def sqrt(self, x, out):
        """
        Calculates square root of the elements in a tensor

        Arguments:
            x (GPUTensor): Input tensor
            out (GPUTensor): Output tensor

        Returns:
            GPUTensor: reference to out
        """
        self.ng.sqrt(x, out=out)
        return out

    def zeros(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with zeros.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.zeros(shape, dtype=dtype)

    def ones(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor and fill it with ones.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.ones(shape, dtype=dtype)

    def empty(self, shape, dtype=default_dtype, persist_values=True):
        """
        Allocate a new GPUTensor.

        Arguments:
            shape (tuple): Shape of the desired GPUTensor
            dtype (dtype): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls

        Returns:
            GPUTensor: output
        """
        return self.ng.empty(shape, dtype=dtype)

    def array(self, ary, dtype=default_dtype, persist_values=True, name=None,
              allocator=drv.mem_alloc):
        """
        Allocate a new GPUTensor and fill it with supplied numpy array.

        Arguments:
            ary (ndarray): Numpy array with source data
            dtype (dtype, optional): Optional datatype
            persist_values (bool, optional): If set to True (the default),
                                             the values assigned to this
                                             Tensor will persist across
                                             multiple begin and end calls.
                                             Setting to False may provide a
                                             performance increase if values
                                             do not need to be maintained
                                             across such calls
            name (string): Name for the GPUTensor
            allocator (pycuda): Pycuda memory allocator

        Returns:
            GPUTensor: output
        """
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def add(self, left, right, out):
        """
        Elementwise addition

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.add(left, right, out=out)
        return out

    def subtract(self, left, right, out):
        """
        Elementwise subtraction

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.subtract(left, right, out=out)
        return out

    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out.  Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping between a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis.  If multiple elements contain the maximum, only the
        elements of the first are returned.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices
            axis (int): The dimension along which to find the maximum.  If
                        set to None, find the overall maximum index of a
                        flattened representation of a.
            out (GPUTensor): Where to store the result.  Should be of the
                             appropriate type and expected shape

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity.  Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.
        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out

    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity) and ps_item (updated
        weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item,
                   ss_item, rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delta^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity,
                   scratch_space, gamma, epsilon, learning_rate,
                   momentum_coef):

        # Update running squares
        run_squares[:] = gamma * run_squares + (1.
                                                - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity

    def fprop_bn_compound(self, inputs, beta, gamma, eps, xvar, xhat, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xvar: variance (updated)
            xhat: normalized input (updated)
            out: normalized and rescaled input (updated)
        """
        xvar[:] = self.ng.reciprocal(
            self.ng.sqrt(self.ng.var(inputs, axis=1) + eps))
        xhat[:] = xvar * (inputs - self.ng.mean(inputs, axis=1))
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
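# --- Added host-side reference (a sketch, not part of the backend) ---
# NumPy equivalent of fprop_bn_compound above, handy for checking the GPU
# result.  Note that the tensor the backend calls `xvar` actually holds the
# reciprocal standard deviation.
import numpy as np

def fprop_bn_reference(inputs, beta, gamma, eps):
    mean = inputs.mean(axis=1, keepdims=True)
    rstd = 1.0 / np.sqrt(inputs.var(axis=1, keepdims=True) + eps)
    xhat = rstd * (inputs - mean)
    return xhat * gamma + beta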
cpuI[-1,:] = 0.0

# cpu output arrays
cpuO = np.zeros(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)
cpuU = np.zeros(slicable(dimF), dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devF = ng.array(cpuF.reshape(dimF), dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)

devO = devB = devU = 0

if "fprop" in ops:
    devO = ng.empty(dimO, dtype=dtype)
    ng.fprop_conv(conv, devI, devF, devO, alpha=1.0, repeat=repeat)

if "bprop" in ops:
    devB = ng.empty(dimI, dtype=dtype)
    ng.bprop_conv(conv, devF, devE, devB, alpha=1.0, repeat=repeat)

if "update" in ops:
    devU = ng.empty(dimF, dtype=dtype)
    ng.update_conv(conv, devI, devE, devU, alpha=1.0, repeat=repeat)

def pixel_indices(mt, pr, qs):

    T,R,S = conv.TRS
    D,H,W = conv.DHW
N,C,K = conv.NCK
D,H,W = conv.DHW
T,R,S = conv.TRS
M,P,Q = conv.MPQ

pad_d, pad_h, pad_w = conv.padding
str_d, str_h, str_w = conv.strides

alpha, beta = (1.0, 0.0)

dimI = conv.dimI2
dimF = conv.dimF2
dimO = conv.dimO2

print "cudnn:"

cuI = ng.empty(dimI[::-1], dtype=np.float32)
cuF = ng.empty(dimF[::-1], dtype=np.float32)
cuE = ng.empty(dimO[::-1], dtype=np.float32)
cuB = ng.empty(dimI[::-1], dtype=np.float32)
cuU = ng.empty(dimF[::-1], dtype=np.float32)
cuO = ng.empty(dimO[::-1], dtype=np.float32)

cuI[:] = 2 * (.5 - ng.rand())
cuF[:] = 2 * (.5 - ng.rand())
cuE[:] = 2 * (.5 - ng.rand())

#print drv.mem_get_info()

I_data = ctypes.c_void_p(int(cuI.gpudata))
F_data = ctypes.c_void_p(int(cuF.gpudata))
O_data = ctypes.c_void_p(int(cuO.gpudata))
E_data = ctypes.c_void_p(int(cuE.gpudata))
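# --- Added note (assumption about intent, not from the original) ---
# The dim[::-1] reversals above appear to hand cuDNN the transposed layout of
# the nervanagpu tensors, and the raw device pointers are passed to the cuDNN
# C API as plain ctypes void pointers.  The pointer pattern in isolation:
import ctypes

def device_ptr(tensor):
    # tensor.gpudata is a pycuda allocation; int() yields its device address
    return ctypes.c_void_p(int(tensor.gpudata))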
dimO = (X, N, K)

if ones:
    cpuI = np.ones(dimI, dtype=np.float32)
    cpuE = np.ones(dimO, dtype=np.float32)
    cpuW = np.ones(dimW, dtype=np.float32)
else:
    cpuI = np.random.uniform(-1.0, 1.0, dimI).astype(dtype).astype(np.float32)
    cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(dtype).astype(np.float32)
    cpuW = np.random.uniform(-1.0, 1.0, dimW).astype(dtype).astype(np.float32)

devI = ng.array(cpuI, dtype=dtype)
devE = ng.array(cpuE, dtype=dtype)
devW = ng.array(cpuW, dtype=dtype)

devO = ng.empty(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)
devU = ng.empty(dimW, dtype=dtype)

if Nin:
    ng.batched_dot(devW,   devI,   devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devW.T, devE,   devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE,   devI.T, devU, repeat=repeat, size=size)  # update
else:
    ng.batched_dot(devI,   devW.T, devO, repeat=repeat, size=size)  # fprop
    ng.batched_dot(devE,   devW,   devB, repeat=repeat, size=size)  # bprop
    ng.batched_dot(devE.T, devI,   devU, repeat=repeat, size=size)  # update

if cpu:
    cpuO = np.empty(dimO, dtype=np.float32)
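# --- Added host-side sketch (assumed semantics, ignoring the X batching) ---
# Per batch item, the two layouts exercised above reduce to ordinary GEMMs:
import numpy as np

def fc_reference(W, I, E, Nin=True):
    if Nin:
        return W.dot(I), W.T.dot(E), E.dot(I.T)   # fprop, bprop, update
    else:
        return I.dot(W.T), E.dot(W), E.T.dot(I)   # fprop, bprop, update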
    layers.append(layer)

    # find the size of the largest buffers so they can be shared
    if layer.sizeF > max_weights:
        max_weights = layer.sizeF
        max_weight_layer = layer

    if layer.sizeO > max_deltas:
        max_deltas = layer.sizeO
        max_delta_layer = layer

# for layer in sorted(layers, key=lambda l: l.sizeO, reverse=True):
#     print("%d %s" % (layer.sizeO, layer))

# Init shared buffers (assumes consistent dtype for now)
shared_deltas[0] = ng.empty(max_delta_layer.dimO2, dtype=max_delta_layer.dtype)
shared_deltas[1] = ng.empty(max_delta_layer.dimO2, dtype=max_delta_layer.dtype)
shared_weights = ng.empty(max_weight_layer.dimF2, dtype=max_weight_layer.dtype)

prev_layer = None
delta = False
for layer in layers:
    print(layer)

    # Initialize buffers. Alternate shared delta buffer.
    # One layer can't have the same buffer for both error in and error out.
    layer.init_activations()
    layer.init_weights(shared=shared_weights)
    layer.init_deltas(shared=shared_deltas[delta])
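# --- Added illustration (inferred from the comment above) ---
# Alternating a boolean index ping-pongs consecutive layers between the two
# shared buffers, so a layer's error-in and error-out never alias:
delta_demo = False
for i in range(4):
    print("layer %d reads/writes shared_deltas[%d]" % (i, delta_demo))
    delta_demo = not delta_demo   # next layer gets the other buffer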
    # find the size of the largest buffers so they can be shared
    if layer.sizeF > max_weights:
        max_weights = layer.sizeF
        max_weight_layer = layer

    if layer.sizeI > max_deltas and type(prev_layer) is not DataLayer:
        max_deltas = layer.sizeI
        max_delta_layer = layer

    prev_layer = layer
    layers.append(layer)

# Init shared buffers (assumes consistent dtype for now)
shared_deltas.append(
    ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
shared_deltas.append(
    ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
if inception:
    shared_deltas.append(
        ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))
    shared_deltas.append(
        ng.empty(max_delta_layer.dimI, dtype=max_delta_layer.dtype))

shared_updates = ng.empty(max_weight_layer.dimF, dtype=np.float32)

for i, layer in enumerate(layers):

    if verbose:
        print(layer)

    # Initialize buffers. Alternate shared delta buffer.
cpuI = np.random.uniform(0.0, 9.4, slicable(dimI,1)).astype(np.float16).astype(np.float32)

# zero pad the last row of cpu input for the sake of numpy
if pool.op == "max":
    cpuI[-1,:] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1,:] = 0

# cpu output arrays
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI,1), dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1,:].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

ng.fprop_pool(pool, devI, devO, repeat=repeat)
ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)

def pixel_indices(kj, mt, pr, qs):

    C = pool.C
    J,T,R,S = pool.JTRS
    D,H,W = pool.DHW
    HW = H*W
    DHW = D*H*W
    imax = C*D*H*W

    idx = []
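# --- Added note (illustrative, not from the original script) ---
# Why the padding row above differs by op: max pooling must never select a
# padded element, so pads get the most negative representable value (the
# identity for max); average pooling wants pads to contribute nothing, so
# they get zero (the identity for sum).
import numpy as np
PAD_MAX = np.finfo(np.float32).min   # never wins a max()
PAD_AVG = 0.0                        # adds nothing to a sum()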
class GPU(Backend): """ Sets up a NervanaGPU based backend for matrix operations. Note that some functions defined in the generic Backend class such as are cross-map pooling and normalization and adaDelta are not implemented for this backend. """ default_dtype = np.float32 def __init__(self, rng_seed, stochastic_round=False, device_id=0): self.ng = NervanaGPU(stochastic_round=stochastic_round) logger.info("Initialized NervanaGPU with stochastic_round=%s", stochastic_round) self.rng_seed = rng_seed self.rng_init() self.device_id = device_id if device_id is not None else 0 def __getstate__(self): """ Defines what and how we go about serializing an instance of this class. Returns: self.__dict__: The full contents of the backend class instance, except for the mem_pool which is on device and cannot be serialized. """ if hasattr(self, 'mem_pool') and self.mem_pool is not None: self.mem_pool_pickle = {'shape': self.mem_pool.shape, 'dtype': np.float32} self.mem_pool = None return self.__dict__ def __setstate__(self, state): """ Defines how we go about deserializing into an instance of this class. Arguments: self.__dict__: The full contents of the backend class instance, except for the mem_pool which is on device and cannot be serialized. """ self.__dict__.update(state) self.mem_pool = self.ng.empty(self.mem_pool_pickle['shape'], dtype=self.mem_pool_pickle['dtype']) def init_mempool(self, shape, dtype=default_dtype): """ Allocates a memory pool for temporary storage """ self.mem_pool = self.ng.empty(shape, dtype=dtype) def alloc_host_mem(self, shape, dtype): return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0) def create_stream(self): return drv.Stream() def async_copy(self, dest, src, stream=None): drv.memcpy_htod_async(dest.gpudata, src, stream) def rng_init(self): """ Initialize and seed the pseudo random number genrator. Random numbers are generated on the host using numpy, then transfered to device. """ seed = None if 'rng_seed' in self.__dict__: seed = self.rng_seed logger.info("Seeding random number generator with: %s", str(seed)) np.random.seed(seed) def flop_timing_init(self, decorate_fc, decorate_conv, decorate_ew): """ Initialize FLOP timing. Wraps the specified MOP calls via a decorator to record elapsed time and number of operations. Arguments: decorate_fc (list): string giving the function names of fully connected layer forward/backward/update calls to time. decorate_conv (list): string giving the function names of convolutional layer forward/backward/update calls to time. decorate_ew (list): string giving the function names of element-wise calls to time. Notes: Must be called prior to first flop_timing_start call """ self.start = drv.Event() self.end = drv.Event() self.flop_timer = FlopsDecorator(self) self.flop_timer.decorate(decorate_fc=decorate_fc, decorate_conv=decorate_conv, decorate_ew=decorate_ew) def flop_timinig_start(self): """ Start a new FLOP timer. Returns: None: dummy value (not used) """ return self.start.record() def flop_timing_finish(self, start_time): """ Complete current FLOP timing. Arguments: start_time (unused): ignored. Returns: float: elapsed time in seconds since prior flop_timing_start call. """ self.end.record() self.end.synchronize() return self.end.time_since(self.start) def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype, persist_values=True, name=None, allocator=drv.mem_alloc): """ generate numpy random number and convert to a GPUTensor. 
If called with dype=None it will probably explode """ ary = np.random.uniform(low, high, shape) return self.ng.array(ary, dtype=dtype, name=name) def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype, persist_values=True, name=None, allocator=drv.mem_alloc): """ Gaussian/Normal random number sample generation """ ary = np.random.normal(loc, scale, size) return self.ng.array(ary, dtype=dtype, name=name) def fprop_fc(self, out, inputs, weights, layer=None): """ Forward propagate the inputs of a fully connected network layer to produce output pre-activations (ready for transformation by an activation function). Arguments: out (GPUTensor): Where to store the forward propagated results. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. weights (GPUTensor): The weight coefficient values for this layer. layer (Layer): The layer object. """ self.ng.dot(weights, inputs, out) def bprop_fc(self, out, weights, deltas, layer=None): """ Backward propagate the error through a fully connected network layer. Arguments: out (GPUTensor): Where to store the backward propagated errors. weights (GPUTensor): The weight coefficient values for this layer. deltas (GPUTensor): The error values for this layer layer (Layer): The layer object. """ self.ng.dot(weights.T, deltas, out) def update_fc(self, out, inputs, deltas, layer=None): """ Compute the updated gradient for a fully connected network layer. Arguments: out (GPUTensor): Where to store the updated gradient value. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. deltas (GPUTensor): The error values for this layer layer (Layer): The layer object. """ self.ng.dot(deltas, inputs.T, out) def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs, ifmshape, links, nifm, padding, stride, ngroups, fpropbuf, local=False): """ Forward propagate the inputs of a convolutional network layer to produce output pre-activations (ready for transformation by an activation function). Arguments: out (GPUTensor): Where to store the forward propagated results. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. weights (GPUTensor): The weight coefficient values for this layer. ofmshape (tuple): Dimensions of each output feature map (typically number of height and width neurons). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. ifmshape (tuple): Dimensions of each input feature map (typically number of height and width neurons). For this backend we expect these values to be square. links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the convolution operation. stride (int): Number of neurons to shift the filter at each step. ngroups (int): Number of groups. fpropbuf (GPUTensor): Temporary storage buffer used to hold the convolved outputs for a single receptive field. Not used for this backend. 
local (bool, optional): Whether to do local filtering (True) or convolution (False, the default) """ ''' N: Number of images in mini-batch C: Number of input feature maps K: Number of output feature maps D: Depth of input image H: Height of input image W: Width of input image T: Depth of filter kernel R: Height of filter kernel S: Width of filter kernel ''' self.ng.fprop_conv(layer=fpropbuf, I=inputs, F=weights, O=out, alpha=1.0, repeat=1) def bprop_conv(self, out, weights, deltas, ofmshape, ofmsize, ofmlocs, ifmshape, links, padding, stride, nifm, ngroups, bpropbuf, local=False): """ Backward propagate the error through a convolutional network layer. Arguments: out (GPUTensor): Where to store the backward propagated errors. weights (GPUTensor): The weight coefficient values for this layer. deltas (GPUTensor): The error values for this layer ofmshape (tuple): Dimensions of each output feature map (typically height and width). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. ifmshape (tuple): Dimensions of each input feature map (typically height and width). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the convolution operation. stride (int): Number of neurons to shift the filter at each step. ngroups (int): Number of groups. bpropbuf (GPUTensor): Temporary storage buffer used to hold the backpropagated error for a single receptive field local (bool, optional): Whether to do local filtering (True) or convolution (False, the default) """ self.ng.bprop_conv(layer=bpropbuf, F=weights, E=deltas, grad_I=out, alpha=1.0, repeat=1) def update_conv(self, out, inputs, weights, deltas, ofmshape, ofmsize, ofmlocs, ifmshape, links, nifm, padding, stride, ngroups, fwidth, updatebuf, local=False, layer=None): """ Compute the updated gradient for a convolutional network layer. Arguments: out (GPUTensor): Where to store the updated gradient value. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. weights (GPUTensor): The weight coefficient values for this layer. deltas (GPUTensor): The error values for this layer ofmshape (tuple): Dimensions of each output feature map (typically height and width). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. ifmshape (tuple): Dimensions of each input feature map (typically height and width). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the convolution operation. stride (int): Number of neurons to shift the filter at each step. ngroups (int): Number of groups. fwidth (int): Filter width. updatebuf (GPUTensor): Temporary storage buffer used to hold the updated gradient for a single receptive field local (bool, optional): Whether to do local filtering (True) or convolution (False, the default) layer (Layer): The layer object. 
""" self.ng.update_conv(layer=updatebuf, I=inputs, E=deltas, grad_F=out, alpha=1.0, repeat=1) def fprop_pool(self, out, inputs, op, ofmshape, ofmsize, ofmlocs, fshape, ifmshape, links, nifm, padding, stride, fpropbuf): """ Forward propagate the inputs of a Pooling network layer to produce output pre-activations (ready for transformation by an activation function). Arguments: out (GPUTensor): Where to store the forward propagated results. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically number of height and width neurons). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). ifmshape (tuple): Dimensions of each input feature map (typically number of height and width neurons). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. fpropbuf (GPUTensor): Temporary storage buffer used to hold the pooled outputs for a single receptive field. """ op = op.lower() if op == "max": self.ng.fprop_pool(layer=fpropbuf, I=inputs, O=out, repeat=1) else: raise AttributeError("unexpected pooling op type: %s", op) def bprop_pool(self, out, fouts, inputs, deltas, op, ofmshape, ofmsize, ofmlocs, fshape, fpsize, ifmshape, links, nifm, padding, stride, bpropbuf): """ Backward propagate the error through a pooling network layer. Arguments: out (GPUTensor): Where to store the backward propagated errors. fouts (GPUTensor): Forward propagated outputs from the previous layer. inputs (GPUTensor): Will be either the dataset input values (first layer), or the outputs from the previous layer. deltas (GPUTensor): The error values for this layer op (string): The type of pooling operation to apply. We support "max", "avg", "l2" currently. ofmshape (tuple): Dimensions of each output feature map (typically height and width). ofmsize (int): Total size of each output feature map. ofmlocs (GPUTensor): Indices giving the location of each element in each output feature map stored in out. fshape (tuple): Dimensions of each filter (typically height and width). fpsize (int): The size of each filter. ifmshape (tuple): Dimensions of each input feature map (typically height and width). links (GPUTensor): Input receptive field indices. nifm (int): Total number of input feature maps. padding (int): Number of additional elements to include along each dimension of each local receptive field during the pooling operation. stride (int): Number of neurons to shift the filter at each step. 
bpropbuf (GPUTensor): Temporary storage buffer used to hold the backpropagated error for a single receptive field """ op = op.lower() if op == "max": self.ng.bprop_pool(layer=bpropbuf, I=inputs, E=deltas, grad_I=out, repeat=1) else: raise AttributeError("unexpected pooling op type: %s", op) def logistic(self, x, out): """ Logistic sigmoid nonlinearity, 1/(1+exp(-x)) Arguments: x (GPUTensor): Input tensor out (GPUTensor): Output tensor """ self.ng.sig(x, out=out) return out def rectlin(self, x, out): """ Rectified Linear nonlinearity Arguments: x (GPUTensor): Input tensor out (GPUTensor): Output tensor """ self.ng.maximum(x, 0., out=out) return out def rectleaky(self, x, slope, out): out[:] = self.ng.maximum(x, x*slope) def rectleaky_derivative(self, x, slope, out): out[:] = self.ng.greater(x, 0) * (1.0 - slope) + slope def sum(self, tsr, axes, out): """ Sum Arguments: tsr (GPUTensor): Input tensor axes (int): Axis along which the reduction is performed. If axes is None, the tensor is flattened and reduced over both dimensions. out (GPUTensor): Output tensor """ if axes is None: sze = tsr.shape[0]*tsr.shape[1] self.ng.sum(tsr.reshape(sze, 1), axis=0, out=out) else: self.ng.sum(tsr, axis=axes, out=out) return out def mean(self, tsr, axes, out): """ Calculates the arithmetic mean of the elements along the specified axes. Arguments: tsr (GPUTensor): Input tensor axes (int): Axis along which the reduction is performed. If axes is None, the tensor is flattened and reduced over both dimensions. out (GPUTensor): Output tensor """ if axes is None: sze = tsr.shape[0]*tsr.shape[1] self.ng.mean(tsr.reshape(sze, 1), axis=0, out=out) else: self.ng.mean(tsr, axis=axes, out=out) return out def min(self, tsr, axes, out): """ Calculates the minimum of the elements along the specified axes. Arguments: tsr (GPUTensor): Input tensor axes (int): Axis along which the reduction is performed. If axes is None, the tensor is flattened and reduced over both dimensions. out (GPUTensor): Output tensor """ if axes is None: sze = tsr.shape[0]*tsr.shape[1] self.ng.min(tsr.reshape(sze, 1), axis=0, out=out) else: self.ng.min(tsr, axis=axes, out=out) return out def max(self, tsr, axes, out): """ Calculates the maximum of the elements along the specified axes. Arguments: tsr (GPUTensor): Input tensor axes (int): Axis along which the reduction is performed. If axes is None, the tensor is flattened and reduced over both dimensions. out (GPUTensor): Output tensor """ if axes is None: sze = tsr.shape[0]*tsr.shape[1] self.ng.max(tsr.reshape(sze, 1), axis=0, out=out) else: self.ng.max(tsr, axis=axes, out=out) return out def variance(self, tsr, axes, out, mean=None): """ Calculates the variance of the elements along the specified axes. Arguments: tsr (GPUTensor): the tensor on which to compute the variance axes (int, list, optional): the dimension(s) along which to variance. If set to None, we will variance over all dimensions. out (GPUTensor): where the result will be stored. 
mean (GPUTensor): the tensor containing mean of tsr Returns: GPUTensor: reference to out """ if mean is None: logger.error("GPUTensor requires mean to be specified.") raise ValueError("mean not specified") self.ng.mean(self.ng.square(tsr-mean), axis=axes, out=out) return out def fabs(self, x, out): """ Calculates absolute value of the elements in a tensor Arguments: x (GPUTensor): Input tensor out (GPUTensor): Output tensor Returns: GPUTensor: reference to out """ self.ng.fabs(x, out=out) return out def sqrt(self, x, out): """ Calculates square root of the elements in a tensor Arguments: x (GPUTensor): Input tensor out (GPUTensor): Output tensor Returns: GPUTensor: reference to out """ self.ng.sqrt(x, out=out) return out def zeros(self, shape, dtype=default_dtype, persist_values=True): """ Allocate a new GPUTensor and fill it with zeros. Arguments: shape (tupel): Shape of the desired GPUTensor dtype (dtype): Optional datatype persist_values (bool, optional): If set to True (the default), the values assigned to this Tensor will persist across multiple begin and end calls. Setting to False may provide a performance increase if values do not need to be maintained across such calls Returns: GPUTensor: output """ return self.ng.zeros(shape, dtype=dtype) def ones(self, shape, dtype=default_dtype, persist_values=True): """ Allocate a new GPUTensor and fill it with ones. Arguments: shape (tupel): Shape of the desired GPUTensor dtype (dtype): Optional datatype persist_values (bool, optional): If set to True (the default), the values assigned to this Tensor will persist across multiple begin and end calls. Setting to False may provide a performance increase if values do not need to be maintained across such calls Returns: GPUTensor: output """ return self.ng.ones(shape, dtype=dtype) def empty(self, shape, dtype=default_dtype, persist_values=True): """ Allocate a new GPUTensor. Arguments: shape (tupel): Shape of the desired GPUTensor dtype (dtype): Optional datatype persist_values (bool, optional): If set to True (the default), the values assigned to this Tensor will persist across multiple begin and end calls. Setting to False may provide a performance increase if values do not need to be maintained across such calls Returns: GPUTensor: output """ return self.ng.empty(shape, dtype=dtype) def array(self, ary, dtype=default_dtype, persist_values=True, name=None, allocator=drv.mem_alloc): """ Allocate a new GPUTensor and fill it with supplied numpy array. Arguments: ary (ndarray): Numpy array with source data dtype (dtype, optional): Optional datatype persist_values (bool, optional): If set to True (the default), the values assigned to this Tensor will persist across multiple begin and end calls. Setting to False may provide a performance increase if values do not need to be maintained across such calls name (string): Name for the GPUTensor allocator (pycuda): Pycuda memory allocator Returns: GPUTensor: output """ return self.ng.array(ary, dtype=dtype, name=name) def add(self, left, right, out): """ Elementwise addition Arguments: left (GPUTensor, numeric): left-hand side operand. right (GPUTensor, numeric): right-hand side operand. out (GPUTensor): where the result will be stored. Returns: GPUTensor: reference to out """ self.ng.add(left, right, out=out) return out def subtract(self, left, right, out): """ Elementwise subtraction Arguments: left (GPUTensor, numeric): left-hand side operand. right (GPUTensor, numeric): right-hand side operand. out (GPUTensor): where the result will be stored. 
    def multiply(self, left, right, out):
        """
        Elementwise multiplication

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.multiply(left, right, out=out)
        return out

    def divide(self, left, right, out):
        """
        Elementwise division

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.divide(left, right, out=out)
        return out

    def greater(self, left, right, out):
        """
        Elementwise greater than testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.greater(left, right, out=out)
        return out

    def equal(self, left, right, out):
        """
        Performs element-wise equality testing on each element of left and
        right, storing the result in out. Each operand is assumed to be the
        same shape (or broadcastable as such).

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.equal(left, right, out=out)
        return out

    def not_equal(self, left, right, out):
        """
        Elementwise not equal testing

        Arguments:
            left (GPUTensor, numeric): left-hand side operand.
            right (GPUTensor, numeric): right-hand side operand.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.not_equal(left, right, out=out)
        return out

    def clip(self, a, a_min, a_max, out):
        """
        Elementwise clipping to a range of specified values

        Arguments:
            a (GPUTensor): input tensor.
            a_min (float): floor value.
            a_max (float): ceiling value.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.clip(a, a_min, a_max, out=out)
        return out

    def log(self, a, out):
        """
        Elementwise base-e logarithm

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.log(a, out=out)
        return out

    def tanh(self, a, out):
        """
        Elementwise tanh

        Arguments:
            a (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.tanh(a, out=out)
        return out

    def argmax(self, a, out, axis=0):
        """
        Calculates the indices of the maximal element value along the
        specified axis. If multiple elements contain the maximum, only the
        index of the first is returned.

        Arguments:
            a (GPUTensor): The GPUTensor on which to find the maximum indices
            out (GPUTensor): Where to store the result. Should be of the
                appropriate type and expected shape
            axis (int): The dimension along which to find the maximum. If
                set to None, find the overall maximum index of a flattened
                representation of a.

        Returns:
            GPUTensor: reference to out
        """
        self.ng.argmax(a, out=out, axis=axis)
        return out

    def softmax(self, x, out):
        """
        Softmax nonlinearity. Computes exp(x-max(x)) / sum_i exp(x_i-max(x_i))

        Arguments:
            x (GPUTensor): input tensor.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        out[:] = (self.ng.reciprocal(self.ng.sum(
                  self.ng.exp(x - self.ng.max(x, axis=0)), axis=0)) *
                  self.ng.exp(x - self.ng.max(x, axis=0)))
        return out
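    # The max subtraction inside softmax above is the standard numerical
    # stability trick: exp(x - max(x)) cannot overflow, and the shared
    # constant cancels in the ratio. Illustrative NumPy-only sketch with
    # features along axis 0, as in the kernel:
    #
    #   import numpy as np
    #   x = np.array([[1.0], [2.0], [3.0]])
    #   e = np.exp(x - x.max(axis=0))
    #   sm = e / e.sum(axis=0)
    #   assert np.isclose(sm.sum(), 1.0)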
    def softmax_gradient(self, y, err, out):
        """
        Gradient of the softmax nonlinearity.

        Arguments:
            y (GPUTensor): input tensor.
            err (GPUTensor): backpropagated error.
            out (GPUTensor): where the result will be stored.

        Returns:
            GPUTensor: reference to out
        """
        raise NotImplementedError("Softmax gradient should use shortcut")
        return out

    def make_binary_mask(self, tsr, keepthresh=0.5, dtype=default_dtype):
        """
        Create a binary mask for dropout layers.

        Arguments:
            tsr (GPUTensor): Output tensor
            keepthresh (float): fraction of ones
        """
        self.ng.dropout(keep=keepthresh, out=tsr)

    def gdm_compound(self, ps_item, us_item, vs_item, momentum_coef,
                     learning_rate, epoch):
        """
        Perform gradient descent update with momentum.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs are written to vs_item (updated velocity) and ps_item
        (updated weights)
        """
        vs_item[:] = vs_item * momentum_coef - us_item * learning_rate
        ps_item[:] = ps_item + vs_item

    def gdmwd_compound(self, ps_item, us_item, vs_item, momentum_coef,
                       learning_rate, wd, epoch):
        """
        Perform gradient descent update with momentum and weight decay.

        Arguments:
            ps_item (GPUTensor): parameter tensor (e.g. a weight matrix)
            us_item (GPUTensor): update tensor, contains gradient wrt. weights
            vs_item (GPUTensor): velocity tensor.
            momentum_coef (float): momentum coefficient.
            learning_rate (float): learning rate.
            wd (float): weight decay parameter.
            epoch (int): epoch (used in conjunction with diagnostics).

        Outputs:
            ps_item, the updated weights.
            vs_item, the updated velocity.
            us_item, used as a temp buffer.
        """
        vs_item[:] = vs_item * momentum_coef - us_item * \
            learning_rate - learning_rate * wd * ps_item
        ps_item[:] = ps_item + vs_item

    def exp_mavg(self, mavg, newval, rho):
        """
        Calculate the exponential moving average

        Arguments:
            mavg: The running value of the moving average
            newval: New sample to be added to the moving average
            rho: Interpolation value
        """
        mavg[:] = rho * mavg + (1.0 - rho) * newval

    def ada_update(self, ps_item, us_item, gs_item, ds_item, ls_item,
                   ss_item, rho, epsilon):
        """
        Update rule for AdaDelta (Zeiler, http://arxiv.org/abs/1212.5701)

        Arguments:
            ps_item: weight / parameter (will be updated)
            us_item: update
            gs_item: expected value of Gradient Squared (will be updated)
            ds_item: expected value of Delta Squared (will be updated)
            ls_item: learning rate (will be updated)
            ss_item: Scratch Space
            rho: decay constant (determines window size)
            epsilon: small positive constant for numerical stability
        """
        # Accumulate E[Grad^2]
        gs_item[:] = gs_item * rho + (1.0 - rho) * us_item * us_item

        # Calculate Updates
        ls_item[:] = self.ng.sqrt((ds_item + epsilon) /
                                  (gs_item + epsilon)) * (-1.0) * us_item

        # Accumulate E[Delt^2]
        ds_item[:] = ds_item * rho + (1.0 - rho) * ls_item * ls_item

        # Final update to the params
        ps_item[:] = ps_item + ls_item

    def rms_update(self, params, updates, run_squares, velocity,
                   scratch_space, gamma, epsilon, learning_rate,
                   momentum_coef):
        """
        RMSProp-style update: scale the gradient by
        learning_rate / rms(gradient), optionally with momentum.

        Arguments:
            params: parameter tensor (will be updated)
            updates: gradient wrt. the parameters
            run_squares: running average of squared gradients (will be
                updated)
            velocity: velocity tensor, used when momentum_coef > 0 (will be
                updated)
            scratch_space: temporary storage
            gamma: decay constant for the running average
            epsilon: small positive constant for numerical stability
            learning_rate: learning rate
            momentum_coef: momentum coefficient
        """
        # Update running squares
        run_squares[:] = gamma * run_squares + (1. - gamma) * updates * updates

        # Now scale the gradient by lr / rms(grad) (with an epsilon term for
        # stability) and use it to update the params
        if momentum_coef == 0:
            params[:] = params - learning_rate * updates * self.ng.reciprocal(
                self.ng.sqrt(run_squares) + epsilon)
        else:
            velocity[:] = velocity * momentum_coef - \
                learning_rate * updates * \
                self.ng.reciprocal(self.ng.sqrt(run_squares) + epsilon)
            params[:] = params + velocity
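    # Written out, ada_update above is Zeiler's AdaDelta recurrence:
    #
    #   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
    #   dx_t      = -sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
    #   x_t       = x_{t-1} + dx_t
    #
    # Illustrative NumPy-only sketch of one step (g2, d2, grad, w are
    # stand-in names, not backend state):
    #
    #   import numpy as np
    #   g2 = rho * g2 + (1 - rho) * grad * grad
    #   dx = -np.sqrt((d2 + eps) / (g2 + eps)) * grad
    #   d2 = rho * d2 + (1 - rho) * dx * dx
    #   w += dx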
    def fprop_bn_compound(self, inputs, beta, gamma, eps, xhat,
                          xmean, xvar, gmean, gvar, rho, out):
        """
        Batch normalization forward pass, compounded to run in 3 kernel
        calls.

        Arguments:
            inputs: input data to be normalized
            beta: location parameter
            gamma: scale parameter
            eps: small constant for numerical stability
            xhat: normalized input (updated)
            xmean: batch mean (updated)
            xvar: batch variance (updated)
            gmean: running global mean (updated)
            gvar: running global variance (updated)
            rho: decay constant for the running statistics
            out: normalized and rescaled input (updated)

        Returns:
            GPUTensor: reference to out
        """
        xvar[:] = self.ng.var(inputs, axis=1)
        xmean[:] = self.ng.mean(inputs, axis=1)
        gmean[:] = gmean * rho + (1.0 - rho) * xmean
        gvar[:] = gvar * rho + (1.0 - rho) * xvar
        xvar[:] = self.ng.reciprocal(self.ng.sqrt(xvar + eps))
        xhat[:] = xvar * (inputs - xmean)
        out[:] = xhat * gamma + beta
        return out

    def bprop_bn_compound(self, xhat, error, xvar, gamma,
                          beta_updates, gamma_updates):
        """
        Batch normalization backward pass, compounded to run with 4 kernel
        calls.

        Arguments:
            xhat: normalized input data (updated)
            error: backpropagated deltas (updated)
            xvar: precomputed variance, stored as its reciprocal square root
                by the forward pass
            gamma: scale parameter
            beta_updates: gradient update for beta (updated)
            gamma_updates: gradient update for gamma (updated)
        """
        gamma_updates[:] = self.ng.sum(xhat * error, axis=1)
        beta_updates[:] = self.ng.sum(error, axis=1)
        xhat[:] = (xhat * gamma_updates + beta_updates) / float(xhat.shape[1])
        error[:] = xvar * gamma * (error - xhat)
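# Hedged NumPy sketch of the math in fprop_bn_compound above: features along
# axis 0, batch along axis 1, matching the axis=1 reductions in the compound
# kernel. The helper name is illustrative, not part of the backend.
import numpy as np

def bn_fprop_reference(inputs, beta, gamma, eps):
    # per-feature batch statistics
    xmean = inputs.mean(axis=1, keepdims=True)
    xvar = inputs.var(axis=1, keepdims=True)
    # normalize, then shift and scale
    xhat = (inputs - xmean) / np.sqrt(xvar + eps)
    return xhat * gamma + beta

x = np.random.uniform(-1.0, 1.0, (4, 8)).astype(np.float32)
out = bn_fprop_reference(x, beta=0.0, gamma=1.0, eps=1e-6)
# with gamma=1, beta=0 each feature row is normalized to ~zero mean
assert np.allclose(out.mean(axis=1), 0.0, atol=1e-5)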
          1)).astype(np.float16).astype(np.float32)

# zero pad the last row of cpu input for the sake of numpy
if pool.op == "max":
    cpuI[-1, :] = np.finfo(cpuI.dtype).min
else:
    cpuI[-1, :] = 0

# cpu output arrays
cpuO = np.empty(dimO, dtype=np.float32)
cpuB = np.zeros(slicable(dimI, 1), dtype=np.float32)

# give gpu the input array without zero padding (not needed)
devI = ng.array(cpuI[:-1, :].reshape(dimI), dtype=dtype)
devO = ng.zeros(dimO, dtype=dtype)
devB = ng.empty(dimI, dtype=dtype)

ng.fprop_pool(pool, devI, devO, repeat=repeat)
ng.bprop_pool(pool, devI, devO, devB, repeat=repeat)

def pixel_indices(kj, mt, pr, qs):
    C = pool.C
    J, T, R, S = pool.JTRS
    D, H, W = pool.DHW
    HW = H * W
    DHW = D * H * W
    imax = C * D * H * W
    idx = []
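# Note on the padding trick above: the extra input row exists only so the
# NumPy reference can gather out-of-bounds taps safely. For max pooling it
# is filled with the dtype minimum so a padded tap can never win the max;
# for avg pooling, 0 is the correct identity. A minimal illustrative check:
#
#   import numpy as np
#   window = np.array([0.3, -0.7, np.finfo(np.float32).min],
#                     dtype=np.float32)
#   assert window.max() == np.float32(0.3)   # padding never selected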
        inception = True

    # find the size of the largest buffers so they can be shared
    if layer.sizeF > max_weights:
        max_weights = layer.sizeF
        max_weight_layer = layer
    if layer.sizeI > max_deltas and type(prev_layer) is not DataLayer:
        max_deltas = layer.sizeI
        max_delta_layer = layer
    prev_layer = layer
    layers.append(layer)

# Init shared buffers (assumes consistent dtype for now)
shared_deltas.append(ng.empty(max_delta_layer.dimI,
                              dtype=max_delta_layer.dtype))
shared_deltas.append(ng.empty(max_delta_layer.dimI,
                              dtype=max_delta_layer.dtype))
if inception:
    shared_deltas.append(ng.empty(max_delta_layer.dimI,
                                  dtype=max_delta_layer.dtype))
    shared_deltas.append(ng.empty(max_delta_layer.dimI,
                                  dtype=max_delta_layer.dtype))
shared_updates = ng.empty(max_weight_layer.dimF, dtype=np.float32)

for i, layer in enumerate(layers):
    print(layer)

    # Initialize buffers. Alternate shared delta buffers:
    # one layer can't have the same buffer for both error in and error out.
    layer.init_activations()
    layer.init_weights(shared=shared_updates, zeros=zeros)
    if i > 1:
for dtype in (np.float16, np.float32,):

    for K, C, N in ((32, 4096, 1512),):

        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):

            for op, dimA, dimB, dimC in (
                    ("nn", (K, C), (C, N), (K, N)),    # fprop
                    ("tn", (K, C), (K, N), (C, N)),    # bprop
                    ("nt", (K, N), (C, N), (K, C)),):  # update

                devA1 = ng.empty(dimA, dtype=dtype)
                devB1 = ng.empty(dimB, dtype=dtype)
                devC1 = ng.empty(dimC, dtype=dtype)

                # fill with uniform randoms from -1 to 1
                devA1[:] = 2 * (.5 - ng.rand())
                devB1[:] = 2 * (.5 - ng.rand())
                devC1[:] = 2 * (.5 - ng.rand())

                # just alias if same dtype
                if dtype is np.float32:
                    devA2 = devA1
                    devB2 = devB1
                # otherwise copy
                else:
                    devA2 = ng.empty(dimA, dtype=np.float32)
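# Hedged NumPy reference for the alpha/beta GEMM exercised by the loop
# above, i.e. the cuBLAS-style update C := alpha * A.dot(B) + beta * C.
# The helper name is illustrative, not part of nervanagpu.
import numpy as np

def gemm_reference(A, B, C, alpha=1.0, beta=0.0):
    # alpha scales the fresh product, beta blends in the existing C
    return alpha * A.dot(B) + beta * C

A = np.random.uniform(-1.0, 1.0, (32, 64)).astype(np.float32)
B = np.random.uniform(-1.0, 1.0, (64, 16)).astype(np.float32)
C = np.ones((32, 16), dtype=np.float32)
full = gemm_reference(A, B, C, alpha=1.0, beta=0.0)   # plain product
mixed = gemm_reference(A, B, C, alpha=0.5, beta=0.5)  # the (0.5, 0.5) case
assert np.allclose(mixed, 0.5 * full + 0.5 * C, atol=1e-5)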
# bprop(nn):  NK   x KC = NC
# updat(tn):  NK^T x NC = KC

repeat = 2000

for K, C, N in ((3072, 3072, 32),):

    total = 0
    for op, dimA, dimB, dimC in (
            ("nn", (K, C), (C, N), (K, N)),    # fprop
            ("tn", (K, C), (K, N), (C, N)),    # bprop
            ("nt", (K, N), (C, N), (K, C)),):  # update

        devA = ng.empty(dimA, dtype=np.float32)
        devB = ng.empty(dimB, dtype=np.float32)
        devC = ng.empty(dimC, dtype=np.float32)

        # fill with uniform randoms from -1 to 1
        devA[:] = 2 * (.5 - ng.rand())
        devB[:] = 2 * (.5 - ng.rand())

        total += cublas_dot(op, devA, devB, devC, repeat=repeat, warmup=True)
    print("N2 Total: ", total)

    total = 0
    for op, dimA, dimB, dimC in (
            ("nt", (N, C), (K, C), (N, K)),    # fprop
            ("nn", (N, K), (K, C), (N, C)),    # bprop