def lmul(self, x):
    """
    dot(x, A)
    aka, do convolution with input image x

    The input is rearranged to the ('c', 0, 1, 't', 'b') layout, its last
    two axes are merged so that the 4D FilterActs op can be applied, and
    the result is reshaped, reduced with ``diagonal_subtensor(...).sum``
    and rearranged to ``self.output_axes``.

    Parameters
    ----------
    x : theano variable with ``ndim == 5``
        Input laid out according to ``self.input_axes``. When it is not
        a Cuda variable it is transferred to the GPU first and the
        convolution result is transferred back to the host.

    Returns
    -------
    rval : theano variable
        Result laid out according to ``self.output_axes``.
    """
    check_cuda(str(type(self)) + ".lmul")

    # TODO Why is it CPU??
    # (the stray debugging prints that used to live here were removed:
    # they wrote to stdout every time the graph was built)
    cpu = "Cuda" not in str(type(x))
    if cpu:
        x = gpu_from_host(x)

    assert x.ndim == 5
    x_axes = self.input_axes
    assert len(x_axes) == 5

    op_axes = ("c", 0, 1, "t", "b")
    if tuple(x_axes) != op_axes:
        x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

    # FilterActs is 4D: merge the last two axes of the signal into one.
    _x_4d_shape = (
        self.signal_shape[0],
        self.signal_shape[1],
        self.signal_shape[2],
        self.signal_shape[3] * self.signal_shape[4],
    )
    x = x.reshape(_x_4d_shape)
    x = gpu_contiguous(x)

    rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(x, self._filters)

    if cpu:
        rval = host_from_gpu(rval)

    # Split the merged axes apart again (filters and signal) before the
    # diagonal reduction.
    rval = rval.reshape(
        (
            self.filter_shape[3],
            self.filter_shape[4],
            rval.shape[1],
            rval.shape[2],
            self.signal_shape[3],
            self.signal_shape[4],
        )
    )
    rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

    # Format the output based on the output space
    rval_axes = self.output_axes
    assert len(rval_axes) == 5
    if tuple(rval_axes) != op_axes:
        rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

    return rval
def make_funcs(batch_size, rows, cols, channels, filter_rows, num_filters):
    """
    Build two argument-free theano functions that compute the same
    convolution: one through FilterActs and one through conv2d.

    Each function stores its result in a shared variable through
    ``updates`` and returns nothing. Filters are square
    (``filter_cols = filter_rows``).

    Returns
    -------
    cuda_convnet, baseline : theano functions
        Named 'cuda_convnet' and 'baseline' respectively.
    """

    def _as_update_function(expression, name):
        # Evaluate once to create the destination buffer, then compile a
        # function whose only effect is to refresh that buffer.
        destination = shared(expression.eval())
        compiled = function([], updates={destination: expression})
        compiled.name = name
        return compiled

    rng = np.random.RandomState([2012, 10, 9])
    filter_cols = filter_rows

    # Random data drawn in the layout expected by FilterActs.
    image_dims = (channels, rows, cols, batch_size)
    filter_dims = (channels, filter_rows, filter_cols, num_filters)
    base_image_value = rng.uniform(-1.0, 1.0, image_dims).astype("float32")
    base_filters_value = rng.uniform(-1.0, 1.0, filter_dims).astype("float32")

    # bench.py should always be run in gpu mode so we should not need a gpu_from_host here
    cuda_convnet = _as_update_function(
        FilterActs()(shared(base_image_value), shared(base_filters_value, name="filters")),
        "cuda_convnet",
    )

    # Same data with the last axis moved to the front, and the filters
    # flipped along both spatial axes, for conv2d.
    images_bc01v = base_image_value.transpose(3, 0, 1, 2)
    filters_bc01v = base_filters_value.transpose(3, 0, 1, 2)[:, :, ::-1, ::-1]
    baseline = _as_update_function(
        conv2d(
            shared(images_bc01v),
            shared(filters_bc01v),
            border_mode="valid",
            image_shape=images_bc01v.shape,
            filter_shape=filters_bc01v.shape,
        ),
        "baseline",
    )

    return cuda_convnet, baseline
def test_match_valid_conv(): # Tests that running FilterActs with no padding is the same as running # theano's conv2D in valid mode rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 10 cols = 9 channels = 3 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform( -1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform( -1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs()(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid') output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) f = function([], [output, output_conv2d]) output, output_conv2d = f() warnings.warn( """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ', output.shape print 'theano shape: ', output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def test_grad(): rng = np.random.RandomState([2012, 10, 9]) batch_size = 5 rows = 10 cols = 9 channels = 3 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform(-1.0, 1.0, (channels, rows, cols, batch_size)).astype("float32"), name="images") filters = shared( rng.uniform(-1.0, 1.0, (channels, filter_rows, filter_cols, num_filters)).astype("float32"), name="filters" ) gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs()(gpu_images, gpu_filters) output = host_from_gpu(output) # XXX: use verify_grad output_grad = grad(output.sum(), images) images_bc01 = images.dimshuffle(3, 0, 1, 2) filters_bc01 = filters.dimshuffle(3, 0, 1, 2) filters_bc01 = filters_bc01[:, :, ::-1, ::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode="valid") output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0) # XXX: use verify_grad output_conv2d_grad = grad(output_conv2d.sum(), images) f = function([], [output_grad, output_conv2d_grad]) output_grad, output_conv2d_grad = f() warnings.warn( """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. Probably theano CPU 64 bit is OK but it's worth checking the others.""" ) if np.abs(output_grad - output_conv2d_grad).max() > 7.7e-6: assert type(output_grad) == type(output_conv2d_grad) assert output_grad.dtype == output_conv2d_grad.dtype if output_grad.shape != output_conv2d_grad.shape: print "cuda-convnet shape: ", output_grad.shape print "theano shape: ", output_conv2d_grad.shape assert False err = np.abs(output_grad - output_conv2d_grad) print "absolute error range: ", (err.min(), err.max()) print "mean absolute error: ", err.mean() print "cuda-convnet value range: ", (output_grad.min(), output_grad.max()) print "theano value range: ", (output_conv2d_grad.min(), output_conv2d_grad.max()) assert False
def test_match_valid_conv(): # Tests that running FilterActs with no padding is the same as running # theano's conv2D in valid mode rng = np.random.RandomState([2012,10,9]) batch_size = 5 rows = 10 cols = 9 channels = 3 filter_rows = 4 filter_cols = filter_rows num_filters = 16 images = shared(rng.uniform(-1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs()(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3,0,1,2) filters_bc01 = filters.dimshuffle(3,0,1,2) filters_bc01 = filters_bc01[:,:,::-1,::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid') output_conv2d = output_conv2d.dimshuffle(1,2,3,0) try: f = function([], [output, output_conv2d]) except: raise KnownFailureTest("cuda-convnet code depends on an unmerged theano feature.") output, output_conv2d = f() warnings.warn("test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?") if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ',output.shape print 'theano shape: ',output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def test_match_valid_conv_strided(): # Tests that running FilterActs with stride is the same as running # theano's conv2D in valid mode and then downsampling rng = np.random.RandomState([2012,10,9]) batch_size = 5 rows = 9 cols = 9 channels = 3 filter_rows = 3 filter_cols = filter_rows stride = 3 num_filters = 16 images = shared(rng.uniform(-1., 1., (channels, rows, cols, batch_size)).astype('float32'), name='images') filters = shared(rng.uniform(-1., 1., (channels, filter_rows, filter_cols, num_filters)).astype('float32'), name='filters') gpu_images = gpu_from_host(images) gpu_filters = gpu_from_host(filters) output = FilterActs(stride=stride)(gpu_images, gpu_filters) output = host_from_gpu(output) images_bc01 = images.dimshuffle(3,0,1,2) filters_bc01 = filters.dimshuffle(3,0,1,2) filters_bc01 = filters_bc01[:,:,::-1,::-1] output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid', subsample=(stride, stride)) output_conv2d_orig = output_conv2d.dimshuffle(1,2,3,0) output_conv2d = output_conv2d_orig # [:, ::stride, ::stride, :] f = function([], [output, output_conv2d, output_conv2d_orig]) output, output_conv2d, output_conv2d_orig = f() warnings.warn("""test_match_valid_conv success criterion is not very strict. Can we verify that this is OK? One possibility is that theano is numerically unstable and Alex's code is better. Probably theano CPU 64 bit is OK but it's worth checking the others.""") if np.abs(output - output_conv2d).max() > 2.4e-6: assert type(output) == type(output_conv2d) assert output.dtype == output_conv2d.dtype if output.shape != output_conv2d.shape: print 'cuda-convnet shape: ',output.shape print 'theano shape: ',output_conv2d.shape assert False err = np.abs(output - output_conv2d) print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output.min(), output.max()) print 'theano value range: ', (output_conv2d.min(), output_conv2d.max()) assert False
def lmul(self, x):
    """
    dot(x, A)
    aka, do convolution with input image x

    Parameters
    ----------
    x : theano variable with ``ndim == 5``
        Input laid out according to ``self.input_axes``. A variable
        whose type name does not contain 'Cuda' is moved to the GPU for
        the convolution and the result is moved back to the host.

    Returns
    -------
    rval : theano variable
        Convolution result, reduced with
        ``diagonal_subtensor(rval, 4, 0).sum(axis=0)`` and laid out
        according to ``self.output_axes``.
    """

    check_cuda(str(type(self)) + ".lmul")

    # TODO Why is it CPU??
    # (debugging prints removed: they wrote to stdout whenever the graph
    # was built)
    cpu = 'Cuda' not in str(type(x))

    if cpu:
        x = gpu_from_host(x)

    assert x.ndim == 5
    x_axes = self.input_axes
    assert len(x_axes) == 5

    # Layout used internally before calling FilterActs.
    op_axes = ('c', 0, 1, 't', 'b')

    if tuple(x_axes) != op_axes:
        x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

    # FilterActs takes 4D input, so the last two axes are merged.
    _x_4d_shape = (self.signal_shape[0],
                   self.signal_shape[1],
                   self.signal_shape[2],
                   self.signal_shape[3] * self.signal_shape[4])

    x = x.reshape(_x_4d_shape)

    x = gpu_contiguous(x)

    rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(
        x, self._filters)

    if cpu:
        rval = host_from_gpu(rval)

    # Undo the axis merging before the diagonal reduction.
    rval = rval.reshape((self.filter_shape[3],
                         self.filter_shape[4],
                         rval.shape[1],
                         rval.shape[2],
                         self.signal_shape[3],
                         self.signal_shape[4]))

    rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

    # Format the output based on the output space
    rval_axes = self.output_axes
    assert len(rval_axes) == 5

    if tuple(rval_axes) != op_axes:
        rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

    return rval
def lmul(self, x):
    """
    dot(x, A)
    aka, do convolution with input image x

    Parameters
    ----------
    x : 4D theano variable
        Input laid out according to ``self.input_axes``. A variable
        whose type name does not contain 'Cuda' is transferred to the
        GPU, and the result is transferred back to the host.

    Returns
    -------
    rval : 4D theano variable
        Output of FilterActs laid out according to ``self.output_axes``.
    """

    check_cuda(str(type(self)) + ".lmul")

    on_host = 'Cuda' not in str(type(x))
    if on_host:
        x = gpu_from_host(x)

    # FilterActs wants: channel, topo dim 0, topo dim 1, batch_index
    op_axes = ('c', 0, 1, 'b')

    assert x.ndim == 4
    in_axes = self.input_axes
    assert len(in_axes) == 4
    if tuple(in_axes) != op_axes:
        to_op_order = [in_axes.index(axis) for axis in op_axes]
        x = x.dimshuffle(*to_op_order)

    x = gpu_contiguous(x)

    # Objects unpickled from old files may lack kernel_stride.
    if not hasattr(self, 'kernel_stride'):
        self.kernel_stride = (1, 1)

    conv_op = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])
    rval = conv_op(x, self._filters)

    # Rearrange the result for the output space.
    out_axes = self.output_axes
    assert len(out_axes) == 4

    if on_host:
        rval = host_from_gpu(rval)

    if tuple(out_axes) != op_axes:
        from_op_order = [op_axes.index(axis) for axis in out_axes]
        rval = rval.dimshuffle(*from_op_order)

    return rval
def make_funcs(batch_size, rows, cols, channels, filter_rows, num_filters):
    """
    Build two argument-free theano functions for benchmarking: one that
    applies FilterActs and one that applies conv2d to the same random
    images and filters.

    Each function stores its result in a shared variable through
    ``updates``. Filters are square (``filter_cols = filter_rows``).

    NOTE(review): this looks like an unfinished two-layer variant; see
    the notes in the body. As written, only a single FilterActs layer
    applied to ``images`` ends up in the compiled function.

    Returns
    -------
    cuda_convnet, baseline : theano functions
        Named 'cuda_convnet' and 'baseline' respectively.
    """
    rng = np.random.RandomState([2012, 10, 9])
    filter_cols = filter_rows

    base_image_value = rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32')
    base_filters_value = rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32')
    images = shared(base_image_value)
    filters = shared(base_filters_value, name='filters')

    # bench.py should always be run in gpu mode so we should not need a gpu_from_host here
    layer_1_detector = FilterActs()(images, filters)

    # NOTE(review): layer_1_pooled_fake is never used below. Its slice
    # bounds for axes 1 and 2 are taken from shape[0] and shape[1];
    # presumably shape[1] and shape[2] were intended -- confirm.
    layer_1_pooled_fake = layer_1_detector[:, 0:layer_1_detector.shape[0]:2,
                                           0:layer_1_detector.shape[1]:2, :]

    # NOTE(review): base_filters2_value is drawn but never used; filters2
    # below is built from base_filters_value instead.
    base_filters2_value = rng.uniform(
        -1., 1.,
        (num_filters, filter_rows, filter_cols, num_filters)).astype('float32')
    filters2 = shared(base_filters_value, name='filters')

    # NOTE(review): the second layer is applied to `images`, not to the
    # output of the first layer.
    layer_2_detector = FilterActs()(images, filters2)

    output = layer_2_detector

    # Evaluate once to create the destination buffer, then compile a
    # function whose only effect is to refresh that buffer.
    output_shared = shared(output.eval())

    cuda_convnet = function([], updates={output_shared: output})
    cuda_convnet.name = 'cuda_convnet'

    # Same data with the last axis moved to the front, and the filters
    # flipped along both spatial axes, for conv2d.
    images_bc01 = base_image_value.transpose(3, 0, 1, 2)
    filters_bc01 = base_filters_value.transpose(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    images_bc01 = shared(images_bc01)
    filters_bc01 = shared(filters_bc01)

    output_conv2d = conv2d(images_bc01, filters_bc01,
                           border_mode='valid')

    output_conv2d_shared = shared(output_conv2d.eval())

    baseline = function([], updates={output_conv2d_shared: output_conv2d})
    baseline.name = 'baseline'

    return cuda_convnet, baseline
def grad(self, inputs, g_outputs):
    """
    Build the symbolic gradients for the three inputs of this op.

    Parameters
    ----------
    inputs : sequence of three variables
        Unpacked as ``(hid_acts, filters, output_shape)``.
    g_outputs : sequence of one variable
        Gradient with respect to the single output of this op.

    Returns
    -------
    list
        ``[g_hid_acts, g_filters, DisconnectedType()()]``: the result of
        FilterActs applied to the output gradient and the filters, the
        first output of WeightActs applied to the output gradient and
        ``hid_acts``, and a disconnected gradient for ``output_shape``.
    """
    hid_acts, filters, output_shape = inputs
    g_images, = g_outputs
    g_images = as_cuda_ndarray_variable(g_images)
    assert not isinstance(g_images, list)

    # FilterActs and WeightActs are module-level names that are bound
    # lazily on first use; presumably this avoids a circular import
    # between the cuda_convnet modules -- TODO confirm.
    global FilterActs
    global WeightActs
    if FilterActs is None:
        from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
        from pylearn2.sandbox.cuda_convnet.weight_acts import WeightActs

    # WeightActs also receives the two middle dimensions of the filters;
    # only its first output is used.
    g_filters = WeightActs(stride=self.stride,
                           partial_sum=self.partial_sum, pad=self.pad)(
        g_images, hid_acts, filters.shape[1:3])[0]
    assert not isinstance(g_filters, list)
    g_hid_acts = FilterActs(stride=self.stride, pad=self.pad,
                            partial_sum=self.partial_sum)(g_images, filters)

    return [g_hid_acts, g_filters, DisconnectedType()()]
def apply(self, v, **kwargs):
    """
    Convolve ``v.output`` with ``self.filters`` using cuda-convnet's
    FilterActs, add ``self.filters_bias`` and hand a copy of ``v``
    carrying the new output to ``self.post_apply``.

    Returns whatever ``self.post_apply(nv, **kwargs)`` returns.
    """
    # See http://benanne.github.io/2014/04/03/faster-convolutions-in-theano.html
    # for further info about what follows.
    # See cuda-convnet for info about partial_sum
    from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs

    layer_input = v.output

    conv_op = FilterActs(stride=self.kernel_stride, pad=self.padding,
                         partial_sum=self.partial_sum)

    # Move the leading axis to the end and make both operands contiguous.
    images_for_op = gpu_contiguous(layer_input.dimshuffle(1, 2, 3, 0))
    filters_for_op = gpu_contiguous(self.filters.dimshuffle(1, 2, 3, 0))

    # out_shuffled is in channels, height, width, mb order
    out_shuffled = conv_op(images_for_op, filters_for_op)
    out_shuffled += self.filters_bias.dimshuffle(0, 'x', 'x', 'x')

    # Put the minibatch axis back in front.
    result = out_shuffled.dimshuffle(3, 0, 1, 2)

    nv = vcopy(v)
    nv.update(output=result)
    return self.post_apply(nv, **kwargs)
def test_dimshuffle_false_get_output_for(self, DummyInputLayer):
    """
    With ``dimshuffle=False`` the layer output must equal a direct call
    to FilterActs on the same input and kernel.
    """
    try:
        from lasagne.layers.cuda_convnet import Conv2DCCLayer
    except ImportError:
        pytest.skip("cuda_convnet not available")

    # this implementation is tested against FilterActs instead of
    # theano.tensor.nnet.conv.conv2d because using the latter leads to
    # numerical precision errors.
    from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
    reference_op = FilterActs(stride=1, pad=0, partial_sum=1)

    data_shape = (4, 5, 5, 8)  # c01b instead of bc01
    kernel_shape = (4, 3, 3, 16)
    data = theano.shared(floatX(np.random.random(data_shape)))
    kernel = theano.shared(floatX(np.random.random(kernel_shape)))

    layer = Conv2DCCLayer(
        DummyInputLayer(data_shape),
        num_filters=16,
        filter_size=(3, 3),
        dimshuffle=False,
        W=kernel,
        b=None,
        nonlinearity=None,
    )

    expected = np.array(reference_op(data, kernel).eval())
    actual = np.array(layer.get_output_for(data).eval())

    assert actual.shape == expected.shape
    assert actual.shape == layer.output_shape
    assert np.allclose(actual, expected)
def __init__(self, rngs, input_layer, Lshape, traits, activation):
    """
    Set up a convolution layer computed with cuda-convnet's FilterActs.

    Reads 'l2decay', 'padding' and 'initW' from ``traits``, takes the
    filter shape from ``Lshape[1]``, draws the weights with
    ``NNl.gen_weights`` and stores the convolution of the input layer's
    output in ``self.conv_out``.
    """
    super(ConvLayer, self).__init__(input_layer, traits, "Conv")

    self.rng = rngs[0]
    self.l2decay = traits['l2decay']

    input_shape, filter_shape = Lshape[0], Lshape[1]
    # The number of input channels must match number of filter channels
    assert input_shape[1] == filter_shape[1]

    self.pad = traits['padding']
    self.W = NNl.gen_weights(self.rng, filter_shape, 0, traits['initW'])

    # convolve input feature maps with filters
    # Using Alex K.'s fast CUDA conv, courtesy of S. Dieleman
    self.x = self.input_layer.output(False)
    conv_op = FilterActs(pad=self.pad, partial_sum=1)

    # bc01 to c01b for both operands, then make them contiguous
    images_c01b = gpu_contiguous(self.x.dimshuffle(1, 2, 3, 0))
    filters_c01b = gpu_contiguous(self.W.dimshuffle(1, 2, 3, 0))

    # c01b back to bc01
    self.conv_out = conv_op(images_c01b, filters_c01b).dimshuffle(3, 0, 1, 2)

    # store parameters of this layer
    self.params = [self.W]
def __init__(self, input_layer, n_filters, filter_size, weights_std,
             init_bias_value, stride=1, nonlinearity=layers.rectify,
             dropout=0., partial_sum=None, untie_biases=False):
    """
    This is a convolution which is circular in the 0-direction, and valid
    in the 1-direction.

    n_filters should be a multiple of 16

    The settings are stored on the instance, the weight and bias shared
    variables are created and initialised through ``reset_params`` and
    the FilterActs op is instantiated.
    """
    # --- settings ---------------------------------------------------
    self.input_layer = input_layer
    self.n_filters = n_filters
    self.filter_size = filter_size
    self.weights_std = np.float32(weights_std)
    self.init_bias_value = np.float32(init_bias_value)
    self.stride = stride
    self.nonlinearity = nonlinearity
    self.dropout = dropout
    self.partial_sum = partial_sum
    # if untie_biases == True, each position in the output map has its
    # own bias (as opposed to having the same bias everywhere for a
    # given filter)
    self.untie_biases = untie_biases

    # --- shapes -----------------------------------------------------
    self.mb_size = self.input_layer.mb_size
    self.input_shape = self.input_layer.get_output_shape()
    self.filter_shape = (self.input_shape[0], filter_size, filter_size,
                         n_filters)

    # --- parameters -------------------------------------------------
    self.W = layers.shared_single(4)
    # untied biases are 3-dimensional, tied biases 1-dimensional
    self.b = layers.shared_single(3 if self.untie_biases else 1)

    self.params = [self.W, self.b]
    self.bias_params = [self.b]
    self.reset_params()

    self.filter_acts_op = FilterActs(stride=self.stride,
                                     partial_sum=self.partial_sum)
def __init__(self, incoming, num_filters, filter_size, groups=1,
             strides=(1, 1), border_mode=None, untie_biases=False,
             W=init.Uniform(), b=init.Constant(0.),
             nonlinearity=nonlinearities.rectify, pad=None,
             dimshuffle=True, flip_filters=False, partial_sum=1, **kwargs):
    """
    Initialise the parent layer with every argument except ``groups``,
    then replace ``self.filter_acts_op`` with a FilterActs op that is
    additionally given ``numGroups=groups``.
    """
    # Forward everything but `groups` to the parent as keyword options.
    parent_options = dict(kwargs)
    parent_options.update(
        strides=strides,
        border_mode=border_mode,
        untie_biases=untie_biases,
        W=W,
        b=b,
        nonlinearity=nonlinearity,
        pad=pad,
        dimshuffle=dimshuffle,
        flip_filters=flip_filters,
        partial_sum=partial_sum,
    )
    super(CaffeConv2DCCLayer, self).__init__(
        incoming, num_filters, filter_size, **parent_options)

    self.groups = groups
    self.filter_acts_op = FilterActs(
        numGroups=self.groups,
        stride=self.stride,
        partial_sum=self.partial_sum,
        pad=self.pad,
    )
def __init__(
        self,
        n_filters,
        filter_size,
        weights_std=0.01,
        init_bias_value=0.1,
        stride=1,
        activation='relu',
        partial_sum=None,
        pad=0,
        untie_biases=False,
        # check the keyword arguments if not on default values
        initW='truncated_normal',
        initB='constant',
        initial_weights=None,
        W_regularizer=None,
        W_constraint=None,
        b_regularizer=None,
        b_constraint=None,
        **kwargs):
    """
    Only the valid border mode is supported.

    n_filters should be a multiple of 16

    Resolves the weight and bias initializers and the activation by
    name, stores the settings, creates the FilterActs op and finally
    forwards ``**kwargs`` to the parent constructor.
    """
    # Initializers are looked up by class name with a one-entry config.
    weight_init_spec = {
        'class_name': initW,
        'config': {'stddev': weights_std},
    }
    bias_init_spec = {
        'class_name': initB,
        'config': {'value': init_bias_value},
    }
    self.initW = initializers.get(weight_init_spec)
    self.initB = initializers.get(bias_init_spec)
    self.initial_weights = initial_weights

    self.n_filters = n_filters
    self.filter_size = filter_size
    self.weights_std = np.float32(weights_std)
    self.init_bias_value = np.float32(init_bias_value)
    self.stride = stride
    self.nonlinearity = activations.get(activation)
    self.partial_sum = partial_sum
    self.pad = pad
    self.untie_biases = untie_biases

    # Regularizers and constraints are stored as given.
    self.W_regularizer, self.W_constraint = W_regularizer, W_constraint
    self.b_regularizer, self.b_constraint = b_regularizer, b_constraint

    self.filter_acts_op = FilterActs(
        stride=self.stride, partial_sum=self.partial_sum, pad=self.pad)

    super(kerasCudaConvnetConv2DLayer, self).__init__(**kwargs)
def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
             pad=0, untie_biases=False, W=None, b=init.Constant(0.),
             nonlinearity=nonlinearities.rectify, dimshuffle=True,
             flip_filters=False, partial_sum=1, **kwargs):
    """
    Set up a 2D convolution layer backed by cuda-convnet's FilterActs.

    When ``W`` is None a ``GlorotUniform`` initializer is chosen,
    created with ``c01b=True`` if ``dimshuffle`` is False. After the
    parent constructor has run, the configuration is validated and the
    FilterActs op is stored in ``self.filter_acts_op``.

    Raises
    ------
    RuntimeError
        If the filters, strides or (tuple) padding are not square, if
        ``num_filters`` is not a multiple of 16, or if the number of
        input channels is neither below 4 nor a multiple of 4.
    """
    if W is None:
        if dimshuffle:
            W = init.GlorotUniform()
        else:
            W = init.GlorotUniform(c01b=True)
    # Stored before calling the parent constructor; presumably the
    # parent already needs it while building parameters -- TODO confirm.
    self.dimshuffle = dimshuffle

    super(Conv2DCCLayer, self).__init__(incoming, num_filters, filter_size,
                                        stride, pad, untie_biases, W, b,
                                        nonlinearity, flip_filters, n=2,
                                        **kwargs)
    self.partial_sum = partial_sum

    # The checks read the attributes set by the parent constructor, while
    # the messages are formatted with the raw constructor arguments.
    if self.filter_size[0] != self.filter_size[1]:
        raise RuntimeError("Conv2DCCLayer only supports square filters, "
                           "but filter_size=(%d, %d)" % filter_size)

    if self.stride[0] != self.stride[1]:
        raise RuntimeError("Conv2DCCLayer only supports square strides, "
                           "but stride=(%d, %d)" % stride)

    if self.num_filters % 16 != 0:
        raise RuntimeError("Conv2DCCLayer requires num_filters to be a "
                           "multiple of 16, but num_filters is "
                           "%d" % num_filters)

    if not (self.num_input_channels < 4 or
            self.num_input_channels % 4 == 0):
        raise RuntimeError("Conv2DCCLayer requires the number of input "
                           "channels to be 1, 2, 3 or a multiple of 4, "
                           "but it is %d" % self.num_input_channels)

    # Reduce the padding to the single integer passed to FilterActs. If
    # self.pad is neither a tuple, 'same' nor 'full', the `pad` argument
    # is passed through unchanged.
    if isinstance(self.pad, tuple):
        if self.pad[0] != self.pad[1]:
            raise RuntimeError("Conv2DCCLayer only supports square "
                               "padding, but pad=(%d, %d)" % pad)
        pad = self.pad[0]
    elif self.pad == 'same':
        pad = self.filter_size[0] // 2
    elif self.pad == 'full':
        pad = self.filter_size[0] - 1

    # Without dimshuffling, untied biases are re-registered with the
    # shape (num_filters, output_shape[1], output_shape[2]), replacing
    # the bias parameter created by the parent constructor.
    if not self.dimshuffle and self.untie_biases and self.b is not None:
        del self.params[self.b]
        biases_shape = (num_filters, self.output_shape[1],
                        self.output_shape[2])
        self.b = self.add_param(b, biases_shape, name="b",
                                regularizable=False)

    self.filter_acts_op = FilterActs(stride=self.stride[0],
                                     partial_sum=self.partial_sum,
                                     pad=pad)
def compileActivation(self, net, layerNum):
    """
    Build the symbolic activation of this convolution layer and append
    it to ``net.varArrayA``.

    The layer input is ``net.x`` for the first layer and the previous
    entry of ``net.varArrayA`` otherwise. Depending on
    ``self.optimized`` the convolution (and the optional max pooling)
    uses the cuda-convnet ops (FilterActs / MaxPool) or the theano ones
    (conv2d / max_pool_2d). Nothing is returned.
    """
    variable = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]

    # Calc shapes for reshape function on-the-fly. Assume we have square images as input.
    sX = T.cast(T.sqrt(T.shape(variable)[0] / self.kernel_shape[1]), 'int16')

    # Converts input from 2 to 4 dimensions
    Xr = T.reshape(variable.T, (T.shape(variable)[1], self.kernel_shape[1], sX, sX))

    if self.optimized:
        # Side length to which the FilterActs output is cropped below.
        out_size = T.cast(
            T.ceil((T.shape(Xr)[-1] - T.shape(net.varWeights[layerNum]['w'])[-1] + 1) / np.float32(self.stride)),
            'int32')

        conv_op = FilterActs(stride=self.stride)
        input_shuffled = Xr.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_shuffled = net.varWeights[layerNum]['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_flipped = filters_shuffled[:, ::-1, ::-1, :]  # flip rows and columns
        contiguous_input = gpu_contiguous(input_shuffled)
        # When dropout is enabled the filters are multiplied by the
        # layer's dropout vectors, broadcast over the first and last axes.
        contiguous_filters = gpu_contiguous(filters_flipped *
                                            (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x') if self.dropout else 1.0))
        a = conv_op(contiguous_input, contiguous_filters)
        a = a[:, :out_size, :out_size, :]
        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle(0, 'x', 'x', 'x')
    else:
        a = T.nnet.conv2d(Xr, net.varWeights[layerNum]['w'] *
                          (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1) if self.dropout else 1.0),
                          border_mode='valid',
                          subsample=(self.stride, self.stride))
        # Add bias
        a = a + net.varWeights[layerNum]['b'].dimshuffle('x', 0, 'x', 'x')

    if self.pooling:
        if self.optimized:
            # Pooling
            # ds - side of square pool window
            # stride - Defines the stride size between successive pooling squares.
            # Setting this parameter smaller than sizeX produces overlapping pools.
            # Setting it equal to sizeX gives the usual, non-overlapping pools. Values greater than sizeX are not allowed.
            pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)

            contiguous_input = gpu_contiguous(a)
            a = pool_op(contiguous_input)
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01
        else:
            a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
    else:
        if self.optimized:
            a = a.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # Back to 2 dimensions, transposed like the layer input.
    a = T.flatten(a, outdim=2).T

    # Apply the layer's activation function (it also receives pool_size).
    a = self.activation(a, self.pool_size)

    net.varArrayA.append(a)
def dropout_fprop(self, input):
    """
    Forward propagation with dropout applied to the input.

    The input is multiplied by a binomial mask, reshaped to
    ``self.image_shape``, convolved with FilterActs, max-pooled, biased
    and passed through the activation. ``apply_format`` is applied to
    the parameters and to every intermediate result; all intermediates
    are kept as attributes (``fixed_W``, ``fixed_b``, ``mask``,
    ``fixed_x``, ``z``, ``fixed_z``, ``u``, ``fixed_u``, ``y``,
    ``fixed_y``).

    Returns
    -------
    self.fixed_y : the formatted, flattened activation.
    """

    def _quantize(value, value_range):
        # Shared call pattern for reducing the precision of a value.
        return apply_format(self.format, value, self.comp_precision, value_range)

    # we reduce the precision of parameters for the computations
    self.fixed_W = _quantize(self.W, self.w_range)
    self.fixed_b = _quantize(self.b, self.b_range)

    # create the dropout mask
    # The cast is important because
    # int * float32 = float64 which pulls things off the gpu
    seed = self.rng.randint(999999)
    srng = T.shared_randomstreams.RandomStreams(seed)
    raw_mask = srng.binomial(n=1, p=self.p, size=T.shape(input))
    self.mask = T.cast(raw_mask, theano.config.floatX)
    input = input * self.mask

    self.fixed_x = input.reshape(self.image_shape)

    # convolution (operands go from bc01 to c01b)
    images_c01b = self.fixed_x.dimshuffle(1, 2, 3, 0)
    filters_c01b = self.fixed_W.dimshuffle(1, 2, 3, 0)
    # augment partial sum -> use less memory but slower
    conv_op = FilterActs(stride=self.filter_stride,
                         partial_sum=self.partial_sum,
                         pad=self.zero_pad)
    conv_c01b = conv_op(gpu_contiguous(images_c01b),
                        gpu_contiguous(filters_c01b))

    self.z = conv_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    self.fixed_z = _quantize(self.z, self.z_range)

    # max pooling on the formatted convolution output (bc01 to c01b and
    # back again)
    pool_input = gpu_contiguous(self.fixed_z.dimshuffle(1, 2, 3, 0))
    pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
    pooled_out = pool_op(pool_input).dimshuffle(3, 0, 1, 2)

    # bias
    self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x')
    self.fixed_u = _quantize(self.u, self.z_range)

    # activation
    self.y = self.activation(self.fixed_u).flatten(2)
    self.fixed_y = _quantize(self.y, self.y_range)

    return self.fixed_y
def apply_conv(self, input):
    """
    This method applies the convolution operation on the input provided

    @note Convolution operation in this version is not as powerful as using dnn_conv

    @param input: symbolic tensor of shape image_shape (theano.tensor.dtensor4)
                  A 4D tensor with the axes representing batch size, number of
                  channels, image height, and image width.
    ----------------------------------------------------------------------------------
    @return output : A 4D tensor of filtered images (feature maps) with dimensions
                     representing batch size, number of filters, feature map height,
                     and feature map width.

                     The height and width of the feature map depend on the border
                     mode. For 'valid' it is ``image_size - filter_size + 1`` while
                     for 'full' it is ``image_size + filter_size - 1``
    ----------------------------------------------------------------------------------
    Limitations of using FilterActs compared to conv2d:

    > Number of channels <= 3; If you want to compute the gradient,
      it should be divisible by 4.
    > Filters must be square.
    > Number of filters must be a multiple of 16
    > All minibatch sizes are supported, but the best performance is achieved when the
      minibatch size is a multiple of 128.
    > Works only on the GPU

    The result is also stored in ``self.output``. When ``self.pool`` is
    True the convolution output is max-pooled with a square window of
    side ``self.pool_size[0]`` (stride equal to the window) before the
    bias is added.
    """
    input_shuffled = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    filters_shuffled = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b

    # Use zero padding with (filter_size - 1) border i.e. full convolution
    # NOTE(review): filter_shape[0] is used as the filter size here; if
    # filter_shape is stored in bc01 order (as the dimshuffle of W above
    # suggests) index 0 is the number of filters, not the filter height
    # -- confirm against the constructor.
    if self.border_mode == "full":
        padding = self.filter_shape[0] - 1
    else:
        padding = 0

    conv_op = FilterActs(stride=1, partial_sum=1, pad=padding)
    contiguous_input = gpu_contiguous(input_shuffled)
    contiguous_filters = gpu_contiguous(filters_shuffled)
    conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

    # The previous version passed the FilterActs *op* (not its result)
    # to max_pool_2d, overwriting the MaxPool result, and assigned the op
    # itself to the output when pooling was disabled. Both branches now
    # work on the symbolic convolution result and shuffle it back to
    # bc01, as the docstring promises.
    if self.pool == True:
        pool_op = MaxPool(ds=self.pool_size[0], stride=self.pool_size[0])
        pooled_out_shuffled = pool_op(conv_out_shuffled)
        pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    else:
        pooled_out = conv_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    self.output = pooled_out

    # Tied biases hold one value per filter and are broadcast over the
    # batch and both spatial axes; untied biases are only broadcast over
    # the batch axis.
    if self.tied_biases:
        self.output += self.b.dimshuffle("x", 0, "x", "x")
    else:
        self.output += self.b.dimshuffle('x', 0, 1, 2)

    return self.output
def make_funcs(batch_size, rows, cols, channels, filter_rows, num_filters):
    """
    Create the pair of benchmark functions: the same random images and
    square filters are convolved once with FilterActs and once with
    conv2d (valid mode).

    Both returned theano functions take no arguments and write their
    result into a shared variable through ``updates``.

    Returns
    -------
    cuda_convnet, baseline : theano functions
    """
    rng = np.random.RandomState([2012, 10, 9])

    filter_cols = filter_rows

    # Images first, then filters: the draw order fixes the values.
    image_data = rng.uniform(-1., 1., (channels, rows, cols, batch_size))
    base_image_value = image_data.astype('float32')
    filter_data = rng.uniform(
        -1., 1., (channels, filter_rows, filter_cols, num_filters))
    base_filters_value = filter_data.astype('float32')

    # --- cuda-convnet ----------------------------------------------
    # bench.py should always be run in gpu mode so we should not need a gpu_from_host here
    images = shared(base_image_value)
    filters = shared(base_filters_value, name='filters')
    output = FilterActs()(images, filters)
    # Evaluated once to allocate the buffer that the function refreshes.
    output_shared = shared(output.eval())
    cuda_convnet = function([], updates={output_shared: output})
    cuda_convnet.name = 'cuda_convnet'

    # --- theano baseline -------------------------------------------
    # Last axis moved to the front; filters flipped along both spatial
    # axes.
    axis_order = (3, 0, 1, 2)
    images_bc01v = base_image_value.transpose(*axis_order)
    filters_bc01v = base_filters_value.transpose(*axis_order)
    filters_bc01v = filters_bc01v[:, :, ::-1, ::-1]

    output_conv2d = conv2d(shared(images_bc01v), shared(filters_bc01v),
                           border_mode='valid',
                           image_shape=images_bc01v.shape,
                           filter_shape=filters_bc01v.shape)
    output_conv2d_shared = shared(output_conv2d.eval())
    baseline = function([], updates={output_conv2d_shared: output_conv2d})
    baseline.name = 'baseline'

    return cuda_convnet, baseline
def lmul(self, x):
    """
    dot(x, A)
    aka, do convolution with input image x

    Parameters
    ----------
    x : 4D theano variable
        Input laid out according to ``self.input_axes``. A variable
        whose type name does not contain 'Cuda' is transferred to the
        GPU, and the result is transferred back to the host.

    Returns
    -------
    rval : 4D theano variable
        Output of FilterActs laid out according to ``self.output_axes``.
    """

    came_from_host = 'Cuda' not in str(type(x))
    if came_from_host:
        x = gpu_from_host(x)

    # FilterActs needs: channel, topo dim 0, topo dim 1, batch_index
    op_axes = ('c', 0, 1, 'b')

    assert x.ndim == 4
    source_axes = self.input_axes
    assert len(source_axes) == 4
    if tuple(source_axes) != op_axes:
        order = [source_axes.index(axis) for axis in op_axes]
        x = x.dimshuffle(*order)

    x = gpu_contiguous(x)

    conv_op = FilterActs(self.pad, self.partial_sum)
    rval = conv_op(x, self._filters)

    # Rearrange the result for the output space.
    target_axes = self.output_axes
    assert len(target_axes) == 4
    if tuple(target_axes) != op_axes:
        order = [op_axes.index(axis) for axis in target_axes]
        rval = rval.dimshuffle(*order)

    if came_from_host:
        rval = host_from_gpu(rval)

    return rval
def test_filter_acts_strided():
    # Compares FilterActs against the python reference implementation
    # (FilterActs_python) for every stride from 1 up to the filter size,
    # on several image/filter shapes.
    #
    # NOTE(review): when the values differ by more than the tolerance the
    # differences are only printed; the test fails only on a type, dtype
    # or shape mismatch.

    rng = np.random.RandomState([2012, 10, 9])

    # Each entry: [img_shape, filter_shape], i.e.
    # [(channels, rows, cols, batch_size),
    #  (channels, filter_rows, filter_cols, num_filters)]
    shape_list = [
        [(1, 7, 8, 5), (1, 2, 2, 16)],
        [(3, 7, 8, 5), (3, 3, 3, 16)],
        [(16, 11, 11, 4), (16, 4, 4, 16)],
        [(3, 20, 20, 3), (3, 5, 5, 16)],
        [(3, 21, 21, 3), (3, 6, 6, 16)],
    ]

    for test_idx, (image_shape, filter_shape) in enumerate(shape_list):
        images = rng.uniform(-1., 1., image_shape).astype('float32')
        filters = rng.uniform(-1., 1., filter_shape).astype('float32')
        gpu_images = float32_shared_constructor(images, name='images')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for stride in xrange(1, filters.shape[1] + 1):
            graph = host_from_gpu(
                FilterActs(stride=stride)(gpu_images, gpu_filters))
            f = function([], graph)
            output_val = f()

            output_python = FilterActs_python(images, filters, stride)

            if not (np.abs(output_val - output_python).max() > 8.6e-6):
                continue

            # Out of tolerance: report the discrepancy.
            assert type(output_val) == type(output_python)
            assert output_val.dtype == output_python.dtype
            if output_val.shape != output_python.shape:
                print('cuda-convnet shape: ', output_val.shape)
                print('python conv shape: ', output_python.shape)
                assert False
            err = np.abs(output_val - output_python)
            print('stride %d' % stride)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ', (output_val.min(), output_val.max()))
            print('python conv value range: ', (output_python.min(), output_python.max()))
def lmul(self, x):
    """
    Apply the linear operator to `x` (dot(x, A)); for this class that is a
    cuda-convnet ``FilterActs`` convolution of `x` with ``self._filters``.

    The input is permuted from ``self.input_axes`` to the
    ('c', 0, 1, 'b') order before the op is applied, and the result is
    permuted to ``self.output_axes``. Variables whose type name does not
    contain "Cuda" are transferred to the GPU on entry and back to the
    host on exit.
    """
    c01b = ('c', 0, 1, 'b')

    def permuted(var, src_axes, dst_axes):
        # Re-order `var` from src_axes to dst_axes; no-op when they agree.
        if tuple(src_axes) != tuple(dst_axes):
            var = var.dimshuffle(*[src_axes.index(axis) for axis in dst_axes])
        return var

    came_from_host = 'Cuda' not in str(type(x))
    if came_from_host:
        x = gpu_from_host(x)

    assert x.ndim == 4
    source_axes = self.input_axes
    assert len(source_axes) == 4
    x = gpu_contiguous(permuted(x, source_axes, c01b))

    rval = FilterActs(self.pad, self.partial_sum)(x, self._filters)

    # Format the output based on the output space.
    target_axes = self.output_axes
    assert len(target_axes) == 4
    rval = permuted(rval, c01b, target_axes)

    if came_from_host:
        rval = host_from_gpu(rval)
    return rval
def fp(self, x, _):
    """
    Forward pass of the convolution; the result is stored in
    ``self.output`` (nothing is returned).

    On the GPU path the cuda-convnet ``FilterActs`` op is used with the
    first entry of ``self.subsample`` as stride; otherwise ``conv.conv2d``
    is used. The second positional argument is ignored.
    """
    if not self.on_gpu:
        self.output = conv.conv2d(x, self.W,
                                  filter_shape=self.filter_shape,
                                  image_shape=self.in_shape,
                                  subsample=self.subsample,
                                  border_mode=self.border_mode)
        return

    print("conv on gpu...")
    conv_op = FilterActs(stride=self.subsample[0])

    # FilterActs works on c01b tensors, so shuffle bc01 -> c01b on the way
    # in and c01b -> bc01 on the way out.
    images_c01b = gpu_contiguous(x.dimshuffle(1, 2, 3, 0))
    filters_c01b = gpu_contiguous(self.W.dimshuffle(1, 2, 3, 0))
    result_c01b = conv_op(images_c01b, filters_c01b)
    self.output = result_c01b.dimshuffle(3, 0, 1, 2)
def __init__(self, input_layer, n_filters, filter_size, weights_std,
             stride=1, nonlinearity=layers.rectify, dropout=0.,
             partial_sum=None, pad=0, trainable=True):
    """
    Set up a cuda-convnet convolutional layer on top of `input_layer`.

    Only the valid border mode is supported.
    n_filters should be a multiple of 16.

    The input layer must produce a 4-D output and use the same data order
    as this layer (``layers.data_order.type2``). The filter bank has shape
    (n_channels, filter_size, filter_size, n_filters), where n_channels is
    the first entry of the input layer's output shape.
    """
    # Input description.
    self.input_layer = input_layer
    self.input_shape = self.input_layer.get_output_shape()
    n_channels = self.input_shape[0]

    # Hyper-parameters, stored as given.
    self.n_filters = n_filters
    self.n_channels = n_channels
    self.filter_size = filter_size
    self.weights_std = numpy.float32(weights_std)
    self.stride = stride
    self.nonlinearity = nonlinearity
    self.dropout = dropout
    self.partial_sum = partial_sum
    self.pad = pad
    self.mb_size = self.input_layer.mb_size
    self.data_order = layers.data_order.type2

    # Sanity checks on what the input layer provides.
    n_input_dims = len(self.input_layer.get_output_shape())
    assert n_input_dims == 4, 'Input must have 4 dimensions.'
    assert self.input_layer.data_order == self.data_order, \
        "Input data order does not match this layer's data order."

    self.filter_shape = (n_channels, filter_size, filter_size, n_filters)
    self.trainable = trainable

    # Parameters: a single 4-D shared weight tensor, initialised by
    # reset_params().
    self.W = layers.shared_single(4)
    self.params = [self.W]
    self.reset_params()

    self.filter_acts_op = FilterActs(stride=self.stride,
                                     partial_sum=self.partial_sum,
                                     pad=self.pad)
def __init__(self, layer_def, input, input_shape, rs, clone_from=None):
    """
    Create a (GPU only) convolutional layer with shared variable internal
    parameters. Each filter has a corresponding bias.

    :type layer_def: Element, xml containing the configuration for the
        Conv layer. The ``name`` attribute and the ``convpad``,
        ``convstride``, ``numfilters``, ``filtersize`` and ``bias`` child
        elements are read.

    :type input: tensor.tensor4

    :type input_shape: tuple or list of size 4
    :param input_shape: [channels, height, width, batchsize] (c01b)

    :type rs: a random number generator
    :param rs: NOTE(review): currently unused -- the weights are drawn
        from a fresh, time-seeded ``numpy.random.RandomState``. Confirm
        whether `rs` was meant to be used instead.

    :param clone_from: optional layer whose ``W`` and ``b`` shared
        variables are reused instead of creating new parameters.
    """
    layer_name = layer_def.attrib["name"]
    conv_pad = int(layer_def.find("convpad").text)
    conv_stride = int(layer_def.find("convstride").text)
    num_filters = int(layer_def.find("numfilters").text)
    filter_size = int(layer_def.find("filtersize").text)
    init_bias = float(layer_def.find("bias").text)

    self.input = gpu_contiguous(input)
    image_channels, image_size0, image_size1, batch_size = input_shape
    filter_shape = [image_channels, filter_size, filter_size,
                    num_filters]  # c01b

    if clone_from is None:
        rng = np.random.RandomState(seed=int(time.time()))
        # Initialization from the PReLU paper: std = sqrt(2 / fan_in).
        W_bound = np.sqrt(2. / (filter_size * filter_size * image_channels))
        self.W = theano.shared(
            np.asarray(rng.normal(loc=0., scale=W_bound, size=filter_shape),
                       dtype=theano.config.floatX),
            borrow=True, name=layer_name + "-W")
        self.b = theano.shared(
            np.asarray(init_bias * np.ones((num_filters,)),
                       dtype=theano.config.floatX),
            borrow=True, name=layer_name + "-b")
    else:
        # Share parameters with the layer we are cloning.
        self.W = clone_from.W
        self.b = clone_from.b

    # CONV
    conv_op = FilterActs(partial_sum=1, pad=conv_pad, stride=conv_stride)
    contiguous_filters = gpu_contiguous(self.W)
    self.output = (conv_op(self.input, contiguous_filters)
                   + self.b.dimshuffle(0, 'x', 'x', 'x'))

    def _output_size(image_size):
        # (image + 2*pad - filter_size + 1) / stride, plus one extra
        # position when striding. Floor division keeps the size an int
        # regardless of the division semantics in effect.
        # NOTE(review): verify this formula against FilterActs for
        # strides larger than 2.
        size = (image_size + 2 * conv_pad - filter_size + 1) // conv_stride
        return size + (1 if conv_stride > 1 else 0)

    output_size0 = _output_size(image_size0)
    output_size1 = _output_size(image_size1)

    self.input_shape = input_shape  # c01b
    self.output_shape = [num_filters, output_size0, output_size1,
                         batch_size]  # c01b
    self.params = [self.W, self.b]
def test_reject_rect():
    """
    Check that running FilterActs with a non-square kernel is an error.

    The compiled function is expected to raise ``ValueError`` when it is
    called; the test fails if the call succeeds. If the function cannot
    even be compiled, ``KnownFailureTest`` is raised instead.
    """
    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows + 1  # deliberately non-square
    num_filters = 6

    images = shared(
        rng.uniform(-1., 1.,
                    (channels, rows, cols, batch_size)).astype('float32'),
        name='images')
    filters = shared(
        rng.uniform(-1., 1.,
                    (channels, filter_rows, filter_cols,
                     num_filters)).astype('float32'),
        name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)
    output = FilterActs()(gpu_images, gpu_filters)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    # Build the reference filters from `filters` (the original code
    # shuffled `images` here by mistake).
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    try:
        f = function([], [output, output_conv2d])
    except Exception:
        # Do not swallow KeyboardInterrupt / SystemExit here.
        raise KnownFailureTest(
            "cuda-convnet code depends on an unmerged theano feature.")

    try:
        output, output_conv2d = f()
    except ValueError:
        return

    raise AssertionError(
        "FilterActs accepted a non-square kernel without raising ValueError")
def __init__(self, filter_size=7, num_channels=3):
    """
    Compile a FilterActs-based convolution function and build a uniform
    averaging filter bank.

    :param filter_size: side length of the square filters.
    :param num_channels: number of input channels.

    Sets ``self.conv_func`` (a theano function taking an input tensor and
    a filter tensor) and ``self.w``, a float32 array of shape
    (num_channels, filter_size, filter_size, num_filters) in which every
    entry is 1 / (num_channels * filter_size ** 2).
    """
    # magic numbers that make things work for stl10
    self.filter_size = filter_size
    # Floor division so the padding stays an integer under true division.
    self.pad = self.filter_size // 2  # -1
    self.num_channels = num_channels
    self.num_filters = 16

    # Local names chosen so the `input` / `filter` builtins are not shadowed.
    input_var = T.ftensor4(name='input')
    filter_var = T.ftensor4(name='filter')
    gpu_input = gpu_contiguous(input_var)
    gpu_filter = gpu_contiguous(filter_var)

    self.conv_func = theano.function(
        [input_var, filter_var],
        FilterActs(pad=self.pad)(gpu_input, gpu_filter))

    n = self.num_channels * self.filter_size * self.filter_size
    self.w = numpy.float32(
        numpy.ones((self.num_channels, self.filter_size,
                    self.filter_size, self.num_filters))) / n
def __init__(self, incoming, num_filters, filter_size, groups=1,
             stride=(1, 1), border_mode=None, untie_biases=False,
             W=init.Uniform(), b=init.Constant(0.),
             nonlinearity=nonlinearities.rectify, pad=None,
             dimshuffle=True, flip_filters=False, partial_sum=1, **kwargs):
    """
    Grouped variant of the cuda-convnet convolution layer.

    All arguments except `groups` are forwarded to the parent
    constructor; `groups` is passed to ``FilterActs`` as ``numGroups``.

    NOTE(review): `border_mode` is accepted but is not forwarded to the
    parent constructor -- confirm whether that is intended.
    """
    super(CaffeConv2DCCLayer, self).__init__(
        incoming, num_filters, filter_size,
        stride=stride,
        untie_biases=untie_biases,
        W=W,
        b=b,
        nonlinearity=nonlinearity,
        pad=pad,
        dimshuffle=dimshuffle,
        flip_filters=flip_filters,
        partial_sum=partial_sum,
        **kwargs)
    self.groups = groups

    # The FilterActs in pylearn2 cannot accept tuple-type pad: keep an int
    # as is, take the first entry of a tuple, and fall back to 0 otherwise.
    parent_pad = self.pad
    if isinstance(parent_pad, int):
        scalar_pad = parent_pad
    elif isinstance(parent_pad, tuple):
        scalar_pad = parent_pad[0]
    else:
        scalar_pad = 0
    self.pad = scalar_pad

    self.filter_acts_op = FilterActs(numGroups=self.groups,
                                     stride=self.stride,
                                     partial_sum=self.partial_sum,
                                     pad=self.pad)
def fprop(self, input):
    """
    Forward propagation: convolution, max-pooling, bias, activation.

    The weights and biases are first passed through ``apply_format`` (the
    reduced-precision versions are kept in ``self.w_comp`` and
    ``self.b_comp``), and ``apply_format`` is applied again after the bias
    addition and after the activation. The returned variable is the
    activation flattened to two dimensions.
    """
    # Reduce the precision of the parameters used for the computations.
    self.w_comp = apply_format(self.format, self.W,
                               self.comp_precision, self.w_range)
    self.b_comp = apply_format(self.format, self.b,
                               self.comp_precision, self.b_range)

    images = input.reshape(self.image_shape)

    # Convolution. FilterActs works on c01b tensors, so shuffle bc01 -> c01b.
    images_c01b = images.dimshuffle(1, 2, 3, 0)
    filters_c01b = self.w_comp.dimshuffle(1, 2, 3, 0) * self.scale
    conv_op = FilterActs(stride=self.filter_stride,
                         partial_sum=self.partial_sum,
                         pad=self.zero_pad)
    conv_c01b = conv_op(gpu_contiguous(images_c01b),
                        gpu_contiguous(filters_c01b))

    # Max-pool each feature map, then shuffle back c01b -> bc01.
    pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
    pooled = pool_op(conv_c01b).dimshuffle(3, 0, 1, 2)

    # Bias (one value per feature map, broadcast over the other axes).
    biased = pooled + self.b_comp.dimshuffle('x', 0, 'x', 'x') * self.scale
    pre_activation = apply_format(self.format, biased,
                                  self.comp_precision, self.z_range)

    # Activation.
    activated = self.activation(pre_activation)
    return apply_format(self.format, activated.flatten(2),
                        self.comp_precision, self.y_range)
def test_match_grad_valid_conv():
    """
    Check that WeightActs computes the gradient of FilterActs with respect
    to the weights.

    For each tested ``partial_sum`` value, the FilterActs output and the
    WeightActs gradient are compared against theano's ``conv2d`` (valid
    mode, flipped filters) and the gradient theano derives for it.
    """
    for partial_sum in [0, 1, 4]:
        rng = np.random.RandomState([2012, 10, 9])
        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        image_values = rng.uniform(
            -1., 1., (channels, rows, cols, batch_size)).astype('float32')
        images = shared(image_values, name='images')
        filter_values = rng.uniform(
            -1., 1.,
            (channels, filter_rows, filter_cols, num_filters)
        ).astype('float32')
        filters = shared(filter_values, name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        # cuda-convnet forward pass (c01b layout).
        sym_output = host_from_gpu(
            FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters))

        # Reference: theano conv2d on bc01 tensors with flipped filters,
        # shuffled back to c01b.
        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)[:, :, ::-1, ::-1]
        sym_output_conv2d = conv2d(images_bc01, filters_bc01,
                                   border_mode='valid')
        sym_output_conv2d = sym_output_conv2d.dimshuffle(1, 2, 3, 0)

        # Random linear cost so that both gradients are comparable.
        theano_rng = MRG_RandomStreams(2013 + 1 + 31)
        coeffs = theano_rng.normal(avg=0., std=1.,
                                   size=sym_output_conv2d.shape,
                                   dtype='float32')

        cost_conv2d = (coeffs * sym_output_conv2d).sum()
        sym_wgrad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * sym_output).sum()
        hid_acts_grad = T.grad(cost, sym_output)
        sym_wgrad = WeightActs(partial_sum=partial_sum)(
            gpu_images,
            gpu_from_host(hid_acts_grad),
            as_tensor_variable((4, 4))
        )[0]
        sym_wgrad = host_from_gpu(sym_wgrad)

        f = function([], [sym_output, sym_output_conv2d,
                          sym_wgrad, sym_wgrad_conv2d])
        output, output_conv2d, weights_grad, weights_grad_conv2d = f()

        # Forward outputs must agree.
        if np.abs(output - output_conv2d).max() > 8e-6:
            assert type(output) == type(output_conv2d)
            assert output.dtype == output_conv2d.dtype
            if output.shape != output_conv2d.shape:
                print('cuda-convnet shape: ', output.shape)
                print('theano shape: ', output_conv2d.shape)
                assert False
            err = np.abs(output - output_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ',
                  (output.min(), output.max()))
            print('theano value range: ',
                  (output_conv2d.min(), output_conv2d.max()))
            assert False

        warnings.warn(
            "test_match_grad_valid_conv success criterion is not very strict."
            " Can we verify that this is OK? One possibility is that theano"
            " is numerically unstable and Alex's code is better. Probably"
            " theano CPU 64 bit is OK but it's worth checking the others.")

        # Weight gradients must agree.
        if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6:
            if type(weights_grad) != type(weights_grad_conv2d):
                raise AssertionError("weights_grad is of type " +
                                     str(weights_grad))
            assert weights_grad.dtype == weights_grad_conv2d.dtype
            if weights_grad.shape != weights_grad_conv2d.shape:
                print('cuda-convnet shape: ', weights_grad.shape)
                print('theano shape: ', weights_grad_conv2d.shape)
                assert False
            err = np.abs(weights_grad - weights_grad_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ',
                  (weights_grad.min(), weights_grad.max()))
            print('theano value range: ',
                  (weights_grad_conv2d.min(), weights_grad_conv2d.max()))
            assert False
def test_match_valid_conv_padded():
    """
    Check that FilterActs with padding gives the same result as theano's
    conv2d in valid mode applied to an explicitly zero-padded image.
    """
    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16
    pad = 3

    image_values = rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32')
    images = shared(image_values, name='images')
    filter_values = rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32')
    filters = shared(filter_values, name='filters')

    # cuda-convnet path: the op pads internally.
    sym_output = FilterActs(pad)(gpu_from_host(images),
                                 gpu_from_host(filters))
    sym_output = host_from_gpu(sym_output)

    # Reference path: embed the images in a zero canvas, then run a valid
    # convolution with flipped filters.
    canvas = T.alloc(0., batch_size, channels,
                     rows + pad * 2, cols + pad * 2)
    images_bc01 = T.set_subtensor(canvas[:, :, pad:-pad, pad:-pad],
                                  images.dimshuffle(3, 0, 1, 2))
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)[:, :, ::-1, ::-1]
    sym_output_conv2d = conv2d(images_bc01, filters_bc01,
                               border_mode='valid')
    sym_output_conv2d = sym_output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [sym_output, sym_output_conv2d])
    output, output_conv2d = f()

    warnings.warn(
        "test_match_valid_conv success criterion is not very strict. "
        "Can we verify that this is OK? One possibility is that theano "
        "is numerically unstable and Alex's code is better. Probably "
        "theano CPU 64 bit is OK but it's worth checking the others.")

    assert output.shape == output_conv2d.shape

    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ',
              (output_conv2d.min(), output_conv2d.max()))
        assert False
import time import cPickle as pickle import numpy as np import theano import theano.tensor as T from theano.sandbox.cuda.basic_ops import gpu_from_host # Theano's own convolution implementation from theano.tensor.nnet import conv # cuda-convnet convolution implementation from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs filter_acts_op = FilterActs(stride=1, partial_sum=1, pad=0) # FFT-based convolution implementation import fftconv target_path = "speedtest_data.pkl" num_runs = 10 # number of times each convolution is run, # running time is averaged across these runs. atol = 1e-3 rtol = 1e-5 std = 0.1 shapes_list = [ # (input_shape, filter_shape) # ((minibatch_size, num_input_channels, image_width, image_height), # (num_filters, num_input_channels, filter_width, filter_height))
def __init__(self, rng, input, filter_shape, image_shape, pad=0,
             poolsize=(2, 2), activation=T.tanh, poolstride=(2, 2),
             init_type="tanh", W=None, b=None):
    """
    Allocate a LeNetConvPoolLayer with shared variable internal parameters.

    The convolution and pooling are done with the cuda-convnet
    ``FilterActs`` and ``MaxPool`` ops (stride 1 convolution; pooling uses
    the first entry of `poolsize` and `poolstride`).

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano 4-D tensor
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :param pad: zero padding passed to FilterActs

    :type poolsize: tuple or list of length 2
    :param poolsize: the downsampling (pooling) factor (#rows, #cols)

    :param activation: function applied to the pooled output plus bias

    :param poolstride: pooling stride (#rows, #cols)

    :param init_type: "ReLU" selects He (normal) initialization; any other
        value selects Xavier (uniform) initialization

    :param W: optional initial value for the weights
    :param b: optional initial value for the biases
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    # there are "num input feature maps * filter height * filter width"
    # inputs to each hidden unit
    fan_in = numpy.prod(filter_shape[1:])

    if init_type == "ReLU":
        print("ConvPoolLayer with He init")
        std = numpy.sqrt(2.0 / fan_in)
        initial_W = rng.normal(0, std, size=filter_shape)
    else:
        print("ConvPoolLayer with Xavier init")
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        initial_W = rng.uniform(low=-W_bound, high=W_bound,
                                size=filter_shape)
    self.W = theano.shared(
        numpy.asarray(initial_W, dtype=theano.config.floatX),
        borrow=True
    )
    # `is not None` rather than `!= None`: comparing a numpy array with
    # `!=` is elementwise and has no unambiguous truth value.
    if W is not None:
        self.W.set_value(W)

    # the bias is a 1D tensor -- one bias per output feature map
    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, borrow=True)
    if b is not None:
        self.b.set_value(b)

    # convolve input feature maps with filters (FilterActs works on c01b)
    input_shuffled = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    filters_shuffled = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    conv_op = FilterActs(stride=1, partial_sum=1, pad=pad)
    contiguous_input = gpu_contiguous(input_shuffled)
    contiguous_filters = gpu_contiguous(filters_shuffled)
    conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)

    # downsample each feature map individually, using maxpooling
    pool_op = MaxPool(ds=poolsize[0], stride=poolstride[0])
    pooled_out_shuffled = pool_op(conv_out_shuffled)
    pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
    # thus be broadcasted across mini-batches and feature map
    # width & height
    self.output = activation(pooled_out +
                             self.b.dimshuffle('x', 0, 'x', 'x'))

    # Static output shape; the convolution stride is fixed at 1 above.
    # Floor division keeps the sizes integral under true division.
    stride = 1
    assert (image_shape[2] - filter_shape[2] + 2 * pad) % stride == 0
    output_im_size = (image_shape[2] - filter_shape[2] + 2 * pad) // stride + 1
    assert output_im_size % poolsize[0] == 0
    output_im_size = output_im_size // poolsize[0]
    self.output_shape = [image_shape[0], filter_shape[0],
                         output_im_size, output_im_size]

    # store parameters of this layer
    self.params = [self.W, self.b]
def __init__(self, rng, input, filter_shape, image_shape, activation=prelu,
             W1=None, W2=None, b1=None, b2=None):
    """
    Build a block of two "same"-padded cuda-convnet convolutions with a
    skip connection: ``activation(conv2(activation(conv1(input))) + input)``.

    :param rng: numpy.random.RandomState used to initialize the weights
    :param input: symbolic image tensor in bc01 layout
    :param filter_shape: (number of filters, num input feature maps,
        filter height, filter width); the filter height must be odd. The
        second convolution uses the number of filters as its input
        feature-map count. The sequence is not modified.
    :param image_shape: (batch size, num input feature maps,
        image height, image width)
    :param activation: function applied after each convolution
    :param W1, W2, b1, b2: optional initial values for the parameters
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    def _uniform_weights(shape):
        # Uniform init with bound sqrt(6 / (fan_in + fan_out)), where
        # fan_in = prod(shape[1:]) and
        # fan_out = shape[0] * prod(shape[2:]).
        fan_in = numpy.prod(shape[1:])
        fan_out = shape[0] * numpy.prod(shape[2:])
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        return theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )

    def _zero_bias(n_out):
        # The bias is a 1D tensor -- one bias per output feature map.
        return theano.shared(
            value=numpy.zeros((n_out,), dtype=theano.config.floatX),
            borrow=True)

    # ---- first convolution -------------------------------------------
    self.W1 = _uniform_weights(filter_shape)
    # `is not None` rather than `!= None`: comparing a numpy array with
    # `!=` is elementwise and has no unambiguous truth value.
    if W1 is not None:
        self.W1.set_value(W1)
    self.b1 = _zero_bias(filter_shape[0])
    if b1 is not None:
        self.b1.set_value(b1)

    assert filter_shape[2] % 2 == 1  # odd size
    pad = (filter_shape[2] - 1) // 2

    input_shuffled = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    w1_shuffled = self.W1.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    conv_op1 = FilterActs(stride=1, partial_sum=1, pad=pad)
    contiguous_input = gpu_contiguous(input_shuffled)
    contiguous_w1 = gpu_contiguous(w1_shuffled)
    conv_out_1_shuffled = conv_op1(contiguous_input, contiguous_w1)
    conv_out_1 = conv_out_1_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01
    activ_1_out = activation(conv_out_1 +
                             self.b1.dimshuffle('x', 0, 'x', 'x'))

    # ---- second convolution ------------------------------------------
    # Its input has filter_shape[0] feature maps. Build a new shape
    # instead of assigning into `filter_shape`, so the caller's sequence
    # is left untouched and tuples are accepted as well.
    second_filter_shape = ([filter_shape[0], filter_shape[0]] +
                           list(filter_shape[2:]))
    self.W2 = _uniform_weights(second_filter_shape)
    if W2 is not None:
        self.W2.set_value(W2)
    self.b2 = _zero_bias(second_filter_shape[0])
    if b2 is not None:
        self.b2.set_value(b2)

    w2_shuffled = self.W2.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    activ_1_out_shuffled = activ_1_out.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    contiguous_activ_1_out = gpu_contiguous(activ_1_out_shuffled)
    contiguous_w2 = gpu_contiguous(w2_shuffled)
    conv_op2 = FilterActs(stride=1, partial_sum=1, pad=pad)
    conv_out_2_shuffled = conv_op2(contiguous_activ_1_out, contiguous_w2)
    conv_out_2 = conv_out_2_shuffled.dimshuffle(3, 0, 1, 2)  # c01b to bc01

    # Skip connection: add the block input before the final activation.
    self.output = activation(conv_out_2 +
                             self.b2.dimshuffle('x', 0, 'x', 'x') + input)

    # Static output shape; the convolution stride is fixed at 1 above.
    # Floor division keeps the size integral under true division.
    stride = 1
    assert (image_shape[2] - filter_shape[2] + 2 * pad) % stride == 0
    output_im_size = (image_shape[2] - filter_shape[2] + 2 * pad) // stride + 1
    self.output_shape = [image_shape[0], filter_shape[0],
                         output_im_size, output_im_size]

    # store parameters of this layer
    self.params = [self.W1, self.b1, self.W2, self.b2]
def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
             border_mode=None, untie_biases=False, W=None,
             b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
             pad=None, dimshuffle=True, flip_filters=False, partial_sum=1,
             **kwargs):
    """
    2D convolution layer backed by the cuda-convnet ``FilterActs`` op.

    Filters and strides must be square and `num_filters` must be a
    multiple of 16; a ``RuntimeError`` is raised otherwise. The padding
    is given either directly through `pad` or through `border_mode`
    ('valid', 'full' or 'same'); giving both is an error, and giving
    neither means no padding.

    When `W` is None a ``GlorotUniform`` initializer is used (with
    ``c01b=True`` when `dimshuffle` is false). When `b` is None the layer
    has no bias; with `untie_biases` the bias has one value per output
    position, otherwise one value per filter.
    """
    super(Conv2DCCLayer, self).__init__(incoming, **kwargs)

    self.nonlinearity = (nonlinearities.identity if nonlinearity is None
                         else nonlinearity)

    # ---- validate the geometry ---------------------------------------
    filter_size = as_tuple(filter_size, 2)
    stride = as_tuple(stride, 2)

    if filter_size[0] != filter_size[1]:
        raise RuntimeError("Conv2DCCLayer only supports square filters, "
                           "but filter_size=(%d, %d)" % filter_size)
    if stride[0] != stride[1]:
        raise RuntimeError("Conv2DCCLayer only supports square strides, "
                           "but stride=(%d, %d)" % stride)
    if num_filters % 16 != 0:
        raise RuntimeError("Conv2DCCLayer requires num_filters to be a "
                           "multiple of 16, but num_filters is "
                           "%d" % num_filters)

    self.num_filters = num_filters
    self.filter_size = filter_size[0]
    self.stride = stride[0]
    self.untie_biases = untie_biases
    self.dimshuffle = dimshuffle
    self.flip_filters = flip_filters
    self.partial_sum = partial_sum

    # ---- resolve the padding -----------------------------------------
    if border_mode is not None and pad is not None:
        raise RuntimeError("You cannot specify both 'border_mode' and "
                           "'pad'. To avoid ambiguity, please specify "
                           "only one of them.")
    if border_mode is None:
        # Explicit pad if given, otherwise default to valid mode.
        self.pad = 0 if pad is None else pad
    elif border_mode == 'valid':
        self.pad = 0
    elif border_mode == 'full':
        self.pad = self.filter_size - 1
    elif border_mode == 'same':
        # only works for odd filter size, but the even filter size case
        # is probably not worth supporting.
        self.pad = (self.filter_size - 1) // 2
    else:
        raise RuntimeError("Unsupported border_mode for "
                           "Conv2DCCLayer: %s" % border_mode)

    # ---- parameters --------------------------------------------------
    if W is None:
        W = init.GlorotUniform() if dimshuffle else init.GlorotUniform(c01b=True)
    self.W = self.create_param(W, self.get_W_shape())

    if b is None:
        self.b = None
    else:
        if self.untie_biases:
            # One bias per filter and output position; the spatial axes
            # sit at different indices depending on the layout in use.
            output_shape = self.get_output_shape()
            if self.dimshuffle:
                bias_shape = (num_filters, output_shape[2], output_shape[3])
            else:
                bias_shape = (num_filters, output_shape[1], output_shape[2])
        else:
            bias_shape = (num_filters, )
        self.b = self.create_param(b, bias_shape)

    self.filter_acts_op = FilterActs(stride=self.stride,
                                     partial_sum=self.partial_sum,
                                     pad=self.pad)