def setUp(self):
    """Create the GpuWeightActs op and random shared inputs for the tests.

    Reads ``module_stride``, ``partial_sum``, ``ishape``, ``hshape`` and
    ``dtype`` from the test instance and stores ``gwa``, ``gpu_images``
    and ``gpu_hidact`` on it.
    """
    def random_shared(shape):
        # Uniform [0, 1) samples from numpy's global RNG, cast to the
        # configured dtype before being wrapped as a shared variable.
        values = numpy.random.rand(*shape)
        return float32_shared_constructor(values.astype(self.dtype))

    self.gwa = GpuWeightActs(module_stride=self.module_stride,
                             partial_sum=self.partial_sum)
    # Images are drawn before hidden activations so the global RNG is
    # consumed in a fixed order.
    self.gpu_images = random_shared(self.ishape)
    self.gpu_hidact = random_shared(self.hshape)
def run_match(self, images, filters, module_stride, retvals=False, partial_sum=1):
    """Evaluate FilterActs and GpuFilterActs on the same data and compare.

    Parameters
    ----------
    images, filters : arrays handed to both ``theano.shared`` and
        ``float32_shared_constructor``.
    module_stride : forwarded to both op constructors.
    retvals : when true, return ``(cpuval, gpuval)`` instead of asserting.
    partial_sum : forwarded to ``GpuFilterActs`` only.

    Returns
    -------
    ``(cpuval, gpuval)`` if ``retvals`` is true, otherwise ``None`` after
    asserting that both results have the same shape and are all-close.
    """
    gpu_op = GpuFilterActs(module_stride, partial_sum)
    cpu_op = FilterActs(module_stride)

    gpu_inputs = [float32_shared_constructor(arr) for arr in (images, filters)]
    cpu_inputs = [theano.shared(arr) for arr in (images, filters)]

    gpu_expr = gpu_op(*gpu_inputs)
    cpu_expr = cpu_op(*cpu_inputs)

    # A single compiled function evaluates both graphs in one call.
    compare_fn = theano.function([], [cpu_expr, gpu_expr])
    cpu_result, gpu_result = compare_fn()
    gpu_result = numpy.asarray(gpu_result)

    if retvals:
        return cpu_result, gpu_result

    assert cpu_result.shape == gpu_result.shape
    assert numpy.allclose(cpu_result, gpu_result)
def test_weight_acts_strided(): # Tests that WeightActs with all possible strides rng = np.random.RandomState([2012,10,9]) #Each list in shape_list : #[img_shape,filter_shape] #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)] shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)], [(3, 7, 8, 5), (3, 3, 3, 16)], [(16, 11, 11, 4), (16, 4, 4, 16)], [(3, 20, 20, 3), (3, 5, 5, 16)], [(3, 21, 21, 3), (3, 6, 6, 16)], ] for partial_sum in [0, 1, 4]: print "partial_sum: %d"%(partial_sum) for test_idx in xrange(len(shape_list)): images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32') filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32') gpu_images = float32_shared_constructor(images,name='images') print "test case %d..."%(test_idx+1) for ii in xrange(filters.shape[1]): stride = ii + 1 output_python = FilterActs_python(images,filters,stride) _, h_rows, h_cols, _ = output_python.shape if partial_sum == 4: if (h_rows*h_cols)%partial_sum != 0: print "skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum) break hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32') gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts') weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride) weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)( gpu_images, gpu_hidacts, as_tensor_variable((filters.shape[1], filters.shape[2])) )[0] weights_grad = host_from_gpu(weights_grad) f = function([], weights_grad) weights_grad_val = f() warnings.warn("""test_weight_acts_strided success criterion is not very strict.""") if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5: assert type(weights_grad_val) == type(weights_grad_python) assert weights_grad_val.dtype == weights_grad_python.dtype if weights_grad_val.shape != weights_grad_python.shape: print 'cuda-convnet shape: ',weights_grad_val.shape print 'python conv shape: 
',weights_grad_python.shape assert False err = np.abs(weights_grad_val - weights_grad_python) print 'stride %d'%stride print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()) print 'python conv value range: ', (weights_grad_python.min(), weights_grad_python.max())
def test_weight_acts_strided():
    """Check WeightActs against WeightActs_python for every stride.

    For each ``partial_sum`` in ``[0, 1, 4]`` and each (image shape,
    filter shape) pair, strides ``1 .. filters.shape[1]`` are exercised
    and the op's result is compared with the pure-Python reference using
    an absolute tolerance of ``3.4e-5``.

    Raises
    ------
    AssertionError
        If the two results differ in shape, or if their maximum absolute
        difference exceeds the tolerance. Diagnostics are printed first.
    """
    # Loop-invariant notice: emitted once instead of on every iteration.
    warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")

    rng = np.random.RandomState([2012,10,9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)],
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for partial_sum in [0, 1, 4]:
        print("partial_sum: %d"%(partial_sum))
        for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
            # Draw order (images, filters, then hidacts per stride) is kept
            # so the seeded RNG yields the same data as before.
            images = rng.uniform(-1., 1., img_shape).astype('float32')
            filters = rng.uniform(-1., 1., filter_shape).astype('float32')
            gpu_images = float32_shared_constructor(images,name='images')
            print("test case %d..."%(test_idx+1))

            for stride in xrange(1, filters.shape[1] + 1):
                output_python = FilterActs_python(images,filters,stride)
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4 and (h_rows*h_cols)%partial_sum != 0:
                    print("skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum))
                    # NOTE(review): break also drops every larger stride of
                    # this test case; confirm that is intended (vs continue).
                    break
                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')

                weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride)

                weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)(
                    gpu_images,
                    gpu_hidacts,
                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()

                # Compare shapes before subtracting: with mismatched shapes
                # the subtraction would either raise or broadcast silently.
                if weights_grad_val.shape != weights_grad_python.shape:
                    print('cuda-convnet shape: ',weights_grad_val.shape)
                    print('python conv shape: ',weights_grad_python.shape)
                    raise AssertionError("WeightActs output shape differs from the reference")

                err = np.abs(weights_grad_val - weights_grad_python)
                if err.max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    print('stride %d'%stride)
                    print('absolute error range: ', (err.min(), err.max()))
                    print('mean absolute error: ', err.mean())
                    print('cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()))
                    print('python conv value range: ', (weights_grad_python.min(), weights_grad_python.max()))
                    # Previously the mismatch was only printed and the test
                    # still passed.
                    raise AssertionError("WeightActs output differs from the reference beyond tolerance")
def setUp(self):
    """Prepare the op under test and two randomly filled shared inputs.

    Uses the instance's ``module_stride``, ``partial_sum``, ``ishape``,
    ``hshape`` and ``dtype``; sets ``gwa``, ``gpu_images`` and
    ``gpu_hidact``.
    """
    op_kwargs = dict(module_stride=self.module_stride,
                     partial_sum=self.partial_sum)
    self.gwa = GpuWeightActs(**op_kwargs)

    # Image data is sampled first, then hidden activations, both from
    # numpy's global RNG.
    image_values = numpy.random.rand(*self.ishape).astype(self.dtype)
    self.gpu_images = float32_shared_constructor(image_values)

    hidact_values = numpy.random.rand(*self.hshape).astype(self.dtype)
    self.gpu_hidact = float32_shared_constructor(hidact_values)
def setUp(self):
    """Run the base FilterActs fixture, then swap in the GPU op and inputs.

    After ``test_unshared_conv.TestFilterActs.setUp`` has populated
    ``s_images`` and ``s_filters``, both are rebuilt with
    ``float32_shared_constructor`` from their current values, and
    ``gpu_op`` is set to a ``GpuFilterActs`` with ``partial_sum=1``.
    """
    test_unshared_conv.TestFilterActs.setUp(self)
    self.gpu_op = GpuFilterActs(module_stride=self.module_stride,
                                partial_sum=1)

    # Re-wrap images first, then filters, reusing the values created by
    # the base fixture.
    for attr_name in ('s_images', 's_filters'):
        current_value = getattr(self, attr_name).get_value()
        setattr(self, attr_name, float32_shared_constructor(current_value))
def test_image_acts_strided():
    """Check ImageActs against ImageActs_python for every stride.

    For each (image shape, filter shape) pair, strides
    ``1 .. filters.shape[1]`` are exercised. Hidden activations are
    sampled with the shape produced by ``FilterActs_python`` and the op's
    result is compared with the pure-Python reference using an absolute
    tolerance of ``2.1e-5``.

    Raises
    ------
    AssertionError
        If the two results differ in shape, or if their maximum absolute
        difference exceeds the tolerance. Diagnostics are printed first.
    """
    # Loop-invariant notice: emitted once instead of on every iteration.
    warnings.warn("""test_image_acts_strided success criterion is not very strict.""")

    rng = np.random.RandomState([2012,10,9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)],
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
        # Draw order (images, filters, then hidacts per stride) is kept so
        # the seeded RNG yields the same data as before.
        images = rng.uniform(-1., 1., img_shape).astype('float32')
        filters = rng.uniform(-1., 1., filter_shape).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print("test case %d..."%(test_idx+1))

        for stride in xrange(1, filters.shape[1] + 1):
            # The forward reference is only needed for the hidacts shape.
            output_python = FilterActs_python(images,filters,stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
            Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2]))

            Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()

            # Compare shapes before subtracting: with mismatched shapes the
            # subtraction would either raise or broadcast silently.
            if Img_output_val.shape != Img_output_python.shape:
                print('cuda-convnet shape: ',Img_output_val.shape)
                print('python conv shape: ',Img_output_python.shape)
                raise AssertionError("ImageActs output shape differs from the reference")

            err = np.abs(Img_output_val - Img_output_python)
            if err.max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                print('stride %d'%stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max()))
                print('python conv value range: ', (Img_output_python.min(), Img_output_python.max()))
                # Previously the mismatch was only printed and the test
                # still passed.
                raise AssertionError("ImageActs output differs from the reference beyond tolerance")
def test_filter_acts_strided():
    """Check FilterActs against FilterActs_python for every stride.

    For each (image shape, filter shape) pair, strides
    ``1 .. filters.shape[1]`` are exercised and the op's result is
    compared with the pure-Python reference using an absolute tolerance
    of ``8.6e-6``.

    Raises
    ------
    AssertionError
        If the two results differ in shape, or if their maximum absolute
        difference exceeds the tolerance. Diagnostics are printed first.
    """
    rng = np.random.RandomState([2012, 10, 9])

    # Each entry of shape_list is [img_shape, filter_shape] with
    #   img_shape    = (channels, rows, cols, batch_size)
    #   filter_shape = (channels, filter_rows, filter_cols, num_filters)
    shape_list = [
        [(1, 7, 8, 5), (1, 2, 2, 16)],
        [(3, 7, 8, 5), (3, 3, 3, 16)],
        [(16, 11, 11, 4), (16, 4, 4, 16)],
        [(3, 20, 20, 3), (3, 5, 5, 16)],
        [(3, 21, 21, 3), (3, 6, 6, 16)],
    ]

    for test_idx, (img_shape, filter_shape) in enumerate(shape_list):
        # Images are drawn before filters so the seeded RNG yields the
        # same data as before.
        images = rng.uniform(-1., 1., img_shape).astype('float32')
        filters = rng.uniform(-1., 1., filter_shape).astype('float32')
        gpu_images = float32_shared_constructor(images, name='images')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for stride in xrange(1, filters.shape[1] + 1):
            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()

            output_python = FilterActs_python(images, filters, stride)

            # Compare shapes before subtracting: with mismatched shapes the
            # subtraction would either raise or broadcast silently.
            if output_val.shape != output_python.shape:
                print('cuda-convnet shape: ', output_val.shape)
                print('python conv shape: ', output_python.shape)
                raise AssertionError(
                    "FilterActs output shape differs from the reference")

            err = np.abs(output_val - output_python)
            if err.max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                print('stride %d' % stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ',
                      (output_val.min(), output_val.max()))
                print('python conv value range: ',
                      (output_python.min(), output_python.max()))
                # Previously the mismatch was only printed and the test
                # still passed.
                raise AssertionError(
                    "FilterActs output differs from the reference beyond "
                    "tolerance")
def test_image_acts_strided(): # Tests that running FilterActs with all possible strides rng = np.random.RandomState([2012,10,9]) #Each list in shape_list : #[img_shape,filter_shape] #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)] shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)], [(3, 7, 8, 5), (3, 3, 3, 16)], [(16, 11, 11, 4), (16, 4, 4, 16)], [(3, 20, 20, 3), (3, 5, 5, 16)], [(3, 21, 21, 3), (3, 6, 6, 16)], ] for test_idx in xrange(len(shape_list)): images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32') filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32') gpu_images = float32_shared_constructor(images,name='images') gpu_filters = float32_shared_constructor(filters,name='filters') print "test case %d..."%(test_idx+1) for ii in xrange(filters.shape[1]): stride = ii + 1 output_python = FilterActs_python(images,filters,stride) hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32') gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts') Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2])) Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2]))) Img_output = host_from_gpu(Img_output) f = function([], Img_output) Img_output_val = f() warnings.warn("""test_image_acts_strided success criterion is not very strict.""") if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5: assert type(Img_output_val) == type(Img_output_python) assert Img_output_val.dtype == Img_output_python.dtype if Img_output_val.shape != Img_output_python.shape: print 'cuda-convnet shape: ',Img_output_val.shape print 'python conv shape: ',Img_output_python.shape assert False err = np.abs(Img_output_val - Img_output_python) print 'stride %d'%stride print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', 
(Img_output_val.min(), Img_output_val.max()) print 'python conv value range: ', (Img_output_python.min(), Img_output_python.max())
def test_blocksparse_grad_merge():
    """Apply the same ``W - lr * gW`` update in two modes and compare W.

    The update is compiled once with ``mode_with_gpu`` and once with the
    default mode. Each compiled function is run from the same starting
    ``W`` and the resulting values must be all-close.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    W = float32_shared_constructor(W_val)

    out = sparse_block_gemv_ss(bias.take(out_idx, axis=0), W, hidden,
                               in_idx, out_idx)
    grad_W = theano.grad(out.sum(), W)

    step = numpy.asarray(0.05, dtype='float32')
    new_W = W - step * grad_W

    sym_inputs = [hidden, in_idx, bias, out_idx]
    call_args = (h_val, iIdx_val, b_val, oIdx_val)

    gpu_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                             mode=mode_with_gpu)
    # not running with mode=gpu ensures that the elemwise is not merged in
    ref_fn = theano.function(sym_inputs, updates=[(W, new_W)])

    ref_fn(*call_args)
    W_ref = W.get_value()

    # Restore the starting weights so both functions update the same W.
    W.set_value(W_val)
    gpu_fn(*call_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_blocksparse_grad_merge():
    """Apply the same ``W - lr * gW`` update in two modes and compare W.

    The update is compiled once with ``mode_with_gpu`` and once with a
    reference mode: the default, or FAST_RUN when the configured mode is
    FAST_COMPILE. Each compiled function is run from the same starting
    ``W`` and the resulting values must be all-close.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    W = float32_shared_constructor(W_val)

    out = sparse_block_gemv_ss(bias.take(out_idx, axis=0), W, hidden,
                               in_idx, out_idx)
    grad_W = theano.grad(out.sum(), W)

    step = numpy.asarray(0.05, dtype='float32')
    new_W = W - step * grad_W

    sym_inputs = [hidden, in_idx, bias, out_idx]
    call_args = (h_val, iIdx_val, b_val, oIdx_val)

    gpu_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                             mode=mode_with_gpu)

    # not running with mode=gpu ensures that the elemwise is not merged in
    if theano.config.mode == 'FAST_COMPILE':
        ref_mode = theano.compile.mode.get_mode('FAST_RUN')
    else:
        ref_mode = None
    ref_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                             mode=ref_mode)

    ref_fn(*call_args)
    W_ref = W.get_value()

    # Restore the starting weights so both functions update the same W.
    W.set_value(W_val)
    gpu_fn(*call_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_filter_acts_strided(): # Tests that FilterActs with all possible strides rng = np.random.RandomState([2012,10,9]) #Each list in shape_list : #[img_shape,filter_shape] #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)] shape_list = [[(1, 7, 8, 5), (1, 2, 2, 16)], [(3, 7, 8, 5), (3, 3, 3, 16)], [(16, 11, 11, 4), (16, 4, 4, 16)], [(3, 20, 20, 3), (3, 5, 5, 16)], [(3, 21, 21, 3), (3, 6, 6, 16)], ] for test_idx in xrange(len(shape_list)): images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32') filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32') gpu_images = float32_shared_constructor(images,name='images') gpu_filters = float32_shared_constructor(filters,name='filters') print "test case %d..."%(test_idx+1) for ii in xrange(filters.shape[1]): stride = ii + 1 output = FilterActs(stride=stride)(gpu_images, gpu_filters) output = host_from_gpu(output) f = function([], output) output_val = f() output_python = FilterActs_python(images,filters,stride) if np.abs(output_val - output_python).max() > 8.6e-6: assert type(output_val) == type(output_python) assert output_val.dtype == output_python.dtype if output_val.shape != output_python.shape: print 'cuda-convnet shape: ',output_val.shape print 'python conv shape: ',output_python.shape assert False err = np.abs(output_val - output_python) print 'stride %d'%stride print 'absolute error range: ', (err.min(), err.max()) print 'mean absolute error: ', err.mean() print 'cuda-convnet value range: ', (output_val.min(), output_val.max()) print 'python conv value range: ', (output_python.min(), output_python.max())
def run_match(self, images, filters, module_stride, retvals=False):
    """Evaluate FilterActs and GpuFilterActs on the same data and compare.

    Parameters
    ----------
    images, filters : arrays handed to both ``theano.shared`` and
        ``float32_shared_constructor``.
    module_stride : forwarded to both op constructors.
    retvals : when true, return ``(cpuval, gpuval)`` instead of asserting.

    Returns
    -------
    ``(cpuval, gpuval)`` if ``retvals`` is true, otherwise ``None`` after
    asserting that both results have the same shape and are all-close.
    """
    gpu_op = GpuFilterActs(module_stride)
    cpu_op = FilterActs(module_stride)

    gpu_inputs = [float32_shared_constructor(arr) for arr in (images, filters)]
    cpu_inputs = [theano.shared(arr) for arr in (images, filters)]

    gpu_expr = gpu_op(*gpu_inputs)
    cpu_expr = cpu_op(*cpu_inputs)

    # A single compiled function evaluates both graphs in one call.
    compare_fn = theano.function([], [cpu_expr, gpu_expr])
    cpu_result, gpu_result = compare_fn()
    gpu_result = numpy.asarray(gpu_result)

    if retvals:
        return cpu_result, gpu_result

    assert cpu_result.shape == gpu_result.shape
    assert numpy.allclose(cpu_result, gpu_result)
def Xtest_blocksparse_grad_merge(self):
    """Compare the ``W - lr * gW`` update with and without merge rewrites.

    The update is compiled with ``mode_with_gpu`` and again with the
    ``local_merge_blocksparse_alpha`` and ``local_merge_blocksparse_output``
    optimizations excluded. The first graph's first output must be a
    ``GpuSparseBlockOuter``, the second's must not, and both must produce
    all-close values of ``W`` from the same starting point.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    W = float32_shared_constructor(W_val)

    out = gpu_sparse_block_gemv(bias.take(out_idx, axis=0), W, hidden,
                                in_idx, out_idx)
    grad_W = theano.grad(out.sum(), W)

    step = numpy.asarray(0.05, dtype='float32')
    new_W = W - step * grad_W

    sym_inputs = [hidden, in_idx, bias, out_idx]
    call_args = (h_val, iIdx_val, b_val, oIdx_val)

    merged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                mode=mode_with_gpu)

    # Make sure the lr update was merged.
    merged_op = merged_fn.maker.fgraph.outputs[0].owner.op
    assert isinstance(merged_op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    unmerged_mode = mode_with_gpu.excluding(
        'local_merge_blocksparse_alpha').excluding(
        'local_merge_blocksparse_output')

    unmerged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                  mode=unmerged_mode)

    # Make sure the lr update is not merged.
    unmerged_op = unmerged_fn.maker.fgraph.outputs[0].owner.op
    assert not isinstance(unmerged_op, GpuSparseBlockOuter)

    unmerged_fn(*call_args)
    W_ref = W.get_value()

    # Restore the starting weights so both functions update the same W.
    W.set_value(W_val)
    merged_fn(*call_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)