def test_sum():
    """Check MyGpuNdArray.sum() against numpy over many shapes/layouts.

    For each test shape we iterate over every combination of dtype,
    output offset, input offset, slice step and memory order ('f'/'c'),
    then compare the full reduction with numpy's result.  For 2d and 3d
    inputs the per-axis reductions (patterns 10/01 and 100/010/001) are
    compared as well.

    Relies on module-level ``dtypes_all``, ``enable_double``,
    ``gen_gpu_nd_array`` and ``MyGpuNdArray``.
    """
    to_cpu = numpy.asarray

    dtypes = list(dtypes_all)
    # *int8 removed: the output currently keeps the input dtype,
    # so the accumulation overflows.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # complex removed: need to find how pycuda handles complex in C.
    # Probably just needs an extra header.
    dtypes.remove("complex64")
    if enable_double:
        # complex128 is only present in dtypes_all when doubles are enabled.
        dtypes.remove("complex128")

    def get_rtol(dtype, orig, after_reduction):
        # Relative tolerance for float32 grows with the reduction ratio:
        # summing more elements accumulates more rounding error.
        # Non-float32 dtypes get a tight 1e-8 tolerance.
        if after_reduction.size == 0:
            return 0
        ratio = orig.size // after_reduction.size
        if ratio > 500000:
            rtols = {"float32": 4.3e-5}
        elif ratio > 100000:
            rtols = {"float32": 3e-5}
        elif ratio > 50000:
            rtols = {"float32": 2e-5}
        else:
            rtols = {"float32": 1e-5}
        return rtols.get(dtype, 1e-8)

    for shape in [
            # need something bigger then 32, 1024 or 4096.
            # Those are corner case.

            # 1d, take only a few seconds on a GTX470
            (0,), (5,), (31,), (32,), (33,),
            (1023,), (1024,), (1025,),
            (4095,), (4096,), (4097,),
            (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

            # 2d, take 2 minutes on a GTX 470
            (0, 0), (1, 0), (0, 1), (5, 4),
            (31, 31), (31, 32), (31, 33),
            (32, 31), (32, 32), (32, 33),
            (33, 31), (33, 32), (33, 33),
            (1024, 32), (1025, 32), (1024, 33), (1025, 33),
            (4096, 32), (32, 4096), (4096, 33), (33, 4096),
            (4097, 32), (32, 4097), (4097, 33), (33, 4097),

            # 3d, take 2 minutes on a GTX 470
            (0, 0, 0), (0, 1, 0), (0, 0, 1),
            (5, 4, 3), (5, 4, 3), (5, 4, 3),
            (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
            (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
            (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
            (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

            # 4d, take 1 minutes on a GTX 470
            (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
            (0, 0, 1, 0), (0, 0, 0, 1),
            (5, 4, 3, 2),
            (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
            (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
            (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
            (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
            (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100),

            # 5d, work only if c contiguous
            (5, 4, 3, 10, 11),
            ]:

        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):
            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)
            if len(shape) > 4 and not (
                    gpu_val.flags["C_CONTIGUOUS"] or
                    gpu_val.flags["F_CONTIGUOUS"]):
                # 5d inputs are only supported when contiguous.
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            gpu_sum = to_cpu(gpu_val.sum())

            rtol = get_rtol(dtype, gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
            # Skip *int16 on big inputs: the reduction overflows there.
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                            dtype, shape, cpu_sum, gpu_sum,
                            (cpu_sum - gpu_sum) / cpu_sum)

            # Test pattern 10 and 01
            # Test pattern 100, 010 and 001
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(dtype, gpu_val, gpu_sum)
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                        # Fix: rtol was computed but never passed to
                        # allclose, so the scaled tolerance was unused.
                        assert numpy.allclose(cpu_sum, gpu_sum,
                                              rtol=rtol), (
                            "axis=%d" % axis, dtype, shape,
                            cpu_sum.shape, cpu_sum, gpu_sum,
                            cpu_max, gpu_max,
                            (cpu_max - gpu_max) / cpu_max)
def test_sum():
    """Verify MyGpuNdArray.sum() matches numpy across shapes and layouts.

    Every shape in the corner-case list is exercised under all
    combinations of dtype, output/input offset, slice step and memory
    order; the full sum is compared to numpy, and for 2d/3d inputs each
    single-axis sum is compared as well.

    Relies on module-level ``dtypes_all``, ``enable_double``,
    ``gen_gpu_nd_array`` and ``MyGpuNdArray``.
    """
    to_cpu = numpy.asarray

    dtypes = list(dtypes_all)
    # *int8 removed: output keeps the input dtype, so sums overflow.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # complex removed: need to find how pycuda handles complex in C
    # (probably just needs an extra header).
    dtypes.remove("complex64")
    if enable_double:
        # complex128 only appears in dtypes_all when doubles are on.
        dtypes.remove("complex128")

    def get_rtol(dtype, orig, after_reduction):
        # float32 tolerance scales with how many elements were folded
        # together; everything else is held to 1e-8.
        if after_reduction.size == 0:
            return 0
        ratio = orig.size // after_reduction.size
        if ratio > 500000:
            float32_rtol = 4.3e-5
        elif ratio > 100000:
            float32_rtol = 3e-5
        elif ratio > 50000:
            float32_rtol = 2e-5
        else:
            float32_rtol = 1e-5
        return {"float32": float32_rtol}.get(dtype, 1e-8)

    shapes = [
        # need something bigger then 32, 1024 or 4096.
        # Those are corner case.

        # 1d, take only a few seconds on a GTX470
        (0,), (5,), (31,), (32,), (33,),
        (1023,), (1024,), (1025,),
        (4095,), (4096,), (4097,),
        (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

        # 2d, take 2 minutes on a GTX 470
        (0, 0), (1, 0), (0, 1), (5, 4),
        (31, 31), (31, 32), (31, 33),
        (32, 31), (32, 32), (32, 33),
        (33, 31), (33, 32), (33, 33),
        (1024, 32), (1025, 32), (1024, 33), (1025, 33),
        (4096, 32), (32, 4096), (4096, 33), (33, 4096),
        (4097, 32), (32, 4097), (4097, 33), (33, 4097),

        # 3d, take 2 minutes on a GTX 470
        (0, 0, 0), (0, 1, 0), (0, 0, 1),
        (5, 4, 3), (5, 4, 3), (5, 4, 3),
        (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
        (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
        (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
        (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

        # 4d, take 1 minutes on a GTX 470
        (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
        (0, 0, 1, 0), (0, 0, 0, 1),
        (5, 4, 3, 2),
        (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
        (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
        (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
        (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
        (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100),

        # 5d, work only if c contiguous
        (5, 4, 3, 10, 11),
    ]

    for shape in shapes:
        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):
            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)
            contiguous = (gpu_val.flags["C_CONTIGUOUS"] or
                          gpu_val.flags["F_CONTIGUOUS"])
            if len(shape) > 4 and not contiguous:
                # 5d inputs are only supported when contiguous.
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            gpu_sum = to_cpu(gpu_val.sum())

            rtol = get_rtol(dtype, gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
            # Skip *int16 on big inputs: the reduction overflows there.
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                            dtype, shape, cpu_sum, gpu_sum,
                            (cpu_sum - gpu_sum) / cpu_sum)

            # Test pattern 10 and 01
            # Test pattern 100, 010 and 001
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(dtype, gpu_val, gpu_sum)
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                        # Fix: rtol was computed but never forwarded to
                        # allclose; pass it so the tolerance applies.
                        assert numpy.allclose(cpu_sum, gpu_sum,
                                              rtol=rtol), (
                            "axis=%d" % axis, dtype, shape,
                            cpu_sum.shape, cpu_sum, gpu_sum,
                            cpu_max, gpu_max,
                            (cpu_max - gpu_max) / cpu_max)