def test_sum():
    """Check MyGpuNdArray.sum() against numpy over many shapes/layouts.

    For each test shape we iterate over every combination of dtype,
    output offset, input offset, slice step and memory order ('f'/'c'),
    then compare the full reduction with numpy's result.  For 2d and 3d
    inputs the per-axis reductions (patterns 10/01 and 100/010/001) are
    compared as well.

    Relies on module-level ``dtypes_all``, ``enable_double``,
    ``gen_gpu_nd_array`` and ``MyGpuNdArray``.
    """
    to_cpu = numpy.asarray

    dtypes = list(dtypes_all)
    # *int8 removed: the output currently keeps the input dtype,
    # so the accumulation overflows.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # complex removed: need to find how pycuda handles complex in C.
    # Probably just needs an extra header.
    dtypes.remove("complex64")
    if enable_double:
        # complex128 is only present in dtypes_all when doubles are enabled.
        dtypes.remove("complex128")

    def get_rtol(dtype, orig, after_reduction):
        # Relative tolerance for float32 grows with the reduction ratio:
        # summing more elements accumulates more rounding error.
        # Non-float32 dtypes get a tight 1e-8 tolerance.
        if after_reduction.size == 0:
            return 0
        ratio = orig.size // after_reduction.size
        if ratio > 500000:
            rtols = {"float32": 4.3e-5}
        elif ratio > 100000:
            rtols = {"float32": 3e-5}
        elif ratio > 50000:
            rtols = {"float32": 2e-5}
        else:
            rtols = {"float32": 1e-5}
        return rtols.get(dtype, 1e-8)

    for shape in [
            # need something bigger then 32, 1024 or 4096.
            # Those are corner case.

            # 1d, take only a few seconds on a GTX470
            (0,), (5,), (31,), (32,), (33,),
            (1023,), (1024,), (1025,),
            (4095,), (4096,), (4097,),
            (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

            # 2d, take 2 minutes on a GTX 470
            (0, 0), (1, 0), (0, 1), (5, 4),
            (31, 31), (31, 32), (31, 33),
            (32, 31), (32, 32), (32, 33),
            (33, 31), (33, 32), (33, 33),
            (1024, 32), (1025, 32), (1024, 33), (1025, 33),
            (4096, 32), (32, 4096), (4096, 33), (33, 4096),
            (4097, 32), (32, 4097), (4097, 33), (33, 4097),

            # 3d, take 2 minutes on a GTX 470
            (0, 0, 0), (0, 1, 0), (0, 0, 1),
            (5, 4, 3), (5, 4, 3), (5, 4, 3),
            (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
            (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
            (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
            (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

            # 4d, take 1 minutes on a GTX 470
            (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
            (0, 0, 1, 0), (0, 0, 0, 1),
            (5, 4, 3, 2),
            (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
            (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
            (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
            (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
            (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100),

            # 5d, work only if c contiguous
            (5, 4, 3, 10, 11),
            ]:

        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):
            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)
            if len(shape) > 4 and not (
                    gpu_val.flags["C_CONTIGUOUS"] or
                    gpu_val.flags["F_CONTIGUOUS"]):
                # 5d inputs are only supported when contiguous.
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            gpu_sum = to_cpu(gpu_val.sum())

            rtol = get_rtol(dtype, gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
            # Skip *int16 on big inputs: the reduction overflows there.
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                            dtype, shape, cpu_sum, gpu_sum,
                            (cpu_sum - gpu_sum) / cpu_sum)

            # Test pattern 10 and 01
            # Test pattern 100, 010 and 001
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(dtype, gpu_val, gpu_sum)
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                        # Fix: rtol was computed but never passed to
                        # allclose, so the scaled tolerance was unused.
                        assert numpy.allclose(cpu_sum, gpu_sum,
                                              rtol=rtol), (
                            "axis=%d" % axis, dtype, shape,
                            cpu_sum.shape, cpu_sum, gpu_sum,
                            cpu_max, gpu_max,
                            (cpu_max - gpu_max) / cpu_max)
def test_sum():
    """Verify MyGpuNdArray.sum() matches numpy across shapes and layouts.

    Every shape in the corner-case list is exercised under all
    combinations of dtype, output/input offset, slice step and memory
    order; the full sum is compared to numpy, and for 2d/3d inputs each
    single-axis sum is compared as well.

    Relies on module-level ``dtypes_all``, ``enable_double``,
    ``gen_gpu_nd_array`` and ``MyGpuNdArray``.
    """
    to_cpu = numpy.asarray

    dtypes = list(dtypes_all)
    # *int8 removed: output keeps the input dtype, so sums overflow.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # complex removed: need to find how pycuda handles complex in C
    # (probably just needs an extra header).
    dtypes.remove("complex64")
    if enable_double:
        # complex128 only appears in dtypes_all when doubles are on.
        dtypes.remove("complex128")

    def get_rtol(dtype, orig, after_reduction):
        # float32 tolerance scales with how many elements were folded
        # together; everything else is held to 1e-8.
        if after_reduction.size == 0:
            return 0
        ratio = orig.size // after_reduction.size
        if ratio > 500000:
            float32_rtol = 4.3e-5
        elif ratio > 100000:
            float32_rtol = 3e-5
        elif ratio > 50000:
            float32_rtol = 2e-5
        else:
            float32_rtol = 1e-5
        return {"float32": float32_rtol}.get(dtype, 1e-8)

    shapes = [
        # need something bigger then 32, 1024 or 4096.
        # Those are corner case.

        # 1d, take only a few seconds on a GTX470
        (0,), (5,), (31,), (32,), (33,),
        (1023,), (1024,), (1025,),
        (4095,), (4096,), (4097,),
        (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

        # 2d, take 2 minutes on a GTX 470
        (0, 0), (1, 0), (0, 1), (5, 4),
        (31, 31), (31, 32), (31, 33),
        (32, 31), (32, 32), (32, 33),
        (33, 31), (33, 32), (33, 33),
        (1024, 32), (1025, 32), (1024, 33), (1025, 33),
        (4096, 32), (32, 4096), (4096, 33), (33, 4096),
        (4097, 32), (32, 4097), (4097, 33), (33, 4097),

        # 3d, take 2 minutes on a GTX 470
        (0, 0, 0), (0, 1, 0), (0, 0, 1),
        (5, 4, 3), (5, 4, 3), (5, 4, 3),
        (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
        (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
        (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
        (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

        # 4d, take 1 minutes on a GTX 470
        (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
        (0, 0, 1, 0), (0, 0, 0, 1),
        (5, 4, 3, 2),
        (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
        (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
        (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
        (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
        (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100),

        # 5d, work only if c contiguous
        (5, 4, 3, 10, 11),
    ]

    for shape in shapes:
        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):
            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)
            contiguous = (gpu_val.flags["C_CONTIGUOUS"] or
                          gpu_val.flags["F_CONTIGUOUS"])
            if len(shape) > 4 and not contiguous:
                # 5d inputs are only supported when contiguous.
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            gpu_sum = to_cpu(gpu_val.sum())

            rtol = get_rtol(dtype, gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
            # Skip *int16 on big inputs: the reduction overflows there.
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                            dtype, shape, cpu_sum, gpu_sum,
                            (cpu_sum - gpu_sum) / cpu_sum)

            # Test pattern 10 and 01
            # Test pattern 100, 010 and 001
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(dtype, gpu_val, gpu_sum)
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                        # Fix: rtol was computed but never forwarded to
                        # allclose; pass it so the tolerance applies.
                        assert numpy.allclose(cpu_sum, gpu_sum,
                                              rtol=rtol), (
                            "axis=%d" % axis, dtype, shape,
                            cpu_sum.shape, cpu_sum, gpu_sum,
                            cpu_max, gpu_max,
                            (cpu_max - gpu_max) / cpu_max)