Example #1
File: test_gates.py  Project: phorcys/nufhe
def test_ntt_lsh_method_performance(thread, single_kernel_bootstrap,
                                    heavy_performance_load, ntt_lsh_method):

    if thread.api.get_id() != cuda_id() and ntt_lsh_method == 'cuda_asm':
        pytest.skip()

    size = 4096 if heavy_performance_load else 64

    rng = numpy.random.RandomState()
    secret_key, cloud_key = make_key_pair(thread, rng, transform_type='NTT')

    # TODO: instead of creating a whole key and then checking if the parameters are supported,
    # we can just create a parameter object separately.
    if (single_kernel_bootstrap and not single_kernel_bootstrap_supported(
            secret_key.params, thread.device_params)):
        pytest.skip()

    perf_params = PerformanceParameters(
        secret_key.params,
        single_kernel_bootstrap=single_kernel_bootstrap,
        ntt_lsh_method=ntt_lsh_method)

    results = check_performance(thread, (secret_key, cloud_key),
                                perf_params,
                                shape=size)
    print()
    print(check_performance_str(results))
Example #2
    def __init__(self, api, pnum, dnum):
        self.api_id = api.get_id()
        self.pnum = pnum
        self.dnum = dnum

        self._api = api

        platform = api.get_platforms()[pnum]
        self._device = platform.get_devices()[dnum]

        platform_name = platform.name
        device_name = self._device.name

        self.device_params = api.DeviceParameters(self._device)

        self.device_id = "{api},{pnum},{dnum}".format(api=api.get_id(),
                                                      pnum=pnum,
                                                      dnum=dnum)
        self.device_full_name = platform_name + ", " + device_name

        self.id = self.device_id

        # if we import it at the top of the module, it interferes with the coverage results
        import reikna.cluda as cluda

        self.cuda = (api.get_id() == cluda.cuda_id())
Example #3
def check_func_performance(tag,
                           thread,
                           func_module,
                           reference_func,
                           output_type,
                           input_types,
                           ranges=None,
                           heavy_performance_load=False):

    N = 1024 * (1024 if heavy_performance_load else 32)
    repetitions = 100000

    test = get_func_kernel(thread, func_module, output_type, input_types)
    perf_test = get_func_kernel(thread,
                                func_module,
                                output_type,
                                input_types,
                                repetitions=repetitions)

    arrays = [
        get_test_array(N,
                       tp,
                       val_range=ranges[i] if ranges is not None else None)
        for i, tp in enumerate(input_types)
    ]

    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    # Sanity check
    test(dest_dev, *arrays_dev, global_size=N)
    assert (dest_dev.get() == reference_func(*arrays)).all()

    # Performance check
    times = []

    for j in range(10):
        thread.synchronize()
        t1 = time.time()
        perf_test(dest_dev, *arrays_dev, global_size=N)
        thread.synchronize()
        t2 = time.time()
        times.append(t2 - t1)

    times = numpy.array(times)

    times /= repetitions
    times /= N
    times *= 1e12

    print()
    print(
        "{backend}: {tag} --- min: {min:.4f}, mean: {mean:.4f}, std: {std:.4f}"
        .format(tag=tag,
                min=times.min(),
                mean=times.mean(),
                std=times.std(),
                backend='cuda'
                if thread.api.get_id() == cluda.cuda_id() else 'ocl '))
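
The normalisation at the end of check_func_performance reports the cost of a single function evaluation: the wall-clock time of one launch is divided by the number of in-kernel repetitions and by the number of elements, then scaled to picoseconds. A minimal standalone sketch of the same conversion (the numbers are illustrative, not measured):

elapsed = 2.5                      # seconds for one timed launch (illustrative value)
repetitions = 100000
N = 1024 * 32                      # light-load array size from the example above
per_call_ps = elapsed / repetitions / N * 1e12   # picoseconds per element per repetition
print("{:.4f} ps".format(per_call_ps))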
Example #4
def test_sub_perf(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    if method == "c_from_asm":
        pytest.skip()

    check_func_performance("sub(), " + method, thread, ntt.sub(method=method),
                           ref_sub, 'ff_number', ['ff_number', 'ff_number'])
Example #5
def test_mul(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(
        thread, ntt.mul(method=method), ref_mul, 'ff_number', ['ff_number', 'ff_number'],
        test_values=[
            (ntt_cpu.GaloisNumber.modulus - 1, 2**33) # regression test for an error in method=c
            ]
        )
Example #6
def test_mod(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(
        thread, ntt.mod(method=method), ref_mod, 'ff_number', [numpy.uint64],
        test_values=[
            (ntt_cpu.GaloisNumber.modulus - 1,),
            (ntt_cpu.GaloisNumber.modulus,),
            (ntt_cpu.GaloisNumber.modulus + 1,)])
Example #7
def test_lsh(thread, exp_range, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    exp_dtype = numpy.uint32
    check_func(thread,
               ntt.lsh(exp_range, exp_dtype, method=method),
               ref_lsh,
               'ff_number', ['ff_number', exp_dtype],
               ranges=[None, (exp_range - 32, exp_range)],
               test_values=[(11509900421665959066, exp_range - 1)])
Example #8
def test_add_perf(thread, method, heavy_performance_load):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    if method == "c_from_asm":
        pytest.skip()

    check_func_performance(
        "add(), " + method,
        thread, ntt.add(method=method), ref_add, 'ff_number', ['ff_number', 'ff_number'],
        heavy_performance_load=heavy_performance_load)
Example #9
def test_constant_memory(thr):
    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)
    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(numpy.float32)

    src = """
        KERNEL void test(GLOBAL_MEM float *dest)
        {
            const SIZE_T i = get_global_id(0);
            dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
        }
        """
    constant_arrays = dict(
        arr1=arr1, # use the parameters of an existing array; the array itself is not copied!
        arr2=(global_size * 2, numpy.float32)
        )

    # Compiled-in constant arrays only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile(src, constant_arrays=constant_arrays)
        return

    program = thr.compile(src, constant_arrays=constant_arrays)
    test = program.test

    # TODO: program.constant.arr1.set(arr1)
    program.set_constant('arr1', arr1)
    program.set_constant('arr2', arr2)

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev, global_size=global_size)

    assert diff_is_negligible(dest_dev.get(), ref)


    # Use a program/kernel with different constant arrays
    arr1_2 = get_test_array(global_size, numpy.int32)
    arr2_2 = get_test_array(global_size * 2, numpy.float32)
    ref_2 = ((arr2_2[:global_size] + arr2_2[global_size:]) * arr1_2).astype(numpy.float32)

    program2 = thr.compile(src, constant_arrays=constant_arrays)
    test2 = program2.test
    program2.set_constant('arr1', arr1_2)
    program2.set_constant('arr2', arr2_2)
    test2(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref_2)


    # Check that the first kernel preserved the data in its constant arrays
    test(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref)
Example #10
    def __init__(self, api_id, platform_id, device_id):
        self.api_id = api_id
        self.platform_id = platform_id
        self.device_id = device_id

        # We do not save the actual device_obj because this object
        # can be passed to another thread/process.
        api, platform, device = self._get_objects()
        self.platform_name = platform.name
        self.device_name = device.name
        self.api_name = "CUDA" if api.get_id() == cluda.cuda_id() else "OpenCL"
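
Example #10's constructor stores only plain identifiers and recreates the heavyweight objects on demand through _get_objects(), which keeps the object safe to pass between threads or processes. A sketch of what such a helper could look like, assuming the reikna CLUDA API (cluda.get_api(), get_platforms(), get_devices()); the body is illustrative, not the project's actual code:

    def _get_objects(self):
        # Map the stored API id back to an API module, then index into its
        # platforms and devices with the saved numbers.
        import reikna.cluda as cluda
        api = cluda.get_api(self.api_id)
        platform = api.get_platforms()[self.platform_id]
        device = platform.get_devices()[self.device_id]
        return api, platform, device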
Example #11
def test_constant_memory(thr):
    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)
    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(
        numpy.float32)

    src = """
        KERNEL void test(GLOBAL_MEM float *dest)
        {
            const SIZE_T i = get_global_id(0);
            dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
        }
        """
    constant_arrays = dict(
        arr1=arr1,  # use the parameters of an existing array; the array itself is not copied!
        arr2=(global_size * 2, numpy.float32))

    # Compiled-in constant arrays only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile(src, constant_arrays=constant_arrays)
        return

    program = thr.compile(src, constant_arrays=constant_arrays)
    test = program.test

    # TODO: program.constant.arr1.set(arr1)
    program.set_constant('arr1', arr1)
    program.set_constant('arr2', arr2)

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev, global_size=global_size)

    assert diff_is_negligible(dest_dev.get(), ref)

    # Use a program/kernel with different constant arrays
    arr1_2 = get_test_array(global_size, numpy.int32)
    arr2_2 = get_test_array(global_size * 2, numpy.float32)
    ref_2 = ((arr2_2[:global_size] + arr2_2[global_size:]) * arr1_2).astype(
        numpy.float32)

    program2 = thr.compile(src, constant_arrays=constant_arrays)
    test2 = program2.test
    program2.set_constant('arr1', arr1_2)
    program2.set_constant('arr2', arr2_2)
    test2(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref_2)

    # Check that the first kernel preserved the data in its constant arrays
    test(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref)
Example #12
def test_constant_memory_static_kernel(thr):
    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)
    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(
        numpy.float32)

    src = """
        KERNEL void test(GLOBAL_MEM float *dest)
        {
            const SIZE_T i = get_global_id(0);
            dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
        }
        """
    constant_arrays = dict(
        arr1=arr1,  # use the parameters of an existing array; the array itself is not copied!
        arr2=(global_size * 2, numpy.float32))

    # Compiled-in constant arrays only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile_static(src,
                                         "test",
                                         global_size=global_size,
                                         constant_arrays=constant_arrays)
        return

    test = thr.compile_static(src,
                              "test",
                              global_size=global_size,
                              constant_arrays=constant_arrays)

    # TODO: test.constant.arr1.set(arr1)
    test.set_constant('arr1', arr1)
    test.set_constant(
        'arr2',
        thr.to_device(arr2))  # test that a device array is also accepted

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev)

    assert diff_is_negligible(dest_dev.get(), ref)
Example #13
def test_ntt_lsh_method_performance(thread, single_kernel_bootstrap,
                                    heavy_performance_load, ntt_lsh_method):

    if thread.api.get_id() != cuda_id() and ntt_lsh_method == 'cuda_asm':
        pytest.skip()

    size = 4096 if heavy_performance_load else 64

    rng = numpy.random.RandomState()
    secret_key, cloud_key = make_key_pair(thread, rng, transform_type='NTT')
    perf_params = performance_parameters(
        nufhe_params=secret_key.params,
        single_kernel_bootstrap=single_kernel_bootstrap,
        ntt_lsh_method=ntt_lsh_method)

    results = check_performance(thread, (secret_key, cloud_key),
                                perf_params,
                                shape=size)
    print()
    print(check_performance_str(results))
Example #14
def test_fft_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):

    if not transform_supported(thread.device_params, 'FFT'):
        pytest.skip()

    if transforms_per_block > max_supported_transforms_per_block(thread.device_params, 'FFT'):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (512,), numpy.complex128)

    kernel_repetitions = 100 if heavy_performance_load else 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    res_ref = tr_fft.fft_transform_ref(a)

    transform = fft512(use_constant_memory=constant_memory)

    fft_comp = Transform(
        transform, batch_shape, transforms_per_block=transforms_per_block,
        ).compile(thread)
    fft_comp_repeated = Transform(
        transform, batch_shape, transforms_per_block=transforms_per_block,
        kernel_repetitions=kernel_repetitions).compile(thread)

    # Quick check of correctness
    fft_comp(res_dev, a_dev)
    res_test = res_dev.get()
    assert numpy.allclose(res_test, res_ref)

    # Test performance
    times, times_str = get_times(thread, fft_comp_repeated, res_dev, a_dev)
    print("\n{backend}, {trnum} per block, test --- {times}".format(
        times=times_str,
        backend='cuda' if is_cuda else 'ocl ',
        trnum=transforms_per_block))
Example #15
    def __init__(self, tr_tree, translator, thread, fast_math, compiler_options, keep):
        """__init__()""" # hide the signature from Sphinx

        self._thread = thread
        self._is_cuda = (thread.api.get_id() == cuda_id())
        self._tr_tree = tr_tree
        self._translator = translator
        self._fast_math = fast_math
        self._compiler_options = compiler_options
        self._keep = keep

        self._nested_comp_idgen = IdGen('_nested')
        self._persistent_value_idgen = IdGen('_value')
        self._constant_value_idgen = IdGen('_constant')
        self._temp_array_idgen = IdGen('_temp')

        self._external_annotations = self._tr_tree.get_root_annotations()
        self._persistent_values = {}
        self._constant_arrays = {}
        self._temp_arrays = set()
        self._internal_annotations = {}
        self._kernels = []
Example #16
def performance_parameters_for_device(perf_params, device_params):

    is_cuda = device_params.api_id == cuda_id()

    ntt_base_method = perf_params.ntt_base_method
    ntt_mul_method = perf_params.ntt_mul_method
    ntt_lsh_method = perf_params.ntt_lsh_method

    if ntt_base_method is None:
        ntt_base_method = 'cuda_asm' if is_cuda else 'c'
    if ntt_mul_method is None:
        ntt_mul_method = 'cuda_asm' if is_cuda else 'c'
    if ntt_lsh_method is None:
        ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'

    pdict = perf_params._asdict()
    pdict.update(
        ntt_base_method=ntt_base_method,
        ntt_mul_method=ntt_mul_method,
        ntt_lsh_method=ntt_lsh_method,
        )

    return PerformanceParameters(**pdict)
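
A small usage sketch for performance_parameters_for_device under the assumptions of the surrounding examples (a PerformanceParameters object built from a nufhe parameter set, and thread.device_params exposing the api_id field used above): any NTT method left as None resolves to 'cuda_asm' on the CUDA backend and to plain 'c' elsewhere.

perf_params = PerformanceParameters(secret_key.params)   # ntt_*_method fields left as None
resolved = performance_parameters_for_device(perf_params, thread.device_params)
assert resolved.ntt_base_method in ('cuda_asm', 'c')
assert resolved.ntt_mul_method == resolved.ntt_lsh_method == resolved.ntt_base_method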
Example #17
    def __init__(self, tr_tree, translator, thread, fast_math,
                 compiler_options, keep):
        """__init__()"""  # hide the signature from Sphinx

        self._thread = thread
        self._is_cuda = (thread.api.get_id() == cuda_id())
        self._tr_tree = tr_tree
        self._translator = translator
        self._fast_math = fast_math
        self._compiler_options = compiler_options
        self._keep = keep

        self._nested_comp_idgen = IdGen('_nested')
        self._persistent_value_idgen = IdGen('_value')
        self._constant_value_idgen = IdGen('_constant')
        self._temp_array_idgen = IdGen('_temp')

        self._external_annotations = self._tr_tree.get_root_annotations()
        self._persistent_values = {}
        self._constant_arrays = {}
        self._temp_arrays = set()
        self._internal_annotations = {}
        self._kernels = []
Example #18
def test_constant_memory_static_kernel(thr):
    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)
    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(numpy.float32)

    src = """
        KERNEL void test(GLOBAL_MEM float *dest)
        {
            const SIZE_T i = get_global_id(0);
            dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
        }
        """
    constant_arrays = dict(
        arr1=arr1, # use the parameters of an existing array; the array itself is not copied!
        arr2=(global_size * 2, numpy.float32)
        )

    # Compiled-in constant arrays only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile_static(
                src, "test", global_size=global_size, constant_arrays=constant_arrays)
        return

    test = thr.compile_static(src, "test", global_size=global_size, constant_arrays=constant_arrays)

    # TODO: test.constant.arr1.set(arr1)
    test.set_constant('arr1', arr1)
    test.set_constant('arr2', thr.to_device(arr2)) # test that a device array is also accepted

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev)

    assert diff_is_negligible(dest_dev.get(), ref)
Example #19
    def __init__(self, api, pnum, dnum):
        self.api_id = api.get_id()
        self.pnum = pnum
        self.dnum = dnum

        self._api = api

        platform = api.get_platforms()[pnum]
        self._device = platform.get_devices()[dnum]

        platform_name = platform.name
        device_name = self._device.name

        self.device_params = api.DeviceParameters(self._device)

        self.device_id = "{api},{pnum},{dnum}".format(api=api.get_id(), pnum=pnum, dnum=dnum)
        self.device_full_name = platform_name + ", " + device_name

        self.id = self.device_id

        # if we import it at the top of the module, it interferes with the coverage results
        import reikna.cluda as cluda

        self.cuda = (api.get_id() == cluda.cuda_id())
Example #20
def test_ntt_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):

    if not transform_supported(thread.device_params, 'NTT'):
        pytest.skip()

    if transforms_per_block > max_supported_transforms_per_block(thread.device_params, 'NTT'):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    methods = list(itertools.product(
        ['cuda_asm', 'c'], # base method
        ['cuda_asm', 'c_from_asm', 'c'], # mul method
        ['cuda_asm', 'c_from_asm', 'c'] # lsh method
        ))

    if not is_cuda:
        # filter out all usage of CUDA asm if we're on OpenCL
        methods = [ms for ms in methods if 'cuda_asm' not in ms]

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (1024,), "ff_number")

    kernel_repetitions = 100 if heavy_performance_load else 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    # TODO: compute a reference NTT when it's fast enough on CPU
    #res_ref = tr_ntt.ntt_transform_ref(a)

    print()
    min_times = []
    for base_method, mul_method, lsh_method in methods:

        transform = ntt1024(
            base_method=base_method, mul_method=mul_method, lsh_method=lsh_method,
            use_constant_memory=constant_memory)

        ntt_comp = Transform(
            transform, batch_shape, transforms_per_block=transforms_per_block,
            ).compile(thread)
        ntt_comp_repeated = Transform(
            transform, batch_shape, transforms_per_block=transforms_per_block,
            kernel_repetitions=kernel_repetitions).compile(thread)

        # TODO: compute a reference NTT when it's fast enough on CPU
        # Quick check of correctness
        #ntt_comp(res_dev, a_dev)
        #res_test = res_dev.get()
        #assert (res_test == res_ref).all()

        # Test performance
        times, times_str = get_times(thread, ntt_comp_repeated, res_dev, a_dev)
        print("  base: {bm}, mul: {mm}, lsh: {lm}".format(
            bm=base_method, mm=mul_method, lm=lsh_method))
        print("  {backend}, {trnum} per block, test --- {times}".format(
            times=times_str,
            backend='cuda' if is_cuda else 'ocl ',
            trnum=transforms_per_block))

        min_times.append((times.min(), base_method, mul_method, lsh_method))

    best = min(min_times, key=lambda t: t[0])
    time_best, base_method, mul_method, lsh_method = best
    print("Best time: {tb:.4f} for [base: {bm}, mul: {mm}, lsh: {lm}]".format(
        tb=time_best, bm=base_method, mm=mul_method, lm=lsh_method
        ))
Example #21
    def __init__(self, perf_params: PerformanceParameters, device_params):

        low_end_device = perf_params.low_end_device
        if low_end_device is None:
            # TODO: this threshold is arbitrary; it needs testing on devices near the cutoff.
            low_end_device = device_params.compute_units < 20

        is_cuda = device_params.api_id == cuda_id()

        transform_type = perf_params.nufhe_params.tgsw_params.tlwe_params.transform_type
        mask_size = perf_params.nufhe_params.tgsw_params.tlwe_params.mask_size
        decomp_length = perf_params.nufhe_params.tgsw_params.decomp_length

        use_constant_memory_multi_iter = perf_params.use_constant_memory_multi_iter
        if use_constant_memory_multi_iter is None:
            use_constant_memory_multi_iter = not low_end_device

        use_constant_memory_single_iter = perf_params.use_constant_memory_single_iter
        if use_constant_memory_single_iter is None:
            use_constant_memory_single_iter = False

        # Import locally to avoid a circular import
        from .polynomial_transform import max_supported_transforms_per_block
        max_supported_tpb = max_supported_transforms_per_block(device_params, transform_type)

        transforms_per_block = perf_params.transforms_per_block
        if transforms_per_block is None:
            if low_end_device:
                transforms_per_block = 1
            else:
                transforms_per_block = 4 if transform_type == 'NTT' else 2
            transforms_per_block = min(transforms_per_block, max_supported_tpb)
        else:
            if transforms_per_block > max_supported_tpb:
                raise ValueError(
                    "The chosen device does not support more than " + str(max_supported_tpb) +
                    " transforms per block")

        # Import locally to avoid a circular import
        from .blind_rotate import single_kernel_bootstrap_supported

        single_kernel_bootstrap = perf_params.single_kernel_bootstrap
        skbs_supported = single_kernel_bootstrap_supported(perf_params.nufhe_params, device_params)

        if single_kernel_bootstrap is None:
            # If both encryption parameters and device capabilities allow it,
            # single kernel bootstrap is the optimal choice.
            single_kernel_bootstrap = not low_end_device and skbs_supported
        elif single_kernel_bootstrap and not skbs_supported:
            single_kernel_bootstrap_supported(
                perf_params.nufhe_params, device_params, raise_exception=True)

        ntt_base_method = perf_params.ntt_base_method
        ntt_mul_method = perf_params.ntt_mul_method
        ntt_lsh_method = perf_params.ntt_lsh_method

        if 'cuda_asm' in (ntt_base_method, ntt_mul_method, ntt_lsh_method) and not is_cuda:
            raise ValueError("'cuda_asm' is only supported for the CUDA backend")

        if low_end_device:
            skbs = single_kernel_bootstrap
            if ntt_base_method is None:
                ntt_base_method = ('c' if skbs else 'cuda_asm') if is_cuda else 'c'
            if ntt_mul_method is None:
                ntt_mul_method = ('c_from_asm' if skbs else 'cuda_asm') if is_cuda else 'c'
            if ntt_lsh_method is None:
                ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'
        else:
            if ntt_base_method is None:
                ntt_base_method = 'cuda_asm' if is_cuda else 'c'
            if ntt_mul_method is None:
                ntt_mul_method = 'cuda_asm' if is_cuda else 'c'
            if ntt_lsh_method is None:
                ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'

        self.ntt_base_method = ntt_base_method
        self.ntt_mul_method = ntt_mul_method
        self.ntt_lsh_method = ntt_lsh_method
        self.use_constant_memory_multi_iter = use_constant_memory_multi_iter
        self.use_constant_memory_single_iter = use_constant_memory_single_iter
        self.transforms_per_block = transforms_per_block
        self.single_kernel_bootstrap = single_kernel_bootstrap
Example #22
def get_id():
    return cluda.cuda_id()
Example #23
def test_sub(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(thread, ntt.sub(method=method), ref_sub, 'ff_number',
               ['ff_number', 'ff_number'])
Example #24
def test_mul_prepared(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(thread, ntt.mul_prepared(method=method), ref_mul_prepared,
               'ff_number', ['ff_number', 'ff_number'])
Example #25
File: cuda.py  Project: simonlegrand/reikna
def get_id():
    return cluda.cuda_id()
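
Every example above relies on the same backend check: compare the id of the thread's API with cluda.cuda_id() and avoid CUDA-only code paths (inline PTX assembly, compiled-in constant memory) on OpenCL. A minimal standalone sketch of that pattern, assuming a reikna Thread created through any_api(); the pick_ntt_method helper is illustrative only:

import reikna.cluda as cluda

def pick_ntt_method(thread):
    # 'cuda_asm' kernels exist only on the CUDA backend; fall back to plain C otherwise.
    return 'cuda_asm' if thread.api.get_id() == cluda.cuda_id() else 'c'

thr = cluda.any_api().Thread.create()
print(pick_ntt_method(thr))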