예제 #1
0
    def setUp(self):
        """Switch the reduction accelerators to CUB only for the test.

        On a HIP runtime the test is skipped when no ``hipcc`` path is
        reported. The accelerator list that was active beforehand is kept in
        ``self.old_accelerators``; restoring it is not done by this method.
        """
        hipcc_missing = (
            cupy.cuda.runtime.is_hip
            and _environment.get_hipcc_path() is None)
        if hipcc_missing:
            self.skipTest('hipcc is not found')

        # Short alias for the helper that the tests call repeatedly.
        self.can_use = cupy._core._cub_reduction._can_use_cub_block_reduction

        # Record the current configuration before replacing it.
        previous = _accelerator.get_reduction_accelerators()
        self.old_accelerators = previous
        _accelerator.set_reduction_accelerators(['cub'])
예제 #2
0
 def setUp(self):
     """Record the active accelerators and enable CUB for ``self.backend``.

     ``'device'`` enables CUB as a routine accelerator only, ``'block'``
     enables it as a reduction accelerator only. Any other backend value
     leaves the current configuration untouched.
     """
     self.old_routine_accelerators = _acc.get_routine_accelerators()
     self.old_reduction_accelerators = _acc.get_reduction_accelerators()

     backend = self.backend
     if backend == 'device':
         routines, reductions = ['cub'], []
     elif backend == 'block':
         routines, reductions = [], ['cub']
     else:
         # Unknown backend: nothing to configure.
         return

     _acc.set_routine_accelerators(routines)
     _acc.set_reduction_accelerators(reductions)
예제 #3
0
    def setUp(self):
        """Prepare accelerator settings and a small float32 input array.

        The optimize-config context cache is cleared first. The previous
        reduction and routine accelerator lists are stored on the instance
        as ``old_reductions`` and ``old_routines``.
        """
        cupy._core._optimize_config._clear_all_contexts_cache()

        acc = _accelerator

        # Reduction accelerators: remember the current ones, then apply the
        # backend under test.
        self.old_reductions = acc.get_reduction_accelerators()
        acc.set_reduction_accelerators(self.backend)

        # Routine accelerators are emptied so the reduction path is not
        # shadowed by the cub module.
        self.old_routines = acc.get_routine_accelerators()
        acc.set_routine_accelerators([])

        shape = (3, 4)
        self.x = testing.shaped_arange(shape, cupy, dtype=cupy.float32)
예제 #4
0
 def setUp(self):
     """Unpack the parameters and enable CUB for ``self.backend``.

     ``self.order_and_axis`` is split into ``self.order`` and ``self.axis``.
     The accelerator lists active on entry are stored on the instance.

     Raises:
         unittest.SkipTest: for the ``'device'`` backend when an axis is
             given.
     """
     order, axis = self.order_and_axis
     self.order = order
     self.axis = axis

     self.old_routine_accelerators = _acc.get_routine_accelerators()
     self.old_reduction_accelerators = _acc.get_reduction_accelerators()

     backend = self.backend
     if backend == 'device':
         if axis is not None:
             raise unittest.SkipTest('does not support')
         routines, reductions = ['cub'], []
     elif backend == 'block':
         routines, reductions = [], ['cub']
     else:
         # Unknown backend: keep whatever is configured.
         return

     _acc.set_routine_accelerators(routines)
     _acc.set_reduction_accelerators(reductions)
예제 #5
0
 def setUp(self):
     """Generator-style setup: configure CUB, yield, then restore.

     ``self.order_and_axis`` is split into ``self.order`` and ``self.axis``.
     The ``'device'`` backend is skipped when an axis is given. After the
     ``yield`` the accelerator lists captured on entry are put back.
     """
     order, axis = self.order_and_axis
     self.order = order
     self.axis = axis

     saved_routines = _acc.get_routine_accelerators()
     saved_reductions = _acc.get_reduction_accelerators()

     # (routine accelerators, reduction accelerators) for the backend, or
     # None when the backend is not one of the two known values.
     selection = None
     backend = self.backend
     if backend == 'device':
         if axis is not None:
             pytest.skip('does not support')
         selection = (['cub'], [])
     elif backend == 'block':
         selection = ([], ['cub'])

     if selection is not None:
         routines, reductions = selection
         _acc.set_routine_accelerators(routines)
         _acc.set_reduction_accelerators(reductions)

     yield

     _acc.set_routine_accelerators(saved_routines)
     _acc.set_reduction_accelerators(saved_reductions)
예제 #6
0
    def tearDown(self):
        """Check the cache directory contents, then leave both contexts.

        When ``self.in_memory`` is set and CUB is not among the reduction
        accelerators, the cache directory may contain at most the single
        file ``test_load_cubin.cu``.

        Both ``in_memory_context`` and ``temporary_cache_dir_context`` are
        always exited, even when the check above fails or the first exit
        raises, so a failing test does not leak either context.

        Raises:
            AssertionError: if unexpected files are found in the cache
                directory.
        """
        try:
            if (self.in_memory and _accelerator.ACCELERATOR_CUB
                    not in _accelerator.get_reduction_accelerators()):
                # should not write any file to the cache dir, but the CUB
                # reduction kernel uses nvcc, with which I/O cannot be
                # avoided
                files = os.listdir(self.cache_dir)
                count = 1 if 'test_load_cubin.cu' in files else 0
                assert len(files) == count
        finally:
            # Exit in the original order; the nested ``finally`` guarantees
            # the second context is left even if the first exit raises.
            try:
                self.in_memory_context.__exit__(*sys.exc_info())
            finally:
                self.temporary_cache_dir_context.__exit__(*sys.exc_info())
예제 #7
0
    def test_optimize_cache(self):
        """Check when the (mocked) optimizer is invoked for a reduction.

        The private ``_optimize`` function is replaced by a mock, and its
        call count is asserted after each kernel launch. The test is skipped
        when CUB is among the reduction accelerators.
        """
        active = _accelerator.get_reduction_accelerators()
        if _accelerator.ACCELERATOR_CUB in active:
            pytest.skip('optimize cannot be mocked for CUB reduction')

        target = cupyx.optimizing._optimize._optimize
        target_full_name = '.'.join((target.__module__, target.__name__))

        with mock.patch(target_full_name) as optimize_impl:

            def launch(kernel, array, expected_calls, **kwargs):
                # Run the kernel once and verify the mock's call count.
                kernel(array, **kwargs)
                assert optimize_impl.call_count == expected_calls

            kernel_args = ('T x', 'T out', 'x', 'a + b', 'out = a', '0')
            my_sum = cupy.ReductionKernel(*kernel_args, 'my_sum')
            my_sum_ = cupy.ReductionKernel(*kernel_args, 'my_sum_')

            x = testing.shaped_arange((3, 4), cupy)
            x_ = testing.shaped_arange((3, 4), cupy)
            y = testing.shaped_arange((4, 4), cupy)
            z = testing.shaped_arange((3, 4), cupy)[::-1]
            assert x.strides == y.strides
            assert x.shape == z.shape

            with cupyx.optimizing.optimize():
                # First launch: one optimization expected.
                launch(my_sum, x, 1, axis=1)
                # Identical launch: count stays the same.
                launch(my_sum, x, 1, axis=1)
                # Different axis: one more.
                launch(my_sum, x, 2, axis=0)
                # Separate array built the same way as x: no new call.
                launch(my_sum, x_, 2, axis=1)
                # Different shape (strides equal to x, asserted above).
                launch(my_sum, y, 3, axis=1)
                # Reversed view with the same shape as x.
                launch(my_sum, z, 4, axis=1)
                # A second kernel object with a different name.
                launch(my_sum_, x, 5, axis=1)

            with cupyx.optimizing.optimize(key='new_key'):
                launch(my_sum, x, 6, axis=1)

            with cupyx.optimizing.optimize(key=None):
                launch(my_sum, x, 6, axis=1)
                launch(my_sum, x, 7)
예제 #8
0
    def test_optimize_cache_multi_gpus(self):
        """Check that the mocked optimizer is called once per device.

        The same reduction is launched on devices 0 and 1 inside a single
        ``optimize()`` context, and the mock's call count is expected to
        grow by one on each device. The test is skipped when CUB is among
        the reduction accelerators.
        """
        active = _accelerator.get_reduction_accelerators()
        if _accelerator.ACCELERATOR_CUB in active:
            pytest.skip('optimize cannot be mocked for CUB reduction')

        target = cupyx.optimizing._optimize._optimize
        target_full_name = '.'.join((target.__module__, target.__name__))

        with mock.patch(target_full_name) as optimize_impl:
            my_sum = cupy.ReductionKernel('T x', 'T out', 'x', 'a + b',
                                          'out = a', '0', 'my_sum')

            with cupyx.optimizing.optimize():
                for expected_calls, device_id in enumerate((0, 1), start=1):
                    with cupy.cuda.Device(device_id):
                        x = testing.shaped_arange((3, 4), cupy)
                        my_sum(x, axis=1)
                        assert optimize_impl.call_count == expected_calls
예제 #9
0
 def setUp(self):
     """Turn off all routine and reduction accelerators for the test.

     The lists active on entry are stored in ``self.old_accelerators``
     (routine) and ``self.old_reduction_accelerators`` (reduction).
     """
     acc = _acc
     no_accelerators = []

     self.old_accelerators = acc.get_routine_accelerators()
     acc.set_routine_accelerators(no_accelerators)

     # The general reduction kernel could otherwise still fall back to CUB.
     self.old_reduction_accelerators = acc.get_reduction_accelerators()
     acc.set_reduction_accelerators([])