Example #1
File: ce.py Project: renlliang3/cuvarbase
    def allocate_buffered_data_arrays(self, **kwargs):
        n0 = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            n0 = kwargs.get('n0_buffer', self.n0_buffer)
        assert (n0 is not None)

        kw = dict(dtype=self.real_type, alignment=resource.getpagesize())

        self.t = cuda.aligned_zeros(shape=(n0, ), **kw)
        self.t = cuda.register_host_memory(self.t)

        self.y = cuda.aligned_zeros(shape=(n0, ),
                                    dtype=self.ytype,
                                    alignment=resource.getpagesize())

        self.y = cuda.register_host_memory(self.y)
        if self.weighted:
            self.dy = cuda.aligned_zeros(shape=(n0, ), **kw)
            self.dy = cuda.register_host_memory(self.dy)

        if self.balanced_magbins:
            self.mag_bwf = cuda.aligned_zeros(shape=(self.mag_bins, ), **kw)
            self.mag_bwf = cuda.register_host_memory(self.mag_bwf)

        if self.compute_log_prob:
            self.mag_bin_fracs = cuda.aligned_zeros(shape=(self.mag_bins, ),
                                                    **kw)
            self.mag_bin_fracs = cuda.register_host_memory(self.mag_bin_fracs)
        return self
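This aligned_zeros-then-register pattern recurs throughout the examples on this page: allocate page-aligned host memory, then pin it so it can feed asynchronous transfers. A minimal self-contained sketch of the same lifecycle (the buffer size is illustrative, and a context from pycuda.autoinit is assumed):

import resource

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a CUDA context)
import pycuda.driver as cuda

# Allocate page-aligned host memory, then register (pin) it so the driver
# can use it as the source or target of asynchronous copies.
buf = cuda.aligned_zeros(shape=(1024,), dtype=np.float32,
                         alignment=resource.getpagesize())
buf = cuda.register_host_memory(buf)

# ... use buf with memcpy_htod_async / memcpy_dtoh_async ...

# register_host_memory returns a view; its .base holds the registration
# and can unpin the pages once the buffer is no longer needed.
buf.base.unregister()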
Example #2
    def allocate_buffered_data_arrays(self, **kwargs):
        """
        Allocates pinned memory for lightcurves if we're reusing
        this container
        """
        n0 = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            n0 = kwargs.get('n0_buffer', self.n0_buffer)
        assert (n0 is not None)

        self.t = cuda.aligned_zeros(shape=(n0, ),
                                    dtype=self.real_type,
                                    alignment=resource.getpagesize())
        self.t = cuda.register_host_memory(self.t)

        self.yw = cuda.aligned_zeros(shape=(n0, ),
                                     dtype=self.real_type,
                                     alignment=resource.getpagesize())
        self.yw = cuda.register_host_memory(self.yw)

        self.w = cuda.aligned_zeros(shape=(n0, ),
                                    dtype=self.real_type,
                                    alignment=resource.getpagesize())
        self.w = cuda.register_host_memory(self.w)
        return self
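As the docstring says, the point of pinning here is reuse: the container allocates page-locked arrays once and refills them for each new lightcurve, so every transfer can run asynchronously without re-pinning. A rough sketch of that reuse pattern (names and sizes are illustrative, not taken from cuvarbase):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

stream = cuda.Stream()
n0 = 4096  # buffer size chosen once, large enough for any expected input

# Pin once...
t_host = cuda.pagelocked_empty((n0,), dtype=np.float32)
t_dev = cuda.mem_alloc(t_host.nbytes)

# ...then reuse across many datasets without re-registering.
for _ in range(10):
    t_host[:] = np.random.rand(n0).astype(np.float32)
    cuda.memcpy_htod_async(t_dev, t_host, stream)
    # ... launch kernels on `stream` here ...
stream.synchronize()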
Example #3
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                    "NetworkOutput",
                    ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
                segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
                input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()

                input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
                segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
                input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

                # Run inference
                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] + [int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                # Only retrieve and post-process the first batch
                batch = h_output[0]
                networkOutputs.append(_NetworkOutput(
                    start_logits = np.array(batch.squeeze()[:, 0]),
                    end_logits = np.array(batch.squeeze()[:, 1]),
                    feature_index = feature_index
                    ))

            eval_time_elapsed /= len(features)

            # Total number of n-best predictions to generate in the nbest_predictions.json output file
            n_best_size = 20

            # The maximum length of an answer that can be generated. This is needed
            # because the start and end predictions are not conditioned on one another
            max_answer_length = 30

            prediction, nbest_json, scores_diff_json = dp.get_predictions(tokens, features,
                    networkOutputs, args.n_best_size, args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
Example #4
    def test_register_host_memory(self):
        if drv.get_version() < (4,):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20,), np.float64, alignment=4096)
        drv.register_host_memory(a)
Example #5
    def test_register_host_memory(self):
        if drv.get_version() < (4, ):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20, ), np.float64, alignment=4096)
        drv.register_host_memory(a)
Example #6
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                "NetworkOutput",
                ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                input_ids = cuda.register_host_memory(
                    np.ascontiguousarray(feature.input_ids.ravel()))
                segment_ids = cuda.register_host_memory(
                    np.ascontiguousarray(feature.segment_ids.ravel()))
                input_mask = cuda.register_host_memory(
                    np.ascontiguousarray(feature.input_mask.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

                # Run inference
                context.execute_async_v2(
                    bindings=[int(d_inp)
                              for d_inp in d_inputs] + [int(d_output)],
                    stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                for index, batch in enumerate(h_output):
                    # Data Post-processing
                    networkOutputs.append(
                        _NetworkOutput(
                            start_logits=np.array(batch.squeeze()[:, 0]),
                            end_logits=np.array(batch.squeeze()[:, 1]),
                            feature_index=feature_index))

            eval_time_elapsed /= len(features)

            prediction, nbest_json, scores_diff_json = dp.get_predictions(
                tokens, features, networkOutputs, args.n_best_size,
                args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
Example #7
    def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                         moment: np.ndarray, w_out: np.ndarray,
                         x_out: np.ndarray, y_out: np.ndarray):
        '''
        Set the input moment for all the streams of a specific GPU
        '''

        ctx.push()
        # number of inputs for this GPU
        max_size = moment.shape[1]
        # loop through the streams to set their input
        for i in range(0, self.num_stream, 1):
            # Size of input allocated for each stream
            size_per_batch = int(np.ceil(max_size / self.num_stream))
            # location on the original input array where the input to this stream starts
            loc = np.int32((i) * size_per_batch)
            if loc + size_per_batch > max_size:
                size_per_batch = max_size - loc

            self.moment_chunk_host[dev_id].append(
                np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                     dtype=np.float32))
            self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.moment_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)
            self.w_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(w_out[:, loc:loc + size_per_batch])))
            self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.w_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.x_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(x_out[:, loc:loc + size_per_batch])))
            self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.x_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.y_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(y_out[:, loc:loc + size_per_batch])))
            self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.y_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

        ctx.synchronize()
        ctx.pop()
Example #8
File: pdm.py Project: renlliang3/cuvarbase
    def allocate(self, data):
        if len(data) > len(self.streams):
            self._create_streams(len(data) - len(self.streams))

        gpu_data, pow_cpus = [], []

        for t, y, w, freqs in data:

            pow_cpu = cuda.aligned_zeros(shape=(len(freqs), ),
                                         dtype=np.float32,
                                         alignment=resource.getpagesize())

            pow_cpu = cuda.register_host_memory(pow_cpu)

            t_g, y_g, w_g = None, None, None
            if len(t) > 0:
                t_g, y_g, w_g = tuple([
                    gpuarray.zeros(len(t), dtype=np.float32) for i in range(3)
                ])

            pow_g = gpuarray.zeros(len(pow_cpu), dtype=pow_cpu.dtype)
            freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32))

            gpu_data.append((t_g, y_g, w_g, freqs_g, pow_g))
            pow_cpus.append(pow_cpu)
        return gpu_data, pow_cpus
Example #9
    def copy_htod(self, np_buffer, stream=None):
        if stream:
            # PyCUDA requires the host buffer to be page-locked for
            # asynchronous memcpys.
            pagelocked = cuda.register_host_memory(
                np.ascontiguousarray(np_buffer.ravel()))
            cuda.memcpy_htod_async(self.ptr, pagelocked, stream)
        else:
            cuda.memcpy_htod(self.ptr, np.ascontiguousarray(np_buffer.ravel()))
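The comment in this method states the key constraint: memcpy_htod_async only behaves asynchronously when the host buffer is page-locked, which is why the pageable branch falls back to a plain synchronous copy. A standalone sketch of the two paths (buffer names are illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

stream = cuda.Stream()
host = np.arange(1024, dtype=np.float32)
dev = cuda.mem_alloc(host.nbytes)

# Pinned path: register first, then the copy can overlap work on the stream.
pinned = cuda.register_host_memory(np.ascontiguousarray(host.ravel()))
cuda.memcpy_htod_async(dev, pinned, stream)
stream.synchronize()
pinned.base.unregister()

# Pageable path: a plain synchronous copy needs no registration.
cuda.memcpy_htod(dev, host)

Note that registering on every call has a cost of its own; for hot paths it is usually better to pin a staging buffer once and reuse it.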
Example #10
    def allocate_pinned_cpu(self, **kwargs):
        """ Allocates pinned CPU memory for asynchronous transfer of result """
        nf = kwargs.get('nf', self.nf)
        assert(nf is not None)

        self.lsp_c = cuda.aligned_zeros(shape=(nf,), dtype=self.real_type,
                                        alignment=resource.getpagesize())
        self.lsp_c = cuda.register_host_memory(self.lsp_c)

        return self
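An equivalent and arguably simpler route to a pinned result buffer is pagelocked_zeros (or pagelocked_empty), which allocates page-locked memory directly instead of aligning and registering in two steps. A minimal sketch with an illustrative size:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

nf = 1 << 16  # illustrative frequency count
lsp_c = cuda.pagelocked_zeros((nf,), dtype=np.float32)  # pinned from the start
# ... later: cuda.memcpy_dtoh_async(lsp_c, d_result, stream) once results are
# ready, where d_result is a device allocation of the same size (assumed name) ...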
Example #11
File: ce.py Project: renlliang3/cuvarbase
    def allocate_pinned_cpu(self, **kwargs):
        nf = kwargs.get('nf', self.nf)
        assert (nf is not None)

        self.ce_c = cuda.aligned_zeros(shape=(nf, ),
                                       dtype=self.real_type,
                                       alignment=resource.getpagesize())
        self.ce_c = cuda.register_host_memory(self.ce_c)

        return self
Example #12
def batch_memcpy_cmp(size: int, batch: int):
    event_start_1 = cuda.Event()
    event_stop_1 = cuda.Event()
    event_start_2 = cuda.Event()
    event_stop_2 = cuda.Event()

    array = np.random.rand(size, 9)
    array = array.astype(np.float32)  # astype returns a new array; assign it back

    mem = cuda.aligned_zeros_like(array)
    mem = cuda.register_host_memory(mem,
                                    cuda.mem_host_register_flags.DEVICEMAP)

    mem_d = cuda.mem_alloc_like(mem)

    event_start_1.record()
    cuda.memcpy_htod(mem_d, mem)
    event_stop_1.record()
    event_stop_1.synchronize()

    mem2 = []
    this_mem = []
    size_per_batch = int(size / batch)
    for i in range(batch):
        mem2.append(
            cuda.mem_alloc_like(array[i * size_per_batch:(i + 1) *
                                      size_per_batch]))
        this_mem.append(array[i * size_per_batch:(i + 1) * size_per_batch])
        this_mem[i] = cuda.register_host_memory(
            this_mem[i], cuda.mem_host_register_flags.DEVICEMAP)

    event_start_2.record()
    for i in range(batch):
        cuda.memcpy_htod(mem2[i], this_mem[i])
    event_stop_2.record()
    event_stop_2.synchronize()

    t1 = event_stop_1.time_since(event_start_1)
    t2 = event_stop_2.time_since(event_start_2)
    print("batch_memcpy_cmp size", size, " batch ", batch)
    print(t1)
    print(t2)
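Event pairs, as used above, are the standard way to time device-side work in PyCUDA. The bare pattern, as a minimal sketch:

start, stop = cuda.Event(), cuda.Event()
start.record()
# ... queue the copies or kernels to be timed ...
stop.record()
stop.synchronize()                    # wait for everything queued before `stop`
elapsed_ms = stop.time_since(start)   # elapsed time in milliseconds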
Example #13
    def allocate_pinned_arrays(self, nfreqs=None, ndata=None):
        if nfreqs is None:
            nfreqs = int(self.max_nfreqs)
        if ndata is None:
            ndata = int(self.max_ndata)

        self.bls = cuda.aligned_zeros(shape=(nfreqs,),
                                      dtype=self.rtype,
                                      alignment=resource.getpagesize())
        self.bls = cuda.register_host_memory(self.bls)

        self.nbins0 = cuda.aligned_zeros(shape=(nfreqs,),
                                         dtype=np.int32,
                                         alignment=resource.getpagesize())
        self.nbins0 = cuda.register_host_memory(self.nbins0)

        self.nbinsf = cuda.aligned_zeros(shape=(nfreqs,),
                                         dtype=np.int32,
                                         alignment=resource.getpagesize())
        self.nbinsf = cuda.register_host_memory(self.nbinsf)

        self.t = cuda.aligned_zeros(shape=(ndata,),
                                    dtype=self.rtype,
                                    alignment=resource.getpagesize())
        self.t = cuda.register_host_memory(self.t)

        self.yw = cuda.aligned_zeros(shape=(ndata,),
                                     dtype=self.rtype,
                                     alignment=resource.getpagesize())
        self.yw = cuda.register_host_memory(self.yw)

        self.w = cuda.aligned_zeros(shape=(ndata,),
                                    dtype=self.rtype,
                                    alignment=resource.getpagesize())
        self.w = cuda.register_host_memory(self.w)
Example #14
    def test_register_host_memory(self):
        if drv.get_version() < (4, ):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20, ), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Example #15
    def test_register_host_memory(self):
        if drv.get_version() < (4,):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20,), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Example #16
def trt_inference(stream, trt_ctx, d_input, d_output, input_signal,
                  input_signal_length):
    print("infer with shape: {}".format(input_signal.shape))
    trt_ctx.set_binding_shape(0, input_signal.shape)
    assert trt_ctx.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)),
                                     dtype=np.float32)

    h_input_signal = cuda.register_host_memory(
        np.ascontiguousarray(input_signal.cpu().numpy().ravel()))
    cuda.memcpy_htod_async(d_input, h_input_signal, stream)
    trt_ctx.execute_async_v2(bindings=[int(d_input),
                                       int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False)
    return greedy_predictions
Example #17
def test_kernel(n, a, x_gpu, y_gpu):
    code = """
		#include <stdio.h>
		__global__ void saxpy(int n, float a, float *x, float *y)
		{
			int index = blockIdx.x * blockDim.x + threadIdx.x;
			int stride = blockDim.x * gridDim.x;

			for (int i = index; i < n; i += stride)
			{
				y[i] = a*x[i] + y[i];
			}
		}
		"""
    mod = SourceModule(code)
    saxpy = mod.get_function("saxpy")
    # PyCUDA kernel arguments need explicit numpy types for scalar parameters.
    saxpy(np.int32(n), np.float32(a), x_gpu, y_gpu, block=(1024, 1, 1), grid=(1, 1))
    out = cuda.register_host_memory(np.empty(n, dtype=np.float32))

    cuda.memcpy_dtoh_async(out, y_gpu)
    cuda.Context.synchronize()  # ensure the async copy has landed before returning
    return out
Example #18
# omega_e = omega_e.astype(np.float32); omega_eps = omega_eps.astype(np.float32)
# omega_q = omega_q.astype(np.float32); omega_nu = omega_nu.astype(np.float32)

# Allocate memory on GPU
fin_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
ftemp_g = cuda.mem_alloc(fin.nbytes)
feq_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
rho_g = cuda.mem_alloc(rho.size * rho.dtype.itemsize)
taus_g = cuda.mem_alloc(tauS.size * tauS.dtype.itemsize)
u_g = cuda.mem_alloc(u.nbytes)
c_g = cuda.mem_alloc(c.nbytes)
t_g = cuda.mem_alloc(t.nbytes)

#fpost_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
# Pinning memory for faster transfers between CPU and GPU
fin_pin = cuda.register_host_memory(fin)
# this didn't seem to make a difference, so fin is used for the transfers

cuda.memcpy_htod(fin_g, fin)
cuda.memcpy_htod(ftemp_g, fin)
cuda.memcpy_htod(feq_g, fin)
cuda.memcpy_htod(rho_g, rho)
cuda.memcpy_htod(taus_g, tauS)
cuda.memcpy_htod(u_g, u)
#cuda.memcpy_htod(fpost_g,fin)
# cuda.memcpy_htod(c_g,c)
# cuda.memcpy_htod(t_g,t)
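The observation in the comment above is expected behavior: register_host_memory pins the pages of the existing fin array, so fin and fin_pin alias the same pinned memory and a synchronous memcpy_htod gains little either way. The benefit shows up with asynchronous copies on a stream; a hedged sketch continuing from the buffers above:

stream = cuda.Stream()
# With the pages pinned, this copy is genuinely asynchronous and can
# overlap CPU work or kernels queued on the same stream.
cuda.memcpy_htod_async(fin_g, fin_pin, stream)
# ... queue kernels on `stream` or do CPU work here while the copy runs ...
stream.synchronize()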
 
 
 
 
Example #19
		const float x = g_x[i];
		g_y[i] = cos(x)*exp(sin(x)-sqrt(x*x));
	}
	"""
mod = SourceModule(code)
custom_kernel = mod.get_function("custom_kernel")
size = 5120000
block_size = 512
grid_size = size // block_size  # integer division: grid dimensions must be ints
block = (block_size, 1, 1)
grid = (grid_size, 1)

x = np.linspace(1, size, size).astype(np.float32)
print(x.shape)

x_pin = cuda.register_host_memory(x)

x_gpu = cuda.mem_alloc(x.nbytes)
cuda.memcpy_htod(x_gpu, x)

y_gpu = cuda.mem_alloc(x.nbytes)

custom_kernel(y_gpu, x_gpu, block=block, grid=grid)

ans = np.zeros_like(x_pin)
cuda.memcpy_dtoh(ans, y_gpu)
ans = np.sum(ans)

print(ans)
print(np.sum(np.cos(x) * np.exp(np.sin(x) - np.sqrt(x * x))))
Example #20
def compare_performance():
    # a quick warm up..
    n = 25000000
    a = np.random.randn(n).astype(np.float32)
    # allocate space on GPU
    mem_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(mem_gpu, a)
    # free space on GPU
    mem_gpu.free()
    h2d_nopin = []
    h2d_nopin_bw = []
    # measure timing without pinning
    for n in num_elems:
        # the data to be transferred
        a = np.random.randn(n).astype(np.float32)
        # allocate space on GPU
        mem_gpu = cuda.mem_alloc(a.nbytes)
        # only measure h2d transfer time
        start = time.perf_counter()
        cuda.memcpy_htod(mem_gpu, a)
        te = time.perf_counter() - start  #te: time elapsed
        h2d_nopin.append(te)
        h2d_nopin_bw.append(a.nbytes / (10**9 * te))  # convert to a bandwidth
        # free space on GPU
        mem_gpu.free()
    # now do pinning and measure time to pin and time to transfer
    h2h_pinned = []  # records the transfer time from unpinned -> pinned memory
    h2d_pin = []  # records the host to device transfer time with data in pinned memory
    h2d_pin_total = []  # records the total (sum of the previous two)
    h2d_pin_bw = []  # h2d_pin, converted to a bandwidth (GB/sec)
    for i, n in enumerate(num_elems):
        a = np.random.randn(n).astype(np.float32)
        # allocate space on GPU
        mem_gpu = cuda.mem_alloc(a.nbytes)
        # allocate page locked memory
        a_pin = cuda.register_host_memory(a)
        # copy data from np array to pinned memory and measure transfer time
        start = time.perf_counter()
        copy_np_to_pinned_memory(a, a_pin)
        te = time.perf_counter() - start  # te: time elapsed
        h2h_pinned.append(te)
        # measure h2d transfer time
        start = time.perf_counter()
        cuda.memcpy_htod(mem_gpu, a_pin)
        te = time.perf_counter() - start  #te: time elapsed
        h2d_pin.append(te)
        h2d_pin_bw.append(a.nbytes / (10**9 * te))
        h2d_pin_total.append(h2d_pin[i] + h2h_pinned[i])
        # free allocated pinned memory
        a_pin.base.unregister()
        # free space on GPU
        mem_gpu.free()

    fig = plt.figure()
    num_elems_mb = [x * 4 / 10**6 for x in num_elems]

    plt.plot(num_elems_mb,
             h2d_nopin,
             'g',
             label='h2d transfer_time (no pinning)')
    plt.plot(num_elems_mb,
             h2d_pin,
             'r',
             label='h2d transfer_time (with pinning)')
    plt.plot(num_elems_mb, h2h_pinned, 'b', label='h2h transfer_time')
    plt.plot(num_elems_mb,
             h2d_pin_total,
             'k',
             label='h2d transfer_time (with pinning, total)')
    plt.legend()
    plt.xlabel('data size (MB)')
    plt.ylabel('time (sec)')
    plt.show()
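Two names in this example, num_elems and copy_np_to_pinned_memory, are defined elsewhere in the original script. A plausible minimal reading of both, consistent with how they are used above (these are assumptions, not the project's actual definitions):

# Transfer sizes to benchmark, as float32 element counts (illustrative values).
num_elems = [2**p for p in range(20, 27)]

def copy_np_to_pinned_memory(src, pinned):
    # Element-wise copy from a pageable numpy array into a pinned buffer.
    np.copyto(pinned, src)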
Example #21
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                "NetworkOutput",
                ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                B = 1
                S = np.sum(feature.input_mask)
                input_ids = feature.input_ids[0:S]
                segment_ids = feature.segment_ids[0:S]
                cu_seq_lens = np.array([0, S], dtype=np.int32)

                if context.get_binding_shape(0)[0] != S:
                    context.set_binding_shape(0, (S, ))
                if context.get_binding_shape(1)[0] != S:
                    context.set_binding_shape(1, (S, ))
                if context.get_binding_shape(2)[0] != 2:
                    context.set_binding_shape(2, (2, ))
                if context.get_binding_shape(3)[0] != S:
                    context.set_binding_shape(3, (S, ))

                h_input_ids = cuda.register_host_memory(
                    np.ascontiguousarray(input_ids.ravel()))
                h_segment_ids = cuda.register_host_memory(
                    np.ascontiguousarray(segment_ids.ravel()))
                h_cu_seq_lens = cuda.register_host_memory(
                    np.ascontiguousarray(cu_seq_lens.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], h_input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], h_segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], h_cu_seq_lens, stream)

                # Run inference
                context.execute_async_v2(
                    bindings=[int(d_inp)
                              for d_inp in d_inputs] + [int(d_output)],
                    stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                # Only retrieve and post-process the first batch
                networkOutputs.append(
                    _NetworkOutput(start_logits=np.array(h_output[0:S]),
                                   end_logits=np.array(h_output[S:S * 2]),
                                   feature_index=feature_index))

            eval_time_elapsed /= len(features)

            # Total number of n-best predictions to generate in the nbest_predictions.json output file
            n_best_size = 20

            # The maximum length of an answer that can be generated. This is needed
            # because the start and end predictions are not conditioned on one another
            max_answer_length = 30

            prediction, nbest_json, scores_diff_json = dp.get_predictions(
                tokens, features, networkOutputs, args.n_best_size,
                args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
Example #22
    x_orig = x.copy()
    start = time()
    increment(
            drv.InOut(x),
            np.uint32(N),
            block=(320, 1, 1),
            grid=(N // 320, 1, 1)
            )
    times[i] = time() - start
    assert np.allclose(x_orig + 1, x), "%r %r" % (x_orig, x)

print "Average kernel execution time with pageable memory: %3.7f" % np.mean(times)

# Time use of pinned host memory:
#x = drv.aligned_empty((N, N), dtype=np.float64, order='C')
x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)

x_gpu_ptr = np.intp(x.base.get_device_pointer())

times = np.empty(M)
for i in range(M):
    #x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(
            x_gpu_ptr,
            np.uint32(N),
            block=(320, 1, 1),
            grid=(N // 320, 1, 1)
            )
    times[i] = time() - start
Example #23
    def convert_image_rgb(self, image):
        global program
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
        assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
        pycuda.driver.memcpy_htod_async(in_buf, hmem, stream)

        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_stride = roundup(self.dst_width // x_div, 4)
            out_height = roundup(self.dst_height // y_div, 2)
            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

        kstart = time.time()
        kargs = [in_buf, numpy.int32(stride)]
        for i in range(3):
            kargs.append(out_bufs[i])
            kargs.append(numpy.int32(out_strides[i]))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        gridw = max(1, w // blockw // xdiv)
        if gridw * 2 * blockw < w:
            gridw += 1
        gridh = max(1, h // blockh // ydiv)
        if gridh * 2 * blockh < h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
        self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_size = out_sizes[i]
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_size, dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
            pixels.append(plane.data)
            stride = out_strides[min(len(out_strides)-1, i)]
            strides.append(stride)
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
Example #24
    def reg_mem(a):
        temp = drv.register_host_memory(
            a, flags=drv.mem_host_register_flags.DEVICEMAP)
        return numpy.intp(temp.base.get_device_pointer())
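Because reg_mem registers the array with mem_host_register_flags.DEVICEMAP, the pinned pages are also mapped into the device address space, so the returned pointer can be handed straight to a kernel (zero-copy access, where the platform supports mapped pinned memory). A hedged usage sketch reusing reg_mem from above; the kernel is illustrative:

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *a) { a[threadIdx.x] *= 2.0f; }
""")
scale = mod.get_function("scale")

a = drv.aligned_empty((32,), dtype=numpy.float32)
a[:] = 1.0
a_ptr = reg_mem(a)  # device-visible pointer into the pinned host array
scale(a_ptr, block=(32, 1, 1), grid=(1, 1))
drv.Context.synchronize()  # the kernel wrote straight into `a`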
Example #25
def chyqmom9_pycuda(moments: np.ndarray, size: int, w: np.ndarray,
                    x: np.ndarray, y: np.ndarray, batch_size: int):

    mem_d_size_in_byte = np.ones(size).astype(np.float32).nbytes
    sizeof_float = np.int32(np.dtype(np.float32).itemsize)
    size = np.int32(size)

    # Allocate one concurrent stream to each batch
    num_stream = batch_size
    streams = []
    for i in range(num_stream):
        streams.append(cuda.Stream())

    BlockSize = (256, 1, 1)
    GridSize = (size + BlockSize[0] - 1) / BlockSize[0]
    GridSize = (int(GridSize), 1, 1)

    # timers
    event_start = cuda.Event()
    event_stop = cuda.Event()

    size_per_batch = np.int32(np.ceil(float(size) / batch_size))
    print("size_per_batch: ", size_per_batch)

    # initialize kernels
    c_kernel = CHYQMOM9.get_function('chyqmom9_cmoments')
    float_value_set = CHYQMOM9.get_function('float_value_set')
    float_array_set = CHYQMOM9.get_function('float_array_set')
    chyqmom9_mu_yf = CHYQMOM9.get_function('chyqmom9_mu_yf')
    chyqmom9_wout = CHYQMOM9.get_function('chyqmom9_wout')
    chyqmom9_xout = CHYQMOM9.get_function('chyqmom9_xout')
    chyqmom9_yout = CHYQMOM9.get_function('chyqmom9_yout')

    moments_d = []
    this_moment = []
    this_x = []
    this_w = []
    this_y = []
    w_out_d = []
    x_out_d = []
    y_out_d = []

    c_moments = []
    mu = []
    yf = []

    m1 = []
    x1 = []
    w1 = []
    x2 = []
    w2 = []

    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc
        # allocate memory on device
        moments_d.append(
            cuda.mem_alloc(int(sizeof_float * size_per_batch * 10)))
        w_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))
        x_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))
        y_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))
        this_moment.append(
            np.ascontiguousarray(moments[:, loc:loc + size_per_batch],
                                 dtype=np.float32))
        this_moment[i] = cuda.register_host_memory(
            this_moment[i], cuda.mem_host_register_flags.PORTABLE)

        this_w.append(
            np.ascontiguousarray(np.zeros_like(w[:,
                                                 loc:loc + size_per_batch])))
        this_w[i] = cuda.register_host_memory(
            this_w[i], cuda.mem_host_register_flags.PORTABLE)
        this_x.append(
            np.ascontiguousarray(np.zeros_like(x[:,
                                                 loc:loc + size_per_batch])))
        this_x[i] = cuda.register_host_memory(
            this_x[i], cuda.mem_host_register_flags.PORTABLE)
        this_y.append(
            np.ascontiguousarray(np.zeros_like(y[:,
                                                 loc:loc + size_per_batch])))
        this_y[i] = cuda.register_host_memory(
            this_y[i], cuda.mem_host_register_flags.PORTABLE)

        c_moments.append(cuda.mem_alloc(int(sizeof_float * size_per_batch *
                                            7)))
        mu.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        yf.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))

        m1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 5)))
        float_value_set(m1[i],
                        np.float32(1),
                        size_per_batch,
                        np.int32(0),
                        block=BlockSize,
                        grid=GridSize,
                        stream=streams[i])
        float_value_set(m1[i],
                        np.float32(0),
                        size_per_batch,
                        size_per_batch,
                        block=BlockSize,
                        grid=GridSize,
                        stream=streams[i])
        x1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        w1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        x2.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        w2.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))

    hyq = hyqmom.Hyqmom(BlockSize, GridSize)

    event_start.record()
    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc

        cuda.memcpy_htod_async(moments_d[i], this_moment[i], stream=streams[i])

        c_kernel(moments_d[i],
                 c_moments[i],
                 size_per_batch,
                 block=BlockSize,
                 grid=GridSize,
                 stream=streams[i])

        float_array_set(m1[i],
                        c_moments[i],
                        np.int32(size_per_batch),
                        np.int32(size_per_batch * 2),
                        np.int32(0),
                        block=BlockSize,
                        grid=GridSize,
                        stream=streams[i])
        float_array_set(m1[i],
                        c_moments[i],
                        np.int32(size_per_batch * 2),
                        np.int32(size_per_batch * 3),
                        np.int32(size_per_batch * 4),
                        block=BlockSize,
                        grid=GridSize,
                        stream=streams[i])
        hyq.hyqmom3(m1[i],
                    x1[i],
                    w1[i],
                    size_per_batch,
                    block=BlockSize,
                    grid=GridSize,
                    stream=streams[i])
        chyqmom9_mu_yf(c_moments[i],
                       x1[i],
                       w1[i],
                       yf[i],
                       mu[i],
                       size_per_batch,
                       block=BlockSize,
                       grid=GridSize,
                       stream=streams[i])
        float_array_set(m1[i],
                        mu[i],
                        np.int32(size_per_batch * 3),
                        np.int32(size_per_batch * 2),
                        np.int32(0),
                        block=BlockSize,
                        grid=GridSize,
                        stream=streams[i])
        hyq.hyqmom3(m1[i],
                    x2[i],
                    w2[i],
                    size_per_batch,
                    size_per_batch,
                    block=BlockSize,
                    grid=GridSize,
                    stream=streams[i])

    for i in range(0, num_stream, 1):
        streams[i].synchronize()
        chyqmom9_wout(moments_d[i],
                      w1[i],
                      w2[i],
                      w_out_d[i],
                      size_per_batch,
                      block=BlockSize,
                      grid=GridSize,
                      stream=streams[i])

        # w[:, loc:loc+size_per_batch] = cuda.from_device(w_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_w[i], w_out_d[i], stream=streams[i])

        chyqmom9_xout(moments_d[i],
                      x1[i],
                      x_out_d[i],
                      size_per_batch,
                      block=BlockSize,
                      grid=GridSize,
                      stream=streams[i])

        # x[:, loc:loc+size_per_batch] = cuda.from_device(x_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_x[i], x_out_d[i], stream=streams[i])

        chyqmom9_yout(moments_d[i],
                      x2[i],
                      yf[i],
                      y_out_d[i],
                      size_per_batch,
                      block=BlockSize,
                      grid=GridSize,
                      stream=streams[i])

        # y[:, loc:loc+size_per_batch] = cuda.from_device(y_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_y[i], y_out_d[i], stream=streams[i])

    event_stop.record()
    event_stop.synchronize()

    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc

        w[:, loc:loc + size_per_batch] = this_w[i]
        y[:, loc:loc + size_per_batch] = this_y[i]
        x[:, loc:loc + size_per_batch] = this_x[i]

    for i in range(0, num_stream, 1):

        # free device memory and unregister the pinned host chunks
        moments_d[i].free()
        w_out_d[i].free()
        x_out_d[i].free()
        y_out_d[i].free()
        this_moment[i].base.unregister()

        this_w[i].base.unregister()
        this_x[i].base.unregister()
        this_y[i].base.unregister()

        c_moments[i].free()
        mu[i].free()
        yf[i].free()

        m1[i].free()
        x1[i].free()
        w1[i].free()
        x2[i].free()
        w2[i].free()

    calc_time = event_stop.time_since(event_start)
    return calc_time
Example #26
s = cuda.Event()
e = cuda.Event()
s.record()

N = np.int32(1 << 20)
a = np.float32(2)

x = np.ones(N, dtype=np.float32)
y = 2. * np.ones(N, dtype=np.float32)

nStreams = 2
streams = [cuda.Stream() for i in range(nStreams)]

x_pin = [
    cuda.register_host_memory(x[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)
Example #27
def clean(res,
          ker,
          mdl=None,
          area=None,
          gain=0.1,
          maxiter=10000,
          tol=1e-3,
          stop_if_div=True,
          verbose=False):
    #s = cuda.Event()
    #e = cuda.Event()

    res = np.array(res)
    ker = np.array(ker)
    if mdl is not None:
        mdl = np.array(mdl)
    if area is not None:
        area = np.array(area)

    isComplex = (res.dtype == np.complex64)
    imgType = res.dtype

    oneImg = (res.ndim == 1)
    gain = np.float64(gain)
    maxiter = np.int32(maxiter)
    tol = np.float64(tol)
    stop_if_div = np.int32(stop_if_div)

    if oneImg:
        dim = len(res)
        res = np.array([res], dtype=imgType)
        ker = np.array([ker], dtype=imgType)

        if mdl is None:
            mdl = np.array([np.zeros(dim, dtype=imgType)], dtype=imgType)
        else:
            res[0] = res[0] - np.fft.ifft(
                np.fft.fft(mdl) * np.fft.fft(ker[0])).astype(imgType)
            mdl = np.array([mdl], dtype=imgType)

        if area is None:
            area = np.array([np.ones(dim, dtype=np.int32)], dtype=np.int32)
        else:
            area = np.array([area], dtype=np.int32)

    else:
        dim = len(res[0])
        numImgs = len(res)
        res = np.array(res, dtype=imgType)

        if ker.ndim == 1:
            ker = np.array([ker] * numImgs, dtype=imgType)
        else:
            ker = np.array(ker, dtype=imgType)

        if mdl is None:
            mdl = np.array([np.zeros(dim, dtype=imgType)] * numImgs,
                           dtype=imgType)
        elif mdl.ndim == 1:
            res = np.array([
                res[i] - np.fft.ifft(
                    np.fft.fft(mdl) * np.fft.fft(ker[i])).astype(imgType)
                for i in range(numImgs)
            ])
            mdl = np.array([mdl] * numImgs, dtype=imgType)
        else:
            res = np.array([
                res[i] - np.fft.ifft(
                    np.fft.fft(mdl[i]) * np.fft.fft(ker[i])).astype(imgType)
                for i in range(numImgs)
            ])
            mdl = np.array(mdl, dtype=imgType)

        if area is None:
            area = np.array([np.ones(dim, dtype=np.int32)] * numImgs,
                            dtype=np.int32)
        elif area.ndim == 1:
            area = np.array([area] * numImgs, dtype=np.int32)
        else:
            area = np.array(area, dtype=np.int32)

    blockDimX = min(1024, len(ker))
    block = (blockDimX, 1, 1)

    grid = (int(ceil(len(ker) / blockDimX)), 1, 1)

    # block=(1,1,1)
    # grid=(1,1,1)

    # Make each argument a pointer one level deeper; each thread uses its index
    # to pick out its own slice at the start, then proceeds as normal.

    code_complex = """
	#pragma comment(linker, "/HEAP:40000000")
	#include <cuComplex.h>
	#include <stdio.h>
	#include <cmath>
	
	__global__ void clean(cuFloatComplex *resP, cuFloatComplex *kerP, cuFloatComplex *mdlP, int* areaP, int stop_if_div)
	{
		const int dim = %(DIM)s;
		const int maxiter = %(MAXITER)s;
		const double gain = %(GAIN)s;
		const double tol = %(TOL)s;
		const int index = blockDim.x * blockIdx.x + threadIdx.x;

		cuFloatComplex *res = resP + index * %(DIM)s;
		cuFloatComplex *ker = kerP + index * %(DIM)s;
		cuFloatComplex *mdl = mdlP + index * %(DIM)s;
		int *area = areaP + index * %(DIM)s;


		float maxr=0, maxi=0, valr=0, vali, stepr, stepi, qr=0, qi=0;
		float score=-1, nscore, best_score=-1;
		float mmax, mval, mq=0;
		float firstscore=-1;
		int argmax=0, nargmax=0, wrap_n;
		
		cuFloatComplex best_mdl[%(DIM)s];
		cuFloatComplex best_res[%(DIM)s];

		cuFloatComplex stepComplex;

		
		// Compute gain/phase of kernel
		for (int n = 0; n < %(DIM)s; n++)
		{
			valr = cuCrealf(ker[n]);
			vali = cuCimagf(ker[n]);
			mval = valr * valr + vali * vali;
			if (mval > mq && area[n])
			{
				mq = mval;
				qr = valr;
				qi = vali;
			}
		}
		qr /= mq;
		qi /= -mq;
		// The clean loop
		for (int i = 0; i < maxiter; i++)
		{
			nscore = 0;
			mmax = -1;
			stepr = (float) gain * (maxr * qr - maxi * qi);
			stepi = (float) gain * (maxr * qi + maxi * qr);

			stepComplex = make_cuFloatComplex(stepr, stepi);

			mdl[argmax] = cuCaddf(mdl[argmax], stepComplex);

			// Take next step and compute score
			for (int n = 0; n < %(DIM)s; n++)
			{
				wrap_n = (n + argmax) %% dim;

				float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
				float realSub = kr * stepr - ki * stepi;
				float imagSub = kr * stepi + ki * stepr;
				res[wrap_n] = cuCsubf(res[wrap_n], make_cuFloatComplex(realSub, imagSub));


				valr = cuCrealf(res[wrap_n]);
				vali = cuCimagf(res[wrap_n]);
				mval = valr * valr + vali * vali;
				nscore += mval;
				if (mval > mmax && area[wrap_n])
				{
					nargmax = wrap_n;
					maxr = valr;
					maxi = vali;
					mmax = mval;
				}
			}
			nscore = sqrt(nscore/dim);
			if (firstscore < 0) firstscore = nscore;
			if (score > 0 && nscore > score)
			{
				if (stop_if_div)
				{
					// We've diverged: undo last step and give up
					mdl[argmax] = cuCsubf(mdl[argmax], stepComplex);

					for (int n=0; n < dim; n++)
					{
						wrap_n = (n + argmax) %% dim;

						float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
						float realAdd = kr * stepr - ki * stepi;
						float imagAdd = kr * stepi + ki * stepr;
						res[wrap_n] = cuCaddf(res[wrap_n], make_cuFloatComplex(realAdd, imagAdd));
					}
					return;
				} else if (best_score < 0 || score < best_score)
				{
					// We've diverged: buf prev score in case it's global best
					for (int n=0; n < dim; n++)
					{
						wrap_n = (n + argmax) %% dim;
						best_mdl[n] = mdl[n];

						float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
						float realAdd = kr * stepr - ki * stepi;
						float imagAdd = kr * stepi + ki * stepr;
						best_res[wrap_n] = cuCaddf(res[wrap_n], make_cuFloatComplex(realAdd, imagAdd));
					}
					best_mdl[argmax] = cuCsubf(best_mdl[argmax], stepComplex);

					best_score = score;
					i = 0; // Reset maxiter counter
				}
			} else if (score > 0 && (score - nscore) / firstscore < tol)
			{
				// We're done
				return;
			} else if (not stop_if_div && (best_score < 0 || nscore < best_score))
			{
				i = 0; // Reset maxiter counter
			}
			score = nscore;
			argmax = nargmax;
		}
		// If we end on maxiter, then make sure mdl/res reflect best score
		if (best_score > 0 && best_score < nscore)
		{
			for (int n=0; n < dim; n++)
			{
				mdl[n] = best_mdl[n];
				res[n] = best_res[n];
			}
		}
	}
	"""

    code = """
	#pragma comment(linker, "/HEAP:40000000")
	#include <stdio.h>
	#include <cmath>

	__global__ void clean(float *resP, float *kerP, float *mdlP, int *areaP, int stop_if_div)
	{

		const int dim = %(DIM)s;
		const int maxiter = %(MAXITER)s;
		const double gain = %(GAIN)s;
		const double tol = %(TOL)s;
		const int index = blockDim.x * blockIdx.x + threadIdx.x;

		float *res = resP + index * %(DIM)s;
		float *ker = kerP + index * %(DIM)s;
		float *mdl = mdlP + index * %(DIM)s;
		int *area = areaP + index * %(DIM)s;

		float score=-1, nscore, best_score=-1;
       	float max=0, mmax, val, mval, step, q=0, mq=0;
        float firstscore=-1;
        int argmax=0, nargmax=0, wrap_n;

        float best_mdl[%(DIM)s], best_res[%(DIM)s];

        // Compute gain/phase of kernel
        for (int n=0; n < dim; n++) {
            val = ker[n];
            mval = val * val;
            if (mval > mq && area[n]) {
                mq = mval;
                q = val;
            }
        }
        q = 1/q;
        // The clean loop
        for (int i=0; i < maxiter; i++) {
            nscore = 0;
            mmax = -1;
            step = (float) gain * max * q;
            mdl[argmax] += step;
            // Take next step and compute score
            for (int n=0; n < dim; n++) {
                wrap_n = (n + argmax) %% dim;
                res[wrap_n] -= ker[n] * step;
                val = res[wrap_n];
                mval = val * val;
                nscore += mval;
                if (mval > mmax && area[wrap_n]) {
                    nargmax = wrap_n;
                    max = val;
                    mmax = mval;
                }
            }
            nscore = sqrt(nscore / dim);
            if (firstscore < 0) firstscore = nscore;

			if (i > 10000)
            {
			printf("MY CLEAN Iter %%d: Max=(%%d), Score = %%f, Prev = %%f\\n", \
                    i, nargmax, (double) (nscore/firstscore), \
					(double) (score/firstscore));
			}

            if (score > 0 && nscore > score) {
                if (stop_if_div) {
                    // We've diverged: undo last step and give up
                    mdl[argmax] -= step;
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        res[wrap_n] += ker[n] * step;
                    }
                    return;
                } else if (best_score < 0 || score < best_score) {
                    // We've diverged: buf prev score in case it's global best
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        best_mdl[n] = mdl[n];
                        best_res[wrap_n] = res[wrap_n] + ker[n] * step;
                    }
                    best_mdl[argmax] -= step;
                    best_score = score;
                    i = 0;  // Reset maxiter counter
                }
            } else if (score > 0 && (score - nscore) / firstscore < tol) {
                // We're done
                return;
            } else if (!stop_if_div && (best_score < 0 || nscore < best_score)) {
                i = 0;  // Reset maxiter counter
            }
            score = nscore;
            argmax = nargmax;
        }
        // If we end on maxiter, then make sure mdl/res reflect best score
        if (best_score > 0 && best_score < nscore) {
            for (int n=0; n < dim; n++) {
                mdl[n] = best_mdl[n];
                res[n] = best_res[n];
           }
        }
	}
	
	"""
    code = code % {
        'DIM': dim,
        'MAXITER': maxiter,
        'GAIN': gain,
        'TOL': tol,
    }
    code_complex = code_complex % {
        'DIM': dim,
        'MAXITER': maxiter,
        'GAIN': gain,
        'TOL': tol,
    }

    if isComplex:
        mod = SourceModule(code_complex, options=["-fmad=false"])
    else:
        mod = SourceModule(code, options=["-fmad=false"])

    clean = mod.get_function("clean")

    res_pin = cuda.register_host_memory(res)
    ker_pin = cuda.register_host_memory(ker)
    mdl_pin = cuda.register_host_memory(mdl)
    area_pin = cuda.register_host_memory(area)

    res_gpu = cuda.mem_alloc(res.nbytes)
    ker_gpu = cuda.mem_alloc(ker.nbytes)
    mdl_gpu = cuda.mem_alloc(mdl.nbytes)
    area_gpu = cuda.mem_alloc(area.nbytes)

    cuda.memcpy_htod(res_gpu, res_pin)
    cuda.memcpy_htod(ker_gpu, ker_pin)
    cuda.memcpy_htod(mdl_gpu, mdl_pin)
    cuda.memcpy_htod(area_gpu, area_pin)

    clean.prepare("PPPPi")
    clean.prepared_call(grid, block, res_gpu, ker_gpu, mdl_gpu, area_gpu,
                        stop_if_div)

    cuda.memcpy_dtoh(res_pin, res_gpu)
    cuda.memcpy_dtoh(mdl_pin, mdl_gpu)

    if oneImg:
        return mdl_pin[0], res_pin[0]
    return mdl_pin, res_pin
Example #28
stream = []
for i in range(nStreams):
    stream.append(cuda.Stream())

idata = np.tril(np.ones((N, N), dtype=np.float32))

odata = np.zeros_like(idata, dtype=np.float32)

idata_pin_list = []
odata_pin_list = []

for i in range(nStreams):
    i_slice = idata[i * N // nStreams:(i + 1) * N // nStreams]
    o_slice = odata[i * N // nStreams:(i + 1) * N // nStreams]
    idata_pin_list.append(cuda.register_host_memory(i_slice))
    odata_pin_list.append(cuda.register_host_memory(o_slice))

#print idata_pin_list[0]

# Using independent host->device, kernel, device->host streams?
idata_gpu_list = []
odata_gpu_list = []

for i in range(nStreams):
    idata_gpu_list.append(cuda.mem_alloc(idata.nbytes // nStreams))
    odata_gpu_list.append(cuda.mem_alloc(odata.nbytes // nStreams))

    cuda.memcpy_htod_async(idata_gpu_list[i], idata_pin_list[i])
    cuda.memcpy_htod_async(odata_gpu_list[i], odata_pin_list[i])
Example #29
def MedianFilter(input=None, kernel_size=3, bw=32, bh=32):

	#s = cuda.Event()
	#e = cuda.Event()

	input_list = input

	BLOCK_WIDTH = bw
	BLOCK_HEIGHT = bh

	if isinstance(kernel_size, int):
		kernel_size = [kernel_size] * 2

	WS_x, WS_y = kernel_size
	padding_y = WS_x // 2
	padding_x = WS_y // 2

	input_list = np.asarray(input_list)

	if input_list.ndim == 3:
		_, N, M = input_list.shape
	elif input_list.ndim == 2:
		N, M = input_list.shape
		input_list = [input_list]

	expanded_N = N + (2 * padding_y)
	expanded_M = M + (2 * padding_x)

	gridx = int(np.ceil((expanded_N)/BLOCK_WIDTH))+1
	gridy = int(np.ceil((expanded_M)/BLOCK_HEIGHT))+1
	grid = (gridx,gridy, 1)
	block = (BLOCK_WIDTH, BLOCK_HEIGHT, 1)

	code = """
		#pragma comment(linker, "/HEAP:4000000")


		/* Some sample C code for the quickselect algorithm, 
		   taken from Numerical Recipes in C. */

		#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

		__device__ float quickselect(float *arr, int n, int k) {
		  unsigned long i,ir,j,l,mid;
		  float a,temp;

		  l=0;
		  ir=n-1;
		  for(;;) {
		    if (ir <= l+1) { 
		      if (ir == l+1 && arr[ir] < arr[l]) {
			SWAP(arr[l],arr[ir]);
		      }
		      return arr[k];
		    }
		    else {
		      mid=(l+ir) >> 1; 
		      SWAP(arr[mid],arr[l+1]);
		      if (arr[l] > arr[ir]) {
			SWAP(arr[l],arr[ir]);
		      }
		      if (arr[l+1] > arr[ir]) {
			SWAP(arr[l+1],arr[ir]);
		      }
		      if (arr[l] > arr[l+1]) {
			SWAP(arr[l],arr[l+1]);
		      }
		      i=l+1; 
		      j=ir;
		      a=arr[l+1]; 
		      for (;;) { 
			do i++; while (arr[i] < a); 
			do j--; while (arr[j] > a); 
			if (j < i) break; 
			SWAP(arr[i],arr[j]);
		      } 
		      arr[l+1]=arr[j]; 
		      arr[j]=a;
		      if (j >= k) ir=j-1; 
		      if (j <= k) l=i;
		    }
		  }
		}

		/* https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489
		 * Implementation from Stack Exchange user: Andy Dansby 
		 */

		__device__ float FloydWirth_kth(float arr[], const int kTHvalue) 
		{
		#define F_SWAP(a,b) { float temp=(a);(a)=(b);(b)=temp; }
		#define SIGNUM(x) ((x) < 0 ? -1 : ((x) > 0 ? 1 : (x)))

		    int left = 0;       
		    int right = %(WS^2)s - 1;     
		    int left2 = 0;
		    int right2 = %(WS^2)s - 1;

		    while (left < right) 
		    {           
		        if( arr[right2] < arr[left2] ) F_SWAP(arr[left2],arr[right2]);
		        if( arr[right2] < arr[kTHvalue] ) F_SWAP(arr[kTHvalue],arr[right2]);
		        if( arr[kTHvalue] < arr[left2] ) F_SWAP(arr[left2],arr[kTHvalue]);

		        int rightleft = right - left;

		        if (rightleft < kTHvalue)
		        {
		            int n = right - left + 1;
		            int ii = kTHvalue - left + 1;
		            int s = (n + n) / 3;
		            int sd = (n * s * (n - s) / n) * SIGNUM(ii - n / 2);
		            int left2 = max(left, kTHvalue - ii * s / n + sd);
		            int right2 = min(right, kTHvalue + (n - ii) * s / n + sd);              
		        }

		        float x=arr[kTHvalue];

		        while ((right2 > kTHvalue) && (left2 < kTHvalue))
		        {
		            do 
		            {
		                left2++;
		            }while (arr[left2] < x);

		            do
		            {
		                right2--;
		            }while (arr[right2] > x);

		            F_SWAP(arr[left2],arr[right2]);
		        }
		        left2++;
		        right2--;

		        if (right2 < kTHvalue) 
		        {
		            while (arr[left2]<x)
		            {
		                left2++;
		            }
		            left = left2;
		            right2 = right;
		        }

		        if (kTHvalue < left2) 
		        {
		            while (x < arr[right2])
		            {
		                right2--;
		            }

		            right = right2;
		            left2 = left;
		        }

		        if( arr[left] < arr[right] ) F_SWAP(arr[right],arr[left]);
		    }

		#undef F_SWAP
		#undef SIGNUM
		    return arr[kTHvalue];
		}



		texture<float, 2> tex;

		__global__ void mf(float* in, float* out, int imgDimY, int imgDimX)
		{
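			// Note: imgDimY is the row stride (the padded image's number of
			// columns) and imgDimX the number of rows -- matching the
			// (expanded_M, expanded_N) arguments passed by the host below.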

			float window[%(WS^2)s];

			int x_thread_offset = %(BY)s * blockIdx.x + threadIdx.x;
			int y_thread_offset = %(BX)s * blockIdx.y + threadIdx.y;
			for (int y = %(WSx/2)s + y_thread_offset; y < imgDimX - %(WSx/2)s; y += %(y_stride)s)
			{
				for (int x = %(WSy/2)s + x_thread_offset; x < imgDimY - %(WSy/2)s; x += %(x_stride)s)
				{
					int i = 0;
					for (int fx = 0; fx < %(WSy)s; ++fx)
					{
						for (int fy = 0; fy < %(WSx)s; ++fy)
						{
							//window[i] = tex2D(tex, (float) (x + fx - %(WSy/2)s), (float) (y + fy - %(WSx/2)s));
							window[i] = in[(x + fx - %(WSy/2)s) + (y + fy - %(WSx/2)s)*imgDimY];
							i += 1;
						}
					}

					// Sort to find the median
					//for (int j = 0; j < %(WS^2)s/2 + 1; j++)
					//{
					//	for (int k = j + 1; k < %(WS^2)s; k++)
					//	{
					//		if (window[j] > window[k])
					//		{
					//			float tmp = window[j];
					//			window[j] = window[k];
					//			window[k] = tmp;
					//		}
					//	}
					//}
					//out[y*imgDimY + x] = window[%(WS^2)s/2];
					out[y*imgDimY + x] = FloydWirth_kth(window, %(WS^2)s/2);
					//out[y*imgDimY + x] = quickselect(window, %(WS^2)s, %(WS^2)s/2);
				}
			}
		}


		__global__ void mf_shared(float *in, float* out, int imgDimY, int imgDimX)
		{			
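			// Shared-memory variant of mf: the block's input tile is staged in
			// shared memory before each thread gathers its window. (The host
			// code below leaves this kernel disabled and calls mf instead.)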

			const int TSx = %(BX)s + %(WSx)s - 1;
			const int TSy = %(BY)s + %(WSy)s - 1;
			__shared__ float tile[TSx][TSy];

			float window[%(WS^2)s];
			const int x_thread_offset = %(BX)s * blockIdx.x + threadIdx.x;
			const int y_thread_offset = %(BY)s * blockIdx.y + threadIdx.y;


			const int thread_index = blockDim.y * threadIdx.x + threadIdx.y;

			int imgX = blockIdx.x * blockDim.x + thread_index;
			int imgY;

			// Load the input tile for this block into shared memory
			if (thread_index < TSx && imgX < imgDimX)
			{
				for (int i = 0; i < TSy && i < imgDimY - blockIdx.y * blockDim.y; i++)
				{
					imgY = blockIdx.y * blockDim.y + i;
					tile[thread_index][i] = in[imgX * imgDimY + imgY];
					//tile[thread_index][i] = tex2D(tex, (float) imgY, (float) imgX);
				}

			}

			__syncthreads();


			int x = %(WSx/2)s + x_thread_offset;
			int y = %(WSy/2)s + y_thread_offset;

			if (x >= imgDimX - %(WSx/2)s || y >= imgDimY - %(WSy/2)s)
			{
				return;
			}

			int i = 0;
			for (int fx = 0; fx < %(WSx)s; ++fx)
			{
				for (int fy = 0; fy < %(WSy)s; ++fy)
				{
					window[i++] = tile[threadIdx.x + fx][threadIdx.y + fy];
				}
			}


			// Sort to find the median
			//for (int j = 0; j <= %(WS^2)s/2; j++)
			//{
			//	for (int k = j + 1; k < %(WS^2)s; k++)
			//	{
			//		if (window[j] > window[k])
			//		{
			//			float tmp = window[j];
			//			window[j] = window[k];
			//			window[k] = tmp;
			//		}
			//	}
			//}
			//out[x*imgDimY + y] = window[%(WS^2)s/2];

			out[x*imgDimY + y] = FloydWirth_kth(window, %(WS^2)s/2);

			//forgetfulSelection(window, %(WSx)s);
			//out[x*imgDimY + y] = window[%(WS^2)s/2];

			//out[x*imgDimY + y] = myForgetfulSelection(window);
		}

		"""

	code = code % {
			'BY' : BLOCK_WIDTH,
			'BX' : BLOCK_HEIGHT,
			'WS^2' : WS_x * WS_y,
			'x_stride' : BLOCK_WIDTH * gridx,
			'y_stride' : BLOCK_HEIGHT * gridy,
			'WSx' : WS_x,
			'WSy' : WS_y,
			'WSx/2' : WS_x // 2,
			'WSy/2' : WS_y // 2,
		}
	mod = SourceModule(code)
	#mf_shared = mod.get_function('mf_shared')
	mf = mod.get_function('mf')
	texref = mod.get_texref("tex")


	# NSTREAMS := NUMBER OF INPUT IMAGES
	nStreams = len(input_list)

	# Initialize one stream per image; note that a list multiply
	# ([cuda.Stream()]*nStreams) would alias a single stream across all images
	stream = [cuda.Stream() for _ in range(nStreams)]

	# Pad all the images with zeros
	input_list = [np.array( np.pad(img, ( (padding_y, padding_y), (padding_x, padding_x) ), 'constant', constant_values=0) , dtype=np.float32) for img in input_list]

	# Use pinned memory for all the images
	in_pin_list = [cuda.register_host_memory(img) for img in input_list]
	imgBytes = in_pin_list[0].nbytes

	# Initialize the outputs to empty images (assuming all images are of the same shape)
	outdata_list = [cuda.pagelocked_empty_like(img) for img in input_list]

	# Malloc on the GPU for each input and output image
	#in_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
	in_gpu_list = [None]*nStreams
	#out_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
	out_gpu_list = [None]*nStreams
	mf.prepare("PPii")
	for i in xrange(nStreams + 2):
		ii = i - 1
		iii = i - 2

		if 0 <= iii < nStreams:
			st = stream[iii]
			cuda.memcpy_dtoh_async(outdata_list[iii], out_gpu_list[iii], stream=st)

		if 0 <= ii < nStreams:
			st = stream[ii]
			out_gpu_list[ii] = cuda.mem_alloc(imgBytes)
			# s.record(stream=stream[0])
			# mf_shared.prepare("Pii")
			# mf_shared.prepared_async_call(grid, block, st, out_gpu_list[ii], expanded_M, expanded_N)

			#mf.prepare("PPii")
			mf.prepared_async_call(grid, block, st, in_gpu_list[ii], out_gpu_list[ii], expanded_M, expanded_N)
			# e.record(stream=stream[0])
			# e.synchronize()
			# print s.time_till(e), "ms for the kernel"

		if 0 <= i < nStreams:
			st = stream[i]
			#cuda.matrix_to_texref(in_pin_list[i], texref, order="C")
			in_gpu_list[i] = cuda.mem_alloc(imgBytes)
			cuda.memcpy_htod_async(in_gpu_list[i], in_pin_list[i], stream=st)

	# Wait for the asynchronous device-to-host copies to finish before unpadding
	for st in stream:
		st.synchronize()

	if (padding_y > 0):
		outdata_list = [out[padding_y:-padding_y] for out in outdata_list]
	if (padding_x > 0):
		outdata_list = [out[:, padding_x:-padding_x] for out in outdata_list]

	return outdata_list
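
# A minimal usage sketch. The enclosing function's name and signature are cut
# off above this excerpt, so `median_filter` below is a hypothetical stand-in
# for whatever the function is actually called:
#
#   images = [np.random.rand(512, 512).astype(np.float32) for _ in range(4)]
#   filtered = median_filter(images, kernel_size=3)
#   # each filtered image has the same shape as its (unpadded) input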
Example #30
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
import skcuda.misc as misc
import pycuda.driver as cuda

s = cuda.Event()
e = cuda.Event()
s.record()

linalg.init()
M = 4096
N = 4096

A = np.asarray(np.random.rand(M, N), dtype=np.float32)
B = np.asarray(np.random.rand(N, M), dtype=np.float32)

A_pin = cuda.register_host_memory(A)
B_pin = cuda.register_host_memory(B)

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
C_gpu = linalg.dot(A_gpu, B_gpu)
#print np.allclose(np.dot(A,B), C_gpu.get())

e.record()
e.synchronize()
print s.time_till(e) / 1000., "s"
Example #31
    def convert_image_rgb(self, image):
        global program
        start = time.time()
        iplanes = image.get_planes()
        w = image.get_width()
        h = image.get_height()
        stride = image.get_rowstride()
        pixels = image.get_pixels()
        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image,
              iplanes, type(pixels), len(pixels))
        assert iplanes == ImageWrapper.PACKED, "must use packed format as input"
        assert image.get_pixel_format(
        ) == self.src_format, "invalid source format: %s (expected %s)" % (
            image.get_pixel_format(), self.src_format)
        divs = get_subsampling_divs(self.dst_format)

        #copy packed rgb pixels to GPU:
        upload_start = time.time()
        stream = driver.Stream()
        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
        in_buf = driver.mem_alloc(len(pixels))
        hmem = driver.register_host_memory(
            mem, driver.mem_host_register_flags.DEVICEMAP)
        driver.memcpy_htod_async(in_buf, mem, stream)

        out_bufs = []
        out_strides = []
        out_sizes = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_stride = roundup(self.dst_width / x_div, 4)
            out_height = roundup(self.dst_height / y_div, 2)
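            #mem_alloc_pitch returns both the buffer and the pitch it actually
            #chose (the driver may round the requested row stride up for
            #alignment), so out_stride is overwritten with the real pitch: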
            out_buf, out_stride = driver.mem_alloc_pitch(
                out_stride, out_height, 4)
            out_bufs.append(out_buf)
            out_strides.append(out_stride)
            out_sizes.append((out_stride, out_height))
        #ensure uploading has finished:
        stream.synchronize()
        #we can now unpin the host memory:
        hmem.base.unregister()
        debug("allocation and upload took %.1fms",
              1000.0 * (time.time() - upload_start))

        kstart = time.time()
        kargs = [in_buf, numpy.int32(stride)]
        for i in range(3):
            kargs.append(out_bufs[i])
            kargs.append(numpy.int32(out_strides[i]))
        blockw, blockh = 16, 16
        #figure out how many pixels we process at a time in each dimension:
        xdiv = max([x[0] for x in divs])
        ydiv = max([x[1] for x in divs])
        gridw = max(1, w / blockw / xdiv)
        if gridw * 2 * blockw < w:
            gridw += 1
        gridh = max(1, h / blockh / ydiv)
        if gridh * 2 * blockh < h:
            gridh += 1
        debug("calling %s%s, with grid=%s, block=%s",
              self.kernel_function_name, tuple(kargs), (gridw, gridh),
              (blockw, blockh, 1))
        self.kernel_function(*kargs,
                             block=(blockw, blockh, 1),
                             grid=(gridw, gridh))

        #we can now free the GPU source buffer:
        in_buf.free()
        kend = time.time()
        debug("%s took %.1fms", self.kernel_function_name,
              (kend - kstart) * 1000.0)
        self.frames += 1

        #copy output YUV channel data to host memory:
        read_start = time.time()
        pixels = []
        strides = []
        for i in range(3):
            x_div, y_div = divs[i]
            out_size = out_sizes[i]
            #direct full plane async copy keeping current GPU padding:
            plane = driver.aligned_empty(out_size, dtype=numpy.byte)
            driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
            pixels.append(plane.data)
            stride = out_strides[min(len(out_strides) - 1, i)]
            strides.append(stride)
        stream.synchronize()
        #the copying has finished, we can now free the YUV GPU memory:
        #(the host memory will be freed by GC when 'pixels' goes out of scope)
        for out_buf in out_bufs:
            out_buf.free()
        self.cuda_context.synchronize()
        read_end = time.time()
        debug("strides=%s", strides)
        debug("read back took %.1fms, total time: %.1f",
              (read_end - read_start) * 1000.0, 1000.0 * (time.time() - start))
        return ImageWrapper(0,
                            0,
                            self.dst_width,
                            self.dst_height,
                            pixels,
                            self.dst_format,
                            24,
                            strides,
                            planes=ImageWrapper._3_PLANES)
Example #32
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

s = cuda.Event()
e = cuda.Event()
s.record()

TILE_DIM = 32
BLOCK_ROWS = 8

N = 32 * 1024

idata = np.tril(np.ones((N, N), dtype=np.float32))

odata = np.empty_like(idata, dtype=np.float32)

idata_pin = cuda.register_host_memory(idata)
odata_pin = cuda.register_host_memory(odata)

#s.record()

idata_gpu = cuda.mem_alloc(idata.nbytes)
odata_gpu = cuda.mem_alloc(odata.nbytes)

cuda.memcpy_htod_async(idata_gpu, idata_pin)
cuda.memcpy_htod_async(odata_gpu, odata_pin)

#e.record()
#e.synchronize()
#print s.time_till(e)

code = """
Example #33
# The snippet begins mid-setup; the missing preamble (imports, device discovery,
# and the loop this first line belonged to) is reconstructed here. `image`,
# `imageDims` and `L` (the kernel height) are assumed to be defined earlier in
# the original file.
import numpy as np
import pycuda.driver as cuda

cuda.init()
n_devs = cuda.Device.count()
device = [cuda.Device(i) for i in range(n_devs)]
context = []
stream = []

# init contexts
for i in range(0, n_devs):
    context.append(device[i].make_context())
    # make_context leaves the new context current; pop it so each context is
    # pushed explicitly when needed below
    context[i].pop()

# init streams
for i in range(0, n_devs):
    # activate context for current device
    context[i].push()
    # create streams
    stream.append(cuda.Stream())
    # deactivate context for current device
    context[i].pop()

# activate the first context and register memory on host
context[0].push()
# define matrices in which each dev will compute slice of the final matrix
j_regs = []
i_reg = cuda.register_host_memory(image, 1)
for dev in device:
    # each slice that will contain the partial matrix of the final image
    # need to be L rows greater than the fraction of n_rows/n_devs to correctly
    # compute elements at the edges
    image_final_gpu_slice = np.empty(shape=(imageDims[0]//n_devs+L, imageDims[1]), dtype=np.float32)
    j_reg = cuda.register_host_memory(image_final_gpu_slice, 1)
    j_regs.append(j_reg)
# "here flag 1 stands for cudaHostRegisterPortable, which means that the pinned memory will be available
# for all contexts at the same time"
context[0].pop()

# split the data in array 'image' to equal parts
# and transfer the data to GPUs asynchronously
for i in range(0, n_devs):
    # input matrix is divided in vertical direction