Example No. 1
    def allocate(self, size):
        from traceback import extract_stack
        stack = tuple(frm[2] for frm in extract_stack())
        description = self.describe(stack, size)

        histogram = {}
        for bsize, descr in self.blocks.itervalues():
            histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1

        from pytools import common_prefix
        cpfx = common_prefix(descr for bsize, descr in histogram)

        print >> self.logfile, \
                "\n  Allocation of size %d occurring " \
                "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" \
                "\n      at: %s" % (
                (size, self.last_free)
                + cuda.mem_get_info()
                + (self.held_blocks, self.active_blocks,
                    description))

        hist_items = sorted(list(histogram.iteritems()))
        for (bsize, descr), count in hist_items:
            print >> self.logfile, \
                    "  %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count)

        if self.interactive:
            raw_input("  [Enter]")

        result = DeviceMemoryPool.allocate(self, size)
        self.blocks[result] = size, description
        self.last_free, _ = cuda.mem_get_info()
        return result
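
A minimal sketch of the same bookkeeping outside the pool subclass, assuming only that pycuda is installed and that pycuda.autoinit creates the context (the variable names are illustrative, not part of the original class):

import pycuda.autoinit  # noqa: F401  (creates a context on the first device)
import pycuda.driver as cuda
from pycuda.tools import DeviceMemoryPool

pool = DeviceMemoryPool()
free_before, total = cuda.mem_get_info()
buf = pool.allocate(16 * 1024 * 1024)        # ask the pool for 16 MB
free_after, _ = cuda.mem_get_info()
print("held blocks: %d, active blocks: %d" % (pool.held_blocks, pool.active_blocks))
# the drop may exceed the request because the pool rounds allocations up to its bin sizes
print("free memory dropped by %d bytes" % (free_before - free_after))
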
Example No. 2
def test_memleak():
    log.info("test_memleak()")
    from pycuda import driver
    #use the first device for this test
    start_free_memory = None
    for i in range(100):
        d = driver.Device(0)
        context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
        if start_free_memory is None:
            start_free_memory, _ = driver.mem_get_info()
        free_memory, total_memory = driver.mem_get_info()
        log.info("%s%% free_memory: %s MB, total_memory: %s MB", str(i).rjust(3), free_memory/1024/1024, total_memory/1024/1024)
        context.pop()
        context.detach()
        w = random.randint(16, 128)*8
        h = random.randint(16, 128)*8
        n = random.randint(2, 10)
        test_encoder(encoder_module, options={}, dimensions=[(w, h)], n_images=n)

    d = driver.Device(0)
    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
    end_free_memory, _ = driver.mem_get_info()
    context.pop()
    context.detach()
    log.info("memory lost: %s MB", (start_free_memory-end_free_memory)/1024/1024)
Example No. 3
def test_memleak():
    log.info("test_memleak()")
    from pycuda import driver
    #use the first device for this test
    start_free_memory = None
    for i in range(100):
        d = driver.Device(0)
        context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO
                                 | driver.ctx_flags.MAP_HOST)
        if start_free_memory is None:
            start_free_memory, _ = driver.mem_get_info()
        free_memory, total_memory = driver.mem_get_info()
        log.info("%s%% free_memory: %s MB, total_memory: %s MB",
                 str(i).rjust(3), free_memory / 1024 / 1024,
                 total_memory / 1024 / 1024)
        context.pop()
        context.detach()
        w = random.randint(16, 128) * 8
        h = random.randint(16, 128) * 8
        n = random.randint(2, 10)
        test_encoder(encoder_module,
                     options={},
                     dimensions=[(w, h)],
                     n_images=n)

    d = driver.Device(0)
    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO
                             | driver.ctx_flags.MAP_HOST)
    end_free_memory, _ = driver.mem_get_info()
    context.pop()
    context.detach()
    log.info("memory lost: %s MB",
             (start_free_memory - end_free_memory) / 1024 / 1024)
Example No. 4
    def __init__(self, init_data, n_generators):

        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        ################################################################################################################

        self.width_mat = np.int32(init_data.shape[0])
        #        self.gpu_init_data = ga.to_gpu(init_data)
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
        print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
        #        self.gpu_new_data = ga.to_gpu(self.cpu_new_data)

        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
        #        self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
        ################################################################################################################
        self.init_rng = self.module.get_function("init_rng")
        self.gen_rand_mat = self.module.get_function("gen_rand_mat")
        self.sum_along_axis = self.module.get_function("sum_along_axis")
        self.norm_along_axis = self.module.get_function("norm_along_axis")
        self.init_vect_sum = self.module.get_function("init_vect_sum")
        self.copy_mat = self.module.get_function("copy_mat")
        ################################################################################################################
        self.n_generators = n_generators
        seed = 1
        self.rng_states = cuda.mem_alloc(
            n_generators
            * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
        )
        self.init_rng(
            np.int32(n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_generators // 64 + 1, 1),
        )
        (free, total) = cuda.mem_get_info()

        size_block_x = 32
        size_block_y = 32
        n_blocks_x = int(self.width_mat) // (size_block_x) + 1
        n_blocks_y = int(self.width_mat) // (size_block_y) + 1
        self.grid = (n_blocks_x, n_blocks_y, 1)
        self.block = (size_block_x, size_block_y, 1)
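
The grid sizing at the end follows the usual "round up" rule; a standalone sketch of that arithmetic (the helper name is hypothetical):

def make_launch_config(width, block_edge=32):
    # one extra block covers the remainder when width is not a multiple
    # of block_edge (the same "+ 1" used in __init__ above)
    n_blocks = int(width) // block_edge + 1
    return (n_blocks, n_blocks, 1), (block_edge, block_edge, 1)

grid, block = make_launch_config(1000)   # ((32, 32, 1), (32, 32, 1))
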
Example No. 5
def getFreeMemory(show=True):
    ''' Return the free memory of the device, in MBytes. Very useful for keeping an eye on available device memory. '''
    Mb = 1024.*1024.
    Mbytes = float(cuda.mem_get_info()[0])/Mb
    if show:
      print "Free Global Memory: %f Mbytes" %Mbytes

    return cuda.mem_get_info()[0]/Mb
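
For reference, the same query as a couple of lines of straight driver calls (a sketch that assumes a context already exists, for example via pycuda.autoinit):

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

free_mb = cuda.mem_get_info()[0] / (1024. * 1024.)
print("Free Global Memory: %f Mbytes" % free_mb)
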
Example No. 6
    def propagate_eager(self, wavelength, wavefront):
        """
        'Not-Too-Good' version of the propagation on the GPU (lots of Memory issues...)
        Remove in the future
        :param wavelength:
        :param wavefront:
        :return:
        """

        N = self.N_PIX
        # free, total = cuda.mem_get_info()
        free, total = cuda.mem_get_info()
        print("Free: %.2f percent" % (free / total * 100))

        # Pupil Plane -> Image Slicer
        complex_pupil = self.pupil_masks[wavelength] * np.exp(
            1j * 2 * np.pi * self.pupil_masks[wavelength] / wavelength)
        complex_pupil_gpu = gpuarray.to_gpu(
            np.asarray(complex_pupil, np.complex64))
        plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
        cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

        # Add N_slices copies to be Masked
        complex_slicer_cpu = complex_pupil_gpu.get()
        complex_pupil_gpu.gpudata.free()

        free, total = cuda.mem_get_info()
        print("*Free: %.2f percent" % (free / total * 100))

        complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
        complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
        slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
        clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
        slicer_masks_gpu.gpudata.free()
        free, total = cuda.mem_get_info()
        print("**Free: %.2f percent" % (free / total * 100))

        # Slicer -> Pupil Mirror
        plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
        cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
        mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
        clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

        # Pupil Mirror -> Slits
        cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
        slits = complex_slicer_gpu.get()
        complex_slicer_gpu.gpudata.free()
        mirror_mask_gpu.gpudata.free()
        slit = fftshift(np.sum((np.abs(slits))**2, axis=0))

        free, total = cuda.mem_get_info()
        print("***Free: %.2f percent" % (free / total * 100))

        return slit
Example No. 7
def swap_out_to_CPU(elem):
	# prepare variables
	return_flag = True
	u, ss, sp = elem
	dp = data_list[u][ss][sp]
	bytes = dp.data_bytes

	# now we will swap this data out to the CPU
	# so first we should check that the CPU has enough free memory

	MemFree = cpu_mem_check()

	if log_type in ['memory']:
		fm,tm = cuda.mem_get_info()
		log_str = "CPU MEM CEHCK Before swap out: %s Free, %s Maximum, %s Want to use"%(print_bytes(MemFree),'-',print_bytes(bytes))
		log(log_str,'memory',log_type)


	if bytes > MemFree:
		# not enough memory for swap out to CPU
		return False
	
	# we have enough memory so we can swap out
	# (provided no other process mallocs during this swap-out operation)

	try:
		buf = numpy.empty((dp.data_memory_shape), dtype= dp.data_contents_memory_dtype)
	except:
		# we failed memory allocation in the CPU
		return False

	# do the swap out
	#cuda.memcpy_dtoh_async(buf, dp.devptr, stream=stream[1])
	cuda.memcpy_dtoh(buf, dp.devptr)
	ctx.synchronize()

	dp.devptr.free()
	dp.devptr = None
	dp.data = buf
	dp.data_dtype = numpy.ndarray
	dp.memory_type = 'memory'


	gpu_list.remove(elem)
	cpu_list.append(elem)

	if log_type in ['memory']:
		fm,tm = cuda.mem_get_info()
		log_str = "GPU MEM CEHCK After swap out: %s Free, %s Maximum, %s Want to use"%(print_bytes(fm),print_bytes(tm),print_bytes(bytes))
		
		log(log_str,'memory',log_type)


	return True
Example No. 8
def swap_out_to_CPU(elem):
    # prepare variables
    return_flag = True
    u, ss, sp = elem
    dp = data_list[u][ss][sp]
    bytes = dp.data_bytes

    # now we will swap this data out to the CPU
    # so first we should check that the CPU has enough free memory

    MemFree = cpu_mem_check()

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "CPU MEM CEHCK Before swap out: %s Free, %s Maximum, %s Want to use" % (
            print_bytes(MemFree), '-', print_bytes(bytes))
        log(log_str, 'memory', log_type)

    if bytes > MemFree:
        # not enough memory for swap out to CPU
        return False

    # we have enough memory so we can swap out
    # (provided no other process mallocs during this swap-out operation)

    try:
        buf = numpy.empty((dp.data_memory_shape),
                          dtype=dp.data_contents_memory_dtype)
    except:
        # we failed memory allocation in the CPU
        return False

    # do the swap out
    #cuda.memcpy_dtoh_async(buf, dp.devptr, stream=stream[1])
    cuda.memcpy_dtoh(buf, dp.devptr)
    ctx.synchronize()

    dp.devptr.free()
    dp.devptr = None
    dp.data = buf
    dp.data_dtype = numpy.ndarray
    dp.memory_type = 'memory'

    gpu_list.remove(elem)
    cpu_list.append(elem)

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "GPU MEM CEHCK After swap out: %s Free, %s Maximum, %s Want to use" % (
            print_bytes(fm), print_bytes(tm), print_bytes(bytes))

        log(log_str, 'memory', log_type)

    return True
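
The device-to-host half of the swap, reduced to its essentials; a sketch with hypothetical variable names rather than the module's real data structures (assumes a context from pycuda.autoinit):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

host_src = numpy.arange(1024, dtype=numpy.float32)
devptr = cuda.mem_alloc(host_src.nbytes)
cuda.memcpy_htod(devptr, host_src)          # something already living on the GPU

host_copy = numpy.empty_like(host_src)      # CPU-side destination buffer
cuda.memcpy_dtoh(host_copy, devptr)         # copy the data back to the host
devptr.free()                               # release the device allocation
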
Example No. 9
def swap_out_to_hard_disk(elem):
    # prepare variables
    return_flag = True
    u, ss, sp = elem
    dp = data_list[u][ss][sp]
    bytes = dp.data_bytes

    # now we will swap this data out from CPU to hard disk
    # so first we should check that the hard disk has enough free space
    file_name = '%d_temp' % (rank)
    os.system('df . > %s' % (file_name))

    f = open(file_name)
    s = f.read()
    f.close()

    ss = s.split()

    # get available byte
    avail = int(ss[10])

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "HARD disk MEM CEHCK Before swap out: %s Free, %s Maximum, %s Want to use" % (
            print_bytes(avail), '-', print_bytes(bytes))
        log(log_str, 'memory', log_type)

    if bytes > avail:
        # not enough space to make the swap file on the hard disk
        return False

    # now we have enough hard disk space to make the swap file
    # temp file name: "temp_data, rank, u, ss, sp"
    file_name = 'temp_data, %s, %s, %s, %s' % (rank, u, ss, sp)
    f = open(file_name, 'wb')
    f.write(dp.data)
    f.close()

    dp.data = None
    dp.hard_disk = file_name
    dp.memory_type = 'hard_disk'

    cpu_list.remove(elem)
    hard_list.append(elem)

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "CPU MEM CEHCK After swap out: %s Free, %s Maximum, %s Want to use" % (
            print_bytes(fm), print_bytes(tm), print_bytes(bytes))
        log(log_str, 'memory', log_type)

    return True
Example No. 10
def show_GPU_mem():
    import pycuda.driver as cuda

    mem_free = float(cuda.mem_get_info()[0])
    mem_free_per = mem_free / float(cuda.mem_get_info()[1])
    mem_used = float(cuda.mem_get_info()[1] - cuda.mem_get_info()[0])
    mem_used_per = mem_used / float(cuda.mem_get_info()[1])

    print '\nGPU memory available {0} Mbytes, {1} % of total \n'.format(
        mem_free / 1024**2, 100 * mem_free_per)

    print 'GPU memory used {0} Mbytes, {1} % of total \n'.format(
        mem_used / 1024**2, 100 * mem_used_per)
Example No. 11
def show_GPU_mem():
    import pycuda.driver as cuda

    mem_free = float(cuda.mem_get_info()[0])
    mem_free_per = mem_free/float(cuda.mem_get_info()[1])
    mem_used = float(cuda.mem_get_info()[1] - cuda.mem_get_info()[0])
    mem_used_per = mem_used/float(cuda.mem_get_info()[1])
    
    print '\nGPU memory available {0} Mbytes, {1} % of total \n'.format(
    mem_free/1024**2, 100*mem_free_per)
    
    print 'GPU memory used {0} Mbytes, {1} % of total \n'.format(
    mem_used/1024**2, 100*mem_used_per)
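
A variant that queries the driver once instead of four times; a sketch that is otherwise equivalent to show_GPU_mem above (assumes a current context):

import pycuda.driver as cuda

def show_gpu_mem_once():
    free, total = cuda.mem_get_info()
    used = total - free
    print('GPU memory available {0:.1f} Mbytes, {1:.1f} % of total'.format(
        free / 1024.0 ** 2, 100.0 * free / total))
    print('GPU memory used {0:.1f} Mbytes, {1:.1f} % of total'.format(
        used / 1024.0 ** 2, 100.0 * used / total))
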
Example No. 12
def swap_out_to_hard_disk(elem):
	# prepare variables
	return_flag = True
	u, ss, sp = elem
	dp = data_list[u][ss][sp]
	bytes = dp.data_bytes

	# now we will swap this data out from CPU to hard disk
	# so first we should check that the hard disk has enough free space
	file_name = '%d_temp'%(rank)
	os.system('df . > %s'%(file_name))

	f = open(file_name)
	s = f.read()
	f.close()

	ss = s.split()

	# get available byte
	avail = int(ss[10])

	if log_type in ['memory']:
		fm,tm = cuda.mem_get_info()
		log_str = "HARD disk MEM CEHCK Before swap out: %s Free, %s Maximum, %s Want to use"%(print_bytes(avail),'-',print_bytes(bytes))
		log(log_str,'memory',log_type)

	if bytes > avail:
		# not enough space to make the swap file on the hard disk
		return False

	# now we have enough hard disk space to make the swap file
	# temp file name: "temp_data, rank, u, ss, sp"
	file_name = 'temp_data, %s, %s, %s, %s'%(rank, u, ss, sp)
	f = open(file_name,'wb')
	f.write(dp.data)
	f.close()

	dp.data = None
	dp.hard_disk = file_name
	dp.memory_type = 'hard_disk'

	cpu_list.remove(elem)
	hard_list.append(elem)
	
	if log_type in ['memory']:
		fm,tm = cuda.mem_get_info()
		log_str = "CPU MEM CEHCK After swap out: %s Free, %s Maximum, %s Want to use"%(print_bytes(fm),print_bytes(tm),print_bytes(bytes))
		log(log_str,'memory',log_type)

	return True
Example No. 13
def create_array(n_elements,
                 n_dims,
                 device_array,
                 list_array,
                 seed=0,
                 ftype=FTYPE):
    """Create an arbitrary array for test_GPUHist."""
    assert n_elements > 0
    assert n_dims > 0
    center = 1e3
    sigm = 1e3
    rand = np.random.RandomState(seed)
    values = rand.normal(loc=center, scale=sigm,
                         size=(n_elements, n_dims)).astype(ftype)
    if device_array or (list_array and n_dims > 3):
        try:
            d_values = cuda.mem_alloc(values.nbytes)
            cuda.memcpy_htod(d_values, values)
            return values, d_values
        except pycuda._driver.MemoryError:
            print("Error at allocating memory")
            available_memory = cuda.mem_get_info()[0]
            print("You have %d Mbytes memory. Trying to allocate %d"
                  " bytes (%d Mbytes) of memory\n" %
                  (available_memory /
                   (1024 * 1024), values.nbytes, values.nbytes /
                   (1024 * 1024)))
            return values, values
    elif list_array and n_dims < 4:
        try:
            # We need a different shape here: Each array in a list shall
            # contain one dimension of all data.
            d_values = []
            for i in xrange(n_dims):
                tmp_values = np.asarray([v[i] for v in values])
                d_values.append(cuda.mem_alloc(tmp_values.nbytes))
                cuda.memcpy_htod(d_values[i], tmp_values)
            return values, d_values
        except pycuda._driver.MemoryError:
            print("Error at allocating memory")
            available_memory = cuda.mem_get_info()[0]
            print("You have %d Mbytes memory. Trying to allocate %d"
                  " bytes (%d Mbytes) of memory\n" %
                  (available_memory /
                   (1024 * 1024), values.nbytes, values.nbytes /
                   (1024 * 1024)))
            return values, values
    else:
        return values, values
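
The allocate-and-fall-back pattern above, condensed into a sketch (illustrative names; assumes a current context; the MemoryError class is the one the example itself catches):

import numpy as np
import pycuda
import pycuda.driver as cuda

def to_device_or_host(values):
    """Try to push *values* to the GPU; on MemoryError keep the host copy."""
    try:
        d_values = cuda.mem_alloc(values.nbytes)
        cuda.memcpy_htod(d_values, values)
        return d_values
    except pycuda._driver.MemoryError:
        free = cuda.mem_get_info()[0]
        print("Only %d Mbytes free, wanted %d bytes"
              % (free // (1024 * 1024), values.nbytes))
        return values
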
Example No. 14
    def run(self):
        drv.init()
        a0=numpy.zeros((p,),dtype=numpy.complex64)
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
#TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING)
        print drv.mem_get_info() 
        self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
        self.gpu_b = garray.zeros_like(self.gpu_a)
        self.gpu_a = garray.to_gpu(self.input_cpu)
        plan = Plan(a0.shape,context=self.ctx)
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
        self.temp = self.gpu_b.get()
        print output_cpu._closed
        self.output_cpu.put(self.temp)
Example No. 15
def init_module():
    global context, context_wrapper
    if context_wrapper is not None:
        return
    log_sys_info()
    device_id, device = select_device()
    context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD
                                  | driver.ctx_flags.MAP_HOST)
    debug("testing with context=%s", context)
    debug("api version=%s", context.get_api_version())
    free, total = driver.mem_get_info()
    debug("using device %s", device_info(device))
    debug("memory: free=%sMB, total=%sMB", int(free / 1024 / 1024),
          int(total / 1024 / 1024))
    context_wrapper = CudaContextWrapper(context)

    #generate kernel sources:
    for rgb_format, yuv_formats in COLORSPACES_MAP.items():
        m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
        KERNELS_MAP.update(m)
    _kernel_names_ = sorted(set([x[0] for x in KERNELS_MAP.values()]))
    log.info("%s csc_nvcuda kernels: %s", len(_kernel_names_),
             ", ".join(_kernel_names_))

    #now, pre-compile the kernels:
    for src_format, dst_format in KERNELS_MAP.keys():
        get_CUDA_kernel(device_id, src_format, dst_format)
    context.pop()
Example No. 16
    def filter(self, video_input):
        """
        Performs RF filtering on input video
        for all the rfs
        """
        if len(video_input.shape) == 2:
            # if input has 2 dimensions
            assert video_input.shape[1] == self.size
        else:
            # if input has 3 dimensions
            assert (video_input.shape[1] * video_input.shape[2] == self.size)
        # rasterizing inputs
        video_input.resize((video_input.shape[0], self.size))

        d_video = parray.to_gpu(video_input)
        d_output = parray.empty((self.num_neurons, video_input.shape[0]),
                                self.dtype)
        free, total = cuda.mem_get_info()
        self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize) * 3 // 4 //
                                 self.size)
        self.ONE_TIME_FILTERS -= self.ONE_TIME_FILTERS % 2
        self.ONE_TIME_FILTERS = min(self.ONE_TIME_FILTERS, self.num_neurons)
        handle = la.cublashandle()

        for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
            self.generate_filters(startbias=i, N_filters=Nfilters)
            la.dot(self.filters,
                   d_video,
                   opb='t',
                   C=d_output[i:i + Nfilters],
                   handle=handle)
        del self.filters
        return d_output.T()
Example No. 17
def init_module():
    global context, context_wrapper
    if context_wrapper is not None:
        return
    log_sys_info()
    device_id, device = select_device()
    context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD | driver.ctx_flags.MAP_HOST)
    debug("testing with context=%s", context)
    debug("api version=%s", context.get_api_version())
    free, total = driver.mem_get_info()
    debug("using device %s",  device_info(device))
    debug("memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
    context_wrapper = CudaContextWrapper(context)

    #generate kernel sources:
    for rgb_format, yuv_formats in COLORSPACES_MAP.items():
        m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
        KERNELS_MAP.update(m)
    _kernel_names_ = sorted(set([x[0] for x in KERNELS_MAP.values()]))
    log.info("%s csc_nvcuda kernels: %s", len(_kernel_names_), ", ".join(_kernel_names_))

    #now, pre-compile the kernels:
    for src_format, dst_format in KERNELS_MAP.keys():
        get_CUDA_kernel(device_id, src_format, dst_format)
    context.pop()
Example No. 18
def setDevice(ndev=None):
    ''' To use CUDA or OpenCL you need a device and a context to establish the communication with it '''
    cuda.init()
    nDevices = cuda.Device.count()
    print "Available Devices:"
    for i in range(nDevices):
        dev = cuda.Device(i)
        try:
            mem = cuda.mem_get_info()[-i - 1]
        except:
            mem = 0
        print "  Device {0}: {1}, Total (MB) {2:.1f}, Free (MB) {3:.1f}".format(
            i, dev.name(),
            dev.total_memory() / 2.**20, mem / 2.**20)  #mem/2.**20 )
    devNumber = 0
    if nDevices > 1:
        if ndev == None:
            devNumber = int(raw_input("Select device number: "))
        else:
            devNumber = ndev
    dev = cuda.Device(devNumber)
    #cuda.Context.pop()  #Disable previus CUDA context
    ctxCUDA = dev.make_context()
    devdata = DeviceData(dev)
    print "Using device {0}: {1}".format(devNumber, dev.name())
    return ctxCUDA, dev, devdata
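
A minimal, non-interactive sketch of the same enumeration (mem_get_info() needs an active context, so only total_memory() is reported per device before one is created):

import pycuda.driver as cuda
from pycuda.tools import DeviceData

cuda.init()
for i in range(cuda.Device.count()):
    dev = cuda.Device(i)
    print("Device %d: %s, total %.1f MB" % (i, dev.name(),
                                            dev.total_memory() / 2. ** 20))

ctx = cuda.Device(0).make_context()     # pick device 0 unconditionally
devdata = DeviceData(cuda.Device(0))
ctx.pop()                               # release the context when done
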
Example No. 19
File: vrf.py Project: bionet/vtem
    def filter(self, V):
        """
        Filter a video V
        Must set up parameters of CS RF first
        
        Parameters
        ----------
        V : 3D ndarray, with shape (num_frames, Px, Py)
           
        Returns
        -------
        the filtered output by the gabor filters specified in self
        output is a PitchArray with shape (num_neurons, num_frames),
        jth row of which is the output of jth gabor filter

        """
        d_output = parray.empty((self.num_neurons, V.shape[0]), self.dtype)
        d_video = parray.to_gpu(V.reshape(V.shape[0], V.shape[1]*V.shape[2]))
    
        free,total = cuda.mem_get_info()
        self.ONE_TIME_FILTERS = (free / self.dtype.itemsize) * 3/4 / self.Pxall / self.Pyall
        
        handle = la.cublashandle()
        for i in np.arange(0,self.num_neurons,self.ONE_TIME_FILTERS):
            Nfilters =  min(self.ONE_TIME_FILTERS, self.num_neurons - i)
            self.generate_visual_receptive_fields(startbias = i, N_filters = Nfilters)
            cublasDgemm(handle.handle, 't','n', V.shape[0], int(Nfilters), self.Pxall*self.Pyall, self.dx*self.dy, d_video.gpudata, d_video.ld, self.filters.gpudata, self.filters.ld, 0, int(int(d_output.gpudata)+int(d_output.ld*i*d_output.dtype.itemsize)) , d_output.ld)
        return d_output.T()
Example No. 20
    def filter(self, video_input):
        """
        Performs RF filtering on input video
        for all the rfs
        """
        if len(video_input.shape) == 2:
            # if input has 2 dimensions
            assert video_input.shape[1] == self.size
        else:
            # if input has 3 dimensions
            assert (video_input.shape[1]*video_input.shape[2] ==
                    self.size)
        # rasterizing inputs
        video_input.resize((video_input.shape[0], self.size))

        d_video = parray.to_gpu(video_input)
        d_output = parray.empty((self.num_neurons, video_input.shape[0]),
                                self.dtype)
        free, total = cuda.mem_get_info()
        self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize)
                                 * 3 // 4 // self.size)
        self.ONE_TIME_FILTERS -= self.ONE_TIME_FILTERS % 2
        self.ONE_TIME_FILTERS = min(self.ONE_TIME_FILTERS, self.num_neurons)
        handle = la.cublashandle()

        for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
            self.generate_filters(startbias=i, N_filters=Nfilters)
            la.dot(self.filters, d_video, opb='t',
                   C=d_output[i: i+Nfilters],
                   handle=handle)
        del self.filters
        return d_output.T()
Example No. 21
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.
    """
    global cuda_capable
    global cuda_multiply_inplace_c128
    global cuda_halve_c128
    global cuda_real_c128
    if cuda_capable is True:
        logger.info("CUDA previously enabled, currently %s available memory" % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning("module pycuda not found, CUDA not enabled")
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa, analysis:ignore
    except ImportError:
        logger.warning("pycuda.autoinit could not be imported, likely " "a hardware error, CUDA not enabled")
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning("module scikits.cuda not found, CUDA not " "enabled")
        return

    # Make our multiply inplace kernel
    from pycuda.elementwise import ElementwiseKernel

    # let's construct our own CUDA multiply in-place function
    cuda_multiply_inplace_c128 = ElementwiseKernel(
        "pycuda::complex<double> *a, pycuda::complex<double> *b", "b[i] *= a[i]", "multiply_inplace"
    )
    cuda_halve_c128 = ElementwiseKernel("pycuda::complex<double> *a", "a[i] /= 2.0", "halve_value")
    cuda_real_c128 = ElementwiseKernel("pycuda::complex<double> *a", "a[i] = real(a[i])", "real_value")

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except:
        logger.warning("Device does not support 64-bit FFTs, " "CUDA not enabled")
        return
    cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info("Enabling CUDA with %s available memory" % sizeof_fmt(mem_get_info()[0]))
Example No. 22
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    log.info("CUDA %s / PyCUDA %s, found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]),
             pycuda.VERSION_TEXT, ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)",
                         device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log("   created context=%s", context)
            log("   api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log("   memory: free=%sMB, total=%sMB", int(free / 1024 / 1024),
                int(total / 1024 / 1024))
            log("   multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT),
                device.get_attribute(da.CLOCK_RATE))
            log("   max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log("   max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log("   maximum texture size: %sx%s", max_width, max_height)
            log("   max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor << 4) + SMminor
            log("   compute capability: %#x (%s.%s)", compute, SMmajor,
                SMminor)
            try:
                DEVICES.append(i)
                log.info("  + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100 * free / total, SMmajor,
                         SMminor)
            finally:
                context.pop()
        except Exception, e:
            log.error("error on device %s: %s", (device or i), e)
Example No. 23
 def is_gpu_memory_enough(self, a):
     if CUDA:
         rest, total = driver.mem_get_info()
         
         if (sys.getsizeof(a) * 2) < rest:
             return True
     else:
         return True
Example No. 25
    def is_memory_enough(a):
        try:
            rest, total = driver.mem_get_info()
        except driver.LogicError: # child thread cannot use context from the main thread...
            # the following does not work yet

            from pycuda import tools
            import skcuda
            
            driver.init()
            context = tools.make_default_context() # try to make as new context, but cannot deactivate the old context stack
            device = context.get_device()
            skcuda.misc.init_context(device)
            rest, total = driver.mem_get_info()
            
        if (sys.getsizeof(a) * 2) < rest:
            return True
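
A sketch of the same check based on ndarray.nbytes, which measures the data buffer directly (sys.getsizeof can under-report for array views); the names are illustrative:

import sys
import numpy as np
import pycuda.driver as driver

def fits_on_gpu(a, safety_factor=2):
    free, _total = driver.mem_get_info()
    nbytes = a.nbytes if isinstance(a, np.ndarray) else sys.getsizeof(a)
    return nbytes * safety_factor < free
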
Example No. 26
def select_device(preferred_device_id=-1, preferred_device_name=None, min_compute=0):
    if preferred_device_name is None:
        preferred_device_name = get_pref("device-name")
    if preferred_device_id<0:
        device_id = get_pref("device-id")
        if device_id is not None and device_id>=0:
            preferred_device_id = device_id
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is True]
    nok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is not True]
    for list_name, device_list in {"OK" : ok_devices, "failing" : nok_devices}.items():
        selected_device_id = -1
        selected_device = None
        log("will test %s device%s from %s list: %s", len(device_list), engs(device_list), list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                log("device %i", device_id)
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id, device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
                tpct = 100*free/total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                if compute<min_compute:
                    log("ignoring device %s: compute capability %#x (minimum %#x required)", device_info(device), compute, min_compute)
                elif device_id==preferred_device_id:
                    l = log
                    if len(device_list)>1:
                        l = log.info
                    l("device matches preferred device id %s: %s", preferred_device_id, device_info(device))
                    return device_id, device
                elif preferred_device_name and device_info(device).find(preferred_device_name)>=0:
                    log("device matches preferred device name: %s", preferred_device_name)
                    return device_id, device
                elif tpct>=MIN_FREE_MEMORY and tpct>free_pct:
                    log("device has enough free memory: %i (min=%i, current best device=%i)", tpct, MIN_FREE_MEMORY, free_pct)
                    selected_device = device
                    selected_device_id = device_id
                    free_pct = tpct
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id>=0 and selected_device:
            l = log
            if len(devices)>1:
                l = log.info
            l("selected device %s: %s", device_id, device_info(device))
            return selected_device_id, selected_device
    return -1, None
Example No. 27
 def measure_gpu_memory(ident=""):
     global DATA_MEM
     torch.cuda.synchronize()
     old = DATA_MEM[1] - DATA_MEM[0]
     mem = mem_get_info()
     now = mem[1] - mem[0]
     text = "[Memory] {} {} + {} = {}".format(ident, format_memory(old), format_memory(now - old), format_memory(now))
     DATA_MEM = mem
     return text
Example No. 28
def gpu_stat():
    if torch.cuda.is_available():

        def pretty_bytes(bytes, precision=1):
            abbrevs = ((1<<50, 'PB'),(1<<40, 'TB'),(1<<30, 'GB'),(1<<20, 'MB'),(1<<10, 'kB'),(1, 'bytes'))
            if bytes == 1:
                return '1 byte'
            for factor, suffix in abbrevs:
                if bytes >= factor:
                    break
            return '%.*f%s' % (precision, bytes / factor, suffix)

        device = autoinit.device
        print()
        print( 'GPU Name: %s' % device.name())
        print( 'GPU Memory: %s' % pretty_bytes(device.total_memory()))
        print( 'CUDA Version: %s' % str(driver.get_version()))
        print( 'GPU Free/Total Memory: %d%%' % ((driver.mem_get_info()[0] /driver.mem_get_info()[1]) * 100))
Example No. 29
def select_device(preferred_device_id=-1, preferred_device_name=None, min_compute=0):
    if preferred_device_name is None:
        preferred_device_name = get_pref("device-name")
    if preferred_device_id<0:
        device_id = get_pref("device-id")
        if device_id>=0:
            preferred_device_id = device_id
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is True]
    nok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is not True]
    for list_name, device_list in {"OK" : ok_devices, "failing" : nok_devices}.items():
        selected_device_id = None
        selected_device = None
        log("will test %s device%s from %s list: %s", len(device_list), engs(device_list), list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id, device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
                tpct = 100*free/total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                if compute<min_compute:
                    log("ignoring device %s: compute capability %#x (minimum %#x required)", device_info(device), compute, min_compute)
                elif device_id==preferred_device_id:
                    l = log
                    if len(device_list)>1:
                        l = log.info
                    l("device matches preferred device id %s: %s", preferred_device_id, device_info(device))
                    return device_id, device
                elif preferred_device_name and device_info(device).find(preferred_device_name)>=0:
                    log("device matches preferred device name: %s", preferred_device_name)
                    return device_id, device
                elif tpct>free_pct:
                    selected_device = device
                    selected_device_id = device_id
                    free_pct = tpct
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id>=0 and selected_device:
            l = log
            if len(devices)>1:
                l = log.info
            l("selected device %s: %s", device_id, device_info(device))
            return selected_device_id, selected_device
    return -1, None
Example No. 30
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return  DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    DEVICE_INFO = {}
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus==0:
        log.info("CUDA %s / PyCUDA %s, no devices found", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log("   created context=%s", context)
                log("   api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log("   memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
                log("   multi-processors: %s, clock rate: %s", device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log("   max block sizes: (%s, %s, %s)", device.get_attribute(da.MAX_BLOCK_DIM_X), device.get_attribute(da.MAX_BLOCK_DIM_Y), device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log("   max grid sizes: (%s, %s, %s)", device.get_attribute(da.MAX_GRID_DIM_X), device.get_attribute(da.MAX_GRID_DIM_Y), device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log("   maximum texture size: %sx%s", max_width, max_height)
                log("   max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                log("   compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i==0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                DEVICES.append(i)
                log.info("  + %s (memory: %s%% free, compute: %s.%s)", device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
Example No. 31
    def infer(self, input_img, output_size, num_binding):
        #self.runtime=self.create_runtime()
        #self.context=self.create_context()

        assert (self.__engine.get_nb_bindings() == num_binding)
        output = np.empty(output_size, dtype=np.float32)

        d_input = cuda.mem_alloc(self.batchsize * input_img.size *
                                 input_img.dtype.itemsize)
        d_output = cuda.mem_alloc(self.batchsize * output.size *
                                  output.dtype.itemsize)

        # pointers to gpu memory
        bindings = [int(d_input), int(d_output)]

        stream = cuda.Stream()

        #transfer input data to device
        cuda.memcpy_htod_async(d_input, input_img, stream)

        #execute model
        self.context.enqueue(self.batchsize, bindings, stream.handle, None)

        #transfer predictions back
        cuda.memcpy_dtoh_async(output, d_output, stream)

        #synchronize threads
        stream.synchronize()

        print 'all activities in the stream are done: {}'.format(
            stream.is_done())
        #destroy cuda context
        d_input.free()
        d_output.free()

        print 1999 - cuda.mem_get_info()[0] / 1048576, cuda.mem_get_info(
        )[1] / 1048576

        #self.context.destroy

        #self.runtime.destroy()

        return output
Example No. 32
def ShowGPUInfo():
    (free,total) = driver.mem_get_info()
    print('Global memory occupancy:%f%% free' % (free*100 / total))
    for devicenum in range(driver.Device.count()):
        device = driver.Device(devicenum)
        attrs = device.get_attributes()
        #Beyond this point is just pretty printing
        print('\n===Attributes for device %d' % devicenum)
        for (key,value) in attrs.iteritems():
            print('    %s:%s' % (str(key), str(value)))
Example No. 33
def mem_check_and_malloc(bytes):
    fm, tm = cuda.mem_get_info()

    if log_type in ['memory']:
        log_str = "RANK %d, GPU MEM CEHCK before malloc: %s Free, %s Maximum, %s Want to use" % (
            rank, print_bytes(fm), print_bytes(tm), print_bytes(bytes))
        log(log_str, 'memory', log_type)

    # we have enough memory

    if fm < bytes:
        # we don't have enough memory, so free buffers from the data pool
        print "BUFFER POOL"
        size = fm
        for elem in list(data_pool):
            usage = elem['usage']
            devptr = elem['devptr']
            devptr.free()
            print "FREE data", usage
            size += usage
            data_pool.remove(elem)
            if size >= bytes: break

        fm, tm = cuda.mem_get_info()

    if fm >= bytes:
        # we have enough memory, just malloc
        afm, tm = cuda.mem_get_info()

        devptr = cuda.mem_alloc(bytes)

        bfm, tm = cuda.mem_get_info()

        if log_type in ['memory']:
            fm, tm = cuda.mem_get_info()
            log_str = "RANK %d, GPU MALLOC AFTER: %s Free, %s Maximum, %s Want to use" % (
                rank, print_bytes(fm), print_bytes(tm), print_bytes(bytes))
            log(log_str, 'memory', log_type)
        return True, devptr

    # we don't have enough memory
    return False, None
Example No. 34
def mem_check_and_malloc(bytes):
	fm,tm = cuda.mem_get_info()

	if log_type in ['memory']:
		log_str = "RANK %d, GPU MEM CEHCK before malloc: %s Free, %s Maximum, %s Want to use"%(rank, print_bytes(fm),print_bytes(tm),print_bytes(bytes))
		log(log_str,'memory',log_type)
		

	# we have enough memory

	if fm < bytes:
		# we don't have enough memory, so free buffers from the data pool
		print "BUFFER POOL"
		size = fm
		for elem in list(data_pool):
			usage = elem['usage']
			devptr = elem['devptr']
			devptr.free()
			print "FREE data", usage
			size += usage
			data_pool.remove(elem)
			if size >= bytes: break
	
		fm,tm = cuda.mem_get_info()

	if fm >= bytes:
		# we have enough memory, just malloc
		afm,tm = cuda.mem_get_info()

		devptr = cuda.mem_alloc(bytes)
		
		bfm,tm = cuda.mem_get_info()

		if log_type in ['memory']:
			fm,tm = cuda.mem_get_info()
			log_str = "RANK %d, GPU MALLOC AFTER: %s Free, %s Maximum, %s Want to use"%(rank, print_bytes(fm),print_bytes(tm),print_bytes(bytes))
			log(log_str, 'memory', log_type)
		return True, devptr


	# we don't have enough memory
	return False, None
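
Stripped of the buffer-pool eviction, the core of mem_check_and_malloc is "check free, then allocate"; a bare-bones sketch:

import pycuda.driver as cuda

def try_malloc(nbytes):
    free, _total = cuda.mem_get_info()
    if free < nbytes:
        return False, None          # caller has to free something first
    return True, cuda.mem_alloc(nbytes)
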
Example No. 35
    def meminfo(self,kernel,k=-1,o=-1,threads=[],name=""):
        (free,total)=cuda.mem_get_info()
        shared=kernel.shared_size_bytes
        regs=kernel.num_regs
        local=kernel.local_size_bytes
        const=kernel.const_size_bytes
        mbpt=kernel.max_threads_per_block
        devdata=ctools.DeviceData()
        occupancy=ctools.OccupancyRecord(devdata,threads[0], shared_mem=shared,registers=regs)

        util.log.info("%s(%03d,%d)=L:%d,S:%d,R:%d,C:%d,MT:%d,T:%d,OC:%f,Free:%d"%(name,k,o,local,shared,regs,const,mbpt,threads[0],occupancy.occupancy,(free*100)/total))
Example No. 36
def load_device(device_id):
    log("load_device(%i)", device_id)
    device = driver.Device(device_id)
    log("select_device: testing device %s: %s", device_id, device_info(device))
    cf = driver.ctx_flags
    context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
    log("created context=%s", context)
    free, total = driver.mem_get_info()
    log("memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
    tpct = 100*free//total
    return device, context, tpct
Example No. 37
def select_device(preferred_device_id=DEFAULT_CUDA_DEVICE_ID, min_compute=0):
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [
        device_id for device_id in devices
        if DEVICE_STATE.get(device_id, True) is True
    ]
    nok_devices = [
        device_id for device_id in devices
        if DEVICE_STATE.get(device_id, True) is not True
    ]
    for list_name, device_list in {
            "OK": ok_devices,
            "failing": nok_devices
    }.items():
        selected_device_id = None
        selected_device = None
        log("will test %s devices from %s list: %s", len(device_list),
            list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id,
                    device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD
                                              | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB", int(free / 1024 / 1024),
                    int(total / 1024 / 1024))
                tpct = 100 * free / total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                if compute < min_compute:
                    log(
                        "ignoring device %s: compute capability %#x (minimum %#x required)",
                        device_info(device), compute, min_compute)
                elif device_id == preferred_device_id:
                    return device_id, device
                elif tpct > free_pct:
                    selected_device = device
                    selected_device_id = device_id
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id >= 0 and selected_device:
            log("select device: %s / %s", device_id, device)
            return selected_device_id, selected_device
    return -1, None
Example No. 38
    def run(self):
        try:
            #Initialise this device
            self.local.dev = cuda.Device(self.device)
            self.local.ctx = self.local.dev.make_context()
            self.local.ctx.push()
            (free,total)=cuda.mem_get_info()
            util.log.info("Initialising CUDA device %d:(%.2f%% Free)"%(self.device,(free*100.0/total)))
        except pycuda._driver.MemoryError:
            util.log.info("Balls")
            raise
            return
        
        #Initialise the kernel
        self.local.kernels=SourceModule(self.r_kernels)
                
        gridmax=65535
        
        #Kernels
        self.k_osbprepare=self.local.kernels.get_function("lk_osbprepare_permutations")
        self.k_osbsolve=self.local.kernels.get_function("solve_permutations")
        self.k_osblk=self.local.kernels.get_function("lk_max_permutations")
        self.k_solve=self.local.kernels.get_function("solve")        
        self.k_isboptimise=self.local.kernels.get_function("isb_optimise_pk")
        self.k_isboptimise_inc=self.local.kernels.get_function("isb_optimise_inc")
        self.k_calcpsd=self.local.kernels.get_function("calc_psd")
        self.k_osb_optimise_p=self.local.kernels.get_function("osb_optimise_p")

        #loop to empty queue
        while True:
            #grab args from queue (block until received)
            queueitem=self.argqueue.get()
            func=queueitem[0]
            args=queueitem[1:]
            
            if func=='osb_optimise_p':
                result=self.osb_optimise_p(*args)
                self.resqueue.put((func,result))
            elif func=='isb_optimise_p':
                result=self.isb_optimise_p(*args)
                self.resqueue.put((func,result))
            elif func=='isb_optimise_inc':
                result=self.isb_optimise_inc(*args)
                self.resqueue.put((func,result))
            elif func=='mipb_update_cost':
                result=self.mipb_update_cost(*args)
                self.resqueue.put((func,result))
            elif func=='calc_psd':
                result=self.calc_psd(*args)
                self.resqueue.put((func,result))
            else:
                self.resqueue.put(None)
            
            self.argqueue.task_done()#nothing seems to get past this
Example No. 39
    def usage_info(self):
        (free, total) = drv.mem_get_info()
        print("Global memory occupancy:{}% free".format(free * 100 / total))

        for devicenum in range(drv.Device.count()):
            device = drv.Device(devicenum)
            attrs = device.get_attributes()

            #Beyond this point is just pretty printing
            print("\n===Attributes for device {}".format(device))
            for (key, value) in attrs.items():
                print("{}:{}".format(str(key), str(value)))
Example No. 40
def remaining_mem(N, L, flag):
    meminfo = cuda.mem_get_info()
    print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
    available_mem = float(meminfo[0])
    available_mem /= np.dtype(np.float32).itemsize
    NL = N * L
    if flag == 0:
        available_mem -= NL
    x = available_mem
    temp = N * 2
    x /= temp
    return int(x)
Example No. 41
def cuda_mem_check(device_dictionary,cache_size,arrays):
    """Function to check if there will be enought memory in the GPU
       to perform the computation"""

    module_logger.info('Checking if the system has enough memory on device.')

    input_size=0

    for array in arrays:
        input_size = input_size + array.nbytes

    cache_size_bytes = cache_size *4

    free,total= driver.mem_get_info()

    max_mem_size=512*1000

    memory_limit=(total-input_size)/device_dictionary['MULTIPROCESSOR_COUNT']/device_dictionary[
        'MAX_THREADS_PER_MULTIPROCESSOR']

    limitator=min(max_mem_size,memory_limit)

    if cache_size_bytes >= limitator:

        module_logger.error("Cache memory per thread ("+bytes2human(cache_size_bytes)+") is greater than memory "
                            "limitation per thread ("+bytes2human(limitator)+")")
        exit()


    elif input_size >= total:

        module_logger.error("The arrays to transfer ("+bytes2human(input_size)+") is greater than global memory "
                            "limitations ("+bytes2human(total)+")")
        exit()


    else:

        headers=("Cache size per thread","Maximum memory size per thread")
        printdata=(bytes2human(cache_size_bytes),bytes2human(limitator))
        stype('\n'+'Memory limitation status on device:')
        stype (tabulate.tabulate(zip(headers,printdata), headers=['Variable Name', 'Value'],
                                             tablefmt='rst')+'\n')

        module_logger.ok('The system has enough memory to perform the calculation.')
        module_logger.info('Using '+bytes2human(cache_size_bytes)+' out of '+bytes2human(limitator)+'.')

        # module_logger.warning("Warning: The cuda kernel will use max capacity of graphics procesors, the screen could "
        #                "become unresponsible during the process.")

        stype('\n'+bcolors.WARNING +"Warning: The cuda kernel will use max capacity of graphics processors,"
                                    +'\n the screen could become unresponsive during the process.'+ bcolors.ENDC+'\n')
Example No. 42
def showDeviceAttributes():

    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy:%f%% free" % (free * 100 / total))

    for devicenum in range(cuda.Device.count()):
        device = cuda.Device(devicenum)
        attrs = device.get_attributes()

        #Beyond this point is just pretty printing
        print("\n===Attributes for device %d" % devicenum)
        for (key, value) in attrs.iteritems():
            print("%s:%s" % (str(key), str(value)))
Example No. 43
def showDeviceAttributes():

    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy:%f%% free" % (free * 100 / total))

    for devicenum in range(cuda.Device.count()):
        device = cuda.Device(devicenum)
        attrs = device.get_attributes()

        #Beyond this point is just pretty printing
        print("\n===Attributes for device %d" % devicenum)
        for (key, value) in attrs.iteritems():
            print("%s:%s" % (str(key), str(value)))
Exemplo n.º 44
0
 def device_usage_str(self):
     '''Returns a formatted string displaying the memory usage.'''
     s = 'device usage:\n'
     s += '-'*10 + '\n'
     #s += format_array('vertices', self.vertices) + '\n'
     #s += format_array('triangles', self.triangles) + '\n'
     s += format_array('nodes', self.nodes) + '\n'
     s += '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)) + '\n'
     s += '-'*10 + '\n'
     free, total = cuda.mem_get_info()
     s += '%-15s %6s %6s' % ('device total', '', format_size(total)) + '\n'
     s += '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + '\n'
     s += '%-15s %6s %6s' % ('device free', '', format_size(free)) + '\n'
     return s
Exemplo n.º 45
0
 def __call__(self, tag, description):
     if api.is_gpu_api_cuda():
         gpu_free, gpu_total = cuda.mem_get_info()
     elif api.is_gpu_api_opencl():
         ctx = cltools.get_last_context()
         device = ctx.get_info(cl.context_info.DEVICES)[0]
         gpu_total = device.get_info(cl.device_info.GLOBAL_MEM_SIZE)
         gpu_free = gpu_total  # free memory info not available to opencl...
     if tag is None:
         self['gpu_total'] = gpu_total
     else:
         self['%s' % tag] = description
         self['%s_gpu_used' % tag] = gpu_total - gpu_free
     pass
Exemplo n.º 46
0
 def device_usage_str(self):
     '''Returns a formatted string displaying the memory usage.'''
     s = 'device usage:\n'
     s += '-'*10 + '\n'
     #s += format_array('vertices', self.vertices) + '\n'
     #s += format_array('triangles', self.triangles) + '\n'
     s += format_array('nodes', self.nodes) + '\n'
     s += '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)) + '\n'
     s += '-'*10 + '\n'
     free, total = cuda.mem_get_info()
     s += '%-15s %6s %6s' % ('device total', '', format_size(total)) + '\n'
     s += '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + '\n'
     s += '%-15s %6s %6s' % ('device free', '', format_size(free)) + '\n'
     return s
Exemplo n.º 47
0
    def _gpuAlloc(self):
        #Get GPU information
        self.freeMem = cuda.mem_get_info()[0] * .5 * .8 # use at most 40% of free memory (half of free, then 80% of that)
        self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols)))    # 4 bytes per float32 value per column
        # set max rows to smaller number to save memory usage
        if self.totalRows < self.maxPossRows:
            print "reducing max rows to reduce memory use on GPU"
            self.maxPossRows = self.totalRows

        # create pagelocked buffers and GPU arrays
        self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
        self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
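A small worked sketch of the row-budget arithmetic above, using illustrative numbers; the 0.5 * 0.8 factors cap usage at 40% of the memory reported free, and the divisor is 4 bytes per float32 value times the number of columns.

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

free_bytes = cuda.mem_get_info()[0]
budget = free_bytes * .5 * .8                    # 40% of currently free memory
total_cols = 10000                               # illustrative grid width
max_rows = int(np.floor(budget / (np.float32().nbytes * total_cols)))
print("can stage up to %d float32 rows of %d columns per transfer" % (max_rows, total_cols))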
Exemplo n.º 48
0
    def _gpuAlloc(self):
        #Get GPU information
        self.freeMem = cuda.mem_get_info()[0] * .5 * .8 # use at most 40% of free memory (half of free, then 80% of that)
        self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols)))    # 4 bytes per float32 value per column
        # set max rows to smaller number to save memory usage
        if self.totalRows < self.maxPossRows:
            print "reducing max rows to reduce memory use on GPU"
            self.maxPossRows = self.totalRows

        # create pagelocked buffers and GPU arrays
        self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
        self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
Exemplo n.º 49
0
def get_cuda_memory():
    """Get the amount of free memory for CUDA operations

    Returns
    -------
    memory : str
        The amount of available memory as a human-readable string.
    """
    if not _cuda_capable:
        warn('CUDA not enabled, returning zero for memory')
        mem = 0
    else:
        from pycuda.driver import mem_get_info
        mem = mem_get_info()[0]
    return sizeof_fmt(mem)
Exemplo n.º 50
0
 def free_gpu(self,):
     self.rng_states.free()
     self.gpu_vect_sum.free()
     self.gpu_new_data.free()
     self.gpu_init_data.free()
     (free, total) = cuda.mem_get_info()
     print(
         (
             "Global memory occupancy after cleaning processes: %f%% free"
             % (free * 100 / total)
         )
     )
     print(("Global free memory :%i Mo free" % (free / 10 ** 6)))
     del self.module
     self.ctx.detach()
Exemplo n.º 51
0
def get_cuda_memory():
    """Get the amount of free memory for CUDA operations.

    Returns
    -------
    memory : str
        The amount of available memory as a human-readable string.
    """
    if not _cuda_capable:
        warn('CUDA not enabled, returning zero for memory')
        mem = 0
    else:
        from pycuda.driver import mem_get_info
        mem = mem_get_info()[0]
    return sizeof_fmt(mem)
Exemplo n.º 52
0
def run():
	(free,total) = cuda.mem_get_info()
	#free = 2.e9
	print "Device global memory {0:.2f}GB total, {1:0.2f}GB free".format(total/1.e9, free/1.e9)
	print "Roughly {0:.2f}GB required for large box".format(float(NBYTES*4)/1.e9)
	if not os.path.exists(parent_folder+'/Boxes'):
		os.makedirs(parent_folder+"/Boxes")
	if NBYTES*4 < free:
		print "Congratulations, your GPU has enough memory, running without stitching"
		init()
	else:
		N = DIM
		while float(N)/DIM*NBYTES*8 > free:
			N /= 2
		print "Stitching with {} meta block size".format(N)
		init_stitch(np.int32(N))
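The halving loop in run() can be isolated into a hedged, stand-alone sketch; dim, nbytes_per_box and free_bytes below are made-up stand-ins for DIM, NBYTES and the reported free memory, and the factor 8 is kept from the original loop.

def pick_meta_block(dim, nbytes_per_box, free_bytes):
    # Halve the meta block edge until the stitched slab fits in free memory.
    n = dim
    while float(n) / dim * nbytes_per_box * 8 > free_bytes:
        n //= 2
    return n

print(pick_meta_block(dim=1024, nbytes_per_box=6 * 10**9, free_bytes=4 * 10**9))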
Exemplo n.º 53
0
    def __init__(self, interactive=True, logfile=None):
        DeviceMemoryPool.__init__(self)
        self.last_free, _ = cuda.mem_get_info()
        self.interactive = interactive

        if logfile is None:
            import sys
            logfile = sys.stdout

        self.logfile = logfile

        from weakref import WeakKeyDictionary
        self.blocks = WeakKeyDictionary()

        if interactive:
            from pytools.diskdict import DiskDict
            self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics")
Exemplo n.º 54
0
def Optimize_Blocks():
    # Get memory info
    print "Memory info:"
    (free, total) = cuda.mem_get_info()
    print round(100.0 * free / total, 2), "% free of", round(total * 1.0 / pow(1024, 3), 4), "GB"
    # Find the largest numpy array
    mem_list = [1.0]
    np_array_size = 1
    opts = 0
    counter = 0
    while opts < free:
        opts = free * 1.0 / sys.getsizeof(mem_list) * counter
        counter += 1
        print free, opts, sys.getsizeof(mem_list) * counter
    print "Max list size:", counter
    # while np_array_size < free:
    #    mem_list = mem_list.append(1.0)
    #    np_array_size = sys.getsizeof(mem_list)
    # np = array(np)
    print "Max Numpy array size:", sys.getsizeof(np)
    # Create lists to store multiple device info
    threads_per_block = []
    grid_dim_x = []

    # Loop through devices
    for devicenum in range(cuda.Device.count()):
        # Initialize the device
        device = cuda.Device(devicenum)

        # Get dictionary of device info
        attrs = device.get_attributes()

        # Get max threads per block info
        tpb = attrs[pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK]
        if tpb not in threads_per_block:
            threads_per_block.append(tpb)

        # Get max grid dimension
        mgd = attrs[pycuda._driver.device_attribute.MAX_GRID_DIM_X]
        if mgd not in grid_dim_x:
            grid_dim_x.append(mgd)

    print "You should use the following code in your Python code:"
    print "block_size =", str(min(threads_per_block))
    print "blocks =", str(min(grid_dim_x) / min(threads_per_block))
Exemplo n.º 55
0
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return  DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    log.info("PyCUDA version=%s", pycuda.VERSION_TEXT)
    ngpus = driver.Device.count()
    log.info("CUDA version=%s found %s device(s):", ".".join([str(x) for x in driver.get_version()]), ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log("   created context=%s", context)
            log("   api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log("   memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
            log("   multi-processors: %s, clock rate: %s", device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log("   max block sizes: (%s, %s, %s)", device.get_attribute(da.MAX_BLOCK_DIM_X), device.get_attribute(da.MAX_BLOCK_DIM_Y), device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log("   max grid sizes: (%s, %s, %s)", device.get_attribute(da.MAX_GRID_DIM_X), device.get_attribute(da.MAX_GRID_DIM_Y), device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log("   maximum texture size: %sx%s", max_width, max_height)
            log("   max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor<<4) + SMminor
            log("   compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory %s%% free, compute %#x)", device_info(device), 100*free/total, compute)
            finally:
                context.pop()
        except Exception, e:
            log.error("error on device %s: %s", (device or i), e)
Exemplo n.º 56
0
    def test_mempool(self):
        from pycuda.tools import bitlog2
        from pycuda.tools import DeviceMemoryPool

        pool = DeviceMemoryPool()
        maxlen = 10
        queue = []
        free, total = drv.mem_get_info()

        e0 = bitlog2(free)

        for e in range(e0-6, e0-4):
            for i in range(100):
                queue.append(pool.allocate(1<<e))
                if len(queue) > 10:
                    queue.pop(0)
        del queue
        pool.stop_holding()
Exemplo n.º 57
0
    def filter_image(self, image_input):
        """
        Performs RF filtering on input video
        for all the rfs
        """
        # video dimensions should match screen dimensions
        # numpy resize operation doesn't make any checks
        if len(image_input.shape) == 2:
            # if input has 2 dimensions
            assert image_input.shape[1] == self.size
        else:
            # if input has 3 dimensions
            assert (image_input.shape[1]*image_input.shape[2] ==
                    self.size)

        # rasterizing inputs
        image_input.resize((1, self.size))

        d_image = parray.to_gpu(image_input)
        d_output = parray.empty((self.num_neurons, image_input.shape[0]),
                                self.dtype)
        free, total = cuda.mem_get_info()
        self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize)
                                 * 3 // 4 // self.size)
        self.ONE_TIME_FILTERS -= self.ONE_TIME_FILTERS % 2
        self.ONE_TIME_FILTERS = min(self.ONE_TIME_FILTERS, self.num_neurons)
        handle = la.cublashandle()

        for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
            self.generate_filters(startbias=i, N_filters=Nfilters)
            la.dot(self.filters, d_image, opb='t',
                   C=d_output[i: i+Nfilters],
                   handle=handle)
        del self.filters
        return d_output.T()
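A CPU-side, hedged restatement of the chunking pattern in filter_image: only a memory-bounded batch of filters is materialised at a time before being applied to the input (all names and sizes below are illustrative).

import numpy as np

def apply_filters_in_chunks(make_filters, image_vec, num_neurons, chunk):
    # Mirror of the ONE_TIME_FILTERS loop above: generate `chunk` filters,
    # apply them, then move on, so peak memory stays bounded.
    out = np.empty(num_neurons, dtype=np.float32)
    for i in range(0, num_neurons, chunk):
        n = min(chunk, num_neurons - i)
        filters = make_filters(i, n)            # shape (n, image_vec.size)
        out[i:i + n] = filters.dot(image_vec)   # one batch of responses
    return out

rng = np.random.default_rng(0)
image = rng.standard_normal(64).astype(np.float32)
responses = apply_filters_in_chunks(
    lambda start, n: rng.standard_normal((n, image.size)).astype(np.float32),
    image, num_neurons=10, chunk=4)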