def cufft_conv(x, y):
    """Convolve two equally-shaped arrays on the GPU through cuFFT.

    Both operands are converted to ``complex64``, transformed with a forward
    FFT, multiplied element-wise in the frequency domain and transformed back
    with a scaled inverse FFT.  No zero padding is applied, so the result is
    the circular convolution of the inputs.

    :param x: first operand (array exposing ``shape`` and ``astype``)
    :param y: second operand; must have the same shape as ``x``
    :return: host ``complex64`` array holding the inverse transform of the
        spectral product, or ``-1`` when the two shapes differ
    """
    # Reject mismatched operands before paying for the dtype copies.
    if x.shape != y.shape:
        return -1

    x = x.astype(np.complex64)
    y = y.astype(np.complex64)

    # A complex-to-complex plan is direction agnostic (the direction is chosen
    # by calling fft() or ifft()), so one plan serves every transform below.
    plan = fft.Plan(x.shape, np.complex64, np.complex64)

    x_gpu = y_gpu = x_fft = y_fft = out_gpu = None
    try:
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = gpuarray.to_gpu(y)

        x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64)
        y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64)
        out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64)

        fft.fft(x_gpu, x_fft, plan)
        fft.fft(y_gpu, y_fft, plan)

        # With overwrite=True the product is stored in the second operand,
        # which is why y_fft feeds the inverse transform.
        linalg.multiply(x_fft, y_fft, overwrite=True)
        fft.ifft(y_fft, out_gpu, plan, scale=True)
        return out_gpu.get()
    finally:
        # Release device memory deterministically, including when a step
        # above raised part-way through.
        for buf in (x_gpu, y_gpu, x_fft, y_fft, out_gpu):
            if buf is not None:
                buf.gpudata.free()
Пример #2
0
    def propagate_eager(self, wavelength, wavefront):
        """
        Propagate one wavefront from the pupil plane to the exit slits on the GPU.

        'Not-Too-Good' version of the propagation on the GPU (lots of Memory issues...)
        Remove in the future

        :param wavelength: key used to look up ``self.pupil_masks``; it also
            divides the phase term, so it must be numeric
        :param wavefront: wavefront map applied as the pupil phase; it has to
            broadcast against the pupil mask (presumably expressed in the same
            units as ``wavelength`` -- confirm with the caller)
        :return: fft-shifted sum over the slices of the squared modulus of the
            slit-plane fields
        """

        N = self.N_PIX
        free, total = cuda.mem_get_info()
        print("Free: %.2f percent" % (free / total * 100))

        # Pupil Plane -> Image Slicer
        # The mask provides the amplitude and the wavefront argument provides
        # the phase; using the mask as the phase would ignore the wavefront.
        pupil_mask = self.pupil_masks[wavelength]
        complex_pupil = pupil_mask * np.exp(
            1j * 2 * np.pi * wavefront / wavelength)
        complex_pupil_gpu = gpuarray.to_gpu(
            np.asarray(complex_pupil, np.complex64))
        plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
        cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

        # Add N_slices copies to be Masked
        complex_slicer_cpu = complex_pupil_gpu.get()
        complex_pupil_gpu.gpudata.free()

        free, total = cuda.mem_get_info()
        print("*Free: %.2f percent" % (free / total * 100))

        complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
        complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
        slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
        # overwrite=True stores the product in the second operand, i.e. the
        # slicer stack is masked in place.
        clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
        slicer_masks_gpu.gpudata.free()
        free, total = cuda.mem_get_info()
        print("**Free: %.2f percent" % (free / total * 100))

        # Slicer -> Pupil Mirror
        # Batched plan: one (N, N) transform per slice.
        plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
        cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
        # NOTE(review): the mirror masks are not indexed by wavelength here --
        # confirm that self.pupil_mirror_masks_fft is a plain array.
        mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
        clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

        # Pupil Mirror -> Slits
        cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
        slits = complex_slicer_gpu.get()
        complex_slicer_gpu.gpudata.free()
        mirror_mask_gpu.gpudata.free()
        # Intensity per slice, accumulated over all slices.
        slit = fftshift(np.sum((np.abs(slits))**2, axis=0))

        free, total = cuda.mem_get_info()
        print("***Free: %.2f percent" % (free / total * 100))

        return slit
Пример #3
0
 def _impl_test_multiply(self, N, dtype):
     """Check ``linalg.multiply`` against NumPy on random ``N`` x ``N`` inputs.

     :param N: edge length of the square test matrices
     :param dtype: element type; complex types also get a random imaginary part
     """
     def random_square():
         # Uniform random matrix converted to the requested dtype.
         return np.asarray(np.random.rand(N, N), dtype)

     x = random_square()
     y = random_square()
     if np.iscomplexobj(x):
         x += 1j * random_square()
         y += 1j * random_square()

     product_gpu = linalg.multiply(gpuarray.to_gpu(x), gpuarray.to_gpu(y))
     expected = x * y
     assert np.allclose(expected, product_gpu.get())
Пример #4
0
 def _impl_test_multiply(self, N, dtype):
     """Compare the GPU element-wise product with the NumPy reference.

     :param N: edge length of the square random operands
     :param dtype: element type; for complex types a random imaginary part
         is added to both operands
     """
     def draw():
         # One N x N uniform sample, cast to the dtype under test.
         return np.asarray(np.random.rand(N, N), dtype)

     x, y = draw(), draw()
     if np.iscomplexobj(x):
         x += 1j * draw()
         y += 1j * draw()

     x_dev = gpuarray.to_gpu(x)
     y_dev = gpuarray.to_gpu(y)
     result = linalg.multiply(x_dev, y_dev).get()
     assert np.allclose(x * y, result)
Пример #5
0
    def filter(self):
        """
        Convolve ``self.series[0]`` with ``self.series[1]`` on the GPU via FFT.

        Both series are zero-padded to
        ``determine_size(len(signal) + len(window) - 1)`` samples, transformed,
        multiplied element-wise, scaled by 2 and inverse transformed.  A CUDA
        context is created for the call and popped again before returning,
        also when an error is raised part-way through.

        :return: host array of dtype ``self.precision['complex']`` holding the
            scaled inverse transform
        """
        import pycuda.gpuarray as gpuarray
        import skcuda.fft as cu_fft
        import skcuda.linalg as linalg
        import pycuda.driver as cuda
        from pycuda.tools import make_default_context
        cuda.init()
        context = make_default_context()
        try:
            signal = self.series[0]
            window = self.series[1]
            linalg.init()
            nfft = determine_size(len(signal) + len(window) - 1)
            # Move data to GPU
            sig_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
            win_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
            sig_gpu = gpuarray.zeros(sig_zero_pad.shape,
                                     dtype=self.precision['float'])
            win_gpu = gpuarray.zeros(win_zero_pad.shape,
                                     dtype=self.precision['float'])
            sig_zero_pad[0:len(signal)] = signal
            win_zero_pad[0:len(window)] = window
            sig_gpu.set(sig_zero_pad)
            win_gpu.set(win_zero_pad)

            # Plan forwards (real input, complex output)
            sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
            win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
            sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape,
                                           self.precision['float'],
                                           self.precision['complex'])
            win_plan_forward = cu_fft.Plan(win_fft_gpu.shape,
                                           self.precision['float'],
                                           self.precision['complex'])
            cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
            cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

            # Convolve: element-wise product of the two spectra.
            out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
            # NOTE(review): the factor 2 presumably compensates for the
            # half-filled real-to-complex spectrum -- confirm.
            linalg.scale(2.0, out_fft)

            # Plan inverse; the trailing True asks ifft for a scaled result.
            out_gpu = gpuarray.zeros_like(out_fft)
            plan_inverse = cu_fft.Plan(out_fft.shape,
                                       self.precision['complex'],
                                       self.precision['complex'])
            cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
            out_np = np.zeros(len(out_gpu), self.precision['complex'])
            out_gpu.get(out_np)
            return out_np
        finally:
            # Always pop the context pushed by make_default_context(); leaving
            # it on the stack after a failure would leak it.
            context.pop()
Пример #6
0
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Convolve ``signal`` with ``window`` on the GPU using FFTs.

    Importing ``pycuda.autoinit`` inside the function sets up the CUDA
    environment.  Both inputs are zero-padded to
    ``determine_size(len(signal) + len(window) - 1)`` samples, transformed,
    multiplied element-wise, scaled by 2 and inverse transformed.

    :param signal: The input series
    :param window: The input window
    :param prec: The precision entry; ``prec['float']`` and
        ``prec['complex']`` give the real and complex dtypes to use
    :return: The filtered signal as a host array of dtype ``prec['complex']``
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg

    real_t = prec['float']
    complex_t = prec['complex']

    linalg.init()
    nfft = determine_size(len(signal) + len(window) - 1)

    def upload_padded(series):
        # Zero-pad on the host, then copy into a zero-initialised GPU array.
        host = np.zeros(nfft, dtype=real_t)
        device = gpuarray.zeros(host.shape, dtype=real_t)
        host[0:len(series)] = series
        device.set(host)
        return device

    sig_gpu = upload_padded(signal)
    win_gpu = upload_padded(window)

    # Forward transforms (real input, complex output); the plans stay alive
    # until the function returns.
    sig_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
    win_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
    sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, real_t, complex_t)
    win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, real_t, complex_t)
    cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
    cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

    # Product of the spectra, then the factor-of-two scaling.
    out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
    linalg.scale(2.0, out_fft)

    # Scaled inverse transform back to the sample domain.
    out_gpu = gpuarray.zeros_like(out_fft)
    plan_inverse = cu_fft.Plan(out_fft.shape, complex_t, complex_t)
    cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)

    out_np = np.zeros(len(out_gpu), complex_t)
    out_gpu.get(out_np)
    return out_np
def logis(y, x):
    """Estimate regression coefficients on the GPU with reweighted updates.

    Starts from the ordinary least-squares solution ``(X'X)^-1 X'y`` and then
    performs at most 10 reweighted least-squares updates, stopping early once
    the norm of the coefficient change drops below ``1e-5``.

    :param y: response array (converted to ``float32``)
    :param x: design matrix (converted to ``float32``)
    :return: dict with the iteration count (``"iteraciones"``), the
        coefficients held before the final update (``"Betas"``) and the
        elapsed wall-clock seconds (``"time"``)
    """
    MAX_ITER = 10
    TOLERANCE = 0.00001

    x = x.astype(np.float32)
    y = y.astype(np.float32)
    start = time.time()

    # Move the inputs to the GPU.
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    linalg.init()
    # Transpose of X, reused by every solve below.
    x_gpu_T = linalg.transpose(x_gpu)
    gram_inv_gpu = linalg.inv(linalg.dot(x_gpu_T, x_gpu))
    beta_gpu = linalg.dot(linalg.dot(gram_inv_gpu, x_gpu_T), y_gpu)

    # The last pass always leaves through the break, so the loop is never
    # exhausted and j holds the number of passes performed.
    for j in range(1, MAX_ITER + 1):
        # sapply is defined elsewhere; presumably it returns the fitted means.
        mu = sapply(x, beta_gpu.get()).astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu)
        V_gpu = linalg.diag(mu_gpu)

        variance_gpu = linalg.multiply(mu_gpu, 1 - mu_gpu)
        inv_variance_gpu = linalg.diag(1 / variance_gpu)
        residual_gpu = y_gpu - mu_gpu
        adjust_gpu = linalg.dot(inv_variance_gpu, residual_gpu)

        # Replace NaNs on the host (nanValue is defined elsewhere) and upload
        # the cleaned array again.
        adjust_cpu = adjust_gpu.get()
        if np.isnan(adjust_cpu).any():
            adjust_cpu = nanValue(adjust_cpu)
            adjust_gpu = gpuarray.to_gpu(adjust_cpu.astype(np.float32))

        y_1_gpu = linalg.dot(x_gpu, beta_gpu) + adjust_gpu

        # ((X'VX)^-1 X') V y_1, evaluated left to right.
        weighted_gram_gpu = linalg.dot(linalg.dot(x_gpu_T, V_gpu), x_gpu)
        solver_gpu = linalg.dot(linalg.inv(weighted_gram_gpu), x_gpu_T)
        solver_gpu = linalg.dot(solver_gpu, V_gpu)
        beta_1_gpu = linalg.dot(solver_gpu, y_1_gpu)

        check_value = np.absolute(linalg.norm(beta_1_gpu - beta_gpu))
        if j == MAX_ITER or check_value < TOLERANCE:
            break
        beta_gpu = beta_1_gpu

    end = time.time()
    tiempo = end - start
    return {"iteraciones": j, "Betas": beta_gpu.get(), "time": tiempo}
Пример #8
0
def step_2(matrix_1, matrix_image_pan):
    """Return the element-wise product of the two matrices.

    Delegates to ``linalg.multiply``, which multiplies its operands
    element by element.
    """
    return linalg.multiply(matrix_1, matrix_image_pan)
Пример #9
0
def _sub_kmeans_gpu(X, k):
    """Cluster the rows of ``X`` into ``k`` groups in a learned subspace (GPU).

    Each iteration projects the data with the current transformation, assigns
    every point to the nearest projected centre, recomputes the centres and
    per-cluster scatter matrices on the host, and derives a new transformation
    and subspace size from ``sorted_eig``.  Assignments are compared on even
    iterations only; the loop stops once they were found unchanged twice, or
    after ``MAX_ITER`` iterations.

    :param X: ``(n, d)`` data matrix; converted to ``float32``
    :param k: number of clusters
    :return: tuple ``(assignments, V, m)`` -- the host copy of the cluster
        index per point, the host copy of the transformation matrix and the
        final subspace dimensionality
    """
    import skcuda
    import skcuda.linalg as LA
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    LA.init()

    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')
    # Floor division keeps m an integer on Python 3 as well; it is used as an
    # array dimension below.
    m = d // 2
    X_gpu = gpuarray.to_gpu(X)
    mu_D_gpu = skcuda.misc.mean(X_gpu, axis=0, keepdims=True)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    # Scatter matrix of the whole data set.
    S_D_gpu = LA.dot(sub_gpu, sub_gpu, transa='T')
    # Initial centres: k rows of X drawn at random (with replacement).
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])
    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100
    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)

        # Project every cluster centre.
        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel()

        # Project every data point.
        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')
        if itr % 2 == 0:
            # C_gpu still holds the assignment of the previous iteration.
            C_old = C_gpu.get()
        X_transformed_gpu = gpuarray.empty(
            (n, k, m), dtype=np.float32)
        for i in range(n):
            temp = global_temp[i]
            X_transformed_gpu[i] = skcuda.misc.subtract(
                PcVmu_is_gpu, temp)

        # Squared distance of every point to every centre.
        X_transformed_squared_gpu = LA.multiply(
            X_transformed_gpu, X_transformed_gpu)
        X_transformed_squared_gpu = X_transformed_squared_gpu.reshape(
            (n * k, m))
        X_transformed_sum_gpu = skcuda.misc.sum(
            X_transformed_squared_gpu, axis=-1, keepdims=True)
        X_transformed_sum_gpu = X_transformed_sum_gpu.reshape((n, k))
        C_gpu = skcuda.misc.argmin(
            X_transformed_sum_gpu, axis=1)
        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))

        # Recompute the cluster centres on the host.
        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}
        mu_is = np.zeros((k, d)).astype(np.float32)
        for i in range(n):
            C_id = int(C[i])
            mu_is[C_id] += X[i]
            counts[C_id] += 1

        # NOTE(review): an empty cluster divides by zero here -- confirm
        # whether that case needs explicit handling.
        mu_is = np.array([mu_is[i] / counts[i] for i in range(k)])
        mu_is_gpu = gpuarray.to_gpu(mu_is)
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)

        # Group the centred points per cluster, padded to the largest cluster.
        maxv = max(counts.values())
        storage = np.empty((k, int(maxv), d)).astype(np.float32)
        counter = np.zeros(k, dtype=np.uint32)

        for i in range(n):
            C_id = int(C[i])
            X_minus_mu_isi = (X[i] - mu_is[C_id])[:, None]
            storage[C_id, int(counter[C_id]), :] = X_minus_mu_isi.ravel()
            counter[C_id] += 1

        # Per-cluster scatter matrices, using only the filled rows.
        storage_gpu = gpuarray.to_gpu(storage)
        for i in range(k):
            curr_cluster_points = storage_gpu[i,
                                              :int(counter[i]), :]
            S_is_gpu[i] = LA.dot(curr_cluster_points,
                                 curr_cluster_points, transa='T')

        # Sum of the cluster scatter matrices minus the total scatter.
        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))

        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')

        # New subspace size: eigenvalues whose ratio to the smallest one
        # exceeds 1e-3, with a floor of one dimension.
        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)

        itr += 1
    return C_gpu.get(), V_gpu.get(), m
Пример #10
0
def fitSlcGPU(slc, srcFatT2, t2, b1, ff):
    """Fit one slice of ``dicomStack`` on the GPU, ``ROWSTEP`` rows at a time.

    For every chunk of rows the squared signals are multiplied with
    ``signalsGPU``, and the best-matching column index per pixel selects an
    entry of ``parameterCombinations`` whose first three values are written to
    ``t2``, ``b1`` and ``ff``.  When the GPU runs out of memory the global
    ``ROWSTEP`` is decreased by one and the whole slice is processed again.

    :param slc: index of the slice to fit
    :param srcFatT2: not used by this function
    :param t2: output array, written in place at ``[row, col, slc]``
    :param b1: output array, written in place at ``[row, col, slc]``
    :param ff: read as input when any value of the slice is positive, and
        written in place with the fitted value
    :raises pycuda._driver.MemoryError: when a single row still does not fit
        into GPU memory
    """
    global ROWSTEP
    print("Fitting slice", slc)
    yValues = dicomStack[:, :, slc, :].squeeze()
    slcShape = yValues.shape
    nrows = slcShape[0]
    ncols = slcShape[1]
    sigLen = slcShape[2]
    success = False

    ffParams_gpu = None
    ffValues_gpu = None
    # Bound up front so the MemoryError handler can reference them even when
    # the very first allocation fails.
    slcGPU = None
    corrMatrixGPU = None

    if np.any(ff[:,:,slc] > 0):
        useFF = True
        ffParams_gpu = findmax_ff.prepareAndLoadParams(parameterCombinations)
    else:
        useFF = False

    # NOTE(review): a retry restarts at row 0, so with useFF the rows already
    # fitted feed their *fitted* ff values back in as input -- confirm whether
    # the input should be snapshotted before the loop.
    while not success:
        try:
            for r in range(0,nrows,ROWSTEP):
                rowMax = min(r+ROWSTEP, nrows)
                # One row per pixel of the chunk, one column per signal sample.
                slcLin = yValues[r:rowMax,:,:].reshape(ncols*(rowMax-r), sigLen).astype(np.float32)

                slcGPU = None

                slcGPU = pycuda.gpuarray.to_gpu(slcLin)
                slcGPU = sklinalg.multiply(slcGPU, slcGPU)
                corrMatrixGPU = sklinalg.mdot(slcGPU, signalsGPU) # correlation

                tryFree(slcGPU)

                if useFF:
                    ffValues_gpu = findmax_ff.prepareAndLoadFF(ff[r:rowMax, :, slc])
                    corrMax = findmax_ff.findmax_gpu(corrMatrixGPU, ffValues_gpu, ffParams_gpu)
                else:
                    corrMaxGPU = skmisc.argmax(corrMatrixGPU, 1)
                    corrMax = corrMaxGPU.get()
                    tryFree(corrMaxGPU)

                tryFree(corrMatrixGPU)
                tryFree(ffValues_gpu)

                # Scatter the winning parameter combination back per pixel.
                for row in range(r, rowMax):
                    for c in range(ncols):
                        ind = (row-r)*ncols + c
                        t2[row,c,slc] = parameterCombinations[corrMax[ind]][0]
                        b1[row,c,slc] = parameterCombinations[corrMax[ind]][1]
                        ff[row,c,slc] = parameterCombinations[corrMax[ind]][2]

                if DOPLOT >= 1:
                    plotImages()

            success = True
        except pycuda._driver.MemoryError:
            tryFree(slcGPU)
            tryFree(corrMatrixGPU)
            tryFree(ffValues_gpu)

            gc.collect()
            if ROWSTEP <= 1:
                # Cannot shrink the chunk any further; a step of 0 would only
                # make range() fail with an unrelated ValueError.
                raise
            ROWSTEP -= 1
            print("Not enough GPU Mem: decreasing ROWSTEP to", ROWSTEP)
Пример #11
0
# CPU reference: multiply the 2-D spectra of image and kernel, transform back
# and keep only the real part of the result.
start = time.time()
xf = np.fft.fft2(image) * np.fft.fft2(kernel)
conv_cpu = np.real(np.fft.ifft2(xf))
cpu_time = time.time() - start
print('CPU FFT in ', cpu_time)

# GPU setup (not timed): upload both inputs, allocate the complex spectrum
# buffers and build the cuFFT plans.
# NOTE(review): the float32 -> complex64 plans assume `image` and `kernel`
# are float32 arrays of identical shape -- confirm where they are created.
shape = image.shape
image_gpu = gpuarray.to_gpu(image)
xf_gpu = gpuarray.empty(shape, np.complex64)
image_plan_forward = cu_fft.Plan(shape, np.float32, np.complex64)

kernel_gpu = gpuarray.to_gpu(kernel)
kf_gpu = gpuarray.empty(shape, np.complex64)
kernel_plan_forward = cu_fft.Plan(shape, np.float32, np.complex64)

# Complex -> real plan for the inverse transform.
plan_inverse = cu_fft.Plan(shape, np.complex64, np.float32)

# Timed GPU section: two forward transforms, the element-wise product and the
# scaled inverse transform, which is written back into image_gpu.
# NOTE(review): gpu_time is taken before any device-to-host copy or explicit
# synchronisation; if these calls return before the GPU has finished, the
# figure under-reports the real cost -- confirm.
start = time.time()
cu_fft.fft(image_gpu, xf_gpu, image_plan_forward)
cu_fft.fft(kernel_gpu, kf_gpu, kernel_plan_forward)
cf_gpu = culinalg.multiply(xf_gpu, kf_gpu)
cu_fft.ifft(cf_gpu, image_gpu, plan_inverse, True)
gpu_time = time.time() - start

# Fetch the GPU result and compare it with the CPU reference.
conv_gpu = image_gpu.get()
print('GPU FFT in ', gpu_time)
tol = 1e-4
print('Success status: ', np.allclose(conv_cpu, conv_gpu, atol=tol), "; atol=",
      tol)
Пример #12
0
    def propagate_gpu_wavelength(self, wavelength, wavefront, N):
        """
        Propagation from Pupil Plane to Exit Slit on the GPU for a single wavelength

        Repeated N times to show how it runs much faster on the GPU when we want to compute
        many PSF images

        :param wavelength: key into ``self.pupil_masks`` and
            ``self.pupil_mirror_masks_fft``; it also divides the phase term
        :param wavefront: indexable collection holding at least ``N``
            wavefront maps; entry ``i`` is used for the i-th image
        :param N: number of PSF images to compute
        :return: array of ``N`` images, each the sum over slices of the squared
            modulus of the slit-plane field, fft-shifted over the image axes
        """
        # Handling the GPU memory properly is awkward with arrays of shape
        # [N_slices, N_pix, N_pix], so the buffers are allocated once and
        # reused for every image.
        print("\nPropagating on the GPU")
        # GPU memory management
        free, total = cuda.mem_get_info()
        print("Memory Start | Free: %.2f percent" % (free / total * 100))
        slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
        mirror_mask_gpu = gpuarray.to_gpu(
            self.pupil_mirror_masks_fft[wavelength])

        pupil_shape = (self.N_PIX, self.N_PIX)
        # The pupil is one (N_PIX, N_PIX) array, so it needs its own
        # non-batched plan; running the batched plan on it would make cuFFT
        # work past the end of the pupil buffer.
        plan_single = cu_fft.Plan(pupil_shape, np.complex64, np.complex64)
        plan_batch = cu_fft.Plan(pupil_shape, np.complex64,
                                 np.complex64, self.N_slices)

        # Allocate GPU arrays that will be overwritten with skcuda.misc.set_realloc to save memory
        _pupil = np.zeros(pupil_shape, dtype=np.complex64)
        complex_pupil_gpu = gpuarray.to_gpu(_pupil)

        _slicer = np.zeros((self.N_slices, self.N_PIX, self.N_PIX),
                           dtype=np.complex64)
        complex_slicer_gpu = gpuarray.to_gpu(_slicer)

        pupil_mask = self.pupil_masks[wavelength]
        PSF_images = []
        try:
            for i in range(N):
                print(i)

                # Pupil Plane -> Image Slicer
                complex_pupil = pupil_mask * np.exp(
                    1j * 2 * np.pi * wavefront[i] / wavelength)
                skcuda.misc.set_realloc(complex_pupil_gpu,
                                        np.asarray(complex_pupil, np.complex64))
                cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan_single)

                # Add N_slices copies to be Masked
                complex_slicer_cpu = complex_pupil_gpu.get()
                complex_slicer_cpu = np.stack(
                    [complex_slicer_cpu] * self.N_slices)
                skcuda.misc.set_realloc(complex_slicer_gpu, complex_slicer_cpu)
                # overwrite=True stores the product in the second operand, so
                # the slicer stack is masked in place.
                clinalg.multiply(slicer_masks_gpu,
                                 complex_slicer_gpu,
                                 overwrite=True)

                # Image Slicer -> Pupil Mirror
                cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan_batch,
                            True)
                clinalg.multiply(mirror_mask_gpu,
                                 complex_slicer_gpu,
                                 overwrite=True)

                # Pupil Mirror -> Exit Slits
                cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan_batch)

                # Intensity per slice, accumulated over all slices on the host.
                _slits = complex_slicer_gpu.get()
                slits = np.sum((np.abs(_slits))**2, axis=0)
                PSF_images.append(slits)
        finally:
            # Make sure you clean up the memory so that it doesn't blow up!!
            # Done in `finally` so a failure inside the loop does not leave
            # the buffers allocated.
            complex_pupil_gpu.gpudata.free()
            complex_slicer_gpu.gpudata.free()
            slicer_masks_gpu.gpudata.free()
            mirror_mask_gpu.gpudata.free()

        free, total = cuda.mem_get_info()
        print("Memory Final | Free: %.2f percent" % (free / total * 100))

        return fftshift(np.array(PSF_images), axes=(1, 2))