Example #1
# Assumed imports and globals for this snippet (NumbaPro-era API):
import numpy as np
from timeit import default_timer as timer
from numbapro.cudalib import cublas

N = 1024  # matrix size; the actual value is not shown in the snippet (assumed)


def gemm_v2():
    """
    Let GEMM transpose the input matrices so that they can be in C order,
    originally.  Note that the output matrix is still in Fortran array.
    The string arguments in gemm tells it to apply transformation on the input
    matrices.
    
    See argument description in:
        http://docs.continuum.io/numbapro/cudalib.html#blas-level-2
    """
    print("Version 2".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N))
    B = np.array(np.arange(N) + 10, dtype=A.dtype)
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = cublas.Blas()

    start = timer()
    blas.gemm('T', 'T', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
Example #2
# Assumed imports and globals for this snippet (NumbaPro-era API):
import numpy as np
from timeit import default_timer as timer
from numbapro.cudalib import cublas

dim = 1024  # matrix size; the actual value is not shown in the snippet (assumed)


def gemm():
    print("Version 2".center(80, '='))

    A = np.random.rand(dim, dim)
    B = np.random.rand(dim, dim)

    D = np.zeros_like(A, order='F')

    print("MATRIX A :")
    print A
    print("VECTOR B :")
    print B

    # NumPy
    start = timer()
    E = np.dot(A, B)
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = cublas.Blas()

    start = timer()
    blas.gemm('T', 'T', dim, dim, dim, 1.0, A, B, 1.0, D)
    cuda_time = timer() - start
    print("RESULT MATRIX EVALUATED WITH CUBLAS")
    print(D)
    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
Example #3
# Assumed imports and globals for this snippet (NumbaPro-era API):
import numpy as np
from timeit import default_timer as timer
from numbapro.cudalib import cublas

N = 1024  # matrix size; the actual value is not shown in the snippet (assumed)


def gemm_v1():
    '''
    Note that all arrays are in Fortran order.
    '''
    print("Version 1".center(80, '='))
    # Prepare arrays for input
    A = np.array(np.arange(N**2, dtype=np.float32).reshape(N, N), order='F')
    B = np.array(np.arange(N) + 10, dtype=A.dtype, order='F')
    D = np.zeros_like(A, order='F')

    # NumPy
    start = timer()
    E = np.dot(A, np.diag(B))
    numpy_time = timer() - start
    print("Numpy took %f seconds" % numpy_time)

    # cuBLAS
    blas = cublas.Blas()

    start = timer()
    blas.gemm('N', 'N', N, N, N, 1.0, A, np.diag(B), 1.0, D)
    cuda_time = timer() - start

    print("CUBLAS took %f seconds" % cuda_time)
    diff = np.abs(D - E)
    print("Maximum error %f" % np.max(diff))
Example #4
# Assumed imports (NumbaPro-era API); LCALearner, whose csub and iterate CUDA
# kernels are used below, is defined elsewhere in the original module:
import numpy as np
from math import ceil
from numbapro import cuda
from numbapro.cudalib import cublas


def infer(learner, stimuli, coeffs=None):
    #Get Blas routines
    blas = cublas.Blas()
    #Initialize arrays
    numDict = learner.Q.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]
    u = np.zeros((numStim, numDict), dtype=np.float32, order='F')
    if coeffs is not None:
        u[:] = np.atleast_2d(coeffs)
    d_u = cuda.to_device(u)
    d_s = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_b = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_ci = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_c = cuda.to_device(
        np.zeros((numDict, numDict), dtype=np.float32, order='F'))

    #Move inputs to GPU
    d_dictionary = cuda.to_device(
        np.array(learner.Q, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    blockdim2 = (32, 32)  # TODO: experiment, was all 32s
    blockdim1 = 32
    griddimcsub = int(ceil(numDict / blockdim1))
    griddimi = (int(ceil(numStim / blockdim2[0])),
                int(ceil(numDict / blockdim2[1])))

    #Calculate c: overlap of basis functions with each other minus identity
    blas.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
              d_dictionary, 0., d_c)
    LCALearner.csub[griddimcsub, blockdim1](d_c)
    blas.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
              d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)
    #Update u[i] and s[i] for niter time steps
    for kk in range(learner.niter):
        #Calculate ci: amount other neurons are stimulated times overlap with rest of basis
        blas.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        LCALearner.iterate[griddimi,
                           blockdim2](d_c, d_b, d_ci, d_u, d_s,
                                      learner.infrate, d_thresh,
                                      learner.min_thresh, learner.adapt,
                                      learner.softthresh)
    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return s.T, u.T, thresh
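For reference, one LCA time step on the CPU. This is a sketch of what the inner gemm call and the iterate kernel compute together, assuming soft thresholding and with c already having its diagonal removed by csub; the function name and signature are hypothetical:

import numpy as np

def lca_step(u, b, c, infrate, thresh):
    # soft-threshold membrane potentials into activities (thresh is per-stimulus)
    s = np.sign(u) * np.maximum(np.abs(u) - thresh[:, None], 0.0)
    # lateral inhibition: each neuron is suppressed by active, overlapping neurons
    ci = np.dot(s, c)
    # leaky-integrator dynamics driven by the input overlap b
    u = u + infrate * (b - u - ci)
    return u, s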
Example #5
# Assumed imports (NumbaPro-era API); the csub and iter CUDA kernels ('iter'
# shadows the Python builtin) are defined elsewhere in the original module:
import numpy as np
from numbapro import cuda
from numbapro.cudalib import cublas


def infer(dictionary, coeffs, stimuli, eta, lamb, nIter, softThresh, adapt):
    #Get Blas routines
    bs = cublas.Blas()
    #Initialize arrays
    numDict = dictionary.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]
    d_u = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_s = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_b = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_ci = cuda.to_device(
        np.zeros((numStim, numDict), dtype=np.float32, order='F'))
    d_c = cuda.to_device(
        np.zeros((numDict, numDict), dtype=np.float32, order='F'))

    #Move inputs to GPU
    d_dictionary = cuda.to_device(
        np.array(dictionary, dtype=np.float32, order='F'))
    d_coeffs = cuda.to_device(np.array(coeffs, dtype=np.float32, order='F'))
    d_stimuli = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))

    blockdim2 = (32, 32)
    blockdim1 = 32
    #griddimc = (int(numDict/blockdim[0]),int(numDict/blockdim[1]))
    #Grid sizes use truncating division, so array dimensions are assumed to be multiples of 32
    griddimcsub = int(numDict / blockdim1)
    griddimb = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1]))
    griddimi = (int(numStim / blockdim2[0]), int(numDict / blockdim2[1]))

    #Calculate c: overlap of basis functions with each other minus identity
    #cinit[griddimc,blockdim](d_dictionary,d_c)
    bs.gemm('N', 'T', numDict, numDict, dataLength, 1., d_dictionary,
            d_dictionary, 0., d_c)
    csub[griddimcsub, blockdim1](d_c)
    #binit[griddimb,blockdim2](d_dictionary,d_stimuli,d_b)
    bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stimuli,
            d_dictionary, 0., d_b)
    thresh = np.mean(np.absolute(d_b.copy_to_host()), axis=1)
    d_thresh = cuda.to_device(thresh)
    #Update u[i] and s[i] for nIter time steps
    for kk in range(nIter):
        #Calculate ci: amount other neurons are stimulated times overlap with rest of basis
        bs.gemm('N', 'N', numStim, numDict, numDict, 1., d_s, d_c, 0., d_ci)
        iter[griddimi, blockdim2](d_c, d_b, d_ci, d_u, d_s, eta, d_thresh,
                                  lamb, adapt, softThresh)
    u = d_u.copy_to_host()
    s = d_s.copy_to_host()
    return (s, u, thresh)
Example #6
# Assumed imports (NumbaPro-era API); the removeWinners and maxCoefsABS CUDA
# kernels are defined elsewhere in the original module:
import numpy as np
from numbapro import cuda
from numbapro.cudalib import cublas


def mp(dictionary, stimuli, k=None, minabs=None):
    """
    Does matching pursuit on a batch of stimuli.
    Args:
        dictionary: Dictionary for matching pursuit. First axis should be dictionary element number.
        stimuli: Stimulus batch for matching pursuit. First axis should be stimulus number.
        k: Sparseness constraint. k dictionary elements will be used to represent stimuli.
        minabs: Minimum absolute value of the remaining signal to continue projection. If nothing is given, minabs is set to zero and k basis elements will be used.
    Returns:
        coeffs: List of dictionary element coefficients to be used for each stimulus.
    """
    if k is None:
        k = dictionary.shape[0]
    if minabs is None:
        minabs = 0.

    bs = cublas.Blas()

    numDict = dictionary.shape[0]
    numStim = stimuli.shape[0]
    dataLength = stimuli.shape[1]
    assert k <= numDict
    #Setup variables on GPU
    d_coefs = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_curCoef = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_coefsd = cuda.to_device(
        np.zeros(shape=(numStim, numDict), dtype=np.float32, order='F'))
    d_winners = cuda.to_device(
        np.zeros(shape=(k, numStim), dtype=np.int64, order='F'))
    d_delta = cuda.to_device(
        np.zeros_like(stimuli, dtype=np.float32, order='F'))
    #Move args to GPU
    d_stim = cuda.to_device(np.array(stimuli, dtype=np.float32, order='F'))
    d_stimt = cuda.to_device(
        np.zeros_like(stimuli, dtype=np.float32, order='F'))
    d_dict = cuda.to_device(np.array(dictionary, dtype=np.float32, order='F'))

    griddim1 = 32
    griddim2 = (32, 32)
    assert numStim % 32 == 0 and dataLength % 32 == 0 and numDict % 32 == 0
    blockdimstim = int(numStim / griddim1)
    blockdim2 = (int(numStim / griddim2[0]), int(dataLength / griddim2[1]))
    blockdimcoef = (int(numStim / griddim2[0]), int(numDict / griddim2[1]))

    for ii in range(k):
        if minabs >= np.mean(np.absolute(d_stim.copy_to_host())):
            break
        bs.gemm('N', 'T', numStim, numDict, dataLength, 1., d_stim, d_dict, 0.,
                d_curCoef)
        if ii > 0:
            removeWinners[griddim1, blockdimstim](d_curCoef, d_winners, ii)
        maxCoefsABS[griddim1, blockdimstim](d_curCoef, d_coefs, d_coefsd,
                                            d_winners, ii, 0)
        #print d_winners.copy_to_host()
        bs.gemm('N', 'N', numStim, dataLength, numDict, 1., d_coefsd, d_dict,
                0., d_delta)
        #print 'delta'
        #print d_delta.copy_to_host()
        #d_coefsd = cuda.to_device(np.zeros(shape=(numStim,numDict),dtype=np.float32,order='F'))
        #Reset d_coefsd to zero for the next pass (geam with alpha = beta = 0)
        bs.geam('N', 'N', numStim, numDict, 0., d_coefsd, 0., d_coefsd,
                d_coefsd)
        bs.geam('N', 'N', numStim, dataLength, 1., d_stim, -1., d_delta,
                d_stim)
        #bs.geam('N','N',numStim,dataLength,1.,d_stimt,0.,d_delta,d_stim)
        #print 'stim'
        #print d_stim.copy_to_host()
    return d_coefs.copy_to_host()
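A compact CPU reference for the same matching-pursuit loop, useful for checking the GPU path on small inputs. Pure NumPy, unit-norm dictionary rows assumed, function name hypothetical (the GPU version additionally excludes previously chosen winners via removeWinners):

import numpy as np

def mp_cpu(dictionary, stimuli, k):
    residual = np.array(stimuli, dtype=np.float64)
    coefs = np.zeros((stimuli.shape[0], dictionary.shape[0]))
    rows = np.arange(stimuli.shape[0])
    for _ in range(k):
        proj = np.dot(residual, dictionary.T)      # overlap with every element
        winners = np.argmax(np.abs(proj), axis=1)  # best element per stimulus
        coefs[rows, winners] += proj[rows, winners]
        residual -= proj[rows, winners][:, None] * dictionary[winners]
    return coefs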
Example #7
# Assumed imports (NumbaPro-era API); the l1prox CUDA kernel and the l2l1obj
# helper are defined elsewhere in the original module:
import math
import numpy as np
import scipy.sparse.linalg
from numbapro import cuda
from numbapro.cudalib import cublas, cusparse


def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200, display=True, verbose=False):
	b = cublas.Blas()
	c = cusparse.Sparse()
	descr = c.matdescr()
	(m, n) = Phi.shape
	(m, batch) = I.shape

	if L is None:
		L = scipy.sparse.linalg.svds(Phi, 1, which='LM', return_singular_vectors=False)
		print("Max eigenvalue: " + str(L))

	L = (L**2)*4 # L = svd(Phi) -> eig(2*(Phi.T*Phi))
	invL = 1/L
	t = 1.

	#if sps.issparse(Phi):
	#	Phi = np.array(Phi.todense())

	d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
	# d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
	d_Phi = cusparse.csr_matrix(Phi, dtype=np.float32)
	d_PhiT = cusparse.csr_matrix(Phi.T, dtype=np.float32) # hack because csrgemm issues with 'T'
	# d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
	d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
	d_x = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_y = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
	d_x2 = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

	# Temporary array variables
	d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
	d_t2 = cuda.device_array(n*batch, dtype=np.float32, order='F')

	#b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q) 	# Q = Phi^T * Phi
	#b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c) # c = -2*Phi^T * y
	# c.csrgemm('T', 'N', n, n, m, descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices,
	#	descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices, descr, d_Q.data, d_Q.indptr, d_Q.indices)
	d_Q = c.csrgemm_ez(d_PhiT, d_Phi, transA='N', transB='N')
	c.csrmm('T', m, batch, n, d_Phi.nnz, -2, descr, d_Phi.data, d_Phi.indptr, d_Phi.indices,
		d_I, m, 0, d_c, n)

	blockdim = 32, 32
	griddim = int(math.ceil(n/blockdim[0])), int(math.ceil(batch/blockdim[1]))

	blockdim_1d = 256
	griddim_1d = int(math.ceil(n*batch/blockdim_1d))

	start = l2l1obj(b, c, descr, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)
	obj2 = start

	for i in range(max_iterations):

		# x2 = 2*Q*y + c
		# b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
		c.csrmm('N', n, batch, n, d_Q.nnz, 2, descr, d_Q.data, d_Q.indptr, d_Q.indices,
			d_y, n, 0, d_x2, n)
		b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

		# x2 = y - invL * x2
		b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

		# proxOp()
		l1prox[griddim, blockdim](d_x2, invL*lambdav, d_x2)
		t2 = (1+math.sqrt(1+4*(t**2)))/2.0

		# y = x2 + ((t-1)/t2)*(x2-x)
		b.geam('N', 'N', n, batch, 1+(t-1)/t2, d_x2, (1-t)/t2, d_x, d_y)

		# x = x2
		b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
		t = t2

		# update objective
		obj = obj2
		obj2 = l2l1obj(b, c, descr, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d, griddim_1d)

		if verbose:
			x2 = d_x2.copy_to_host()
			print("L1 Objective: " + str(obj2))
			# print "L1 Objective: " +  str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2))

		if np.abs(obj-obj2)/float(obj) < tol:
			break

	x2 = d_x2.copy_to_host()

	if display:
		print("FISTA Iterations: " + str(i))
		# print "L1 Objective: " + str(obj2)
		print("L1 Objective: " + str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2)))
		print("Objective delta: " + str(obj2-start))

	return x2
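For comparison, the same FISTA iteration in plain NumPy for a dense Phi (a sketch, not the original implementation; same objective lambdav*||x||_1 + ||I - Phi.x||^2 and the same 4*sigma_max^2 Lipschitz bound):

import numpy as np

def fista_cpu(I, Phi, lambdav, max_iterations=200):
    n, batch = Phi.shape[1], I.shape[1]
    L = 4 * np.linalg.norm(Phi, 2)**2  # largest singular value, squared, times 4
    x = np.zeros((n, batch))
    y = x.copy()
    t = 1.
    for _ in range(max_iterations):
        grad = 2 * np.dot(Phi.T, np.dot(Phi, y) - I)  # gradient of the quadratic term
        z = y - grad / L
        x2 = np.sign(z) * np.maximum(np.abs(z) - lambdav / L, 0.)  # l1 proximal step
        t2 = (1. + np.sqrt(1. + 4. * t**2)) / 2.
        y = x2 + ((t - 1.) / t2) * (x2 - x)  # momentum on the iterates
        x, t = x2, t2
    return x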