def setup(sizes, dtype):
    """Create an OpenCL queue plus matching random host/device arrays.

    For every shape in ``sizes`` a random NumPy array of type ``dtype`` is
    generated, and a device ``Array`` with the same shape and dtype is
    allocated on the new queue and filled with the host data.

    Args:
        sizes: Iterable of shape tuples; each is unpacked into
            ``np.random.rand``.
        dtype: Element type the random data is converted to.

    Returns:
        A ``(queue, host_arrays, device_arrays)`` tuple; the two lists are
        in the same order as ``sizes``.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    host_arrays = []
    device_arrays = []
    for shape in sizes:
        host = np.random.rand(*shape).astype(dtype=dtype)
        host_arrays.append(host)

        device = Array(queue, host.shape, host.dtype)
        device.set(host)
        device_arrays.append(device)

    # Wait for everything enqueued so far before handing the queue back.
    queue.finish()
    return queue, host_arrays, device_arrays
def setup_op(queue, A):
    """Build a ``LinearOperator`` whose products run through ``blas.gemv``.

    ``A`` is copied once into a device ``Array``; the two callbacks close
    over that device copy.

    Args:
        queue: Command queue used for the allocation and for every gemv call.
        A: Host matrix; its ``shape`` and ``dtype`` are forwarded to the
            operator.

    Returns:
        ``LinearOperator(A.shape, matvec, rmatvec=matvect, dtype=A.dtype)``.
    """
    device_matrix = Array(queue, A.shape, A.dtype)
    device_matrix.set(A)

    def matvec(x, y):
        # Forward product; gemv receives ``y`` as its last argument and
        # nothing is returned here.
        blas.gemv(queue, device_matrix, x, y)

    def matvect(x, y):
        # Same call with ``transA=True`` for the transposed product.
        blas.gemv(queue, device_matrix, x, y, transA=True)

    return LinearOperator(A.shape, matvec, rmatvec=matvect, dtype=A.dtype)
import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyclblast

# Sample configuration: element type of all matrices.
dtype = 'float32'

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
# a is (m, k), b is (k, n) and c is (m, n).
m, n, k = 2, 3, 4
a = np.random.rand(m, k).astype(dtype=dtype)
b = np.random.rand(k, n).astype(dtype=dtype)
c = np.random.rand(m, n).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
# One device array per host matrix, allocated first and filled afterwards.
cla, clb, clc = (Array(queue, host.shape, host.dtype) for host in (a, b, c))
for device, host in ((cla, a), (clb, b), (clc, c)):
    device.set(host)

print("# Example level-3 operation: GEMM")
# Each leading dimension equals the column count of the matching matrix.
pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n)
print("# Matrix C result: %s" % clc.get())
print("# Expected result: %s" % (a.dot(b)))
from datetime import datetime

if __name__ == "__main__":
    # OpenCL context and queue.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Small single-precision GEMM problem: (m x k) times (k x n).
    m, n, k = 2, 3, 4
    a = np.random.rand(m, k).astype(dtype=np.float32)
    b = np.random.rand(k, n).astype(dtype=np.float32)
    c = np.empty((m, n), np.float32)

    cla = Array(queue, a.shape, a.dtype)
    clb = Array(queue, b.shape, b.dtype)
    clc = Array(queue, c.shape, c.dtype)
    cla.set(a)
    clb.set(b)
    clc.set(c)

    # Parameter set handed to CLBlast for the 'Xgemm' kernel; only "MWG"
    # is varied in the loop below.
    params = {
        "KWG": 32,
        "KWI": 2,
        "MDIMA": 8,
        "MDIMC": 8,
        "MWG": 64,
        "NDIMB": 8,
        "NDIMC": 8,
        "NWG": 64,
        "SA": 0,
        "SB": 0,
        "STRM": 0,
        "STRN": 0,
        "VWM": 4,
        "VWN": 1
    }

    # Neither the target device nor the reference product changes between
    # iterations, so both are looked up once.
    device = ctx.devices[0]
    expected = a.dot(b)

    for mwg in (32, 64, 256):
        print("Running sgemm tuned with MWG = %d" % mwg)
        params["MWG"] = mwg
        pyclblast.override_parameters(device, 'Xgemm', 32, params)
        pyclblast.gemm(queue, m, n, k, cla, clb, clc, a_ld=k, b_ld=n, c_ld=n)
        # Every parameter choice must still produce the reference product.
        assert np.allclose(clc.get(), expected), "uh-oh, xgemm isn't behaving correctly"
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel
import numpy

# NOTE(review): ``pyopencl`` itself is used below but is not imported in this
# excerpt -- presumably it is imported above; confirm.
ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host data: 100 int32 entries, every fifth one (rows 0, 5, ..., 95) set to 1.
# The strided slice replaces an explicit index loop over ``xrange``, which
# does not exist on Python 3.
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Copy the host data to the device and wait for the transfer.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Reduction with identity map expression and "a+b" as the combine step.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x",
)

my_sum = krnl(a).get()
queue.finish()

# The input holds 20 ones, so the printed sum should be 20.
# ``print(...)`` with one argument behaves the same on Python 2 and 3.
print(my_sum)
# Host operands: A is (m, n), x has length n and y has length m.
A = np.zeros((m, n), dtype=dtype)
x = np.zeros(n, dtype=dtype)
y = np.zeros(m, dtype=dtype)

rng = np.random.RandomState(1)  # change the seed to see different data
# Fill in the order A, x, y so the random stream is consumed as before.
for host in (A, x, y):
    host[...] = rng.uniform(-1, 1, size=host.shape)

# Device buffers mirroring the host operands.
clA, clx, cly = (Array(queue, host.shape, host.dtype) for host in (A, x, y))

# Only A and x are uploaded before the first product; cly is passed as the
# last gemv argument and read back afterwards.
clA.set(A)
clx.set(x)

# Matrix-vector product (gemv), compared against NumPy.
blas.gemv(queue, clA, clx, cly)
print("Expected: ", A.dot(x))
print("Actual: ", cly.get())

# Transposed product: y is uploaded into cly and the result is read from clx.
cly.set(y)
blas.gemv(queue, clA, cly, clx, transA=True)
print("Expected: ", A.T.dot(y))
print("Actual: ", clx.get())
import pyclblast

# Sample configuration: element type, matrix size and the two scalars
# passed to gemv.
dtype = 'float32'
m, n = 4, 3
alpha = 1.0
beta = 0.0

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
# a is (m, n), x has length n and y has length m.
a = np.random.rand(m, n).astype(dtype=dtype)
x = np.random.rand(n).astype(dtype=dtype)
y = np.random.rand(m).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
# Allocate all three device arrays first, then upload the host data.
cla, clx, cly = (Array(queue, host.shape, host.dtype) for host in (a, x, y))
for device, host in ((cla, a), (clx, x), (cly, y)):
    device.set(host)

print("# Example level-2 operation: GEMV")
# The leading dimension of a equals its column count n.
pyclblast.gemv(queue, m, n, cla, clx, cly, a_ld=n, alpha=alpha, beta=beta)
queue.finish()
print("# Result for vector y: %s" % cly.get())
print("# Expected result: %s" % (alpha * a.dot(x) + beta * y))
def to_ocl(a):
    """Return a device ``Array`` holding a copy of the host array ``a``.

    The array is allocated on the module-level ``queue`` with the shape and
    dtype of ``a`` and then filled from it.
    """
    device_array = Array(queue, a.shape, a.dtype)
    device_array.set(a)
    return device_array
# Random input vectors on the host.
n = 5
# NOTE(review): the previous comment said "also supports 'float64'" next to a
# 'float64' setting; presumably 'float32' is the other supported choice --
# confirm.
dtype = 'float64'
x = np.zeros(n, dtype=dtype)
y = np.zeros(n, dtype=dtype)

rng = np.random.RandomState(1)  # change the seed to see different data
# Fill x first, then y, so the random stream is consumed as before.
for host in (x, y):
    host[...] = rng.uniform(-1, 1, size=host.shape)

# Device buffers: the two operands plus a one-element array for the result.
clx = Array(queue, x.shape, x.dtype)
cly = Array(queue, y.shape, y.dtype)
cld = Array(queue, 1, x.dtype)

# Upload the operands.
clx.set(x)
cly.set(y)

# Dot product; cld is passed as the last argument and read back below.
blas.dot(queue, clx, cly, cld)

# Compare against NumPy.
print("Expected: ", x.dot(y))
print("Actual: ", cld.get()[0])

# Shut the BLAS library down again.
blas.teardown()
import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyclblast

# Sample configuration: element type, scale factor and vector length.
dtype = 'float32'
alpha = 1.5
n = 4

print("# Setting up OpenCL")
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

print("# Setting up Numpy arrays")
x = np.random.rand(n).astype(dtype=dtype)
y = np.random.rand(n).astype(dtype=dtype)

print("# Setting up OpenCL arrays")
# Allocate both device vectors first, then upload the host data.
clx, cly = (Array(queue, host.shape, host.dtype) for host in (x, y))
for device, host in ((clx, x), (cly, y)):
    device.set(host)

print("# Example level-1 operation: AXPY")
pyclblast.axpy(queue, n, clx, cly, alpha=alpha)
queue.finish()
print("# Result for vector y: %s" % cly.get())
# The host copies are untouched, so the reference value uses the original y.
print("# Expected result: %s" % (alpha * x + y))
import pyopencl
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel
import numpy

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host data: 100 int32 entries, every fifth one (rows 0, 5, ..., 95) set to 1.
# The strided slice replaces an explicit index loop over ``xrange``, which
# does not exist on Python 3.
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Copy the host data to the device and wait for the transfer.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Reduction with identity map expression and "a+b" as the combine step.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x",
)

my_sum = krnl(a).get()
queue.finish()

# The input holds 20 ones, so the printed sum should be 20.
# ``print(...)`` with one argument behaves the same on Python 2 and 3.
print(my_sum)
# Random right-hand sides, drawn in the order x, x1, x2.
for rhs in (x, x1, x2):
    rhs[...] = rng.uniform(-1, 1, size=rhs.shape)

# Split A into its triangles; both are taken from the full matrix before A
# is rebound to the lower triangle.
A_upper, A = np.triu(A), np.tril(A)

# Device buffers for both triangles and the three right-hand sides.
clA = Array(queue, A.shape, A.dtype)
clA_upper = Array(queue, A.shape, A.dtype)
clx = Array(queue, x.shape, x.dtype)
clx1 = Array(queue, x1.shape, x1.dtype)
clx2 = Array(queue, x2.shape, x2.dtype)

# Upload the matrices and the first right-hand side.
clA.set(A)
clA_upper.set(A_upper)
clx.set(x)

# Triangular solve (trsv); the solution is read back from clx.
blas.trsv(queue, clA, clx)

# Compare against NumPy's general solver.
print("Expected: ", np.linalg.solve(A, x))
print("Actual: ", clx.get())
print()

# Same solve against the transpose, using the second right-hand side.
clx1.set(x1)
blas.trsv(queue, clA, clx1, transA=True)
print("Expected: ", np.linalg.solve(A.T, x1))