def test_basic():
    """Check ragged gather-gemv against NumPy on a tiny hand-written case.

    Every plan in ``prog.plans`` is executed, then for each index ``i`` the
    device value ``clY[i]`` is compared with
    ``alpha * dot(A[A_js[i]], X[X_js[i]]) + beta * Y[i]`` computed on the host.

    ``ctx`` is not a parameter here; it must be available from the
    surrounding scope.
    """
    # -- host-side operands
    A = RA([[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3]])

    # -- one (matrix index, vector index) pair per output
    A_js = RA([[1], [0]], dtype=np.int32)
    X_js = RA([[0], [0]], dtype=np.int32)

    # -- scaling factors (alternatives: alpha = 0.5, beta = 0.1)
    alpha, beta = 1.0, 1.0

    # -- device-side copies of the operands
    queue = cl.CommandQueue(ctx)
    dev_A = CLRA(queue, A)
    dev_X = CLRA(queue, X)
    dev_Y = CLRA(queue, Y)

    # the upload must round-trip before the kernel result is trusted
    for host, device in ((A, dev_A), (X, dev_X), (Y, dev_Y)):
        assert allclose(host, device)

    # -- build and run every plan of the program
    program = plan_ragged_gather_gemv(
        queue, alpha, dev_A, A_js, dev_X, X_js, beta, dev_Y)
    for step in program.plans:
        step()

    # -- compare the device output with the host reference
    for i in range(len(A_js)):
        a_idx = int(A_js[i])
        x_idx = int(X_js[i])
        expected = alpha * np.dot(A[a_idx], X[x_idx]) + beta * Y[i]
        actual = dev_Y[i]
        assert np.allclose(expected, actual)
def test_basic(self):
    """Run a single ragged gather-gemv plan on a tiny problem and verify it.

    The operands and both index arrays are copied to the device, the plan
    returned by ``plan_ragged_gather_gemv`` is called once, and each
    ``clY[i]`` is compared with
    ``alpha * dot(A[A_js[i]], X[X_js[i]]) + beta * Y[i]`` computed on the host.

    ``ctx`` is not a parameter here; it must be available from the
    surrounding scope.
    """
    # -- prepare initial conditions on host
    A = RA([[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3]])
    A_js = RA([[1], [0]])
    X_js = RA([[0], [0]])
    alpha = 0.5
    beta = 0.1

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA(queue, A)
    clX = CLRA(queue, X)
    clY = CLRA(queue, Y)
    clA_js = CLRA(queue, A_js)
    clX_js = CLRA(queue, X_js)

    # the device copies must match the host data before running the kernel
    assert allclose(A, clA)
    assert allclose(X, clX)
    assert allclose(Y, clY)
    assert allclose(A_js, clA_js)
    assert allclose(X_js, clX_js)

    # -- run cl computation
    plan = plan_ragged_gather_gemv(
        queue, alpha, clA, clA_js, clX, clX_js, beta, clY)
    plan()

    # -- ensure they match
    # range() rather than the Python 2-only xrange(): valid on both versions
    for i in range(len(A_js)):
        # index entries are stored as arrays; collapse them to plain ints
        aj, xj = int(A_js[i]), int(X_js[i])
        ref = alpha * np.dot(A[aj], X[xj]) + beta * Y[i]
        sim = clY[i]
        assert np.allclose(ref, sim)
def test_basic(ctx):
    """Check ragged gather-gemv against NumPy on a tiny hand-written case.

    Parameters
    ----------
    ctx : context object passed to ``cl.CommandQueue``

    Every plan in ``prog.plans`` is executed, then for each index ``i`` the
    device value ``clY[i]`` is compared with
    ``alpha * dot(A[A_js[i]], X[X_js[i]]) + beta * Y[i]`` computed on the host.
    """
    # -- host-side operands
    A = RA([[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3]])

    # -- one (matrix index, vector index) pair per output
    A_js = RA([[1], [0]], dtype=np.int32)
    X_js = RA([[0], [0]], dtype=np.int32)

    # -- scaling factors (alternatives: alpha = 0.5, beta = 0.1)
    alpha, beta = 1.0, 1.0

    # -- device-side copies of the operands
    queue = cl.CommandQueue(ctx)
    dev_A = CLRA(queue, A)
    dev_X = CLRA(queue, X)
    dev_Y = CLRA(queue, Y)

    # the upload must round-trip before the kernel result is trusted
    for host, device in ((A, dev_A), (X, dev_X), (Y, dev_Y)):
        assert ra_allclose(host, device)

    # -- build and run every plan of the program
    program = plan_ragged_gather_gemv(
        queue, alpha, dev_A, A_js, dev_X, X_js, beta, dev_Y)
    for step in program.plans:
        step()

    # -- compare the device output with the host reference
    for i, _ in enumerate(A_js):
        a_idx = int(A_js[i])
        x_idx = int(X_js[i])
        expected = alpha * np.dot(A[a_idx], X[x_idx]) + beta * Y[i]
        actual = dev_Y[i]
        assert np.allclose(expected, actual)
def _test_random(self, k=4, p=1, m=10, n=10):
    """Compare ragged gather-gemv with NumPy on seeded random operands.

    Parameters
    ----------
    k : number of operations (length of A_js)
    p : number of dots per operation (width of A_js)
    m : output dimensions
    n : input dimensions

    The generator is seeded with a fixed value, so the operands are the same
    on every run. ``ctx`` is not a parameter here; it must be available from
    the surrounding scope.
    """
    rng = np.random.RandomState(3294)

    # -- prepare initial conditions on host
    # NOTE: the draw order below fixes the generated values; keep it as is.
    aa = [rng.normal(size=(m, n)) for _ in range(k)]
    xx = [rng.normal(size=n) for _ in range(k)]
    yy = [rng.normal(size=m) for _ in range(k)]
    ajs = [rng.randint(k, size=p) for _ in range(k)]
    xjs = [rng.randint(k, size=p) for _ in range(k)]

    A = RA(aa)
    X = RA(xx)
    Y = RA(yy)
    A_js = RA(ajs)
    X_js = RA(xjs)

    alpha = 0.5
    beta = 0.1

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA(queue, A)
    clX = CLRA(queue, X)
    clY = CLRA(queue, Y)
    clA_js = CLRA(queue, A_js)
    clX_js = CLRA(queue, X_js)

    # the device copies must match the host data before running the kernel
    assert allclose(A, clA)
    assert allclose(X, clX)
    assert allclose(Y, clY)
    assert allclose(A_js, clA_js)
    assert allclose(X_js, clX_js)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, clA_js, clX, clX_js, beta, clY)

    # single-argument print(...) gives the same output on Python 2 and 3
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in prog.plans:
        print(plan)

    prog()

    # -- ensure they match
    for i in range(k):
        # accumulate all p dot products of operation i on top of beta * Y[i]
        ref = beta * Y[i]
        for aj, xj in zip(A_js[i], X_js[i]):
            ref += alpha * np.dot(A[aj], X[xj])
        sim = clY[i]
        assert np.allclose(ref, sim, atol=1e-3, rtol=1e-3)
def _test_random(self, k=4, p=1, m=10, n=10):
    """Compare ragged gather-gemv with NumPy on seeded random operands.

    Parameters
    ----------
    k : number of operations (length of A_js)
    p : number of dots per operation (width of A_js)
    m : output dimensions
    n : input dimensions

    The generator is seeded with a fixed value, so the operands are the same
    on every run. ``ctx`` is not a parameter here; it must be available from
    the surrounding scope.
    """
    rng = np.random.RandomState(3294)

    # -- prepare initial conditions on host
    # NOTE: the draw order below fixes the generated values; keep it as is.
    aa = [rng.normal(size=(m, n)) for _ in range(k)]
    xx = [rng.normal(size=n) for _ in range(k)]
    yy = [rng.normal(size=m) for _ in range(k)]
    ajs = [rng.randint(k, size=p) for _ in range(k)]
    xjs = [rng.randint(k, size=p) for _ in range(k)]

    A = RA(aa)
    X = RA(xx)
    Y = RA(yy)
    A_js = RA(ajs)
    X_js = RA(xjs)

    alpha = 0.5
    beta = 0.1

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA(queue, A)
    clX = CLRA(queue, X)
    clY = CLRA(queue, Y)
    clA_js = CLRA(queue, A_js)
    clX_js = CLRA(queue, X_js)

    # the device copies must match the host data before running the kernel
    assert allclose(A, clA)
    assert allclose(X, clX)
    assert allclose(Y, clY)
    assert allclose(A_js, clA_js)
    assert allclose(X_js, clX_js)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, clA_js, clX, clX_js, beta, clY)

    # single-argument print(...) gives the same output on Python 2 and 3
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in prog.plans:
        print(plan)

    prog()

    # -- ensure they match
    for i in range(k):
        # accumulate all p dot products of operation i on top of beta * Y[i]
        ref = beta * Y[i]
        for aj, xj in zip(A_js[i], X_js[i]):
            ref += alpha * np.dot(A[aj], X[xj])
        sim = clY[i]
        assert np.allclose(ref, sim, atol=1e-3, rtol=1e-3)
def test_basic(self):
    """Run a single ragged gather-gemv plan on a tiny problem and verify it.

    The operands and both index arrays are copied to the device, the plan
    returned by ``plan_ragged_gather_gemv`` is called once, and each
    ``clY[i]`` is compared with
    ``alpha * dot(A[A_js[i]], X[X_js[i]]) + beta * Y[i]`` computed on the host.

    ``ctx`` is not a parameter here; it must be available from the
    surrounding scope.
    """
    # -- prepare initial conditions on host
    A = RA([[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3]])
    A_js = RA([[1], [0]])
    X_js = RA([[0], [0]])
    alpha = 0.5
    beta = 0.1

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA(queue, A)
    clX = CLRA(queue, X)
    clY = CLRA(queue, Y)
    clA_js = CLRA(queue, A_js)
    clX_js = CLRA(queue, X_js)

    # the device copies must match the host data before running the kernel
    assert allclose(A, clA)
    assert allclose(X, clX)
    assert allclose(Y, clY)
    assert allclose(A_js, clA_js)
    assert allclose(X_js, clX_js)

    # -- run cl computation
    plan = plan_ragged_gather_gemv(
        queue, alpha, clA, clA_js, clX, clX_js, beta, clY)
    plan()

    # -- ensure they match
    # range() rather than the Python 2-only xrange(): valid on both versions
    for i in range(len(A_js)):
        # index entries are stored as arrays; collapse them to plain ints
        aj, xj = int(A_js[i]), int(X_js[i])
        ref = alpha * np.dot(A[aj], X[xj]) + beta * Y[i]
        sim = clY[i]
        assert np.allclose(ref, sim)
def test_speed(rng):
    """Time the ragged gather-gemv plans and, when importable, clBLAS gemv.

    Parameters
    ----------
    rng : random generator providing ``randint`` and ``uniform``

    Builds ``k`` float32 matrix/vector problems with random sizes, runs the
    plans chosen by ``prog.choose_plans()`` inside a ``Timer`` and prints the
    duration. If ``pyopencl_blas`` can be imported, the same operands are
    pushed through ``pyopencl_blas.gemv`` and that duration is printed too.
    Nothing is asserted.

    ``ctx`` is not a parameter here; it must be available from the
    surrounding scope.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    # out-of-order queues are an untested alternative:
    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    # number of gemv problems (alternatives: 100, 32, 16)
    k = 300

    # -- random problem sizes (alternative: 4096 for every m and n)
    # NOTE: the rng draw order below fixes the generated data; keep it as is.
    ms = [rng.randint(100, 1000) for _ in range(k)]
    ns = [rng.randint(100, 1000) for _ in range(k)]

    # -- host operands
    aa = [
        rng.uniform(-1, 1, size=(rows, cols)).astype('float32')
        for rows, cols in zip(ms, ns)
    ]
    xx = [rng.uniform(-1, 1, size=cols).astype('float32') for cols in ns]
    yy = [rng.uniform(-1, 1, size=rows).astype('float32') for rows in ms]

    # -- operation j uses matrix j and vector j
    # (alternative: rng.randint(k, size=p) per operation)
    ajs = [np.int32(j) for j in range(k)]
    xjs = [np.int32(j) for j in range(k)]

    # -- scaling factors (alternatives: alpha = 0.5, beta = 0.1)
    alpha, beta = 1.0, 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    chosen = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for step in chosen:
        print(step)

    with Timer() as timer:
        for step in chosen:
            step()

    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def to_device(host):
            """Copy one host array into a new device array on ``queue``."""
            device = cl.array.Array(queue, host.shape, host.dtype)
            device.set(host)
            return device

        clAs = [to_device(a) for a in aa]
        clXs = [to_device(x.ravel()) for x in xx]
        clYs = [to_device(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        # let the uploads finish so they are not part of the timing
        queue.finish()

        # hard-coded switch: False selects the multi-queue variant
        single_queue = False

        with Timer() as timer:
            if single_queue:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    target = queues[i % len(queues)]
                    events.append(pyopencl_blas.gemv(target, A, X, Y))
                for pending in queues:
                    pending.flush()
                cl.wait_for_events(events)

        print("clBLAS: %0.3f" % timer.duration)
def plan_ragged_gather_gemv(self, *args, **kwargs):
    """Call ``plan_ragged_gather_gemv`` with ``self.queue`` as first argument.

    All positional and keyword arguments are forwarded unchanged after the
    queue, and the callee's return value is returned as is.
    """
    queue = self.queue
    return plan_ragged_gather_gemv(queue, *args, **kwargs)
def test_speed(ctx, rng):
    """Time the ragged gather-gemv plans and, when importable, clBLAS gemv.

    Parameters
    ----------
    ctx : context object passed to ``cl.CommandQueue``
    rng : random generator providing ``randint`` and ``uniform``

    Builds ``k`` float32 matrix/vector problems with random sizes, runs the
    plans chosen by ``prog.choose_plans()`` inside a ``Timer`` and prints the
    duration. If ``pyopencl_blas`` can be imported, the same operands are
    pushed through ``pyopencl_blas.gemv`` and that duration is printed too.
    Nothing is asserted.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    # out-of-order queues are an untested alternative:
    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    # number of gemv problems (alternatives: 100, 32, 16)
    k = 300

    # -- random problem sizes (alternative: 4096 for every m and n)
    # NOTE: the rng draw order below fixes the generated data; keep it as is.
    ms = [rng.randint(100, 1000) for _ in range(k)]
    ns = [rng.randint(100, 1000) for _ in range(k)]

    # -- host operands
    aa = [
        rng.uniform(-1, 1, size=(rows, cols)).astype('float32')
        for rows, cols in zip(ms, ns)
    ]
    xx = [rng.uniform(-1, 1, size=cols).astype('float32') for cols in ns]
    yy = [rng.uniform(-1, 1, size=rows).astype('float32') for rows in ms]

    # -- operation j uses matrix j and vector j
    # (alternative: rng.randint(k, size=p) per operation)
    ajs = [np.int32(j) for j in range(k)]
    xjs = [np.int32(j) for j in range(k)]

    # -- scaling factors (alternatives: alpha = 0.5, beta = 0.1)
    alpha, beta = 1.0, 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    chosen = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for step in chosen:
        print(step)

    with Timer() as timer:
        for step in chosen:
            step()

    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def to_device(host):
            """Copy one host array into a new device array on ``queue``."""
            device = cl.array.Array(queue, host.shape, host.dtype)
            device.set(host)
            return device

        clAs = [to_device(a) for a in aa]
        clXs = [to_device(x.ravel()) for x in xx]
        clYs = [to_device(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        # let the uploads finish so they are not part of the timing
        queue.finish()

        # hard-coded switch: False selects the multi-queue variant
        single_queue = False

        with Timer() as timer:
            if single_queue:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    target = queues[i % len(queues)]
                    events.append(pyopencl_blas.gemv(target, A, X, Y))
                for pending in queues:
                    pending.flush()
                cl.wait_for_events(events)

        print("clBLAS: %0.3f" % timer.duration)