def test_float16():
    """Exercise float16 storage through the GPU gemv, gemm and dot22 ops.

    Each sub-case builds float16 shared variables on the test context,
    evaluates the GPU op, and compares against the NumPy reference.
    """
    # --- gemv (a gemm kernel is called under the hood) ---
    inputs = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    shared_vars = [
        gpuarray_shared_constructor(v, target=test_ctx_name) for v in inputs
    ]
    result_var = gemv(*shared_vars)
    fn = aesara.function([], result_var, mode=mode_with_gpu)
    y, alpha, A, x, beta = inputs
    res = fn()
    utt.assert_allclose(np.asarray(res), alpha * np.dot(A, x) + beta * y)
    nodes = fn.maker.fgraph.toposort()
    # The optimizer should have lowered the graph to a GpuGemm node.
    assert any(isinstance(node.op, GpuGemm) for node in nodes)

    # --- gemm ---
    inputs = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    shared_vars = [
        gpuarray_shared_constructor(v, target=test_ctx_name) for v in inputs
    ]
    result_var = gpugemm_no_inplace(*shared_vars)
    fn = aesara.function([], result_var)
    y, alpha, A, x, beta = inputs
    res = fn()
    utt.assert_allclose(np.asarray(res), alpha * np.dot(A, x) + beta * y)

    # --- dot22 ---
    inputs = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]
    shared_vars = [gpuarray_shared_constructor(v) for v in inputs]
    result_var = gpu_dot22(*shared_vars)
    fn = aesara.function([], result_var)
    x, y = inputs
    res = fn()
    utt.assert_allclose(np.asarray(res), np.dot(x, y))
def main(dev1, dev2):
    """Benchmark ``gpu_dot22`` across one and two GPU contexts.

    Initializes *dev1* as context "ctx1" and *dev2* as "ctx2", then times
    several layouts of the same matrix products: both products in one
    context, split across two contexts, as one or two compiled functions,
    with and without a host transfer — first sequentially, then driven
    from two host threads.

    Parameters
    ----------
    dev1, dev2 : str
        GPU device names passed to ``init_dev``.
    """
    init_dev(dev1, "ctx1")
    init_dev(dev2, "ctx2")
    size = 1024 * 16
    data = np.random.randn(size, size).astype("float32")
    val1a = aesara.shared(data, target="ctx1")
    val1b = aesara.shared(data, target="ctx1")
    val1c = aesara.shared(data, target="ctx1")
    val1d = aesara.shared(data, target="ctx1")
    val2a = aesara.shared(data, target="ctx2")
    val2b = aesara.shared(data, target="ctx2")

    f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
    f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])

    def _sync_all(results):
        # Block until every asynchronous GPU result is ready, so the
        # wall-clock timings below measure completed work.
        for res in results:
            res.sync()

    # Warm-up: run each function once so kernel compilation and code
    # loading onto the GPUs do not pollute the timings.
    _sync_all(f1.fn())
    _sync_all(f2.fn())
    _sync_all(f3.fn())
    _sync_all(f4.fn())
    f5.fn()
    f6.fn()

    t = time.time()
    r = f1.fn()
    _sync_all(r)
    t2 = time.time()
    r = None  # drop the GPU buffers before the next benchmark
    print(f"one ctx async {t2 - t:f}")

    t = time.time()
    r = f2.fn()
    _sync_all(r)
    t2 = time.time()
    r = None
    print(f"two ctx async {t2 - t:f}")

    t = time.time()
    r = f3.fn()
    r2 = f4.fn()
    _sync_all(r)
    _sync_all(r2)
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct async {t2 - t:f}")

    t = time.time()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct with transfer {t2 - t:f}")

    # Multi-thread version: drive the two single-context functions from
    # separate host threads so both contexts can run concurrently.
    class _BenchThread(threading.Thread):
        """Run one compiled function, optionally syncing its first output."""

        def __init__(self, name, f, sync):
            threading.Thread.__init__(self)
            self.f = f
            self.name = name
            self.sync = sync

        def run(self):
            r = self.f()
            if self.sync:
                r[0].sync()
            # Keep the result alive on the instance so the main thread can
            # observe it after join().
            self.r = r

    thread1 = _BenchThread("Thread-3", f3, True)
    thread2 = _BenchThread("Thread-4", f4, True)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")

    thread1 = _BenchThread("Thread-5", f5, False)
    thread2 = _BenchThread("Thread-6", f6, False)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")