def cholesky_blocked_inplace(shape, num_gpus):
    """
    This is a less naive version of dpotrf with one level of blocking.
    Blocks are currently assumed to evenly divide the axis lengths.
    The input array is 4-dimensional. The first and second indices select
    the block (row first, then column). The third and fourth indices select
    the entry within the given block.
    """
    if shape[0] * shape[2] != shape[1] * shape[3]:
        raise ValueError("A square matrix is required.")
    if shape[0] != shape[1]:
        raise ValueError("Non-square blocks are not supported.")

    # Define task spaces
    gemm1 = TaskSpace("gemm1")              # Inter-block GEMM
    subcholesky = TaskSpace("subcholesky")  # Cholesky on block
    gemm2 = TaskSpace("gemm2")              # Inter-block GEMM
    solve = TaskSpace("solve")              # Triangular solve

    for j in range(shape[0]):
        for k in range(j):
            # Inter-block GEMM: rank-k update of diagonal block (j, j)
            # using panel (j, k); depends on that panel's triangular solve.
            @spawn(gemm1[j, k], [solve[j, k]], placement=[gpu(j % num_gpus)])
            def t1():
                out = get_gpu_memory(j, j, num_gpus)
                rhs = get_gpu_memory(j, k, num_gpus)
                out = update(rhs, rhs, out)
                set_gpu_memory_from_gpu(j, j, num_gpus, out)

        # Cholesky on the diagonal block, after all its updates complete
        @spawn(subcholesky[j], [gemm1[j, 0:j]], placement=[gpu(j % num_gpus)])
        def t2():
            dblock = get_gpu_memory(j, j, num_gpus)
            dblock = cholesky(dblock)
            set_gpu_memory_from_gpu(j, j, num_gpus, dblock)

        for i in range(j + 1, shape[0]):
            for k in range(j):
                # Inter-block GEMM: update block (i, j) with panels (i, k)
                # and (j, k), both of which must already be solved.
                @spawn(gemm2[i, j, k], [solve[j, k], solve[i, k]],
                       placement=[gpu(i % num_gpus)])
                def t3():
                    out = get_gpu_memory(i, j, num_gpus)
                    rhs1 = get_gpu_memory(i, k, num_gpus)
                    rhs2 = get_gpu_memory(j, k, num_gpus)
                    out = update(rhs1, rhs2, out)
                    set_gpu_memory_from_gpu(i, j, num_gpus, out)

            # Triangular solve of panel (i, j) against the factored diagonal
            @spawn(solve[i, j], [gemm2[i, j, 0:j], subcholesky[j]],
                   placement=[gpu(i % num_gpus)])
            def t4():
                factor = get_gpu_memory(j, j, num_gpus)
                panel = get_gpu_memory(i, j, num_gpus)
                out = ltriang_solve(factor, panel)
                set_gpu_memory_from_gpu(i, j, num_gpus, out)

    return subcholesky[shape[0] - 1]
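
# The task graph above can be hard to read on its own. Below is a minimal
# sequential NumPy sketch of the same one-level blocked algorithm, useful as
# a reference for the dependency structure (gemm1 -> subcholesky,
# gemm2 -> solve). This is an illustration only, not part of the Parla code;
# the name `cholesky_blocked_reference` is hypothetical.
import numpy as np

def cholesky_blocked_reference(a, block_size):
    n = a.shape[0]
    assert n % block_size == 0, "blocks must evenly divide the matrix"
    L = np.tril(a).astype(float)
    nb = n // block_size
    blk = lambda i, j: L[i*block_size:(i+1)*block_size,
                         j*block_size:(j+1)*block_size]
    for j in range(nb):
        for k in range(j):
            # gemm1: rank-k update of the diagonal block
            blk(j, j)[...] -= blk(j, k) @ blk(j, k).T
        # subcholesky: factor the updated diagonal block
        blk(j, j)[...] = np.linalg.cholesky(blk(j, j))
        for i in range(j + 1, nb):
            for k in range(j):
                # gemm2: update the panel block
                blk(i, j)[...] -= blk(i, k) @ blk(j, k).T
            # solve: blk(i, j) = blk(i, j) @ inv(blk(j, j)).T
            blk(i, j)[...] = np.linalg.solve(blk(j, j), blk(i, j).T).T
    return L

# Quick self-check on a small SPD matrix:
#   rng = np.random.default_rng(0)
#   m = rng.standard_normal((8, 8))
#   spd = m @ m.T + 8 * np.eye(8)
#   L = cholesky_blocked_reference(spd, 4)
#   assert np.allclose(L @ L.T, spd)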
async def test_tsqr_blocked(placement=cpu):
    for i in range(WARMUP + ITERS):
        # Reset all iteration-specific timers and counters
        perf_stats.reset()

        # Original matrix
        np.random.seed(i)
        A = np.random.rand(NROWS, NCOLS)

        if PLACEMENT_STRING == 'puregpu':
            if NROWS % NGPUS != 0:
                raise ValueError(
                    "Pure GPU version requires NROWS %% NGPUS == 0 "
                    "(currently %i %% %i)" % (NROWS, NGPUS))

            # Partition matrix on GPUs
            mapper = LDeviceSequenceBlocked(
                NGPUS, placement=[gpu(dev) for dev in range(NGPUS)])
            A_dev = mapper.partition_tensor(A)

            tot_start = time()
            Q_dev, R_dev = await tsqr_blocked_puregpu(A_dev, BLOCK_SIZE)
            tot_end = time()

            # Copy the data back to the host
            if CHECK_RESULT:
                Q = np.empty(shape=(0, NCOLS))
                for dev in range(NGPUS):
                    with cp.cuda.Device(dev):
                        Q = np.vstack((Q, cp.asnumpy(Q_dev[dev])))
                R = cp.asnumpy(R_dev)
        else:
            # Normal version: run and time the algorithm
            tot_start = time()
            Q, R = await tsqr_blocked(A, BLOCK_SIZE)
            tot_end = time()

        perf_stats.tot_time = tot_end - tot_start

        # Combine task timings into totals for this iteration
        perf_stats.consolidate_stats()

        if i >= WARMUP:
            iteration = i - WARMUP
            if CSV:
                perf_stats.print_stats_csv(iteration)
            else:
                perf_stats.print_stats(iteration)

        # Check the results
        if CHECK_RESULT:
            if check_result(A, Q, R):
                print("\nCorrect result!\n")
            else:
                print("***** ERROR: Incorrect final result! *****")
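
# `check_result` is called above but not defined in this excerpt. A plausible
# implementation (the body below is an assumption, including the name and
# tolerance) would verify the three defining properties of a reduced QR
# factorization:
def check_result_sketch(A, Q, R, tol=1e-8):
    # A ~= Q @ R (reconstruction)
    is_factorization = np.allclose(Q @ R, A, atol=tol)
    # Q has orthonormal columns
    is_orthonormal = np.allclose(Q.T @ Q, np.eye(Q.shape[1]), atol=tol)
    # R is upper triangular
    is_triangular = np.allclose(R, np.triu(R), atol=tol)
    return is_factorization and is_orthonormal and is_triangular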
def test_placement_await():
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("CUDA required for this test.")

    devices = [cpu(0), gpu(0)]

    for rep in repetitions():
        task_results = []
        for i in range(2):
            @spawn(placement=devices[i])
            async def task():
                task_results.append(get_current_device())
                await tasks()  # Await nothing to force a new task.
                task_results.append(get_current_device())
            sleep_until(lambda: len(task_results) == (i + 1) * 2)

        assert task_results == [cpu(0), cpu(0), gpu(0), gpu(0)]
def test_placement_data(runtime_sched):
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("Test needs cuda.")
        return

    devices = [cpu(0), gpu(0)]

    for rep in repetitions():
        task_results = []
        for i, dev in enumerate(devices):
            d = dev.memory()(np.array([1, 2, 3]))

            @spawn(placement=d)
            def task():
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == i + 1)

        assert task_results == devices
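
# Both tests above poll with `sleep_until`, which comes from the test
# utilities and is not shown here. A hypothetical sketch of such a helper
# (the timeout and polling period are assumptions):
import time as _time

def sleep_until_sketch(predicate, timeout=5.0, period=0.05):
    deadline = _time.monotonic() + timeout
    while _time.monotonic() < deadline:
        if predicate():  # Stop as soon as the condition holds
            return
        _time.sleep(period)
    assert predicate(), "condition not met before timeout"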
NCOLS = args.cols
BLOCK_SIZE = args.block_size
ITERS = args.iterations
WARMUP = args.warmup
NTHREADS = args.threads
NGPUS = args.ngpus
PLACEMENT_STRING = args.placement
CHECK_RESULT = args.check_result
CSV = args.csv

# Set up PLACEMENT variable
if PLACEMENT_STRING == 'cpu':
    PLACEMENT = cpu
    ACUS = None
elif PLACEMENT_STRING == 'gpu':
    PLACEMENT = [gpu(i) for i in range(NGPUS)]
    ACUS = None
elif PLACEMENT_STRING == 'both':
    PLACEMENT = [cpu(0)] + [gpu(i) for i in range(NGPUS)]
    ACUS = 1
elif PLACEMENT_STRING == 'puregpu':
    PLACEMENT = [gpu(i) for i in range(NGPUS)]
    ACUS = None
    BLOCK_SIZE = NROWS // NGPUS
else:
    raise ValueError(
        "Invalid value for placement. "
        "Must be 'cpu', 'gpu', 'both', or 'puregpu'.")

perf_stats = perfStats(ITERS, NROWS, BLOCK_SIZE)
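
# The block above consumes an `args` namespace produced elsewhere. A minimal
# argparse setup that would yield these attributes might look like the
# following; the flag spellings and defaults are assumptions, not the
# benchmark's actual CLI:
import argparse

parser = argparse.ArgumentParser(description="Blocked TSQR benchmark")
parser.add_argument("-r", "--rows", type=int, default=10000)
parser.add_argument("-c", "--cols", type=int, default=100)
parser.add_argument("-b", "--block_size", type=int, default=1000)
parser.add_argument("-i", "--iterations", type=int, default=10)
parser.add_argument("-w", "--warmup", type=int, default=2)
parser.add_argument("-t", "--threads", type=int, default=4)
parser.add_argument("-g", "--ngpus", type=int, default=1)
parser.add_argument("-p", "--placement", default="cpu",
                    choices=["cpu", "gpu", "both", "puregpu"])
parser.add_argument("--check_result", action="store_true")
parser.add_argument("--csv", action="store_true")
args = parser.parse_args()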