async def f():
    """Time a local sparse mat-vec product against the distributed one.

    A random test matrix and vector are built, ``mat.dot(vec)`` is timed
    locally, and then the matrix is distributed over a gang of
    ``cfg.n_workers`` workers. The distributed product is run once outside
    the profiler (reported as "first dot") and ``n_repeats`` more times
    inside it. The sums of the local and distributed results are printed and
    asserted to be exactly equal.
    """
    n_rows = int(1e7)
    n_nonzero = n_rows * 10
    n_repeats = 1

    matrix = random_test_matrix(n_rows, n_nonzero)
    vector = np.random.rand(n_rows) - 0.5

    timer = tsk.Timer()

    # Local reference product; only the value from the last repeat is kept.
    for _ in range(n_repeats):
        expected = matrix.dot(vector)
    timer.report("simple dot")

    gang = await tsk.ctx().wait_for_workers(cfg.n_workers)
    timer.report("wait for workers")

    timer.report("launch profiler")

    shared_vector = TskArray(vals=vector)
    timer.report("shmem v")

    distributed_matrix = distribute(matrix, gang)
    timer.report("distribute mat")

    result = await distributed_matrix.dot(shared_vector)
    timer.report("first dot")

    async with tsk.Profiler(gang):
        timer.restart()
        for _ in range(n_repeats):
            result = await distributed_matrix.dot(shared_vector)
        timer.report("parallel dot")

    # NOTE(review): the checks below are assumed to run after the profiler
    # block has exited -- confirm against the intended nesting.
    expected_sum = np.sum(expected)
    print(expected_sum)
    actual_sum = np.sum(result)
    print(actual_sum)
    assert actual_sum == expected_sum
async def f():
    """Launch ``n_jobs`` tasks running ``long_fnc`` and print the elapsed time.

    Waits for two workers, stores ``long_fnc`` once with ``tsk.put``, and
    then, under the profiler, submits one task per job. Job ``k`` is sent to
    ``gang[k % len(gang)]``. After all tasks have been awaited with
    ``wait_all`` the wall-clock time since submission started is printed.
    """
    gang = await tsk.ctx().wait_for_workers(2)
    fnc_dref = tsk.put(long_fnc)

    async with tsk.Profiler(gang):
        start = time.time()

        pending = []
        for job in range(n_jobs):
            target = gang[job % len(gang)]
            pending.append(tsk.task(fnc_dref, to=target))
        await wait_all(pending)

        elapsed = time.time() - start
        print("inside: ", elapsed)
async def submit(w):
    """Check that a task sees the same data in an allocated buffer.

    A buffer of ``4e8`` float64 values is allocated with ``tsk.alloc``,
    filled with random numbers through a NumPy view, and summed locally.
    Then, twice and under the profiler each time, a task is submitted with
    ``to=1`` that sums the buffer obtained from ``ref.get()``. The returned
    sum must equal the local one exactly.
    """
    n_values = int(4e8)
    ref = tsk.alloc(w, n_values * 8)  # 8 bytes per float64 value

    local_view = np.frombuffer(await ref.get(), dtype=np.float64)
    local_view[:] = np.random.rand(n_values)
    rhs = np.sum(local_view)

    for _ in range(2):
        async with tsk.Profiler(w, range(2)):

            async def remote(w):
                remote_view = np.frombuffer(
                    await ref.get(), dtype=np.float64
                )
                return np.sum(remote_view)

            lhs = await tsk.task(w, remote, to=1)
            assert lhs == rhs
async def submit():
    """Check that a task on ``gang[1]`` sees the same allocated buffer data.

    Waits for two workers, allocates a buffer of ``4e8`` float64 values with
    ``tsk.alloc``, fills it with random numbers through a NumPy view, and
    sums it locally. Then, twice and under the profiler each time, a task is
    sent to ``gang[1]`` that sums the buffer obtained from ``ref.get()``.
    The returned sum must equal the local one exactly.
    """
    gang = await tsk.ctx().wait_for_workers(2)

    n_values = int(4e8)
    ref = tsk.alloc(n_values * 8)  # 8 bytes per float64 value

    local_view = np.frombuffer(await ref.get(), dtype=np.float64)
    local_view[:] = np.random.rand(n_values)
    rhs = np.sum(local_view)

    for _ in range(2):
        async with tsk.Profiler(gang):

            async def remote():
                remote_view = np.frombuffer(
                    await ref.get(), dtype=np.float64
                )
                return np.sum(remote_view)

            lhs = await tsk.task(remote, to=gang[1])
            assert lhs == rhs
async def submit(w):
    """Benchmark a CSR mat-vec product locally and through ``map``.

    Builds a test matrix with ``make_test_matrix`` and a random vector, times
    a block of array copies and one local ``_sparse.csrmv`` call, stores byte
    views of the matrix arrays and the vector with ``tsk.put``, and then
    calls ``map`` with ``dot_chunk`` 50 times under the profiler.

    Nothing in this function compares the output buffer with ``correct``;
    only timings and the total byte count are reported.

    NOTE(review): the original layout of this function was lost, so the
    extent of each loop body below is a best guess -- confirm.
    """
    t = tsk.Timer()
    nrows = int(1e8)
    # nnz = 5 * nrows
    A = make_test_matrix(nrows, 1)
    t.report('build csr')
    v = np.random.rand(nrows)
    t.report('gen v')
    correct = v.copy()  # previously: np.empty(A.shape[0])
    # Times 100 full-array copies, reported together as 'copy'.
    # NOTE(review): assumes only the copy is inside the loop -- confirm.
    for i in range(100):
        v[:] = correct
    t.report('copy')
    # Local reference product, written into ``correct``.
    # NOTE(review): meaning of the trailing boolean flag is not visible
    # here -- confirm against _sparse.csrmv.
    _sparse.csrmv(A.indptr, A.indices, A.data, v, correct, True)
    t.report('csrmv')
    # Store each array as a flat byte view ('B' cast of its buffer).
    data_dref = tsk.put(w, value=A.data.data.cast('B'), eager_alloc=1)
    indptr_dref = tsk.put(w, value=A.indptr.data.cast('B'), eager_alloc=1)
    indices_dref = tsk.put(w, value=A.indices.data.cast('B'), eager_alloc=1)
    v_dref = tsk.put(w, value=v.data.cast('B'), eager_alloc=1)
    t.report('put matrix')
    print('total bytes',
          A.data.nbytes + A.indptr.nbytes + A.indices.nbytes + v.nbytes * 2)

    async def dot_chunk(w, args):
        """Run ``_sparse.csrmv`` for the rows ``start_row`` .. ``end_row``.

        ``args`` unpacks to ``(start_row, end_row, out_dref)``. All five
        buffers are fetched with ``tsk.remote_get`` and viewed as NumPy
        arrays without copying; results go into the ``out`` view at offsets
        relative to ``start_row``.
        """
        t = tsk.Timer(None)
        start_row, end_row, out_dref = args
        v_buf = await tsk.remote_get(w, v_dref)
        out_buf = await tsk.remote_get(w, out_dref)
        data_buf = await tsk.remote_get(w, data_dref)
        indptr_buf = await tsk.remote_get(w, indptr_dref)
        indices_buf = await tsk.remote_get(w, indices_dref)
        v = np.frombuffer(v_buf, dtype=np.float64)
        # One extra entry so the slice also holds the end offset of the
        # last row in the range.
        indptr = np.frombuffer(
            indptr_buf, dtype=np.int32)[start_row:end_row + 1]
        indices = np.frombuffer(indices_buf, dtype=np.int32)
        data = np.frombuffer(data_buf, dtype=np.float64)
        out = np.frombuffer(out_buf, dtype=np.float64)
        t.report(str(w.addr) + ' setup')
        inner_chunk_size = int(1e9)
        # Process the row range in blocks of ``inner_chunk_size`` rows.
        for i in range(0, indptr.shape[0], inner_chunk_size):
            _sparse.csrmv(
                indptr[i:(i + inner_chunk_size + 1)], indices, data, v,
                out[(start_row + i):(start_row + i + inner_chunk_size)],
                False)
            # Yield to the event loop between blocks.
            # NOTE(review): assumed to be inside the loop -- confirm.
            await asyncio.sleep(0)
        t.report(str(w.addr) + ' dot')

    dot_chunk_dref = put_fnc(w, dot_chunk)
    t.report('put fnc')
    out_dref = tsk.alloc(w, nrows * 8)
    # This local view of the output buffer is not read again in this
    # function.
    out = np.frombuffer(w.memory.get_local(out_dref), dtype=np.float64)
    t.report('alloc out')
    n_super_chunks = int(np.floor(np.sqrt(n_cores)))
    async with tsk.Profiler(w, range(1)):
        # ``map`` is a project helper, not the builtin; presumably it splits
        # ``nrows`` into row ranges and calls ``dot_chunk`` with
        # (start_row, end_row, out_dref) -- confirm against its definition.
        for i in range(50):
            await map(w, dot_chunk_dref, nrows, out_dref,
                      n_super_chunks=n_super_chunks)
        t.report('dot')
async def submit2(w):
    """Benchmark a CSR mat-vec product where each chunk builds its own matrix.

    A random vector is stored with ``w.memory.put``; ``map`` is then called
    with ``build_matrix`` to create one test matrix per row range, and
    afterwards with ``dot`` four times under the profiler.

    An earlier variant, previously kept here as commented-out code, built a
    single global CSR matrix, stored its arrays, and had each task slice and
    copy its own row range (``build_local_matrix``). It also contained a
    ``rand_vec`` helper that generated the vector in chunks.

    NOTE(review): the original layout of this function was lost, so the
    extent of each loop body below is a best guess -- confirm.
    """
    t = tsk.Timer()
    nrows = int(5e7)
    n_super_chunks = int(np.floor(np.sqrt(n_cores)))

    v = np.random.rand(nrows)
    # Store the vector as a flat byte view ('B' cast of its buffer).
    v_dref = w.memory.put(value=v.data.cast('B'), eager_alloc=1)
    t.report('gen v')

    async def build_matrix(w, args):
        """Create a test matrix and a copy of the vector for one row range.

        ``args`` unpacks to ``(start_row, end_row)``. Returns
        ``(v_dref_out, matrix_dref)``: references to a copy of the full
        vector and to the tuple ``(indptr, indices, data, out)``.
        """
        start_row, end_row = args
        A = make_test_matrix(end_row - start_row, 1)
        out = np.empty(end_row - start_row)
        matrix = (A.indptr, A.indices, A.data, out)
        matrix_dref = tsk.put(w, value=matrix)
        v_buf = await tsk.remote_get(w, v_dref)
        # ``.copy()`` gives this chunk its own array, separate from the
        # fetched buffer.
        v = np.frombuffer(v_buf, dtype=np.float64).copy()
        v_dref_out = tsk.put(w, value=v)
        return (v_dref_out, matrix_dref)

    build_dref = put_fnc(w, build_matrix)
    # ``map`` is a project helper, not the builtin; presumably it returns one
    # build_matrix result per chunk -- confirm against its definition.
    matrix_chunks = await map(w, build_dref, nrows,
                              n_super_chunks=n_super_chunks)
    t.report('distribute matrix')

    async def dot(w, args):
        """Run ``_sparse.csrmv`` on the matrix chunk selected by ``args``.

        ``args`` unpacks to ``(istart, iend, st)`` and must cover exactly one
        chunk (``iend == istart + 1``). ``st`` is not used.
        """
        # Timer output is discarded unless the worker address is 10.
        # NOTE(review): the significance of address 10 is not visible here.
        t = tsk.Timer(output_fnc=lambda x: None)
        if w.addr == 10:
            t = tsk.Timer()
        istart, iend, st = args
        assert (iend == istart + 1)
        v_dref, matrix_dref = matrix_chunks[istart]
        # Both objects are read with ``get_local``, so they are presumably
        # expected to be resident on this worker already -- confirm.
        indptr, indices, data, out = w.memory.get_local(matrix_dref)
        v = w.memory.get_local(v_dref)
        inner_chunk_size = int(1e9)
        t.report(str(w.addr) + ' setup')
        # Process the chunk's rows in blocks of ``inner_chunk_size`` rows.
        for i in range(0, indptr.shape[0], inner_chunk_size):
            _sparse.csrmv(indptr[i:(i + inner_chunk_size + 1)], indices,
                          data, v, out[i:(i + inner_chunk_size)], False)
            # Yield to the event loop between blocks.
            # NOTE(review): assumed to be inside the loop -- confirm.
            await asyncio.sleep(0)
        t.report(str(w.addr) + ' dot')

    dot_dref = put_fnc(w, dot)
    # NOTE(review): ``range(0)`` is empty; presumably this profiles no
    # workers -- confirm against tsk.Profiler.
    async with tsk.Profiler(w, range(0)):
        t.report('put/startprof')
        # NOTE(review): the blank prints, the map call and the report are
        # all assumed to be inside the loop -- confirm.
        for i in range(4):
            print('')
            print('')
            print('')
            await map(w, dot_dref, len(matrix_chunks), time.time(),
                      n_super_chunks=n_super_chunks)
            t.report('dot')