def test_qr():
    N = 28
    shard_size = 7
    shard_sizes = (shard_size, shard_size)
    X = np.random.randn(N, N)
    X_sharded = BigMatrix("QR_input_X", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    N_blocks = X_sharded.num_blocks(0)
    shard_matrix(X_sharded, X)
    program, meta = qr(X_sharded)
    executor = fs.ProcessPoolExecutor(2)
    program.start()
    print("starting program...")
    futures = [executor.submit(job_runner.lambdapack_run, program,
                               timeout=60, idle_timeout=6, pipeline_width=1)
               for _ in range(3)]
    program.wait()
    program.free()
    Rs = meta["outputs"][0]
    R_remote = Rs.get_block(N_blocks - 1, N_blocks - 1, 0)
    R_local = np.linalg.qr(X)[1][-shard_size:, -shard_size:]
    sign_matrix_local = np.eye(R_local.shape[0])
    sign_matrix_remote = np.eye(R_local.shape[0])
    sign_matrix_local[np.where(np.diag(R_local) <= 0)] *= -1
    sign_matrix_remote[np.where(np.diag(R_remote) <= 0)] *= -1
    # make the signs match
    R_remote *= np.diag(sign_matrix_remote)[:, np.newaxis]
    R_local *= np.diag(sign_matrix_local)[:, np.newaxis]
    assert np.allclose(R_local, R_remote)
def test_cholesky_lambda():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 16
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    futures = run_program_in_pywren(program)
    program.start()
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
    print("great success!")
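# `run_program_in_pywren` is called by the lambda tests in this section but is not
# defined here. The sketch below illustrates one plausible shape for such a helper;
# the executor choice, worker count, and keyword arguments are assumptions rather
# than the library's actual implementation.
def run_program_in_pywren(program, num_workers=2, **run_kwargs):
    # Fan the generic lambdapack job runner out over a handful of pywren workers
    # and hand the resulting futures back to the caller.
    pwex = pywren.default_executor()
    futures = pwex.map(lambda _: job_runner.lambdapack_run(program, **run_kwargs),
                       range(num_workers))
    return futures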
def test_elemwise_uop(self, f, f_numpy):
    X = np.random.randn(16, 16)
    pwex = pywren.default_executor()
    X_sharded = BigMatrix("{0}_uop_test".format(f), shape=X.shape,
                          shard_sizes=X.shape)
    shard_matrix(X_sharded, X)
    res_sharded = f(pwex, X_sharded)
    res = res_sharded.numpy()
    # free the sharded result (res is a plain numpy array and has no free())
    res_sharded.free()
    assert np.all(np.isclose(f_numpy(X), res))
def test_multiple_shard_matrix_multiply_symmetric_2(self):
    X = np.random.randn(16, 16)
    shard_sizes = [8, 16]
    X_sharded = BigMatrix("gemm_test_1", shape=X.shape, shard_sizes=shard_sizes)
    shard_matrix(X_sharded, X)
    pwex = pywren.lambda_executor()
    XTX_sharded = binops.gemm(pwex, X_sharded.T, X_sharded,
                              X_sharded.bucket, 1, local=True)
    XTX_sharded_local = XTX_sharded.numpy()
    XTX = X.T.dot(X)
    X_sharded.free()
    XTX_sharded.free()
    assert np.all(np.isclose(XTX, XTX_sharded_local))
    os.system("rm -rf /dev/shm/*")
def test_single_shard_gemv(self):
    X = np.random.randn(16, 16)
    Y = np.random.randn(16)
    X_sharded = BigMatrix("gemv_test_0", shape=X.shape, shard_sizes=X.shape)
    Y_sharded = BigMatrix("gemv_test_2", shape=Y.shape, shard_sizes=Y.shape)
    shard_matrix(X_sharded, X)
    # Y must also be uploaded before the gemv call
    shard_matrix(Y_sharded, Y)
    pwex = pywren.default_executor()
    XY_sharded = binops.gemv(pwex, X_sharded, Y_sharded, X_sharded.bucket, 1)
    XY_sharded_local = XY_sharded.numpy()
    XY = X.dot(Y)
    print(XY)
    print(XY_sharded_local)
    X_sharded.free()
    XY_sharded.free()
    assert np.all(np.isclose(XY, XY_sharded_local))
def test_single_shard_matrix_multiply(self):
    fexec = lithops.FunctionExecutor(runtime='jsampe/numpy-lithops:04',
                                     log_level='DEBUG')
    X = np.random.randn(16, 16)
    X_sharded = BigMatrix("gemm_test_0", shape=X.shape, shard_sizes=X.shape,
                          storage=fexec.storage)
    shard_matrix(X_sharded, X)
    XX_sharded = binops.gemm(fexec, X_sharded, X_sharded.T, X_sharded.bucket, 1)
    XX_sharded_local = XX_sharded.numpy()
    XX = X.dot(X.T)
    X_sharded.free()
    XX_sharded.free()
    assert np.all(np.isclose(XX, XX_sharded_local))
    os.system("rm -rf /dev/shm/*")
def test_nested_if_run(self):
    X = np.random.randn(64)
    shard_sizes = (int(X.shape[0] / 8),)
    X_sharded = BigMatrix("if_test", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    O_sharded = BigMatrix("if_test_output", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    X_sharded.free()
    shard_matrix(X_sharded, X)
    f = frontend.lpcompile(f1_if_nested)
    p = f(X_sharded, O_sharded, X_sharded.num_blocks(0))
    num_cores = 1
    executor = fs.ProcessPoolExecutor(num_cores)
    config = npw.config.default()
    p_ex = lp.LambdaPackProgram(p, config=config)
    p_ex.start()
    all_futures = []
    for i in range(num_cores):
        all_futures.append(
            executor.submit(job_runner.lambdapack_run, p_ex,
                            pipeline_width=1, idle_timeout=5, timeout=60))
    p_ex.wait()
    time.sleep(5)
    p_ex.free()
    for i in range(X_sharded.num_blocks(0)):
        Ob = O_sharded.get_block(i)
        Xb = X_sharded.get_block(i)
        if ((i % 2) == 0 and ((i % 3) == 0)):
            assert np.allclose(Ob, 3 * Xb)
        elif ((i % 2) == 0):
            assert np.allclose(Ob, Xb)
        else:
            assert np.allclose(Ob, 2 * Xb)
def test_cholesky():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run, program,
                             timeout=30, idle_timeout=6)
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
    print("great success!")
def test_gemm_lambda():
    size = 32
    A = np.random.randn(size, size)
    B = np.random.randn(size, size)
    C = np.dot(A, B)
    shard_sizes = (8, 8)
    A_sharded = BigMatrix("Gemm_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    B_sharded = BigMatrix("Gemm_test_B", shape=B.shape,
                          shard_sizes=shard_sizes, write_header=True)
    shard_matrix(A_sharded, A)
    shard_matrix(B_sharded, B)
    program, meta = gemm(A_sharded, B_sharded)
    executor = fs.ProcessPoolExecutor(1)
    program.start()
    run_program_in_pywren(program)
    program.wait()
    program.free()
    C_sharded = meta["outputs"][0]
    C_npw = C_sharded.numpy()
    assert np.allclose(C_npw, C)
    return
def test_bdfac_truncated():
    N = 16
    shard_size = 4
    shard_sizes = (shard_size, shard_size)
    np.random.seed(0)
    X = np.random.randn(N, N)
    U, S, V = bdfac_python(X, block_size=shard_size)
    svd_bdfac = np.linalg.svd(S, compute_uv=False)
    svd_local = np.linalg.svd(X, compute_uv=False)
    print(svd_bdfac)
    print(svd_local)
    assert np.allclose(svd_bdfac, svd_local)
    X_sharded = BigMatrix("BDFAC_input_X", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    N_blocks = X_sharded.num_blocks(0)
    shard_matrix(X_sharded, X)
    program, meta = bdfac(X_sharded, truncate=2)
    executor = fs.ProcessPoolExecutor(1)
    program.start()
    executor.submit(job_runner.lambdapack_run, program,
                    timeout=200, idle_timeout=200, pipeline_width=1)
    program.wait()
    print("returned..")
def test_gemm():
    size = 64
    # np.random.seed(0)
    A = np.random.randn(size, size)
    B = np.random.randn(size, size)
    C = np.dot(A, B)
    shard_sizes = (16, 16)
    A_sharded = BigMatrix("Gemm_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    B_sharded = BigMatrix("Gemm_test_B", shape=B.shape,
                          shard_sizes=shard_sizes, write_header=True)
    shard_matrix(A_sharded, A)
    shard_matrix(B_sharded, B)
    program, meta = gemm(A_sharded, B_sharded)
    program.start()
    job_runner.lambdapack_run(program, timeout=60, idle_timeout=6,
                              pipeline_width=3)
    program.wait()
    program.free()
    C_sharded = meta["outputs"][0]
    C_npw = C_sharded.numpy()
    assert np.allclose(C_npw, C)
    return
def cholesky(X, truncate=0):
    S = BigMatrix("Cholesky.Intermediate({0})".format(X.key),
                  shape=(X.num_blocks(1) + 1, X.shape[0], X.shape[0]),
                  shard_sizes=(1, X.shard_sizes[0], X.shard_sizes[0]),
                  bucket=X.bucket,
                  write_header=True)
    O = BigMatrix("Cholesky({0})".format(X.key),
                  shape=(X.shape[0], X.shape[0]),
                  shard_sizes=(X.shard_sizes[0], X.shard_sizes[0]),
                  write_header=True,
                  parent_fn=constant_zeros)
    t = time.time()
    p0 = lpcompile_for_execution(CHOLESKY, inputs=["I"], outputs=["O"])
    p1 = p0(O, X, S, int(np.ceil(X.shape[0] / X.shard_sizes[0])), truncate)
    e = time.time()
    c_time = e - t
    config = npw.config.default()
    program = lp.LambdaPackProgram(p1, config=config)
    return program, {"outputs": [O],
                     "intermediates": [S],
                     "compile_time": c_time}
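# The factory above only builds the LambdaPack program; the tests in this section
# drive it with a local process pool. A minimal sketch of that driving pattern
# (worker count and timeouts here are illustrative, not prescribed by the API):
def run_cholesky_locally(A_sharded, num_workers=2):
    program, meta = cholesky(A_sharded)
    program.start()
    executor = fs.ProcessPoolExecutor(num_workers)
    futures = [executor.submit(job_runner.lambdapack_run, program,
                               timeout=60, idle_timeout=6, pipeline_width=1)
               for _ in range(num_workers)]
    program.wait()
    program.free()
    return meta["outputs"][0]  # the sharded lower-triangular factor L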
def tsqr(X, truncate=0):
    b_fac = 2
    assert X.shard_sizes[1] == X.shape[1]
    shard_size = X.shard_sizes[0]
    shard_sizes = X.shard_sizes
    num_tree_levels = max(
        int(np.ceil(np.log2(X.num_blocks(0)) / np.log2(b_fac))), 1)
    R_sharded = BigMatrix("tsqr_R({0})".format(X.key),
                          shape=(num_tree_levels * shard_size, X.shape[0]),
                          shard_sizes=shard_sizes,
                          write_header=True,
                          safe=False)
    T_sharded = BigMatrix("tsqr_T({0})".format(X.key),
                          shape=(num_tree_levels * shard_size * b_fac, X.shape[0]),
                          shard_sizes=(shard_size * b_fac, shard_size),
                          write_header=True,
                          safe=False)
    V_sharded = BigMatrix("tsqr_V({0})".format(X.key),
                          shape=(num_tree_levels * shard_size * b_fac, X.shape[0]),
                          shard_sizes=(shard_size * b_fac, shard_size),
                          write_header=True,
                          safe=False)
    t = time.time()
    p0 = lpcompile_for_execution(TSQR, inputs=["A"], outputs=["Rs"])
    config = npw.config.default()
    N_blocks = X.num_blocks(0)
    p1 = p0(X, V_sharded, T_sharded, R_sharded, N_blocks)
    e = time.time()
    c_time = e - t
    program = lp.LambdaPackProgram(p1, config=config)
    return program, {"outputs": [R_sharded, V_sharded, T_sharded],
                     "intermediates": [],
                     "compile_time": c_time}
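# tsqr reduces R factors up a binary tree, so the final R sits in the block row
# indexed by the tree depth. The sketch below mirrors how test_tsqr (later in this
# section) reads it back; `size` and `shard_size` are the caller's problem sizes.
def read_final_r(meta, size, shard_size):
    R_sharded = meta["outputs"][0]
    num_tree_levels = int(np.log(np.ceil(size / shard_size)) / np.log(2))
    return R_sharded.get_block(max(num_tree_levels, 0), 0)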
def test_cholesky_timeouts():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("job_runner_test", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run, program,
                             timeout=10, idle_timeout=6)
    time.sleep(15)
    print("checking that the worker has timed out")
    assert int(program.get_up()) == 0
    program.free()
    print("great success!")
def test_simple_slices(self):
    X = np.random.randn(128, 128)
    shard_sizes = [32, 32]
    X_sharded = BigMatrix("test_3", shape=X.shape, shard_sizes=shard_sizes)
    shard_matrix(X_sharded, X)
    assert np.all(X[0:64] == X_sharded.submatrix([2]).numpy())
    assert np.all(X[64:128] == X_sharded.submatrix([2, None]).numpy())
    assert np.all(X[:, 0:96] == X_sharded.submatrix(None, [0, 3]).numpy())
    assert np.all(X[:, 96:128] == X_sharded.submatrix(None, [3, None]).numpy())
def test_multiple_shard_index_get(self):
    X = np.random.randn(128, 128)
    shard_sizes = [64, 64]
    X_sharded = BigMatrix("test_2", shape=X.shape, shard_sizes=shard_sizes)
    shard_matrix(X_sharded, X)
    assert np.all(X[0:64, 0:64] == X_sharded.submatrix(0).get_block(0))
    assert np.all(X[64:128, 64:128] == X_sharded.submatrix(1, 1).get_block())
    assert np.all(X[0:64, 64:128] == X_sharded.submatrix(0, 1).get_block())
    assert np.all(X[64:128, 0:64] == X_sharded.submatrix(None, 0).get_block(1))
def test_step_slices(self):
    X = np.random.randn(128, 128)
    shard_sizes = [16, 16]
    X_sharded = BigMatrix("test_4", shape=X.shape, shard_sizes=shard_sizes)
    shard_matrix(X_sharded, X)
    assert np.all(X[::32] == X_sharded.submatrix([None, None, 2]).numpy()[::16])
    assert np.all(X[16::32] == X_sharded.submatrix([1, None, 2]).numpy()[::16])
    assert np.all(X[:, 0:96:64] ==
                  X_sharded.submatrix(None, [0, 6, 4]).numpy()[:, ::16])
    assert np.all(X[:, 96:128:64] ==
                  X_sharded.submatrix(None, [6, 8, 4]).numpy()[:, ::16])
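# The three slicing tests above exercise BigMatrix.submatrix, whose list arguments
# are read in *block* coordinates ([stop], [start, stop], or [start, stop, step] per
# axis) rather than element coordinates. A small sketch of that reading, using the
# same 128x128 matrix with 32x32 shards as test_simple_slices ("submatrix_demo" is
# just an illustrative key name):
X = np.random.randn(128, 128)
X_sharded = BigMatrix("submatrix_demo", shape=X.shape, shard_sizes=[32, 32])
shard_matrix(X_sharded, X)
# A single-element list is a block-level "stop": block rows 0 and 1 cover elements 0:64.
assert np.all(X_sharded.submatrix([2]).numpy() == X[0:64])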
def test_multiple_shard_matrix_multiply(self):
    X = np.random.randn(16, 16)
    Y = np.random.randn(16, 16)
    shard_sizes = tuple(map(int, np.array(X.shape) / 2))
    X_sharded = BigMatrix("gemm_test_1", shape=X.shape, shard_sizes=shard_sizes)
    Y_sharded = BigMatrix("gemm_test_2", shape=Y.shape, shard_sizes=shard_sizes)
    shard_matrix(X_sharded, X)
    shard_matrix(Y_sharded, Y)
    pwex = pywren.lambda_executor()
    XY_sharded = binops.gemm(pwex, X_sharded, Y_sharded, X_sharded.bucket, 1)
    XY_sharded_local = XY_sharded.numpy()
    XY = X.dot(Y)
    X_sharded.free()
    Y_sharded.free()
    XY_sharded.free()
    assert np.all(np.isclose(XY, XY_sharded_local))
    os.system("rm -rf /dev/shm/*")
def test_multiple_shard_matrix_gemv(self):
    X = np.random.randn(16, 16)
    Y = np.random.randn(16, 1)
    shard_sizes_0 = tuple(map(int, np.array(X.shape) / 2))
    shard_sizes_1 = (Y.shape[0], 1)
    X_sharded = BigMatrix("gemv_test_1", shape=X.shape, shard_sizes=shard_sizes_0)
    Y_sharded = BigMatrix("gemv_test_2", shape=Y.shape, shard_sizes=shard_sizes_1)
    shard_matrix(X_sharded, X)
    shard_matrix(Y_sharded, Y)
    pwex = pywren.default_executor()
    XY_sharded = binops.gemv(pwex, X_sharded, Y_sharded, X_sharded.bucket, 1)
    XY_sharded_local = XY_sharded.numpy()
    XY = X.dot(Y)
    X_sharded.free()
    Y_sharded.free()
    XY_sharded.free()
    assert np.all(np.isclose(XY, XY_sharded_local))
def test_multiple_shard_cholesky(self):
    np.random.seed(1)
    size = 128
    shard_size = 64
    np.random.seed(1)
    print("Generating X")
    executor = fs.ProcessPoolExecutor(cpu_count)
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + np.eye(X.shape[0])
    y = np.random.randn(size)
    pwex = pywren.default_executor()
    print("sharding A")
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigSymmetricMatrix("cholesky_test_A", shape=A.shape,
                                   shard_sizes=shard_sizes)
    y_sharded = BigMatrix("cholesky_test_y", shape=y.shape,
                          shard_sizes=shard_sizes[:1])
    A_sharded.free()
    y_sharded.free()
    A_sharded = BigSymmetricMatrix("cholesky_test_A", shape=A.shape,
                                   shard_sizes=shard_sizes)
    y_sharded = BigMatrix("cholesky_test_y", shape=y.shape,
                          shard_sizes=shard_sizes[:1])
    t = time.time()
    shard_matrix(A_sharded, A, executor=executor)
    e = time.time()
    print("A_sharded", e - t)
    t = time.time()
    shard_matrix(y_sharded, y, executor=executor)
    e = time.time()
    print("y_sharded time", e - t)
    print("Computing LL^{T}")
    L = cholesky(A)
    print(L)
    L_sharded = uops.chol(pwex, A_sharded)
    L_sharded_local = L_sharded.numpy()
    print(L_sharded_local)
    print(L)
    print("L_{infty} difference ", np.max(np.abs(L_sharded_local - L)))
    assert np.allclose(L, L_sharded_local)
    os.system("rm -rf /dev/shm/*")
def test_if_static(self):
    X = np.random.randn(64, 64)
    shard_sizes = (int(X.shape[0] / 8), X.shape[1])
    X_sharded = BigMatrix("if_test", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    O_sharded = BigMatrix("if_test_output", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    X_sharded.free()
    shard_matrix(X_sharded, X)
    f = frontend.lpcompile(f1_if)
    p = f(X_sharded, O_sharded, X_sharded.num_blocks(0))
    assert p.starters == p.find_terminators()
    for s, var_values in p.starters:
        if (var_values['i'] % 2 == 0):
            assert s == 0
        else:
            assert s == 1
def test_tsqr():
    np.random.seed(1)
    size = 256
    shard_size = 32
    X = np.random.randn(size, shard_size)
    Q, R = np.linalg.qr(X)
    q0, r0 = np.linalg.qr(X[:2, :2])
    q1, r1 = np.linalg.qr(X[2:, :2])
    r2 = np.linalg.qr(np.vstack((r0, r1)))[1]
    shard_sizes = (shard_size, X.shape[1])
    X_sharded = BigMatrix("tsqr_test_X", shape=X.shape,
                          shard_sizes=shard_sizes, write_header=True)
    shard_matrix(X_sharded, X)
    program, meta = tsqr(X_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run, program,
                             timeout=10, idle_timeout=6)
    program.wait()
    program.free()
    R_sharded = meta["outputs"][0]
    num_tree_levels = int(np.log(np.ceil(size / shard_size)) / np.log(2))
    print("num_tree_levels", num_tree_levels)
    R_npw = R_sharded.get_block(max(num_tree_levels, 0), 0)
    sign_matrix_local = np.eye(R.shape[0])
    sign_matrix_remote = np.eye(R.shape[0])
    sign_matrix_local[np.where(np.diag(R) <= 0)] *= -1
    sign_matrix_remote[np.where(np.diag(R_npw) <= 0)] *= -1
    # make the signs match
    R_npw *= np.diag(sign_matrix_remote)[:, np.newaxis]
    R *= np.diag(sign_matrix_local)[:, np.newaxis]
    assert np.allclose(R_npw, R)
def test_single_shard_index_get(self):
    X = np.random.randn(128, 128)
    X_sharded = BigMatrix("test_0", shape=X.shape, shard_sizes=X.shape)
    shard_matrix(X_sharded, X)
    X_sharded_local = X_sharded.submatrix(0, 0).get_block()
    assert np.all(X_sharded_local == X)
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit, compute_threads_per_worker):
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = npw.config.default()["account"]["aws_region"]
    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"].strip(),
            "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"].strip(),
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = npw.config.default()
        pwex = lithops.FunctionExecutor()
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = npw.config.default()
        pwex = lithops.FunctionExecutor()
    if (not matrix_exists):
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size, shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket=config['s3']['bucket'])
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size, shard_size),
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T, "gemm")
        XXT_sharded = BigMatrix(key_name, hash_keys=False,
                                bucket=config['s3']['bucket'])
    XXT_sharded.lambdav = problem_size * 10
    t = time.time()
    program, meta = bdfac(XXT_sharded, truncate=truncate)
    pipeline_width = args.pipeline
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=extra_env)
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while (program.program_status() == lp.PS.RUNNING):
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if (p is None):
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
            repeated_compute = parse_int(
                REDIS_CLIENT.get("{0}_repeated_compute".format(program.hash)))
            repeated_post_op = parse_int(
                REDIS_CLIENT.get("{0}_repeated_post_op".format(program.hash)))
            repeated_finish = parse_int(
                REDIS_CLIENT.get("{0}_repeated_finish".format(program.hash)))
            not_ready = parse_int(
                REDIS_CLIENT.get("{0}_not_ready".format(program.hash)))
            if (busy_workers == None):
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()
            if (up_workers == None):
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)
            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if ((curr_time % INFO_FREQ) == 0):
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))
            current_gflops = program.get_flops()
            if (current_gflops is None):
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9
            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if (current_gbytes_read is None):
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9
            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if (current_gbytes_write is None):
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)
            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])
            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
            if (len(flops) > smooth_len + 5):
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (
                    writes[-1] - writes[-smooth_len]) / (times[-1] - times[-smooth_len])
                read_rate_5_min_window = (
                    read_objects[-1] - read_objects[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                write_rate_5_min_window = (
                    write_objects[-1] - write_objects[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"
            read_timeouts = int(parse_int(REDIS_CLIENT.get("s3.timeouts.read")))
            write_timeouts = int(parse_int(REDIS_CLIENT.get("s3.timeouts.write")))
            redis_timeouts = int(parse_int(REDIS_CLIENT.get("redis.timeouts")))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            read_timeouts_fraction = read_timeouts / (current_objects_read + 1e-8)
            write_timeouts_fraction = write_timeouts / (current_objects_write + 1e-8)
            print("=======================================")
            print(f"Progress is {p}, Repeated Compute is {repeated_compute}, "
                  f"Repeated POST OP is {repeated_post_op}, "
                  f"Repeated Finishes is {repeated_finish}, "
                  f"Not ready Nodes scheduled are {not_ready}")
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                  .format(curr_time, current_gflops, current_gbytes_read,
                          current_gbytes_write))
            print("{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, "
                  "Average GBytes Write rate {3}, Average Worker Count {4}"
                  .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                          avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".format(
                curr_time, read_rate, write_rate))
            print("{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, "
                  "smoothed GBytes Write rate {3}, smoothed Worker Count {4}"
                  .format(curr_time, gflops_rate_5_min_window,
                          gread_rate_5_min_window, gwrite_rate_5_min_window,
                          workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".format(
                curr_time, read_rate_5_min_window, write_rate_5_min_window))
            print("{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3} "
                  .format(curr_time, read_timeouts, write_timeouts, redis_timeouts))
            print("{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                  .format(curr_time, read_timeouts_fraction,
                          write_timeouts_fraction))
            print("=======================================")
            time_since_launch = time.time() - last_run_time
            if (time_since_launch > (0.85 * timeout)):
                cores_to_launch = max_cores
                logger.info("launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run(
                        program, pipeline_width=pipeline_width,
                        cache_size=cache_size, timeout=timeout),
                    range(cores_to_launch),
                    extra_env=extra_env)
                # print("waiting for second result")
                # print("result..", new_futures[0].result())
                # print([x.result() for x in new_futures])
                last_run_time = time.time()
                all_futures.extend(new_futures)
            exp["time_steps"] += 1
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
        pass
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(
        exp["flops"][-1] / (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
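# `parse_int` is used by the monitoring loop above but is not defined in this
# section. A minimal sketch of the assumed behaviour (missing Redis counters are
# treated as zero):
def parse_int(value):
    # Redis GET returns None when the key is absent; coerce anything else to int.
    return 0 if value is None else int(value)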
def test_single_shard_index_put(self):
    X = np.random.randn(128, 128)
    X_sharded = BigMatrix("test_1", shape=X.shape, shard_sizes=X.shape)
    X_sharded.submatrix(0, 0).put_block(X)
    assert np.all(X_sharded.numpy() == X)
def test_cholesky_multi_repeats(self):
    ''' Insert repeated instructions into the PC queue and verify that they
        do not cause double increments. '''
    print("RUNNING MULTI")
    np.random.seed(1)
    size = 256
    shard_size = 30
    repeats = 15
    total_repeats = 150
    np.random.seed(2)
    print("Generating X")
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + size * np.eye(X.shape[0])
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A_{0}".format(int(time.time())),
                          shape=A.shape, shard_sizes=shard_sizes,
                          write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    instructions, trailing, L_sharded = compiler._chol(A_sharded)
    all_nodes = instructions.unroll_program()
    L_sharded.free()
    pwex = pywren.default_executor()
    executor = pywren.lambda_executor
    config = npw.config.default()
    pywren_config = pwex.config
    program = lp.LambdaPackProgram(
        instructions, executor=executor, pywren_config=pywren_config,
        config=config, eager=True)
    print("PROGRAM HASH", program.hash)
    cores = 1
    program.start()
    jobs = []
    for c in range(cores):
        p = mp.Process(target=job_runner.lambdapack_run, args=(program,),
                       kwargs={'timeout': 3600, 'pipeline_width': 5})
        jobs.append(p)
        p.start()
    np.random.seed(0)
    try:
        while (program.program_status() == lp.PS.RUNNING):
            sqs = boto3.resource('sqs', region_name=program.control_plane.region)
            time.sleep(0.5)
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                print("Priority {0}".format(i))
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                print(attrs)
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(
                waiting, running))
            for i in range(repeats):
                p = program.get_progress()
                if (p is None):
                    continue
                else:
                    p = int(p)
                pc = int(np.random.choice(min(p, len(all_nodes)), 1))
                node = all_nodes[pc]
                queue = sqs.Queue(program.queue_urls[0])
                total_repeats -= 1
                if (total_repeats > 0):
                    print("Maliciously enqueueing node ", pc, node, total_repeats)
                    queue.send_message(MessageBody=json.dumps(node))
                time.sleep(1)
        # for p in jobs:
        #     p.join()
    except:
        pass
    print("Program status")
    print(program.program_status())
    for node in all_nodes:
        edge_sum = lp.get(program.control_plane.client,
                          program._node_edge_sum_key(*node))
        if (edge_sum == None):
            edge_sum = 0
        edge_sum = int(edge_sum)
        parents = program.program.get_parents(*node)
        children = program.program.get_children(*node)
        indegree = len(parents)
        node_status = program.get_node_status(*node)
        redis_str = "Node: {0}, Edge Sum: {1}, Indegree: {2}, Node Status {3}".format(
            node, edge_sum, indegree, node_status)
        if (edge_sum != indegree):
            print(redis_str)
            for p in parents:
                p_status = program.get_node_status(*p)
                edge_key = program._edge_key(p[0], p[1], node[0], node[1])
                edge_value = lp.get(program.control_plane.client, edge_key)
                child_str = "Parent Node: {0}, Parent Status: {1}, Edge Key: {2}".format(
                    p, p_status, edge_value)
                print(child_str)
        # assert(edge_sum == indegree)
    program.free()
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    z = np.argmax(np.abs(L - L_npw))
    assert np.allclose(L_npw, L)
def test_cholesky_multi_failures(self):
    ''' Kill workers at random during execution and verify that the
        program still completes correctly. '''
    print("RUNNING MULTI")
    np.random.seed(1)
    size = 256
    shard_size = 64
    failures = 4
    np.random.seed(1)
    print("Generating X")
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + size * np.eye(X.shape[0])
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    instructions, trailing, L_sharded = compiler._chol(A_sharded)
    pwex = pywren.default_executor()
    executor = pywren.lambda_executor
    pywren_config = pwex.config
    config = npw.config.default()
    program = lp.LambdaPackProgram(
        instructions, executor=executor, pywren_config=pywren_config,
        config=config, eager=False)
    cores = 16
    program.start()
    jobs = []
    for c in range(cores):
        p = mp.Process(target=job_runner.lambdapack_run, args=(program,),
                       kwargs={'timeout': 3600, 'pipeline_width': 4})
        jobs.append(p)
        p.start()
    np.random.seed(0)
    while (program.program_status() == lp.PS.RUNNING):
        sqs = boto3.resource('sqs', region_name=program.control_plane.region)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            print("Priority {0}".format(i))
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            print(attrs)
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(waiting, running))
        time.sleep(10)
        if (np.random.random() > 0.65):
            for i in range(failures):
                core = int(np.random.choice(cores, 1)[0])
                print("Maliciously killing a job!")
                jobs[core].terminate()
                p = mp.Process(target=job_runner.lambdapack_run, args=(program,),
                               kwargs={'timeout': 3600, 'pipeline_width': 4})
                p.start()
                jobs[core] = p
    for p in jobs:
        p.join()
    print("Program status")
    print(program.program_status())
    program.free()
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    print(L_npw)
    print(L)
    print("MAX ", np.max(np.abs(L - L_npw)))
    assert np.allclose(L_npw, L)
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial, launch_granularity,
                   timeout, log_granularity, autoscale_policy,
                   failure_percentage, max_failure_events, failure_time):
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, failure_percentage,
         max_failure_events, failure_time))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "failure_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    X = np.random.randn(problem_size, 1)
    pwex = pywren.default_executor()
    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(problem_size, shard_size),
                          shape=X.shape, shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    pipeline_width = args.pipeline
    if (priority):
        num_priorities = 5
    else:
        num_priorities = 1
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR, port=REDIS_PORT,
                                     password=REDIS_PASS, db=0,
                                     socket_timeout=5)
    if (truncate is not None):
        instructions = instructions[:truncate]
    config = pwex.config
    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", "")
    }
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    failure_times = []
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = priority
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["failure_times"] = failure_times
    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    failure_keys = ["{0}_failure_{1}_{2}".format(program.hash, i, 0)
                    for i in range(start_cores)]
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run_with_failures(
            failure_keys[x], program, pipeline_width=pipeline_width,
            cache_size=cache_size, timeout=timeout),
        range(start_cores),
        extra_env=redis_env)
    start_time = time.time()
    last_run_time = start_time
    last_failure = time.time()
    num_failure_events = 0
    while (program.program_status() == lp.PS.RUNNING):
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        if (busy_workers == None):
            busy_workers = 0
        else:
            busy_workers = int(busy_workers)
        up_workers = program.get_up()
        if (up_workers == None):
            up_workers = 0
        else:
            up_workers = int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)
        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if ((curr_time % INFO_FREQ) == 0):
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
        # print("{5}: Not Ready: {0}, Ready: {1}, Running: {4}, Post OP: {2}, Done: {3}".format(not_ready_count, ready_count, post_op_count, done_count, running_count, curr_time))
        current_gflops = program.get_flops()
        if (current_gflops is None):
            current_gflops = 0
        else:
            current_gflops = int(current_gflops) / 1e9
        flops.append(current_gflops)
        current_gbytes_read = program.get_read()
        if (current_gbytes_read is None):
            current_gbytes_read = 0
        else:
            current_gbytes_read = int(current_gbytes_read) / 1e9
        reads.append(current_gbytes_read)
        current_gbytes_write = program.get_write()
        if (current_gbytes_write is None):
            current_gbytes_write = 0
        else:
            current_gbytes_write = int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)
        # print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}".format(curr_time, current_gflops, current_gbytes_read, current_gbytes_write))
        time_since_launch = time.time() - last_run_time
        if (autoscale_policy == "dynamic"):
            if (time_since_launch > launch_granularity and
                    up_workers < np.ceil(waiting * 0.5 / pipeline_width) and
                    up_workers < max_cores):
                cores_to_launch = int(
                    min(np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info("launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x], program, pipeline_width=pipeline_width,
                        cache_size=cache_size, timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        elif (autoscale_policy == "constant_timeout"):
            if (time_since_launch > (0.75 * timeout)):
                cores_to_launch = max_cores
                logger.info("launching {0} new tasks....".format(cores_to_launch))
                _failure_keys = [
                    "{0}_failure_{1}_{2}".format(program.hash, i, curr_time)
                    for i in range(cores_to_launch)
                ]
                new_futures = pwex.map(
                    lambda x: job_runner.lambdapack_run_with_failures(
                        _failure_keys[x], program, pipeline_width=pipeline_width,
                        cache_size=cache_size, timeout=timeout),
                    range(cores_to_launch),
                    extra_env=redis_env)
                last_run_time = time.time()
                failure_keys += _failure_keys
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")
        if ((time.time() - last_failure) > failure_time and
                num_failure_events < max_failure_events):
            logging.info("Killing some jobs")
            idxs = np.random.choice(len(failure_keys),
                                    int(failure_percentage * len(failure_keys)),
                                    replace=False)
            num_failure_events += 1
            last_failure = time.time()
            failure_times.append(last_failure)
            for i in idxs:
                logging.info("Killing: job {0}".format(i))
                REDIS_CLIENT.set(failure_keys[i], 1)
    exp["all_futures"] = all_futures
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        if (run_count is None):
            run_count = 0
        else:
            run_count = int(run_count)
        if (run_count != 1):
            logger.info("PC: {0}, Run Count: {1}".format(pc, run_count))
    e = time.time()
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    exp["total_runtime"] = e - t
    exp["num_failure_events"] = num_failure_events
    # collect profiling info
    executor = fs.ThreadPoolExecutor(72)
    futures = []
    for i in range(0, program.num_inst_blocks, 1):
        futures.append(executor.submit(program.get_profiling_info, i))
    res = fs.wait(futures)
    profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string
    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))
    # save other stuff
    try:
        os.mkdir("failure_experiments/")
    except FileExistsError:
        pass
    exp_bytes = pickle.dumps(exp)
    dump_path = "failure_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit):
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]
    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
            # pass the secret key through, not a second copy of the access key id
            "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-standalone.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.standalone_executor(config=config)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic-us-east-1'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren-08-25-2018.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.default_executor(config=config)
    if (not matrix_exists):
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(problem_size, shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket="numpywrentop500test",
                              hash_keys=False)
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(problem_size, shard_size),
                              autosqueeze=False,
                              hash_keys=False,
                              bucket="numpywrentop500test")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T, "gemm")
        XXT_sharded = BigMatrix(key_name, hash_keys=False,
                                bucket="numpywrentop500test")
    XXT_sharded.lambdav = problem_size * 10
    if (verify):
        A = XXT_sharded.numpy()
        print("Computing local cholesky")
        L = np.linalg.cholesky(A)
    t = time.time()
    instructions, trailing, L_sharded = compiler._chol(XXT_sharded,
                                                       truncate=truncate)
    pipeline_width = args.pipeline
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    config = npw.config.default()
    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=pywren_config,
                                   num_priorities=num_priorities,
                                   eager=eager,
                                   config=config,
                                   write_limit=write_limit,
                                   read_limit=read_limit)
    warmup_start = time.time()
    if (warmup):
        warmup_sleep = 170

        def warmup_fn(x):
            program.incr_up(1)
            time.sleep(warmup_sleep)
            program.decr_up(1)

        print("Warming up...")
        futures = pwex.map(warmup_fn, range(max_cores))
        last_spinup = time.time()
        while (True):
            if ((time.time() - last_spinup) > 0.75 * warmup_sleep):
                print("Calling pwex.map..")
                futures = pwex.map(warmup_fn, range(max_cores))
                last_spinup = time.time()
            time.sleep(2)
            if (program.get_up() is None):
                up_workers = 0
            else:
                up_workers = int(program.get_up())
            print("{0} workers alive".format(up_workers))
            if (up_workers >= max_cores):
                time.sleep(warmup_sleep)
                break
    warmup_end = time.time()
    print("Warmup took {0} seconds".format(warmup_end - warmup_start))
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    invoker = fs.ThreadPoolExecutor(1)
    all_future_futures = invoker.submit(lambda: pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=extra_env))
    # print(all_future_futures.result())
    all_futures = [all_future_futures]
    # print([f.result() for f in all_futures])
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while (program.program_status() == lp.PS.RUNNING):
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if (p is None):
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
            if (busy_workers == None):
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()
            if (up_workers == None):
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)
            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if ((curr_time % INFO_FREQ) == 0):
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))
            current_gflops = program.get_flops()
            if (current_gflops is None):
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9
            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if (current_gbytes_read is None):
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9
            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if (current_gbytes_write is None):
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)
            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])
            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
            if (len(flops) > smooth_len + 5):
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (
                    writes[-1] - writes[-smooth_len]) / (times[-1] - times[-smooth_len])
                read_rate_5_min_window = (
                    read_objects[-1] - read_objects[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                write_rate_5_min_window = (
                    write_objects[-1] - write_objects[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"
            read_timeouts = int(REDIS_CLIENT.get("s3.timeouts.read"))
            write_timeouts = int(REDIS_CLIENT.get("s3.timeouts.write"))
            redis_timeouts = int(REDIS_CLIENT.get("redis.timeouts"))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            read_timeouts_fraction = read_timeouts / current_objects_read
            write_timeouts_fraction = write_timeouts / current_objects_write
            print("=======================================")
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                  .format(curr_time, current_gflops, current_gbytes_read,
                          current_gbytes_write))
            print("{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, "
                  "Average GBytes Write rate {3}, Average Worker Count {4}"
                  .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                          avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".format(
                curr_time, read_rate, write_rate))
            print("{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, "
                  "smoothed GBytes Write rate {3}, smoothed Worker Count {4}"
                  .format(curr_time, gflops_rate_5_min_window,
                          gread_rate_5_min_window, gwrite_rate_5_min_window,
                          workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".format(
                curr_time, read_rate_5_min_window, write_rate_5_min_window))
            print("{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3} "
                  .format(curr_time, read_timeouts, write_timeouts, redis_timeouts))
            print("{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                  .format(curr_time, read_timeouts_fraction,
                          write_timeouts_fraction))
            print("=======================================")
            time_since_launch = time.time() - last_run_time
            if (autoscale_policy == "dynamic"):
                if (time_since_launch > launch_granularity and
                        up_workers < np.ceil(waiting * 0.5 / pipeline_width) and
                        up_workers < max_cores):
                    cores_to_launch = int(
                        min(np.ceil(waiting / pipeline_width) - up_workers,
                            max_cores - up_workers))
                    logger.info("launching {0} new tasks....".format(cores_to_launch))
                    new_future_futures = invoker.submit(
                        lambda: pwex.map(
                            lambda x: job_runner.lambdapack_run(
                                program, pipeline_width=pipeline_width,
                                cache_size=cache_size, timeout=timeout),
                            range(cores_to_launch),
                            extra_env=extra_env))
                    last_run_time = time.time()
                    # check if we OOM-erred
                    # [x.result() for x in all_futures]
                    all_futures.append(new_future_futures)
            elif (autoscale_policy == "constant_timeout"):
                if (time_since_launch > (0.85 * timeout)):
                    cores_to_launch = max_cores
                    logger.info("launching {0} new tasks....".format(cores_to_launch))
                    new_future_futures = invoker.submit(
                        lambda: pwex.map(
                            lambda x: job_runner.lambdapack_run(
                                program, pipeline_width=pipeline_width,
                                cache_size=cache_size, timeout=timeout),
                            range(cores_to_launch),
                            extra_env=extra_env))
                    last_run_time = time.time()
                    # check if we OOM-erred
                    # [x.result() for x in all_futures]
                    all_futures.append(new_future_futures)
            else:
                raise Exception("unknown autoscale policy")
            exp["time_steps"] += 1
        if (verify):
            L_sharded_local = L_sharded.numpy()
            print("max diff", np.max(np.abs(L_sharded_local - L)))
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
        pass
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(
        exp["flops"][-1] / (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
import time

import numpy as np
import pywren

from numpywren.matrix import BigMatrix
from numpywren.matrix_init import shard_matrix
from numpywren.binops import gemm

pwex = pywren.lambda_executor()
Ns = [5000, 10000, 15000, 20000, 25000, 30000]
shard_size = (5000, 5000)
np.random.seed(42)

# Only run this if the matrices are not already in the bucket.
# This takes a very long time (for 30000x30000xf64 - 8GB of data).
# Big_X = BigMatrix("multiply_test2", shape=(max(Ns), max(Ns)), shard_sizes=shard_size)
# for i in range():
#     for j in range():
#         X = np.random.randn(5000, 5000)
#         Big_X.put_block(X, i, j)
# start = time.time()
# gemm(pwex, Big_X, Big_X, Big_X.bucket, 1)
# end = time.time()
# print(end - start)

for N in Ns:
    X_sharded = BigMatrix("multiply_test2", shape=(N, N), shard_sizes=shard_size)
    start = time.time()
    gemm(pwex, X_sharded, X_sharded, X_sharded.bucket, 1)
    end = time.time()
    print(end - start)