# Imports assumed from the numpywren test/experiment layout; the exact module
# paths (e.g. numpywren.algs) are assumptions.
import concurrent.futures as fs
import hashlib
import json
import logging
import multiprocessing as mp
import os
import pickle
import time
import traceback

import boto3
import dill
import numpy as np
import pywren
import pywren.wrenconfig as wc

from numpywren import binops, compiler, job_runner
from numpywren import lambdapack as lp
from numpywren.algs import cholesky
from numpywren.matrix import BigMatrix
from numpywren.matrix_init import constant_zeros, shard_matrix


def test_cholesky_multiprocess():
    X = np.random.randn(128, 128)
    # A large diagonal shift keeps A comfortably positive definite.
    A = X.dot(X.T) + 1e9 * np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("job_runner_test", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(8)
    print("starting program")
    program.start()
    futures = []
    for i in range(8):
        future = executor.submit(job_runner.lambdapack_run, program, timeout=25)
        futures.append(future)
    print("Waiting for futures")
    fs.wait(futures)
    [f.result() for f in futures]
    # Launch a second wave of workers to finish whatever remains after the
    # first wave times out.
    futures = []
    for i in range(8):
        future = executor.submit(job_runner.lambdapack_run, program, timeout=25)
        futures.append(future)
    print("Waiting for futures..again")
    fs.wait(futures)
    [f.result() for f in futures]
    print("great success!")
    return 0
def test_cholesky_lambda():
    X = np.random.randn(128, 128)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 128
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("job_runner_test", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    print("starting program")
    program.start()
    pwex = pywren.default_executor()
    futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program, timeout=60, idle_timeout=6),
        range(16))
    pywren.wait(futures)
    print("results:")
    print([f.result() for f in futures])
    # Launch a second wave of workers, then wait for the program to finish.
    futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program, timeout=60, idle_timeout=6),
        range(16))
    program.wait()
    # program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
    print("great success!")
def test_cholesky():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run, program,
                             timeout=30, idle_timeout=6)
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
    print("great success!")
def test_cholesky_lambda_pywren():
    # Same flow as test_cholesky_lambda above, but the workers are launched
    # through the run_program_in_pywren helper (defined below).
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 16
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    futures = run_program_in_pywren(program)
    program.start()
    program.wait()
    program.free()
    L_sharded = meta["outputs"][0]
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    assert np.allclose(L_npw, L)
    print("great success!")
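# run_program_in_pywren is called above but not defined in this file; the
# following is a minimal sketch, assuming the same pywren executor pattern
# used in the other tests. The helper name is from the original call site,
# but the worker count and timeout defaults here are assumptions.
def run_program_in_pywren(program, num_workers=16, timeout=60):
    pwex = pywren.default_executor()
    # Each worker pulls tasks from the program's queues until it hits its
    # timeout or goes idle.
    futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program, timeout=timeout,
                                            idle_timeout=6),
        range(num_workers))
    return futures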
def test_cholesky_timeouts():
    X = np.random.randn(64, 64)
    A = X.dot(X.T) + np.eye(X.shape[0])
    shard_size = 8
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("job_runner_test", shape=A.shape,
                          shard_sizes=shard_sizes, write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    executor = fs.ProcessPoolExecutor(1)
    print("starting program")
    program.start()
    future = executor.submit(job_runner.lambdapack_run, program,
                             timeout=10, idle_timeout=6)
    # Give the worker time to hit its 10s timeout, then check that it
    # de-registered itself.
    time.sleep(15)
    assert int(program.get_up()) == 0
    program.free()
    print("great success!")
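# parse_int and INFO_FREQ are used by run_experiment below but are not defined
# in this file; minimal sketches follow. parse_int assumes Redis GET semantics
# (None when the key is missing), and the INFO_FREQ value is an assumption.
INFO_FREQ = 5  # seconds between INFO-level progress log lines (assumed value)


def parse_int(x):
    # Treat a missing Redis counter as zero.
    if x is None:
        return 0
    return int(x)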
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify,
                   matrix_exists, read_limit, write_limit, n_threads):
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]
    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps((problem_size, shard_size, pipeline,
                              num_priorities, lru, eager, truncate, max_cores,
                              start_cores, trial, launch_granularity, timeout,
                              log_granularity, autoscale_policy, read_limit,
                              write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    if standalone:
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz"
        config['runtime']['s3_key'] = key
        extra_env = {"AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
                     "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
                     "OMP_NUM_THREADS": str(n_threads),
                     "AWS_DEFAULT_REGION": region}
        pwex = pywren.standalone_executor(config=config, job_max_runtime=999999)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz"
        config['runtime']['s3_key'] = key
        print(config)
        pwex = pywren.default_executor(config=config)

    shard_sizes = [shard_size, 1]
    matrix_name = "sosp_cholesky_test_{0}_{1}".format(problem_size, shard_size)
    if not matrix_exists:
        np.random.seed(0)
        X = np.random.randn(problem_size, 1)
        X_sharded = BigMatrix(matrix_name, shape=X.shape,
                              shard_sizes=shard_sizes, write_header=True,
                              autosqueeze=False, bucket="numpywrentest",
                              parent_fn=constant_zeros)
        shard_matrix(X_sharded, X)
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        # The input matrix was generated by a previous run; reconstruct the
        # handles without resharding.
        X_sharded = BigMatrix(matrix_name, shape=(problem_size, 1),
                              shard_sizes=shard_sizes, write_header=True,
                              autosqueeze=False, bucket="numpywrentest",
                              parent_fn=constant_zeros)
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T, "gemm")
        XXT_sharded = BigMatrix(key_name, hash_keys=False, bucket="numpywrentest")
    XXT_sharded.lambdav = problem_size * 20e12
    if verify:
        A = XXT_sharded.numpy()
        print("Computing local cholesky")
        L = np.linalg.cholesky(A)

    t = time.time()
    program, meta = cholesky(XXT_sharded)
    L_sharded = meta["outputs"][0]
    pipeline_width = pipeline
    if lru:
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False

    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores), extra_env=extra_env)
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while program.program_status() == lp.PS.RUNNING:
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if p is None:
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=['ApproximateNumberOfMessages',
                                    'ApproximateNumberOfMessagesNotVisible'])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
            sparse_writes = parse_int(
                REDIS_CLIENT.get("{0}_write_sparse".format(program.hash))) / 1e9
            if busy_workers is None:
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()
            if up_workers is None:
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)
            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if (curr_time % INFO_FREQ) == 0:
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))
            current_gflops = program.get_flops()
            if current_gflops is None:
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9
            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if current_gbytes_read is None:
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9
            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if current_gbytes_write is None:
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)
            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])
            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
            if len(flops) > smooth_len + 5:
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (writes[-1] - writes[-smooth_len]) / (times[-1] - times[-smooth_len])
                read_rate_5_min_window = (read_objects[-1] - read_objects[-smooth_len]) / (times[-1] - times[-smooth_len])
                write_rate_5_min_window = (write_objects[-1] - write_objects[-smooth_len]) / (times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"
            read_timeouts = int(parse_int(REDIS_CLIENT.get("s3.timeouts.read")))
            write_timeouts = int(parse_int(REDIS_CLIENT.get("s3.timeouts.write")))
            redis_timeouts = int(parse_int(REDIS_CLIENT.get("redis.timeouts")))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            # Guard against division by zero before any objects are read/written.
            read_timeouts_fraction = read_timeouts / max(current_objects_read, 1)
            write_timeouts_fraction = write_timeouts / max(current_objects_write, 1)
            print("=======================================")
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}, Total Gbytes Write Sparse : {4}".format(
                curr_time, current_gflops, current_gbytes_read, current_gbytes_write, sparse_writes))
            print("{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write rate {3}, Average Worker Count {4}".format(
                curr_time, gflops_rate, greads_rate, gwrites_rate, avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".format(
                curr_time, read_rate, write_rate))
            print("{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write rate {3}, smoothed Worker Count {4}".format(
                curr_time, gflops_rate_5_min_window, gread_rate_5_min_window, gwrite_rate_5_min_window, workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".format(
                curr_time, read_rate_5_min_window, write_rate_5_min_window))
            print("{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3} ".format(
                curr_time, read_timeouts, write_timeouts, redis_timeouts))
            print("{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}".format(
                curr_time, read_timeouts_fraction, write_timeouts_fraction))
            print("=======================================")
            time_since_launch = time.time() - last_run_time
            if autoscale_policy == "dynamic":
                if (time_since_launch > launch_granularity and
                        up_workers < np.ceil(waiting * 0.5 / pipeline_width) and
                        up_workers < max_cores):
                    cores_to_launch = int(min(np.ceil(waiting / pipeline_width) - up_workers,
                                              max_cores - up_workers))
                    logger.info("launching {0} new tasks....".format(cores_to_launch))
                    new_futures = pwex.map(
                        lambda x: job_runner.lambdapack_run(program,
                                                            pipeline_width=pipeline_width,
                                                            cache_size=cache_size,
                                                            timeout=timeout),
                        range(cores_to_launch), extra_env=extra_env)
                    all_futures += new_futures
                    last_run_time = time.time()
                    # check if we OOM-erred
                    # [x.result() for x in all_futures]
            elif autoscale_policy == "constant_timeout":
                if time_since_launch > (0.85 * timeout):
                    cores_to_launch = max_cores
                    logger.info("launching {0} new tasks....".format(cores_to_launch))
                    new_futures = pwex.map(
                        lambda x: job_runner.lambdapack_run(program,
                                                            pipeline_width=pipeline_width,
                                                            cache_size=cache_size,
                                                            timeout=timeout),
                        range(cores_to_launch), extra_env=extra_env)
                    all_futures += new_futures
                    last_run_time = time.time()
            else:
                raise Exception("unknown autoscale policy")
            exp["time_steps"] += 1
        if verify:
            L_sharded_local = L_sharded.numpy()
            print("max diff", np.max(np.abs(L_sharded_local - L)))
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    print([f.result() for f in all_futures])
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes, Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format(not exp["failed"]))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(
        exp["flops"][-1] / (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
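# A minimal driver sketch for run_experiment; every value below is an
# assumption chosen for a small smoke run, not a configuration taken from
# the original experiments.
if __name__ == "__main__":
    run_experiment(problem_size=65536, shard_size=4096, pipeline=1,
                   num_priorities=1, lru=False, eager=False, truncate=0,
                   max_cores=32, start_cores=32, trial=0,
                   launch_granularity=60, timeout=200, log_granularity=5,
                   autoscale_policy="constant_timeout", standalone=False,
                   warmup=False, verify=False, matrix_exists=False,
                   read_limit=float("inf"), write_limit=float("inf"),
                   n_threads=1)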
def test_cholesky_multi_repeats():
    ''' Insert repeated instructions into the PC queue to check that
        double increments are avoided. '''
    print("RUNNING MULTI")
    np.random.seed(1)
    size = 256
    shard_size = 30
    repeats = 15
    total_repeats = 150
    np.random.seed(2)
    print("Generating X")
    X = np.random.randn(size, 128)
    print("Generating A")
    A = X.dot(X.T) + size * np.eye(X.shape[0])
    shard_sizes = (shard_size, shard_size)
    A_sharded = BigMatrix("cholesky_test_A_{0}".format(int(time.time())),
                          shape=A.shape, shard_sizes=shard_sizes,
                          write_header=True)
    A_sharded.free()
    shard_matrix(A_sharded, A)
    program, meta = cholesky(A_sharded)
    states = compiler.walk_program(program.program.remote_calls)
    L_sharded = meta["outputs"][0]
    L_sharded.free()
    print("PROGRAM HASH", program.hash)
    cores = 1
    program.start()
    jobs = []
    for c in range(cores):
        p = mp.Process(target=job_runner.lambdapack_run, args=(program,),
                       kwargs={'timeout': 3600, 'pipeline_width': 3})
        jobs.append(p)
        p.start()
    np.random.seed(0)
    while program.program_status() == lp.PS.RUNNING:
        sqs = boto3.resource('sqs', region_name=program.control_plane.region)
        time.sleep(0.5)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            print("Priority {0}".format(i))
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=['ApproximateNumberOfMessages',
                                'ApproximateNumberOfMessagesNotVisible'])['Attributes']
            print(attrs)
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        print("SQS QUEUE STATUS Waiting {0}, Running {1}".format(waiting, running))
        for i in range(repeats):
            p = program.get_progress()
            if p is None:
                continue
            else:
                p = int(p)
            # Re-enqueue an already-issued instruction to simulate duplicate
            # SQS delivery; the runner must not double-count it.
            pc = int(np.random.choice(min(p, len(states)), 1))
            node = states[pc]
            queue = sqs.Queue(program.queue_urls[0])
            total_repeats -= 1
            if total_repeats > 0:
                print("Maliciously enqueueing node ", pc, node, total_repeats)
                queue.send_message(MessageBody=json.dumps(node))
            time.sleep(1)
    # for p in jobs:
    #     p.join()
    program.wait()
    program.free()
    L_npw = L_sharded.numpy()
    L = np.linalg.cholesky(A)
    z = np.argmax(np.abs(L - L_npw))
    assert np.allclose(L_npw, L)