def main(args):
    """Run a Flickr search for every wnid in ``args.wnids`` and save the results.

    Reads the Flickr API key/secret (a two-element JSON list) from
    ``args.flickr_api_key_filename`` and the list of wnids from ``args.wnids``.
    When ``args.parallel`` is false the wnids are searched one by one in this
    process; otherwise they are split into chunks and every chunk is mapped
    over pywren, skipping wnids already present in the per-date-range metadata
    file under ``../data/metadata``.

    All collected results are written as JSON to
    ``../data/search_results/<timestamp>_<user>.json``.

    Args:
        args: parsed command-line namespace. This function reads
            ``flickr_api_key_filename``, ``wnids``, ``parallel``,
            ``num_serial_tasks``, ``min_date_uploaded`` and
            ``max_date_uploaded``; the whole namespace is also forwarded to
            ``flickr_search_synset``.
    """
    imgnt = imagenet.ImageNetData()
    with open(args.flickr_api_key_filename, 'r') as f:
        flickr_api_keys = json.load(f)
    api_key = flickr_api_keys[0]
    api_secret = flickr_api_keys[1]
    with open(args.wnids, 'r') as f:
        wnids = json.load(f)
    print('processing {} wnids'.format(len(wnids)))

    # Initialised once so that results accumulate over every wnid / chunk and
    # the name is always bound when the summary below is printed.
    all_results = []
    if not args.parallel:
        for wnid in wnids:
            print("Flickr search for wnid {}".format(wnid))
            # NOTE(review): the parallel branch treats the return value of
            # flickr_search_synset as a (results, num_images) pair, while this
            # branch extends all_results with the return value directly --
            # confirm which shape the helper really returns.
            res = flickr_search_synset(imgnt, [wnid], api_key, api_secret,
                                       args)
            all_results += res
    else:
        pywren_config = wc.default()
        pywren_config["runtime"]["s3_bucket"] = "imagenet2datav2"
        pywren_config["runtime"][
            "s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2.tar.gz"
        pwex = pywren.default_executor(config=pywren_config)
        pywren_func = lambda x: flickr_search_synset(imgnt, x, api_key,
                                                     api_secret, args)
        pywren_args = list(
            utils.chunks(wnids,
                         int(np.ceil(len(wnids) / args.num_serial_tasks))))

        metadata_path = ('../data/metadata/flickr_' + args.min_date_uploaded +
                         '_' + args.max_date_uploaded + '.json')
        # The metadata file doubles as a checkpoint: wnids listed in it were
        # already searched by an earlier run and are skipped.
        with open(metadata_path, 'r') as fp:
            num_images_per_wnid = json.load(fp)

        for chunk_index, lst in enumerate(pywren_args):
            print("Map {} over {} wnids ".format(chunk_index, len(lst)))
            unfinished_wnids = [
                wnid for wnid in lst if wnid not in num_images_per_wnid
            ]
            print("Executing pywren call for {} wnids".format(
                len(unfinished_wnids)))
            futures = pwex.map(pywren_func, [[x] for x in unfinished_wnids])
            pywren.wait(futures)
            outputs = [f.result() for f in futures]
            for wnid, output in zip(unfinished_wnids, outputs):
                num_images_per_wnid[wnid] = output[1]
                # Previously all_results was reset for every chunk, so only
                # the last chunk's search results reached the output file.
                all_results += output[0]
            # Persist progress after every chunk so an interrupted run can
            # resume without repeating finished wnids.
            with open(metadata_path, 'w') as fp:
                json.dump(num_images_per_wnid, fp, indent=2)

    print('Got {} results'.format(len(all_results)))
    current_date = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    out_file = ('../data/search_results/' + current_date + '_' +
                getpass.getuser() + '.json')
    with open(out_file, 'w+') as fp:
        json.dump(all_results, fp, indent=2)
def result(self, timeout=None, check_only=False, throw_except=True, storage_handler=None):
    """
    Return the value produced by the remote call, fetching it from storage
    when it is not cached on this future yet.

    Modelled on ``concurrent.futures.Future.result`` from the python docs:
    return the value returned by the call, waiting for it if the call has not
    completed yet; if the call raised, raise that exception here.

    Parameters
    ----------
    timeout : int or float, optional
        Not implemented. Any value other than None raises
        NotImplementedError (after the first status query has been issued).
    check_only : bool
        When True and no call status is available yet, return None
        immediately instead of polling.
    throw_except : bool
        When False, failures are reported by returning None instead of
        raising. Note that on this path a failed call is not cached in
        ``self._state``.
    storage_handler : optional
        Storage handler used for the status/output lookups. When None, one is
        built from ``wrenconfig.default()``.

    Returns
    -------
    The ``'result'`` entry of the unpickled call output on success, otherwise
    None (see ``check_only`` / ``throw_except``).

    Raises
    ------
    ValueError
        If the job has not been invoked yet.
    NotImplementedError
        If ``timeout`` is not None.
    Exception
        If the remote handler reported an exception (version mismatch, out of
        time, or any other handler error) and ``throw_except`` is True. If the
        call itself failed, the remote exception is re-raised via ``reraise``.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    # Completed futures answer from their cached outcome without any storage
    # access.
    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.error:
        if throw_except:
            raise self._exception
        else:
            return None

    if storage_handler is None:
        storage_config = wrenconfig.extract_storage_config(
            wrenconfig.default())
        storage_handler = storage.Storage(storage_config)

    storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                     self.storage_path)

    call_status = storage_handler.get_call_status(self.callset_id,
                                                  self.call_id)

    self.status_query_count += 1

    ## FIXME implement timeout
    if timeout is not None:
        raise NotImplementedError()

    # Non-blocking probe: report "not finished yet" as None.
    if check_only is True:
        if call_status is None:
            return None

    # Poll storage until a status object shows up for this call.
    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = storage_handler.get_call_status(
            self.callset_id, self.call_id)

        self.status_query_count += 1
    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        exception_str = call_status['exception']
        print(call_status)
        exception_args = call_status['exception_args']
        # The first element of exception_args is compared against the
        # handler's error tags below.
        if exception_args[0] == "WRONGVERSION":
            if throw_except:
                raise Exception("Pywren version mismatch: remote " + \
                    "expected version {}, local library is version {}".format(
                        exception_args[2], exception_args[3]))
            return None
        elif exception_args[0] == "OUTATIME":
            if throw_except:
                raise Exception("process ran out of time")
            return None
        else:
            if throw_except:
                if 'exception_traceback' in call_status:
                    logger.error(call_status['exception_traceback'])
                raise Exception(exception_str, *exception_args)
            return None

    # The handler itself succeeded: download and unpickle the call output,
    # recording how long the download took.
    call_output_time = time.time()
    call_invoker_result = pickle.loads(
        storage_handler.get_call_output(self.callset_id, self.call_id))
    call_output_time_done = time.time()
    self._invoke_metadata[
        'download_output_time'] = call_output_time_done - call_output_time
    self._invoke_metadata[
        'download_output_timestamp'] = call_output_time_done
    call_success = call_invoker_result['success']
    logger.info("ResponseFuture.result() {} {} call_success {}".format(
        self.callset_id, self.call_id, call_success))

    self._call_invoker_result = call_invoker_result

    if call_success:
        # Cache the value so later calls return without touching storage.
        self._return_val = call_invoker_result['result']
        self._state = JobState.success
        return self._return_val
    elif throw_except:
        # Cache the failure, then re-raise it locally.
        self._exception = call_invoker_result['result']
        self._traceback = (call_invoker_result['exc_type'],
                           call_invoker_result['exc_value'],
                           call_invoker_result['exc_traceback'])

        self._state = JobState.error
        if call_invoker_result.get('pickle_fail', False):
            # The output flags a pickling failure: raise a generic Exception
            # built from the recorded exc_value and traceback.
            logging.warning(
                "there was an error pickling. The original exception: " + \
                "{}\nThe pickling exception: {}".format(
                    call_invoker_result['exc_value'],
                    str(call_invoker_result['pickle_exception'])))

            reraise(Exception, call_invoker_result['exc_value'],
                    call_invoker_result['exc_traceback'])
        else:
            # reraise the exception
            reraise(*self._traceback)
    else:
        # The call failed but the caller asked not to raise. The state is
        # left untouched here, so nothing is cached for later calls.
        return None  # nothing, don't raise, no value
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit, compute_threads_per_worker):
    """Run one ``bdfac`` lambdapack experiment and record its runtime statistics.

    The function
      1. configures file + console logging (log file named after an MD5 hash
         of the experiment arguments),
      2. builds a pywren executor (standalone or default),
      3. creates -- or, when ``matrix_exists`` is true, reopens -- the sharded
         input matrix and its gemm product,
      4. compiles the program with ``bdfac`` and launches ``start_cores``
         workers,
      5. polls the program every ``log_granularity`` seconds, printing and
         recording progress, SQS queue depths, worker counts, flops and I/O,
         and launches ``max_cores`` extra workers whenever more than
         0.85 * ``timeout`` seconds passed since the last launch,
      6. uploads the collected statistics (dill-pickled) to
         ``lambdapack/<program.hash>/runtime.pickle`` in ``program.bucket``
         and writes the program hash to ``/tmp/last_run``.

    Args:
        problem_size: number of rows of the random input matrix.
        shard_size: shard size along the first axis.
        pipeline: pipeline width handed to every worker.
        num_priorities: recorded under ``exp["priority"]``.
        lru: when truthy workers get ``cache_size=5``, otherwise 0.
        eager: printed and recorded only.
        truncate: forwarded to ``bdfac``.
        max_cores: number of workers launched on every re-launch.
        start_cores: number of workers launched initially.
        trial, launch_granularity, autoscale_policy: recorded only.
        timeout: worker timeout; also drives the re-launch period.
        log_granularity: seconds to sleep between polls.
        standalone: pick the standalone executor and forward the AWS
            credentials from the environment to the workers.
        warmup, verify, compute_threads_per_worker: not read by this function.
        matrix_exists: reuse matrices already stored in S3 instead of
            generating them.
        read_limit, write_limit: only used in the argument hash.

    Raises:
        Exception: anything raised while monitoring is re-raised after the
            program was stopped. ``KeyboardInterrupt`` stops the program and
            falls through to the summary.
    """
    # set up logging
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]
    print("REGION", region)
    # Silence every already-registered logger; only the root logger reports.
    for logger_name in logging.Logger.manager.loggerDict:
        logging.getLogger(logger_name).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    # Both executor flavours share the same runtime configuration.
    config = wc.default()
    config['runtime']['s3_bucket'] = 'numpywrenpublic'
    config['runtime'][
        's3_key'] = "pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz"
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"].strip(),
            "AWS_SECRET_ACCESS_KEY":
            os.environ["AWS_SECRET_ACCESS_KEY"].strip(),
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        pwex = pywren.standalone_executor(config=config)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        print(config)
        pwex = pywren.default_executor(config=config)

    if not matrix_exists:
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex,
                                  X_sharded,
                                  X_sharded.T,
                                  overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T,
                                                  "gemm")
        # NOTE(review): this bucket name ("numpywrensdi2") differs from the
        # one used above ("numpywrennsdi") -- confirm this is intentional.
        XXT_sharded = BigMatrix(key_name,
                                hash_keys=False,
                                bucket="numpywrensdi2")
    XXT_sharded.lambdav = problem_size * 10

    t = time.time()
    program, _ = bdfac(XXT_sharded, truncate=truncate)
    # Use the ``pipeline`` parameter; the previous code read the module-level
    # ``args.pipeline``, which made the function depend on a script global.
    pipeline_width = pipeline
    cache_size = 5 if lru else 0
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client

    # Time series collected while the program runs.
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    # The dict holds references to the lists above, so it always reflects the
    # latest samples when it is pickled at the end.
    exp = {
        "redis_done_counts": done_counts,
        "redis_ready_counts": ready_counts,
        "redis_post_op_counts": post_op_counts,
        "redis_not_ready_counts": not_ready_counts,
        "redis_running_counts": running_counts,
        "sqs_invis_counts": sqs_invis_counts,
        "sqs_vis_counts": sqs_vis_counts,
        "busy_workers": busy_workers_counts,
        "up_workers": up_workers_counts,
        "times": times,
        "lru": lru,
        "priority": num_priorities,
        "eager": eager,
        "truncate": truncate,
        "max_cores": max_cores,
        "problem_size": problem_size,
        "shard_size": shard_size,
        "pipeline": pipeline,
        "flops": flops,
        "reads": reads,
        "writes": writes,
        "read_objects": read_objects,
        "write_objects": write_objects,
        "read_timeouts": all_read_timeouts,
        "write_timeouts": all_write_timeouts,
        "redis_timeouts": all_redis_timeouts,
        "trial": trial,
        "launch_granularity": launch_granularity,
        "log_granularity": log_granularity,
        "autoscale_policy": autoscale_policy,
        "standalone": standalone,
        "program": program,
        "time_steps": 1,
        "failed": False,
    }

    def run_worker(_):
        # Worker entry point mapped over pywren; the mapped value is unused.
        return job_runner.lambdapack_run(program,
                                         pipeline_width=pipeline_width,
                                         cache_size=cache_size,
                                         timeout=timeout)

    program.start()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(run_worker, range(start_cores),
                           extra_env=extra_env)
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    # One SQS client is enough for all queues and all polling iterations.
    sqs_client = boto3.client('sqs')
    try:
        while program.program_status() == lp.PS.RUNNING:
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if p is None:
                print("no progress...")
                continue
            p = int(p)
            times.append(int(time.time()))
            max_pc = p

            # Queue depth summed over all program queues.
            waiting = 0
            running = 0
            for queue_url in program.queue_urls:
                attrs = sqs_client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)

            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
            repeated_compute = parse_int(
                REDIS_CLIENT.get("{0}_repeated_compute".format(program.hash)))
            repeated_post_op = parse_int(
                REDIS_CLIENT.get("{0}_repeated_post_op".format(program.hash)))
            repeated_finish = parse_int(
                REDIS_CLIENT.get("{0}_repeated_finish".format(program.hash)))
            not_ready = parse_int(
                REDIS_CLIENT.get("{0}_not_ready".format(program.hash)))
            busy_workers = 0 if busy_workers is None else int(busy_workers)
            up_workers = program.get_up()
            up_workers = 0 if up_workers is None else int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)
            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if (curr_time % INFO_FREQ) == 0:
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))

            # Cumulative counters, scaled by 1e9 (giga units).
            current_gflops = program.get_flops()
            current_gflops = (0 if current_gflops is None else
                              int(current_gflops) / 1e9)
            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            current_gbytes_read = (0 if current_gbytes_read is None else
                                   int(current_gbytes_read) / 1e9)
            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            current_gbytes_write = (0 if current_gbytes_write is None else
                                    int(current_gbytes_write) / 1e9)
            writes.append(current_gbytes_write)

            # Averages over the whole run so far.
            elapsed = times[-1] - times[0]
            gflops_rate = flops[-1] / elapsed
            greads_rate = reads[-1] / elapsed
            gwrites_rate = writes[-1] / elapsed
            b = XXT_sharded.shard_sizes[0]
            # Bytes converted to object counts using b * b * 8 bytes per
            # object.
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / elapsed
            write_rate = write_objects[-1] / elapsed

            avg_workers = np.mean(up_workers_counts)
            # Rates over the last ``smooth_len`` samples, once enough exist.
            smooth_len = 10
            if len(flops) > smooth_len + 5:
                window = times[-1] - times[-smooth_len]
                gflops_rate_5_min_window = (flops[-1] -
                                            flops[-smooth_len]) / window
                gread_rate_5_min_window = (reads[-1] -
                                           reads[-smooth_len]) / window
                gwrite_rate_5_min_window = (writes[-1] -
                                            writes[-smooth_len]) / window
                read_rate_5_min_window = (
                    read_objects[-1] - read_objects[-smooth_len]) / window
                write_rate_5_min_window = (
                    write_objects[-1] - write_objects[-smooth_len]) / window
                workers_5_min_window = np.mean(
                    up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"

            read_timeouts = int(
                parse_int(REDIS_CLIENT.get("s3.timeouts.read")))
            write_timeouts = int(
                parse_int(REDIS_CLIENT.get("s3.timeouts.write")))
            redis_timeouts = int(parse_int(
                REDIS_CLIENT.get("redis.timeouts")))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            # The small epsilon avoids dividing by zero before any object
            # was transferred.
            read_timeouts_fraction = read_timeouts / (current_objects_read +
                                                      1e-8)
            write_timeouts_fraction = write_timeouts / (
                current_objects_write + 1e-8)

            print("=======================================")
            print(
                f"Progress is {p}, Repeated Compute is {repeated_compute}, Repeated POST OP is {repeated_post_op}, Repeated Finishes is {repeated_finish}, Not ready Nodes scheduled are {not_ready}"
            )
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print(
                "{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                .format(curr_time, current_gflops, current_gbytes_read,
                        current_gbytes_write))
            print(
                "{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write rate {3}, Average Worker Count {4}"
                .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                        avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".
                  format(curr_time, read_rate, write_rate))
            print(
                "{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write rate {3}, smoothed Worker Count {4}"
                .format(curr_time, gflops_rate_5_min_window,
                        gread_rate_5_min_window, gwrite_rate_5_min_window,
                        workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".
                  format(curr_time, read_rate_5_min_window,
                         write_rate_5_min_window))
            print(
                "{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3} "
                .format(curr_time, read_timeouts, write_timeouts,
                        redis_timeouts))
            print(
                "{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                .format(curr_time, read_timeouts_fraction,
                        write_timeouts_fraction))
            print("=======================================")

            # Launch a fresh batch of workers once 0.85 * timeout seconds
            # have passed since the previous launch.
            time_since_launch = time.time() - last_run_time
            if time_since_launch > (0.85 * timeout):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(run_worker,
                                       range(cores_to_launch),
                                       extra_env=extra_env)
                last_run_time = time.time()
                all_futures.extend(new_futures)
            # NOTE(review): counted once per recorded sample; the collapsed
            # original does not show whether this belonged to the re-launch
            # branch instead -- confirm.
            exp["time_steps"] += 1
    except KeyboardInterrupt:
        # A manual interrupt still uploads the partial statistics below.
        exp["failed"] = True
        program.stop()
    except Exception:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise

    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(
        program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    total_time = times[-1] - times[0]
    print("Total Execution time: {0}".format(total_time))
    # No sample recorded means zero elapsed time; report a zero rate instead
    # of failing with ZeroDivisionError after the upload already happened.
    average_flop_rate = exp["flops"][-1] / total_time if total_time else 0.0
    print("Average Flop Rate (GFlop/s): {0}".format(average_flop_rate))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
import pywren from pywren import wrenconfig as wc import candidate_data import utils pywren_config = wc.default() pywren_config["runtime"]["s3_bucket"] = "imagenet2pywren" pywren_config["runtime"][ "s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2pywren.meta.json" pwex = pywren.default_executor(config=pywren_config) print("pywren config", pwex.config) c_data = candidate_data.CandidateData() all_cs = c_data.all_candidates chunked_cs = list(utils.chunks(list(all_cs.keys()), 100)) def return_not_exists(lst): ret_lst = [] for e in lst: key = "{0}/{1}.jpg".format("imagenet2candidates_scaled", e) exists = utils.key_exists(bucket="imagenet2datav2", key=key) print(exists, key) if (not exists): ret_lst.append(e) return ret_lst def return_not_exists_encrypted(lst):
def interactive_setup(ctx):
    '''
    Interactively create the numpywren configuration.

    Steps:
    1) Check that pywren works by running the pywren test.
    2) Ask for the config file location, S3 bucket and S3 prefix.
    3) Load the base YAML (the packaged default when overwriting, otherwise
       the existing config file) and fill in the S3, IAM and EC2 key settings.
    4) Optionally prompt for the advanced settings (lifespan, runtime
       bucket/key, IAM role and instance profile names, EC2 ssh key).
    5) Create the IAM role and instance profile, install an S3 lifecycle rule
       expiring objects under the prefix, and write the config file.

    Returns None. Returns early when the user declines the initial
    confirmation. Exceptions from the pywren test and from the EC2 key pair
    lookup are re-raised after a hint was printed.
    '''

    def ds(key):
        """
        Debug suffix for defaults. For automated testing, automatically adds
        a suffix to each default
        """
        # NOTE(review): ``suffix`` is not defined in this function and ``ds``
        # is never called here -- presumably a leftover; confirm and remove.
        return "{}{}".format(key, suffix)

    ok = click.confirm(NUMPYWREN_SETUP, default=True)
    if not ok:
        return
    click.echo("Testing pywren is correctly installed...")
    try:
        test_pywren()
    except Exception:
        # The original literal used a backslash continuation that embedded
        # the source indentation into the message; it is now a clean string.
        click.echo(
            "Looks like there is something wrong with your pywren setup. "
            "Please make sure the command pywren test_function returns "
            "successfully")
        raise

    pywren_config = wrenconfig.default()
    pywren_bucket = pywren_config["s3"]["bucket"]
    # if config file exists, ask before overwriting
    config_filename = click_validate_prompt(
        "Location for config file: ",
        default=npw.config.get_default_home_filename())
    overwrite = check_overwrite_function(config_filename)
    config_filename = os.path.expanduser(config_filename)

    s3_bucket = click_validate_prompt(
        "numpywren requires an s3 bucket to store all data. " + \
            "What s3 bucket would you like to use?",
        default=pywren_bucket,
        validate_func=check_valid_bucket_name)
    # NOTE(review): the answer is stored but nothing in this function creates
    # the bucket, so the lifecycle call below will fail for a missing bucket
    # -- confirm whether creation is supposed to happen here.
    create_bucket = False
    if not check_bucket_exists(s3_bucket):
        create_bucket = click.confirm(
            "Bucket does not currently exist, would you like to create it?",
            default=True)

    click.echo(
        "numpywren prefixes every object it puts in S3 with a particular prefix."
    )
    prefix = click_validate_prompt("numpywren s3 prefix: ",
                                   default=npw.config.AWS_S3_PREFIX_DEFAULT)

    # Start from the packaged defaults when overwriting, otherwise from the
    # existing config. The file is closed deterministically.
    if overwrite:
        base_config_path = os.path.join(SOURCE_DIR, "../default_config.yaml")
    else:
        base_config_path = config_filename
    with open(base_config_path) as config_file:
        default_yaml = yaml.safe_load(config_file)

    default_yaml["s3"]["bucket"] = s3_bucket
    default_yaml["s3"]["prefix"] = prefix
    default_yaml["iam"]["role_name"] = npw.config.AWS_ROLE_DEFAULT
    default_yaml["iam"][
        "instance_profile_name"] = npw.config.AWS_INSTANCE_PROFILE_DEFAULT

    try:
        ec2_client = boto3.client('ec2')
        response = ec2_client.describe_key_pairs()
        key_pairs = [x['KeyName'] for x in response["KeyPairs"]]
        # The first key pair is the default; an account without key pairs
        # raises IndexError here and is reported below.
        key_pair = key_pairs[0]
    except Exception:
        # Previously a bare ``raise`` preceded this message, so it was never
        # shown. The exception still propagates exactly as before.
        click.echo(
            "Error in acquiring ec2 key pair, perhaps you don't have any setup?"
        )
        raise

    default_yaml["control_plane"]["ec2_ssh_key"] = key_pair
    config_advanced = click.confirm(
        "Would you like to configure advanced numpywren properties?",
        default=False)
    if config_advanced:
        lifespan = int(
            click_validate_prompt(
                "How many days would you like numpywren to temporarily store data on S3 (default is 1 day, which translates to roughly $0.72 per TB)",
                default=default_yaml["s3"]["lifespan"],
                validate_func=check_valid_lifespan))
        default_yaml["s3"]["lifespan"] = lifespan
        runtime_bucket = click_validate_prompt(
            "Which bucket would you like pywren to load the python runtime from",
            default=default_yaml["runtime"]["bucket"],
            validate_func=check_valid_bucket_name)
        runtime_key = click_validate_prompt(
            "What is the runtime key in above bucket",
            default=default_yaml["runtime"]["s3_key"])
        default_yaml["runtime"]["bucket"] = runtime_bucket
        default_yaml["runtime"]["s3_key"] = runtime_key
        role_name = click_validate_prompt(
            "What would you like to name the numpywren iam role which will allow numpywren executors to access your AWS resources",
            default=default_yaml["iam"]["role_name"])
        default_yaml["iam"]["role_name"] = role_name
        instance_profile_name = click_validate_prompt(
            "What would you like to name the numpywren iam instance profile which will allow numpywren executors to access your AWS resources",
            default=default_yaml["iam"]["instance_profile_name"])
        default_yaml["iam"]["instance_profile_name"] = instance_profile_name
        ec2_ssh_key = click_validate_prompt(
            "Pick a valid ec2 ssh key pair",
            default=default_yaml["control_plane"]["ec2_ssh_key"])
        default_yaml["control_plane"]["ec2_ssh_key"] = ec2_ssh_key
    else:
        role_name = default_yaml["iam"]["role_name"]
        instance_profile_name = default_yaml["iam"]["instance_profile_name"]

    create_role(default_yaml, role_name)
    create_instance_profile(default_yaml, instance_profile_name)

    # Expire everything stored under the numpywren prefix after ``lifespan``
    # days.
    lifespan = default_yaml["s3"]["lifespan"]
    s3Client = boto3.client('s3')
    s3Client.put_bucket_lifecycle_configuration(Bucket=s3_bucket,
                                                LifecycleConfiguration={
                                                    'Rules': [
                                                        {
                                                            'Status':
                                                            'Enabled',
                                                            'Expiration': {
                                                                'Days':
                                                                lifespan
                                                            },
                                                            'Filter': {
                                                                'Prefix':
                                                                prefix
                                                            }
                                                        },
                                                    ]
                                                })
    with open(config_filename, "w+") as config_file:
        config_file.write(yaml.dump(default_yaml, default_flow_style=False))
import botocore import cloudpickle import numpy as np import pywren.wrenconfig as wc import dill from collections import defaultdict from . import matrix_utils from .matrix_utils import list_all_keys, block_key_to_block, get_local_matrix, key_exists_async from . import utils cpu_count = multiprocessing.cpu_count() logger = logging.getLogger('numpywren') try: DEFAULT_BUCKET = wc.default()['s3']['bucket'] DEFAULT_REGION = wc.default()['account']['aws_region'] except Exception as e: DEFAULT_BUCKET = "" DEFAULT_REGION = "" class BigMatrix(object): """ A multidimensional array stored in S3, sharded in blocks of a given size. Parameters ---------- key : string The S3 key to store this matrix at. shape : tuple of int, optional
def _wait(fs, return_early_n, max_direct_query_n, random_query=False, THREADPOOL_SIZE=16):
    """
    internal function that performs the majority of the WAIT task
    work.

    For the list of futures fs, we will check at a minimum `max_direct_query_n`
    futures at least once. Internally we :
    1. use list() to quickly get a list of which ones are done (but
    list can be behind due to eventual consistency issues)
    2. then individually call get_status on at most `max_direct_query_n`
    returning early if we have found at least `return_early_n`

    This can mitigate the stragglers.

    random_query decides whether we get the fs in the order they are
    presented or in a random order.

    Finished calls are tracked per (callset_id, call_id) pair, so futures
    belonging to different callsets are never mistaken for each other.

    Returns a pair ``(fs_dones, fs_notdones)``. When every future in ``fs``
    already is in a final state the result is ``(fs, [])``. For every future
    newly found done, ``result(throw_except=False)`` is called before
    returning so that its outcome is fetched.
    """
    finished_states = (JobState.success, JobState.error)

    # get all the futures that are not yet done
    not_done_futures = [f for f in fs if f._state not in finished_states]
    if not not_done_futures:
        return fs, []

    storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
    storage_handler = storage.Storage(storage_config)

    ### Callset optimization via object store convenience functions:
    # Group the pending futures by callset so every callset is listed once
    # and each future is only compared against its own callset's listing.
    futures_by_callset = {}
    for f in not_done_futures:
        futures_by_callset.setdefault(f.callset_id, []).append(f)

    # Previously a single set of bare call ids was rebuilt on every callset
    # iteration: results of earlier callsets were dropped and equal call ids
    # in different callsets were conflated.
    done_keys = set()
    still_not_done_futures = []
    for callset_id, callset_futures in futures_by_callset.items():
        # note this returns everything done, so we have to figure out
        # the intersection of those that are done
        callids_done_in_callset = set(
            storage_handler.get_callset_status(callset_id))
        for f in callset_futures:
            if f.call_id in callids_done_in_callset:
                done_keys.add((callset_id, f.call_id))
            else:
                still_not_done_futures.append(f)

    def fetch_future_status(f):
        return storage_handler.get_call_status(f.callset_id, f.call_id)

    def get_result(f):
        f.result(throw_except=False, storage_handler=storage_handler)

    pool = ThreadPool(THREADPOOL_SIZE)
    try:
        # now try up to max_direct_query_n direct status queries, quitting
        # once we have return_early_n done.
        query_count = 0
        max_queries = min(max_direct_query_n, len(still_not_done_futures))

        if random_query:
            random.shuffle(still_not_done_futures)

        while query_count < max_queries:
            if len(done_keys) >= return_early_n:
                break

            # Query one pool-sized batch at a time. The last batch may go
            # slightly beyond max_queries, as before.
            fs_to_query = still_not_done_futures[query_count:query_count +
                                                 THREADPOOL_SIZE]
            fs_statuses = pool.map(fetch_future_status, fs_to_query)
            done_keys.update((f.callset_id, f.call_id)
                             for f, status in zip(fs_to_query, fs_statuses)
                             if status is not None)
            query_count += len(fs_to_query)

        # now we walk through all the original queries and get
        # the ones that are actually done.
        fs_dones = []
        fs_notdones = []
        f_to_wait_on = []
        for f in fs:
            if f._state in finished_states:
                # done, don't need to do anything
                fs_dones.append(f)
            elif (f.callset_id, f.call_id) in done_keys:
                f_to_wait_on.append(f)
                fs_dones.append(f)
            else:
                fs_notdones.append(f)

        # Fetch the outcome of every newly finished future in parallel.
        pool.map(get_result, f_to_wait_on)
    finally:
        # Release the worker threads even when a storage call raised.
        pool.close()
        pool.join()

    return fs_dones, fs_notdones
import pywren from pywren.serialize import serialize import pywren.wrenconfig as wc import sympy import redis import scipy.linalg import dill import redis.exceptions import logging from .matrix import BigMatrix from .matrix_utils import load_mmap, chunk, generate_key_name_uop, generate_key_name_binop, constant_zeros from . import control_plane, matrix from . import utils try: DEFAULT_CONFIG = wc.default() except: DEFAULT_CONFIG = {} logger = logging.getLogger(__name__) class RemoteInstructionOpCodes(Enum): S3_LOAD = 0 S3_WRITE = 1 GENERIC = 3 RET = 4 class NodeStatus(Enum): NOT_READY = 0
def dummy_executor(config=None, job_max_runtime=300):
    """Build an ``Executor`` that dispatches through a ``DummyInvoker``.

    ``config`` falls back to ``wrenconfig.default()`` when it is None;
    ``job_max_runtime`` is handed to the ``Executor`` unchanged.
    """
    effective_config = wrenconfig.default() if config is None else config
    return Executor(invokers.DummyInvoker(), effective_config,
                    job_max_runtime)
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial, launch_granularity,
                   timeout, log_granularity, autoscale_policy, standalone):
    """Run one Cholesky experiment and pickle the collected measurements.

    The function generates a random ``problem_size x 1`` matrix, shards it,
    forms ``X @ X.T`` with ``binops.gemm``, builds a LambdaPack program from
    ``lp._chol`` and drives it with pywren workers while periodically sampling
    SQS / Redis counters.  All samples plus profiling information are written
    to ``optimization_experiments/<md5 of the arguments>.pickle``; a log file
    with the same stem is written next to it.

    :param problem_size: Number of rows of the random input matrix.
    :param shard_size: First entry of the shard sizes used for the input.
    :param pipeline: Pipeline width forwarded to every worker.
    :param priority: If truthy the program uses 5 priorities, otherwise 1.
    :param lru: If truthy workers get a cache size of 5, otherwise 0.
    :param eager: Forwarded to ``lp.LambdaPackProgram``.
    :param truncate: If not ``None``, only the first ``truncate`` instructions
        are executed.
    :param max_cores: Upper bound on workers for the ``"dynamic"`` policy and
        the number of workers launched per round for ``"constant_timeout"``.
    :param start_cores: Number of workers launched initially.
    :param trial: Only recorded in the result (and part of the argument hash).
    :param launch_granularity: Minimum number of seconds between two launches
        under the ``"dynamic"`` policy.
    :param timeout: Forwarded to every worker; ``"constant_timeout"`` relaunches
        after ``0.99 * timeout`` seconds.
    :param log_granularity: Seconds slept between two polling rounds.
    :param autoscale_policy: ``"dynamic"`` or ``"constant_timeout"``.
    :param standalone: If truthy use ``pywren.standalone_executor`` (and the
        avx512 runtime), otherwise ``pywren.default_executor``.
    :raises Exception: If ``autoscale_policy`` is not one of the two known
        values (raised from inside the polling loop).
    """
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    # NOTE: ``standalone`` is deliberately left out of the hashed tuple so the
    # output file names stay the same as before.
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    # The FileHandler below opens the log file immediately, so the output
    # directory has to exist *before* the handler is created.
    os.makedirs("optimization_experiments/", exist_ok=True)
    log_file = "optimization_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    X = np.random.randn(problem_size, 1)

    # Environment forwarded to every worker.
    redis_env = {
        "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
        "REDIS_PASS": os.environ.get("REDIS_PASS", ""),
    }
    config = wc.default()
    config['runtime']['s3_bucket'] = 'pictureweb'
    if standalone:
        # SECURITY: AWS credentials must never be hard-coded in source.  They
        # are taken from the caller's environment and only forwarded when
        # actually set, so workers otherwise fall back to their own
        # credential chain.
        for cred_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
            cred_value = os.environ.get(cred_name)
            if cred_value:
                redis_env[cred_name] = cred_value
        redis_env["OMP_NUM_THREADS"] = "1"
        config['runtime'][
            's3_key'] = 'pywren.runtime/pywren_runtime-3.6-numpywren_avx512.tar.gz'
        pwex = pywren.standalone_executor(config=config)
    else:
        config['runtime'][
            's3_key'] = 'pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz'
        pwex = pywren.default_executor(config=config)

    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    t = time.time()
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    e = time.time()
    print("GEMM took {0}".format(e - t))
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    # Use the ``pipeline`` parameter; the previous ``args.pipeline`` relied on
    # a module-level ``args`` that only exists when run as a script.
    pipeline_width = pipeline
    num_priorities = 5 if priority else 1
    cache_size = 5 if lru else 0

    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR,
                                     port=REDIS_PORT,
                                     password=REDIS_PASS,
                                     db=0,
                                     socket_timeout=5)

    if truncate is not None:
        instructions = instructions[:truncate]
    config = pwex.config

    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    print("LRU", lru)
    print("eager", eager)
    # The lists stored here are appended to in the polling loop below, so the
    # dict always reflects the latest samples.
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = priority
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone

    def _launch_workers(num_workers):
        # Start ``num_workers`` pywren tasks that all run the same program.
        return pwex.map(
            lambda x: job_runner.lambdapack_run(program,
                                                pipeline_width=pipeline_width,
                                                cache_size=cache_size,
                                                timeout=timeout),
            range(num_workers),
            extra_env=redis_env)

    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = _launch_workers(start_cores)
    # print([f.result() for f in all_futures])
    start_time = time.time()
    last_run_time = start_time
    # One SQS client is enough for the whole polling loop.
    sqs_client = boto3.client('sqs')

    while program.program_status() == lp.PS.RUNNING:
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for queue_url in program.queue_urls:
            attrs = sqs_client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)

        # Missing counters (``None``) are treated as zero.
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        busy_workers = 0 if busy_workers is None else int(busy_workers)
        up_workers = program.get_up()
        up_workers = 0 if up_workers is None else int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)

        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if (curr_time % INFO_FREQ) == 0:
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))

        current_gflops = program.get_flops()
        if current_gflops is None:
            current_gflops = 0
        else:
            current_gflops = int(current_gflops) / 1e9
        flops.append(current_gflops)

        current_gbytes_read = program.get_read()
        if current_gbytes_read is None:
            current_gbytes_read = 0
        else:
            current_gbytes_read = int(current_gbytes_read) / 1e9
        reads.append(current_gbytes_read)

        current_gbytes_write = program.get_write()
        if current_gbytes_write is None:
            current_gbytes_write = 0
        else:
            current_gbytes_write = int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)

        time_since_launch = time.time() - last_run_time
        if autoscale_policy == "dynamic":
            if (time_since_launch > launch_granularity
                    and up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                    and up_workers < max_cores):
                cores_to_launch = int(
                    min(
                        np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = _launch_workers(cores_to_launch)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        elif autoscale_policy == "constant_timeout":
            if time_since_launch > (0.99 * timeout):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = _launch_workers(cores_to_launch)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

    exp["all_futures"] = all_futures
    # Count instruction blocks whose start counter is not exactly one.
    doubles = 0
    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        run_count = 0 if run_count is None else int(run_count)
        if run_count != 1:
            logger.warning("PC: {0}, Run Count: {1}".format(pc, run_count))
            doubles += 1

    print("Number of repeats: {0}".format(doubles))
    e = time.time()
    time.sleep(10)
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))

    # collect profiling info for every block in parallel; the context manager
    # shuts the pool down once all results are in.
    with fs.ThreadPoolExecutor(72) as executor:
        futures = [
            executor.submit(program.get_profiling_info, i)
            for i in range(0, program.num_inst_blocks, 1)
        ]
        fs.wait(futures)
        profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string

    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))

    # save other stuff
    os.makedirs("optimization_experiments/", exist_ok=True)
    exp_bytes = pickle.dumps(exp)
    dump_path = "optimization_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
def launch_and_provision_redis(config=None):
    """Launch one EC2 instance provisioned as the Redis control plane.

    The cloud-init user data is rendered from the ``redis.cloudinit.template``
    file, embedding base64-encoded copies of the Redis config, the Redis init
    script and the CloudWatch agent config.  The created instance is tagged
    ``numpywren.control_plane`` and registered via ``set_control_plane``.

    :param config: numpywren configuration; when ``None`` the result of
        ``npw.config.default()`` is used.  The ``"control_plane"`` and
        ``"iam"`` sections are read.
    :return: dict with the keys ``id``, ``type``, ``private_ip`` and
        ``public_ip`` describing the launched instance.
    """

    def _read_text(path):
        # Read a whole text file and make sure the handle gets closed.
        with open(path, 'r') as handle:
            return handle.read()

    if config is None:
        config = npw.config.default()

    pywren_config = wc.default()
    rc = config["control_plane"]
    port = rc["port"]
    spot_price = rc["spot_price"]
    password = rc["password"]
    ipn = config["iam"]["instance_profile_name"]
    ami = rc["target_ami"]
    instance_type = rc["ec2_instance_type"]
    # TODO fix
    key_name = config["control_plane"]["ec2_ssh_key"]
    aws_region = pywren_config['account']['aws_region']

    redis_conf = _read_text(sd("redis.conf"))
    user_data = _read_text(sd("redis.cloudinit.template"))
    cloud_agent_conf = _read_text(sd("cloudwatch-agent.config"))

    cloud_agent_conf_64 = b64s(cloud_agent_conf)
    redis_conf_b64 = b64s(redis_conf.format(port=port, password=password))
    redis_init_b64 = b64s(_read_text(sd("redis_init_script")).format(port=port))

    user_data = user_data.format(redis_init=redis_init_b64,
                                 cloud_agent_conf=cloud_agent_conf_64,
                                 redis_conf=redis_conf_b64,
                                 aws_region=aws_region)

    iam = boto3.resource('iam')
    instance_profile = iam.InstanceProfile(ipn)
    instance_profile_dict = {'Name': instance_profile.name}
    group_id = create_security_group()
    # NOTE(review): the "availability_zone" key of the control-plane config is
    # not honoured; ``None`` is always passed, as before.  Confirm whether the
    # configured zone should be forwarded instead.
    instances = _create_instances(1,
                                  aws_region,
                                  spot_price,
                                  ami=ami,
                                  instance_type=instance_type,
                                  block_device_mappings=None,
                                  security_group_ids=[group_id],
                                  ebs_optimized=True,
                                  availability_zone=None,
                                  instance_profile=instance_profile_dict,
                                  user_data=user_data,
                                  key_name=key_name)

    inst = instances[0]
    # Refresh the attributes before reading IPs / tagging.
    inst.reload()
    inst.create_tags(Resources=[inst.instance_id],
                     Tags=[
                         {
                             'Key': 'Name',
                             'Value': 'numpywren.control_plane'
                         },
                     ])
    info = {
        'id': inst.id,
        'type': inst.instance_type,
        'private_ip': inst.private_ip_address,
        'public_ip': inst.public_ip_address,
    }
    set_control_plane(info, config)
    return info
def result(self, timeout=None, check_only=False, throw_except=True, storage_handler=None):
    """
    Return the value returned by the call, polling storage until it is done.

    check_only = True implies we only check if the job is completed.

    # FIXME check_only is the worst API and should be refactored
    # out to be part of done()

    If the call raised an exception, this method re-raises it unless
    `throw_except` is False, in which case None is returned instead.

    :param timeout: Not implemented. Any value other than None raises
        NotImplementedError (after the first status query). Default None,
        i.e. wait indefinitely.
    :param check_only: Only report completion: return True if the job is
        complete and False otherwise, without waiting. Default False.
    :param throw_except: Reraise exception if call raised. Default true.
    :param storage_handler: Storage handler to poll cloud storage. When None,
        one is built from the default config. Default None.
    :return: Result of the call (or True/False when `check_only` is set, or
        None when the call failed and `throw_except` is False).
    :raises ValueError: If the job has not been invoked yet.
    :raises NotImplementedError: If `timeout` is not None.
    :raises Exception: If the remote handler reported an error (version
        mismatch, out of time, cancelled, non-zero return code, ...) and
        `throw_except` is True.
    """
    if self._state == JobState.new:
        raise ValueError("job not yet invoked")

    if check_only:
        if self._state in (JobState.success, JobState.error):
            return True

    # Already resolved locally: no need to touch storage again.
    if self._state == JobState.success:
        return self._return_val

    if self._state == JobState.error:
        if throw_except:
            print("Encountered exception: {}".format(self._exception))
            raise self._exception
        else:
            return None

    if storage_handler is None:
        storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
        storage_handler = storage.Storage(storage_config)

    storage_utils.check_storage_path(storage_handler.get_storage_config(), self.storage_path)

    call_status = storage_handler.get_call_status(self.callset_id, self.call_id)
    self.status_query_count += 1

    ## FIXME implement timeout
    if timeout is not None:
        raise NotImplementedError()

    if check_only:
        return call_status is not None

    # Poll until the remote side has written a status.
    while call_status is None:
        time.sleep(self.GET_RESULT_SLEEP_SECS)
        call_status = storage_handler.get_call_status(self.callset_id, self.call_id)
        self.status_query_count += 1

    self._invoke_metadata['status_done_timestamp'] = time.time()
    self._invoke_metadata['status_query_count'] = self.status_query_count

    self.run_status = call_status  # this is the remote status information
    self.invoke_status = self._invoke_metadata  # local status information

    if call_status['exception'] is not None:
        # the wrenhandler had an exception
        exception_str = call_status['exception']
        exception_args = call_status['exception_args']
        if exception_args[0] == "WRONGVERSION":
            if throw_except:
                raise Exception("Pywren version mismatch: remote " + \
                    "expected version {}, local library is version {}".format(
                        exception_args[2], exception_args[3]))
            return None
        elif exception_args[0] == "OUTATIME":
            if throw_except:
                raise Exception("process ran out of time")
            return None
        elif exception_args[0] == "CANCELLED":
            if throw_except:
                raise Exception("job was cancelled")
            # Like every other handler error: with throw_except=False report
            # None instead of falling through and trying to download an
            # output for a call that never produced one.
            return None
        elif exception_args[0] == "RETCODE":
            if throw_except:
                raise Exception("python process failed, returned a non-zero return code"
                                "(check stdout for information)")
            return None
        else:
            if throw_except:
                if 'exception_traceback' in call_status:
                    logger.error(call_status['exception_traceback'])
                raise Exception(exception_str, *exception_args)
            return None

    # FIXME this shouldn't be called if check_only is True
    call_output_time = time.time()
    # NOTE: the output is unpickled as-is, so the storage location must be
    # trusted.
    call_invoker_result = pickle.loads(storage_handler.get_call_output(
        self.callset_id, self.call_id))
    call_output_time_done = time.time()
    self._invoke_metadata['download_output_time'] = call_output_time_done - call_output_time
    self._invoke_metadata['download_output_timestamp'] = call_output_time_done

    call_success = call_invoker_result['success']
    logger.info("ResponseFuture.result() {} {} call_success {}".format(self.callset_id,
                                                                       self.call_id,
                                                                       call_success))

    self._call_invoker_result = call_invoker_result

    if call_success:
        self._return_val = call_invoker_result['result']
        self._set_state(JobState.success)
        return self._return_val

    # The user function itself raised: remember the exception and traceback.
    self._set_state(JobState.error)
    self._exception = call_invoker_result['result']
    self._traceback = (call_invoker_result['exc_type'],
                       call_invoker_result['exc_value'],
                       call_invoker_result['exc_traceback'])
    print("Exception: {}\nTraceback: {}\n{}\n{}".format(self._exception,
                                                       str(self._traceback[0]),
                                                       str(self._traceback[1]),
                                                       str(call_invoker_result['exc_traceback'])))
    if "exec_traceback_formatted" in call_invoker_result:
        print("exec_traceback_formatted:\n{}".format(call_invoker_result["exec_traceback_formatted"]))
    print("call_invoker_result: {}".format(call_invoker_result))
    if throw_except:
        if call_invoker_result.get('pickle_fail', False):
            logger.warning(
                "there was an error pickling. The original exception: " + \
                    "{}\nThe pickling exception: {}".format(
                        call_invoker_result['exc_value'],
                        str(call_invoker_result['pickle_exception'])))

            reraise(Exception, call_invoker_result['exc_value'],
                    call_invoker_result['exc_traceback'])
        else:
            # reraise the exception
            reraise(*self._traceback)
    else:
        return None  # nothing, don't raise, no value