def generic_handler(event, context_dict, custom_handler_env=None):
    """
    event is from the invoker, and contains job information

    context_dict is generic information about the context that we are running
    in, provided by the scheduler

    custom_handler_env are environment variables we should set
    based on the platform we are on.
    """
    pid = os.getpid()

    response_status = {'exception': None}
    try:
        if event['storage_config']['storage_backend'] != 's3':
            raise NotImplementedError(("Using {} as storage backend is not supported " +
                                       "yet.").format(event['storage_config']['storage_backend']))
        s3_client = boto3.client("s3")
        s3_bucket = event['storage_config']['backend_config']['bucket']
        logger.info("invocation started")

        # download the input
        status_key = event['status_key']
        func_key = event['func_key']
        data_key = event['data_key']

        cancel_key = event['cancel_key']
        # Check for cancel
        if key_exists(s3_client, s3_bucket, cancel_key):
            logger.info("invocation cancelled")
            raise Exception("CANCELLED", "Function cancelled")
        time_of_last_cancel_check = time.time()

        data_byte_range = event['data_byte_range']
        output_key = event['output_key']

        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "Pywren version mismatch",
                            version.__version__, event['pywren_version'])

        start_time = time.time()
        response_status['start_time'] = start_time

        runtime_s3_bucket = event['runtime']['s3_bucket']
        runtime_s3_key = event['runtime']['s3_key']
        if event.get('runtime_url'):
            # NOTE(shivaram): Right now we only support S3 urls.
            runtime_s3_bucket_used, runtime_s3_key_used = wrenutil.split_s3_url(
                event['runtime_url'])
        else:
            runtime_s3_bucket_used = runtime_s3_bucket
            runtime_s3_key_used = runtime_s3_key

        job_max_runtime = event.get("job_max_runtime", 290)  # default for lambda

        response_status['func_key'] = func_key
        response_status['data_key'] = data_key
        response_status['output_key'] = output_key
        response_status['status_key'] = status_key

        data_key_size = get_key_size(s3_client, s3_bucket, data_key)
        #logger.info("bucket=", s3_bucket, "key=", data_key, "status: ", data_key_size, "bytes" )
        while data_key_size is None:
            logger.warning("WARNING COULD NOT GET FIRST KEY")
            data_key_size = get_key_size(s3_client, s3_bucket, data_key)

        if not event['use_cached_runtime']:
            shutil.rmtree(RUNTIME_LOC, True)
            os.mkdir(RUNTIME_LOC)

        free_disk_bytes = free_disk_space(TEMP)
        response_status['free_disk_bytes'] = free_disk_bytes

        response_status['runtime_s3_key_used'] = runtime_s3_key_used
        response_status['runtime_s3_bucket_used'] = runtime_s3_bucket_used

        if custom_handler_env is not None:
            delete_old_runtimes = custom_handler_env.get('delete_old_runtimes', False)
        else:
            delete_old_runtimes = False

        runtime_cached = download_runtime_if_necessary(s3_client, runtime_s3_bucket_used,
                                                       runtime_s3_key_used, delete_old_runtimes)
        logger.info("Runtime ready, cached={}".format(runtime_cached))
        response_status['runtime_cached'] = runtime_cached

        cwd = os.getcwd()
        jobrunner_path = os.path.join(cwd, "jobrunner.py")

        extra_env = event.get('extra_env', {})
        extra_env['PYTHONPATH'] = "{}".format(os.getcwd())

        call_id = event['call_id']
        callset_id = event['callset_id']
        response_status['call_id'] = call_id
        response_status['callset_id'] = callset_id

        runtime_meta = s3_client.head_object(Bucket=runtime_s3_bucket_used,
                                             Key=runtime_s3_key_used)
        ETag = str(runtime_meta['ETag'])[1:-1]
        conda_runtime_dir = CONDA_RUNTIME_DIR.format(ETag)
        conda_python_path = os.path.join(conda_runtime_dir, "bin")
        conda_python_runtime = os.path.join(conda_python_path, "python")

        # pass a full json blob
        jobrunner_config_filename = JOBRUNNER_CONFIG_FILENAME.format(pid)
        jobrunner_stats_filename = JOBRUNNER_STATS_FILENAME.format(pid)
        python_module_path = PYTHON_MODULE_PATH.format(pid)

        jobrunner_config = {'func_bucket' : s3_bucket,
                            'func_key' : func_key,
                            'data_bucket' : s3_bucket,
                            'data_key' : data_key,
                            'data_byte_range' : data_byte_range,
                            'python_module_path' : python_module_path,
                            'output_bucket' : s3_bucket,
                            'output_key' : output_key,
                            'stats_filename' : jobrunner_stats_filename}

        with open(jobrunner_config_filename, 'w') as jobrunner_fid:
            json.dump(jobrunner_config, jobrunner_fid)

        if os.path.exists(jobrunner_stats_filename):
            os.remove(jobrunner_stats_filename)

        cmdstr = "{} {} {}".format(conda_python_runtime,
                                   jobrunner_path,
                                   jobrunner_config_filename)

        setup_time = time.time()
        response_status['setup_time'] = setup_time - start_time

        local_env = os.environ.copy()

        if custom_handler_env is not None:
            local_env.update(custom_handler_env)

        local_env.update(extra_env)

        local_env['PATH'] = "{}{}{}".format(conda_python_path, os.pathsep,
                                            local_env.get("PATH", ""))

        logger.debug("command str=%s", cmdstr)
        # This is copied from http://stackoverflow.com/a/17698359/4577954
        # reasons for setting process group: http://stackoverflow.com/a/4791612
        if os.name == 'nt':
            process = subprocess.Popen(cmdstr, shell=True, env=local_env, bufsize=1,
                                       stdout=subprocess.PIPE,
                                       creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
        else:
            process = subprocess.Popen(cmdstr, # pylint: disable=subprocess-popen-preexec-fn
                                       shell=True, env=local_env, bufsize=1,
                                       stdout=subprocess.PIPE, preexec_fn=os.setsid)

        logger.info("launched process")

        def kill_process(process):
            if os.name == 'nt':
                subprocess.call(['taskkill', '/F', '/T', '/PID', str(process.pid)]) # pylint: disable=no-member
            else:
                os.killpg(os.getpgid(process.pid), signal.SIGTERM)

        def consume_stdout(stdout, queue):
            with stdout:
                for line in iter(stdout.readline, b''):
                    queue.put(line)

        q = Queue()

        t = Thread(target=consume_stdout, args=(process.stdout, q))
        t.daemon = True
        t.start()

        stdout = b""
        while t.isAlive() or process.returncode is None:
            logger.info("Running {} {}".format(time.time(), process.returncode))
            try:
                line = q.get_nowait()
                stdout += line
                logger.info(line)
            except Empty:
                time.sleep(PROCESS_STDOUT_SLEEP_SECS)

            process.poll() # this updates retcode but does not block
            if not t.isAlive() and process.returncode is None:
                time.sleep(PROCESS_STDOUT_SLEEP_SECS)

            total_runtime = time.time() - start_time

            time_since_cancel_check = time.time() - time_of_last_cancel_check
            if time_since_cancel_check > CANCEL_CHECK_EVERY_SECS:
                if key_exists(s3_client, s3_bucket, cancel_key):
                    logger.info("invocation cancelled")
                    # kill the process
                    kill_process(process)
                    raise Exception("CANCELLED", "Function cancelled")
                time_of_last_cancel_check = time.time()

            if total_runtime > job_max_runtime:
                logger.warning("Process exceeded maximum runtime of {} sec".format(job_max_runtime))
                # Send the signal to all the process groups
                kill_process(process)
                raise Exception("OUTATIME",
                                "Process executed for too long and was killed")

        response_status['retcode'] = process.returncode
        logger.info("command execution finished, retcode= {}".format(process.returncode))

        if process.returncode != 0:
            logger.warning("process returned non-zero retcode {}".format(process.returncode))
            logger.info(stdout.decode('ascii'))
            raise Exception("RETCODE",
                            "Python process returned a non-zero return code")

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ")
                    float_value = float(value)
                    response_status[key] = float_value

        end_time = time.time()

        response_status['stdout'] = stdout.decode("ascii")
        response_status['exec_time'] = time.time() - setup_time
        response_status['end_time'] = end_time

        response_status['host_submit_time'] = event['host_submit_time']
        response_status['server_info'] = get_server_info()

        response_status.update(context_dict)
    except Exception as e:
        # internal runtime exceptions
        response_status['exception'] = str(e)
        response_status['exception_args'] = e.args
        response_status['exception_traceback'] = traceback.format_exc()
    finally:
        # creating new client in case the client has not been created
        boto3.client("s3").put_object(Bucket=s3_bucket, Key=status_key,
                                      Body=json.dumps(response_status))
def generic_handler(event, context_dict):
    """
    context_dict is generic information about the context that we are running
    in, provided by the scheduler
    """
    try:
        response_status = {'exception': None}
        s3 = boto3.resource('s3')
        logger.info("invocation started")

        # download the input
        status_key = event['status_key']
        func_key = event['func_key']
        data_key = event['data_key']
        data_byte_range = event['data_byte_range']
        output_key = event['output_key']

        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "Pywren version mismatch",
                            version.__version__, event['pywren_version'])

        start_time = time.time()
        response_status['start_time'] = start_time

        func_filename = "/tmp/func.pickle"
        data_filename = "/tmp/data.pickle"
        output_filename = "/tmp/output.pickle"

        runtime_s3_bucket = event['runtime_s3_bucket']
        runtime_s3_key = event['runtime_s3_key']
        if event.get('runtime_url'):
            # NOTE(shivaram): Right now we only support S3 urls.
            runtime_s3_bucket_used, runtime_s3_key_used = wrenutil.split_s3_url(
                event['runtime_url'])
        else:
            runtime_s3_bucket_used = runtime_s3_bucket
            runtime_s3_key_used = runtime_s3_key

        job_max_runtime = event.get("job_max_runtime", 290)  # default for lambda

        response_status['func_key'] = func_key
        response_status['data_key'] = data_key
        response_status['output_key'] = output_key
        response_status['status_key'] = status_key

        b, k = data_key
        KS = s3util.key_size(b, k)
        #logger.info("bucket=", b, "key=", k, "status: ", KS, "bytes" )
        while KS is None:
            logger.warn("WARNING COULD NOT GET FIRST KEY")
            KS = s3util.key_size(b, k)

        if not event['use_cached_runtime']:
            subprocess.check_output("rm -Rf {}/*".format(RUNTIME_LOC), shell=True)

        # get the input and save to disk
        # FIXME here is where we would attach the "cancelled" metadata
        s3.meta.client.download_file(func_key[0], func_key[1], func_filename)
        func_download_time = time.time() - start_time
        response_status['func_download_time'] = func_download_time

        logger.info("func download complete, took {:3.2f} sec".format(
            func_download_time))

        if data_byte_range is None:
            s3.meta.client.download_file(data_key[0], data_key[1], data_filename)
        else:
            range_str = 'bytes={}-{}'.format(*data_byte_range)
            dres = s3.meta.client.get_object(Bucket=data_key[0], Key=data_key[1],
                                             Range=range_str)
            data_fid = open(data_filename, 'wb')
            data_fid.write(dres['Body'].read())
            data_fid.close()

        data_download_time = time.time() - start_time
        logger.info("data download complete, took {:3.2f} sec".format(
            data_download_time))
        response_status['data_download_time'] = data_download_time

        # now split
        d = json.load(open(func_filename, 'r'))

        shutil.rmtree(PYTHON_MODULE_PATH, True) # delete old modules
        os.mkdir(PYTHON_MODULE_PATH)

        # get modules and save
        for m_filename, m_data in d['module_data'].items():
            m_path = os.path.dirname(m_filename)

            if len(m_path) > 0 and m_path[0] == "/":
                m_path = m_path[1:]
            to_make = os.path.join(PYTHON_MODULE_PATH, m_path)
            #print "to_make=", to_make, "m_path=", m_path
            try:
                os.makedirs(to_make)
            except OSError as e:
                if e.errno == 17:
                    pass
                else:
                    raise e
            full_filename = os.path.join(to_make, os.path.basename(m_filename))
            #print "creating", full_filename
            fid = open(full_filename, 'wb')
            fid.write(b64str_to_bytes(m_data))
            fid.close()

        logger.info("Finished writing {} module files".format(
            len(d['module_data'])))
        logger.debug(
            subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        logger.debug(
            subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        response_status['runtime_s3_key_used'] = runtime_s3_key_used
        response_status['runtime_s3_bucket_used'] = runtime_s3_bucket_used

        runtime_cached = download_runtime_if_necessary(s3, runtime_s3_bucket_used,
                                                       runtime_s3_key_used)
        logger.info("Runtime ready, cached={}".format(runtime_cached))
        response_status['runtime_cached'] = runtime_cached

        cwd = os.getcwd()
        jobrunner_path = os.path.join(cwd, "jobrunner.py")

        extra_env = event.get('extra_env', {})
        extra_env['PYTHONPATH'] = "{}:{}".format(os.getcwd(), PYTHON_MODULE_PATH)

        call_id = event['call_id']
        callset_id = event['callset_id']
        response_status['call_id'] = call_id
        response_status['callset_id'] = callset_id

        CONDA_PYTHON_PATH = "/tmp/condaruntime/bin"
        CONDA_PYTHON_RUNTIME = os.path.join(CONDA_PYTHON_PATH, "python")

        cmdstr = "{} {} {} {} {}".format(CONDA_PYTHON_RUNTIME,
                                         jobrunner_path,
                                         func_filename,
                                         data_filename,
                                         output_filename)

        setup_time = time.time()
        response_status['setup_time'] = setup_time - start_time

        local_env = os.environ.copy()
        local_env["OMP_NUM_THREADS"] = "1"
        local_env.update(extra_env)

        local_env['PATH'] = "{}:{}".format(CONDA_PYTHON_PATH, local_env.get("PATH", ""))

        logger.debug("command str=%s", cmdstr)
        # This is copied from http://stackoverflow.com/a/17698359/4577954
        # reasons for setting process group: http://stackoverflow.com/a/4791612
        process = subprocess.Popen(cmdstr, shell=True, env=local_env, bufsize=1,
                                   stdout=subprocess.PIPE, preexec_fn=os.setsid)

        logger.info("launched process")

        def consume_stdout(stdout, queue):
            with stdout:
                for line in iter(stdout.readline, b''):
                    queue.put(line)

        q = Queue()

        t = Thread(target=consume_stdout, args=(process.stdout, q))
        t.daemon = True
        t.start()

        stdout = b""
        while t.isAlive():
            try:
                line = q.get_nowait()
                stdout += line
                logger.info(line)
            except Empty:
                time.sleep(PROCESS_STDOUT_SLEEP_SECS)
            total_runtime = time.time() - start_time
            if total_runtime > job_max_runtime:
                logger.warn(
                    "Process exceeded maximum runtime of {} sec".format(
                        job_max_runtime))
                # Send the signal to all the process groups
                os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                raise Exception(
                    "OUTATIME",
                    "Process executed for too long and was killed")

        logger.info("command execution finished")

        s3.meta.client.upload_file(output_filename, output_key[0],
                                   output_key[1])
        logger.debug("output uploaded to %s %s", output_key[0], output_key[1])

        end_time = time.time()

        response_status['stdout'] = stdout.decode("ascii")
        response_status['exec_time'] = time.time() - setup_time
        response_status['end_time'] = end_time

        response_status['host_submit_time'] = event['host_submit_time']
        response_status['server_info'] = get_server_info()

        response_status.update(context_dict)
    except Exception as e:
        # internal runtime exceptions
        response_status['exception'] = str(e)
        response_status['exception_args'] = e.args
        response_status['exception_traceback'] = traceback.format_exc()
    finally:
        s3.meta.client.put_object(Bucket=status_key[0], Key=status_key[1],
                                  Body=json.dumps(response_status))
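
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the handler above): fetching only a byte
# range of an S3 object, as the handler does when data_byte_range is set.
# The function name, bucket, key, and local path below are placeholders.
# ---------------------------------------------------------------------------
import boto3

def download_byte_range(bucket, key, byte_range, local_path):
    # byte_range is an inclusive (start, end) pair, matching data_byte_range
    s3_client = boto3.client("s3")
    range_str = 'bytes={}-{}'.format(*byte_range)
    resp = s3_client.get_object(Bucket=bucket, Key=key, Range=range_str)
    with open(local_path, 'wb') as fid:
        fid.write(resp['Body'].read())

# example usage (placeholder names):
# download_byte_range("my-bucket", "pywren.jobs/data.pickle", (0, 1023), "/tmp/data.pickle")
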
def generic_handler(event, context_dict, custom_handler_env=None):
    """
    event is from the invoker, and contains job information

    context_dict is generic information about the context that we are running
    in, provided by the scheduler

    custom_handler_env are environment variables we should set
    based on the platform we are on.
    """
    response_status = {'exception': None}
    try:
        if event['storage_config']['storage_backend'] != 's3':
            raise NotImplementedError(("Using {} as storage backend is not supported " +
                                       "yet.").format(event['storage_config']['storage_backend']))
        s3_client = boto3.client("s3")
        s3_bucket = event['storage_config']['backend_config']['bucket']
        logger.info("invocation started")

        # download the input
        status_key = event['status_key']
        func_key = event['func_key']
        data_key = event['data_key']
        data_byte_range = event['data_byte_range']
        output_key = event['output_key']

        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "Pywren version mismatch",
                            version.__version__, event['pywren_version'])

        start_time = time.time()
        response_status['start_time'] = start_time

        runtime_s3_bucket = event['runtime']['s3_bucket']
        runtime_s3_key = event['runtime']['s3_key']
        if event.get('runtime_url'):
            # NOTE(shivaram): Right now we only support S3 urls.
            runtime_s3_bucket_used, runtime_s3_key_used = wrenutil.split_s3_url(
                event['runtime_url'])
        else:
            runtime_s3_bucket_used = runtime_s3_bucket
            runtime_s3_key_used = runtime_s3_key

        job_max_runtime = event.get("job_max_runtime", 290)  # default for lambda

        response_status['func_key'] = func_key
        response_status['data_key'] = data_key
        response_status['output_key'] = output_key
        response_status['status_key'] = status_key

        data_key_size = get_key_size(s3_client, s3_bucket, data_key)
        #logger.info("bucket=", s3_bucket, "key=", data_key, "status: ", data_key_size, "bytes" )
        while data_key_size is None:
            logger.warning("WARNING COULD NOT GET FIRST KEY")
            data_key_size = get_key_size(s3_client, s3_bucket, data_key)

        if not event['use_cached_runtime']:
            subprocess.check_output("rm -Rf {}/*".format(RUNTIME_LOC), shell=True)

        free_disk_bytes = free_disk_space("/tmp")
        response_status['free_disk_bytes'] = free_disk_bytes

        response_status['runtime_s3_key_used'] = runtime_s3_key_used
        response_status['runtime_s3_bucket_used'] = runtime_s3_bucket_used

        runtime_cached = download_runtime_if_necessary(s3_client, runtime_s3_bucket_used,
                                                       runtime_s3_key_used)
        logger.info("Runtime ready, cached={}".format(runtime_cached))
        response_status['runtime_cached'] = runtime_cached

        cwd = os.getcwd()
        jobrunner_path = os.path.join(cwd, "jobrunner.py")

        extra_env = event.get('extra_env', {})
        extra_env['PYTHONPATH'] = "{}".format(os.getcwd())

        call_id = event['call_id']
        callset_id = event['callset_id']
        response_status['call_id'] = call_id
        response_status['callset_id'] = callset_id

        CONDA_PYTHON_PATH = "/tmp/condaruntime/bin"
        CONDA_PYTHON_RUNTIME = os.path.join(CONDA_PYTHON_PATH, "python")

        # pass a full json blob
        jobrunner_config = {'func_bucket' : s3_bucket,
                            'func_key' : func_key,
                            'data_bucket' : s3_bucket,
                            'data_key' : data_key,
                            'data_byte_range' : data_byte_range,
                            'python_module_path' : PYTHON_MODULE_PATH,
                            'output_bucket' : s3_bucket,
                            'output_key' : output_key,
                            'stats_filename' : JOBRUNNER_STATS_FILENAME}

        with open(JOBRUNNER_CONFIG_FILENAME, 'w') as jobrunner_fid:
            json.dump(jobrunner_config, jobrunner_fid)

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        cmdstr = "{} {} {}".format(CONDA_PYTHON_RUNTIME,
                                   jobrunner_path,
                                   JOBRUNNER_CONFIG_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = setup_time - start_time

        local_env = os.environ.copy()

        if custom_handler_env is not None:
            local_env.update(custom_handler_env)

        local_env.update(extra_env)

        local_env['PATH'] = "{}:{}".format(CONDA_PYTHON_PATH, local_env.get("PATH", ""))

        logger.debug("command str=%s", cmdstr)
        # This is copied from http://stackoverflow.com/a/17698359/4577954
        # reasons for setting process group: http://stackoverflow.com/a/4791612
        process = subprocess.Popen(cmdstr, shell=True, env=local_env, bufsize=1,
                                   stdout=subprocess.PIPE, preexec_fn=os.setsid)

        logger.info("launched process")

        def consume_stdout(stdout, queue):
            with stdout:
                for line in iter(stdout.readline, b''):
                    queue.put(line)

        q = Queue()

        t = Thread(target=consume_stdout, args=(process.stdout, q))
        t.daemon = True
        t.start()

        stdout = b""
        while t.isAlive():
            try:
                line = q.get_nowait()
                stdout += line
                logger.info(line)
            except Empty:
                time.sleep(PROCESS_STDOUT_SLEEP_SECS)
            total_runtime = time.time() - start_time
            if total_runtime > job_max_runtime:
                logger.warning("Process exceeded maximum runtime of {} sec".format(job_max_runtime))
                # Send the signal to all the process groups
                os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                raise Exception("OUTATIME",
                                "Process executed for too long and was killed")

        logger.info("command execution finished")

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ")
                    float_value = float(value)
                    response_status[key] = float_value

        end_time = time.time()

        response_status['stdout'] = stdout.decode("ascii")
        response_status['exec_time'] = time.time() - setup_time
        response_status['end_time'] = end_time

        response_status['host_submit_time'] = event['host_submit_time']
        response_status['server_info'] = get_server_info()

        response_status.update(context_dict)
    except Exception as e:
        # internal runtime exceptions
        response_status['exception'] = str(e)
        response_status['exception_args'] = e.args
        response_status['exception_traceback'] = traceback.format_exc()
    finally:
        # creating new client in case the client has not been created
        boto3.client("s3").put_object(Bucket=s3_bucket, Key=status_key,
                                      Body=json.dumps(response_status))
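
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the handler above): parsing the jobrunner
# stats file read at the end of the handler, assuming one "<key> <float>" pair
# per line as the code above implies. parse_jobrunner_stats is a placeholder
# helper, not part of pywren.
# ---------------------------------------------------------------------------
def parse_jobrunner_stats(stats_filename):
    stats = {}
    with open(stats_filename, 'r') as fid:
        for line in fid:
            key, value = line.strip().split(" ")
            stats[key] = float(value)
    return stats

# e.g. response_status.update(parse_jobrunner_stats(JOBRUNNER_STATS_FILENAME))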