def put_object(self, bucket_name, key, data):
    """
    Put an object in COS. Override the object if the key already exists.
    :param bucket_name: name of the bucket.
    :param key: key of the object.
    :param data: data of the object
    :type data: str/bytes
    :return: True if the object was stored successfully.
    """
    retries = 0
    status = None
    while status is None:
        try:
            res = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
            status = 'OK' if res['ResponseMetadata']['HTTPStatusCode'] == 200 else 'Error'
            try:
                logger.debug('PUT Object {} - Size: {} - {}'.format(key, sizeof_fmt(len(data)), status))
            except Exception:
                logger.debug('PUT Object {} {}'.format(key, status))
        except ibm_botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "NoSuchKey":
                raise StorageNoSuchKeyError(bucket_name, key)
            else:
                raise e
        except ibm_botocore.exceptions.ReadTimeoutError as e:
            if retries == OBJ_REQ_RETRIES:
                raise e
            logger.debug('PUT Object timeout. Retrying request')
            retries += 1
    return True
def _send_status_rabbitmq(self):
    """
    Send the status event to RabbitMQ
    """
    dmpd_response_status = json.dumps(self.response)
    drs = sizeof_fmt(len(dmpd_response_status))

    executor_id = self.response['executor_id']
    job_id = self.response['job_id']

    rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
    status_sent = False
    output_query_count = 0
    params = pika.URLParameters(rabbit_amqp_url)
    exchange = 'pywren-{}-{}'.format(executor_id, job_id)

    while not status_sent and output_query_count < 5:
        output_query_count = output_query_count + 1
        try:
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
            channel.basic_publish(exchange=exchange, routing_key='', body=dmpd_response_status)
            connection.close()
            logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
            status_sent = True
        except Exception as e:
            logger.error("Unable to send status to rabbitmq")
            logger.error(str(e))
            logger.info('Retrying to send status to rabbitmq...')
            time.sleep(0.2)
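# Hedged sketch (not part of the original module): a host-side consumer for the status
# events published by _send_status_rabbitmq() above. It assumes the same
# 'pywren-{executor_id}-{job_id}' fanout exchange naming, that pika and the AMQP URL are
# available, and uses an illustrative callback; names here are for demonstration only.
def consume_status_events(rabbit_amqp_url, executor_id, job_id, on_status):
    params = pika.URLParameters(rabbit_amqp_url)
    exchange = 'pywren-{}-{}'.format(executor_id, job_id)
    connection = pika.BlockingConnection(params)
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
    # Exclusive, auto-named queue bound to the fanout exchange
    result = channel.queue_declare(queue='', exclusive=True)
    channel.queue_bind(exchange=exchange, queue=result.method.queue)

    def _callback(ch, method, properties, body):
        # Each message body is the JSON-dumped response status
        on_status(json.loads(body))

    channel.basic_consume(queue=result.method.queue, on_message_callback=_callback, auto_ack=True)
    channel.start_consuming()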
def put_object(self, bucket_name, key, data):
    try:
        res = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
        status = 'OK' if res['ResponseMetadata']['HTTPStatusCode'] == 200 else 'Error'
        try:
            print('PUT Object {} - Size: {} - {}'.format(key, sizeof_fmt(len(data)), status))
        except Exception:
            print('PUT Object {} {}'.format(key, status))
    except ibm_botocore.exceptions.ClientError as e:
        raise e
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
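# Hedged usage sketch (illustrative, not part of the original module): log the current
# runtime memory from inside the action code using the helper above. The module-level
# logger already used elsewhere in this file is assumed to be available.
logger.debug("Runtime memory in use: {}".format(get_memory_usage()))
raw_total = get_memory_usage(formatted=False)  # plain integer value, useful for custom thresholds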
def _send_status_os(self):
    """
    Send the status event to the Object Storage
    """
    executor_id = self.response['executor_id']
    job_id = self.response['job_id']
    call_id = self.response['call_id']

    if self.response['type'] == '__init__':
        init_key = create_init_key(JOBS_PREFIX, executor_id, job_id, call_id)
        self.internal_storage.put_data(init_key, '')
    elif self.response['type'] == '__end__':
        status_key = create_status_key(JOBS_PREFIX, executor_id, job_id, call_id)
        dmpd_response_status = json.dumps(self.response)
        drs = sizeof_fmt(len(dmpd_response_status))
        logger.info("Storing execution stats - status.json - Size: {}".format(drs))
        self.internal_storage.put_data(status_key, dmpd_response_status)
def function_handler(event):
    start_time = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = extract_storage_config(config)

    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id))

    execution_timeout = event['execution_timeout']
    logger.debug("Set function execution timeout to {}s".format(execution_timeout))

    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['job_id'] = job_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {'PYWREN_CONFIG': json.dumps(config),
                      'PYWREN_REMOTE': 'TRUE',
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
                      'PYTHONUNBUFFERED': 'True'}
        os.environ.update(custom_env)
        os.environ.update(extra_env)

        jobrunner_config = {'pywren_config': config,
                            'call_id': call_id,
                            'job_id': job_id,
                            'executor_id': executor_id,
                            'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        tr = JobRunner(jobrunner_config, result_queue)
        tr.daemon = True
        logger.debug('Starting JobRunner process')
        tr.start()
        tr.join(execution_timeout)
        logger.debug('Finished JobRunner process')
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if tr.is_alive():
            # If the process is still alive after tr.join(execution_timeout), kill it
            tr.terminate()
            msg = ('Jobrunner process exceeded maximum time of {} '
                   'seconds and was killed'.format(execution_timeout))
            raise Exception('OUTATIME', msg)

        if result_queue.empty():
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # In 99% of cases the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Jobrunner process exceeded maximum memory and was killed'
            raise Exception('OUTOFMEMORY', msg)

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result', 'new_futures']:
                        response_status[key] = eval(value)

        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------', flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------', flush=True)
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        rabbitmq_monitor = config['pywren'].get('rabbitmq_monitor', False)
        if rabbitmq_monitor and store_status:
            rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
            status_sent = False
            output_query_count = 0
            params = pika.URLParameters(rabbit_amqp_url)
            queue = '{}-{}'.format(executor_id, job_id)

            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                try:
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=queue, auto_delete=True)
                    channel.basic_publish(exchange='', routing_key=queue,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send status to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
def run(self):
    """
    Runs the function
    """
    logger.info("Started")
    # initial output file in case the job fails
    output_dict = {'result': None, 'success': False}
    pickled_output = pickle.dumps(output_dict)

    try:
        self.internal_storage = storage.InternalStorage(self.storage_config)
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()
        data = self._create_storage_clients(function, data)

        if self.show_memory:
            logger.debug("Memory usage before calling the function: {}".format(get_current_memory_usage()))

        logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
        print('------------------- FUNCTION LOG -------------------', flush=True)
        func_exec_time_t1 = time.time()
        result = function(**data)
        func_exec_time_t2 = time.time()
        print('----------------------------------------------------', flush=True)
        logger.info("Function: Success execution")

        if self.show_memory:
            logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

        self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))
        output_dict = {'result': result, 'success': True}
        pickled_output = pickle.dumps(output_dict)

        # Check for new futures
        if isinstance(result, ResponseFuture):
            callgroup_id = result.callgroup_id
            self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
        elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
            callgroup_id = result[0].callgroup_id
            self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
        else:
            self.stats.write('new_futures', '{}/{}'.format(None, 0))

        if self.show_memory:
            logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))

    except Exception as e:
        print('------------------ EXCEPTION -------------------------')
        exc_type, exc_value, exc_traceback = sys.exc_info()
        # traceback.print_tb(exc_traceback)

        # Shockingly often, modules like subprocess don't properly
        # call the base Exception.__init__, which results in them
        # being unpickleable. As a result, we actually wrap this in a try/catch block
        # and more-carefully handle the exceptions if any part of this save / test-reload
        # fails
        logger.error("There was an exception: {}".format(str(e)))

        try:
            pickled_output = pickle.dumps({'result': e,
                                           'exc_type': exc_type,
                                           'exc_value': exc_value,
                                           'exc_traceback': exc_traceback,
                                           'sys.path': sys.path,
                                           'success': False})
            # this is just to make sure it can be unpickled
            pickle.loads(pickled_output)
        except Exception as pickle_exception:
            pickled_output = pickle.dumps({'result': str(e),
                                           'exc_type': str(exc_type),
                                           'exc_value': str(exc_value),
                                           'exc_traceback': exc_traceback,
                                           'exc_traceback_str': str(exc_traceback),
                                           'sys.path': sys.path,
                                           'pickle_fail': True,
                                           'pickle_exception': pickle_exception,
                                           'success': False})
    finally:
        store_result = True
        if 'STORE_RESULT' in os.environ:
            store_result = eval(os.environ['STORE_RESULT'])

        if store_result:
            output_upload_timestamp_t1 = time.time()
            logger.info("Storing {} - Size: {}".format(self.output_key, sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_timestamp_t2 = time.time()
            self.stats.write("output_upload_time",
                             round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))

        self.result_queue.put("Finished")
        logger.info("Finished")
def _map(self, func, iterdata, extra_env=None, extra_meta=None, invoke_pool_threads=128,
         data_all_as_one=True, overwrite_invoke_args=None, exclude_modules=None,
         original_func_name=None, remote_invocation=False, original_iterdata_len=None,
         job_max_runtime=wrenconfig.RUNTIME_TIMEOUT):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True.
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype: list of futures.
    """
    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    data = wrenutil.iterdata_as_list(iterdata)

    if extra_env is not None:
        extra_env = wrenutil.convert_bools_to_string(extra_env)

    if not data:
        return []

    if self.map_item_limit is not None and len(data) > self.map_item_limit:
        raise ValueError("len(data) = {}, exceeding map item limit of {}. "
                         "Consider mapping over a smaller "
                         "number of items".format(len(data), self.map_item_limit))

    # This allows multiple parameters in functions
    data = wrenutil.verify_args(func, data)

    callgroup_id = wrenutil.create_callgroup_id()

    host_job_meta = {}

    log_msg = 'Executor ID {} Serializing function and data'.format(self.executor_id)
    logger.debug(log_msg)
    # pickle func and all data (to capture module dependencies)
    func_and_data_ser, mod_paths = self.serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    agg_data_key = None
    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'Executor ID {} Uploading function and data'.format(self.executor_id)
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg, end=' ')

    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        agg_data_key = create_agg_data_key(self.internal_storage.prefix,
                                           self.executor_id, callgroup_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('Executor ID {} Total data exceeded maximum size '
                   'of {} bytes'.format(self.executor_id, wrenconfig.MAX_AGG_DATA_SIZE))
        logger.warning(log_msg)

    if exclude_modules:
        for module in exclude_modules:
            for mod_path in list(mod_paths):
                if module in mod_path and mod_path in mod_paths:
                    mod_paths.remove(mod_path)

    module_data = create_mod_data(mod_paths)
    # Create func and upload
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
    self.internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not self.log_level:
        func_and_data_size = wrenutil.sizeof_fmt(host_job_meta['func_module_bytes'] +
                                                 host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    def invoke(data_str, executor_id, callgroup_id, call_id, func_key,
               host_job_meta, agg_data_key=None, data_byte_range=None):
        data_key, output_key, status_key = create_keys(self.internal_storage.prefix,
                                                       executor_id, callgroup_id, call_id)
        host_job_meta['job_invoke_timestamp'] = time.time()

        if agg_data_key is None:
            data_upload_time = time.time()
            self.internal_storage.put_data(data_key, data_str)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            data_key = agg_data_key

        return self.invoke_with_keys(func_key, data_key, output_key, status_key,
                                     executor_id, callgroup_id, call_id, extra_env,
                                     extra_meta, data_byte_range,
                                     host_job_meta.copy(), job_max_runtime,
                                     overwrite_invoke_args=overwrite_invoke_args)

    N = len(data)
    call_futures = []
    if remote_invocation and original_iterdata_len > 1:
        log_msg = ('Executor ID {} Starting {} remote invocation function: Spawning {}() '
                   '- Total: {} activations'.format(self.executor_id, N, func_name,
                                                    original_iterdata_len))
    else:
        log_msg = ('Executor ID {} Starting function invocation: {}() '
                   '- Total: {} activations'.format(self.executor_id, func_name, N))
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg)

    with ThreadPoolExecutor(max_workers=invoke_pool_threads) as executor:
        for i in range(N):
            call_id = "{:05d}".format(i)
            data_byte_range = None
            if agg_data_key is not None:
                data_byte_range = agg_data_ranges[i]
            future = executor.submit(invoke, data_strs[i], self.executor_id,
                                     callgroup_id, call_id, func_key,
                                     host_job_meta.copy(), agg_data_key, data_byte_range)
            call_futures.append(future)

    res = [ft.result() for ft in call_futures]

    return res
def run(self):
    """
    Runs the function
    """
    logger.info("Started")
    # initial output file in case the job fails
    result = None
    exception = False
    try:
        self.internal_storage = InternalStorage(self.storage_config)
        self.internal_storage.tmp_obj_prefix = self.output_key.rsplit('/', 1)[0]
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()
        data = self._create_storage_clients(function, data)

        if self.show_memory:
            logger.debug("Memory usage before calling the function: {}".format(get_current_memory_usage()))

        logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
        print('---------------------- FUNCTION LOG ----------------------', flush=True)
        func_exec_time_t1 = time.time()
        result = function(**data)
        func_exec_time_t2 = time.time()
        print('----------------------------------------------------------', flush=True)
        logger.info("Function: Success execution")

        if self.show_memory:
            logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

        self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))

        # Check for new futures
        if result is not None:
            self.stats.write("result", True)
            if isinstance(result, ResponseFuture):
                callgroup_id = result.callgroup_id
                self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
            elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
                callgroup_id = result[0].callgroup_id
                self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
            else:
                self.stats.write('new_futures', '{}/{}'.format(None, 0))

            logger.debug("Pickling result")
            output_dict = {'result': result}
            pickled_output = pickle.dumps(output_dict)

            if self.show_memory:
                logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))
        else:
            logger.debug("No result to store")
            self.stats.write("result", False)

    except Exception as e:
        exception = True
        self.stats.write("exception", True)
        print('----------------------- EXCEPTION !-----------------------')
        logger.error("There was an exception: {}".format(str(e)))
        print('----------------------------------------------------------', flush=True)

        if self.show_memory:
            logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

        try:
            logger.debug("Pickling exception")
            pickled_exc = pickle.dumps(sys.exc_info())
            pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
            self.stats.write("exc_info", str(pickled_exc))
        except Exception as pickle_exception:
            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. As a result, we actually wrap this in a try/catch block
            # and more-carefully handle the exceptions if any part of this save / test-reload
            # fails
            logger.debug("Failed pickling exception: {}".format(str(pickle_exception)))
            self.stats.write("exc_pickle_fail", True)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                        'exc_value': str(exc_value),
                                        'exc_traceback': exc_traceback,
                                        'pickle_exception': pickle_exception})
            pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
            self.stats.write("exc_info", str(pickled_exc))
    finally:
        store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
        if result is not None and store_result and not exception:
            output_upload_timestamp_t1 = time.time()
            logger.info("Storing function result - output.pickle - Size: {}".format(sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_timestamp_t2 = time.time()
            self.stats.write("output_upload_time",
                             round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))

        self.result_queue.put("Finished")
        logger.info("Finished")
def _create_job(config, internal_storage, executor_id, job_id, func, data, runtime_meta,
                runtime_memory=None, extra_env=None, invoke_pool_threads=128,
                include_modules=[], exclude_modules=[], original_func_name=None,
                remote_invocation=False, original_total_tasks=None,
                execution_timeout=EXECUTION_TIMEOUT):
    """
    :param func: the function to map over the data
    :param data: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A dictionary with the job description.
    :rtype: dict
    """
    log_level = os.getenv('CB_LOG_LEVEL')
    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    if extra_env is not None:
        extra_env = utils.convert_bools_to_string(extra_env)

    if not data:
        return []

    host_job_meta = {}
    job_description = {}

    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['func_name'] = func_name
    job_description['extra_env'] = extra_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['job_id'] = job_id
    job_description['remote_invocation'] = remote_invocation
    job_description['original_total_calls'] = original_total_tasks

    log_msg = 'ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id)
    logger.debug(log_msg)

    # pickle func and all data (to capture module dependencies)
    exclude_modules.extend(config['pywren'].get('exclude_modules', []))
    include_modules_cfg = config['pywren'].get('include_modules', [])
    if include_modules is not None and include_modules_cfg is not None:
        include_modules.extend(include_modules_cfg)
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, include_modules, exclude_modules)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)

    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(executor_id, job_id)
    logger.info(log_msg)
    if not log_level:
        print(log_msg, end=' ')

    if data_size_bytes < MAX_AGG_DATA_SIZE:
        agg_data_key = create_agg_data_key(internal_storage.prefix, executor_id, job_id)
        job_description['data_key'] = agg_data_key
        agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
        job_description['data_ranges'] = agg_data_ranges
        agg_upload_time = time.time()
        internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded '
                   'maximum size of {} bytes'.format(executor_id, job_id, MAX_AGG_DATA_SIZE))
        raise Exception(log_msg)

    module_data = create_module_data(mod_paths)
    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(internal_storage.prefix, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        func_and_data_size = utils.sizeof_fmt(host_job_meta['func_module_bytes'] +
                                              host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    job_description['host_job_meta'] = host_job_meta

    return job_description
def run(self):
    """
    Runs the function
    """
    # self.stats.write('jobrunner_start', time.time())
    logger.info("Started")
    result = None
    exception = False
    try:
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()

        if strtobool(os.environ.get('__PW_REDUCE_JOB', 'False')):
            self._wait_futures(data)
        elif is_object_processing_function(function):
            self._load_object(data)

        self._fill_optional_args(function, data)

        logger.info("Going to execute '{}()'".format(str(function.__name__)))
        print('---------------------- FUNCTION LOG ----------------------', flush=True)
        func_exec_time_t1 = time.time()
        result = function(**data)
        func_exec_time_t2 = time.time()
        print('----------------------------------------------------------', flush=True)
        logger.info("Success function execution")

        self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))

        # Check for new futures
        if result is not None:
            self.stats.write("result", True)
            if isinstance(result, ResponseFuture) or \
               (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)):
                self.stats.write('new_futures', True)

            logger.debug("Pickling result")
            output_dict = {'result': result}
            pickled_output = pickle.dumps(output_dict)
        else:
            logger.debug("No result to store")
            self.stats.write("result", False)

    except Exception:
        exception = True
        self.stats.write("exception", True)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print('----------------------- EXCEPTION !-----------------------', flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------', flush=True)

        try:
            logger.debug("Pickling exception")
            pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
            pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
            self.stats.write("exc_info", str(pickled_exc))
        except Exception as pickle_exception:
            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. As a result, we actually wrap this in a try/catch block
            # and more-carefully handle the exceptions if any part of this save / test-reload
            # fails
            self.stats.write("exc_pickle_fail", True)
            pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                        'exc_value': str(exc_value),
                                        'exc_traceback': exc_traceback,
                                        'pickle_exception': pickle_exception})
            pickle.loads(pickled_exc)  # this is just to make sure it can be unpickled
            self.stats.write("exc_info", str(pickled_exc))
    finally:
        store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
        if result is not None and store_result and not exception:
            output_upload_timestamp_t1 = time.time()
            logger.info("Storing function result - Size: {}".format(sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_timestamp_t2 = time.time()
            self.stats.write("output_upload_time",
                             round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))

        self.jobrunner_conn.send("Finished")
        logger.info("Finished")
def function_handler(event):
    start_time = time.time()
    logger.debug("Action handler started")
    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = extract_storage_config(config)

    log_level = event['log_level']
    cloud_logging_config(log_level)

    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, job_id, call_id))
    task_execution_timeout = event.get("task_execution_timeout", 590)  # default for CF
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['job_id'] = job_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {'CB_CONFIG': json.dumps(config),
                      'CB_CALL_ID': call_id,
                      'CB_JOB_ID': job_id,
                      'CB_EXECUTOR_ID': executor_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
                      'PYTHONUNBUFFERED': 'True'}
        os.environ.update(custom_env)
        os.environ.update(extra_env)

        # pass a full json blob
        jobrunner_config = {'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        tr = JobRunner(jobrunner_config, result_queue)
        tr.daemon = True
        logger.info("Starting JobRunner process")
        tr.start()
        tr.join(task_execution_timeout)
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if tr.is_alive():
            # If the process is still alive after tr.join(task_execution_timeout), kill it
            logger.error("Process exceeded maximum runtime of {} seconds".format(task_execution_timeout))
            # Send the signal to all the process groups
            tr.terminate()
            raise Exception("OUTATIME", "Process executed for too long and was killed")

        try:
            # Only one message is returned by the jobrunner
            result_queue.get(block=False)
        except Exception:
            # If there is no message, the process was killed due to an exception pickling an exception
            raise Exception("EXCPICKLEERROR",
                            "PyWren was unable to pickle the exception, check function logs")

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key == 'exception' or key == 'exc_pickle_fail' or key == 'result':
                        response_status[key] = eval(value)

        # response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        rabbit_amqp_url = config['rabbitmq'].get('amqp_url')
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        if rabbit_amqp_url and store_status:
            status_sent = False
            output_query_count = 0
            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                try:
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=executor_id, auto_delete=True)
                    channel.basic_publish(exchange='', routing_key=executor_id,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info("Execution stats sent to rabbitmq - Size: {}".format(drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send stats to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
def _create_job(config, internal_storage, executor_id, job_id, func, data, runtime_meta,
                runtime_memory=None, extra_env=None, invoke_pool_threads=128,
                include_modules=[], exclude_modules=[], execution_timeout=None,
                job_created_timestamp=None):
    """
    :param func: the function to map over the data
    :param data: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: Explicitly include these modules in the pickled dependencies.
    :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
    :return: A dictionary with the job description.
    :rtype: dict
    """
    log_level = os.getenv('PYWREN_LOGLEVEL')
    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = config['pywren']['runtime_timeout'] - 5

    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['function_name'] = func.__name__
    job_description['extra_env'] = ext_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['executor_id'] = executor_id
    job_description['job_id'] = job_id

    exclude_modules_cfg = config['pywren'].get('exclude_modules', [])
    include_modules_cfg = config['pywren'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    host_job_meta = {'job_created_timestamp': job_created_timestamp}

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules, exc_modules)

    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['pywren']:
        data_limit = config['pywren']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024 ** 2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, sizeof_fmt(data_limit * 1024 ** 2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    print(log_msg) if not log_level else logger.info(log_msg)

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    data_upload_time = time.time()
    internal_storage.put_data(data_key, data_bytes)
    host_job_meta['data_upload_time'] = time.time() - data_upload_time
    host_job_meta['data_upload_timestamp'] = time.time()

    # Upload function and modules
    func_upload_time = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    job_description['metadata'] = host_job_meta

    return job_description
def function_handler(event):
    start_tstamp = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)
    os.environ.update({'PYWREN_FUNCTION': 'True', 'PYTHONUNBUFFERED': 'True'})

    config = event['config']
    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    exec_id = "{}/{}/{}".format(executor_id, job_id, call_id)
    logger.info("Execution-ID: {}".format(exec_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']
    logger.debug("Runtime name: {}".format(runtime_name))
    logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['start_tstamp'] = start_tstamp
    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__PW_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if version.__version__ != event['pywren_version']:
            msg = ("PyWren version mismatch. Host version: {} - Runtime version: {}"
                   .format(event['pywren_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {'PYWREN_CONFIG': json.dumps(config),
                      'PYWREN_EXECUTION_ID': exec_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH)}
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(STORAGE_FOLDER, storage_config['bucket'],
                                           JOBS_PREFIX, executor_id, job_id, call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir, 'jobrunner.stats.txt')

        jobrunner_config = {'pywren_config': config,
                            'call_id': call_id,
                            'job_id': job_id,
                            'executor_id': executor_id,
                            'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'output_key': create_output_key(JOBS_PREFIX, executor_id, job_id, call_id),
                            'stats_filename': jobrunner_stats_filename}

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker, args=(mm_conn,))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(os.environ.get('__PW_LOCAL_EXECUTION', 'False'))
        jrp = Thread(target=jobrunner.run) if local_execution else Process(target=jobrunner.run)
        jrp.start()
        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after jrp.join(execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # a thread does not have a terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error('No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # In 99% of cases the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in ['exception', 'exc_pickle_fail', 'result', 'new_futures']:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------', flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------', flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['end_tstamp'] = time.time()
        call_status.send('__end__')

        for key in extra_env:
            os.environ.pop(key)

        logger.info("Finished")