class JobRunner:

    def __init__(self, jr_config, jobrunner_conn):
        start_time = time.time()
        self.jr_config = jr_config
        self.jobrunner_conn = jobrunner_conn

        log_level = self.jr_config['log_level']
        cloud_logging_config(log_level)
        self.pywren_config = self.jr_config['pywren_config']
        self.storage_config = extract_storage_config(self.pywren_config)

        self.call_id = self.jr_config['call_id']
        self.job_id = self.jr_config['job_id']
        self.executor_id = self.jr_config['executor_id']
        self.func_key = self.jr_config['func_key']
        self.data_key = self.jr_config['data_key']
        self.data_byte_range = self.jr_config['data_byte_range']
        self.output_key = self.jr_config['output_key']

        self.stats = stats(self.jr_config['stats_filename'])
        self.stats.write('jobrunner_start', start_time)

        self.show_memory = strtobool(os.environ.get('SHOW_MEMORY_USAGE', 'False'))

    def _get_function_and_modules(self):
        """
        Gets and unpickles function and modules from storage
        """
        logger.debug("Getting function and modules")
        func_download_time_t1 = time.time()
        func_obj = self.internal_storage.get_func(self.func_key)
        loaded_func_all = pickle.loads(func_obj)
        func_download_time_t2 = time.time()
        self.stats.write('func_download_time',
                         round(func_download_time_t2 - func_download_time_t1, 8))
        logger.debug("Finished getting Function and modules")

        return loaded_func_all

    def _save_modules(self, module_data):
        """
        Save modules, before we unpickle the actual function
        """
        if module_data:
            logger.debug("Writing Function dependencies to local disk")
            module_path = os.path.join(PYTHON_MODULE_PATH, self.executor_id,
                                       self.job_id, self.call_id)
            # shutil.rmtree(PYTHON_MODULE_PATH, True)  # delete old modules
            os.makedirs(module_path, exist_ok=True)
            sys.path.append(module_path)

            for m_filename, m_data in module_data.items():
                m_path = os.path.dirname(m_filename)

                if len(m_path) > 0 and m_path[0] == "/":
                    m_path = m_path[1:]
                to_make = os.path.join(module_path, m_path)
                try:
                    os.makedirs(to_make)
                except OSError as e:
                    if e.errno == 17:
                        pass
                    else:
                        raise e
                full_filename = os.path.join(to_make, os.path.basename(m_filename))

                with open(full_filename, 'wb') as fid:
                    fid.write(b64str_to_bytes(m_data))

            # logger.info("Finished writing {} module files".format(len(module_data)))
            # logger.debug(subprocess.check_output("find {}".format(module_path), shell=True))
            # logger.debug(subprocess.check_output("find {}".format(os.getcwd()), shell=True))
            logger.debug("Finished writing Function dependencies")

    def _unpickle_function(self, pickled_func):
        """
        Unpickle function; it will expect modules to be there
        """
        logger.debug("Unpickle Function")
        loaded_func = pickle.loads(pickled_func)
        logger.debug("Finished Function unpickle")

        return loaded_func

    def _load_data(self):
        extra_get_args = {}
        if self.data_byte_range is not None:
            range_str = 'bytes={}-{}'.format(*self.data_byte_range)
            extra_get_args['Range'] = range_str

        logger.debug("Getting function data")
        data_download_time_t1 = time.time()
        data_obj = self.internal_storage.get_data(self.data_key,
                                                  extra_get_args=extra_get_args)
        logger.debug("Finished getting Function data")
        logger.debug("Unpickle Function data")
        loaded_data = pickle.loads(data_obj)
        logger.debug("Finished unpickle Function data")
        data_download_time_t2 = time.time()
        self.stats.write('data_download_time',
                         round(data_download_time_t2 - data_download_time_t1, 8))

        return loaded_data

    def _fill_optional_args(self, function, data):
        """
        Fills in the reserved, optional parameters that may be declared
        in the function signature
        """
        func_sig = inspect.signature(function)

        if 'ibm_cos' in func_sig.parameters:
            if 'ibm_cos' in self.pywren_config:
                try:
                    ibm_boto3_client = Storage(self.pywren_config, 'ibm_cos').get_client()
                    data['ibm_cos'] = ibm_boto3_client
                except Exception as e:
                    logger.error('Cannot create the ibm_cos connection: {}'.format(str(e)))
                    data['ibm_cos'] = None
            else:
                logger.error('Cannot create the ibm_cos connection: Configuration not provided')
                data['ibm_cos'] = None

        if 'internal_storage' in func_sig.parameters:
            data['internal_storage'] = self.internal_storage

        if 'rabbitmq' in func_sig.parameters:
            if 'rabbitmq' in self.pywren_config:
                try:
                    rabbit_amqp_url = self.pywren_config['rabbitmq'].get('amqp_url')
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    data['rabbitmq'] = connection
                except Exception as e:
                    logger.error('Cannot create the rabbitmq connection: {}'.format(str(e)))
                    data['rabbitmq'] = None
            else:
                logger.error('Cannot create the rabbitmq connection: Configuration not provided')
                data['rabbitmq'] = None

        if 'id' in func_sig.parameters:
            data['id'] = int(self.call_id)

    def _create_data_stream(self, data):
        """
        Creates the data stream in case of object processing
        """
        extra_get_args = {}

        if 'url' in data:
            url = data['url']
            logger.info('Getting dataset from {}'.format(url.path))
            if url.data_byte_range is not None:
                range_str = 'bytes={}-{}'.format(*url.data_byte_range)
                extra_get_args['Range'] = range_str
                logger.info('Chunk: {} - Range: {}'.format(url.part, extra_get_args['Range']))
            resp = requests.get(url.path, headers=extra_get_args, stream=True)
            url.data_stream = resp.raw

        if 'obj' in data:
            obj = data['obj']
            storage_handler = Storage(self.pywren_config, obj.storage_backend).get_storage_handler()
            logger.info('Getting dataset from {}://{}/{}'.format(obj.storage_backend, obj.bucket, obj.key))
            # logger.info("ob.limit_X_values {}".format(obj.limit_X_values))

            if obj.data_byte_range is not None:
                extra_get_args['Range'] = 'bytes={}-{}'.format(*obj.data_byte_range)
                logger.info('Chunk: {} - Range: {}'.format(obj.part, extra_get_args['Range']))
                sb = storage_handler.get_object(obj.bucket, obj.key,
                                                stream=True,
                                                extra_get_args=extra_get_args)
                obj.data_stream = WrappedStreamingBodyPartition(sb, obj.chunk_size, obj.data_byte_range)
            elif obj.data_byte_range is None and (obj.limit_X_values is None or obj.limit_Y_values is None):
                obj.data_stream = storage_handler.get_object(obj.bucket, obj.key, stream=True)
            else:
                obj.data_stream = storage_handler.get_object(obj.bucket, obj.key, stream=True)
                obj.data_stream = file_part(obj.data_stream, obj)  # obj.bord_meta,

    def run(self):
        """
        Runs the function
        """
        logger.info("Started")
        result = None
        exception = False
        try:
            self.internal_storage = InternalStorage(self.storage_config)
            self.internal_storage.tmp_obj_prefix = self.output_key.rsplit('/', 1)[0]
            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()
            logger.info("data_obj {}".format(data))

            if is_object_processing_function(function):
                self._create_data_stream(data)

            self._fill_optional_args(function, data)

            if self.show_memory:
                logger.debug("Memory usage before calling the function: {}".format(get_current_memory_usage()))

            logger.info("Going to execute '{}()'".format(str(function.__name__)))
            print('---------------------- FUNCTION LOG ----------------------', flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------------', flush=True)
            logger.info("Successful function execution")

            if self.show_memory:
                logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

            self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))

            # Check for new futures
            if result is not None:
                self.stats.write("result", True)
                if isinstance(result, ResponseFuture) or \
                   (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)):
                    self.stats.write('new_futures', True)

                logger.debug("Pickling result")
                output_dict = {'result': result}
                pickled_output = pickle.dumps(output_dict)

                if self.show_memory:
                    logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))
            else:
                logger.debug("No result to store")
                self.stats.write("result", False)

        except Exception:
            exception = True
            self.stats.write("exception", True)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('----------------------- EXCEPTION !-----------------------', flush=True)
            traceback.print_exc(file=sys.stdout)
            print('----------------------------------------------------------', flush=True)

            if self.show_memory:
                logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

            try:
                logger.debug("Pickling exception")
                pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))

            except Exception as pickle_exception:
                # Shockingly often, modules like subprocess don't properly
                # call the base Exception.__init__, which results in them
                # being unpickleable. As a result, we actually wrap this in a
                # try/catch block and more-carefully handle the exceptions if
                # any part of this save / test-reload fails
                self.stats.write("exc_pickle_fail", True)
                pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                            'exc_value': str(exc_value),
                                            'exc_traceback': exc_traceback,
                                            'pickle_exception': pickle_exception})
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
            if result is not None and store_result and not exception:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing function result - output.pickle - Size: {}".format(sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write("output_upload_time",
                                 round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
            self.jobrunner_conn.send("Finished")
            logger.info("Finished")
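
# Illustrative sketch (not part of the original module): JobRunner takes one
# end of a multiprocessing Pipe and sends "Finished" back through it at the
# end of run(). A hypothetical parent-side driver could look like the
# following; the jr_config keys are those read in JobRunner.__init__ above,
# and everything else here is a placeholder, not the project's real handler.
def _example_drive_jobrunner(jr_config):
    """Hedged sketch: run JobRunner in a child process and wait for it."""
    import multiprocessing

    parent_conn, child_conn = multiprocessing.Pipe()
    runner = JobRunner(jr_config, child_conn)

    child = multiprocessing.Process(target=runner.run)
    child.start()
    child.join()

    if parent_conn.poll():
        # run() sends "Finished" through jobrunner_conn in its finally block
        return parent_conn.recv()
    return None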
def function_handler(event):
    start_time = time.time()
    logger.debug("Action handler started")
    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = wrenconfig.extract_storage_config(config)

    log_level = event['log_level']
    ibm_cf_logging_config(log_level)

    call_id = event['call_id']
    callgroup_id = event['callgroup_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, callgroup_id, call_id))
    job_max_runtime = event.get("job_max_runtime", 590)  # default for CF
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['callgroup_id'] = callgroup_id
    response_status['executor_id'] = executor_id
    # response_status['func_key'] = func_key
    # response_status['data_key'] = data_key
    # response_status['output_key'] = output_key
    # response_status['status_key'] = status_key

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        # response_status['free_disk_bytes'] = free_disk_space("/tmp")

        custom_env = {'PYWREN_CONFIG': json.dumps(config),
                      'PYWREN_EXECUTOR_ID': executor_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
                      'PYTHONUNBUFFERED': 'True'}
        os.environ.update(custom_env)
        os.environ.update(extra_env)

        # pass a full json blob
        jobrunner_config = {'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        jr = jobrunner(jobrunner_config, result_queue)
        jr.daemon = True
        logger.info("Starting jobrunner process")
        jr.start()
        jr.join(job_max_runtime)
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if jr.is_alive():
            # If the process is still alive after jr.join(job_max_runtime), kill it
            logger.error("Process exceeded maximum runtime of {} seconds".format(job_max_runtime))
            # Send the signal to all the process groups
            jr.terminate()
            raise Exception("OUTATIME", "Process executed for too long and was killed")

        try:
            # Only 1 message is returned by jobrunner
            result_queue.get(block=False)
        except Exception:
            # If there is no message, the process was killed due to an
            # exception raised while pickling an exception
            raise Exception("EXCPICKLEERROR", "PyWren was unable to pickle the exception, check function logs")

        # print(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # print(subprocess.check_output("find {}".format(os.getcwd()), shell=True))

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key == 'exception' or key == 'exc_pickle_fail' \
                       or key == 'result':
                        response_status[key] = eval(value)

        response_status['server_info'] = get_server_info()
        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        rabbit_amqp_url = config.get('rabbitmq', {}).get('amqp_url')
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        if rabbit_amqp_url and store_status:
            status_sent = False
            output_query_count = 0
            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                try:
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=executor_id, auto_delete=True)
                    channel.basic_publish(exchange='', routing_key=executor_id,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info("Execution stats sent to rabbitmq - Size: {}".format(drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    logger.info('Retrying to send stats to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)
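
# Illustrative sketch (not part of the original module): function_handler()
# reads the fields below directly from `event`. Only the key names are taken
# from the handler above; the values here are hypothetical placeholders for a
# single invocation, not a real deployment.
def _example_event():
    """Hedged sketch: a minimal event dict with the keys function_handler() reads."""
    return {
        'config': {},                        # full pywren config blob (wrenconfig format)
        'log_level': 'INFO',
        'pywren_version': version.__version__,
        'call_id': '00000',
        'callgroup_id': '000',
        'executor_id': 'example-executor',
        'host_submit_time': time.time(),
        'status_key': 'status.json',         # placeholder object storage keys
        'func_key': 'func.pickle',
        'data_key': 'data.pickle',
        'data_byte_range': None,
        'output_key': 'output.pickle',
        'extra_env': {},
        'job_max_runtime': 590,
    }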
class CallStatus:

    def __init__(self, pywren_config):
        self.config = pywren_config
        self.rabbitmq_monitor = self.config['pywren'].get('rabbitmq_monitor', False)
        self.store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        storage_config = extract_storage_config(self.config)
        self.internal_storage = InternalStorage(storage_config)

        self.response = {'exception': False}

    def send(self, event_type):
        self.response['type'] = event_type
        if self.store_status:
            if self.rabbitmq_monitor:
                self._send_status_rabbitmq()
            if not self.rabbitmq_monitor or event_type == '__end__':
                self._send_status_os()

    def _send_status_os(self):
        """
        Send the status event to the Object Storage
        """
        executor_id = self.response['executor_id']
        job_id = self.response['job_id']
        call_id = self.response['call_id']

        if self.response['type'] == '__init__':
            init_key = create_init_key(JOBS_PREFIX, executor_id, job_id, call_id)
            self.internal_storage.put_data(init_key, '')

        elif self.response['type'] == '__end__':
            status_key = create_status_key(JOBS_PREFIX, executor_id, job_id, call_id)
            dmpd_response_status = json.dumps(self.response)
            drs = sizeof_fmt(len(dmpd_response_status))
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            self.internal_storage.put_data(status_key, dmpd_response_status)

    def _send_status_rabbitmq(self):
        """
        Send the status event to RabbitMQ
        """
        dmpd_response_status = json.dumps(self.response)
        drs = sizeof_fmt(len(dmpd_response_status))

        executor_id = self.response['executor_id']
        job_id = self.response['job_id']

        rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
        status_sent = False
        output_query_count = 0
        params = pika.URLParameters(rabbit_amqp_url)
        exchange = 'pywren-{}-{}'.format(executor_id, job_id)

        while not status_sent and output_query_count < 5:
            output_query_count = output_query_count + 1
            try:
                connection = pika.BlockingConnection(params)
                channel = connection.channel()
                channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
                channel.basic_publish(exchange=exchange, routing_key='',
                                      body=dmpd_response_status)
                connection.close()
                logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
                status_sent = True
            except Exception as e:
                logger.error("Unable to send status to rabbitmq")
                logger.error(str(e))
                logger.info('Retrying to send status to rabbitmq...')
                time.sleep(0.2)
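
# Illustrative sketch (not part of the original module): CallStatus expects
# the caller to fill executor_id, job_id and call_id (plus any stats) into
# self.response before send() is invoked. A hypothetical usage, assuming a
# pywren_config dict with the keys read in __init__ above:
def _example_report_status(pywren_config):
    """Hedged sketch: emit the __init__ and __end__ status events for one call."""
    call_status = CallStatus(pywren_config)
    call_status.response['executor_id'] = 'example-executor'
    call_status.response['job_id'] = 'A000'
    call_status.response['call_id'] = '00000'

    call_status.send('__init__')   # marks the call as started
    # ... run the job, adding stats to call_status.response ...
    call_status.send('__end__')    # persists status.json to object storage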
class jobrunner(Process):

    def __init__(self, jr_config, result_queue):
        super().__init__()
        start_time = time.time()
        self.config = jr_config
        log_level = self.config['log_level']
        self.result_queue = result_queue
        ibm_cf_logging_config(log_level)
        self.stats = stats(self.config['stats_filename'])
        self.stats.write('jobrunner_start', start_time)
        pw_config = json.loads(os.environ.get('PYWREN_CONFIG'))
        self.storage_config = extract_storage_config(pw_config)

        if 'SHOW_MEMORY_USAGE' in os.environ:
            self.show_memory = eval(os.environ['SHOW_MEMORY_USAGE'])
        else:
            self.show_memory = False

        self.func_key = self.config['func_key']
        self.data_key = self.config['data_key']
        self.data_byte_range = self.config['data_byte_range']
        self.output_key = self.config['output_key']

    def _get_function_and_modules(self):
        """
        Gets and unpickles function and modules from storage
        """
        logger.debug("Getting function and modules")
        func_download_time_t1 = time.time()
        func_obj = self.internal_storage.get_func(self.func_key)
        loaded_func_all = pickle.loads(func_obj)
        func_download_time_t2 = time.time()
        self.stats.write('func_download_time',
                         round(func_download_time_t2 - func_download_time_t1, 8))
        logger.debug("Finished getting Function and modules")

        return loaded_func_all

    def _save_modules(self, module_data):
        """
        Save modules, before we unpickle the actual function
        """
        logger.debug("Writing Function dependencies to local disk")
        PYTHON_MODULE_PATH = self.config['python_module_path']
        shutil.rmtree(PYTHON_MODULE_PATH, True)  # delete old modules
        os.mkdir(PYTHON_MODULE_PATH)
        sys.path.append(PYTHON_MODULE_PATH)

        for m_filename, m_data in module_data.items():
            m_path = os.path.dirname(m_filename)

            if len(m_path) > 0 and m_path[0] == "/":
                m_path = m_path[1:]
            to_make = os.path.join(PYTHON_MODULE_PATH, m_path)
            try:
                os.makedirs(to_make)
            except OSError as e:
                if e.errno == 17:
                    pass
                else:
                    raise e
            full_filename = os.path.join(to_make, os.path.basename(m_filename))

            with open(full_filename, 'wb') as fid:
                fid.write(b64str_to_bytes(m_data))

        # logger.info("Finished writing {} module files".format(len(loaded_func_all['module_data'])))
        # logger.debug(subprocess.check_output("find {}".format(PYTHON_MODULE_PATH), shell=True))
        # logger.debug(subprocess.check_output("find {}".format(os.getcwd()), shell=True))
        logger.debug("Finished writing Function dependencies")

    def _unpickle_function(self, pickled_func):
        """
        Unpickle function; it will expect modules to be there
        """
        logger.debug("Unpickle Function")
        loaded_func = pickle.loads(pickled_func)
        logger.debug("Finished Function unpickle")

        return loaded_func

    def _load_data(self):
        extra_get_args = {}
        if self.data_byte_range is not None:
            range_str = 'bytes={}-{}'.format(*self.data_byte_range)
            extra_get_args['Range'] = range_str

        logger.debug("Getting function data")
        data_download_time_t1 = time.time()
        data_obj = self.internal_storage.get_data(self.data_key, extra_get_args=extra_get_args)
        logger.debug("Finished getting Function data")
        logger.debug("Unpickle Function data")
        loaded_data = pickle.loads(data_obj)
        logger.debug("Finished unpickle Function data")
        data_download_time_t2 = time.time()
        self.stats.write('data_download_time',
                         round(data_download_time_t2 - data_download_time_t1, 8))

        return loaded_data

    def _create_storage_clients(self, function, data):
        # Verify storage parameters - Create clients
        func_sig = inspect.signature(function)

        if 'ibm_cos' in func_sig.parameters:
            ibm_boto3_client = IbmCosStorageBackend(self.storage_config['ibm_cos']).get_client()
            data['ibm_cos'] = ibm_boto3_client

        if 'swift' in func_sig.parameters:
            swift_client = SwiftStorageBackend(self.storage_config['swift'])
            data['swift'] = swift_client

        if 'internal_storage' in func_sig.parameters:
            data['internal_storage'] = self.internal_storage

        return data

    def run(self):
        """
        Runs the function
        """
        logger.info("Started")
        # initial output file in case job fails
        result = None
        exception = False
        try:
            self.internal_storage = InternalStorage(self.storage_config)
            loaded_func_all = self._get_function_and_modules()
            self._save_modules(loaded_func_all['module_data'])
            function = self._unpickle_function(loaded_func_all['func'])
            data = self._load_data()
            data = self._create_storage_clients(function, data)

            if self.show_memory:
                logger.debug("Memory usage before calling the function: {}".format(get_current_memory_usage()))

            logger.info("Function: Going to execute '{}()'".format(str(function.__name__)))
            print('---------------------- FUNCTION LOG ----------------------', flush=True)
            func_exec_time_t1 = time.time()
            result = function(**data)
            func_exec_time_t2 = time.time()
            print('----------------------------------------------------------', flush=True)
            logger.info("Function: Successful execution")

            if self.show_memory:
                logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

            self.stats.write('function_exec_time', round(func_exec_time_t2 - func_exec_time_t1, 8))

            # Check for new futures
            if result is not None:
                self.stats.write("result", True)
                if isinstance(result, ResponseFuture):
                    callgroup_id = result.callgroup_id
                    self.stats.write('new_futures', '{}/{}'.format(callgroup_id, 1))
                elif type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture):
                    callgroup_id = result[0].callgroup_id
                    self.stats.write('new_futures', '{}/{}'.format(callgroup_id, len(result)))
                else:
                    self.stats.write('new_futures', '{}/{}'.format(None, 0))

                logger.debug("Pickling result")
                output_dict = {'result': result}
                pickled_output = pickle.dumps(output_dict)

                if self.show_memory:
                    logger.debug("Memory usage after output serialization: {}".format(get_current_memory_usage()))
            else:
                logger.debug("No result to store")
                self.stats.write("result", False)

        except Exception as e:
            exception = True
            self.stats.write("exception", True)
            print('----------------------- EXCEPTION !-----------------------')
            logger.error("There was an exception: {}".format(str(e)))
            print('----------------------------------------------------------', flush=True)

            if self.show_memory:
                logger.debug("Memory usage after calling the function: {}".format(get_current_memory_usage()))

            try:
                logger.debug("Pickling exception")
                pickled_exc = pickle.dumps(sys.exc_info())
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))

            except Exception as pickle_exception:
                # Shockingly often, modules like subprocess don't properly
                # call the base Exception.__init__, which results in them
                # being unpickleable. As a result, we actually wrap this in a
                # try/catch block and more-carefully handle the exceptions if
                # any part of this save / test-reload fails
                logger.debug("Failed pickling exception: {}".format(str(pickle_exception)))
                self.stats.write("exc_pickle_fail", True)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                pickled_exc = pickle.dumps({'exc_type': str(exc_type),
                                            'exc_value': str(exc_value),
                                            'exc_traceback': exc_traceback,
                                            'pickle_exception': pickle_exception})
                pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
                self.stats.write("exc_info", str(pickled_exc))
        finally:
            store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
            if result is not None and store_result and not exception:
                output_upload_timestamp_t1 = time.time()
                logger.info("Storing function result - output.pickle - Size: {}".format(sizeof_fmt(len(pickled_output))))
                self.internal_storage.put_data(self.output_key, pickled_output)
                output_upload_timestamp_t2 = time.time()
                self.stats.write("output_upload_time",
                                 round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
            self.result_queue.put("Finished")
            logger.info("Finished")
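
# Illustrative sketch (not part of the original module): the `stats` helper
# used by both jobrunner classes is not shown in this file. function_handler()
# parses the stats file as one space-separated "key value" pair per line, so a
# minimal, hypothetical writer compatible with that parser could look like:
class _ExampleStats:
    """Hedged sketch of a stats writer matching the handler's line format."""

    def __init__(self, stats_filename):
        self.stats_filename = stats_filename

    def write(self, key, value):
        # One "key value" pair per line, as read back by function_handler()
        with open(self.stats_filename, 'a') as fid:
            fid.write("{} {}\n".format(key, value))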