def put_object(self, bucket_name, key, data):
    """
    Upload an object to COS, replacing whatever is already stored under the key.

    A request that ends in a read timeout is issued again; once
    OBJ_REQ_RETRIES retries have been spent, the timeout error is raised.

    :param bucket_name: bucket name, passed to the client as ``Bucket``.
    :param key: key of the object.
    :param data: data of the object
    :type data: str/bytes
    :return: None
    :raises StorageNoSuchKeyError: when the client error code is "NoSuchKey".
    """
    outcome = None
    timeouts_seen = 0

    while outcome is None:
        try:
            reply = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
            http_code = reply['ResponseMetadata']['HTTPStatusCode']
            if http_code == 200:
                outcome = 'OK'
            else:
                outcome = 'Error'

            try:
                size_text = sizeof_fmt(len(data))
                logger.debug('PUT Object {} - Size: {} - {}'.format(key, size_text, outcome))
            except Exception:
                # Fall back to a message without the size if the detailed one fails.
                logger.debug('PUT Object {} {}'.format(key, outcome))
        except ibm_botocore.exceptions.ClientError as client_err:
            if client_err.response['Error']['Code'] != "NoSuchKey":
                raise client_err
            raise StorageNoSuchKeyError(bucket_name, key)
        except ibm_botocore.exceptions.ReadTimeoutError as timeout_err:
            # Give up only after the configured number of retries was used.
            if timeouts_seen == OBJ_REQ_RETRIES:
                raise timeout_err
            logger.debug('PUT Object timeout. Retrying request')
            timeouts_seen += 1
def _send_status_os(self):
    """
    Send the status event to the Object Storage.

    An '__init__' event stores an empty object under the init key; an
    '__end__' event stores the JSON-serialized response under the status key.
    Any other event type stores nothing.
    """
    response = self.response
    key_parts = (response['executor_id'], response['job_id'], response['call_id'])
    event_type = response['type']

    if event_type == '__init__':
        # The init marker carries no payload.
        marker_key = create_init_key(JOBS_PREFIX, *key_parts)
        self.internal_storage.put_data(marker_key, '')
        return

    if event_type != '__end__':
        return

    final_key = create_status_key(JOBS_PREFIX, *key_parts)
    payload = json.dumps(response)
    payload_size = sizeof_fmt(len(payload))
    logger.info(
        "Storing execution stats - status.json - Size: {}".format(payload_size))
    self.internal_storage.put_data(final_key, payload)
def _send_status_rabbitmq(self):
    """
    Send the status event to RabbitMQ.

    ``self.response`` is serialized to JSON and published to the fanout
    exchange named ``pywren-<executor_id>-<job_id>``. Up to 5 attempts are
    made, waiting 0.2 seconds between them. Errors raised while connecting or
    publishing are logged, not raised; if every attempt fails the method
    returns without having sent the status.

    :return: None
    """
    max_attempts = 5
    retry_delay = 0.2  # seconds to wait before the next attempt

    dmpd_response_status = json.dumps(self.response)
    drs = sizeof_fmt(len(dmpd_response_status))
    executor_id = self.response['executor_id']
    job_id = self.response['job_id']

    rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
    params = pika.URLParameters(rabbit_amqp_url)
    exchange = 'pywren-{}-{}'.format(executor_id, job_id)

    for attempt in range(1, max_attempts + 1):
        connection = None
        try:
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.exchange_declare(exchange=exchange, exchange_type='fanout',
                                     auto_delete=True)
            channel.basic_publish(exchange=exchange, routing_key='',
                                  body=dmpd_response_status)
            connection.close()
            logger.info(
                "Execution status sent to rabbitmq - Size: {}".format(drs))
            return
        except Exception as e:
            logger.error("Unable to send status to rabbitmq")
            logger.error(str(e))

            # A failure after the connection was opened would otherwise leave
            # it dangling; one leaked connection per retry.
            try:
                if connection is not None and connection.is_open:
                    connection.close()
            except Exception as close_error:
                logger.debug("Unable to close rabbitmq connection: {}".format(close_error))

            # Only announce a retry (and wait) when another attempt follows.
            if attempt < max_attempts:
                logger.info('Retrying to send status to rabbitmq...')
                time.sleep(retry_delay)
def _create_job(config, internal_storage, executor_id, job_id, func, data, runtime_meta,
                runtime_memory=None, extra_env=None, invoke_pool_threads=128,
                include_modules=[], exclude_modules=[], original_func_name=None,
                execution_timeout=EXECUTION_TIMEOUT):
    """
    Serialize a function together with its input data, upload both through
    ``internal_storage`` and build the description of the resulting job.

    :param config: configuration dict; its 'pywren' section provides 'runtime',
                   'runtime_memory' and the optional 'include_modules' /
                   'exclude_modules' entries.
    :param internal_storage: storage object used via ``put_data`` and ``put_func``.
    :param executor_id: executor identifier, used in keys and log messages.
    :param job_id: job identifier, used in keys and log messages.
    :param func: the function to serialize.
    :param data: list of input data items, serialized along with ``func``.
    :param runtime_meta: runtime metadata; only its 'preinstalls' entry is read here.
    :param runtime_memory: runtime memory; taken from the config when None.
    :param extra_env: Additional environment variables. Default None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: modules added to the ones listed in the config;
                            passing None makes the include set None.
    :param exclude_modules: modules added to the exclude set from the config.
    :param original_func_name: name recorded for the function instead of
                               ``func.__name__`` when given.
    :param execution_timeout: value stored as 'execution_timeout' in the job description.
    :return: ``[]`` when ``data`` is empty, otherwise the job description dict.
    :raises Exception: when the serialized data is not smaller than MAX_AGG_DATA_SIZE.
    """
    log_level = os.getenv('PYWREN_LOGLEVEL')
    pywren_cfg = config['pywren']

    runtime_name = pywren_cfg['runtime']
    if runtime_memory is None:
        runtime_memory = pywren_cfg['runtime_memory']

    func_name = original_func_name or func.__name__

    if extra_env is None:
        extra_env = {}
    if extra_env:
        extra_env = utils.convert_bools_to_string(extra_env)
        logger.debug("Extra environment vars {}".format(extra_env))

    # Nothing to run: callers get an empty list instead of a job description.
    if not data:
        return []

    job_description = {
        'runtime_name': runtime_name,
        'runtime_memory': int(runtime_memory),
        'execution_timeout': execution_timeout,
        'func_name': func_name,
        'extra_env': extra_env,
        'total_calls': len(data),
        'invoke_pool_threads': invoke_pool_threads,
        'executor_id': executor_id,
        'job_id': job_id,
    }

    exclude_modules_cfg = pywren_cfg.get('exclude_modules', [])
    include_modules_cfg = pywren_cfg.get('include_modules', [])

    # Excluded modules: union of the config entry and the argument.
    exc_modules = set()
    for excluded in (exclude_modules_cfg, exclude_modules):
        if excluded:
            exc_modules.update(excluded)

    # Included modules: union of config and argument, collapsing to None when
    # the argument is None or when the config entry is None and the argument
    # contributes nothing.
    inc_modules = set()
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules:
        inc_modules.update(include_modules)
    if include_modules is None or (include_modules_cfg is None and not include_modules):
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))

    # pickle func and all data (to capture module dependencies)
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    serialized, mod_paths = serializer([func] + data, inc_modules, exc_modules)
    func_str = serialized[0]
    data_strs = serialized[1:]
    data_size_bytes = sum(map(len, data_strs))

    host_job_meta = {'agg_data': False, 'data_size_bytes': data_size_bytes}

    log_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(
        executor_id, job_id)
    logger.info(log_msg)
    if not log_level:
        # Left without newline; the total size is appended once uploads finish.
        print(log_msg, end=' ')

    if data_size_bytes >= MAX_AGG_DATA_SIZE:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded '
                   'maximum size of {} bytes'.format(executor_id, job_id,
                                                     MAX_AGG_DATA_SIZE))
        raise Exception(log_msg)

    # Upload all the data items as a single aggregated object.
    agg_data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = agg_data_key
    agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
    job_description['data_ranges'] = agg_data_ranges
    data_upload_start = time.time()
    internal_storage.put_data(agg_data_key, agg_data_bytes)
    host_job_meta['agg_data'] = True
    host_job_meta['data_upload_time'] = time.time() - data_upload_start
    host_job_meta['data_upload_timestamp'] = time.time()

    module_data = create_module_data(mod_paths)

    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps(
        {'func': func_str, 'module_data': module_data}, -1)
    func_module_bytes = len(func_module_str)
    host_job_meta['func_module_bytes'] = func_module_bytes

    func_upload_start = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_start
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        total_size = utils.sizeof_fmt(func_module_bytes + data_size_bytes)
        log_msg = '- Total: {}'.format(total_size)
        print(log_msg)

    job_description['metadata'] = host_job_meta
    return job_description
def run(self):
    """
    Runs the function.

    Loads the function, its modules and its input data, calls the function
    with the data as keyword arguments and records statistics through
    ``self.stats``. A non-None result is pickled and, unless the STORE_RESULT
    environment variable disables it, stored under ``self.output_key``.
    An exception raised along the way is printed, pickled and written to the
    stats as "exc_info". "Finished" is always sent through
    ``self.jobrunner_conn`` at the end.
    """
    logger.info("Started")
    result = None
    exception = False

    def _log_memory(template):
        # Memory reporting is only done when self.show_memory is set.
        if self.show_memory:
            logger.debug(template.format(get_current_memory_usage()))

    try:
        self.internal_storage = InternalStorage(self.storage_config)
        output_prefix = self.output_key.rsplit('/', 1)[0]
        self.internal_storage.tmp_obj_prefix = output_prefix

        func_bundle = self._get_function_and_modules()
        self._save_modules(func_bundle['module_data'])
        function = self._unpickle_function(func_bundle['func'])
        data = self._load_data()
        logger.info("data_obj {}".format(data))

        if is_object_processing_function(function):
            self._create_data_stream(data)

        self._fill_optional_args(function, data)

        _log_memory("Memory usage before call the function: {}")

        logger.info("Going to execute '{}()'".format(str(function.__name__)))
        print('---------------------- FUNCTION LOG ----------------------', flush=True)
        call_started = time.time()
        result = function(**data)
        call_finished = time.time()
        print('----------------------------------------------------------', flush=True)
        logger.info("Success function execution")

        _log_memory("Memory usage after call the function: {}")

        elapsed = round(call_finished - call_started, 8)
        self.stats.write('function_exec_time', elapsed)

        if result is None:
            logger.debug("No result to store")
            self.stats.write("result", False)
        else:
            self.stats.write("result", True)

            # Check for new futures
            returns_futures = isinstance(result, ResponseFuture) or (
                type(result) == list
                and len(result) > 0
                and isinstance(result[0], ResponseFuture))
            if returns_futures:
                self.stats.write('new_futures', True)

            logger.debug("Pickling result")
            pickled_output = pickle.dumps({'result': result})

            _log_memory("Memory usage after output serialization: {}")

    except Exception:
        exception = True
        self.stats.write("exception", True)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print('----------------------- EXCEPTION !-----------------------', flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------', flush=True)

        _log_memory("Memory usage after call the function: {}")

        try:
            logger.debug("Pickling exception")
            pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
            # Round trip only to make sure the payload can be unpickled.
            pickle.loads(pickled_exc)
            self.stats.write("exc_info", str(pickled_exc))
        except Exception as pickle_exception:
            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. So the save / test-reload above is guarded
            # and, if any part of it fails, a more careful representation
            # of the exception is stored instead.
            self.stats.write("exc_pickle_fail", True)
            fallback_info = {
                'exc_type': str(exc_type),
                'exc_value': str(exc_value),
                'exc_traceback': exc_traceback,
                'pickle_exception': pickle_exception,
            }
            pickled_exc = pickle.dumps(fallback_info)
            # Round trip only to make sure the payload can be unpickled.
            pickle.loads(pickled_exc)
            self.stats.write("exc_info", str(pickled_exc))
    finally:
        store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
        if result is not None and store_result and not exception:
            upload_started = time.time()
            output_size = sizeof_fmt(len(pickled_output))
            logger.info(
                "Storing function result - output.pickle - Size: {}".format(output_size))
            self.internal_storage.put_data(self.output_key, pickled_output)
            upload_finished = time.time()
            self.stats.write("output_upload_time",
                             round(upload_finished - upload_started, 8))
        self.jobrunner_conn.send("Finished")
        logger.info("Finished")
def function_handler(event):
    """
    Run one function call inside the runtime and report its status.

    The handler starts a ``jobrunner`` process configured from the event,
    waits for it at most ``job_max_runtime`` seconds, merges the statistics
    the process wrote to JOBRUNNER_STATS_FILENAME into the status dict and
    finally publishes that status (to RabbitMQ when an 'amqp_url' is
    configured, and to the internal storage) unless the STORE_STATUS
    environment variable disables it.

    :param event: dict with the call description. Keys read here:
                  'host_submit_time', 'config', 'log_level', 'call_id',
                  'callgroup_id', 'executor_id', 'status_key', 'func_key',
                  'data_key', 'data_byte_range', 'output_key',
                  'pywren_version' and the optional 'job_max_runtime'
                  (default 590) and 'extra_env' (default {}).
    :return: None. Exceptions raised while running the job are recorded in the
             status ('exception' and 'exc_info') instead of being propagated.
    """
    # Local import: only needed to parse the flags of the jobrunner stats file.
    import ast

    start_time = time.time()
    logger.debug("Action handler started")
    response_status = {'exception': False}
    response_status['host_submit_time'] = event['host_submit_time']
    response_status['start_time'] = start_time

    context_dict = {
        'ibm_cf_request_id': os.environ.get("__OW_ACTIVATION_ID"),
        'ibm_cf_python_version': os.environ.get("PYTHON_VERSION"),
    }

    config = event['config']
    storage_config = wrenconfig.extract_storage_config(config)

    log_level = event['log_level']
    ibm_cf_logging_config(log_level)

    call_id = event['call_id']
    callgroup_id = event['callgroup_id']
    executor_id = event['executor_id']
    logger.info("Execution ID: {}/{}/{}".format(executor_id, callgroup_id, call_id))
    job_max_runtime = event.get("job_max_runtime", 590)  # default for CF
    status_key = event['status_key']
    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']
    output_key = event['output_key']
    extra_env = event.get('extra_env', {})

    response_status['call_id'] = call_id
    response_status['callgroup_id'] = callgroup_id
    response_status['executor_id'] = executor_id

    try:
        if version.__version__ != event['pywren_version']:
            raise Exception("WRONGVERSION", "PyWren version mismatch",
                            version.__version__, event['pywren_version'])

        custom_env = {'PYWREN_CONFIG': json.dumps(config),
                      'PYWREN_EXECUTOR_ID': executor_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), PYWREN_LIBS_PATH),
                      'PYTHONUNBUFFERED': 'True'}

        os.environ.update(custom_env)
        os.environ.update(extra_env)

        # pass a full json blob
        jobrunner_config = {'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'python_module_path': PYTHON_MODULE_PATH,
                            'output_key': output_key,
                            'stats_filename': JOBRUNNER_STATS_FILENAME}

        # Drop stats left over from a previous call so they are not merged below.
        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            os.remove(JOBRUNNER_STATS_FILENAME)

        setup_time = time.time()
        response_status['setup_time'] = round(setup_time - start_time, 8)

        result_queue = multiprocessing.Queue()
        jr = jobrunner(jobrunner_config, result_queue)
        jr.daemon = True
        logger.info("Starting jobrunner process")
        jr.start()
        jr.join(job_max_runtime)
        response_status['exec_time'] = round(time.time() - setup_time, 8)

        if jr.is_alive():
            # If process is still alive after jr.join(job_max_runtime), kill it
            logger.error("Process exceeded maximum runtime of {} seconds".format(job_max_runtime))
            # Send the signal to all the process groups
            jr.terminate()
            raise Exception("OUTATIME", "Process executed for too long and was killed")

        try:
            # Only 1 message is returned by jobrunner
            result_queue.get(block=False)
        except Exception:
            # If no message, this means that the process was killed due an exception pickling an exception
            raise Exception("EXCPICKLEERROR", "PyWren was unable to pickle the exception, check function logs")

        if os.path.exists(JOBRUNNER_STATS_FILENAME):
            with open(JOBRUNNER_STATS_FILENAME, 'r') as fid:
                for line in fid:
                    key, value = line.strip().split(" ", 1)
                    try:
                        response_status[key] = float(value)
                    except Exception:
                        response_status[key] = value
                    if key in ('exception', 'exc_pickle_fail', 'result'):
                        # SECURITY: this used to be eval(value). The stats file
                        # is written by the process that runs user code, so its
                        # content must never be executed. These flags are
                        # presumably the literals 'True'/'False', which
                        # literal_eval parses without evaluating code.
                        response_status[key] = ast.literal_eval(value)

        response_status.update(context_dict)
        response_status['end_time'] = time.time()

    except Exception as e:
        # internal runtime exceptions
        logger.error("There was an exception: {}".format(str(e)))
        response_status['end_time'] = time.time()
        response_status['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        response_status['exc_info'] = str(pickled_exc)

    finally:
        store_status = strtobool(os.environ.get('STORE_STATUS', 'True'))
        # A missing or empty 'rabbitmq' section must not raise here: that would
        # abort the finally block before the status is stored.
        rabbit_amqp_url = (config.get('rabbitmq') or {}).get('amqp_url')
        dmpd_response_status = json.dumps(response_status)
        drs = sizeof_fmt(len(dmpd_response_status))

        if rabbit_amqp_url and store_status:
            status_sent = False
            output_query_count = 0
            while not status_sent and output_query_count < 5:
                output_query_count = output_query_count + 1
                connection = None
                try:
                    params = pika.URLParameters(rabbit_amqp_url)
                    connection = pika.BlockingConnection(params)
                    channel = connection.channel()
                    channel.queue_declare(queue=executor_id, auto_delete=True)
                    channel.basic_publish(exchange='', routing_key=executor_id,
                                          body=dmpd_response_status)
                    connection.close()
                    logger.info("Execution stats sent to rabbitmq - Size: {}".format(drs))
                    status_sent = True
                except Exception as e:
                    logger.error("Unable to send status to rabbitmq")
                    logger.error(str(e))
                    # A failure after the connection was opened would otherwise
                    # leave it dangling; one leaked connection per retry.
                    try:
                        if connection is not None and connection.is_open:
                            connection.close()
                    except Exception as close_error:
                        logger.debug("Unable to close rabbitmq connection: {}".format(close_error))
                    logger.info('Retrying to send stats to rabbitmq...')
                    time.sleep(0.2)

        if store_status:
            internal_storage = InternalStorage(storage_config)
            logger.info("Storing execution stats - status.json - Size: {}".format(drs))
            internal_storage.put_data(status_key, dmpd_response_status)