def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata, runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, invoke_pool_threads=128):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype: list of futures.
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.executor_id = executor_id
    job.job_id = job_id
    job.extra_env = ext_env
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless']['invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless']['runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['host_job_serialize_time'] = round(time.time() - job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job.data_ranges = data_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()
    host_job_meta['host_data_upload_time'] = round(data_upload_end - data_upload_start, 6)

    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        function_hash = hashlib.md5(open(function_file, 'rb').read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        uuid = f'{function_hash}{mod_hash}'
        func_key = create_func_key(JOBS_PREFIX, uuid, "")
        _store_func_and_modules(func_key, func_str, module_data)
        job.ext_runtime_uuid = uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)
    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job

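
# Illustrative sketch (hypothetical helper, not part of the function above): the
# include/exclude merge in _create_job is easy to misread, so this mirrors only the
# include-side logic and the asserts record the resulting semantics. A None result
# means "let the serializer auto-detect the modules to pickle"; a non-None set
# restricts pickling to exactly those modules.
def _merge_include_modules(include_modules, include_modules_cfg):
    inc_modules = set()
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None
    return inc_modules

assert _merge_include_modules(None, ['pandas']) is None    # caller's None wins: auto-detect
assert _merge_include_modules([], None) is None            # nothing requested: auto-detect
assert _merge_include_modules(['numpy'], ['pandas']) == {'numpy', 'pandas'}
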
def _create_job(config, internal_storage, executor_id, job_id, func, data,
                runtime_meta, runtime_memory=None, extra_env=None,
                invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                execution_timeout=None, host_job_meta=None):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly keep these modules from pickled dependencies.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype: list of futures.
    """
    log_level = logger.getEffectiveLevel() != logging.WARNING

    runtime_name = config['lithops']['runtime']
    if runtime_memory is None:
        runtime_memory = config['lithops']['runtime_memory']

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = config['lithops']['runtime_timeout'] - 5

    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = runtime_memory
    job_description['execution_timeout'] = execution_timeout
    job_description['function_name'] = func.__name__
    job_description['extra_env'] = ext_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['executor_id'] = executor_id
    job_description['job_id'] = job_id

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['host_job_serialize_time'] = round(time.time() - job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    logger.info(log_msg)
    if not log_level:
        print(log_msg)

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()
    host_job_meta['host_data_upload_time'] = round(data_upload_end - data_upload_start, 6)

    # Upload function and modules
    func_upload_start = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    func_upload_end = time.time()
    host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job_description['metadata'] = host_job_meta

    return job_description

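
# Illustrative sketch (assumption about behaviour, not the real utils.agg_data): the
# aggregation step above is assumed to concatenate the per-call pickled payloads into a
# single blob and to record each call's (first, last) byte offsets, so a worker can later
# fetch only its own slice of the aggregated data object with a ranged read.
def _agg_data_sketch(data_strs):
    ranges = []
    pos = 0
    for payload in data_strs:
        ranges.append((pos, pos + len(payload) - 1))   # inclusive byte range of this call
        pos += len(payload)
    return b''.join(data_strs), ranges

blob, byte_ranges = _agg_data_sketch([b'call-0', b'call-1-payload'])
assert blob == b'call-0call-1-payload'
assert byte_ranges == [(0, 5), (6, 19)]
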
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata, runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None, worker_processes=None,
                invoke_pool_threads=16):
    """
    Creates a new Job
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = worker_processes or config['lithops']['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless']['invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless']['runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['host_job_serialize_time'] = round(time.time() - job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_byte_ranges = utils.agg_data(data_strs)
    job.data_byte_ranges = data_byte_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()
    host_job_meta['host_data_upload_time'] = round(data_upload_end - data_upload_start, 6)

    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        function_hash = hashlib.md5(open(function_file, 'rb').read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        uuid = '{}{}'.format(function_hash, mod_hash)
        func_key = create_func_key(JOBS_PREFIX, uuid, "")
        _store_func_and_modules(func_key, func_str, module_data)
        job.ext_runtime_uuid = uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)
    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job

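
# Illustrative sketch (hypothetical helper with example numbers): every mode-specific branch
# above applies the same rule -- the user-facing execution timeout must expire a few seconds
# before the platform itself kills the worker, so it is clamped with a small safety margin
# (5 s against the serverless runtime_timeout, 10 s against the standalone
# hard_dismantle_timeout).
def _clamp_execution_timeout(execution_timeout, platform_timeout, margin):
    if execution_timeout >= platform_timeout:
        return platform_timeout - margin
    return execution_timeout

assert _clamp_execution_timeout(600, 600, 5) == 595       # example serverless values
assert _clamp_execution_timeout(7200, 3600, 10) == 3590   # example standalone values
assert _clamp_execution_timeout(60, 600, 5) == 60         # already within the limit
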
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata, runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None):
    """
    Creates a new Job
    """
    global FUNCTION_CACHE

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    mode = config['lithops']['mode']
    backend = config['lithops']['backend']

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = config[backend]['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__ if inspect.isfunction(func) or inspect.ismethod(func) else type(func).__name__
    job.total_calls = len(iterdata)

    if mode == SERVERLESS:
        job.runtime_memory = runtime_memory or config[backend]['runtime_memory']
        job.runtime_timeout = config[backend]['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode in STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config[STANDALONE]['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = None

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)

    host_job_meta['host_job_serialize_time'] = round(time.time() - job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    # Check data limit
    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    # Upload function and data
    upload_function = not config['lithops'].get('customized_runtime', False)
    upload_data = not (len(str(data_strs[0])) * job.chunksize < 8 * 1024 and backend in FAAS_BACKENDS)

    # Upload function and modules
    if upload_function:
        function_hash = hashlib.md5(func_module_str).hexdigest()
        job.func_key = create_func_key(executor_id, function_hash)
        if job.func_key not in FUNCTION_CACHE:
            logger.debug('ExecutorID {} | JobID {} - Uploading function and modules '
                         'to the storage backend'.format(executor_id, job_id))
            func_upload_start = time.time()
            internal_storage.put_func(job.func_key, func_module_str)
            func_upload_end = time.time()
            host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
            FUNCTION_CACHE.add(job.func_key)
        else:
            logger.debug('ExecutorID {} | JobID {} - Function and modules '
                         'found in local cache'.format(executor_id, job_id))
            host_job_meta['host_func_upload_time'] = 0
    else:
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        function_hash = hashlib.md5(open(function_file, 'rb').read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        job.func_key = func_key_suffix
        job.ext_runtime_uuid = '{}{}'.format(function_hash, mod_hash)
        job.local_tmp_dir = os.path.join(CUSTOM_RUNTIME_DIR, job.ext_runtime_uuid)
        _store_func_and_modules(job.local_tmp_dir, job.func_key, func_str, module_data)
        host_job_meta['host_func_upload_time'] = 0

    # Upload data
    if upload_data:
        # Upload iterdata to COS only if a single element is greater than 8KB
        logger.debug('ExecutorID {} | JobID {} - Uploading data to the storage backend'
                     .format(executor_id, job_id))
        # pass iterdata through an object storage file
        data_key = create_data_key(executor_id, job_id)
        job.data_key = data_key
        data_bytes, data_byte_ranges = utils.agg_data(data_strs)
        job.data_byte_ranges = data_byte_ranges
        data_upload_start = time.time()
        internal_storage.put_data(data_key, data_bytes)
        data_upload_end = time.time()
        host_job_meta['host_data_upload_time'] = round(data_upload_end - data_upload_start, 6)
    else:
        # pass iterdata as part of the invocation payload
        logger.debug('ExecutorID {} | JobID {} - Data per activation is < '
                     '{}. Passing data through invocation payload'
                     .format(executor_id, job_id, utils.sizeof_fmt(8 * 1024)))
        job.data_key = None
        job.data_byte_ranges = None
        job.data_byte_strs = data_strs
        host_job_meta['host_data_upload_time'] = 0

    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job

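
# Illustrative sketch (toy stand-ins, not the real InternalStorage/FUNCTION_CACHE API): the
# last version above derives the function's storage key from an md5 of the serialized
# function+modules and keeps a process-level set of keys already uploaded, so repeated jobs
# that map the same function skip the upload entirely.
import hashlib

_UPLOADED_KEYS = set()   # hypothetical stand-in for the module-level FUNCTION_CACHE

def _upload_once(put_func, func_module_str, make_key):
    func_key = make_key(hashlib.md5(func_module_str).hexdigest())
    if func_key not in _UPLOADED_KEYS:
        put_func(func_key, func_module_str)        # first time only: hit the storage backend
        _UPLOADED_KEYS.add(func_key)
    return func_key

storage = {}                                        # toy in-memory "storage backend"
key_a = _upload_once(storage.__setitem__, b'pickled-func', lambda h: 'func/' + h)
key_b = _upload_once(storage.__setitem__, b'pickled-func', lambda h: 'func/' + h)
assert key_a == key_b and len(storage) == 1         # identical function uploaded only once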