def _create_job(config, internal_storage, executor_id, job_id, func, data,
                runtime_meta, runtime_memory=None, extra_env=None,
                invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                original_func_name=None, remote_invocation=False,
                original_total_tasks=None, execution_timeout=EXECUTION_TIMEOUT):
    """
    :param func: the function to map over the data
    :param data: An iterable of input data
    :param extra_env: Additional environment variables for the CF environment. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: Explicitly include these modules in the pickled dependencies.
    :param exclude_modules: Explicitly keep these modules out of the pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A job description dict, or an empty list if `data` is empty.
    :rtype: dict
    """
    log_level = os.getenv('CB_LOG_LEVEL')
    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    if extra_env is not None:
        extra_env = utils.convert_bools_to_string(extra_env)

    if not data:
        return []

    host_job_meta = {}
    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['func_name'] = func_name
    job_description['extra_env'] = extra_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['job_id'] = job_id
    job_description['remote_invocation'] = remote_invocation
    job_description['original_total_calls'] = original_total_tasks

    log_msg = 'ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id)
    logger.debug(log_msg)

    # Pickle func and all data (to capture module dependencies).
    # Copy the argument lists before extending them so the mutable
    # default lists are not modified in place across calls.
    exclude_modules = list(exclude_modules)
    exclude_modules.extend(config['pywren'].get('exclude_modules', []))
    include_modules_cfg = config['pywren'].get('include_modules', [])
    if include_modules is not None and include_modules_cfg is not None:
        include_modules = list(include_modules)
        include_modules.extend(include_modules_cfg)
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, include_modules, exclude_modules)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)

    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(executor_id, job_id)
    logger.info(log_msg)
    if not log_level:
        print(log_msg, end=' ')

    if data_size_bytes < MAX_AGG_DATA_SIZE:
        agg_data_key = create_agg_data_key(internal_storage.prefix, executor_id, job_id)
        job_description['data_key'] = agg_data_key
        agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
        job_description['data_ranges'] = agg_data_ranges
        agg_upload_time = time.time()
        internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded '
                   'maximum size of {} bytes'.format(executor_id, job_id, MAX_AGG_DATA_SIZE))
        raise Exception(log_msg)

    module_data = create_module_data(mod_paths)

    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(internal_storage.prefix, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        func_and_data_size = utils.sizeof_fmt(host_job_meta['func_module_bytes'] + host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    job_description['host_job_meta'] = host_job_meta

    return job_description
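
# For illustration only: a minimal sketch of what an _agg_data-style helper
# (called as `_agg_data` above and `utils.agg_data` in the later version)
# could look like. The real helper is defined elsewhere in the codebase.
# The idea is to concatenate the pickled per-call payloads into a single
# blob and record each call's byte range, which is what gets stored as
# `data_ranges` so each worker can fetch only its own slice of the object.
def _agg_data_sketch(data_strs):
    """Concatenate byte payloads and return (blob, per-call byte ranges)."""
    ranges = []
    pos = 0
    for datum in data_strs:
        # Assumption: ranges are inclusive (start, end) byte offsets.
        ranges.append((pos, pos + len(datum) - 1))
        pos += len(datum)
    return b''.join(data_strs), ranges
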
def _create_job(config, internal_storage, executor_id, job_id, func, data,
                runtime_meta, runtime_memory=None, extra_env=None,
                invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                execution_timeout=None, job_created_timestamp=None):
    """
    :param func: the function to map over the data
    :param data: An iterable of input data
    :param extra_env: Additional environment variables for the CF environment. Default None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: Explicitly include these modules in the pickled dependencies.
    :param exclude_modules: Explicitly keep these modules out of the pickled dependencies.
    :param execution_timeout: Timeout for each call. Defaults to the runtime timeout minus 5 seconds.
    :param job_created_timestamp: When the job was created; recorded in the job metadata.
    :return: A job description dict, or an empty list if `data` is empty.
    :rtype: dict
    """
    log_level = os.getenv('PYWREN_LOGLEVEL')
    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = config['pywren']['runtime_timeout'] - 5

    job_description = {}
    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = int(runtime_memory)
    job_description['execution_timeout'] = execution_timeout
    job_description['function_name'] = func.__name__
    job_description['extra_env'] = ext_env
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['executor_id'] = executor_id
    job_description['job_id'] = job_id

    # Merge the include/exclude module lists from the arguments and the
    # config. `inc_modules = None` means "include every detected module".
    exclude_modules_cfg = config['pywren'].get('exclude_modules', [])
    include_modules_cfg = config['pywren'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    host_job_meta = {'job_created_timestamp': job_created_timestamp}

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules, exc_modules)

    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    # data_limit is expressed in MiB
    data_limit = config['pywren'].get('data_limit', MAX_AGG_DATA_SIZE)

    if data_limit and data_size_bytes > data_limit * 1024 ** 2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, utils.sizeof_fmt(data_limit * 1024 ** 2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    if not log_level:
        print(log_msg)
    else:
        logger.info(log_msg)

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    data_upload_time = time.time()
    internal_storage.put_data(data_key, data_bytes)
    host_job_meta['data_upload_time'] = time.time() - data_upload_time
    host_job_meta['data_upload_timestamp'] = time.time()

    # Upload function and modules
    func_upload_time = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    job_description['metadata'] = host_job_meta

    return job_description
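
# For illustration only: a hypothetical call to _create_job. The config keys
# shown ('runtime', 'runtime_memory', 'runtime_timeout') are the ones this
# function reads; `my_storage` and `my_runtime_meta` are made-up placeholder
# names for the InternalStorage instance and runtime metadata that the
# surrounding executor would normally supply.
def _example_usage(my_storage, my_runtime_meta):
    config = {'pywren': {'runtime': 'python3.6',
                         'runtime_memory': 256,
                         'runtime_timeout': 600}}

    def double(x):
        return 2 * x

    # One entry in `data` per call; each entry is pickled independently.
    job = _create_job(config, my_storage, executor_id='0000', job_id='M000',
                      func=double, data=[{'x': 1}, {'x': 2}, {'x': 3}],
                      runtime_meta=my_runtime_meta,
                      job_created_timestamp=time.time())
    # job['total_calls'] == 3; job['data_key'] and job['func_key'] point at
    # the payloads uploaded to the storage backend.
    return job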