def create_map_job(config, internal_storage, executor_id, map_job_id,
                   map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None,
                   obj_chunk_size=None, obj_chunk_number=None,
                   remote_invocation=False, remote_invocation_groups=None,
                   invoke_pool_threads=128, include_modules=[],
                   exclude_modules=[], is_remote_cluster=False,
                   execution_timeout=EXECUTION_TIMEOUT):
    """
    Build the job description for a map of ``map_function`` over ``iterdata``.

    The input is first checked with ``utils.verify_args``. If ``map_function``
    is an object processing function, the checked data is replaced by the
    partitions returned by ``create_partitions``. If remote invocation is
    still enabled afterwards, the job is rewritten so that a local
    ``remote_invoker`` function runs the real map through
    ``pywren.ibm_cf_executor()``.

    :param remote_invocation: requested remote invocation. It is forced to
        False when there is a single task or ``is_remote_cluster`` is set.
    :param remote_invocation_groups: when set, ``iterdata`` is sliced in
        groups of this size and each slice becomes one remote invoker call;
        otherwise a single call receives the whole ``iterdata``.
    :return: the value returned by ``_create_job``, with the partition
        information stored under the ``'parts_per_object'`` key (None when
        no partitioning took place).

    The remaining parameters are forwarded to ``_create_job``,
    ``create_partitions`` or ``utils.verify_args`` as-is.
    """
    job_func = map_function
    job_iterdata = utils.verify_args(map_function, iterdata, extra_params)
    pool_threads = invoke_pool_threads

    # --- Object processing: split the input in partitions
    # (by chunk size or by chunk number) ---
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'.format(executor_id, map_job_id))
        job_iterdata, parts_per_object = create_partitions(
            config, job_iterdata, obj_chunk_size, obj_chunk_number)

    # --- Remote invocation ---
    original_total_tasks = len(job_iterdata)
    if is_remote_cluster or original_total_tasks == 1:
        remote_invocation = False

    if remote_invocation:
        # NOTE: the function name and its 'input_data' parameter are kept
        # unchanged because the function object is handed to
        # utils.verify_args and _create_job.
        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor()
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env)

        job_func = remote_invoker

        if remote_invocation_groups:
            # One single-argument call per slice of the original iterdata
            grouped = []
            for start in range(0, original_total_tasks, remote_invocation_groups):
                stop = start + remote_invocation_groups
                grouped.append([iterdata[start:stop]])
        else:
            grouped = [iterdata]

        job_iterdata = utils.verify_args(remote_invoker, grouped, extra_params)
        # The invoker calls are issued with a single thread
        pool_threads = 1

    job_description = _create_job(
        config, internal_storage, executor_id, map_job_id,
        job_func, job_iterdata,
        runtime_meta=runtime_meta,
        runtime_memory=runtime_memory,
        extra_env=extra_env,
        invoke_pool_threads=pool_threads,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
        remote_invocation=remote_invocation,
        original_total_tasks=original_total_tasks,
        execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
def create_map_job(config, internal_storage, executor_id, job_id,
                   map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None,
                   obj_chunk_size=None, obj_chunk_number=None,
                   invoke_pool_threads=128, include_modules=[],
                   exclude_modules=[], execution_timeout=None):
    """
    Build the job description for a map of ``map_function`` over ``iterdata``.

    Steps, in order:
      1. Take the job creation timestamp.
      2. Check the input with ``utils.verify_args``.
      3. If ``config['pywren']['rabbitmq_monitor']`` is enabled, create the
         rabbitmq resources for this executor/job.
      4. If ``map_function`` is an object processing function, replace the
         input by the partitions returned by ``create_partitions``.
      5. Delegate to ``_create_job``.

    :return: the value returned by ``_create_job``. The
        ``'parts_per_object'`` key is only added when partitioning produced
        a truthy value.
    """
    created_at = time.time()
    checked_iterdata = utils.verify_args(map_function, iterdata, extra_params)

    monitor_enabled = config['pywren'].get('rabbitmq_monitor', False)
    if monitor_enabled:
        utils.create_rabbitmq_resources(config['rabbitmq'].get('amqp_url'),
                                        executor_id, job_id)

    # --- Object processing: split the input in partitions
    # (by chunk size or by chunk number) ---
    parts_per_object = None
    if is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, job_id))
        checked_iterdata, parts_per_object = create_partitions(
            config, checked_iterdata, obj_chunk_size, obj_chunk_number)

    job_description = _create_job(
        config, internal_storage, executor_id, job_id,
        map_function, checked_iterdata,
        runtime_meta=runtime_meta,
        runtime_memory=runtime_memory,
        extra_env=extra_env,
        invoke_pool_threads=invoke_pool_threads,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
        execution_timeout=execution_timeout,
        job_created_timestamp=created_at)

    if parts_per_object:
        job_description['parts_per_object'] = parts_per_object

    return job_description
def create_map_job(config, internal_storage, executor_id, map_job_id,
                   map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None,
                   obj_chunk_size=None, obj_chunk_number=None,
                   invoke_pool_threads=128, include_modules=[],
                   exclude_modules=[], execution_timeout=EXECUTION_TIMEOUT):
    """
    Build the job description for a map of ``map_function`` over ``iterdata``.

    The input is checked with ``utils.verify_args``. If ``map_function`` is
    an object processing function, the checked data is replaced by the
    partitions returned by ``create_partitions``. Everything else is
    forwarded unchanged to ``_create_job``.

    :return: the value returned by ``_create_job``, with the partition
        information stored under the ``'parts_per_object'`` key (None when
        no partitioning took place).
    """
    checked_iterdata = utils.verify_args(map_function, iterdata, extra_params)

    # --- Object processing: split the input in partitions
    # (by chunk size or by chunk number) ---
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, map_job_id))
        checked_iterdata, parts_per_object = create_partitions(
            config, checked_iterdata, obj_chunk_size, obj_chunk_number)

    job_description = _create_job(
        config, internal_storage, executor_id, map_job_id,
        map_function, checked_iterdata,
        runtime_meta=runtime_meta,
        runtime_memory=runtime_memory,
        extra_env=extra_env,
        invoke_pool_threads=invoke_pool_threads,
        include_modules=include_modules,
        exclude_modules=exclude_modules,
        execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
def run(self):
    """
    Runs the function.

    Loads the function, its modules and its input data, calls the function
    with the data as keyword arguments, and pickles a non-None result.
    If anything in that sequence raises, the exception is printed to stdout
    and a pickled form of it is written to the stats under ``exc_info``.

    Finally, when there is a result, no exception happened and the
    ``STORE_RESULT`` environment variable (default ``'True'``) is truthy,
    the pickled output is uploaded to ``self.output_key``. Afterwards
    ``"Finished"`` is put on ``self.result_queue``.
    """
    logger.info("Started")
    result = None
    exception = False
    try:
        self.internal_storage = InternalStorage(self.storage_config)
        self.internal_storage.tmp_obj_prefix = self.output_key.rsplit('/', 1)[0]
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()

        if is_object_processing_function(function):
            self._create_data_stream(data)

        self._fill_optional_args(function, data)

        if self.show_memory:
            logger.debug("Memory usage before call the function: {}".format(
                get_current_memory_usage()))

        logger.info("Going to execute '{}()'".format(str(function.__name__)))
        print('---------------------- FUNCTION LOG ----------------------',
              flush=True)
        func_exec_time_t1 = time.time()
        result = function(**data)
        func_exec_time_t2 = time.time()
        print('----------------------------------------------------------',
              flush=True)
        logger.info("Success function execution")

        if self.show_memory:
            logger.debug("Memory usage after call the function: {}".format(
                get_current_memory_usage()))

        self.stats.write('function_exec_time',
                         round(func_exec_time_t2 - func_exec_time_t1, 8))

        # Check for new futures
        if result is not None:
            self.stats.write("result", True)
            # isinstance (rather than an exact type comparison) so that list
            # subclasses holding futures are recognised as well.
            returns_futures = isinstance(result, ResponseFuture) or (
                isinstance(result, list)
                and len(result) > 0
                and isinstance(result[0], ResponseFuture))
            if returns_futures:
                self.stats.write('new_futures', True)

            logger.debug("Pickling result")
            output_dict = {'result': result}
            pickled_output = pickle.dumps(output_dict)

            if self.show_memory:
                logger.debug("Memory usage after output serialization: {}".format(
                    get_current_memory_usage()))
        else:
            logger.debug("No result to store")
            self.stats.write("result", False)

    except Exception:
        exception = True
        self.stats.write("exception", True)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)

        if self.show_memory:
            logger.debug("Memory usage after call the function: {}".format(
                get_current_memory_usage()))

        try:
            logger.debug("Pickling exception")
            pickled_exc = pickle.dumps((exc_type, exc_value, exc_traceback))
            # this is just to make sure they can be unpickled
            pickle.loads(pickled_exc)
            self.stats.write("exc_info", str(pickled_exc))

        except Exception as pickle_exception:
            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. As a result, we actually wrap this in a
            # try/catch block and more-carefully handle the exceptions if
            # any part of this save / test-reload fails
            self.stats.write("exc_pickle_fail", True)
            pickled_exc = pickle.dumps({
                'exc_type': str(exc_type),
                'exc_value': str(exc_value),
                'exc_traceback': exc_traceback,
                'pickle_exception': pickle_exception
            })
            # this is just to make sure they can be unpickled
            pickle.loads(pickled_exc)
            self.stats.write("exc_info", str(pickled_exc))
    finally:
        # NOTE(review): if strtobool or put_data raises here, the "Finished"
        # marker below is never queued - confirm the consumer of
        # result_queue copes with that.
        store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
        if result is not None and store_result and not exception:
            output_upload_timestamp_t1 = time.time()
            logger.info("Storing function result - output.pickle - Size: {}".format(
                sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_timestamp_t2 = time.time()
            self.stats.write(
                "output_upload_time",
                round(output_upload_timestamp_t2 - output_upload_timestamp_t1, 8))
        self.result_queue.put("Finished")
        logger.info("Finished")
def create_map_job(config, internal_storage, executor_id, job_id,
                   map_function, iterdata, obj_chunk_size=None,
                   extra_env=None, extra_meta=None, runtime_memory=None,
                   remote_invocation=False, remote_invocation_groups=None,
                   invoke_pool_threads=128, exclude_modules=None,
                   is_cf_cluster=False, execution_timeout=EXECUTION_TIMEOUT,
                   overwrite_invoke_args=None):
    """
    Build the job description for a map of ``map_function`` over ``iterdata``.

    The job is created under the id ``'M' + job_id``. The input is turned
    into a list with ``utils.iterdata_as_list``. If ``map_function`` is an
    object processing function, the data is checked with
    ``utils.verify_args``, partitioned with ``create_partitions`` and the
    function is wrapped with ``partition_processor``. If remote invocation
    is still enabled afterwards, the job is rewritten so that a local
    ``remote_invoker`` function runs the real map through
    ``pywren.ibm_cf_executor``.

    :param remote_invocation: requested remote invocation. It is forced to
        False when there is a single task or ``is_cf_cluster`` is set.
    :param remote_invocation_groups: when set, ``iterdata`` is sliced in
        groups of this size and each slice becomes one remote invoker call;
        otherwise a single call receives the whole ``iterdata``.
    :return: tuple ``(job_description, parts_per_object)`` where
        ``job_description`` is the value returned by ``_create_job`` and
        ``parts_per_object`` is None when no partitioning took place.
    """
    map_job_id = f'M{job_id}'

    job_func = map_function
    job_iterdata = utils.iterdata_as_list(iterdata)
    pool_threads = invoke_pool_threads

    # --- Object processing ---
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        # If it is object processing function, create partitions
        # according chunk_size
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, job_id))
        arg_data = utils.verify_args(map_function, job_iterdata,
                                     object_processing=True)
        job_iterdata, parts_per_object = create_partitions(
            config, arg_data, obj_chunk_size)
        job_func = partition_processor(map_function)

    # --- Remote invocation ---
    original_total_tasks = len(job_iterdata)
    if is_cf_cluster or original_total_tasks == 1:
        remote_invocation = False

    if remote_invocation:
        rabbitmq_monitor = "CB_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        job_func = remote_invoker

        if remote_invocation_groups:
            # One single-argument call per slice of the original iterdata
            group_starts = range(0, original_total_tasks,
                                 remote_invocation_groups)
            job_iterdata = [
                [iterdata[start:start + remote_invocation_groups]]
                for start in group_starts
            ]
        else:
            job_iterdata = [iterdata]

        # The invoker calls are issued with a single thread
        pool_threads = 1

    job_description = _create_job(
        config, internal_storage, executor_id,
        map_job_id, job_func, job_iterdata,
        extra_env=extra_env,
        extra_meta=extra_meta,
        runtime_memory=runtime_memory,
        invoke_pool_threads=pool_threads,
        overwrite_invoke_args=overwrite_invoke_args,
        exclude_modules=exclude_modules,
        original_func_name=map_function.__name__,
        remote_invocation=remote_invocation,
        original_total_tasks=original_total_tasks,
        execution_timeout=execution_timeout)

    return job_description, parts_per_object