def run_job(self, job_payload):
    """
    Run the job description against the selected environment.

    The payload is written as JSON to
    <LITHOPS_TEMP_DIR>/<storage_bucket>/<JOBS_PREFIX>/<job_key>-job.json and
    the environment's execution command is started on that file with Popen.
    The child process is not waited for.

    :param job_payload: dict holding 'executor_id', 'job_id',
        'job_description' (with 'runtime_name') and 'config'
        (with ['lithops']['storage_bucket'])
    """
    executor_id = job_payload['executor_id']
    job_id = job_payload['job_id']
    runtime = job_payload['job_description']['runtime_name']

    job_key = create_job_key(executor_id, job_id)
    log_file = os.path.join(LOGS_DIR, job_key + '.log')
    logger.info("Running job in {}. View execution logs at {}".format(
        runtime, log_file))

    if not os.path.isfile(RUNNER):
        self.env.setup()

    exec_command = self.env.get_execution_cmd(runtime)

    storage_bucket = job_payload['config']['lithops']['storage_bucket']
    job_dir = os.path.join(LITHOPS_TEMP_DIR, storage_bucket, JOBS_PREFIX)
    os.makedirs(job_dir, exist_ok=True)
    jobr_filename = os.path.join(job_dir, '{}-job.json'.format(job_key))

    with open(jobr_filename, 'w') as jl:
        json.dump(job_payload, jl)

    # The child receives its own copy of the descriptor when Popen returns,
    # so the parent's handle can be closed right away instead of leaking.
    with open(RN_LOG_FILE, 'a') as runner_log:
        sp.Popen(exec_command + ' run ' + jobr_filename, shell=True,
                 stdout=runner_log, stderr=runner_log,
                 universal_newlines=True)
def run():
    """
    Execute the job described by the JSON file passed as sys.argv[2].

    stdout and stderr are redirected to RN_LOG_FILE, the job is run through a
    Runner, a '<job_key>.done' marker is touched in JOBS_DONE_DIR and the job
    file is removed afterwards.
    """
    # From here on everything printed by this process lands in the runner log
    runner_log = open(RN_LOG_FILE, 'a')
    sys.stdout = sys.stderr = runner_log

    job_filename = sys.argv[2]
    logger.info('Got {} job file'.format(job_filename))

    with open(job_filename, 'rb') as jf:
        job_data = json.load(jf)
    job = SimpleNamespace(**job_data)

    logger.info('ExecutorID {} | JobID {} - Starting execution'
                .format(job.executor_id, job.job_id))

    runner = Runner(job.config, job.executor_id, job.job_id)
    runner.run(job.job_description, job.log_level)
    runner.wait()

    # Marker file signalling that the job has finished
    done_name = create_job_key(job.executor_id, job.job_id) + '.done'
    Path(os.path.join(JOBS_DONE_DIR, done_name)).touch()

    if os.path.exists(job_filename):
        os.remove(job_filename)

    logger.info('ExecutorID {} | JobID {} - Execution Finished'
                .format(job.executor_id, job.job_id))
def _process_runner(self, worker_id):
    """
    Worker loop: take events from self.queue and run function_handler on
    each of them until a ShutdownSentinel is received.

    stdout/stderr produced while handling an event are captured and appended,
    wrapped in an "Activation" header and tail, to both the job log file and
    FN_LOG_FILE.

    :param worker_id: identifier used only in the start-up debug message
    """
    logger.debug('Localhost worker process {} started'.format(worker_id))
    os.environ['__LITHOPS_LOCAL_EXECUTION'] = 'True'

    lithops_logger = logging.getLogger('lithops')

    while True:
        with io.StringIO() as captured, \
                redirect_stdout(captured), redirect_stderr(captured):
            event = self.queue.get(block=True)
            if isinstance(event, ShutdownSentinel):
                break

            activation_id = str(uuid.uuid4()).replace('-', '')[:12]
            os.environ['__LITHOPS_ACTIVATION_ID'] = activation_id
            executor_id = event['executor_id']
            job_id = event['job_id']

            setup_logger(event['log_level'])
            lithops_logger.info(
                "Lithops v{} - Starting execution".format(__version__))
            function_handler(event)
            # Must be read before the buffer is closed by the with-block
            log_output = captured.getvalue()

        job_log = os.path.join(
            LOGS_DIR, create_job_key(executor_id, job_id) + '.log')
        header = "Activation: '{}' ({})\n[\n".format(
            event['runtime_name'], activation_id)
        # Indent every captured line; the last newline is left untouched
        body = log_output.replace('\n', '\n ', log_output.count('\n')-1)
        entry = header + ' ' + body + ']\n\n'

        for path in (job_log, FN_LOG_FILE):
            with open(path, 'a') as lf:
                lf.write(entry)
def _send_status_rabbitmq(self):
    """
    Send the status event to RabbitMQ.

    self.response is serialised to JSON and published to the fanout exchange
    'lithops-<job_key>'. Up to 5 attempts are made, sleeping 0.2 seconds
    between them. Failures are logged and never raised to the caller.
    """
    dmpd_response_status = json.dumps(self.response)
    drs = sizeof_fmt(len(dmpd_response_status))

    executor_id = self.response['executor_id']
    job_id = self.response['job_id']

    rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
    params = pika.URLParameters(rabbit_amqp_url)
    job_key = create_job_key(executor_id, job_id)
    exchange = 'lithops-{}'.format(job_key)

    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        connection = None
        try:
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.exchange_declare(exchange=exchange,
                                     exchange_type='fanout',
                                     auto_delete=True)
            channel.basic_publish(exchange=exchange, routing_key='',
                                  body=dmpd_response_status)
            connection.close()
            logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
            return
        except Exception as e:
            logger.error("Unable to send status to rabbitmq")
            logger.error(str(e))
            # Release the connection of the failed attempt so retries do not
            # pile up open sockets.
            if connection is not None and connection.is_open:
                try:
                    connection.close()
                except Exception as close_err:
                    logger.debug('Unable to close rabbitmq connection: {}'
                                 .format(close_err))
            # Only announce and wait for a retry when one will actually happen
            if attempt < max_attempts:
                logger.info('Retrying to send status to rabbitmq')
                time.sleep(0.2)
def clean(self, fs=None, cs=None, clean_cloudobjects=True, spawn_cleaner=True, force=False):
    """
    Deletes all the temp files from storage. These files include the function,
    the data serialization and the function invocation results. It can also
    clean cloudobjects.

    What has to be removed is pickled into files inside CLEANER_DIR; the
    'lithops.scripts.cleaner' module is then started in a child process.

    :param fs: list of futures to clean (self.futures when empty/None)
    :param cs: list of cloudobjects to clean
    :param clean_cloudobjects: true/false
    :param spawn_cleaner: true/false. Start the cleaner process
    :param force: true/false. Include every future, even if it is not done
    """
    os.makedirs(CLEANER_DIR, exist_ok=True)

    def save_data_to_clean(data):
        with tempfile.NamedTemporaryFile(dir=CLEANER_DIR, delete=False) as temp:
            pickle.dump(data, temp)

    if cs:
        data = {
            'cos_to_clean': list(cs),
            'storage_config': self.internal_storage.get_storage_config()
        }
        save_data_to_clean(data)
        if not fs:
            # NOTE(review): returns before the cleaner is spawned; the saved
            # data is only picked up by a later cleaner run - confirm intended.
            return

    futures = fs or self.futures
    # isinstance keeps list subclasses as-is instead of wrapping them
    futures = futures if isinstance(futures, list) else [futures]
    present_jobs = {
        create_job_key(f.executor_id, f.job_id)
        for f in futures
        if (f.executor_id.count('-') == 1 and f.done) or force
    }
    jobs_to_clean = present_jobs - self.cleaned_jobs

    if jobs_to_clean:
        logger.info("ExecutorID {} - Cleaning temporary data".format(
            self.executor_id))
        data = {
            'jobs_to_clean': jobs_to_clean,
            'clean_cloudobjects': clean_cloudobjects,
            'storage_config': self.internal_storage.get_storage_config()
        }
        save_data_to_clean(data)
        self.cleaned_jobs.update(jobs_to_clean)

        self.compute_handler.clear()

    if (jobs_to_clean or cs) and spawn_cleaner:
        # Argument list without a shell: works even when the interpreter path
        # contains spaces. The log handle is closed in the parent once the
        # child has inherited it.
        cmd = [sys.executable, '-m', 'lithops.scripts.cleaner']
        with open(CLEANER_LOG_FILE, 'a') as log_file:
            sp.Popen(cmd, stdout=log_file, stderr=log_file)
def start_job_monitoring(self, job):
    """
    Start the thread that monitors the given job and register it in
    self.monitors under the job key.

    The RabbitMQ based monitor is used when self.rabbitmq_monitor is set,
    otherwise the object storage based one.

    :param job: job object exposing executor_id and job_id
    """
    logger.debug('ExecutorID {} | JobID {} - Starting job monitoring'
                 .format(job.executor_id, job.job_id))

    monitor_fn = (self._job_monitoring_rabbitmq if self.rabbitmq_monitor
                  else self._job_monitoring_os)
    monitor_thread = Thread(target=monitor_fn, args=(job,))

    if not self.is_lithops_worker:
        monitor_thread.daemon = True

    key = create_job_key(job.executor_id, job.job_id)
    self.monitors[key] = {'thread': monitor_thread, 'should_run': True}
    monitor_thread.start()
def create_resources(rabbit_amqp_url, executor_id, job_id):
    """
    Create the RabbitMQ resources of a given job: the fanout exchange
    'lithops-<job_key>' and the two queues '<exchange>-0' and '<exchange>-1'
    bound to it. All of them are declared with auto_delete=True.

    :param rabbit_amqp_url: amqp url for accessing rabbitmq
    :param executor_id: executor's ID
    :param job_id: job's ID
    """
    job_key = create_job_key(executor_id, job_id)
    exchange = 'lithops-{}'.format(job_key)
    queue_0 = '{}-0'.format(exchange)  # For waiting
    queue_1 = '{}-1'.format(exchange)  # For invoker

    params = pika.URLParameters(rabbit_amqp_url)
    connection = pika.BlockingConnection(params)
    try:
        channel = connection.channel()
        channel.exchange_declare(exchange=exchange, exchange_type='fanout',
                                 auto_delete=True)
        for queue in (queue_0, queue_1):
            channel.queue_declare(queue=queue, auto_delete=True)
            channel.queue_bind(exchange=exchange, queue=queue)
    finally:
        # Close the connection even when a declare/bind call fails
        if connection.is_open:
            connection.close()
def run(self, job_description, log_level):
    """
    Invoke the calls of a job and then queue one ShutdownSentinel per worker.

    When the description carries a 'call_id' only that call is invoked,
    otherwise one call per index in range(total_calls) is invoked, using
    zero-padded five digit call ids.

    :param job_description: dict with the job attributes
    :param log_level: log level forwarded to every invocation
    """
    logger.info("Localhost run method")

    job = SimpleNamespace(**job_description)
    job_key = create_job_key(job.executor_id, job.job_id)

    if hasattr(job, 'call_id'):
        logger.info("Running single call id {}-{}".format(
            job_key, job.call_id))
        self._invoke(job, job.call_id, log_level)
    else:
        logger.info("Running entire job {}".format(job_key))
        for call_index in range(job.total_calls):
            self._invoke(job, "{:05d}".format(call_index), log_level)

    # One sentinel per worker
    for _ in self.workers:
        self.job_queue.put(ShutdownSentinel())
def _job_monitoring_os(self, job):
    """
    Poll the internal storage once per second and put one '#' token in
    self.token_bucket_q for every newly finished call of the job.

    The loop ends when all the calls are done or when the 'should_run' flag
    of the job's monitor entry is cleared.

    :param job: job object exposing executor_id, job_id and total_calls
    """
    job_key = create_job_key(job.executor_id, job.job_id)
    done_so_far = 0

    while self.monitors[job_key]['should_run'] and done_so_far < job.total_calls:
        time.sleep(1)
        _, callids_done = self.internal_storage.get_job_status(job.executor_id, job.job_id)

        new_tokens = len(callids_done) - done_so_far
        done_so_far += new_tokens

        for _ in range(new_tokens):
            # Stop handing out tokens as soon as the monitor is told to stop
            if not self.monitors[job_key]['should_run']:
                break
            self.token_bucket_q.put('#')

    logger.debug('ExecutorID {} | JobID {} - Job monitoring finished'
                 .format(job.executor_id, job.job_id))
def _start_log_monitor(self, executor_id, job_id):
    """
    Starts a daemon thread that polls the remote log into a local file.

    The thread runs 'tail -F' on the remote job log through SSH and copies
    everything it receives to <LOGS_DIR>/<job_key>.log and to FN_LOG_FILE.
    It stops when the channel is closed or, once some data has been
    received, when the remote '<job_key>.done' file exists.
    Nothing is started when running inside a lithops worker.

    :param executor_id: executor's ID
    :param job_id: job's ID
    """
    job_key = create_job_key(executor_id, job_id)

    def log_monitor():
        os.makedirs(LOGS_DIR, exist_ok=True)
        log_file = os.path.join(LOGS_DIR, job_key + '.log')

        # Both local files are closed when the monitor ends, whatever the
        # reason.
        with open(log_file, 'wb') as fdout_0, open(FN_LOG_FILE, 'ab') as fdout_1:
            ssh_client = self.ssh_client.create_client(self.ip_address)
            cmd = 'tail -n +1 -F /tmp/lithops/logs/{}.log'.format(job_key)
            stdin, stdout, stderr = ssh_client.exec_command(cmd)
            channel = stdout.channel
            stdin.close()
            channel.shutdown_write()

            data = None
            while not channel.closed:
                try:
                    readq, _, _ = select.select([channel], [], [], 10)
                    if readq and readq[0].recv_ready():
                        data = channel.recv(len(readq[0].in_buffer))
                        for fdout in (fdout_0, fdout_1):
                            fdout.write(data)
                            fdout.flush()
                    else:
                        if data:
                            # Some output was already received: check
                            # whether the job has finished.
                            cmd = 'ls /tmp/lithops/jobs/{}.done'.format(
                                job_key)
                            _, out, _ = ssh_client.exec_command(cmd)
                            if out.read().decode().strip():
                                break
                        time.sleep(0.5)
                except Exception as e:
                    # Best effort: keep polling, but leave a trace and back
                    # off so a persistent error does not spin the CPU.
                    logger.debug('Remote log monitor error: {}'.format(e))
                    time.sleep(0.5)

    if not self.is_lithops_worker:
        Thread(target=log_monitor, daemon=True).start()
        logger.debug('ExecutorID {} | JobID {} - Remote log monitor '
                     'started'.format(executor_id, job_id))
def delete_rabbitmq_resources(rabbit_amqp_url, executor_id, job_id):
    """
    Deletes RabbitMQ queues and exchanges of a given job.
    Only called when an exception is produced, otherwise resources are
    automatically deleted.

    :param rabbit_amqp_url: amqp url for accessing rabbitmq
    :param executor_id: executor's ID
    :param job_id: job's ID
    """
    job_key = create_job_key(executor_id, job_id)
    exchange = 'lithops-{}'.format(job_key)
    queue_0 = '{}-0'.format(exchange)  # For waiting
    queue_1 = '{}-1'.format(exchange)  # For invoker

    params = pika.URLParameters(rabbit_amqp_url)
    connection = pika.BlockingConnection(params)
    try:
        channel = connection.channel()
        channel.queue_delete(queue=queue_0)
        channel.queue_delete(queue=queue_1)
        channel.exchange_delete(exchange=exchange)
    finally:
        # Close the connection even when one of the deletions fails
        if connection.is_open:
            connection.close()
def get_job_status(self, executor_id, job_id):
    """
    Get the status of a callset by listing the keys stored under
    '<JOBS_PREFIX>/<job_key>'.

    :param executor_id: executor's ID
    :param job_id: job's ID
    :return: A 2-tuple of sets (running, done). Every 'done' item is a tuple
        built from the second '/'-separated key component split on its last
        '-', plus the third component. Every 'running' item pairs such a
        tuple with the fourth component stripped of init_key_suffix.
    """
    job_key = create_job_key(executor_id, job_id)
    callset_prefix = '/'.join([JOBS_PREFIX, job_key])
    keys = self.storage.list_keys(self.bucket, callset_prefix)

    def call_id_of(parts):
        # parts[1] is split on its last '-', parts[2] is appended
        return tuple(parts[1].rsplit("-", 1) + [parts[2]])

    init_parts = [key.split('/') for key in keys if init_key_suffix in key]
    running_callids = {
        (call_id_of(parts), parts[3].replace(init_key_suffix, ''))
        for parts in init_parts
    }

    status_parts = [key.split('/') for key in keys if status_key_suffix in key]
    done_callids = {call_id_of(parts) for parts in status_parts}

    return running_callids, done_callids
def run():
    """
    Run a job.

    Reads the job payload from the JSON body of the current flask request,
    updates the dismantle settings of the backend handler, marks the job as
    'running' and hands the payload to a LocalhostHandler.

    :return: an error response when the payload or the runtime name is not
        valid, otherwise a 202 response carrying a new 'activationId'
    """
    global last_usage_time, backend_handler, jobs

    message = flask.request.get_json(force=True, silent=True)
    if message and not isinstance(message, dict):
        return error('The action did not receive a dictionary as an argument.')

    try:
        runtime = message['job_description']['runtime_name']
        verify_runtime_name(runtime)
    except Exception as e:
        return error(str(e))

    last_usage_time = time.time()

    standalone_config = message['config']['standalone']
    for option in ('auto_dismantle', 'soft_dismantle_timeout',
                   'hard_dismantle_timeout'):
        setattr(backend_handler, option, standalone_config[option])

    # First 12 hex digits of a random UUID
    act_id = uuid.uuid4().hex[:12]

    job_key = create_job_key(message['executor_id'], message['job_id'])
    jobs[job_key] = 'running'

    LocalhostHandler({'runtime': runtime}).run_job(message)

    response = flask.jsonify({'activationId': act_id})
    response.status_code = 202
    return response
def run_job(self, job_payload):
    """
    Run the job description against the selected environment.

    If the proxy is not ready, the backend is started and the method waits
    for the proxy. The payload is then POSTed to the proxy '/run' endpoint:
    directly with requests when running inside a lithops worker, otherwise
    with curl executed on the remote host through SSH.

    :param job_payload: dict holding at least 'executor_id' and 'job_id'
    :return: the 'activationId' field of the proxy JSON response
    """
    executor_id = job_payload['executor_id']
    job_id = job_payload['job_id']
    job_key = create_job_key(executor_id, job_id)
    log_file = os.path.join(LOGS_DIR, job_key + '.log')

    if not self._is_proxy_ready():
        # The VM instance is stopped
        if not self.log_active:
            print(
                'ExecutorID {} - Starting VM instance'.format(executor_id))
        init_time = time.time()
        self.backend.start()
        self._wait_proxy_ready()
        total_start_time = round(time.time() - init_time, 2)
        logger.info(
            'VM instance ready in {} seconds'.format(total_start_time))

    self._start_log_monitor(executor_id, job_id)

    logger.info('ExecutorID {} | JobID {} - Running job'.format(
        executor_id, job_id))
    logger.info("View execution logs at {}".format(log_file))

    # Both paths talk to the same proxy endpoint, so the URL is built once
    # from PROXY_SERVICE_PORT instead of hard-coding the port for curl.
    url = "http://{}:{}/run".format('127.0.0.1', PROXY_SERVICE_PORT)
    payload = json.dumps(job_payload)

    if self.is_lithops_worker:
        r = requests.post(url, data=payload, verify=True)
        response = r.json()
    else:
        cmd = ('curl -X POST {} -d {} '
               '-H \'Content-Type: application/json\''.format(
                   url, shlex.quote(payload)))
        out = self.ssh_client.run_remote_command(self.ip_address, cmd)
        response = json.loads(out)

    return response['activationId']
def _job_monitoring_rabbitmq(self, job):
    """
    Consume the status messages of the job from the RabbitMQ queue
    'lithops-<job_key>-1' and put one '#' token in self.token_bucket_q for
    every '__end__' message received while the monitor should run.

    Consuming stops when all the calls are done or when the 'should_run'
    flag of the job's monitor entry is cleared.

    :param job: job object exposing executor_id, job_id and total_calls
    """
    total_callids_done = 0
    job_key = create_job_key(job.executor_id, job.job_id)

    exchange = 'lithops-{}'.format(job_key)
    queue_1 = '{}-1'.format(exchange)

    params = pika.URLParameters(self.rabbit_amqp_url)
    connection = pika.BlockingConnection(params)

    def callback(ch, method, properties, body):
        nonlocal total_callids_done
        call_status = json.loads(body.decode("utf-8"))
        if call_status['type'] == '__end__':
            if self.monitors[job_key]['should_run']:
                self.token_bucket_q.put('#')
            total_callids_done += 1
        if total_callids_done == job.total_calls or \
                not self.monitors[job_key]['should_run']:
            ch.stop_consuming()

    try:
        channel = connection.channel()
        channel.basic_consume(callback, queue=queue_1, no_ack=True)
        channel.start_consuming()
    finally:
        # Release the connection once consuming stops or fails
        if connection.is_open:
            connection.close()
def clean(self,
          fs: Optional[Union[ResponseFuture, List[ResponseFuture]]] = None,
          cs: Optional[List[CloudObject]] = None,
          clean_cloudobjects: Optional[bool] = True,
          clean_fn: Optional[bool] = False,
          force: Optional[bool] = False):
    """
    Deletes all the temp files from storage. These files include the function,
    the data serialization and the function invocation results. It can also
    clean cloudobjects.

    What has to be removed is pickled into files inside CLEANER_DIR; the
    'lithops.scripts.cleaner' module is started in a new session unless a
    previously spawned cleaner process is still running.

    :param fs: List of futures to clean
    :param cs: List of cloudobjects to clean
    :param clean_cloudobjects: Delete all cloudobjects created with this executor
    :param clean_fn: Delete cached functions in this executor
    :param force: Clean all future objects even if they have not been completed
    """
    global CLEANER_PROCESS

    def save_data_to_clean(data):
        with tempfile.NamedTemporaryFile(dir=CLEANER_DIR, delete=False) as temp:
            pickle.dump(data, temp)

    if cs:
        data = {
            'cos_to_clean': list(cs),
            'storage_config': self.internal_storage.get_storage_config()
        }
        save_data_to_clean(data)
        if not fs:
            # NOTE(review): returns before the cleaner is spawned; the saved
            # data is only picked up by a later cleaner run - confirm intended.
            return

    if clean_fn:
        data = {
            'fn_to_clean': self.executor_id,
            'storage_config': self.internal_storage.get_storage_config()
        }
        save_data_to_clean(data)

    futures = fs or self.futures
    # isinstance keeps list subclasses as-is instead of wrapping them
    futures = futures if isinstance(futures, list) else [futures]
    present_jobs = {
        create_job_key(f.executor_id, f.job_id)
        for f in futures
        if (f.executor_id.count('-') == 1 and f.done) or force
    }
    jobs_to_clean = present_jobs - self.cleaned_jobs

    if jobs_to_clean:
        logger.info(
            f'ExecutorID {self.executor_id} - Cleaning temporary data')
        data = {
            'jobs_to_clean': jobs_to_clean,
            'clean_cloudobjects': clean_cloudobjects,
            'storage_config': self.internal_storage.get_storage_config()
        }
        save_data_to_clean(data)
        self.cleaned_jobs.update(jobs_to_clean)

    # poll() returns None while the previous cleaner is still running
    spawn_cleaner = not (CLEANER_PROCESS and CLEANER_PROCESS.poll() is None)
    if (jobs_to_clean or cs) and spawn_cleaner:
        cmd = [sys.executable, '-m', 'lithops.scripts.cleaner']
        CLEANER_PROCESS = sp.Popen(cmd, start_new_session=True)
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata, runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None, worker_processes=None,
                invoke_pool_threads=16):
    """
    Creates a new Job.

    Serializes the function and the iterdata, uploads the aggregated data
    (and the function plus its modules, unless a customized runtime is
    configured) through internal_storage and returns a namespace describing
    the job.

    :param config: lithops configuration dict
    :param internal_storage: storage handler used for the uploads
    :param executor_id: executor's ID
    :param job_id: job's ID
    :param func: function to serialize
    :param iterdata: list of inputs, one per call
    :param runtime_meta: runtime metadata; its 'preinstalls' entry is used
    :param runtime_memory: runtime memory (serverless mode only)
    :param extra_env: extra environment variables (dict) or None
    :param include_modules: modules to include in the serialization
    :param exclude_modules: modules to exclude from the serialization
    :param execution_timeout: timeout, defaults to the configured value
    :param host_job_meta: dict updated in place with timing and size stats;
        must already hold 'host_job_create_tstamp'
    :param chunksize: defaults to the configured value
    :param worker_processes: defaults to the configured value
    :param invoke_pool_threads: defaults to the configured value when falsy
    :return: SimpleNamespace with the job attributes
    :raises Exception: when the serialized data exceeds the data limit
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = worker_processes or config['lithops'][
        'worker_processes']
    job.execution_timeout = execution_timeout or config['lithops'][
        'execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless'][
            'invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless'][
            'runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        # Keep the execution timeout below the runtime timeout
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    # The order of the following checks matters: inc_modules ends up being
    # None whenever include_modules is None.
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['host_job_serialize_time'] = round(
        time.time() - job_serialize_start, 6)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    # data_limit is compared after multiplying by 1024**2; a falsy value
    # disables the check
    if data_limit and data_size_bytes > data_limit * 1024**2:
        # utils.sizeof_fmt, consistent with the call above
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_byte_ranges = utils.agg_data(data_strs)
    job.data_byte_ranges = data_byte_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()

    host_job_meta['host_data_upload_time'] = round(
        data_upload_end - data_upload_start, 6)
    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        with open(function_file, 'rb') as ff:
            function_hash = hashlib.md5(ff.read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(
            sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]

        # Not named 'uuid' to avoid shadowing a module of that name
        runtime_uuid = '{}{}'.format(function_hash, mod_hash)
        func_key = create_func_key(JOBS_PREFIX, runtime_uuid, "")

        _store_func_and_modules(func_key, func_str, module_data)

        job.ext_runtime_uuid = runtime_uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)

    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(
        func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(
        time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
def status(self, throw_except=True, internal_storage=None, check_only=False):
    """
    Return the status returned by the call.
    If the call raised an exception, this method will raise the same exception
    (unless throw_except is False).

    :param throw_except: Reraise exception if call raised. Default true.
    :param internal_storage: Storage handler to poll cloud storage. When None,
        an InternalStorage is built from the future's storage config.
    :param check_only: Return immediately after a single storage query, even
        if the status is not available yet (None). Default False.
    :return: The call status dict. None when the call raised an exception and
        throw_except is False.
    :raises ValueError: If the task has not been invoked yet.

    NOTE(review): the previous docstring also listed CancelledError and a
    TimeoutError after `timeout` seconds; neither a timeout parameter nor
    those raises are visible in this method - confirm before relying on them.
    """
    if self._state == ResponseFuture.State.New:
        raise ValueError("task not yet invoked")

    # Status already processed: return the cached value
    if self.success or self.done:
        return self._call_status

    if self.ready and self._new_futures:
        self._set_state(ResponseFuture.State.Done)
        return self._call_status

    # No final status known yet: fetch it from storage
    if self._call_status is None or self._call_status['type'] == '__init__':
        if internal_storage is None:
            internal_storage = InternalStorage(self._storage_config)
        check_storage_path(internal_storage.get_storage_config(),
                           self._storage_path)
        self._call_status = internal_storage.get_call_status(
            self.executor_id, self.job_id, self.call_id)
        self._status_query_count += 1

        if check_only:
            return self._call_status

        # Poll until the status is available
        while self._call_status is None:
            time.sleep(self.GET_RESULT_SLEEP_SECS)
            self._call_status = internal_storage.get_call_status(
                self.executor_id, self.job_id, self.call_id)
            self._status_query_count += 1
        self._host_status_done_tstamp = time.time()

    self.stats[
        'host_status_done_tstamp'] = self._host_status_done_tstamp or time.time(
        )
    self.stats['host_status_query_count'] = self._status_query_count
    self.activation_id = self._call_status['activation_id']

    if 'logs' in self._call_status:
        # Logs arrive base64-encoded and zlib-compressed; they are appended
        # to the job log file and to FN_LOG_FILE.
        self.logs = zlib.decompress(
            base64.b64decode(self._call_status['logs'].encode())).decode()
        job_key = create_job_key(self.executor_id, self.job_id)
        log_file = os.path.join(LOGS_DIR,
                                job_key + '.log')
        header = "Activation: '{}' ({})\n[\n".format(
            self.runtime_name, self.activation_id)
        tail = ']\n\n'
        output = self.logs.replace('\r', '').replace('\n', '\n ',
                                                     self.logs.count('\n') - 1)
        with open(log_file, 'a') as lf:
            lf.write(header + ' ' + output + tail)
        with open(FN_LOG_FILE, 'a') as lf:
            lf.write(header + ' ' + output + tail)

    if self._call_status['exception']:
        self._set_state(ResponseFuture.State.Error)
        # NOTE(review): eval + pickle.loads on data read from the call status;
        # only safe if the storage contents are trusted.
        self._exception = pickle.loads(eval(self._call_status['exc_info']))

        msg1 = (
            'ExecutorID {} | JobID {} - There was an exception - Activation '
            'ID: {}'.format(self.executor_id, self.job_id,
                            self.activation_id))

        if not self._call_status.get('exc_pickle_fail', False):
            fn_exctype = self._exception[0]
            fn_exc = self._exception[1]
            # Exceptions whose first argument is "HANDLER" carry the real
            # message as their second argument
            if fn_exc.args and fn_exc.args[0] == "HANDLER":
                self._handler_exception = True
                try:
                    del fn_exc.errno
                except Exception:
                    pass
                fn_exc.args = (fn_exc.args[1], )
        else:
            # The exception could not be pickled: rebuild a generic one from
            # the values stored in the unpickled dict
            fn_exctype = Exception
            fn_exc = Exception(self._exception['exc_value'])
            self._exception = (fn_exctype, fn_exc,
                               self._exception['exc_traceback'])

        def exception_hook(exctype, exc, trcbck):
            # Custom reporting only for the exception raised by this call;
            # anything else restores the default hook.
            if exctype == fn_exctype and str(exc) == str(fn_exc):
                logger.warning(msg1)
                if self._handler_exception:
                    msg2 = 'Exception: {} - {}'.format(
                        fn_exctype.__name__, fn_exc)
                    logger.warning(msg2)
                else:
                    traceback.print_exception(*self._exception)
            else:
                sys.excepthook = sys.__excepthook__
                traceback.print_exception(exctype, exc, trcbck)

        if throw_except:
            sys.excepthook = exception_hook
            reraise(*self._exception)
        else:
            logger.warning(msg1)
            msg2 = 'Exception: {} - {}'.format(self._exception[0].__name__,
                                               self._exception[1])
            logger.warning(msg2)
            return None

    # Copy into stats every status entry whose key contains one of these
    # substrings
    for key in self._call_status:
        if any(ss in key
               for ss in ['time', 'tstamp', 'count', 'size', 'container']):
            self.stats[key] = self._call_status[key]

    self.stats['worker_exec_time'] = round(
        self.stats['worker_end_tstamp'] - self.stats['worker_start_tstamp'],
        8)
    total_time = format(round(self.stats['worker_exec_time'], 2), '.2f')

    logger.debug(
        'ExecutorID {} | JobID {} - Got status from call {} - Activation '
        'ID: {} - Time: {} seconds'.format(self.executor_id, self.job_id,
                                           self.call_id, self.activation_id,
                                           str(total_time)))

    self._set_state(ResponseFuture.State.Success)

    if not self._call_status['result']:
        self._produce_output = False

    # Nothing to download: the future is done
    if not self._produce_output:
        self._set_state(ResponseFuture.State.Done)

    if 'new_futures' in self._call_status and not self._new_futures:
        # NOTE(review): eval + pickle.loads on data read from the call status;
        # only safe if the storage contents are trusted.
        new_futures = pickle.loads(eval(self._call_status['new_futures']))
        self._new_futures = [
            new_futures
        ] if type(new_futures) == ResponseFuture else new_futures
        self._set_state(ResponseFuture.State.Futures)

    return self._call_status
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata, runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None):
    """
    Creates a new Job.

    Serializes the function and the iterdata and returns a namespace
    describing the job. The function and its modules are uploaded through
    internal_storage (once per distinct content, tracked in FUNCTION_CACHE)
    unless a customized runtime is configured, in which case they are stored
    locally. The data is uploaded too, except when it is small enough to
    travel in the invocation payload of a FaaS backend.

    :param config: lithops configuration dict
    :param internal_storage: storage handler used for the uploads
    :param executor_id: executor's ID
    :param job_id: job's ID
    :param func: function (or callable object) to serialize
    :param iterdata: list of inputs, one per call
    :param runtime_meta: runtime metadata; its 'preinstalls' entry is used
    :param runtime_memory: runtime memory (serverless mode only)
    :param extra_env: extra environment variables (dict) or None
    :param include_modules: modules to include in the serialization
    :param exclude_modules: modules to exclude from the serialization
    :param execution_timeout: timeout, defaults to the configured value
    :param host_job_meta: dict updated in place with timing and size stats;
        must already hold 'host_job_create_tstamp'
    :param chunksize: defaults to the configured value
    :return: SimpleNamespace with the job attributes
    :raises Exception: when the serialized data exceeds the data limit
    """
    global FUNCTION_CACHE

    # Size in bytes (8KB) under which the data is passed through the
    # invocation payload. The same value is reported in the log message.
    payload_data_limit = 8 * 1024

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    mode = config['lithops']['mode']
    backend = config['lithops']['backend']

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = config[backend]['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    # Callable objects have no __name__: fall back to their class name
    job.function_name = func.__name__ if inspect.isfunction(func) or inspect.ismethod(func) else type(func).__name__
    job.total_calls = len(iterdata)

    if mode == SERVERLESS:
        job.runtime_memory = runtime_memory or config[backend]['runtime_memory']
        job.runtime_timeout = config[backend]['runtime_timeout']
        # Keep the execution timeout below the runtime timeout
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        # Equality, like the other branches: 'in' would be a substring test
        job.runtime_memory = None
        runtime_timeout = config[STANDALONE]['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = None

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    # The order of the following checks matters: inc_modules ends up being
    # None whenever include_modules is None.
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)

    host_job_meta['host_job_serialize_time'] = round(time.time()-job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    # Check data limit
    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    # data_limit is compared after multiplying by 1024**2; a falsy value
    # disables the check
    if data_limit and data_size_bytes > data_limit*1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, utils.sizeof_fmt(data_limit*1024**2)))
        raise Exception(log_msg)

    # Upload function and data
    upload_function = not config['lithops'].get('customized_runtime', False)
    # The size estimate uses the first serialized element times the chunksize
    upload_data = not (len(str(data_strs[0])) * job.chunksize < payload_data_limit
                       and backend in FAAS_BACKENDS)

    # Upload function and modules
    if upload_function:
        function_hash = hashlib.md5(func_module_str).hexdigest()
        job.func_key = create_func_key(executor_id, function_hash)
        if job.func_key not in FUNCTION_CACHE:
            logger.debug('ExecutorID {} | JobID {} - Uploading function and modules '
                         'to the storage backend'.format(executor_id, job_id))
            func_upload_start = time.time()
            internal_storage.put_func(job.func_key, func_module_str)
            func_upload_end = time.time()
            host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
            FUNCTION_CACHE.add(job.func_key)
        else:
            logger.debug('ExecutorID {} | JobID {} - Function and modules '
                         'found in local cache'.format(executor_id, job_id))
            host_job_meta['host_func_upload_time'] = 0

    else:
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        with open(function_file, 'rb') as ff:
            function_hash = hashlib.md5(ff.read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        job.func_key = func_key_suffix
        job.ext_runtime_uuid = '{}{}'.format(function_hash, mod_hash)
        job.local_tmp_dir = os.path.join(CUSTOM_RUNTIME_DIR, job.ext_runtime_uuid)
        _store_func_and_modules(job.local_tmp_dir, job.func_key, func_str, module_data)
        host_job_meta['host_func_upload_time'] = 0

    # upload data
    if upload_data:
        # Upload iterdata to the storage backend when the estimated data per
        # activation reaches 8KB or the backend is not a FaaS backend
        logger.debug('ExecutorID {} | JobID {} - Uploading data to the storage backend'
                     .format(executor_id, job_id))
        # pass iterdata through an object storage file
        data_key = create_data_key(executor_id, job_id)
        job.data_key = data_key
        data_bytes, data_byte_ranges = utils.agg_data(data_strs)
        job.data_byte_ranges = data_byte_ranges
        data_upload_start = time.time()
        internal_storage.put_data(data_key, data_bytes)
        data_upload_end = time.time()
        host_job_meta['host_data_upload_time'] = round(data_upload_end-data_upload_start, 6)

    else:
        # pass iterdata as part of the invocation payload
        logger.debug('ExecutorID {} | JobID {} - Data per activation is < '
                     '{}. Passing data through invocation payload'
                     .format(executor_id, job_id, utils.sizeof_fmt(payload_data_limit)))
        job.data_key = None
        job.data_byte_ranges = None
        job.data_byte_strs = data_strs
        host_job_meta['host_data_upload_time'] = 0

    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
def wait_rabbitmq(fs, internal_storage, rabbit_amqp_url, download_results=False,
                  throw_except=True, pbar=None, return_when=ALL_COMPLETED,
                  THREADPOOL_SIZE=128):
    """
    Wait for the Future instances `fs` to complete, receiving their call
    statuses through per-job RabbitMQ monitor threads.

    One monitor thread is started per job key found in `fs`; every status
    it receives is pushed to a shared queue that this function consumes
    until an '__end__' status has been received for every tracked call.
    Calls may report 'new_futures'; those futures are appended to `fs`
    and monitored as well.

    :param fs: A list of futures. It is extended in place with the new
               futures spawned by the monitored calls.
    :param internal_storage: Storage handler forwarded to the futures'
                             `status()` and `result()` calls.
    :param rabbit_amqp_url: amqp url for accessing rabbitmq.
    :param download_results: If True, the result of each finished call is
                             fetched in a thread pool, and only futures
                             that are not `done` are waited for. If False,
                             futures that are `ready` or `done` are skipped.
    :param throw_except: Forwarded to the futures' `status()`/`result()`
                         calls and to the timeout checker thread.
    :param pbar: Progress bar. Updated once per finished call.
    :param return_when: Only `ALL_COMPLETED` is supported.
    :param THREADPOOL_SIZE: Max number of threads used to fetch results.

    :return: `(fs, [])`: the (possibly extended) list of futures and an
             empty list of not-done futures.
    :rtype: 2-tuple of lists
    :raises NotImplementedError: if `return_when` is not `ALL_COMPLETED`.
    """
    if return_when != ALL_COMPLETED:
        raise NotImplementedError(return_when)

    thread_pool = ThreadPoolExecutor(max_workers=THREADPOOL_SIZE)
    present_jobs = {}   # job_key -> {call_id: future}
    done_call_ids = {}  # job_key -> {'total': int, 'call_ids': [finished call ids]}
    job_monitor_q = queue.Queue()

    def track_future(f):
        """Register future `f` under its job key and return that key."""
        f_job_key = create_job_key(f.executor_id, f.job_id)
        present_jobs.setdefault(f_job_key, {})[f.call_id] = f
        return f_job_key

    def start_job_monitor(monitored_job_key):
        """(Re)initialize the reception counters of a job and start its monitor thread."""
        total_calls = len(present_jobs[monitored_job_key])
        done_call_ids[monitored_job_key] = {'total': total_calls, 'call_ids': []}
        job_monitor = Thread(target=_job_monitor_thread,
                             args=(monitored_job_key, total_calls,
                                   rabbit_amqp_url, job_monitor_q))
        job_monitor.daemon = True
        job_monitor.start()

    for f in fs:
        if (download_results and not f.done) or \
           (not download_results and not (f.ready or f.done)):
            track_future(f)

    for job_key in present_jobs:
        start_job_monitor(job_key)

    # thread to check possible function activations unexpected errors.
    # It will raise a Timeout error if the status is not received after X seconds.
    running_futures = []
    ftc = Thread(target=_future_timeout_checker_thread,
                 args=(running_futures, job_monitor_q, throw_except))
    ftc.daemon = True
    ftc.start()

    def reception_finished():
        """
        Method to check if the call_status from all the function activations
        have been received.
        """
        return all(info['total'] is not None and
                   info['total'] <= len(info['call_ids'])
                   for info in done_call_ids.values())

    get_result_futures = []

    def get_result(f):
        f.result(throw_except=throw_except, internal_storage=internal_storage)

    while not reception_finished():
        call_status = job_monitor_q.get()

        rcvd_executor_id = call_status['executor_id']
        rcvd_job_id = call_status['job_id']
        rcvd_call_id = call_status['call_id']
        job_key = create_job_key(rcvd_executor_id, rcvd_job_id)

        fut = present_jobs[job_key][rcvd_call_id]
        fut._call_status = call_status
        fut.status(throw_except=throw_except, internal_storage=internal_storage)

        if call_status['type'] == '__init__':
            running_futures.append(fut)

        if call_status['type'] == '__end__':
            done_call_ids[job_key]['call_ids'].append(rcvd_call_id)

            if pbar:
                pbar.update(1)
                pbar.refresh()

            if 'new_futures' in call_status:
                new_futures = fut.result()
                fs.extend(new_futures)

                if pbar:
                    pbar.total = pbar.total + len(new_futures)
                    pbar.refresh()

                # Register every new future first, so that each job's total
                # is complete before its monitor thread is started.
                new_job_keys = {track_future(f) for f in new_futures}

                # Each spawned job must be monitored under its OWN job key,
                # not under the key of the call that spawned it.
                for new_job_key in new_job_keys:
                    start_job_monitor(new_job_key)

            elif download_results:
                gr_ft = thread_pool.submit(get_result, fut)
                get_result_futures.append(gr_ft)

    wait(get_result_futures)

    return fs, []
def function_handler(event):
    """
    Run one function activation described by `event` and report its status.

    The handler exports the environment variables required by the worker,
    sends an '__init__' call status, runs a JobRunner (in a Thread when
    '__LITHOPS_LOCAL_EXECUTION' is set, otherwise in a Process) for at most
    `event['execution_timeout']` seconds, merges the statistics the
    JobRunner wrote to its stats file into the call status and finally
    sends an '__end__' call status.

    Any exception raised while handling the activation (version mismatch,
    timeout, missing completion message, ...) is not propagated: it is
    printed to stdout and reported through the 'exception' and 'exc_info'
    fields of the call status.

    :param event: dict describing the activation. Keys read here: 'config',
                  'call_id', 'job_id', 'executor_id', 'runtime_name',
                  'runtime_memory', 'execution_timeout', 'func_key',
                  'data_key', 'data_byte_range', 'host_submit_tstamp',
                  'lithops_version' and, optionally, 'extra_env'.
    :return: None
    """
    start_tstamp = time.time()

    logger.debug("Action handler started")

    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)

    os.environ.update({'LITHOPS_WORKER': 'True',
                       'PYTHONUNBUFFERED': 'True'})

    config = event['config']
    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']

    job_key = create_job_key(executor_id, job_id)
    logger.info("Execution ID: {}/{}".format(job_key, call_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']

    logger.debug("Runtime name: {}".format(runtime_name))
    if runtime_memory:
        logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['worker_start_tstamp'] = start_tstamp
    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__LITHOPS_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if version.__version__ != event['lithops_version']:
            msg = ("Lithops version mismatch. "
                   "Host version: {} - Runtime version: {}"
                   .format(event['lithops_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {
            'LITHOPS_CONFIG': json.dumps(config),
            '__LITHOPS_SESSION_ID': '-'.join([job_key, call_id]),
            'PYTHONPATH': "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
        }
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(LITHOPS_TEMP_DIR,
                                           storage_config['bucket'],
                                           JOBS_PREFIX, job_key, call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir,
                                                'jobrunner.stats.txt')

        jobrunner_config = {
            'lithops_config': config,
            'call_id': call_id,
            'job_id': job_id,
            'executor_id': executor_id,
            'func_key': func_key,
            'data_key': data_key,
            'data_byte_range': data_byte_range,
            'output_key': create_output_key(JOBS_PREFIX, executor_id,
                                            job_id, call_id),
            'stats_filename': jobrunner_stats_filename
        }

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn,
                              internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(
            os.environ.get('__LITHOPS_LOCAL_EXECUTION', 'False'))
        # Local executions share the worker process, so the JobRunner
        # runs in a thread there instead of in a child process.
        if local_execution:
            jrp = Thread(target=jobrunner.run)
        else:
            jrp = Process(target=jobrunner.run)
        jrp.start()

        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        timed_out = jrp.is_alive()
        if timed_out:
            # If process is still alive after jrp.join(execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # Best effort: a Thread does not have a terminate method
                pass

        # Stop the memory monitor before reporting a timeout; otherwise its
        # thread is never told to stop and keeps running after the handler
        # returns.
        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if timed_out:
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by jobrunner when it finishes.
            # If no message, this means that the jobrunner process was killed.
            # 99% of times the jobrunner is killed due an OOM, so we assume here an OOM.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for line in fid:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines in the stats file
                        continue
                    key, value = line.split(" ", 1)
                    if key in ('exception', 'exc_pickle_fail',
                               'result', 'new_futures'):
                        # NOTE(security): eval() executes whatever the stats
                        # file contains. This presumably is only ever written
                        # by the JobRunner started above; if these values are
                        # always Python literals, ast.literal_eval would be a
                        # safer replacement. TODO confirm and replace.
                        call_status.response[key] = eval(value)
                        continue
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        # Not a number: keep the raw string
                        call_status.response[key] = value

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()
        try:
            call_status.send('__end__')
        finally:
            # Unset specific env vars even if the final status could not be
            # sent, so they do not leak into a later activation that reuses
            # this process.
            for key in extra_env:
                os.environ.pop(key, None)
            os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)

        logger.info("Finished")