Example #1
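This handler runs a single Lithops job in a separate process: it sets up logging and the job environment, spawns a JobRunner bounded by the execution timeout, optionally records peak memory usage, and reports stats and compressed logs back through CallStatus.
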
def run_job(job):
    """
    Runs a single job within a separate process
    """
    start_tstamp = time.time()
    setup_lithops_logger(job.log_level)

    logger.info("Lithops v{} - Starting execution".format(__version__))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))
    logger.debug("Runtime name: {}".format(job.runtime_name))
    if job.runtime_memory:
        logger.debug("Runtime memory: {}MB".format(job.runtime_memory))
    logger.debug("Function timeout: {}s".format(job.execution_timeout))

    env = job.extra_env
    env['LITHOPS_WORKER'] = 'True'
    env['PYTHONUNBUFFERED'] = 'True'
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['PYTHONPATH'] = "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(job.config, internal_storage)
    call_status.response['worker_start_tstamp'] = start_tstamp
    call_status.response['host_submit_tstamp'] = job.host_submit_tstamp
    call_status.response['call_id'] = job.call_id
    call_status.response['job_id'] = job.job_id
    call_status.response['executor_id'] = job.executor_id

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if __version__ != job.lithops_version:
            msg = (
                "Lithops version mismatch. Host version: {} - Runtime version: {}"
                .format(job.lithops_version, __version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        job.jr_stats_file = os.path.join(job.job_dir, 'jobrunner.stats.txt')
        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        # Use a Process on Unix systems so it can be terminated on timeout;
        # fall back to a Thread elsewhere
        jrp = (Process(target=jobrunner.run) if is_unix_system()
               else Thread(target=jobrunner.run))
        jrp.start()

        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after the timed join above,
            # kill it
            try:
                jrp.terminate()
            except Exception:
                # a Thread has no terminate() method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # In 99% of cases it is killed by an OOM, so assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.jr_stats_file):
            with open(job.jr_stats_file, 'r') as fid:
                for line in fid:
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in ['exception', 'exc_pickle_fail',
                               'result', 'new_futures']:
                        # these keys hold Python literals written by the
                        # jobrunner, so they are evaluated rather than
                        # stored as strings
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()

        with open(job.log_file, 'rb') as lf:
            log_str = base64.b64encode(zlib.compress(lf.read())).decode()
            call_status.response['logs'] = log_str

        call_status.send('__end__')

        # Unset specific env vars
        for key in job.extra_env:
            os.environ.pop(key, None)
        os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)

        logger.info("Finished")
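
A note on memory_monitor_worker, which the handler starts in a thread but
which is not shown here: it must loop until 'STOP' arrives on its end of the
Pipe and then send back the peak memory usage in bytes. A minimal sketch of
such a worker, assuming psutil is available (the function name and the
STOP/peak protocol are taken from the handler above; the actual Lithops
implementation may differ):

import psutil


def memory_monitor_worker(mm_conn, delay=0.01):
    """Polls the RSS of the current process tree and reports the peak."""
    peak = 0

    def process_tree_rss():
        parent = psutil.Process()
        children = parent.children(recursive=True)
        return sum(p.memory_info().rss for p in [parent] + children)

    # poll() doubles as the sampling delay; loop until 'STOP' arrives
    while not mm_conn.poll(delay):
        peak = max(peak, process_tree_rss())
    mm_conn.recv()      # consume the 'STOP' message
    mm_conn.send(peak)  # read by the handler as peak_memory_usage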
Example #2
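A variant of the same handler driven by an event dictionary, as a serverless entry point would receive it: it assembles the JobRunner configuration explicitly and runs the JobRunner in a thread for local execution, in a separate process otherwise.
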
def function_handler(event):
    start_tstamp = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)

    os.environ.update({'LITHOPS_FUNCTION': 'True', 'PYTHONUNBUFFERED': 'True'})
    os.environ.pop('LITHOPS_TOTAL_EXECUTORS', None)

    config = event['config']
    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    exec_id = "{}/{}/{}".format(executor_id, job_id, call_id)
    logger.info("Execution-ID: {}".format(exec_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']
    logger.debug("Runtime name: {}".format(runtime_name))
    logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['worker_start_tstamp'] = start_tstamp
    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__PW_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if version.__version__ != event['lithops_version']:
            msg = (
                "Lithops version mismatch. Host version: {} - Runtime version: {}"
                .format(event['lithops_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {
            'LITHOPS_CONFIG': json.dumps(config),
            'LITHOPS_EXECUTION_ID': exec_id,
            'PYTHONPATH': "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
        }
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(STORAGE_FOLDER,
                                           storage_config['bucket'],
                                           JOBS_PREFIX, executor_id, job_id,
                                           call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir,
                                                'jobrunner.stats.txt')

        jobrunner_config = {
            'lithops_config': config,
            'call_id': call_id,
            'job_id': job_id,
            'executor_id': executor_id,
            'func_key': func_key,
            'data_key': data_key,
            'log_level': log_level,
            'data_byte_range': data_byte_range,
            'output_key': create_output_key(JOBS_PREFIX, executor_id,
                                            job_id, call_id),
            'stats_filename': jobrunner_stats_filename
        }

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn,
                              internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(
            os.environ.get('__PW_LOCAL_EXECUTION', 'False'))
        # Run the JobRunner in a Thread for local execution, in a separate
        # Process otherwise
        jrp = (Thread(target=jobrunner.run) if local_execution
               else Process(target=jobrunner.run))
        jrp.start()

        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after the timed join above,
            # kill it
            try:
                jrp.terminate()
            except Exception:
                # a Thread has no terminate() method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # In 99% of cases it is killed by an OOM, so assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for line in fid:
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in ['exception', 'exc_pickle_fail',
                               'result', 'new_futures']:
                        # these keys hold Python literals written by the
                        # jobrunner, so they are evaluated rather than
                        # stored as strings
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()
        call_status.send('__end__')

        # Unset job-specific env vars (default avoids KeyError if already
        # unset)
        for key in extra_env:
            os.environ.pop(key, None)

        logger.info("Finished")
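
Both examples parse a jobrunner.stats.txt file written by the JobRunner: one
"key value" pair per line, split on the first space, with a few keys holding
Python literals that the handler eval()s. A hypothetical writer illustrating
the format the parsing code implies (the real JobRunner may write it
differently):

def write_stats(stats_filename, stats):
    # stats: dict of metric name -> value; values for keys such as
    # 'exception' or 'result' must be valid Python literals (hence repr)
    with open(stats_filename, 'w') as fid:
        for key, value in stats.items():
            if key in ['exception', 'exc_pickle_fail', 'result',
                       'new_futures']:
                value = repr(value)
            fid.write('{} {}\n'.format(key, value))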
Example #3
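A later revision of run_job: call status is built through a helper (create_call_status) and updated via add() and explicit init/finish events, and KeyboardInterrupt is caught so an interrupted job skips the final status upload.
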
def run_job(job):
    """
    Runs a single job within a separate process
    """
    setup_lithops_logger(job.log_level)

    backend = os.environ.get('__LITHOPS_BACKEND', '')
    logger.info("Lithops v{} - Starting {} execution".format(
        __version__, backend))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))

    env = job.extra_env
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    call_status = create_call_status(job, internal_storage)

    if job.runtime_memory:
        logger.debug('Runtime: {} - Memory: {}MB - Timeout: {} seconds'.format(
            job.runtime_name, job.runtime_memory, job.execution_timeout))
    else:
        logger.debug('Runtime: {} - Timeout: {} seconds'.format(
            job.runtime_name, job.execution_timeout))

    job_interrupted = False

    try:
        # send init status event
        call_status.send_init_event()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        # Use a Process on Unix systems so it can be terminated on timeout;
        # fall back to a Thread elsewhere
        jrp = (Process(target=jobrunner.run) if is_unix_system()
               else Thread(target=jobrunner.run))
        jrp.start()
        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after the timed join above,
            # kill it
            try:
                jrp.terminate()
            except Exception:
                # a Thread has no terminate() method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only one message is returned by the jobrunner when it finishes.
            # If there is no message, the jobrunner process was killed.
            # In 99% of cases it is killed by an OOM, so assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.stats_file):
            with open(job.stats_file, 'r') as fid:
                for line in fid:
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.add(key, float(value))
                    except Exception:
                        call_status.add(key, value)
                    if key in ['exception', 'exc_pickle_fail', 'result']:
                        # these keys hold Python literals written by the
                        # jobrunner
                        call_status.add(key, eval(value))

    except KeyboardInterrupt:
        job_interrupted = True
        logger.debug("Job interrupted")

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.add('exception', True)

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.add('exc_info', str(pickled_exc))

    finally:
        if not job_interrupted:
            call_status.add('worker_end_tstamp', time.time())

            # Flush log stream and save it to the call status
            job.log_stream.flush()
            if os.path.isfile(job.log_file):
                with open(job.log_file, 'rb') as lf:
                    log_str = base64.b64encode(zlib.compress(
                        lf.read())).decode()
                    call_status.add('logs', log_str)

            call_status.send_finish_event()

        logger.info("Finished")
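
The timeout handling shared by all three handlers distills to a reusable
pattern: run the work in a child Process, join with a timeout, and terminate
the child if it is still alive. A standalone sketch of that pattern (names are
illustrative, not part of Lithops):

import time
from multiprocessing import Process


def run_with_timeout(target, timeout):
    p = Process(target=target)
    p.start()
    p.join(timeout)
    if p.is_alive():
        # Only a Process can be force-killed; a Thread cannot, which is
        # why the handlers wrap terminate() in a try/except
        p.terminate()
        p.join()
        raise TimeoutError('work exceeded {} seconds'.format(timeout))


def _work():
    time.sleep(1)


if __name__ == '__main__':
    run_with_timeout(_work, timeout=5)  # completes within the limit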