Example No. 1
    def wait_for_deployment_to_complete(self, wait_seconds=5, log_output=True):
        """
        Waits for the job to complete, periodically checking the job's status.

        Arguments:
            wait_seconds {float} -- The number of seconds to wait between job status checks (defaults to 5)
            log_output {bool} -- Whether to log progress messages while waiting (defaults to True)

        Returns:
            - This method doesn't return a value.

        Raises:
            - This method doesn't raise any exceptions.

        Notes:
            A job is complete when it finishes running, whether it succeeds or fails. This method waits for
            either of these outcomes. It is the user's responsibility to ensure the job is not written in a
            way that runs forever.
        """

        import time
        from foundations_contrib.global_state import log_manager

        log = log_manager.get_logger(__name__)

        while not self.is_job_complete():
            if log_output:
                log.info("waiting for job `" + self.job_name() + "` to finish")
            time.sleep(wait_seconds)

        if log_output:
            log.info("job `" + self.job_name() + "` completed")
Example No. 2
    def _log_message(self, route_name, message, metadata):
        from foundations_contrib.global_state import log_manager

        logger = log_manager.get_logger(__name__)
        log_message = f'{route_name} {message}'
        if metadata is not None:
            log_message += f' {metadata}'
        logger.debug(log_message)
Example No. 3
def deploy_job(pipeline_context_wrapper, job_name, job_params):
    from foundations_contrib.global_state import deployment_manager, log_manager
    from foundations_contrib.deployment_wrapper import DeploymentWrapper

    logger = log_manager.get_logger(__name__)
    logger.info("Job submission started. Ctrl-C to cancel.")

    job_deployment = deployment_manager.simple_deploy(pipeline_context_wrapper, job_name, job_params)
    return DeploymentWrapper(job_deployment)
Example No. 4
def save_artifact(filepath, key=None):
    from foundations_contrib.global_state import log_manager, current_foundations_context

    logger = log_manager.get_logger(__name__)
    foundations_context = current_foundations_context()

    if not foundations_context.is_in_running_job():
        logger.warning('Cannot save artifact outside of job.')
    else:
        job_id = foundations_context.job_id()

        artifact_saver = _ArtifactSaver(logger, filepath, job_id, key)
        artifact_saver.save_artifact()
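A minimal usage sketch, assuming save_artifact is exposed at the top level of the foundations package and is called from code running inside a job; the file paths and key are illustrative:

import foundations

foundations.save_artifact('output/model.pkl')                    # stored without an explicit key
foundations.save_artifact('output/metrics.json', key='metrics')  # stored under the key 'metrics'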
Example No. 5
    def log_metric(self, key, value):
        from foundations_contrib.global_state import log_manager
        from foundations_events.producers.metric_logged import MetricLogged

        if self._is_job_running():
            metric_logged_producer = MetricLogged(self._message_router,
                                                  self._project_name(),
                                                  self._job_id(), key, value)
            metric_logged_producer.push_message()
        elif not log_manager.foundations_not_running_warning_printed():
            logger = log_manager.get_logger(__name__)
            logger.warning('Script not run with Foundations.')
            log_manager.set_foundations_not_running_warning_printed()
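For context, a hedged usage sketch, assuming the public foundations.log_metric helper delegates to the method above; the metric names and values are illustrative:

import foundations

foundations.log_metric('train_accuracy', 0.92)  # inside a running job this pushes a MetricLogged message
foundations.log_metric('epochs', 10)            # outside a job, only a one-time warning is logged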
Example No. 6
def stream_job_logs(deployment):
    from foundations_contrib.global_state import log_manager
    from os import environ
    import time
    
    logger = log_manager.get_logger(__name__)
    if environ.get('DISABLE_LOG_STREAMING', 'False') == 'False':
        logger.info('Job queued. Ctrl-C to stop streaming - job will not be interrupted or cancelled.')
        job_running = False

        time.sleep(1)

        try:
            for item in deployment.stream_job_logs():
                if not job_running:
                    logger.info('Job running, streaming logs.')
                    job_running = True
                if 'RuntimeError' in item:
                    import sys
                    sys.exit(item)
                print(item)

            try:
                counter = 0
                timeout = 15
                job_status = deployment.get_true_job_status()

                while (job_status == 'running' or job_status is None) and counter < timeout:
                    time.sleep(1)
                    counter += 1
                    job_status = deployment.get_true_job_status()

                if job_status == 'failed':
                    logger.error("Job '{}' has failed.".format(deployment.job_name()))
                elif job_status == 'completed':
                    logger.info("Job '{}' has completed.".format(deployment.job_name()))
                elif job_status == 'running':
                    logger.info("Job '{}' is running, see GUI for full logs and status.".format(deployment.job_name()))
                else:
                    logger.warning("Job status of job '{}' is unknown.".format(deployment.job_name()))
            except AttributeError:
                logger.info("Job '{}' has finished.".format(deployment.job_name()))

        except TimeoutError:
            logger.info('Job cannot be found. Possibly because it has been removed from the queue.')
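Whether logs are streamed at all is gated on the DISABLE_LOG_STREAMING environment variable checked at the top of this function; a minimal sketch of disabling streaming before submission (the variable name comes from the code above, the rest is illustrative):

import os

os.environ['DISABLE_LOG_STREAMING'] = 'True'  # any value other than 'False' makes stream_job_logs return immediately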
Example No. 7
def _get_logger():
    from foundations_contrib.global_state import log_manager

    return log_manager.get_logger(__name__)
Example No. 8
def submit(arguments):
    from foundations_core_cli.job_submission.config import load
    from foundations_core_cli.job_submission.deployment import deploy
    from foundations_core_cli.job_submission.logs import stream_job_logs
    from foundations_internal.change_directory import ChangeDirectory
    from foundations_contrib.global_state import config_manager, log_manager
    from foundations_contrib.set_job_resources import set_job_resources
    from jsonschema import validate
    import os
    import os.path
    import yaml

    current_directory = os.getcwd()
    with ChangeDirectory(arguments.job_directory or current_directory):
        load(arguments.scheduler_config or 'scheduler')

        job_config = {}
        if os.path.exists('job.config.yaml'):
            with open('job.config.yaml') as file:
                job_config = yaml.load(file.read(), Loader=yaml.FullLoader)

        # validate(instance=job_config, schema=_job_schema)

        job_resource_args = {}

        if 'log_level' in job_config:
            config_manager['log_level'] = job_config['log_level']
        if 'worker' in job_config:
            config_manager['worker_container_overrides'].update(
                job_config['worker'])
        if 'num_gpus' in job_config:
            job_resource_args['num_gpus'] = job_config['num_gpus']
        if 'ram' in job_config:
            job_resource_args['ram'] = job_config['ram']

        logger = log_manager.get_logger(__name__)

        if arguments.command:
            config_manager['worker_container_overrides'][
                'args'] = arguments.command
            if not os.path.exists(arguments.command[0]):
                logger.warning(
                    f"Hey, seems like your command '{arguments.command[0]}' is not an existing file in your current directory. If you are using Atlas's advanced custom docker image functionality and know what you are doing, you can ignore this message."
                )
        else:
            logger.warning('No command was specified.')

        if arguments.num_gpus is not None:
            job_resource_args['num_gpus'] = arguments.num_gpus
        if arguments.ram is not None:
            job_resource_args['ram'] = arguments.ram
        set_job_resources(**job_resource_args)

        from foundations.global_state import current_foundations_context
        try:
            cur_job_id = current_foundations_context().pipeline_context(
            ).file_name
        except ValueError:
            cur_job_id = None

        deployment = deploy(
            arguments.project_name or job_config.get('project_name'),
            arguments.entrypoint or job_config.get('entrypoint'),
            arguments.params or job_config.get('params'))

        if arguments.stream_job_logs:
            try:
                stream_job_logs(deployment)
            except KeyboardInterrupt:
                pass

        if cur_job_id is not None:
            current_foundations_context().pipeline_context(
            ).file_name = cur_job_id

        return deployment
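The submit command above optionally reads a job.config.yaml from the job directory; a minimal sketch of writing one with the keys the code looks for (all values are illustrative):

import yaml

job_config = {
    'project_name': 'my-project',          # fallback when arguments.project_name is not given
    'entrypoint': 'main.py',               # fallback when arguments.entrypoint is not given
    'params': {'learning_rate': 0.01},     # fallback when arguments.params is not given
    'log_level': 'INFO',                   # copied into config_manager['log_level']
    'worker': {'image': 'custom:latest'},  # merged into worker_container_overrides (illustrative override)
    'num_gpus': 0,                         # forwarded to set_job_resources
    'ram': 1024,                           # forwarded to set_job_resources
}

with open('job.config.yaml', 'w') as config_file:
    yaml.safe_dump(job_config, config_file)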
Example No. 9
    def stream_job_logs(self, strip_new_line=True):
        import requests
        import time

        status = self.get_job_status()
        counter = 0
        timeout = 15

        while status == "queued" or status is None:
            time.sleep(1)
            if status is None:
                counter += 1
            if counter >= timeout:
                raise TimeoutError('Job timed out')
            status = self.get_job_status()

        if status == "running":
            r = requests.get(
                f"{self._config['scheduler_url']}/running_jobs/{self._job_id}/container_id",
                headers={"Authorization": f"bearer {user_token()}"})
            if r.status_code == requests.codes.ok:
                import docker
                from docker.errors import APIError
                from requests.exceptions import ConnectionError

                try:
                    client = docker.from_env()
                    container = client.containers.get(r.json())
                    log_stream = container.logs(stream=True)

                    for line in log_stream:
                        if strip_new_line:
                            line = line.decode().strip('\n')
                        else:
                            line = line.decode()
                        yield line

                except APIError as e:
                    from foundations_contrib.global_state import log_manager
                    logger = log_manager.get_logger(__name__)
                    logger.warning(
                        f"Could not find local container for job {self._job_id}. The job may have already completed or was submitted to a remote machine. Please see the GUI for full job logs and status."
                    )

                except ConnectionError as e:
                    from foundations_contrib.global_state import log_manager
                    logger = log_manager.get_logger(__name__)
                    logger.warning(
                        f"Could not connect to local Docker engine for job {self._job_id}. You can ignore this warning if the job was submitted to a remote machine. Please see the GUI for full job logs and status."
                    )

            else:
                # try and see if it completed in between requests
                status = "completed"

        if status == "completed":
            from io import StringIO
            log_stream = StringIO(self.get_job_logs())

            for line in log_stream:
                if strip_new_line:
                    line = line.strip('\n')
                yield line
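A short consumer sketch for the generator above, assuming deployment is an instance of the class this method belongs to; strip_new_line=False keeps the original line endings:

for line in deployment.stream_job_logs(strip_new_line=False):
    print(line, end='')  # newlines are already present, so avoid adding another one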
Example No. 10
    def _log_missing_job_id_for_upload(self):
        from foundations_contrib.global_state import log_manager

        foundations_syncable_directory_logger = log_manager.get_logger(__name__)
        foundations_syncable_directory_logger.warning('local_job_id required for uploading artifacts')