Example #1
def set_job_resources(num_gpus=0, ram=None):
    """
    Specifies the resources to run a job with. The amounts available depend on the infrastructure that the Foundations job orchestrator is set up on.

    Arguments:
        num_gpus {int} -- The number of GPUs to run the job with.  Set to 0 to run with CPU resources instead.  Defaults to 0.
        ram {number} -- The amount of RAM in GB to use while running the job. Must be greater than 0, or None.  If None, no limit is set.

    Returns:
        - This function doesn't return a value.

    Raises:
        ValueError -- If the RAM quantity is not None and not greater than zero, or if the GPU quantity is not a non-negative integer.

    Notes:
        Setting the resources for a job from a given notebook or driver file causes any additional jobs (e.g. a hyperparameter search) deployed from the same file and process to use the same resources, unless specified otherwise.
        To clear the resource settings and restore the defaults (CPU only, no RAM limit), call set_job_resources(0, None).
    """
    if ram is not None and ram <= 0:
        raise ValueError(
            'Invalid RAM quantity. Please provide a RAM quantity greater than zero.'
        )

    if not isinstance(num_gpus, int) or num_gpus < 0:
        raise ValueError(
            'Invalid GPU quantity. Please provide a non-negative integer GPU quantity.'
        )

    job_resources = JobResources(num_gpus, ram)
    current_foundations_context().set_job_resources(job_resources)
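A minimal usage sketch of the function above, assuming set_job_resources is re-exported at the top level of the foundations package (as the driver scripts in the later examples suggest):

import foundations  # assumption: package-level export of set_job_resources

# Run subsequent jobs deployed from this driver file with 1 GPU and an 8 GB RAM limit.
foundations.set_job_resources(num_gpus=1, ram=8)

# Invalid quantities raise ValueError before any state is changed,
# as the tests in Examples #6 and #28 verify.
try:
    foundations.set_job_resources(num_gpus=-1, ram=8)
except ValueError as error:
    print(error)

# Restore the defaults: CPU only, no RAM limit.
foundations.set_job_resources(num_gpus=0, ram=None)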
Example #2
def deploy(project_name, entrypoint, params):
    import os
    import os.path as path
    import json

    from foundations_contrib.job_deployer import deploy_job
    from foundations_contrib.global_state import (
        current_foundations_context,
        redis_connection,
        config_manager,
    )
    from foundations_internal.pipeline_context_wrapper import PipelineContextWrapper

    if project_name is None:
        project_name = path.basename(os.getcwd())

    current_foundations_context().set_project_name(project_name)
    config_manager["run_script_environment"] = {
        "script_to_run": entrypoint,
        "enable_stages": False,
    }

    current_foundations_context().pipeline_context().provenance.user_name = (
        _get_user_name_from_token()
    )
    pipeline_context_wrapper = PipelineContextWrapper(
        current_foundations_context().pipeline_context()
    )

    if params is not None:
        with open("foundations_job_parameters.json", "w+") as params_file:
            json.dump(params, params_file)

    return deploy_job(pipeline_context_wrapper, None, {})
Example #3
def _log_metric_in_running_job(key, value):
    from foundations_contrib.global_state import message_router, current_foundations_context
    from foundations_events.producers.metric_logged import MetricLogged

    project_name = current_foundations_context().project_name()
    job_id = current_foundations_context().job_id()

    metric_logged_producer = MetricLogged(message_router, project_name, job_id,
                                          key, value)
    metric_logged_producer.push_message()
Example #4
def _log_param_in_running_job(key, value):
    from foundations_contrib.global_state import current_foundations_context, redis_connection

    project_name = current_foundations_context().project_name()
    job_id = current_foundations_context().job_id()

    _insert_parameter_name_into_projects_params_set(redis_connection,
                                                    project_name, key)
    _insert_input_parameter_name_into_projects_input_params_set(
        redis_connection, project_name, key)
    _insert_parameter_value_into_job_run_data(redis_connection, job_id, key,
                                              value)
    _insert_input_parameter_name_into_job_input_parameter_data(
        redis_connection, job_id, key)
Example #5
def _at_exit_callback():
    from foundations_contrib.global_state import (
        current_foundations_context,
        message_router,
    )
    from foundations_contrib.archiving.upload_artifacts import upload_artifacts
    from foundations_events.producers.jobs import CompleteJob
    from foundations_events.producers.jobs import FailedJob

    global _exception_happened

    pipeline_context = current_foundations_context().pipeline_context()
    upload_artifacts(pipeline_context.job_id)
    # This if/else block should be refactored at a later date

    if _exception_happened:
        FailedJob(
            message_router,
            pipeline_context,
            {
                "type": Exception,
                "exception": "",
                "traceback": []
            },
        ).push_message()
    else:
        CompleteJob(message_router, pipeline_context).push_message()
Example #6
    def test_ram_set_less_than_or_equal_to_zero_does_not_actually_set_job_resources(
            self):
        with self.assertRaises(ValueError) as error_context:
            set_job_resources(self.num_gpus, self.invalid_ram)

        job_resources = current_foundations_context().job_resources()
        self.assertEqual(self.default_job_resources, job_resources)
Example #7
File: utils.py Project: tomzhang/atlas-1
def log_warning_if_not_running_in_job(function_if_running_in_job, *args):
    from foundations_contrib.global_state import log_manager, current_foundations_context

    if current_foundations_context().is_in_running_job():
        function_if_running_in_job(*args)
    elif not log_manager.foundations_not_running_warning_printed():
        logger = log_manager.get_logger(__name__)
        logger.warning('Script not run with Foundations.')
        log_manager.set_foundations_not_running_warning_printed()
Example #8
    def set_up(self):
        from uuid import uuid4
        from foundations_events.producers.jobs import QueueJob
        from foundations_contrib.global_state import message_router, current_foundations_context

        foundations.set_project_name('default')
        self._job_id = str(uuid4())
        pipeline_context = current_foundations_context().pipeline_context()
        pipeline_context.file_name = self._job_id
        queue_job = QueueJob(message_router, pipeline_context)
        queue_job.push_message()
Example #9
    def _set_tags(klass, job_name, tags):
        from foundations_contrib.global_state import current_foundations_context
        from foundations import set_tag

        pipeline_context = current_foundations_context().pipeline_context()
        pipeline_context.file_name = job_name

        if tags is not None:
            for key, value in tags.items():
                set_tag(key, value)

        pipeline_context.file_name = None
Example #10
def create_syncable_directory(key, directory_path=None, source_job_id=None):
    from foundations.artifacts.syncable_directory import SyncableDirectory
    from foundations_contrib.global_state import current_foundations_context
    from tempfile import mkdtemp
    
    if directory_path is None:
        directory_path = mkdtemp()

    try:
        job_id = current_foundations_context().pipeline_context().file_name
    except ValueError:
        job_id = None
    return SyncableDirectory(key, directory_path, job_id, source_job_id or job_id)
Example #11
def save_artifact(filepath, key=None):
    from foundations_contrib.global_state import log_manager, current_foundations_context

    logger = log_manager.get_logger(__name__)
    foundations_context = current_foundations_context()

    if not foundations_context.is_in_running_job():
        logger.warning('Cannot save artifact outside of job.')
    else:
        job_id = foundations_context.job_id()

        artifact_saver = _ArtifactSaver(logger, filepath, job_id, key)
        artifact_saver.save_artifact()
Example #12
def set_up_job_environment():
    from foundations_events.producers.jobs import QueueJob
    from foundations_events.producers.jobs import RunJob
    from foundations_contrib.global_state import (
        current_foundations_context,
        message_router,
        config_manager,
    )
    import atexit

    config_manager["_is_deployment"] = True
    _get_logger().debug(
        f"Foundations has been run with the following configuration:\n"
        f"{yaml.dump(config_manager.config(), default_flow_style=False)}")
    pipeline_context = current_foundations_context().pipeline_context()
    _set_job_state(pipeline_context)

    QueueJob(message_router, pipeline_context).push_message()
    RunJob(message_router, pipeline_context).push_message()

    atexit.register(_at_exit_callback)
    _set_up_exception_handling()
Example #13
def _config():
    from uuid import uuid4
    from os import getcwd
    from foundations_contrib.global_state import (
        config_manager,
        current_foundations_context,
    )
    from foundations_contrib.local_file_system_pipeline_archive import LocalFileSystemPipelineArchive
    from foundations_contrib.local_file_system_pipeline_listing import LocalFileSystemPipelineListing

    # ensure a job uuid is set
    current_foundations_context().pipeline_context().file_name = "integration-test-job"

    # separates test runs
    test_uuid = uuid4()

    # below is used to create archives for all different types
    archive_root = getcwd() + "/tmp/archives_{}".format(test_uuid)

    archive_implementation = {
        "archive_type": LocalFileSystemPipelineArchive,
        "constructor_arguments": [archive_root],
    }
    config_manager["archive_listing_implementation"] = {
        "archive_listing_type": LocalFileSystemPipelineListing,
        "constructor_arguments": [archive_root],
    }
    config_manager[
        "persisted_data_archive_implementation"] = archive_implementation
    config_manager[
        "provenance_archive_implementation"] = archive_implementation
    config_manager[
        "job_source_archive_implementation"] = archive_implementation
    config_manager["artifact_archive_implementation"] = archive_implementation
    config_manager[
        "miscellaneous_archive_implementation"] = archive_implementation
Example #14
File: main.py Project: tomzhang/atlas-1
import foundations
from foundations_contrib.global_state import current_foundations_context, redis_connection

foundations.log_metric('ugh', 10)

with open('thomas_text.txt', 'w') as f:
    f.write('ugh_square')

foundations.save_artifact('thomas_text.txt', 'just_some_artifact')
foundations.log_param('blah', 20)

redis_connection.set('foundations_testing_job_id', current_foundations_context().pipeline_context().job_id)
Example #15
    def _create_job_spec(self, job_mount_path, working_dir_root_path,
                         job_results_root_path, container_config_root_path,
                         job_id, project_name, username,
                         worker_container_overrides):
        from foundations_contrib.global_state import current_foundations_context

        worker_container = {
            'volumes': {
                job_mount_path: {
                    "bind": "/job",
                    "mode": "rw"
                },
                job_results_root_path: {
                    "bind": job_results_root_path,
                    "mode": "rw"
                },
                container_config_root_path: {
                    "bind": "/root/.foundations/config",
                    "mode": "rw"
                },
                working_dir_root_path: {
                    "bind": working_dir_root_path,
                    "mode": "rw"
                }
            },
            "working_dir": "/job/job_source",
            'environment': {
                "FOUNDATIONS_USER": username,
                "FOUNDATIONS_JOB_ID": job_id,
                "FOUNDATIONS_PROJECT_NAME": project_name,
                "PYTHONPATH": "/job/",
                "FOUNDATIONS_HOME": "/root/.foundations/",
                "FOUNDATIONS_TOKEN": user_token()
            },
            "network": "foundations-atlas"
        }

        if current_foundations_context().job_resources().ram is not None:
            worker_container['mem_limit'] = int(
                current_foundations_context().job_resources().ram)

        if (current_foundations_context().job_resources().num_gpus is not None
                and
                current_foundations_context().job_resources().num_gpus > 0):
            worker_container[
                'image'] = 'us.gcr.io/dessa-atlas/worker-gpu:latest'
            worker_container['runtime'] = 'nvidia'

        else:
            worker_container['image'] = 'us.gcr.io/dessa-atlas/worker:latest'
            worker_container['runtime'] = 'runc'

        for override_key in ['command', 'image', 'working_dir', 'entrypoint']:
            if override_key in worker_container_overrides:
                worker_container[override_key] = worker_container_overrides[
                    override_key]
        if self._config['run_script_environment']['script_to_run']:
            worker_container['entrypoint'] = self._config[
                'run_script_environment']['script_to_run']

        if 'args' in worker_container_overrides:
            worker_container['command'] = worker_container_overrides['args']

        for override_key in ['environment', 'volumes']:
            if override_key in worker_container_overrides:
                worker_container[override_key] = {
                    **worker_container[override_key],
                    **worker_container_overrides[override_key]
                }

        if 'resources' in worker_container_overrides:
            # the base spec above never sets a 'resources' key, so create it
            # before merging the override limits/requests into it
            worker_container.setdefault('resources', {'limits': {}, 'requests': {}})
            for override_key in ['limits', 'requests']:
                if override_key in worker_container_overrides['resources']:
                    worker_container['resources'][override_key].update(
                        worker_container_overrides['resources'][override_key])

        return worker_container
Example #16
    def _job_resources(self):
        from foundations_contrib.global_state import current_foundations_context
        return current_foundations_context().job_resources()
Example #17
def _configure():
    from foundations_contrib.global_state import current_foundations_context
    current_foundations_context().pipeline_context().file_name = 'integration-test-job'
Example #18
    def test_set_job_resources_sets_job_resources_object_in_current_foundations_context(
            self):
        set_job_resources(self.num_gpus, self.ram)
        job_resources = current_foundations_context().job_resources()
        self.assertEqual(self.job_resources, job_resources)
Example #19
    def _set_job_id(self, job_id):
        from foundations_contrib.global_state import current_foundations_context
        current_foundations_context().pipeline_context().file_name = job_id
Example #20
import os

import foundations
from foundations_contrib.global_state import current_foundations_context, message_router
from foundations_events.producers.jobs import RunJob

foundations.set_project_name('default')

job_id = os.environ['ACCEPTANCE_TEST_JOB_ID']
pipeline_context = current_foundations_context().pipeline_context()
pipeline_context.file_name = job_id

RunJob(message_router, pipeline_context).push_message()

foundations.set_tag('model type', 'simple mlp')
foundations.set_tag('data set', 'out of time')
foundations.set_tag('what I was doing,', 'drinking tea')

print('Hello World!')
Example #21
    def test_set_job_resources_ram_defaults_to_none(self):
        set_job_resources(num_gpus=self.num_gpus)

        job_resources = current_foundations_context().job_resources()
        self.assertEqual(JobResources(self.num_gpus, None), job_resources)
Example #22
    def set_up(self):
        from acceptance.cleanup import cleanup

        cleanup()
        current_foundations_context().pipeline_context().file_name = self.faker.uuid4()
Example #23
    def test_set_job_resources_num_gpus_defaults_to_zero(self):
        set_job_resources(ram=self.ram)
        job_resources = current_foundations_context().job_resources()
        self.assertEqual(JobResources(0, self.ram), job_resources)
Example #24
import foundations
import json

from foundations_contrib.global_state import current_foundations_context

params = foundations.load_parameters()
print(current_foundations_context().job_id())
print(json.dumps(params))
Example #25
    def _pipeline_context(self):
        from foundations_contrib.global_state import current_foundations_context
        return current_foundations_context().pipeline_context()
Example #26
    def test_ram_set_to_none_is_valid_configuration(self):
        set_job_resources(self.num_gpus, None)

        expected_job_resources = JobResources(num_gpus=self.num_gpus, ram=None)
        job_resources = current_foundations_context().job_resources()
        self.assertEqual(expected_job_resources, job_resources)
Example #27
    def tear_down(self):
        current_foundations_context().reset_job_resources()
Example #28
    def test_gpu_set_to_negative_value_not_actually_set_job_resources(self):
        with self.assertRaises(ValueError) as error_context:
            set_job_resources(self.negative_gpus, self.ram)

        job_resources = current_foundations_context().job_resources()
        self.assertEqual(self.default_job_resources, job_resources)
Example #29
    def tear_down(self):
        current_foundations_context().pipeline_context().file_name = None