示例#1
0
 def test_resource_mutator_gpu(self):
     """A gpu-only request must yield a single nvidia.com/gpu limit."""
     spec = V1PodSpec(
         containers=[V1Container(name='model', image="image")], )
     mutate = k8s_utils.get_resource_mutator(gpu=1)
     mutate(None, spec, "")
     assert spec.containers[0].resources.limits == {'nvidia.com/gpu': 1}
示例#2
0
 def test_resource_mutator_no_mem(self):
     """A cpu-only request must set only the cpu limit."""
     spec = V1PodSpec(
         containers=[V1Container(name='model', image="image")], )
     mutate = k8s_utils.get_resource_mutator(cpu=1.5)
     mutate(None, spec, "")
     assert spec.containers[0].resources.limits == {'cpu': 1.5}
示例#3
0
 def test_resource_mutator_no_cpu(self):
     """A memory-only request must yield a Gi-suffixed string limit."""
     spec = V1PodSpec(
         containers=[V1Container(name='model', image="image")], )
     mutate = k8s_utils.get_resource_mutator(memory=0.5)
     mutate(None, spec, "")
     assert spec.containers[0].resources.limits == {'memory': '0.47Gi'}
示例#4
0
def test_resource_mutator_gpu_vendor():
    """A non-default gpu_vendor must be reflected in the resource key."""
    spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")], )
    mutate = k8s_utils.get_resource_mutator(gpu=2, gpu_vendor='amd')
    mutate(None, spec, "")
    assert spec.containers[0].resources.limits == {'amd.com/gpu': 2}
示例#5
0
if __name__ == '__main__':
    # Build-and-submit path: FAIRING_RUNTIME is only set inside the remote
    # container, so this branch runs on the workstation/notebook side.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        # In-cluster registry the appended image is pushed to.
        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'
        fairing.config.set_builder(
            'append',
            image_name='katib-job',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-gpu',
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 2, memory 5GiB
        fairing.config.set_deployer(
            'job',
            namespace='dudaji',
            pod_spec_mutators=[
                # Mount the "fashion-mnist" PVC at /result and cap resources.
                k8s_utils.mounting_pvc(pvc_name="fashion-mnist",
                                       pvc_mount_path="/result"),
                k8s_utils.get_resource_mutator(cpu=2, memory=5)
            ])
        fairing.config.run()
    else:
        # Inside the remote pod: run the actual training.
        # NOTE(review): MyFashionMnist is defined elsewhere in this file.
        remote_train = MyFashionMnist()
        remote_train.train()

# In[ ]:

# In[ ]:
        # NOTE(review): this fragment starts mid-block -- the enclosing
        # `if __name__` / FAIRING_RUNTIME guard is outside this chunk, and
        # minio_endpoint / minio_username / minio_key / minio_region are
        # defined elsewhere. Code kept verbatim; comments only.
        minio_context_source = MinioContextSource(endpoint_url=minio_endpoint,
                                                  minio_secret=minio_username,
                                                  minio_secret_key=minio_key,
                                                  region_name=minio_region)

        # Files shipped into the build context for the cluster builder.
        output_map = {"Dockerfile": "Dockerfile", "mnist.py": "mnist.py"}

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'
        #fairing.config.set_preprocessor('notebook', notebook_file='app.ipynb', output_map=output_map)
        fairing.config.set_preprocessor('python', output_map=output_map)
        fairing.config.set_builder(
            'cluster',
            image_name='fairing-job',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
            context_source=minio_context_source,
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 1, memory 4GiB (comment previously said 1GiB; code uses 4)
        fairing.config.set_deployer('job',
                                    namespace='handson5',
                                    pod_spec_mutators=[
                                        k8s_utils.get_resource_mutator(
                                            cpu=1, memory=4)
                                    ])
        # python3
        # fairing.config.set_preprocessor('python', input_files=[__file__])
        fairing.config.run()
    else:
        # Inside the Fairing-built pod: execute the training itself.
        remote_train = MyModel()
        remote_train.train()
if __name__ == '__main__':
    # Build-and-submit path: FAIRING_RUNTIME is only set inside the remote
    # container, so this branch runs on the workstation/notebook side.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'  # private registry

        fairing.config.set_builder(
            'append',
            image_name='tensorboard-job',  # image name for this tensorboard run
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 1, memory 5GiB
        fairing.config.set_deployer(
            'job',
            namespace='admin',  # target namespace for the job
            pod_spec_mutators=[
                k8s_utils.get_resource_mutator(
                    cpu=1,  # cpu limit (cores)
                    memory=5)
            ])
        fairing.config.run()
    else:
        # Inside the remote pod: run the actual training.
        remote_train = MyFashionMnist()
        remote_train.train()

# In[2]:

# In[ ]:
                print("%s/%s" % (path, filename))


if __name__ == '__main__':
    # Build-and-submit path: FAIRING_RUNTIME is unset on the workstation side.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'
        fairing.config.set_builder(
            'append',
            image_name='store-fashion-minst',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu-pil',
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 0.5, memory 0.5GiB (comment previously said cpu 2, memory 5GiB)
        fairing.config.set_deployer('job',
                                    namespace='dudaji',
                                    pod_spec_mutators=[
                                        k8s_utils.get_resource_mutator(
                                            cpu=0.5, memory=0.5)
                                    ])
        fairing.config.run()
    else:
        # Inside the remote pod: persist the images via StoreImage.
        remote = StoreImage()
        remote.save()

# In[23]:

# In[30]:
示例#9
0
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace=None,
            stream_log=True,
            cores_per_worker=None,
            memory_per_worker=None,
            pod_spec_mutators=None):
    """Runs the LightGBM CLI in a single pod in user's Kubeflow cluster.
    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    :param config: config entries, either a dict or a properties file path
    :param docker_registry: docker registry name
    :param base_image: base image (Default value = "gcr.io/kubeflow-fairing/lightgbm:latest")
    :param namespace: k8s namespace (Default value = None)
    :param stream_log: should that stream log? (Default value = True)
    :param cores_per_worker: number of cores per worker (Default value = None)
    :param memory_per_worker: memory value per worker (Default value = None)
    :param pod_spec_mutators: pod spec mutators (Default value = None)
    :returns: the deployer (Job or TfJob) used to run the workload
    :raises RuntimeError: if config is neither a dict nor a file path
    :raises ValueError: if num_machines in config is not an int >= 1
    """
    # Resolve target namespace: explicit arg > "kubeflow" (when off-cluster)
    # > the namespace of the pod we are running in.
    if not namespace and not fairing_utils.is_running_in_k8s():
        namespace = "kubeflow"
    namespace = namespace or fairing_utils.get_default_target_namespace()

    config_file_name = None
    if isinstance(config, str):
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        config_file_name = utils.save_properties_config_file(config)
    else:
        # Bug fix: was type(dict), which always rendered "<class 'type'>"
        # instead of the actual type of the bad argument.
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))

    # Strip fields that must not be forwarded to the remote job.
    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    num_machines = num_machines or 1  # default to single-machine mode
    # After the `or 1` fallback num_machines is always truthy, so the
    # conversion and range check run unconditionally.
    try:
        num_machines = int(num_machines)
    except ValueError:
        raise ValueError(
            "num_machines value in config should be an int >= 1 "
            "but got {}".format(config.get('num_machines')))
    if num_machines < 1:
        raise ValueError(
            "num_machines value in config should >= 1 but got {}".format(
                num_machines))

    if num_machines > 1:
        # Distributed mode: workers discover each other via this machine list.
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name, num_machines)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    # Bug fix: copy before appending so a caller-supplied list is not mutated
    # as a side effect of this call.
    pod_spec_mutators = list(pod_spec_mutators or [])
    pod_spec_mutators.append(gcp.add_gcp_credentials_if_exists)
    pod_spec_mutators.append(
        k8s_utils.get_resource_mutator(cores_per_worker, memory_per_worker))

    if num_machines == 1:
        # non-distributed mode
        deployer = Job(namespace=namespace,
                       pod_spec_mutators=pod_spec_mutators,
                       stream_log=stream_log)
    else:
        # distributed mode: one chief plus (num_machines - 1) workers
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=pod_spec_mutators,
                         chief_count=1,
                         worker_count=num_machines - 1,
                         stream_log=stream_log)
    deployer.deploy(pod_spec)
    return deployer