def test_resource_mutator_gpu(self):
    pod_spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")],
    )
    k8s_utils.get_resource_mutator(gpu=1)(None, pod_spec, "")
    actual = pod_spec.containers[0].resources.limits
    expected = {'nvidia.com/gpu': 1}
    assert actual == expected

def test_resource_mutator_no_mem(self):
    pod_spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")],
    )
    k8s_utils.get_resource_mutator(cpu=1.5)(None, pod_spec, "")
    actual = pod_spec.containers[0].resources.limits
    expected = {'cpu': 1.5}
    assert actual == expected

def test_resource_mutator_no_cpu(self):
    pod_spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")],
    )
    k8s_utils.get_resource_mutator(memory=0.5)(None, pod_spec, "")
    actual = pod_spec.containers[0].resources.limits
    expected = {'memory': '0.47Gi'}
    assert actual == expected
def test_resource_mutator_gpu_vendor(self):
    pod_spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")],
    )
    k8s_utils.get_resource_mutator(gpu=2, gpu_vendor='amd')(None, pod_spec, "")
    actual = pod_spec.containers[0].resources.limits
    expected = {'amd.com/gpu': 2}
    assert actual == expected
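# A minimal usage sketch (not part of the original tests): get_resource_mutator
# returns a callable that fairing deployers apply to a V1PodSpec, setting
# resource limits on the pod's first container. The combined cpu/memory/gpu
# call below is illustrative; memory is given in GB and converted to Gi.
def example_resource_mutator_usage():
    pod_spec = V1PodSpec(
        containers=[V1Container(name='model', image="image")],
    )
    k8s_utils.get_resource_mutator(cpu=2, memory=4, gpu=1)(None, pod_spec, "")
    # Expected limits: {'cpu': 2, 'memory': '3.73Gi', 'nvidia.com/gpu': 1}
    return pod_spec.containers[0].resources.limits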
if __name__ == '__main__':
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'
        fairing.config.set_builder(
            'append',
            image_name='katib-job',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-gpu',
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 2, memory 5GiB
        fairing.config.set_deployer(
            'job',
            namespace='dudaji',
            pod_spec_mutators=[
                k8s_utils.mounting_pvc(pvc_name="fashion-mnist",
                                       pvc_mount_path="/result"),
                k8s_utils.get_resource_mutator(cpu=2, memory=5)
            ])
        fairing.config.run()
    else:
        remote_train = MyFashionMnist()
        remote_train.train()
        minio_context_source = MinioContextSource(
            endpoint_url=minio_endpoint,
            minio_secret=minio_username,
            minio_secret_key=minio_key,
            region_name=minio_region)

        output_map = {"Dockerfile": "Dockerfile", "mnist.py": "mnist.py"}

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'
        # fairing.config.set_preprocessor('notebook', notebook_file='app.ipynb', output_map=output_map)
        fairing.config.set_preprocessor('python', output_map=output_map)
        fairing.config.set_builder(
            'cluster',
            image_name='fairing-job',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
            context_source=minio_context_source,
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 1, memory 4GiB
        fairing.config.set_deployer(
            'job',
            namespace='handson5',
            pod_spec_mutators=[
                k8s_utils.get_resource_mutator(cpu=1, memory=4)
            ])
        # python3
        # fairing.config.set_preprocessor('python', input_files=[__file__])
        fairing.config.run()
    else:
        remote_train = MyModel()
        remote_train.train()
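# Note: the Minio settings referenced above (minio_endpoint, minio_username,
# minio_key, minio_region) would be defined earlier in the original notebook.
# A hypothetical sketch with placeholder values, assuming the in-cluster
# Minio service that ships with Kubeflow:
#
# from kubeflow.fairing.builders.cluster.minio_context import MinioContextSource
# minio_endpoint = "http://minio-service.kubeflow:9000"  # placeholder endpoint
# minio_username = "minio"                               # placeholder access key
# minio_key = "minio123"                                 # placeholder secret key
# minio_region = "us-east-1"                             # placeholder region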
if __name__ == '__main__':
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'  # private registry
        fairing.config.set_builder(
            'append',
            image_name='tensorboard-job',  # here: tensorboard job, not a plain fairing job
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
            registry=DOCKER_REGISTRY,
            push=True)
        # cpu 1, memory 5GiB
        fairing.config.set_deployer(
            'job',
            namespace='admin',  # here
            pod_spec_mutators=[
                k8s_utils.get_resource_mutator(
                    cpu=1,  # here
                    memory=5)
            ])
        fairing.config.run()
    else:
        remote_train = MyFashionMnist()
        remote_train.train()
print("%s/%s" % (path, filename)) if __name__ == '__main__': if os.getenv('FAIRING_RUNTIME', None) is None: from kubeflow import fairing from kubeflow.fairing.kubernetes import utils as k8s_utils DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000' fairing.config.set_builder( 'append', image_name='store-fashion-minst', base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu-pil', registry=DOCKER_REGISTRY, push=True) # cpu 2, memory 5GiB fairing.config.set_deployer('job', namespace='dudaji', pod_spec_mutators=[ k8s_utils.get_resource_mutator( cpu=0.5, memory=0.5) ]) fairing.config.run() else: remote = StoreImage() remote.save() # In[23]: # In[30]:
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace=None,
            stream_log=True,
            cores_per_worker=None,
            memory_per_worker=None,
            pod_spec_mutators=None):
    """Runs the LightGBM CLI in the user's Kubeflow cluster, in a single pod
    or distributed across multiple pods depending on num_machines. Users can
    configure it to run train, predict, and other supported tasks by supplying
    the right config. Please refer to
    https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    :param config: config entries, as a dict or a path to a properties file
    :param docker_registry: docker registry name
    :param base_image: base image (Default value = "gcr.io/kubeflow-fairing/lightgbm:latest")
    :param namespace: k8s namespace (Default value = None)
    :param stream_log: whether to stream the job's log (Default value = True)
    :param cores_per_worker: number of cores per worker (Default value = None)
    :param memory_per_worker: memory per worker, in GB (Default value = None)
    :param pod_spec_mutators: pod spec mutators (Default value = None)
    """
    if not namespace and not fairing_utils.is_running_in_k8s():
        namespace = "kubeflow"
    namespace = namespace or fairing_utils.get_default_target_namespace()

    config_file_name = None
    if isinstance(config, str):
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        config_file_name = utils.save_properties_config_file(config)
    else:
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))
    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    num_machines = num_machines or 1
    if num_machines:
        try:
            num_machines = int(num_machines)
        except ValueError:
            raise ValueError(
                "num_machines value in config should be an int >= 1 "
                "but got {}".format(config.get('num_machines')))
        if num_machines < 1:
            raise ValueError(
                "num_machines value in config should be >= 1 but got {}".format(
                    num_machines))

    if num_machines > 1:
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name, num_machines)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT], output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    pod_spec_mutators = pod_spec_mutators or []
    pod_spec_mutators.append(gcp.add_gcp_credentials_if_exists)
    pod_spec_mutators.append(
        k8s_utils.get_resource_mutator(cores_per_worker, memory_per_worker))

    if num_machines == 1:
        # non-distributed mode
        deployer = Job(namespace=namespace,
                       pod_spec_mutators=pod_spec_mutators,
                       stream_log=stream_log)
    else:
        # distributed mode
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=pod_spec_mutators,
                         chief_count=1,
                         worker_count=num_machines - 1,
                         stream_log=stream_log)
    deployer.deploy(pod_spec)
    return deployer
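# A hypothetical usage sketch (not from the original source): launching a
# distributed LightGBM training run via execute(). Config keys follow
# LightGBM's parameter names; the registry, data URI, and num_machines
# values below are placeholders.
if __name__ == '__main__':
    train_config = {
        'task': 'train',
        'objective': 'binary',
        'data': 'gs://my-bucket/train.csv',  # placeholder training data URI
        'num_machines': 3,                   # > 1 selects distributed mode (1 chief + 2 workers)
        'tree_learner': 'feature',
    }
    deployer = execute(config=train_config,
                       docker_registry='gcr.io/my-project',  # placeholder registry
                       cores_per_worker=2,
                       memory_per_worker=4)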