def get_builder(self, preprocessor, base_image, registry,
                needs_deps_installation=True, pod_spec_mutators=None):
    """Choose a builder appropriate for the current execution environment.

    Args:
        preprocessor: preprocessor that prepares the docker build context.
        base_image: base docker image to build on top of.
        registry: docker registry the built image is pushed to.
        needs_deps_installation: True when dependencies must be installed
            into the image (a pure layer-append build is then insufficient).
        pod_spec_mutators: optional list of functions that mutate the pod
            spec (e.g. to mount credentials); the caller's list is not
            modified.

    Returns:
        A ClusterBuilder, DockerBuilder, or AppendBuilder instance.

    Raises:
        RuntimeError: when no suitable builder can be determined.
    """
    # Copy before appending so we never mutate a list owned by the caller.
    pod_spec_mutators = list(pod_spec_mutators or [])
    pod_spec_mutators.append(gcp.add_gcp_credentials_if_exists)
    # TODO (karthikv2k): Add cloud build as the default
    # once https://github.com/kubeflow/fairing/issues/145 is fixed
    if fairing.utils.is_running_in_k8s():
        return ClusterBuilder(preprocessor=preprocessor,
                              base_image=base_image,
                              registry=registry,
                              pod_spec_mutators=pod_spec_mutators)
    elif ml_tasks_utils.is_docker_daemon_exists():
        return DockerBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    elif not needs_deps_installation:
        return AppendBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    else:
        # TODO (karthikv2k): Add more info on how to resolve this issue
        raise RuntimeError(
            "Not able to guess the right builder for this job!")
def get_builder(self, preprocessor, base_image, registry,
                needs_deps_installation=True,  # pylint:disable=arguments-differ
                pod_spec_mutators=None):
    """Select a builder for this job based on the runtime environment.

    Args:
        preprocessor: preprocessor that prepares the docker build context.
        base_image: base docker image to build on top of.
        registry: docker registry the built image is pushed to.
        needs_deps_installation: whether dependencies have to be installed
            into the image; when False an append-only build suffices.
        pod_spec_mutators: optional list of pod-spec mutator functions,
            forwarded to the in-cluster builder.

    Returns:
        An AppendBuilder, ClusterBuilder, or DockerBuilder instance.

    Raises:
        RuntimeError: when no suitable builder can be determined.
    """
    # Guard clauses, checked in priority order.
    if not needs_deps_installation:
        # Nothing to install: appending layers to the base image is enough.
        return AppendBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    if fairing.utils.is_running_in_k8s():
        # Running inside the cluster: delegate the build to an in-cluster
        # builder, scoped to this instance's namespace and context source.
        return ClusterBuilder(preprocessor=preprocessor,
                              base_image=base_image,
                              registry=registry,
                              pod_spec_mutators=pod_spec_mutators,
                              namespace=self._namespace,
                              context_source=self._build_context_source)
    if ml_tasks_utils.is_docker_daemon_exists():
        # A local docker daemon is available: build with it directly.
        return DockerBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    # TODO (karthikv2k): Add more info on how to resolve this issue
    raise RuntimeError(
        "Not able to guess the right builder for this job!")
def execute(config, docker_registry, base_image, namespace=None):
    """ Runs the LightGBM CLI in a single pod in user's Kubeflow cluster.

    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer
    https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    Attributes:
        config: LightGBM config - Ref
            https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm
            installed and should be in PATH variable.
        namespace: Kubernetes namespace to use

    Raises:
        RuntimeError: when config is neither a dict nor a file path string.
    """
    if namespace is None:
        namespace = "kubeflow"
    config_file_name = None
    if isinstance(config, str):
        # A string is treated as a path to an existing config file.
        config_file_name = config
        config = _load_config_file(config)
    elif isinstance(config, dict):
        # A dict is serialized to a temporary config file.
        config_file_name = _save_to_config_file(config)
    else:
        # Report the actual type of the bad argument; the original used
        # type(dict), which always printed "<class 'type'>".
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))
    output_map = generate_context_files(config, config_file_name)
    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()
    deployer = Job(namespace=namespace,
                   pod_spec_mutators=[
                       fairing.cloud.gcp.add_gcp_credentials_if_exists])
    deployer.deploy(pod_spec)
def get_builder(self, preprocessor, base_image, registry,
                needs_deps_installation=True, pod_spec_mutators=None):
    """Choose a builder for this job based on the runtime environment.

    Args:
        preprocessor: preprocessor that prepares the docker build context.
        base_image: base docker image to build on top of.
        registry: docker registry the built image is pushed to.
        needs_deps_installation: whether dependencies have to be installed
            into the image; when False an append-only build suffices.
        pod_spec_mutators: optional list of pod-spec mutator functions;
            the caller's list is not modified.

    Returns:
        An AppendBuilder, ClusterBuilder, or DockerBuilder instance.

    Raises:
        RuntimeError: when no suitable builder can be determined; the
            message includes environment-specific troubleshooting hints.
    """
    # Copy before appending so we never mutate a list owned by the caller.
    pod_spec_mutators = list(pod_spec_mutators or [])
    pod_spec_mutators.append(gcp.add_gcp_credentials_if_exists)
    if not needs_deps_installation:
        return AppendBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    elif (fairing.utils.is_running_in_k8s() or
          not ml_tasks_utils.is_docker_daemon_exists()) and \
            KubeManager().secret_exists(constants.GCP_CREDS_SECRET_NAME,
                                        self._namespace):
        # In-cluster build: either we are already running in k8s, or there
        # is no local docker daemon but the GCP creds secret is reachable.
        return ClusterBuilder(preprocessor=preprocessor,
                              base_image=base_image,
                              registry=registry,
                              pod_spec_mutators=pod_spec_mutators,
                              namespace=self._namespace,
                              context_source=self._build_context_source)
    elif ml_tasks_utils.is_docker_daemon_exists():
        return DockerBuilder(preprocessor=preprocessor,
                             base_image=base_image,
                             registry=registry)
    else:
        msg = ["Not able to guess the right builder for this job!"]
        # NOTE(review): this branch appends the "no permission" hint when
        # secret_exists() returns True, yet the hint describes a missing
        # permission — the condition looks inverted; confirm intent before
        # changing behavior.
        if KubeManager().secret_exists(constants.GCP_CREDS_SECRET_NAME,
                                       self._namespace):
            msg.append(
                "It seems you don't have permission to list/access secrets in your "
                "Kubeflow cluster. We need this permission in order to build a docker "
                "image using Kubeflow cluster. Adding Kubernetes Admin role to the "
                "service account you are using might solve this issue.")
        if not fairing.utils.is_running_in_k8s():
            msg.append(
                " Also If you are using 'sudo' to access docker in your system you can"
                " solve this problem by adding your username to the docker group. "
                "Reference: https://docs.docker.com/install/linux/linux-postinstall/"
                "#manage-docker-as-a-non-root-user You need to logout and login to "
                "get change activated.")
        message = " ".join(msg)
        raise RuntimeError(message)
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace="kubeflow",
            stream_log=True,
            cores_per_worker=None,
            memory_per_worker=None,
            pod_spec_mutators=None):
    """ Runs the LightGBM CLI in a single pod in user's Kubeflow cluster.

    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer
    https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    Attributes:
        config: LightGBM config - Ref
            https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm
            installed and should be in PATH variable.
        namespace: Kubernetes namespace to use
        stream_log: True - streams logs from the first worker in the training
            job after job launch till the training is finished. False - no
            logs are streamed after the job launch. An async job launch use
            case.
        cores_per_worker: #cpu cores allocated per worker
        memory_per_worker: memory allocated per worker in GB, it can be
            fractional.
        pod_spec_mutators: list of functions that is used to mutate the
            podsspec. e.g. fairing.cloud.gcp.add_gcp_credentials_if_exists
            This can used to set things like volumes and security context.
            The caller's list is not modified.

    Returns:
        The deployer (Job or TfJob) used to launch the workload.

    Raises:
        RuntimeError: when config is neither a dict nor a file path string.
        ValueError: when num_machines in config is not an int >= 1.
    """
    config_file_name = None
    if isinstance(config, str):
        # A string is treated as a path to an existing config file.
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        # A dict is serialized to a temporary config file.
        config_file_name = utils.save_properties_config_file(config)
    else:
        # Report the actual type of the bad argument; the original used
        # type(dict), which always printed "<class 'type'>".
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))
    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    num_machines = num_machines or 1
    # num_machines is always truthy here (defaulted to 1), so validate
    # unconditionally instead of re-checking it.
    try:
        num_machines = int(num_machines)
    except ValueError:
        # Report the offending value itself rather than a possibly-unrelated
        # config lookup.
        raise ValueError(
            "num_machines value in config should be an int >= 1 "
            "but got {}".format(num_machines))
    if num_machines < 1:
        raise ValueError(
            "num_machines value in config should >= 1 but got {}".format(
                num_machines))

    if num_machines > 1:
        # Distributed mode needs a machine list file rendered at runtime.
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name,
                                        num_machines > 1)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    # Copy before appending so we never mutate a list owned by the caller.
    pod_spec_mutators = list(pod_spec_mutators or [])
    pod_spec_mutators.append(fairing.cloud.gcp.add_gcp_credentials_if_exists)
    pod_spec_mutators.append(
        k8s_utils.get_resource_mutator(cores_per_worker, memory_per_worker))

    if num_machines == 1:
        # non-distributed mode
        deployer = Job(namespace=namespace,
                       pod_spec_mutators=pod_spec_mutators,
                       stream_log=stream_log)
    else:
        # distributed mode: one chief plus (num_machines - 1) workers.
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=pod_spec_mutators,
                         chief_count=1,
                         worker_count=num_machines - 1,
                         stream_log=stream_log)
    deployer.deploy(pod_spec)
    return deployer
def execute(config,
            docker_registry,
            base_image="gcr.io/kubeflow-fairing/lightgbm:latest",
            namespace="kubeflow"):
    """ Runs the LightGBM CLI in a single pod in user's Kubeflow cluster.

    Users can configure it to be a train, predict, and other supported tasks
    by using the right config.
    Please refer
    https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    Attributes:
        config: LightGBM config - Ref
            https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image
        base_image: base image to use for this job. It should have lightgbm
            installed and should be in PATH variable.
        namespace: Kubernetes namespace to use

    Raises:
        RuntimeError: when config is neither a dict nor a file path string.
        ValueError: when num_machines in config is not an int >= 1.
    """
    config_file_name = None
    if isinstance(config, str):
        # A string is treated as a path to an existing config file.
        config_file_name = config
        config = utils.load_properties_config_file(config)
    elif isinstance(config, dict):
        # A dict is serialized to a temporary config file.
        config_file_name = utils.save_properties_config_file(config)
    else:
        # Report the actual type of the bad argument; the original used
        # type(dict), which always printed "<class 'type'>".
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))
    utils.scrub_fields(config, BLACKLISTED_FIELDS)

    _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS)
    num_machines = num_machines or 1
    # num_machines is always truthy here (defaulted to 1), so validate
    # unconditionally instead of re-checking it.
    try:
        num_machines = int(num_machines)
    except ValueError:
        # Report the offending value itself rather than a possibly-unrelated
        # config lookup.
        raise ValueError(
            "num_machines value in config should be an int >= 1 "
            "but got {}".format(num_machines))
    if num_machines < 1:
        raise ValueError(
            "num_machines value in config should >= 1 but got {}".format(
                num_machines))

    if num_machines > 1:
        # Distributed mode needs a machine list file rendered at runtime.
        config['machine_list_file'] = "mlist.txt"
    output_map = generate_context_files(config, config_file_name,
                                        num_machines > 1)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT],
                                    output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    if num_machines == 1:
        # non-distributed mode
        deployer = Job(namespace=namespace,
                       pod_spec_mutators=[
                           fairing.cloud.gcp.add_gcp_credentials_if_exists
                       ])
    else:
        # distributed mode: one chief plus (num_machines - 1) workers.
        deployer = TfJob(namespace=namespace,
                         pod_spec_mutators=[
                             fairing.cloud.gcp.add_gcp_credentials_if_exists
                         ],
                         chief_count=1,
                         worker_count=num_machines - 1)
    deployer.deploy(pod_spec)