def execute(config, docker_registry, base_image, namespace=None):
    """Runs the LightGBM CLI in a single pod in the user's Kubeflow cluster.

    Users can configure it to run a train, predict, or other supported task
    by using the right config. Please refer to
    https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    for more information on config options.

    Args:
        config: LightGBM config - Ref:
            https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
        docker_registry: registry to push the built docker image to
        base_image: base image to use for this job. It should have LightGBM
            installed and available on the PATH.
        namespace: Kubernetes namespace to use
    """
    if namespace is None:
        namespace = "kubeflow"
    config_file_name = None
    if isinstance(config, str):
        config_file_name = config
        config = _load_config_file(config)
    elif isinstance(config, dict):
        config_file_name = _save_to_config_file(config)
    else:
        raise RuntimeError("config should be of type dict or string(filepath) "
                           "but got {}".format(type(config)))
    output_map = generate_context_files(config, config_file_name)

    preprocessor = BasePreProcessor(command=[ENTRYPOINT], output_map=output_map)
    builder = AppendBuilder(registry=docker_registry,
                            base_image=base_image,
                            preprocessor=preprocessor)
    builder.build()
    pod_spec = builder.generate_pod_spec()

    deployer = Job(namespace=namespace,
                   pod_spec_mutators=[
                       fairing.cloud.gcp.add_gcp_credentials_if_exists])
    deployer.deploy(pod_spec)
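# A usage sketch for the version above: train a binary classifier from a config
# dict. The registry, image name, and data path are hypothetical; config keys
# follow the LightGBM Parameters doc linked in the docstring.
execute(config={"task": "train",
                "objective": "binary",
                "data": "gs://my-bucket/train.txt",      # hypothetical GCS path
                "num_trees": 100},
        docker_registry="gcr.io/my-project",             # hypothetical registry
        base_image="gcr.io/my-project/lightgbm:latest")  # hypothetical image with LightGBM on PATH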
def execute(config, docker_registry, base_image="gcr.io/kubeflow-fairing/lightgbm:latest", namespace="kubeflow"): """ Runs the LightGBM CLI in a single pod in user's Kubeflow cluster. Users can configure it to be a train, predict, and other supported tasks by using the right config. Please refere https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst for more information on config options. Attributes: config: LightGBM config - Ref https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst docker_registry: registry to push the built docker image base_image: base image to use for this job. It should have lightgbm installed and should be in PATH variable. namespace: Kubernetes namespace to use """ config_file_name = None if isinstance(config, str): config_file_name = config config = utils.load_properties_config_file(config) elif isinstance(config, dict): config_file_name = utils.save_properties_config_file(config) else: raise RuntimeError("config should be of type dict or string(filepath) " "but got {}".format(type(dict))) utils.scrub_fields(config, BLACKLISTED_FIELDS) _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS) num_machines = num_machines or 1 if num_machines: try: num_machines = int(num_machines) except ValueError: raise ValueError( "num_machines value in config should be an int >= 1 " "but got {}".format(config.get('num_machines'))) if num_machines < 1: raise ValueError( "num_machines value in config should >= 1 but got {}".format( num_machines)) if num_machines > 1: config['machine_list_file'] = "mlist.txt" output_map = generate_context_files(config, config_file_name, num_machines > 1) preprocessor = BasePreProcessor(command=[ENTRYPOINT], output_map=output_map) builder = AppendBuilder(registry=docker_registry, base_image=base_image, preprocessor=preprocessor) builder.build() pod_spec = builder.generate_pod_spec() if num_machines == 1: # non-distributed mode deployer = Job(namespace=namespace, pod_spec_mutators=[ fairing.cloud.gcp.add_gcp_credentials_if_exists ]) else: # distributed mode deployer = TfJob(namespace=namespace, pod_spec_mutators=[ fairing.cloud.gcp.add_gcp_credentials_if_exists ], chief_count=1, worker_count=num_machines - 1) deployer.deploy(pod_spec)
def execute(config, docker_registry, base_image="gcr.io/kubeflow-fairing/lightgbm:latest", namespace="kubeflow", stream_log=True, cores_per_worker=None, memory_per_worker=None, pod_spec_mutators=None): """ Runs the LightGBM CLI in a single pod in user's Kubeflow cluster. Users can configure it to be a train, predict, and other supported tasks by using the right config. Please refere https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst for more information on config options. Attributes: config: LightGBM config - Ref https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst docker_registry: registry to push the built docker image base_image: base image to use for this job. It should have lightgbm installed and should be in PATH variable. namespace: Kubernetes namespace to use stream_log: True - streams logs from the first worker in the training job after job launch till the training is finished. Flase - no logs are streamed after the job launch. An async job launch use case. cores_per_worker: #cpu cores allocated per worker memory_per_worker: memory allocated per worker in GB, it can be fractional. pod_spec_mutators: list of functions that is used to mutate the podsspec. e.g. fairing.cloud.gcp.add_gcp_credentials_if_exists This can used to set things like volumes and security context. """ config_file_name = None if isinstance(config, str): config_file_name = config config = utils.load_properties_config_file(config) elif isinstance(config, dict): config_file_name = utils.save_properties_config_file(config) else: raise RuntimeError("config should be of type dict or string(filepath) " "but got {}".format(type(dict))) utils.scrub_fields(config, BLACKLISTED_FIELDS) _, num_machines = utils.get_config_value(config, NUM_MACHINES_FILEDS) num_machines = num_machines or 1 if num_machines: try: num_machines = int(num_machines) except ValueError: raise ValueError( "num_machines value in config should be an int >= 1 " "but got {}".format(config.get('num_machines'))) if num_machines < 1: raise ValueError( "num_machines value in config should >= 1 but got {}".format( num_machines)) if num_machines > 1: config['machine_list_file'] = "mlist.txt" output_map = generate_context_files(config, config_file_name, num_machines > 1) preprocessor = BasePreProcessor(command=[ENTRYPOINT], output_map=output_map) builder = AppendBuilder(registry=docker_registry, base_image=base_image, preprocessor=preprocessor) builder.build() pod_spec = builder.generate_pod_spec() pod_spec_mutators = pod_spec_mutators or [] pod_spec_mutators.append(fairing.cloud.gcp.add_gcp_credentials_if_exists) pod_spec_mutators.append( k8s_utils.get_resource_mutator(cores_per_worker, memory_per_worker)) if num_machines == 1: # non-distributed mode deployer = Job(namespace=namespace, pod_spec_mutators=pod_spec_mutators, stream_log=stream_log) else: # distributed mode deployer = TfJob(namespace=namespace, pod_spec_mutators=pod_spec_mutators, chief_count=1, worker_count=num_machines - 1, stream_log=stream_log) deployer.deploy(pod_spec) return deployer
def submit(self):
    # Build the docker image, then deploy the resulting pod spec as a Job.
    self._build()
    deployer = Job()
    deployer.deploy(self.pod_spec)