def test_create_run_official(mocker, kubernetes_api_client_node_port):
    """Starting a run with an official benchmark image yields the pool future."""
    mocker.patch('kubernetes.config.load_kube_config')

    # Stub the thread pool so submit().result().json() returns a sentinel.
    pool = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    pool.return_value.submit.return_value.result.return_value.json.return_value = "a"

    api = ApiClient(in_cluster=False)
    future = api.create_run(
        "test_run",
        5,
        num_cpus=4.1,
        max_bandwidth=10000,
        image='PyTorch Cifar-10 ResNet-20 Open-MPI',
    )

    assert future is not None
    assert future.result().json() == "a"
def test_create_run_custom(mocker, kubernetes_api_client_node_port):
    """Starting a run with a user-supplied image/command yields the pool future."""
    mocker.patch('kubernetes.config.load_kube_config')

    # Stub the thread pool so submit().result().json() returns a sentinel.
    pool = mocker.patch('concurrent.futures.ThreadPoolExecutor')
    pool.return_value.submit.return_value.result.return_value.json.return_value = "a"

    api = ApiClient(in_cluster=False)
    future = api.create_run(
        "test_run",
        5,
        num_cpus=4.1,
        max_bandwidth=10000,
        custom_image_name="localhost:5000/mlbench_worker:latest",
        custom_image_command="/.openmpi/bin/mpirun /app/main.py",
        custom_image_all_nodes=False,
    )

    assert future is not None
    assert future.result().json() == "a"
def run(name, num_workers, gpu, num_cpus, light, dashboard_url):
    """Start a new run for a benchmark image.

    Interactively prompts for a benchmark image and backend (defaulting to
    the answers given on the previous invocation, persisted via pickle),
    then submits one run per entry in ``num_workers`` through the dashboard
    API client and reports each run's outcome.

    Args:
        name: Base name for the runs; "-<num_workers>" is appended per run.
        num_workers: Iterable of worker counts, one run is created for each.
        gpu: Whether GPU support is enabled for the runs.
        num_cpus: CPUs available per node; one is reserved (num_cpus - 1
            is passed to the workers).
        light: Whether to use the light benchmark target.
        dashboard_url: URL of the mlbench dashboard API.
    """
    current_run_inputs = {}

    last_run_inputs_dir_location = os.path.join(
        os.environ["HOME"], ".local", "share", "mlbench")
    Path(last_run_inputs_dir_location).mkdir(parents=True, exist_ok=True)

    last_run_inputs_file_location = os.path.join(
        last_run_inputs_dir_location, "last_run_inputs.pkl")

    # Load the answers from the previous invocation, if any.
    # Fix: open via a context manager so the file handle is closed.
    try:
        with open(last_run_inputs_file_location, "rb") as f:
            last_run_inputs = pickle.load(f)
    except FileNotFoundError:
        last_run_inputs = {}

    images = list(MLBENCH_IMAGES.keys())

    text_prompt = "Benchmark: \n\n"
    text_prompt += "\n".join("[{}]\t{}".format(i, t) for i, t in enumerate(images))
    text_prompt += "\n[{}]\tCustom Image".format(len(images))
    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(images)),
        default=last_run_inputs.get("benchmark", 0),
    )
    current_run_inputs["benchmark"] = selection

    if selection == len(images):
        # run custom image
        image = click.prompt("Image", type=str,
                             default=last_run_inputs.get("image", None))
        current_run_inputs["image"] = image
        image_command = click.prompt("Command", type=str,
                                     default=last_run_inputs.get(
                                         "image_command", None))
        current_run_inputs["image_command"] = image_command
        benchmark = {
            "custom_image_name": image,
            "custom_image_command": image_command,
        }
    else:
        benchmark = {"image": images[selection]}

    # Backend Prompt
    text_prompt = "Backend: \n\n"
    text_prompt += "\n".join("[{}]\t{}".format(i, t)
                             for i, t in enumerate(MLBENCH_BACKENDS))
    text_prompt += "\n[{}]\tCustom Backend".format(len(MLBENCH_BACKENDS))
    text_prompt += "\n\nSelection"

    selection = click.prompt(
        text_prompt,
        type=click.IntRange(0, len(MLBENCH_BACKENDS)),
        default=last_run_inputs.get("backend", 0),
    )
    current_run_inputs["backend"] = selection

    if selection == len(MLBENCH_BACKENDS):
        backend = click.prompt("Backend", type=str,
                               default=last_run_inputs.get(
                                   "custom_backend", None))
        current_run_inputs["custom_backend"] = backend
        run_on_all = click.confirm(
            "Run command on all nodes (otherwise just first node)",
            default=last_run_inputs.get("run_on_all", None),
        )
        current_run_inputs["run_on_all"] = run_on_all
        benchmark["custom_backend"] = backend
        benchmark["run_all_nodes"] = run_on_all
    else:
        benchmark["backend"] = MLBENCH_BACKENDS[selection]

    # Persist this invocation's answers as the next run's defaults.
    # Fix: close (and flush) the file via a context manager so the
    # pickled defaults are reliably written to disk.
    with open(last_run_inputs_file_location, "wb") as f:
        pickle.dump(current_run_inputs, f)

    benchmark["gpu_enabled"] = gpu
    benchmark["light_target"] = light
    benchmark["num_cpus"] = num_cpus - 1  # reserve one CPU for the node itself

    loaded = setup_client_from_config()

    client = ApiClient(in_cluster=False, url=dashboard_url,
                       load_config=not loaded)

    results = []
    for num_w in num_workers:
        current_name = "{}-{}".format(name, num_w)
        res = client.create_run(current_name, num_w, **benchmark)
        results.append(res)

    for res in results:
        act_result = res.result()
        if act_result.status_code > 201:
            # Prefer the API's error message; fall back to the raw body
            # when the response is not valid JSON.
            try:
                click.echo("Couldn't start run: {}".format(
                    act_result.json()["message"]))
            except json.JSONDecodeError:
                print(str(act_result.text))
                click.echo("Couldn't start run: Status {} for request".format(
                    act_result.status_code))
            return

        click.echo("Run started with name {}".format(
            act_result.json()["name"]))