def info(job_name_or_id):
    """
    Prints detailed info for the run
    """
    # Prefer the normalized job name; fall back to the raw identifier.
    try:
        expt = ExperimentClient().get(normalize_job_name(job_name_or_id))
    except FloydException:
        expt = ExperimentClient().get(job_name_or_id)

    instance_id = get_module_task_instance_id(expt.task_instances)
    instance = TaskInstanceClient().get(instance_id) if instance_id else None

    job_name = normalize_job_name(expt.name)
    rows = [
        ["Job name", job_name],
        ["Output name", job_name + '/output' if instance else None],
        ["Created", expt.created_pretty],
        ["Status", expt.state],
        ["Duration(s)", expt.duration_rounded],
        ["Instance", expt.instance_type_trimmed],
        ["Description", expt.description],
    ]
    # Interactive jobs additionally expose their mode and service URL.
    if instance and instance.mode in ['jupyter', 'serving']:
        rows.append(["Mode", instance.mode])
        rows.append(["Url", expt.service_url])
    if expt.tensorboard_url:
        rows.append(["Tensorboard", expt.tensorboard_url])
    floyd_logger.info(tabulate(rows))
def delete(id, yes):
    """
    Delete project run.

    Fix: the original fetched the task instance unconditionally and then read
    `task_instance.module_id`, which raises AttributeError when the job has no
    task instance (e.g. it never started). The task instance is now fetched
    only when an id exists and guarded before use; the state check also runs
    before the extra network call.
    """
    experiment = ExperimentClient().get(id)

    # Jobs that are still queued/running must be stopped before deletion.
    if experiment.state in ["queued", "running"]:
        floyd_logger.info(
            "Experiment in {} state cannot be deleted. Stop it first".format(
                experiment.state))
        return

    if not yes:
        click.confirm('Delete Run: {}?'.format(experiment.name),
                      abort=True, default=False)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    # Only attempt module cleanup when a module actually exists.
    if task_instance and task_instance.module_id:
        ModuleClient().delete(task_instance.module_id)

    if ExperimentClient().delete(id):
        floyd_logger.info("Experiment deleted")
    else:
        floyd_logger.error("Failed to delete experiment")
def clone(id):
    """
    Download files from a job.

    This will download the files that were originally uploaded at
    the start of the job.
    """
    # Resolve the job: normalized name first, raw id as fallback.
    try:
        expt = ExperimentClient().get(normalize_job_name(id, use_config=False))
    except FloydException:
        expt = ExperimentClient().get(id)

    instance_id = get_module_task_instance_id(expt.task_instances)
    instance = TaskInstanceClient().get(instance_id) if instance_id else None
    if not instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")

    module = ModuleClient().get(instance.module_id) if instance else None
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def delete(names, yes):
    """
    Delete project runs
    """
    had_failure = False
    for name in names:
        # Resolve each job by normalized name, falling back to the raw value.
        try:
            job = ExperimentClient().get(normalize_job_name(name))
        except FloydException:
            job = ExperimentClient().get(name)
        if not job:
            had_failure = True
            continue
        prompt = "Delete Job: {}?".format(job.name)
        if not yes and not click.confirm(prompt, abort=False, default=False):
            floyd_logger.info("Job {}: Skipped.".format(job.name))
            continue
        if ExperimentClient().delete(job.id):
            floyd_logger.info("Job %s Deleted", job.name)
        else:
            had_failure = True
    # Non-zero exit when any job failed to resolve or delete.
    if had_failure:
        sys.exit(1)
def output(id, url, download):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.

    Fix: the original called `TaskInstanceClient().get(...)` unconditionally
    and then accessed `task_instance.output_ids`, which raises when the job
    has no task instance yet. Now guarded; a missing instance reports
    "Output directory not available" instead of crashing.
    """
    experiment = ExperimentClient().get(id)
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None

    if task_instance and "output" in task_instance.output_ids:
        resource = ResourceClient().get(task_instance.output_ids["output"])
        output_dir_url = "{}/viewer/{}".format(floyd.floyd_host, resource.uri)
        if url:
            floyd_logger.info(output_dir_url)
        elif download:
            # Append download flag so the server streams a tarball.
            output_dir_url = "{}&download=true".format(output_dir_url)
            ExperimentClient().download_tar(url=output_dir_url,
                                            untar=True,
                                            delete_after_untar=True)
        else:
            floyd_logger.info("Opening output directory in your browser ...")
            webbrowser.open(output_dir_url)
    else:
        floyd_logger.error("Output directory not available")
def delete(ids, yes):
    """
    Delete project runs
    """
    any_failed = False
    for job_id in ids:
        experiment = ExperimentClient().get(job_id)
        if not experiment:
            any_failed = True
            continue
        # Skip unless the user pre-confirmed with --yes or confirms now.
        if not yes and not click.confirm(
                "Delete Job: {}?".format(experiment.name),
                abort=False, default=False):
            floyd_logger.info("Job {}: Skipped.".format(experiment.name))
            continue
        if ExperimentClient().delete(experiment.id):
            floyd_logger.info("Job %s Deleted", experiment.name)
        else:
            any_failed = True
    if any_failed:
        sys.exit(1)
def status(id):
    """
    View status of all or specific run.
    It can also list status of all the runs in the project.
    """
    client = ExperimentClient()
    if id:
        # Single job requested.
        print_experiments([client.get(id)])
    else:
        # No id: show every run in the project.
        print_experiments(client.get_all())
def stop(id):
    """
    Stop a run before it can finish.
    """
    experiment = ExperimentClient().get(id)
    # Only queued or running jobs can be stopped.
    if experiment.state not in ["queued", "running"]:
        floyd_logger.info(
            "Job in {} state cannot be stopped".format(experiment.state))
        return
    if not ExperimentClient().stop(experiment.id):
        floyd_logger.error("Failed to stop job")
        return
    floyd_logger.info(
        "Experiment shutdown request submitted. Check status to confirm shutdown")
def get_output(id, path, untar, delete_after_untar):
    """
    - Download all files in a dataset or from a Job output
    Eg: alice/projects/mnist/1/files, alice/projects/mnist/1/output or alice/dataset/mnist-data/1/
    Using /output will download the files that are saved at the end of the job.
    Note: This will download the files that are saved at the end of the job.
    - Download a directory from a dataset or from Job output
    Specify the path to a directory and download all its files and subdirectories.
    Eg: --path models/checkpoint1
    """
    data_source = get_data_object(id, use_data_config=False)
    if not data_source:
        if "output" in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need "
                "to wait for it to finish.")
        sys.exit()

    if path:
        # Download a directory from Dataset or Files
        # Get the type of data resource from the id
        # (foo/projects/bar/ or foo/datasets/bar/)
        if "/datasets/" in id:
            resource_type = "data"
            resource_id = data_source.id
        else:
            resource_type = "files"
            try:
                experiment = ExperimentClient().get(
                    normalize_job_name(id, use_config=False))
            except FloydException:
                experiment = ExperimentClient().get(id)
            resource_id = experiment.id
        data_url = "{}/api/v1/download/artifacts/{}/{}?is_dir=true&path={}".format(
            floyd.floyd_host, resource_type, resource_id, path)
    else:
        # Download the full Dataset
        data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, data_source.resource_id)

    # Only delete the tar afterwards when we actually untarred it.
    DataClient().download_tar(
        url=data_url,
        untar=untar,
        delete_after_untar=untar and delete_after_untar,
    )
def output(id, url):
    """
    View the files from a job.
    """
    try:
        expt = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        expt = ExperimentClient().get(id)

    files_url = "%s/%s/files" % (floyd.floyd_web_host, expt.name)
    if url:
        # Just print the URL instead of opening a browser.
        floyd_logger.info(files_url)
        return
    floyd_logger.info("Opening output path in your browser ...")
    webbrowser.open(files_url)
def clone(id):
    """
    Download the code for the experiment to the current path
    """
    expt = ExperimentClient().get(id)
    instance_id = get_module_task_instance_id(expt.task_instances)
    instance = TaskInstanceClient().get(instance_id) if instance_id else None
    # Without a task instance there is no module to download from.
    if not instance:
        sys.exit("Cannot clone this version of the job. Try a different version.")
    module = ModuleClient().get(instance.module_id) if instance else None
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def status(id):
    """
    View status of all or specific run.
    It can also list status of all the runs in the project.
    """
    if not id:
        # No id: list every run in the project.
        print_experiments(ExperimentClient().get_all())
        return
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)
    print_experiments([experiment])
def logs(id, url, tail, sleep_duration=1):
    """
    Print the logs of the run.

    Fix: the original read `task_instance.log_id` without checking that a
    task instance exists, crashing with AttributeError for jobs that have
    not started. Now guarded with the same "try later" message the
    empty-log path already used.
    """
    experiment = ExperimentClient().get(id)
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    if not task_instance:
        floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, task_instance.log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if tail:
        floyd_logger.info("Launching job ...")
        current_shell_output = ""
        while True:
            # Get the logs in a loop and log the new lines
            log_file_contents = get_url_contents(log_url)
            print_output = log_file_contents[len(current_shell_output):]
            if len(print_output.strip()):
                floyd_logger.info(print_output)
            current_shell_output = log_file_contents
            sleep(sleep_duration)
    else:
        log_file_contents = get_url_contents(log_url)
        if len(log_file_contents.strip()):
            floyd_logger.info(log_file_contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
def info(id):
    """
    Prints detailed info for the run
    """
    expt = ExperimentClient().get(id)
    instance_id = get_module_task_instance_id(expt.task_instances)
    instance = TaskInstanceClient().get(instance_id) if instance_id else None

    # Mode/URL are only meaningful for running interactive jobs.
    mode = url = None
    if (expt.state == "running"
            and instance and instance.mode in ['jupyter', 'serving']):
        mode = instance.mode
        url = get_task_url(instance.id)

    rows = [
        ["Run ID", expt.id],
        ["Name", expt.name],
        ["Created", expt.created_pretty],
        ["Status", expt.state],
        ["Duration(s)", expt.duration_rounded],
        ["Output ID", instance.id if instance else None],
        ["Instance", expt.instance_type_trimmed],
        ["Version", expt.description],
    ]
    if mode:
        rows.append(["Mode", mode])
    if url:
        rows.append(["Url", url])
    floyd_logger.info(tabulate(rows))
def status(id):
    """
    View status of all jobs in a project.
    The command also accepts a specific job name.
    """
    if not id:
        # No job given: show everything in the project.
        print_experiments(ExperimentClient().get_all())
        return
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)
    print_experiments([experiment])
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    try:
        expt = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        expt = ExperimentClient().get(id)

    output_dir_url = "%s/%s/output" % (floyd.floyd_web_host, expt.name)
    if url:
        # Print-only mode.
        floyd_logger.info(output_dir_url)
        return
    floyd_logger.info("Opening output path in your browser ...")
    webbrowser.open(output_dir_url)
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu,
            gpup, cpup, command):
    """
    Restart a given job as a new job.
    """
    parameters = {}
    expt_client = ExperimentClient()
    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # Pick the instance type from the flags; default to the original job's.
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    if instance_type is not None:
        parameters['instance_type'] = instance_type
    else:
        instance_type = job.instance_type

    if env is not None:
        if not validate_env(env, instance_type):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)

    if message:
        parameters['message'] = message
    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)
    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    floyd_logger.info('New job created:')
    table_output = [["JOB NAME"], [new_job_info['name']]]
    floyd_logger.info('\n' + tabulate(table_output, headers="firstrow") + '\n')
    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)
def get_log_id(job_id):
    """Poll the job until its instance log id is available and return it."""
    waiting_message_shown = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(job_id))
        except FloydException:
            experiment = ExperimentClient().get(job_id)
        log_id = experiment.instance_log_id
        if log_id:
            return log_id
        # Tell the user once that we're waiting, then keep polling quietly.
        if not waiting_message_shown:
            floyd_logger.info("Waiting for logs ...\n")
            waiting_message_shown = True
        sleep(1)
def logs(id, url, tail, follow, sleep_duration=1):
    """
    Print the logs of the run.
    """
    tail = tail or follow

    # Poll until the job exposes an instance log id.
    waiting_message_shown = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            experiment = ExperimentClient().get(id)
        instance_log_id = experiment.instance_log_id
        if instance_log_id:
            break
        if not waiting_message_shown:
            floyd_logger.info("Waiting for logs ...\n")
            waiting_message_shown = True
        sleep(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if tail:
        floyd_logger.info("Launching job ...")
        printed_so_far = ""
        while True:
            # Fetch the whole log each round and emit only the new suffix.
            contents = ResourceClient().get_content(instance_log_id)
            fresh = contents[len(printed_so_far):]
            if fresh.strip():
                floyd_logger.info(fresh)
            printed_so_far = contents
            sleep(sleep_duration)
    else:
        contents = ResourceClient().get_content(instance_log_id)
        if contents.strip():
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
def clone(id, path):
    """
    - Download all files from a job
    Eg: alice/projects/mnist/1/
    Note: This will download the files that were originally uploaded at
    the start of the job.
    - Download files in a specific path from a job
    Specify the path to a directory and download all its files and subdirectories.
    Eg: --path models/checkpoint1
    """
    try:
        expt = ExperimentClient().get(normalize_job_name(id, use_config=False))
    except FloydException:
        expt = ExperimentClient().get(id)

    instance_id = get_module_task_instance_id(expt.task_instances)
    instance = TaskInstanceClient().get(instance_id) if instance_id else None
    if not instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")
    module = ModuleClient().get(instance.module_id) if instance else None

    if path:
        # Download a directory from Code
        code_url = "{}/api/v1/download/artifacts/code/{}?is_dir=true&path={}".format(
            floyd.floyd_host, expt.id, path)
    else:
        # Download the full Code
        code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def stop(id):
    """
    Stop a running job.
    """
    try:
        job = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        job = ExperimentClient().get(id)

    # Only jobs that are queued, scheduled, or running can be stopped.
    if job.state not in ["queued", "queue_scheduled", "running"]:
        floyd_logger.info(
            "Job in {} state cannot be stopped".format(job.state))
        sys.exit(1)

    if not ExperimentClient().stop(job.id):
        floyd_logger.error("Failed to stop job")
        sys.exit(1)

    floyd_logger.info(
        "Experiment shutdown request submitted. Check status to confirm shutdown"
    )
def logs(id, url, tail, sleep_duration=1):
    """
    Print the logs of the run.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    if experiment.state == 'queued':
        floyd_logger.info("Job is currently in a queue")
        return

    instance_log_id = experiment.instance_log_id
    if not instance_log_id:
        floyd_logger.info("Job not started yet, no log to show.")
        sys.exit(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if tail:
        floyd_logger.info("Launching job ...")
        seen = ""
        while True:
            # Poll the log and print only what was appended since last time.
            contents = ResourceClient().get_content(instance_log_id)
            new_part = contents[len(seen):]
            if new_part.strip():
                floyd_logger.info(new_part)
            seen = contents
            sleep(sleep_duration)
    else:
        contents = ResourceClient().get_content(instance_log_id)
        if contents.strip():
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.

    Fix: the original accessed `task_instance.output_ids` without checking
    that a task instance exists, crashing with AttributeError for jobs that
    have not started. A missing instance now falls through to the
    "Output directory not available" message.
    """
    experiment = ExperimentClient().get(id)
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None

    if task_instance and "output" in task_instance.output_ids:
        output_dir_url = "{}/api/v1/resources/{}?content=true".format(
            floyd.floyd_host, task_instance.output_ids["output"])
        if url:
            floyd_logger.info(output_dir_url)
        else:
            floyd_logger.info("Opening output directory in your browser ...")
            webbrowser.open(output_dir_url)
    else:
        floyd_logger.error("Output directory not available")
"""View experiments by job."""
import subprocess
from pathlib import Path

import streamlit as st
import numpy as np
import pandas as pd
from torch import tensor

from floyd.client.experiment import ExperimentClient
from floyd.client.data import DataClient
from metalearn import plotting

experiment_client = ExperimentClient()
data_client = DataClient()

# Local directory used to cache downloaded artifacts.
cache_dir = Path.home() / "floyd_cache"

EXPERIMENT_LIMIT = 10000
SUCCESS_STATE = "success"
METRICS_FILE = "rnn_metalearn_controller_experiment.csv"


@st.cache
def get_experiments():
    """Return successful experiments keyed by name (memoized by streamlit)."""
    experiments = experiment_client.get_all(limit=EXPERIMENT_LIMIT)
    return {e.name: e for e in experiments if e.state == SUCCESS_STATE}
def run(ctx, gpu, env, data, mode, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    command_str = ' '.join(command)
    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    version = experiment_config.version
    experiment_name = "{}/{}:{}".format(
        access_token.username, experiment_config.name, version)

    # Create module
    module = Module(name=experiment_name,
                    description=version,
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    family_id=experiment_config.family_id,
                    default_container=get_docker_image(env, gpu),
                    version=version)
    module_id = ModuleClient().create(module)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    instance_type = GPU_INSTANCE_TYPE if gpu else CPU_INSTANCE_TYPE
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=version,
        module_id=module_id,
        data_id=data,
        predecessor=experiment_config.experiment_predecessor,
        family_id=experiment_config.family_id,
        version=version,
        instance_type=instance_type)
    experiment_id = ExperimentClient().create(experiment_request)
    floyd_logger.debug("Created experiment : {}".format(experiment_id))

    # Update expt config including predecessor
    experiment_config.increment_version()
    experiment_config.set_module_predecessor(module_id)
    experiment_config.set_experiment_predecessor(experiment_id)
    ExperimentConfigManager.set_config(experiment_config)

    table_output = [["RUN ID", "NAME", "VERSION"],
                    [experiment_id, experiment_name, version]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode != 'default':
        while True:
            # Wait for the experiment to become available
            try:
                experiment = ExperimentClient().get(experiment_id)
                break
            except Exception:
                floyd_logger.debug(
                    "Experiment not available yet: {}".format(experiment_id))
                sleep(1)
                continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = get_task_url(
                get_module_task_instance_id(experiment.task_instances))
            floyd_logger.info(
                "Waiting for Jupyter notebook to become available ...")
            if wait_for_url(jupyter_url):
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
            else:
                floyd_logger.info(
                    "Problem starting the notebook. View logs for more information")

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                get_task_url(get_module_task_instance_id(experiment.task_instances))))

    floyd_logger.info("""
To view logs enter:
    floyd logs {}
        """.format(experiment_id))
def run(ctx, gpu, env, message, data, mode, open_notebook, tensorboard,
        gpup, cpup, gpu2, cpu2, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    env = env[0]

    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    namespace = experiment_config.namespace or access_token.username
    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)
    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [
        {'name': get_data_name(data_str, default_name), 'type': 'dir'}
        for data_str in data_ids
    ]

    # Instance-type flags, highest tier first.
    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    show_new_job_info(expt_client, job_name, expt_info, mode, open_notebook)
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.

    Fix: the retry loop that waits for the job's task instances logged
    "Job not available yet: ..." twice per iteration (the call was
    duplicated); it is now logged once per retry.
    """
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return
    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Create module
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu:
        arch = 'gpu'
        instance_type = GPU_INSTANCE_TYPE
    else:
        arch = 'cpu'
        instance_type = CPU_INSTANCE_TYPE

    # Validate the requested environment against what the server supports.
    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if envs:
        if env not in envs:
            floyd_logger.error(
                "{} is not in the list of supported environments: {}".format(
                    env, ', '.join(envs.keys())))
            return
    else:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return

    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)
    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : {}".format(expt_info['id']))

    table_output = [["JOB NAME"], [expt_info['name']]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode in ['jupyter', 'serve']:
        while True:
            # Wait for the experiment / task instances to become available.
            try:
                experiment = expt_cli.get(expt_info['id'])
                if experiment.task_instances:
                    break
            except Exception:
                pass  # job record may not exist yet; retry below
            # Logged once per retry (this message used to be duplicated).
            floyd_logger.debug("Job not available yet: {}".format(
                expt_info['id']))
            sleep(3)
            continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = experiment.service_url
            print(
                "Setting up your instance and waiting for Jupyter notebook to become available ...",
                end='')
            if wait_for_url(jupyter_url, sleep_duration_seconds=2,
                            iterations=900):
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                if open:
                    webbrowser.open(jupyter_url)
            else:
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                floyd_logger.info(
                    "Notebook is still loading. View logs to track progress")
                floyd_logger.info(" floyd logs {}".format(expt_info['name']))

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                experiment.service_url))

        if experiment.timeout_seconds < 4 * 60 * 60:
            floyd_logger.info(
                "\nYour job timeout is currently set to {} seconds".format(
                    experiment.timeout_seconds))
            floyd_logger.info(
                "This is because you are in a trial account. Paid users will have longer timeouts. "
                "See https://www.floydhub.com/pricing for details")
    else:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info(" floyd logs {}".format(expt_info['name']))
def run(ctx, cpu, gpu, env, message, data, mode, open_notebook, follow,
        tensorboard, gpup, cpup, gpu2, cpu2, max_runtime, task, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.

    Fix: the --follow branch contained a duplicated assignment
    (`instance_log_id = instance_log_id = get_log_id(job_name)`); the
    redundant target has been removed.
    """
    # cli_default is used for any option that has default value
    cli_default = {'description': '', 'command': ''}
    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if not env:
        cli_default['env'] = DEFAULT_ENV
        env = None
    elif len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    else:
        env = env[0]

    if not mode:
        cli_default['mode'] = 'command'

    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    namespace = experiment_config.namespace or access_token.username
    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)
    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    # Instance-type flags, highest tier first; None means "use defaults".
    instance_type = None
    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    if not instance_type:
        cli_default['instance_type'] = C1_INSTANCE_TYPE

    yaml_config = read_yaml_config()
    arch = INSTANCE_ARCH_MAP[resolve_final_instance_type(
        instance_type, yaml_config, task, cli_default)]
    if not validate_env(env or cli_default['env'], arch):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)
    if command_str == '':
        # set to none so it won't override floyd config
        command_str = None

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=mode,
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    instance_type=instance_type,
                    yaml_config=yaml_config,
                    task=task)
    try:
        module_id = ModuleClient().create(module, cli_default)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    if max_runtime:
        max_runtime = int(max_runtime)
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        max_runtime=max_runtime,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type,
        yaml_config=yaml_config,
        task=task)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request, cli_default)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    if not follow:
        show_new_job_info(expt_client, job_name, expt_info, mode,
                          open_notebook)
    else:
        # If the user specified --follow, we assume they're only interested in
        # log output and not in anything that would be displayed by
        # show_new_job_info.
        floyd_logger.info("Opening logs ...")
        instance_log_id = get_log_id(job_name)
        follow_logs(instance_log_id)
def run(ctx, gpu, env, message, data, mode, open, tensorboard, gpup, cpup, command):
    """
    Run a command on Floyd.

    Uploads the contents of the current directory and executes the given
    command remotely, generating a run id for later reference.

    NOTE: the ``open`` parameter shadows the builtin, but its name is part
    of the click CLI interface and must be preserved.
    """
    project_config = ExperimentConfigManager.get_config()

    # Fail fast if the local config references a project the server
    # does not recognize.
    if not ProjectClient().exists(project_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(token.username, project_config.name)

    ok, data_ids = process_data_ids(data)
    if not ok:
        sys.exit(2)

    # Build the module inputs. A single data mount falls back to the
    # default name 'input'; with multiple mounts each must be named.
    fallback_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [
        {'name': get_data_name(data_str, fallback_name), 'type': 'dir'}
        for data_str in data_ids
    ]

    # Resolve the machine type from the mutually exclusive CLI flags,
    # in priority order: gpu+ > cpu+ > gpu > cpu (default).
    instance_type = (G1P_INSTANCE_TYPE if gpup
                     else C1P_INSTANCE_TYPE if cpup
                     else G1_INSTANCE_TYPE if gpu
                     else C1_INSTANCE_TYPE)

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    # Interactive/serving modes run their own entry point; a command
    # argument is contradictory there.
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=project_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Build the experiment request, echoing back the full command line
    # the user typed so it can be displayed/replayed later.
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open, tensorboard, command_str)
    experiment_request = ExperimentRequest(name=experiment_name,
                                           description=message,
                                           full_command=full_command,
                                           module_id=module_id,
                                           data_ids=data_ids,
                                           family_id=project_config.family_id,
                                           instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    # Display the new job name in a one-column table, then hand off to
    # the shared post-creation info printer.
    job_name = normalize_job_name(expt_info['name'])
    floyd_logger.info("")
    floyd_logger.info(tabulate([["JOB NAME"], [job_name]],
                               headers="firstrow"))
    floyd_logger.info("")
    show_new_job_info(expt_client, job_name, expt_info, mode)
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup, cpup, command):
    """
    Restart a given job as a new job.
    """
    # click collects --env into a tuple even though only one value is
    # supported. Reject multiples early, then unwrap so the rest of the
    # function sees a plain value (or None). See the comment above the
    # --env click option for the rationale behind this approach.
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    env = env[0]

    override_params = {}

    expt_client = ExperimentClient()
    # Look the job up by its normalized name first; fall back to the raw
    # identifier if normalization does not resolve.
    try:
        source_job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        source_job = expt_client.get(job_name)

    # Resolve the machine type: explicit flags win (gpu+ > cpu+ > gpu >
    # cpu), otherwise reuse the original job's instance type.
    for flag, machine in ((gpup, G1P_INSTANCE_TYPE),
                          (cpup, C1P_INSTANCE_TYPE),
                          (gpu, G1_INSTANCE_TYPE),
                          (cpu, C1_INSTANCE_TYPE)):
        if flag:
            instance_type = machine
            break
    else:
        instance_type = source_job.instance_type

    if instance_type is not None:
        override_params['instance_type'] = instance_type
    else:
        # NOTE(review): effectively a no-op — instance_type can only be
        # None here when source_job.instance_type already was. Kept to
        # preserve the original control flow exactly.
        instance_type = source_job.instance_type

    if env is not None:
        arch = INSTANCE_ARCH_MAP[instance_type]
        if not validate_env(env, arch):
            sys.exit(1)
        override_params['env'] = env

    ok, data_ids = process_data_ids(data)
    if not ok:
        sys.exit(1)
    if data_ids:
        override_params['data_ids'] = data_ids

    if message:
        override_params['description'] = message

    if command:
        override_params['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(source_job.id, parameters=override_params)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      source_job.mode, open_notebook)