def info(job_name_or_id):
    """
    Prints detailed info for the run
    """
    # Try the normalized job name first; fall back to the raw identifier.
    try:
        experiment = ExperimentClient().get(normalize_job_name(job_name_or_id))
    except FloydException:
        experiment = ExperimentClient().get(job_name_or_id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = None
    if task_instance_id:
        task_instance = TaskInstanceClient().get(task_instance_id)

    normalized_job_name = normalize_job_name(experiment.name)
    output_name = normalized_job_name + '/output' if task_instance else None
    table = [
        ["Job name", normalized_job_name],
        ["Output name", output_name],
        ["Created", experiment.created_pretty],
        ["Status", experiment.state],
        ["Duration(s)", experiment.duration_rounded],
        ["Instance", experiment.instance_type_trimmed],
        ["Description", experiment.description],
    ]
    # Interactive modes expose a service url worth surfacing.
    if task_instance and task_instance.mode in ['jupyter', 'serving']:
        table.append(["Mode", task_instance.mode])
        table.append(["Url", experiment.service_url])
    if experiment.tensorboard_url:
        table.append(["Tensorboard", experiment.tensorboard_url])
    floyd_logger.info(tabulate(table))
def delete(names, yes):
    """
    Delete project runs
    """
    failures = False

    for name in names:
        # Resolve the job; a lookup miss is recorded as a failure and we
        # keep processing the remaining names.
        try:
            experiment = ExperimentClient().get(normalize_job_name(name))
        except FloydException:
            experiment = ExperimentClient().get(name)
        if not experiment:
            failures = True
            continue

        prompt = "Delete Job: {}?".format(experiment.name)
        if not yes and not click.confirm(prompt, abort=False, default=False):
            floyd_logger.info("Job {}: Skipped.".format(experiment.name))
            continue

        if ExperimentClient().delete(experiment.id):
            floyd_logger.info("Job %s Deleted", experiment.name)
        else:
            failures = True

    # Non-zero exit so scripts can detect partial failure.
    if failures:
        sys.exit(1)
def clone(id):
    """
    Download files from a job.

    This will download the files that were originally uploaded at
    the start of the job.
    """
    try:
        experiment = ExperimentClient().get(
            normalize_job_name(id, use_config=False))
    except FloydException:
        experiment = ExperimentClient().get(id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    if not task_instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")

    # task_instance is guaranteed non-None past the exit above, so the
    # original's redundant `if task_instance else None` guard is dropped.
    module = ModuleClient().get(task_instance.module_id)
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup, cpup, command):
    """
    Restart a given job as a new job.
    """
    parameters = {}

    expt_client = ExperimentClient()
    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # Pick an instance type from the explicit flags, defaulting to the
    # original job's type.
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    if instance_type is not None:
        parameters['instance_type'] = instance_type
    else:
        instance_type = job.instance_type

    if env is not None:
        if not validate_env(env, instance_type):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    # BUG FIX: data_ids was previously computed and then discarded, so the
    # --data option had no effect on the restarted job. Forward it in the
    # restart parameters (mirrors the newer restart implementation).
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['message'] = message

    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    floyd_logger.info('New job created:')
    table_output = [["JOB NAME"], [new_job_info['name']]]
    floyd_logger.info('\n' + tabulate(table_output, headers="firstrow") + '\n')

    show_new_job_info(expt_client, new_job_info['name'], new_job_info, job.mode, open_notebook)
def print_experiments(experiments):
    """
    Prints expt details in a table. Includes urls and mode parameters
    """
    headers = ["JOB NAME", "CREATED", "STATUS", "DURATION(s)",
               "INSTANCE", "DESCRIPTION"]
    # Build one table row per experiment.
    rows = [
        [normalize_job_name(expt.name),
         expt.created_pretty,
         expt.state,
         expt.duration_rounded,
         expt.instance_type_trimmed,
         expt.description]
        for expt in experiments
    ]
    floyd_logger.info(tabulate(rows, headers=headers))
def get_output(id, path, untar, delete_after_untar):
    """
    - Download all files in a dataset or from a Job output
    Eg: alice/projects/mnist/1/files, alice/projects/mnist/1/output or alice/dataset/mnist-data/1/

    Using /output will download the files that are saved at the end of the job.

    Note: This will download the files that are saved at
    the end of the job.

    - Download a directory from a dataset or from Job output
    Specify the path to a directory and download all its files and
    subdirectories.

    Eg: --path models/checkpoint1
    """
    data_source = get_data_object(id, use_data_config=False)
    if not data_source:
        if "output" in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need "
                "to wait for it to finish.")
        sys.exit()

    if not path:
        # Download the full Dataset
        data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, data_source.resource_id)
    else:
        # Download a directory from Dataset or Files
        # Get the type of data resource from the id
        # (foo/projects/bar/ or foo/datasets/bar/)
        if "/datasets/" in id:
            resource_type = "data"
            resource_id = data_source.id
        else:
            resource_type = "files"
            try:
                job = ExperimentClient().get(
                    normalize_job_name(id, use_config=False))
            except FloydException:
                job = ExperimentClient().get(id)
            resource_id = job.id

        data_url = "{}/api/v1/download/artifacts/{}/{}?is_dir=true&path={}" \
            .format(floyd.floyd_host, resource_type, resource_id, path)

    DataClient().download_tar(
        url=data_url,
        untar=untar,
        delete_after_untar=untar and delete_after_untar,
    )
def output(id, url):
    """
    View the files from a job.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    files_url = "%s/%s/files" % (floyd.floyd_web_host, experiment.name)
    # --url prints the link instead of opening a browser.
    if url:
        floyd_logger.info(files_url)
        return
    floyd_logger.info("Opening output path in your browser ...")
    webbrowser.open(files_url)
def status(id):
    """
    View status of all or specific run.
    It can also list status of all the runs in the project.
    """
    if id:
        # A single job was requested; try the normalized name first and
        # fall back to the raw identifier.
        try:
            job = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            job = ExperimentClient().get(id)
        print_experiments([job])
    else:
        # No job given: show every run in the project.
        print_experiments(ExperimentClient().get_all())
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    job_output_url = "%s/%s/output" % (floyd.floyd_web_host, experiment.name)
    if not url:
        # Default behavior: launch the output page in a browser.
        floyd_logger.info("Opening output path in your browser ...")
        webbrowser.open(job_output_url)
    else:
        floyd_logger.info(job_output_url)
def status(id):
    """
    View status of all jobs in a project.
    The command also accepts a specific job name.
    """
    # No specific job: list everything and stop.
    if not id:
        print_experiments(ExperimentClient().get_all())
        return

    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)
    print_experiments([experiment])
def get_log_id(job_id):
    # Poll until the job exposes a log resource id, announcing the wait once.
    waiting_logged = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(job_id))
        except FloydException:
            experiment = ExperimentClient().get(job_id)

        log_id = experiment.instance_log_id
        if log_id:
            return log_id

        if not waiting_logged:
            floyd_logger.info("Waiting for logs ...\n")
            waiting_logged = True
        sleep(1)
def logs(id, url, tail, follow, sleep_duration=1):
    """
    Print the logs of the run.
    """
    # --follow implies tailing.
    tail = tail or follow

    # Poll until the experiment exposes a log resource id.
    waiting_logged = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            experiment = ExperimentClient().get(id)
        instance_log_id = experiment.instance_log_id
        if instance_log_id:
            break
        if not waiting_logged:
            floyd_logger.info("Waiting for logs ...\n")
            waiting_logged = True
        sleep(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if not tail:
        # One-shot dump of whatever is available right now.
        contents = ResourceClient().get_content(instance_log_id)
        if len(contents.strip()):
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    floyd_logger.info("Launching job ...")
    seen = ""
    while True:
        # Fetch the whole log and print only the suffix not yet shown.
        contents = ResourceClient().get_content(instance_log_id)
        delta = contents[len(seen):]
        if len(delta.strip()):
            floyd_logger.info(delta)
        seen = contents
        sleep(sleep_duration)
def stop(id):
    """
    Stop a run before it can finish.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    # Only jobs still in flight can be stopped.
    if experiment.state not in ["queued", "running"]:
        floyd_logger.info(
            "Job in {} state cannot be stopped".format(experiment.state))
        return

    if not ExperimentClient().stop(experiment.id):
        floyd_logger.error("Failed to stop job")
        return
    floyd_logger.info(
        "Experiment shutdown request submitted. Check status to confirm shutdown"
    )
def clone(id, path):
    """
    - Download all files from a job
    Eg: alice/projects/mnist/1/

    Note: This will download the files that were originally uploaded at
    the start of the job.

    - Download files in a specific path from a job
    Specify the path to a directory and download all its files and
    subdirectories.

    Eg: --path models/checkpoint1
    """
    try:
        experiment = ExperimentClient().get(
            normalize_job_name(id, use_config=False))
    except FloydException:
        experiment = ExperimentClient().get(id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    if not task_instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")

    if path:
        # Download a directory from Code
        code_url = "{}/api/v1/download/artifacts/code/{}?is_dir=true&path={}".format(
            floyd.floyd_host, experiment.id, path)
    else:
        # Download the full Code. Only this branch needs the module, so the
        # extra ModuleClient API call is now skipped when --path is given
        # (previously it was made unconditionally and its result ignored).
        module = ModuleClient().get(task_instance.module_id)
        code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, module.resource_id)

    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def stop(id):
    """
    Stop a running job.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    # Only in-flight jobs can be stopped.
    if experiment.state in ["queued", "queue_scheduled", "running"]:
        if ExperimentClient().stop(experiment.id):
            floyd_logger.info(
                "Experiment shutdown request submitted. Check status to confirm shutdown"
            )
            return
        floyd_logger.error("Failed to stop job")
        sys.exit(1)

    floyd_logger.info(
        "Job in {} state cannot be stopped".format(experiment.state))
    sys.exit(1)
def test_normalize_job_name(self, _0, _1, _2):
    from floyd.cli.utils import normalize_job_name

    # Make sure that the current_username and current_experiment_name
    # defaults are honored:
    for raw, expected in [
        ('1', 'pete/projects/test_proj/1'),
        ('mnist/3', 'pete/projects/mnist/3'),
        ('foo/mnist/3', 'foo/projects/mnist/3'),
    ]:
        assert normalize_job_name(raw) == expected

    # The defaults are overridden by the second and third args if passed
    assert normalize_job_name('bar/1', 'yoyo') == 'yoyo/projects/bar/1'
    assert normalize_job_name('1', 'yoyo', 'ma') == 'yoyo/projects/ma/1'

    # Full job names pass through unchanged
    assert normalize_job_name('foo/projects/bar/1') == 'foo/projects/bar/1'

    # Without a job number, get_last_job_name supplies one
    assert normalize_job_name('foo/projects/bar') == 'TEST'
def logs(id, url, tail, sleep_duration=1):
    """
    Print the logs of the run.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    if experiment.state == 'queued':
        floyd_logger.info("Job is currently in a queue")
        return

    instance_log_id = experiment.instance_log_id
    if not instance_log_id:
        floyd_logger.info("Job not started yet, no log to show.")
        sys.exit(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return

    if not tail:
        # One-shot dump of whatever log content exists right now.
        contents = ResourceClient().get_content(instance_log_id)
        if len(contents.strip()):
            floyd_logger.info(contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
        return

    floyd_logger.info("Launching job ...")
    seen = ""
    while True:
        # Poll the full log and emit only the unseen suffix.
        contents = ResourceClient().get_content(instance_log_id)
        delta = contents[len(seen):]
        if len(delta.strip()):
            floyd_logger.info(delta)
        seen = contents
        sleep(sleep_duration)
def test_normalize_job_name(self):
    from floyd.cli.utils import normalize_job_name

    # The short form is expanded; the canonical form is left untouched.
    canonical = 'foo/projects/bar/1'
    assert normalize_job_name('foo/bar/1') == canonical
    assert normalize_job_name(canonical) == canonical
def run(ctx, gpu, env, message, data, mode, open, tensorboard, gpup, cpup, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return
    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Create module
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    # Validate the requested environment against the instance architecture.
    arch = INSTANCE_ARCH_MAP[instance_type]
    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if envs:
        if env not in envs:
            floyd_logger.error(
                "{} is not in the list of supported environments: {}".format(
                    env, ', '.join(envs.keys())))
            return
    else:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return

    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)

    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = normalize_job_name(expt_info['name'])
    floyd_logger.info("")
    table_output = [["JOB NAME"], [job_name]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode in ['jupyter', 'serve']:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = expt_cli.get(expt_info['id'])
                if experiment.task_instances:
                    break
            except Exception:
                # BUG FIX: the "not available yet" message used to be logged
                # here as well as below, producing a duplicate debug line on
                # every poll that raised. Log it once per iteration instead.
                pass
            floyd_logger.debug("Job not available yet: %s", expt_info['id'])
            sleep(3)
            continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = experiment.service_url
            print(
                "Setting up your instance and waiting for Jupyter notebook to become available ...",
                end='')
            if wait_for_url(jupyter_url, sleep_duration_seconds=2,
                            iterations=900):
                sleep(3)  # HACK: sleep extra 3 seconds for traffic route sync
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                if open:
                    webbrowser.open(jupyter_url)
            else:
                floyd_logger.info("\nPath to jupyter notebook: %s", jupyter_url)
                floyd_logger.info(
                    "Notebook is still loading. View logs to track progress")
                floyd_logger.info(" floyd logs %s", job_name)

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                experiment.service_url))

        if experiment.timeout_seconds < 4 * 60 * 60:
            floyd_logger.info(
                "\nYour job timeout is currently set to {} seconds".format(
                    experiment.timeout_seconds))
            floyd_logger.info(
                "This is because you are in a trial account. Paid users will have longer timeouts. "
                "See https://www.floydhub.com/pricing for details")
    else:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info(" floyd logs %s", job_name)
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup, cpup, command):
    """
    Restart a given job as a new job.
    """
    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    env = env[0]

    parameters = {}

    expt_client = ExperimentClient()
    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # Resolve the instance type from the first flag that is set; fall back
    # to the original job's instance type when no flag is given.
    instance_type = None
    for flag, flagged_type in ((gpup, G1P_INSTANCE_TYPE),
                               (cpup, C1P_INSTANCE_TYPE),
                               (gpu, G1_INSTANCE_TYPE),
                               (cpu, C1_INSTANCE_TYPE)):
        if flag:
            instance_type = flagged_type
            break
    if instance_type is None:
        instance_type = job.instance_type

    if instance_type is not None:
        parameters['instance_type'] = instance_type
    else:
        instance_type = job.instance_type

    if env is not None:
        arch = INSTANCE_ARCH_MAP[instance_type]
        if not validate_env(env, arch):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['description'] = message

    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    show_new_job_info(expt_client, new_job_info['name'], new_job_info, job.mode, open_notebook)
def run(ctx, gpu, env, message, data, mode, open, tensorboard, gpup, cpup, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # A valid project must have been initialized (`floyd init`) first.
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)
    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Resolve --data arguments into server-side data ids; exit code 2 on
    # any unresolvable dataset.
    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    # A single dataset mounts as 'input' by default; multiple datasets must
    # carry their own names.
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    # Instance type precedence: gpu+ > cpu+ > gpu > cpu (default).
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    # Interactive modes come with their own entry point, so an explicit
    # command is rejected.
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])

    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        # The server distinguishes a missing project from other bad requests.
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    # Echo the new job name in a small table for the user.
    job_name = normalize_job_name(expt_info['name'])
    floyd_logger.info("")
    table_output = [["JOB NAME"], [job_name]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    show_new_job_info(expt_client, job_name, expt_info, mode)