def process_data_ids(data):
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to a job")
        return False, None

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')

        data_obj = DataClient().get(
            normalize_data_name(data_name_or_id, use_data_config=False))

        if not data_obj:
            # Try with the raw ID
            data_obj = DataClient().get(data_name_or_id)

        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return False, None

        if path:
            data_ids.append("%s:%s" % (data_obj.id, path))
        else:
            data_ids.append(data_obj.id)
    return True, data_ids
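
# Illustrative sketch (not part of the original module): process_data_ids turns the
# CLI's --data arguments into server-side dataset ids, preserving any mount path.
# Assuming the server resolves "foo/datasets/mnist/1" to a data object whose id is
# the hypothetical "D123":
#
#   success, data_ids = process_data_ids(["foo/mnist/1:input"])
#   # success  -> True
#   # data_ids -> ["D123:input"]
#
# Passing more than 5 entries, or a name the server cannot resolve, returns
# (False, None).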
def test_normalize_data_name(self):
    from floyd.cli.utils import normalize_data_name
    assert normalize_data_name('foo/bar/1') == 'foo/datasets/bar/1'
    assert normalize_data_name(
        'foo/datasets/bar/1') == 'foo/datasets/bar/1'
    assert normalize_data_name(
        'foo/bar/1/output') == 'foo/projects/bar/1/output'
    assert normalize_data_name(
        'foo/projects/bar/1/output') == 'foo/projects/bar/1/output'
def get_command_line(instance_type, env, message, data, mode, open_notebook,
                     tensorboard, command_str):
    """
    Return a string representing the full floyd command entered in the command line
    """
    floyd_command = ["floyd", "run"]

    if instance_type:
        floyd_command.append('--' + INSTANCE_NAME_MAP[instance_type])

    if env and not env == DEFAULT_ENV:
        floyd_command += ["--env", env]

    if message:
        floyd_command += ["--message", shell_quote(message)]

    if data:
        for data_item in data:
            parts = data_item.split(':')
            if len(parts) > 1:
                data_item = normalize_data_name(
                    parts[0], use_data_config=False) + ':' + parts[1]
            floyd_command += ["--data", data_item]

    if tensorboard:
        floyd_command.append("--tensorboard")

    if mode and mode != "job":
        floyd_command += ["--mode", mode]
        if mode == 'jupyter':
            if not open_notebook:
                floyd_command.append("--no-open")
    else:
        if command_str:
            floyd_command.append(shell_quote(command_str))

    return ' '.join(floyd_command)
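
# Illustrative sketch (not part of the original module): get_command_line rebuilds
# the `floyd run` invocation from its parsed pieces. With the hypothetical call
#
#   get_command_line(instance_type=None, env='tensorflow-1.5', message='first run',
#                    data=['foo/mnist/1:input'], mode='job', open_notebook=False,
#                    tensorboard=False, command_str='python train.py')
#
# and assuming DEFAULT_ENV is not 'tensorflow-1.5' and shell_quote wraps arguments
# in single quotes, the returned string would be:
#
#   floyd run --env tensorflow-1.5 --message 'first run' --data foo/datasets/mnist/1:input 'python train.py'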
def delete(ids, yes):
    """
    Delete data sets.
    """
    failures = False

    for id in ids:
        data_source = DataClient().get(id)
        if not data_source:
            failures = True
            continue

        data_name = normalize_data_name(data_source.name)
        suffix = data_name.split('/')[-1]
        if not suffix.isdigit():
            failures = True
            floyd_logger.error('%s is not a dataset, skipped.', id)
            if suffix == 'output':
                floyd_logger.error(
                    'To delete job output, please delete the job itself.')
            continue

        if not yes and not click.confirm("Delete Data: {}?".format(data_name),
                                         abort=False,
                                         default=False):
            floyd_logger.info("Data %s: Skipped", data_name)
            continue

        if not DataClient().delete(data_source.id):
            failures = True
        else:
            floyd_logger.info("Data %s: Deleted", data_name)

    if failures:
        sys.exit(1)
def info(job_name_or_id):
    """
    Prints detailed info for the run
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(job_name_or_id))
    except FloydException:
        experiment = ExperimentClient().get(job_name_or_id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    table = [["Job name", normalize_job_name(experiment.name)],
             ["Output name",
              normalize_data_name(experiment.name + '/output') if task_instance else None],
             ["Created", experiment.created_pretty],
             ["Status", experiment.state],
             ["Duration(s)", experiment.duration_rounded],
             ["Instance", experiment.instance_type_trimmed],
             ["Description", experiment.description]]
    if task_instance and task_instance.mode in ['jupyter', 'serving']:
        table.append(["Mode", task_instance.mode])
        table.append(["Url", experiment.service_url])
    if experiment.tensorboard_url:
        table.append(["Tensorboard", experiment.tensorboard_url])
    floyd_logger.info(tabulate(table))
def get_data_object(data_id, use_data_config=True):
    """
    Normalize the data_id and query the server.
    If that is unavailable try the raw ID
    """
    normalized_data_reference = normalize_data_name(data_id, use_data_config=use_data_config)
    client = DataClient()
    data_obj = client.get(normalized_data_reference)

    # Try with the raw ID
    if not data_obj and data_id != normalized_data_reference:
        data_obj = client.get(data_id)

    return data_obj
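
# Illustrative sketch (not part of the original module): get_data_object is a
# shared lookup helper. Assuming a dataset named "foo/mnist/1" exists on the
# server:
#
#   data_obj = get_data_object('foo/mnist/1', use_data_config=False)
#   if data_obj:
#       floyd_logger.info(data_obj.resource_id)
#
# If the normalized name ("foo/datasets/mnist/1") is not found, the raw string is
# retried, so plain server ids also resolve.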
def print_data(data_sources):
    """
    Print data information in tabular form
    """
    if not data_sources:
        return

    headers = ["DATA NAME", "CREATED", "STATUS", "DISK USAGE"]
    data_list = []
    for data_source in data_sources:
        data_list.append([normalize_data_name(data_source.name),
                          data_source.created_pretty,
                          data_source.state,
                          data_source.size])

    floyd_logger.info(tabulate(data_list, headers=headers))
def status(id):
    """
    Show the status of a dataset with the given id or friendly name.
    If no id is given, list the status of all datasets in the project.
    """
    if id:
        data_source = DataClient().get(normalize_data_name(id))
        if not data_source:
            # Try with the raw ID
            data_source = DataClient().get(id)
        print_data([data_source] if data_source else [])
    else:
        data_sources = DataClient().get_all()
        print_data(data_sources)
def listfiles(data_name):
    """
    List files in the given dataset
    """
    data_source = DataClient().get(
        normalize_data_name(data_name, use_data_config=False))
    if data_name and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(data_name)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    # Depth-first search
    dirs = ['']
    paths = []
    while dirs:
        cur_dir = dirs.pop()
        url = "/resources/{}/{}?content=true".format(data_source.resource_id, cur_dir)
        response = DataClient().request("GET", url).json()

        if response['skipped_files'] > 0:
            floyd_logger.info(
                "Warning: in directory '%s', %s/%s files skipped (too many files)",
                cur_dir,
                response['skipped_files'],
                response['total_files'])

        files = response['files']
        files.sort(key=lambda f: f['name'])
        for f in files:
            path = os.path.join(cur_dir, f['name'])
            if f['type'] == 'directory':
                path += os.sep
            paths.append(path)
            if f['type'] == 'directory':
                dirs.append(os.path.join(cur_dir, f['name']))

    for path in paths:
        floyd_logger.info(path)
def output(id, url):
    """
    Shows the url of the dataset. You can use id or a friendly URI.
    By default opens the output page in your default browser.
    """
    data_source = DataClient().get(normalize_data_name(id))
    if id and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(id)
    if not data_source:
        sys.exit()

    data_url = "%s/%s" % (floyd.floyd_web_host, data_source.name)
    if url:
        floyd_logger.info(data_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
def getfile(data_name, path):
    """
    Get the specified individual file from a dataset
    """
    data_source = DataClient().get(
        normalize_data_name(data_name, use_data_config=False))
    if data_name and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(data_name)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    url = "{}/api/v1/resources/{}/{}?content=true".format(
        floyd.floyd_host, data_source.resource_id, path)
    fname = os.path.basename(path)
    DataClient().download(url, filename=fname)
def clone(id):
    """
    Download the contents of a dataset or job output to the current path
    """
    data_source = DataClient().get(
        normalize_data_name(id, use_data_config=False))
    if id and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(id)

    if not data_source:
        if 'output' in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, data_source.resource_id)
    DataClient().download_tar(url=data_url,
                              untar=True,
                              delete_after_untar=True)
def complete_upload(data_config):
    data_endpoint = data_config.data_endpoint
    data_id = data_config.data_id
    tarball_path = data_config.tarball_path

    if not data_id:
        floyd_logger.error("Corrupted upload state, please start a new one.")
        sys.exit(1)

    # check for tarball upload, upload to server if not done
    if not data_config.resource_id and (tarball_path and data_endpoint):
        floyd_logger.debug("Getting fresh upload credentials")
        creds = DataClient().new_tus_credentials(data_id)
        if not creds:
            sys.exit(1)

        file_size = os.path.getsize(tarball_path)
        # check for upload limit dimension
        if file_size > MAX_UPLOAD_SIZE:
            try:
                floyd_logger.info("Removing compressed data...")
                rmtree(os.path.dirname(tarball_path))
            except (OSError, TypeError):
                pass

            sys.exit(("Data size too large to upload, please keep it under %s.\n") %
                     (sizeof_fmt(MAX_UPLOAD_SIZE)))

        floyd_logger.info("Uploading compressed data. Total upload size: %s",
                          sizeof_fmt(file_size))
        tus_client = TusDataClient()
        if not tus_client.resume_upload(tarball_path, data_endpoint, auth=creds):
            floyd_logger.error("Failed to finish upload!")
            return

        try:
            floyd_logger.info("Removing compressed data...")
            rmtree(os.path.dirname(tarball_path))
        except (OSError, TypeError):
            pass

        floyd_logger.debug("Created data with id : %s", data_id)
        floyd_logger.info("Upload finished.")

        # Update data config
        data_config.set_tarball_path(None)
        data_config.set_data_endpoint(None)
        data_source = DataClient().get(data_id)
        data_config.set_resource_id(data_source.resource_id)
        DataConfigManager.set_config(data_config)

    # data tarball uploaded, check for server untar
    if data_config.resource_id:
        floyd_logger.info(
            "Waiting for server to unpack data.\n"
            "You can exit at any time and come back to check the status with:\n"
            "\tfloyd data upload -r")
        try:
            for i in dots(ResourceWaitIter(data_config.resource_id),
                          label='Waiting for unpack...'):
                pass
        except WaitTimeoutException:
            clint_STREAM.write('\n')
            clint_STREAM.flush()
            floyd_logger.info(
                "Looks like it is going to take longer for Floydhub to unpack "
                "your data. Please check back later.")
            sys.exit(1)
        else:
            data_config.set_resource_id(None)
            data_config.set_tarball_path(None)
            data_config.set_data_endpoint(None)
            data_config.set_resource_id(None)
            data_config.set_data_id(None)
            DataConfigManager.set_config(data_config)

    # Print output
    table_output = [["NAME"],
                    [normalize_data_name(data_config.data_name)]]
    floyd_logger.info('')
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
def test_normalize_data_name(self, _0, _1, _2, _3, mock_get_config):
    mock_get_config.return_value.namespace = None
    from floyd.cli.utils import normalize_data_name

    assert normalize_data_name('foo/bar/1') == 'foo/datasets/bar/1'
    assert normalize_data_name(
        'foo/datasets/bar/1') == 'foo/datasets/bar/1'
    assert normalize_data_name(
        'foo/bar/1/output') == 'foo/projects/bar/1/output'
    assert normalize_data_name(
        'foo/projects/bar/1/output') == 'foo/projects/bar/1/output'

    # Make sure that the current_username and current_project_name are
    # honored:
    assert normalize_data_name('1') == 'pete/datasets/test_dataset/1'
    assert normalize_data_name('mnist/3') == 'pete/datasets/mnist/3'
    assert normalize_data_name('foo/mnist/3') == 'foo/datasets/mnist/3'

    # current_username and current_project_name are overridden with the
    # second and third args if passed
    assert normalize_data_name('bar/1', 'yoyo') == 'yoyo/datasets/bar/1'
    assert normalize_data_name('1', 'yoyo', 'ma') == 'yoyo/datasets/ma/1'

    # Full job names are returned unchanged
    assert normalize_data_name(
        'foo/projects/bar/1') == 'foo/projects/bar/1'

    # If no job number is passed, it is not used
    assert normalize_data_name('foo/datasets/bar') == 'foo/datasets/bar'