Example #1
def process_data_ids(data):
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to a job")
        return False, None

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
            data_name_or_id = normalize_data_name(data_name_or_id,
                                                  use_data_config=False)

        data_obj = DataClient().get(
            normalize_data_name(data_name_or_id, use_data_config=False))

        if not data_obj:
            # Try with the raw ID
            data_obj = DataClient().get(data_name_or_id)

        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return False, None
        if path:
            data_ids.append("%s:%s" % (data_obj.id, path))
        else:
            data_ids.append(data_obj.id)
    return True, data_ids
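A hedged usage sketch for the helper above (the dataset names are hypothetical, and name resolution needs a live DataClient connection):

# Hypothetical call: one plain dataset reference and one with a mount path.
# Names are resolved to ids; any ':mountpoint' suffix is carried over.
ok, data_ids = process_data_ids(['foo/datasets/mnist/1',
                                 'foo/datasets/images/2:training'])
if ok:
    print(data_ids)  # e.g. ['<mnist id>', '<images id>:training']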
Example #2
    def test_normalize_data_name(self):
        from floyd.cli.utils import normalize_data_name
        assert normalize_data_name('foo/bar/1') == 'foo/datasets/bar/1'
        assert normalize_data_name(
            'foo/datasets/bar/1') == 'foo/datasets/bar/1'
        assert normalize_data_name(
            'foo/bar/1/output') == 'foo/projects/bar/1/output'
        assert normalize_data_name(
            'foo/projects/bar/1/output') == 'foo/projects/bar/1/output'
Example #3
def get_command_line(instance_type, env, message, data, mode, open_notebook,
                     tensorboard, command_str):
    """
    Return a string representing the full floyd command entered in the command line
    """
    floyd_command = ["floyd", "run"]
    if instance_type:
        floyd_command.append('--' + INSTANCE_NAME_MAP[instance_type])
    if env and not env == DEFAULT_ENV:
        floyd_command += ["--env", env]
    if message:
        floyd_command += ["--message", shell_quote(message)]
    if data:
        for data_item in data:
            parts = data_item.split(':')

            if len(parts) > 1:
                data_item = normalize_data_name(
                    parts[0], use_data_config=False) + ':' + parts[1]

            floyd_command += ["--data", data_item]
    if tensorboard:
        floyd_command.append("--tensorboard")
    if mode and mode != "job":
        floyd_command += ["--mode", mode]
        if mode == 'jupyter':
            if not open_notebook:
                floyd_command.append("--no-open")
    else:
        if command_str:
            floyd_command.append(shell_quote(command_str))
    return ' '.join(floyd_command)
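A hedged usage example (hypothetical argument values; INSTANCE_NAME_MAP and DEFAULT_ENV are assumed to come from the surrounding module):

# Hypothetical call in the default 'job' mode, so command_str is appended.
cmd = get_command_line(instance_type=None, env='tensorflow-1.12',
                       message='first run', data=['foo/bar/1:input'],
                       mode='job', open_notebook=True, tensorboard=False,
                       command_str='python train.py')
# Roughly: floyd run --env tensorflow-1.12 --message 'first run'
#          --data foo/datasets/bar/1:input 'python train.py'
# (the data name is normalized as in the tests above)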
Example #4
def delete(ids, yes):
    """
    Delete data sets.
    """
    failures = False

    for id in ids:
        data_source = DataClient().get(id)
        if not data_source:
            failures = True
            continue

        data_name = normalize_data_name(data_source.name)
        suffix = data_name.split('/')[-1]
        if not suffix.isdigit():
            failures = True
            floyd_logger.error('%s is not a dataset, skipped.', id)
            if suffix == 'output':
                floyd_logger.error(
                    'To delete job output, please delete the job itself.')
            continue

        if not yes and not click.confirm("Delete Data: {}?".format(data_name),
                                         abort=False,
                                         default=False):
            floyd_logger.info("Data %s: Skipped", data_name)
            continue

        if not DataClient().delete(data_source.id):
            failures = True
        else:
            floyd_logger.info("Data %s: Deleted", data_name)

    if failures:
        sys.exit(1)
Example #5
def info(job_name_or_id):
    """
    Prints detailed info for the run
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(job_name_or_id))
    except FloydException:
        experiment = ExperimentClient().get(job_name_or_id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    table = [["Job name", normalize_job_name(experiment.name)],
             [
                 "Output name",
                 normalize_data_name(experiment.name +
                                     '/output') if task_instance else None
             ], ["Created", experiment.created_pretty],
             ["Status", experiment.state],
             ["Duration(s)", experiment.duration_rounded],
             ["Instance", experiment.instance_type_trimmed],
             ["Description", experiment.description]]
    if task_instance and task_instance.mode in ['jupyter', 'serving']:
        table.append(["Mode", task_instance.mode])
        table.append(["Url", experiment.service_url])
    if experiment.tensorboard_url:
        table.append(["Tensorboard", experiment.tensorboard_url])
    floyd_logger.info(tabulate(table))
Example #6
def get_data_object(data_id, use_data_config=True):
    """
    Normalize the data_id and query the server.
    If that is unavailable try the raw ID
    """
    normalized_data_reference = normalize_data_name(data_id, use_data_config=use_data_config)
    client = DataClient()
    data_obj = client.get(normalized_data_reference)

    # Try with the raw ID
    if not data_obj and data_id != normalized_data_reference:
        data_obj = client.get(data_id)

    return data_obj
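A short hedged usage example (the dataset name is hypothetical):

# Resolve a friendly name first; the helper falls back to the raw id.
data_obj = get_data_object('foo/datasets/mnist/1', use_data_config=False)
if data_obj:
    print(data_obj.id, data_obj.resource_id)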
Example #7
def print_data(data_sources):
    """
    Print data information in tabular form
    """
    if not data_sources:
        return

    headers = ["DATA NAME", "CREATED", "STATUS", "DISK USAGE"]
    data_list = []
    for data_source in data_sources:
        data_list.append([
            normalize_data_name(data_source.name), data_source.created_pretty,
            data_source.state, data_source.size
        ])
    floyd_logger.info(tabulate(data_list, headers=headers))
Example #8
def status(id):
    """
    Show the status of a dataset with the given id or friendly name.
    It can also list the status of all datasets in the project.
    """
    if id:
        data_source = DataClient().get(normalize_data_name(id))

        if not data_source:
            # Try with the raw ID
            data_source = DataClient().get(id)

        print_data([data_source] if data_source else [])
    else:
        data_sources = DataClient().get_all()
        print_data(data_sources)
Example #9
def listfiles(data_name):
    """
    List files in the given dataset
    """

    data_source = DataClient().get(
        normalize_data_name(data_name, use_data_config=False))
    if data_name and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(data_name)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish."
            )
        sys.exit()

    # Depth-first search
    dirs = ['']
    paths = []
    while dirs:
        cur_dir = dirs.pop()
        url = "/resources/{}/{}?content=true".format(data_source.resource_id,
                                                     cur_dir)
        response = DataClient().request("GET", url).json()

        if response['skipped_files'] > 0:
            floyd_logger.info(
                "Warning: in directory '%s', %s/%s files skipped (too many files)",
                cur_dir, response['skipped_files'], response['total_files'])

        files = response['files']
        files.sort(key=lambda f: f['name'])
        for f in files:
            path = os.path.join(cur_dir, f['name'])
            if f['type'] == 'directory':
                path += os.sep
            paths.append(path)

            if f['type'] == 'directory':
                dirs.append(os.path.join(cur_dir, f['name']))
    for path in paths:
        floyd_logger.info(path)
Example #10
def output(id, url):
    """
    Shows the url of the dataset. You can use id or a friendly URI.
    By default opens the output page in your default browser.
    """
    data_source = DataClient().get(normalize_data_name(id))
    if id and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(id)

    if not data_source:
        sys.exit()

    data_url = "%s/%s" % (floyd.floyd_web_host, data_source.name)
    if url:
        floyd_logger.info(data_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
Example #11
def getfile(data_name, path):
    """
    Get the specified individual file from a dataset
    """

    data_source = DataClient().get(
        normalize_data_name(data_name, use_data_config=False))
    if data_name and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(data_name)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish."
            )
        sys.exit()

    url = "{}/api/v1/resources/{}/{}?content=true".format(
        floyd.floyd_host, data_source.resource_id, path)
    fname = os.path.basename(path)
    DataClient().download(url, filename=fname)
Example #12
def clone(id):
    """
    Download the contents of a dataset (or job output) to the current directory
    """

    data_source = DataClient().get(
        normalize_data_name(id, use_data_config=False))
    if id and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(id)

    if not data_source:
        if 'output' in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish."
            )
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, data_source.resource_id)
    DataClient().download_tar(url=data_url,
                              untar=True,
                              delete_after_untar=True)
Example #13
def complete_upload(data_config):
    data_endpoint = data_config.data_endpoint
    data_id = data_config.data_id
    tarball_path = data_config.tarball_path

    if not data_id:
        floyd_logger.error("Corrupted upload state, please start a new one.")
        sys.exit(1)

    # check for tarball upload, upload to server if not done
    if not data_config.resource_id and (tarball_path and data_endpoint):
        floyd_logger.debug("Getting fresh upload credentials")
        creds = DataClient().new_tus_credentials(data_id)
        if not creds:
            sys.exit(1)

        file_size = os.path.getsize(tarball_path)
        # check against the maximum upload size
        if file_size > MAX_UPLOAD_SIZE:
            try:
                floyd_logger.info("Removing compressed data...")
                rmtree(os.path.dirname(tarball_path))
            except (OSError, TypeError):
                pass

            sys.exit(("Data size too large to upload, please keep it under %s.\n") %
                     (sizeof_fmt(MAX_UPLOAD_SIZE)))

        floyd_logger.info("Uploading compressed data. Total upload size: %s",
                          sizeof_fmt(file_size))
        tus_client = TusDataClient()
        if not tus_client.resume_upload(tarball_path, data_endpoint, auth=creds):
            floyd_logger.error("Failed to finish upload!")
            return

        try:
            floyd_logger.info("Removing compressed data...")
            rmtree(os.path.dirname(tarball_path))
        except (OSError, TypeError):
            pass

        floyd_logger.debug("Created data with id : %s", data_id)
        floyd_logger.info("Upload finished.")

        # Update data config
        data_config.set_tarball_path(None)
        data_config.set_data_endpoint(None)
        data_source = DataClient().get(data_id)
        data_config.set_resource_id(data_source.resource_id)
        DataConfigManager.set_config(data_config)

    # data tarball uploaded, check for server untar
    if data_config.resource_id:
        floyd_logger.info(
            "Waiting for server to unpack data.\n"
            "You can exit at any time and come back to check the status with:\n"
            "\tfloyd data upload -r")
        try:
            for i in dots(ResourceWaitIter(data_config.resource_id),
                          label='Waiting for unpack...'):
                pass
        except WaitTimeoutException:
            clint_STREAM.write('\n')
            clint_STREAM.flush()
            floyd_logger.info(
                "Looks like it is going to take longer for Floydhub to unpack "
                "your data. Please check back later.")
            sys.exit(1)
        else:
            data_config.set_resource_id(None)
            data_config.set_tarball_path(None)
            data_config.set_data_endpoint(None)
            data_config.set_resource_id(None)
            data_config.set_data_id(None)
            DataConfigManager.set_config(data_config)

    # Print output
    table_output = [["NAME"],
                    [normalize_data_name(data_config.data_name)]]
    floyd_logger.info('')
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
Example #14
    def test_normalize_data_name(self, _0, _1, _2, _3, mock_get_config):
        mock_get_config.return_value.namespace = None
        from floyd.cli.utils import normalize_data_name
        assert normalize_data_name('foo/bar/1') == 'foo/datasets/bar/1'
        assert normalize_data_name(
            'foo/datasets/bar/1') == 'foo/datasets/bar/1'
        assert normalize_data_name(
            'foo/bar/1/output') == 'foo/projects/bar/1/output'
        assert normalize_data_name(
            'foo/projects/bar/1/output') == 'foo/projects/bar/1/output'
        # Make sure that the current_username and current_project_name are
        # honored:
        assert normalize_data_name('1') == 'pete/datasets/test_dataset/1'
        assert normalize_data_name('mnist/3') == 'pete/datasets/mnist/3'
        assert normalize_data_name('foo/mnist/3') == 'foo/datasets/mnist/3'

        # current_username and current_project_name are overridden with the
        # second and third args if passed
        assert normalize_data_name('bar/1', 'yoyo') == 'yoyo/datasets/bar/1'
        assert normalize_data_name('1', 'yoyo', 'ma') == 'yoyo/datasets/ma/1'

        # Full job names are returned unchanged
        assert normalize_data_name(
            'foo/projects/bar/1') == 'foo/projects/bar/1'

        # If no job number is passed, it is not used
        assert normalize_data_name('foo/datasets/bar') == 'foo/datasets/bar'