Example #1
def update_datasets(
    client,
    names,
    creators,
    include,
    exclude,
    ref,
    delete,
    external=False,
    progress_context=contextlib.nullcontext,
    commit_message=None,
):
    """Update files from a remote Git repo."""
    records = _filter(client,
                      names=names,
                      creators=creators,
                      include=include,
                      exclude=exclude)

    if not records:
        raise ParameterError("No files matched the criteria.")

    possible_updates = []
    unique_remotes = set()
    external_files = []

    for file_ in records:
        if file_.based_on:
            possible_updates.append(file_)
            unique_remotes.add(file_.based_on.source)
        elif file_.external:
            external_files.append(file_)

    if ref and len(unique_remotes) > 1:
        raise ParameterError(
            'Cannot use "--ref" with more than one Git repository.\n'
            "Limit list of files to be updated to one repository. See"
            '"renku dataset update -h" for more information.')

    if external_files:
        if external:
            client.update_external_files(external_files)
        else:
            click.echo("To update external files run update command with "
                       '"--external" flag.')

    with progress_context(possible_updates,
                          item_show_func=lambda x: x.path
                          if x else None) as progressbar:
        deleted_files = client.update_dataset_files(files=progressbar,
                                                    ref=ref,
                                                    delete=delete)

    if deleted_files and not delete:
        click.echo(
            "Some files are deleted from remote. To also delete them locally "
            "run update command with `--delete` flag.")
Example #2
def update_datasets(
    client,
    names,
    creators,
    include,
    exclude,
    ref,
    delete,
    progress_context=contextlib.nullcontext,
    commit_message=None,
):
    """Update files from a remote Git repo."""
    records = _filter(client,
                      names=names,
                      creators=creators,
                      include=include,
                      exclude=exclude)

    if not records:
        raise ParameterError('No files matched the criteria.')

    datasets = {}
    possible_updates = []
    unique_remotes = set()

    for file_ in records:
        if file_.based_on:
            dataset_name = file_.dataset
            dataset = datasets.get(dataset_name)

            if not dataset:
                dataset = client.load_dataset(name=dataset_name)
                datasets[dataset_name] = dataset

            file_.dataset = dataset
            possible_updates.append(file_)
            unique_remotes.add(file_.based_on.url)

    if ref and len(unique_remotes) > 1:
        raise ParameterError(
            'Cannot use "--ref" with more than one Git repository.\n'
            'Limit list of files to be updated to one repository. See'
            '"renku dataset update -h" for more information.')

    with progress_context(possible_updates,
                          item_show_func=lambda x: x.path
                          if x else None) as progressbar:
        deleted_files = client.update_dataset_files(files=progressbar,
                                                    ref=ref,
                                                    delete=delete)

    if deleted_files and not delete:
        click.echo(
            'Some files are deleted from remote. To also delete them locally '
            'run update command with `--delete` flag.')
Example #3
def remove_dataset_tags(client, short_name, tags, commit_message=None):
    """Removes tags from a dataset."""
    dataset = client.load_dataset(short_name)
    if not dataset:
        raise ParameterError('Dataset not found.')

    try:
        dataset = client.remove_dataset_tags(dataset, tags)
    except ValueError as e:
        raise ParameterError(e)

    dataset.to_yaml()
Example #4
def tag_dataset(client, short_name, tag, description, force=False):
    """Creates a new tag for a dataset."""
    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise ParameterError('Dataset not found.')

    try:
        dataset = client.add_dataset_tag(dataset_, tag, description, force)
    except ValueError as e:
        raise ParameterError(e)

    dataset.to_yaml()
Example #5
def dataset_remove(
    client,
    short_names,
    with_output=False,
    datasetscontext=contextlib.nullcontext,
    referencescontext=contextlib.nullcontext,
    commit_message=None
):
    """Delete a dataset."""
    datasets = {name: client.get_dataset_path(name) for name in short_names}

    if not datasets:
        raise ParameterError(
            'use dataset short_name or identifier', param_hint='short_names'
        )

    unknown = [
        name
        for name, path in datasets.items() if not path or not path.exists()
    ]
    if unknown:
        raise ParameterError(
            'unknown datasets ' + ', '.join(unknown), param_hint='short_names'
        )

    datasets = set(datasets.values())
    references = list(LinkReference.iter_items(client, common_path='datasets'))

    if not with_output:
        for dataset in datasets:
            if dataset and dataset.exists():
                dataset.unlink()

        for ref in references:
            if ref.reference in datasets:
                ref.delete()

        return datasets, references

    datasets_c = datasetscontext(datasets)

    with datasets_c as bar:
        for dataset in bar:
            if dataset and dataset.exists():
                dataset.unlink()

    references_c = referencescontext(references)

    with references_c as bar:
        for ref in bar:
            if ref.reference in datasets:
                ref.delete()
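
An illustrative call sketch for dataset_remove, assuming a configured client; with_output=True routes the dataset and reference deletions through progress bars (click.progressbar satisfies both context hooks):

dataset_remove(
    client,
    short_names=["my-dataset"],          # placeholder dataset name
    with_output=True,
    datasetscontext=click.progressbar,
    referencescontext=click.progressbar,
)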
Example #6
def list_tags(client, short_name, format):
    """List all tags for a dataset."""
    dataset_ = client.load_dataset(short_name)

    if not dataset_:
        raise ParameterError('Dataset not found.')

    tags = sorted(dataset_.tags, key=lambda t: t.created)

    return DATASET_TAGS_FORMATS[format](client, tags)
Example #7
def file_unlink(
    client,
    short_name,
    include,
    exclude,
    interactive=False,
    yes=False,
    commit_message=None
):
    """Remove matching files from a dataset."""
    if not include and not exclude:
        raise ParameterError((
            'include or exclude filters not found.\n'
            'Check available filters with `renku dataset unlink --help`\n'
            'Hint: `renku dataset unlink mydataset -I myfile`'
        ))

    dataset = client.load_dataset(short_name=short_name)

    if not dataset:
        raise ParameterError('Dataset does not exist.')

    records = _filter(
        client, short_names=[short_name], include=include, exclude=exclude
    )
    if not records:
        raise ParameterError('No records found.')

    if interactive and not yes:
        prompt_text = (
            f'You are about to remove the following from "{short_name}" dataset.' +
            '\n' + '\n'.join([str(record.full_path) for record in records]) +
            '\nDo you wish to continue?'
        )
        click.confirm(WARNING + prompt_text, abort=True)

    for item in records:
        dataset.unlink_file(item.path)

    dataset.to_yaml()

    return records
Example #8
def file_unlink(client, name, include, exclude, commit_message=None):
    """Remove matching files from a dataset."""
    dataset = client.load_dataset(name=name)

    if not dataset:
        raise ParameterError('Dataset does not exist.')

    records = _filter(client,
                      names=[dataset.name],
                      include=include,
                      exclude=exclude)
    if not records:
        raise ParameterError('No records found.')

    yield records

    for item in records:
        dataset.unlink_file(item.path)

    dataset.to_yaml()
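
Unlike Example #7, this variant is a generator: it yields the matched records before unlinking anything so the caller can confirm the operation. A sketch of how calling code might drive it (the confirmation prompt is illustrative glue, not renku API):

gen = file_unlink(client, name="my-dataset", include=["*.csv"], exclude=None)
records = next(gen)  # matched files are yielded before anything is removed
click.confirm(
    "Unlink {} file(s) from the dataset?".format(len(records)), abort=True
)
try:
    next(gen)        # resume the generator so it unlinks the files and saves
except StopIteration:
    pass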
Example #9
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))

    except LookupError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Reason: {1}'.format(uri, str(e)))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin(
                    'https://doi.org', dataset.identifier
                )
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
Example #10
def export_dataset(
    client,
    short_name,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
    dataverse_server_url=None,
    dataverse_name=None,
):
    """Export data to 3rd party provider.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
             ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    config_key_secret = 'access_token'
    provider_id = provider.lower()

    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise DatasetNotFound(name=short_name)

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ParameterError('Unknown provider.')

    provider.set_parameters(
        client,
        dataverse_server_url=dataverse_server_url,
        dataverse_name=dataverse_name
    )

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)

        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))

        selected_commit = selected_tag.commit
    elif dataset_.tags and len(dataset_.tags) > 0 and handle_tag_selection_fn:
        tag_result = handle_tag_selection_fn(dataset_.tags)

        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

            # If the tag is created automatically for imported datasets, it
            # does not have the dataset yet and we need to use the next commit
            with client.with_commit(selected_commit):
                test_ds = client.load_dataset(short_name)
            if not test_ds:
                commits = client.dataset_commits(dataset_)
                next_commit = selected_commit
                for commit in commits:
                    if commit.hexsha == selected_commit:
                        selected_commit = next_commit.hexsha
                        break
                    next_commit = commit

    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(short_name)
        if not dataset_:
            raise DatasetNotFound(name=short_name)

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if access_token is None or len(access_token) == 0:
                raise InvalidAccessToken()

            client.set_value(
                provider_id, config_key_secret, access_token, global_only=True
            )
            exporter.set_access_token(access_token)

        try:
            destination = exporter.export(publish=publish, tag=selected_tag)
        except errors.AuthenticationError:
            client.remove_value(
                provider_id, config_key_secret, global_only=True
            )
            raise

    result = 'Exported to: {0}'.format(destination)
    return result
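
The handle_access_token_fn and handle_tag_selection_fn callbacks let the calling layer own all prompting. A rough sketch of what such callbacks could look like (purely illustrative helpers; the tag objects expose a created attribute, as Example #6 relies on):

def _prompt_for_token(exporter):
    # Ask the user for a provider access token; obtaining one is provider-specific.
    return click.prompt("Access token", hide_input=True)

def _pick_latest_tag(tags):
    # Skip interactive selection and default to the most recently created tag.
    return sorted(tags, key=lambda t: t.created)[-1]

result = export_dataset(
    client,
    short_name="my-dataset",
    provider="zenodo",
    publish=False,
    tag=None,
    handle_access_token_fn=_prompt_for_token,
    handle_tag_selection_fn=_pick_latest_tag,
)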
Example #11
def _add_to_dataset(
    client,
    urls,
    short_name,
    external=False,
    force=False,
    overwrite=False,
    create=False,
    sources=(),
    destination='',
    ref=None,
    with_metadata=None,
    urlscontext=contextlib.nullcontext,
    commit_message=None,
    extract=False,
    all_at_once=False,
    destination_names=None,
    progress=None,
    interactive=False,
    total_size=None,
):
    """Add data to a dataset."""
    if len(urls) == 0:
        raise UsageError('No URL is specified')
    if sources and len(urls) > 1:
        raise UsageError('Cannot use "--source" with multiple URLs.')

    if interactive:
        if total_size is None:
            total_size = 0
            for url in urls:
                try:
                    with requests.get(url, stream=True) as r:
                        total_size += int(r.headers.get('content-length', 0))
                except requests.exceptions.RequestException:
                    pass
        usage = shutil.disk_usage(client.path)

        if total_size > usage.free:
            mb = 2**20
            message = 'Insufficient disk space (required: {:.2f} MB' \
                      '/available: {:.2f} MB). '.format(
                          total_size/mb, usage.free/mb
                      )
            raise OperationError(message)

    try:
        with client.with_dataset(
            short_name=short_name, create=create
        ) as dataset:
            with urlscontext(urls) as bar:
                warning_messages, messages = client.add_data_to_dataset(
                    dataset,
                    bar,
                    external=external,
                    force=force,
                    overwrite=overwrite,
                    sources=sources,
                    destination=destination,
                    ref=ref,
                    extract=extract,
                    all_at_once=all_at_once,
                    destination_names=destination_names,
                    progress=progress,
                )

            if messages:
                for msg in messages:
                    click.echo(INFO + msg)

            if warning_messages:
                for msg in warning_messages:
                    click.echo(WARNING + msg)

            if with_metadata:
                for file_ in dataset.files:
                    file_.creator = with_metadata.creator
                    file_.based_on = None
                # dataset has the correct list of files
                with_metadata.files = dataset.files
                with_metadata.url = dataset._id

                dataset.update_metadata(with_metadata)
                dataset.same_as = with_metadata.same_as

    except DatasetNotFound:
        raise DatasetNotFound(
            message='Dataset "{0}" does not exist.\n'
            'Use "renku dataset create {0}" to create the dataset or retry '
            '"renku dataset add {0}" command with "--create" option for '
            'automatic dataset creation.'.format(short_name)
        )
    except (FileNotFoundError, git.exc.NoSuchPathError) as e:
        raise ParameterError(
            'Could not find paths/URLs: \n{0}'.format('\n'.join(urls))
        ) from e
Example #12
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError(
            ("Could not process {0}.\n"
             "Unable to fetch metadata due to {1}".format(uri, e)))

    except LookupError as e:
        raise ParameterError(("Could not process {0}.\n"
                              "Reason: {1}".format(uri, str(e))))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub("[^a-zA-Z0-9.-_]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        sources = [f"{dataset.data_dir}/**"]
        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:  # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )
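
A hedged call sketch for this import_dataset variant (the URI and name are placeholders; any real import depends on a provider that ProviderFactory can resolve for the given URI):

import_dataset(
    client,
    uri="https://example.org/record/1",  # placeholder provider record URI
    name="imported-data",
    extract=False,
    with_prompt=True,
    yes=False,
)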
Example #13
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset = record.as_dataset(client)
        files = dataset.files

        if with_prompt:
            click.echo(
                tabulate(files,
                         headers=OrderedDict((
                             ('checksum', None),
                             ('filename', 'name'),
                             ('size_in_mb', 'size (mb)'),
                             ('filetype', 'type'),
                         ))))

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')) + text_prompt

            click.confirm(text_prompt, abort=True)

    except KeyError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Unable to fetch metadata due to {1}'.format(uri, e)))

    except LookupError:
        raise ParameterError(('Could not process {0}.\n'
                              'URI not found.'.format(uri)))

    if files:
        if not short_name:
            short_name = generate_default_short_name(dataset.name,
                                                     dataset.version)

        dataset.url = remove_credentials(dataset.url)

        add_to_dataset(
            client,
            urls=[f.url for f in files],
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            progress=progress,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version))
Example #14
def add_to_dataset(
    client,
    urls,
    short_name,
    link=False,
    force=False,
    create=False,
    sources=(),
    destination='',
    ref=None,
    with_metadata=None,
    urlscontext=contextlib.nullcontext,
    commit_message=None,
    extract=False,
    all_at_once=False,
    progress=None,
):
    """Add data to a dataset."""
    if len(urls) == 0:
        raise UsageError('No URL is specified')
    if (sources or destination) and len(urls) > 1:
        raise UsageError(
            'Cannot add multiple URLs with --source or --destination')

    try:
        with client.with_dataset(short_name=short_name,
                                 create=create) as dataset:
            with urlscontext(urls) as bar:
                warning_message = client.add_data_to_dataset(
                    dataset,
                    bar,
                    link=link,
                    force=force,
                    sources=sources,
                    destination=destination,
                    ref=ref,
                    extract=extract,
                    all_at_once=all_at_once,
                    progress=progress,
                )

            if warning_message:
                click.echo(WARNING + warning_message)

            if with_metadata:
                for file_ in dataset.files:
                    file_.creator = with_metadata.creator
                # dataset has the correct list of files
                with_metadata.files = dataset.files

                dataset.update_metadata(with_metadata)

    except DatasetNotFound:
        raise DatasetNotFound(
            'Dataset "{0}" does not exist.\n'
            'Use "renku dataset create {0}" to create the dataset or retry '
            '"renku dataset add {0}" command with "--create" option for '
            'automatic dataset creation.'.format(short_name))
    except (FileNotFoundError, git.exc.NoSuchPathError) as e:
        raise ParameterError('Could not find paths/URLs: \n{0}'.format(
            '\n'.join(urls))) from e
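
A minimal call sketch, again assuming a configured client (the URL and dataset name are placeholders). The urlscontext hook has the same shape as progress_context above, so click.progressbar can be dropped in to show which URL is currently being added:

add_to_dataset(
    client,
    urls=["https://example.com/data.csv"],  # placeholder URL
    short_name="my-dataset",
    create=True,                            # create the dataset if it is missing
    urlscontext=click.progressbar,
)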