Example #1
from collections import OrderedDict

import click


def tabular(client, datasets):
    """Format datasets with a tabular output."""
    from renku.models._tabulate import tabulate

    click.echo(
        tabulate(
            datasets,
            headers=OrderedDict((
                ('uid', 'id'),
                ('display_name', None),
                ('version', None),
                ('created', None),
                ('creators_csv', 'creators'),
            )),
        ))
Example #2
from collections import OrderedDict

import click


def tabular(client, datasets=None):
    """Format datasets with a tabular output."""
    from renku.models._tabulate import tabulate

    datasets = datasets or client.datasets

    click.echo(
        tabulate(
            datasets.values(),
            headers=OrderedDict((
                ('short_id', 'id'),
                ('name', None),
                ('created', None),
                ('authors_csv', 'authors'),
            )),
        ))
Example #3
from collections import OrderedDict

from click import echo_via_pager


def tabular(client, records):
    """Format dataset files with a tabular output.

    :param client: LocalClient instance.
    :param records: Filtered collection.
    """
    from renku.models._tabulate import tabulate

    echo_via_pager(
        tabulate(
            records,
            headers=OrderedDict((
                ('added', None),
                ('creators_csv', 'creators'),
                ('dataset', None),
                ('full_path', 'path'),
            )),
        ))
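
All three tabular helpers above follow the same pattern: renku.models._tabulate.tabulate receives a collection of objects plus an OrderedDict mapping attribute names to column labels, where a None label appears to mean "reuse the attribute name as the header". A minimal sketch of such a helper on top of the PyPI "tabulate" package, assuming exactly that contract, could look like this:

from collections import OrderedDict, namedtuple

from tabulate import tabulate as _tabulate  # the PyPI "tabulate" package


def tabulate(collection, headers):
    """Sketch of the assumed renku.models._tabulate.tabulate contract."""
    names = list(headers)
    # One row per object, pulling the listed attributes in header order.
    rows = [[getattr(item, name) for name in names] for item in collection]
    # Assumption: a None label means "use the attribute name as the header".
    labels = [label or name for name, label in headers.items()]
    return _tabulate(rows, headers=labels)


# Usage with the columns from Example #2:
Dataset = namedtuple('Dataset', ['short_id', 'name', 'created'])
print(tabulate(
    [Dataset('deadbeef', 'iris', '2019-01-01')],
    headers=OrderedDict((
        ('short_id', 'id'),
        ('name', None),
        ('created', None),
    )),
))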
Example #4
import multiprocessing as mp
import os
import tempfile
from collections import OrderedDict
from multiprocessing import RLock, freeze_support
from pathlib import Path

import click
from click import BadParameter
from tqdm import tqdm

# ProviderFactory, WARNING, download_file, write_dataset, add_to_dataset and
# tabulate are renku-internal helpers assumed to be in scope in this module.


def import_(ctx, client, uri, name, extract):
    """Import data from a 3rd party provider.

    Supported providers: [Zenodo]
    """
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise BadParameter('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset_ = record.as_dataset(client)
        files_ = dataset_.files

        click.echo(
            tabulate(
                files_,
                headers=OrderedDict((
                    ('checksum', None),
                    ('filename', 'name'),
                    ('size_in_mb', 'size (mb)'),
                    ('filetype', 'type'),
                ))
            )
        )

        text_prompt = 'Do you wish to download this version?'
        if not record.is_last_version(uri):
            text_prompt = WARNING + 'Newer version found.\n' + text_prompt

    except KeyError as e:
        raise BadParameter((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))

    except LookupError:
        raise BadParameter(
            ('Could not process {0}.\n'
             'URI not found.'.format(uri))
        )

    if files_ and click.confirm(text_prompt):
        data_folder = tempfile.mkdtemp()

        pool_size = min(
            int(os.getenv('RENKU_POOL_SIZE',
                          mp.cpu_count() // 2)), 4
        )

        freeze_support()  # Windows support
        pool = mp.Pool(
            pool_size,
            # Windows support
            initializer=tqdm.set_lock,
            initargs=(RLock(), )
        )

        processing = [
            pool.apply_async(
                download_file, args=(
                    i,
                    extract,
                    data_folder,
                    file_,
                )
            ) for i, file_ in enumerate(files_)
        ]

        for p in processing:
            p.wait()
        pool.close()

        dataset_name = name or dataset_.display_name
        if write_dataset(client, dataset_name):
            add_to_dataset(
                client,
                urls=[str(p) for p in Path(data_folder).glob('*')],
                name=dataset_name,
                with_metadata=dataset_
            )

            click.secho('OK', fg='green')
Example #5
import multiprocessing as mp
import os
import tempfile
from collections import OrderedDict
from multiprocessing import RLock, freeze_support
from pathlib import Path

import click
from click import BadParameter
from requests import HTTPError
from tqdm import tqdm

# ProviderFactory, WARNING, download_file, write_dataset, add_to_dataset and
# tabulate are renku-internal helpers assumed to be in scope in this module.


def import_(ctx, client, uri, name, extract):
    """Import data from a 3rd party provider.

    Supported providers: [Zenodo, Dataverse]
    """
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise BadParameter('Could not process {0}.\n{1}'.format(uri, err))
    elif err:
        click.echo(WARNING + err)

    try:
        record = provider.find_record(uri)
        dataset_ = record.as_dataset(client)
        files_ = dataset_.files

        click.echo(
            tabulate(files_,
                     headers=OrderedDict((
                         ('checksum', None),
                         ('filename', 'name'),
                         ('size_in_mb', 'size (mb)'),
                         ('filetype', 'type'),
                     ))))

        text_prompt = 'Do you wish to download this version?'
        if not record.is_last_version(uri):
            text_prompt = WARNING + 'Newer version found at {}\n'.format(
                record.links.get('latest_html')) + text_prompt

    except KeyError as e:
        raise BadParameter(
            ('Could not process {0}.\n'
             'Unable to fetch metadata due to {1}'.format(uri, e)))

    except LookupError:
        raise BadParameter(('Could not process {0}.\n'
                            'URI not found.'.format(uri)))

    if files_ and click.confirm(text_prompt):
        data_folder = tempfile.mkdtemp()

        pool_size = min(int(os.getenv('RENKU_POOL_SIZE',
                                      mp.cpu_count() // 2)), 4)

        manager = mp.Manager()
        id_queue = manager.Queue()

        for i in range(pool_size):
            id_queue.put(i)

        def _init(lock, id_queue):
            """Set up the tqdm lock and the worker process index.

            See https://stackoverflow.com/a/42817946.
            Fixes tqdm line positions when the number of files exceeds the
            terminal height, so at most ``pool_size`` progress bars are
            shown at a time.
            """
            global current_process_position
            current_process_position = id_queue.get()
            tqdm.set_lock(lock)

        freeze_support()  # Windows support
        pool = mp.Pool(
            pool_size,
            # Windows support
            initializer=_init,
            initargs=(RLock(), id_queue))

        processing = [
            pool.apply_async(download_file,
                             args=(
                                 extract,
                                 data_folder,
                                 file_,
                             )) for file_ in files_
        ]

        try:
            for p in processing:
                p.get()  # Will internally do the wait() as well.

        except HTTPError as e:
            raise BadParameter(('Could not process {0}.\n'
                                'URI not found.'.format(e.request.url)))
        pool.close()

        dataset_name = name or dataset_.display_name
        if write_dataset(client, dataset_name):
            add_to_dataset(client,
                           urls=[str(p) for p in Path(data_folder).glob('*')],
                           name=dataset_name,
                           with_metadata=dataset_)

            click.secho('OK', fg='green')
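
In Example #5, _init stores a bounded per-worker index in the module-level global current_process_position, so each worker can pin its tqdm progress bar to a fixed terminal line. The real download_file is a renku-internal helper; a hypothetical worker that consumes that global might look like this (file_.filename comes from the tabulated columns above, while file_.download_iter() is an assumed provider API):

from pathlib import Path

from tqdm import tqdm


def download_file(extract, data_folder, file_):
    """Hypothetical worker sketch; the real helper is renku-internal."""
    # ``current_process_position`` was put into this module's globals by
    # ``_init`` when the pool spawned this worker process.
    destination = Path(data_folder) / file_.filename
    with tqdm(desc=file_.filename,
              position=current_process_position,
              leave=False) as progress:
        with open(str(destination), 'wb') as dst:
            # Assumption: the provider's file object yields its contents
            # in chunks via ``download_iter()``.
            for chunk in file_.download_iter():
                dst.write(chunk)
                progress.update(len(chunk))
    # Handling of the ``extract`` flag (unpacking archives) is omitted here.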