Example #1
def include_exclude_mapping(
    deps: Dependencies,
    include: typing.Optional[typing.Union[str, typing.Sequence[str]]],
    exclude: typing.Optional[typing.Union[str, typing.Sequence[str]]],
) -> typing.Sequence[str]:
    r"""Map include and exclude to media argument."""
    media = None

    if include is not None:
        if isinstance(include, str):
            pattern = re.compile(include)
            archives = {deps.archive(f) for f in deps.media}
            include = [a for a in archives if pattern.search(a)]
        media = [x for x in deps.media if deps.archive(x) in include]

    if media is None:
        media = deps.media

    if exclude is not None:
        if isinstance(exclude, str):
            pattern = re.compile(exclude)
            archives = {deps.archive(f) for f in deps.media}
            exclude = [a for a in archives if pattern.search(a)]
        media = [x for x in media if deps.archive(x) not in exclude]

    return media
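A minimal sketch of how the mapping behaves, using a hypothetical stand-in for Dependencies (only the media property and archive() method are mocked; real audb objects offer more):

import re
import typing

class FakeDeps:
    # hypothetical stand-in: each file's archive is its top folder
    media = ['a/1.wav', 'a/2.wav', 'b/1.wav']

    def archive(self, file):
        return file.split('/')[0]

deps = FakeDeps()
# a string is treated as a regex over archive names ...
print(include_exclude_mapping(deps, include='a', exclude=None))
# -> ['a/1.wav', 'a/2.wav']
# ... while a sequence is matched against archive names literally
print(include_exclude_mapping(deps, include=None, exclude=['b']))
# -> ['a/1.wav', 'a/2.wav']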
Example #2
def _find_tables(
    db: audformat.Database,
    db_root: str,
    version: str,
    deps: Dependencies,
    verbose: bool,
) -> typing.List[str]:
    r"""Update tables."""

    # release dependencies to removed tables

    db_tables = [f'db.{table}.csv' for table in db.tables]
    for file in set(deps.tables) - set(db_tables):
        deps._drop(file)

    tables = []
    for table in audeer.progress_bar(
            db.tables,
            desc='Find tables',
            disable=not verbose,
    ):
        file = f'db.{table}.csv'
        checksum = audbackend.md5(os.path.join(db_root, file))
        if file not in deps or checksum != deps.checksum(file):
            deps._add_meta(file, version, table, checksum)
            tables.append(table)

    return tables
Example #3
def _put_media(
    media: typing.Set[str],
    db_root: str,
    db_name: str,
    version: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
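    r"""Upload archives with new, altered or removed media."""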
    # create a mapping from archives to media and
    # select archives with new or altered files for upload
    map_media_to_files = collections.defaultdict(list)
    for file in deps.media:
        if not deps.removed(file):
            map_media_to_files[deps.archive(file)].append(file)
            if deps.version(file) == version:
                media.add(deps.archive(file))

    lock = threading.Lock()

    def job(archive):
        if archive in map_media_to_files:
            for file in map_media_to_files[archive]:
                with lock:
                    deps._add_media(db_root, file, version)
            archive_file = backend.join(
                db_name,
                define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
                archive,
            )
            backend.put_archive(
                db_root,
                map_media_to_files[archive],
                archive_file,
                version,
            )

    # upload new and altered archives if they contain at least one file
    audeer.run_tasks(
        job,
        params=[([archive], {}) for archive in media],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Put media',
    )
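The params format expected by audeer.run_tasks pairs a list of positional arguments with a dict of keyword arguments per task. A small self-contained sketch, assuming audeer is installed and that run_tasks collects results in task order:

import audeer

def add(x, y, *, offset=0):
    return x + y + offset

# each entry is ([positional args], {keyword args}) for one call of add()
results = audeer.run_tasks(
    add,
    params=[([1, 2], {'offset': 10}), ([3, 4], {})],
    num_workers=2,
)
print(results)  # -> [13, 7] (one result per task)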
Example #4
def _cached_files(
    files: typing.Sequence[str],
    deps: Dependencies,
    cached_versions: typing.Sequence[
        typing.Tuple[LooseVersion, str, Dependencies]
    ],
    flavor: typing.Optional[Flavor],
    verbose: bool,
) -> typing.Tuple[
    typing.Sequence[typing.Tuple[str, str]],
    typing.Sequence[str],
]:
    r"""Find cached files."""

    cached_files = []
    missing_files = []

    for file in audeer.progress_bar(
            files,
            desc='Cached files',
            disable=not verbose,
    ):
        found = False
        file_version = LooseVersion(deps.version(file))
        for cache_version, cache_root, cache_deps in cached_versions:
            if cache_version >= file_version:
                if file in cache_deps:
                    if deps.checksum(file) == cache_deps.checksum(file):
                        path = os.path.join(cache_root, file)
                        if flavor and flavor.format is not None:
                            path = audeer.replace_file_extension(
                                path,
                                flavor.format,
                            )
                        if os.path.exists(path):
                            found = True
                            break
        if found:
            if flavor and flavor.format is not None:
                file = audeer.replace_file_extension(
                    file,
                    flavor.format,
                )
            cached_files.append((cache_root, file))
        else:
            missing_files.append(file)

    return cached_files, missing_files
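The cache lookup compares versions via LooseVersion rather than plain strings, which matters once a version component reaches two digits. A quick check, assuming distutils is available (as in the code above):

from distutils.version import LooseVersion

# plain string comparison orders '1.10.0' before '1.9.0' ...
assert '1.10.0' < '1.9.0'
# ... while LooseVersion compares the numeric components
assert LooseVersion('1.10.0') > LooseVersion('1.9.0')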
Example #5
def dependencies(
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
) -> Dependencies:
    r"""Database dependencies.

    Args:
        name: name of database
        version: version string
        cache_root: cache folder where databases are stored.
            If not set :func:`audb.default_cache_root` is used

    Returns:
        dependency object

    """
    if version is None:
        version = latest_version(name)

    cache_roots = [
        default_cache_root(True),  # check shared cache first
        default_cache_root(False),
    ] if cache_root is None else [cache_root]
    for cache_root in cache_roots:
        deps_root = audeer.safe_path(os.path.join(
            cache_root,
            name,
            version,
        ))
        if os.path.exists(deps_root):
            break

    audeer.mkdir(deps_root)
    deps_path = os.path.join(deps_root, define.CACHED_DEPENDENCIES_FILE)

    deps = Dependencies()
    if not os.path.exists(deps_path):
        backend = lookup_backend(name, version)
        with tempfile.TemporaryDirectory() as tmp_root:
            archive = backend.join(name, define.DB)
            backend.get_archive(
                archive,
                tmp_root,
                version,
            )
            deps.load(os.path.join(tmp_root, define.DEPENDENCIES_FILE))
            deps.save(deps_path)
    else:
        deps.load(deps_path)

    return deps
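A usage sketch (database name and version are hypothetical; the returned object exposes per-file lookups, as used throughout the examples above):

import audb

# hypothetical database name and version
deps = audb.dependencies('emodb', version='1.1.0')
file = deps.media[0]
print(deps.archive(file), deps.checksum(file), deps.version(file))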
Example #6
def _find_media(
    db: audformat.Database,
    db_root: str,
    version: str,
    deps: Dependencies,
    archives: typing.Mapping[str, str],
    verbose: bool,
) -> typing.Set[str]:
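    r"""Update media dependencies and return archives of removed media."""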

    # release dependencies to removed media
    # and select the corresponding archives for upload
    media = set()
    db_media = db.files
    for file in set(deps.media) - set(db_media):
        media.add(deps.archive(file))
        deps._drop(file)

    # update version of altered media and insert new ones

    for file in audeer.progress_bar(
            db_media,
            desc='Find media',
            disable=not verbose,
    ):
        path = os.path.join(db_root, file)
        if file not in deps:
            checksum = audbackend.md5(path)
            if file in archives:
                archive = archives[file]
            else:
                archive = audeer.uid(from_string=file.replace('\\', '/'))
            deps._add_media(db_root, file, version, archive, checksum)
        elif not deps.removed(file):
            checksum = audbackend.md5(path)
            if checksum != deps.checksum(file):
                archive = deps.archive(file)
                deps._add_media(db_root, file, version, archive, checksum)

    return media
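Files not listed in archives get a deterministic archive name derived from their posix-normalized path, so republishing the same file lands it in the same archive. A quick sketch, assuming audeer.uid with from_string is deterministic as used above:

import audeer

# the same (posix-normalized) path always yields the same UID
a = audeer.uid(from_string='audio/001.wav')
b = audeer.uid(from_string='audio\\001.wav'.replace('\\', '/'))
assert a == b
print(a)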
Example #7
def _get_media(
    media: typing.List[str],
    db_root: str,
    db_root_tmp: str,
    db_name: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
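    r"""Download media archives and move the files to db_root."""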

    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    # figure out archives
    archives = set()
    for file in media:
        archives.add((deps.archive(file), deps.version(file)))

    def job(archive: str, version: str):
        archive = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get media',
    )
Example #8
def publish(
    db_root: str,
    version: str,
    repository: Repository,
    *,
    archives: typing.Mapping[str, str] = None,
    previous_version: typing.Optional[str] = 'latest',
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
) -> Dependencies:
    r"""Publish database.

    A database can have dependencies
    to files of an older version of itself.
    E.g. you might add a few new files to an existing database
    and publish it as a new version.
    :func:`audb.publish` will then upload only the new files
    and store dependencies on the already published files.

    To allow for dependencies
    you first have to load the version of the database
    that the new version should depend on
    with :func:`audb.load_to` to ``db_root``.
    Afterwards you make your changes to that folder
    and run :func:`audb.publish`.
    :func:`audb.publish` will then check
    that the version of the files inside that folder
    matches the version given by ``previous_version``.

    Setting ``previous_version=None`` allows you
    to start from scratch and upload all files
    even if older versions exist.
    In this case you don't call :func:`audb.load_to`
    before running :func:`audb.publish`.

    Args:
        db_root: root directory of database
        version: version string
        repository: name of repository
        archives: dictionary mapping files to archive names.
            Can be used to bundle files into archives.
            Archive name must not include an extension
        previous_version: specifies the version
            this publication should be based on.
            If ``'latest'``
            it will automatically use the latest published version
            or ``None``
            if no version was published yet.
            If ``None`` it assumes you start from scratch.
        cache_root: cache folder where databases are stored.
            If not set :func:`audb.default_cache_root` is used.
            Only used to read the dependencies of the previous version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        dependency object

    Raises:
        RuntimeError: if version already exists
        RuntimeError: if database tables reference non-existing files
        RuntimeError: if database in ``db_root`` depends on a different
            version than indicated by ``previous_version``
        RuntimeError: if database is not portable,
            see :meth:`audformat.Database.is_portable`

    """
    db = audformat.Database.load(db_root, load_data=False)

    backend = audbackend.create(
        repository.backend,
        repository.host,
        repository.name,
    )

    remote_header = backend.join(db.name, define.HEADER_FILE)
    versions = backend.versions(remote_header)
    if version in versions:
        raise RuntimeError('A version '
                           f"'{version}' "
                           'already exists for database '
                           f"'{db.name}'.")
    if previous_version == 'latest':
        if len(versions) > 0:
            previous_version = versions[-1]
        else:
            previous_version = None

    # load database and dependencies
    deps_path = os.path.join(db_root, define.DEPENDENCIES_FILE)
    deps = Dependencies()
    if os.path.exists(deps_path):
        deps.load(deps_path)

    # check if database folder depends on the right version

    # dependencies shouldn't be there
    if previous_version is None and len(deps) > 0:
        raise RuntimeError(
            f"You did not set a dependency to a previous version, "
            f"but you have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}.")

    # dependencies missing
    if previous_version is not None and len(deps) == 0:
        raise RuntimeError(
            f"You want to depend on '{previous_version}' "
            f"of {db.name}, "
            f"but you don't have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}. "
            f"Did you forgot to call "
            f"'audb.load_to({db_root}, {db.name}, "
            f"version={previous_version}?")

    # dependencies do not match version
    if previous_version is not None and len(deps) > 0:
        with tempfile.TemporaryDirectory() as tmp_dir:
            previous_deps_path = os.path.join(
                tmp_dir,
                define.DEPENDENCIES_FILE,
            )
            previous_deps = dependencies(
                db.name,
                version=previous_version,
                cache_root=cache_root,
            )
            previous_deps.save(previous_deps_path)
            if audbackend.md5(deps_path) != audbackend.md5(previous_deps_path):
                raise RuntimeError(
                    f"You want to depend on '{previous_version}' "
                    f"of {db.name}, "
                    f"but the MD5 sum of your "
                    f"'{define.DEPENDENCIES_FILE}' file "
                    f"in {db_root} "
                    f"does not match the MD5 sum of the corresponding file "
                    f"for the requested version in the repository. "
                    f"Did you forgot to call "
                    f"'audb.load_to({db_root}, {db.name}, "
                    f"version='{previous_version}') "
                    f"or modified the file manually?")

    # load database from folder
    db = audformat.Database.load(db_root)

    if not db.is_portable:
        raise RuntimeError("Some files in the tables have absolute paths "
                           "or use '.' or '..' to address a folder. "
                           "Please replace those paths by relative paths "
                           "and use folder names instead of dots.")

    # check that all files referenced in a table exist
    missing_files = [
        f for f in db.files if not os.path.exists(os.path.join(db_root, f))
    ]
    if len(missing_files) > 0:
        number_of_presented_files = 20
        error_msg = (
            f'{len(missing_files)} files are referenced in tables '
            'that cannot be found. '
            f"Missing files are: '{missing_files[:number_of_presented_files]}")
        if len(missing_files) <= number_of_presented_files:
            error_msg += "'."
        else:
            error_msg += ", ...'."
        raise RuntimeError(error_msg)

    # make sure all tables are stored in CSV format
    for table_id, table in db.tables.items():
        table_path = os.path.join(db_root, f'db.{table_id}')
        table_ext = audformat.define.TableStorageFormat.CSV
        if not os.path.exists(table_path + f'.{table_ext}'):
            table.save(table_path, storage_format=table_ext)

    # default to an empty archive mapping
    archives = archives or {}

    # publish tables
    tables = _find_tables(db, db_root, version, deps, verbose)
    _put_tables(tables, db_root, db.name, version, backend, num_workers,
                verbose)

    # publish media
    media = _find_media(db, db_root, version, deps, archives, verbose)
    _put_media(media, db_root, db.name, version, deps, backend, num_workers,
               verbose)

    # publish dependencies and header
    deps.save(deps_path)
    archive_file = backend.join(db.name, define.DB)
    backend.put_archive(
        db_root,
        define.DEPENDENCIES_FILE,
        archive_file,
        version,
    )
    try:
        local_header = os.path.join(db_root, define.HEADER_FILE)
        remote_header = backend.join(db.name, define.HEADER_FILE)
        backend.put_file(local_header, remote_header, version)
    except Exception:  # pragma: no cover
        # after the header is published
        # the new version becomes visible,
        # so if something goes wrong here
        # we better clean up before re-raising
        if backend.exists(remote_header, version):
            backend.remove_file(remote_header, version)
        raise

    return deps
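A sketch of the update workflow described in the docstring (paths, names and versions are hypothetical):

import audb

db_root = './mydb-1.1.0'  # hypothetical working folder
# load the version the new one should depend on
audb.load_to(db_root, 'mydb', version='1.0.0')
# ... add or change files in db_root ...
repository = audb.Repository(
    name='data-local',
    host='/data/repositories',
    backend='file-system',
)
deps = audb.publish(db_root, '1.1.0', repository)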
Example #9
def remove_media(
    name: str,
    files: typing.Union[str, typing.Sequence[str]],
    *,
    verbose: bool = False,
):
    r"""Remove media from all versions.

    Args:
        name: name of database
        files: list of files that should be removed
        verbose: show debug messages

    """
    if isinstance(files, str):
        files = [files]

    for version in versions(name):

        backend = lookup_backend(name, version)

        with tempfile.TemporaryDirectory() as db_root:

            # download dependencies
            archive = backend.join(name, define.DB)
            deps_path = backend.get_archive(
                archive,
                db_root,
                version,
            )[0]
            deps_path = os.path.join(db_root, deps_path)
            deps = Dependencies()
            deps.load(deps_path)
            upload = False

            for file in files:
                if file in deps.media:
                    archive = deps.archive(file)

                    # if archive exists in this version,
                    # remove file from it and re-publish
                    remote_archive = backend.join(
                        name,
                        define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
                        archive,
                    )
                    if backend.exists(
                            f'{remote_archive}.zip',
                            version,
                    ):

                        files_in_archive = backend.get_archive(
                            remote_archive,
                            db_root,
                            version,
                        )
                        # skip if file was already deleted
                        if file in files_in_archive:
                            os.remove(os.path.join(db_root, file))
                            files_in_archive.remove(file)
                            backend.put_archive(
                                db_root,
                                files_in_archive,
                                remote_archive,
                                version,
                            )

                    # mark file as removed
                    deps._remove(file)
                    upload = True

            # upload dependencies
            if upload:
                deps.save(deps_path)
                remote_archive = backend.join(name, define.DB)
                backend.put_archive(
                    db_root,
                    define.DEPENDENCIES_FILE,
                    remote_archive,
                    version,
                )
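A usage sketch (database name and file path are hypothetical); note that the file is removed from every published version, not just the latest:

import audb

audb.remove_media('mydb', 'audio/001.wav', verbose=True)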
Example #10
def _get_media_from_backend(
    name: str,
    media: typing.Sequence[str],
    db_root: str,
    db_root_tmp: str,
    flavor: typing.Optional[Flavor],
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    r"""Load media from backend."""

    # figure out archives
    archives = set()
    archive_names = set()
    for file in media:
        archive_name = deps.archive(file)
        archive_version = deps.version(file)
        archives.add((archive_name, archive_version))
        archive_names.add(archive_name)
    # collect all files that will be extracted,
    # if we have more files than archives
    if len(deps.files) > len(deps.archives):
        files = list()
        for file in deps.media:
            archive = deps.archive(file)
            if archive in archive_names:
                files.append(file)
        media = files

    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    # using multi-processing
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    def job(archive: str, version: str):
        archive = backend.join(
            name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        # extract and move all files that are stored in the archive,
        # even if only a single file from the archive was requested
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            if flavor is not None:
                bit_depth = deps.bit_depth(file)
                channels = deps.channels(file)
                sampling_rate = deps.sampling_rate(file)
                src_path = os.path.join(db_root_tmp, file)
                file = flavor.destination(file)
                dst_path = os.path.join(db_root_tmp, file)
                flavor(
                    src_path,
                    dst_path,
                    src_bit_depth=bit_depth,
                    src_channels=channels,
                    src_sampling_rate=sampling_rate,
                )
                if src_path != dst_path:
                    os.remove(src_path)

            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Load media',
    )
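How the Flavor conversion used in job() is typically set up; a minimal sketch (arguments hypothetical, destination() only rewrites the file extension while the call itself performs the conversion):

import audb

flavor = audb.Flavor(format='flac', sampling_rate=16000)
# destination() maps the original name to the converted one
print(flavor.destination('audio/001.wav'))  # -> 'audio/001.flac'
# flavor(src_path, dst_path, ...) would then convert the file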