def _find_tables(
        db: audformat.Database,
        db_root: str,
        version: str,
        deps: Dependencies,
        verbose: bool,
) -> typing.List[str]:
    r"""Find new and altered tables and register them in the dependencies.

    Table files listed in ``deps``
    that no longer belong to ``db``
    are dropped from ``deps``.
    Every table whose CSV file is not yet listed in ``deps``,
    or whose MD5 sum differs from the stored one,
    is (re-)registered under ``version``.

    Args:
        db: database object
        db_root: root directory of database
        version: version string assigned to new and altered tables
        deps: dependency object, modified in place
        verbose: show progress bar

    Returns:
        IDs of the new and altered tables

    """
    # release dependencies to removed tables
    current_files = {f'db.{table_id}.csv' for table_id in db.tables}
    for obsolete in set(deps.tables) - current_files:
        deps._drop(obsolete)

    changed = []
    progress = audeer.progress_bar(
        db.tables,
        desc='Find tables',
        disable=not verbose,
    )
    for table_id in progress:
        file = f'db.{table_id}.csv'
        checksum = audbackend.md5(os.path.join(db_root, file))
        # a known table with an identical checksum needs no update
        if file in deps and checksum == deps.checksum(file):
            continue
        deps._add_meta(file, version, table_id, checksum)
        changed.append(table_id)

    return changed
def job(table: str):
    r"""Mark a table file as outdated if it is missing or was altered.

    ``db_root``, ``tables`` and ``deps`` are not defined here,
    they are expected to be provided by the surrounding scope
    (not part of this excerpt).

    Args:
        table: table ID

    """
    table_file = f'db.{table}.csv'
    local_path = os.path.join(db_root, table_file)

    # a table that is not available locally is always collected
    if not os.path.exists(local_path):
        tables.append(table_file)
        return

    # if the table already exists
    # we have to compare checksum
    # in case it was altered by flavor
    local_checksum = audbackend.md5(local_path)
    if local_checksum != deps.checksum(table_file):  # pragma: no cover
        tables.append(table_file)
def _find_media(
        db: audformat.Database,
        db_root: str,
        version: str,
        deps: Dependencies,
        archives: typing.Mapping[str, str],
        verbose: bool,
) -> typing.Set[str]:
    r"""Update the media entries of the dependencies.

    Media files listed in ``deps``
    that are no longer referenced by ``db``
    are dropped from ``deps``
    and their archives are collected.
    New files,
    and files whose MD5 sum differs from the stored one,
    are (re-)registered under ``version``.

    Args:
        db: database object
        db_root: root directory of database
        version: version string assigned to new and altered files
        deps: dependency object, modified in place
        archives: mapping from files to archive names,
            only consulted for files not yet listed in ``deps``
        verbose: show progress bar

    Returns:
        archives of the media files that were dropped from ``deps``

    """
    referenced_files = db.files

    # release dependencies to removed media
    # and select according archives for upload
    affected_archives = set()
    for obsolete in set(deps.media) - set(referenced_files):
        affected_archives.add(deps.archive(obsolete))
        deps._drop(obsolete)

    # update version of altered media and insert new ones
    progress = audeer.progress_bar(
        referenced_files,
        desc='Find media',
        disable=not verbose,
    )
    for file in progress:
        path = os.path.join(db_root, file)
        is_new = file not in deps

        # files marked as removed in the dependencies stay untouched
        if not is_new and deps.removed(file):
            continue

        checksum = audbackend.md5(path)
        if is_new:
            # use the requested archive
            # or derive an archive name from the file path
            archive = (
                archives[file]
                if file in archives
                else audeer.uid(from_string=file.replace('\\', '/'))
            )
        elif checksum != deps.checksum(file):
            # altered files stay in their archive
            archive = deps.archive(file)
        else:
            continue
        deps._add_media(db_root, file, version, archive, checksum)

    return affected_archives
def publish(
        db_root: str,
        version: str,
        repository: Repository,
        *,
        archives: typing.Optional[typing.Mapping[str, str]] = None,
        previous_version: typing.Optional[str] = 'latest',
        cache_root: typing.Optional[str] = None,
        num_workers: typing.Optional[int] = 1,
        verbose: bool = True,
) -> Dependencies:
    r"""Publish database.

    A database can have dependencies
    to files of an older version of itself.
    E.g. you might add a few new files to an existing database
    and publish as a new version.
    :func:`audb.publish` will then upload only the new files
    and store dependencies on the already published files.

    To allow for dependencies
    you first have to load the version of the database
    that the new version should depend on
    with :func:`audb.load_to` to ``db_root``.
    Afterwards you make your changes to that folder
    and run :func:`audb.publish`.
    :func:`audb.publish` will then check
    that the version of the files inside that folder
    match the version given by ``previous_version``.

    Setting ``previous_version=None`` allows you
    to start from scratch and upload all files
    even if older versions exist.
    In this case you don't call :func:`audb.load_to`
    before running :func:`audb.publish`.

    Args:
        db_root: root directory of database
        version: version string
        repository: repository the database is published to,
            its ``backend``, ``host`` and ``name``
            are used to create the backend
        archives: dictionary mapping files to archive names.
            Can be used to bundle files into archives.
            Archive name must not include an extension
        previous_version: specifies the version
            this publication should be based on.
            If ``'latest'``
            it will use automatically the latest published version
            or ``None``
            if no version was published.
            If ``None`` it assumes you start from scratch.
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used.
            Only used to read the dependencies of the previous version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        dependency object

    Raises:
        RuntimeError: if version already exists
        RuntimeError: if database tables reference non-existing files
        RuntimeError: if database in ``db_root`` depends on other version
            as indicated by ``previous_version``
        RuntimeError: if database is not portable,
            see :meth:`audformat.Database.is_portable`
        Exception: any error raised while uploading the header
            is re-raised after the remote header was cleaned up

    """
    # the header is sufficient to get the name of the database
    db = audformat.Database.load(db_root, load_data=False)

    backend = audbackend.create(
        repository.backend,
        repository.host,
        repository.name,
    )

    remote_header = backend.join(db.name, define.HEADER_FILE)
    versions = backend.versions(remote_header)
    if version in versions:
        raise RuntimeError(
            'A version '
            f"'{version}' "
            'already exists for database '
            f"'{db.name}'."
        )
    if previous_version == 'latest':
        if len(versions) > 0:
            previous_version = versions[-1]
        else:
            previous_version = None

    # load database and dependencies
    deps_path = os.path.join(db_root, define.DEPENDENCIES_FILE)
    deps = Dependencies()
    if os.path.exists(deps_path):
        deps.load(deps_path)

    # check if database folder depends on the right version

    # dependencies shouldn't be there
    if previous_version is None and len(deps) > 0:
        raise RuntimeError(
            f"You did not set a dependency to a previous version, "
            f"but you have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}."
        )

    # dependencies missing
    # NOTE(review): the 'audb.load_to(...' hint in the message below
    # lacks its closing parenthesis and quote;
    # the text is kept unchanged
    # as callers or tests might match on it
    if previous_version is not None and len(deps) == 0:
        raise RuntimeError(
            f"You want to depend on '{previous_version}' "
            f"of {db.name}, "
            f"but you don't have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}. "
            f"Did you forgot to call "
            f"'audb.load_to({db_root}, {db.name}, "
            f"version={previous_version}?"
        )

    # dependencies do not match version
    if previous_version is not None and len(deps) > 0:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # store the dependencies of the previous version to a file
            # to compare its checksum with the local dependency file
            previous_deps_path = os.path.join(
                tmp_dir,
                define.DEPENDENCIES_FILE,
            )
            previous_deps = dependencies(
                db.name,
                version=previous_version,
                cache_root=cache_root,
            )
            previous_deps.save(previous_deps_path)
            if audbackend.md5(deps_path) != audbackend.md5(previous_deps_path):
                raise RuntimeError(
                    f"You want to depend on '{previous_version}' "
                    f"of {db.name}, "
                    f"but the MD5 sum of your "
                    f"'{define.DEPENDENCIES_FILE}' file "
                    f"in {db_root} "
                    f"does not match the MD5 sum of the corresponding file "
                    f"for the requested version in the repository. "
                    f"Did you forgot to call "
                    f"'audb.load_to({db_root}, {db.name}, "
                    f"version='{previous_version}') "
                    f"or modified the file manually?"
                )

    # load database from folder
    db = audformat.Database.load(db_root)

    if not db.is_portable:
        raise RuntimeError(
            "Some files in the tables have absolute paths "
            "or use '.' or '..' to address a folder. "
            "Please replace those paths by relative paths "
            "and use folder names instead of dots."
        )

    # check all files referenced in a table exists
    missing_files = [
        f for f in db.files
        if not os.path.exists(os.path.join(db_root, f))
    ]
    if len(missing_files) > 0:
        # limit the number of files listed in the error message
        number_of_presented_files = 20
        error_msg = (
            f'{len(missing_files)} files are referenced in tables '
            'that cannot be found. '
            f"Missing files are: '{missing_files[:number_of_presented_files]}"
        )
        if len(missing_files) <= number_of_presented_files:
            error_msg += "'."
        else:
            error_msg += ", ...'."
        raise RuntimeError(error_msg)

    # make sure all tables are stored in CSV format
    for table_id, table in db.tables.items():
        table_path = os.path.join(db_root, f'db.{table_id}')
        table_ext = audformat.define.TableStorageFormat.CSV
        if not os.path.exists(table_path + f'.{table_ext}'):
            table.save(table_path, storage_format=table_ext)

    # check archives
    archives = archives or {}

    # publish tables
    tables = _find_tables(db, db_root, version, deps, verbose)
    _put_tables(tables, db_root, db.name, version, backend,
                num_workers, verbose)

    # publish media
    media = _find_media(db, db_root, version, deps, archives, verbose)
    _put_media(media, db_root, db.name, version, deps, backend,
               num_workers, verbose)

    # publish dependencies and header
    deps.save(deps_path)
    archive_file = backend.join(db.name, define.DB)
    backend.put_archive(
        db_root,
        define.DEPENDENCIES_FILE,
        archive_file,
        version,
    )
    local_header = os.path.join(db_root, define.HEADER_FILE)
    remote_header = db.name + '/' + define.HEADER_FILE
    try:
        backend.put_file(local_header, remote_header, version)
    except Exception:  # pragma: no cover
        # after the header is published
        # the new version becomes visible,
        # so if something goes wrong here
        # we better clean up
        if backend.exists(remote_header, version):
            backend.remove_file(remote_header, version)
        # do not pretend the database was published successfully
        raise

    return deps
def load_to(
        root: str,
        name: str,
        *,
        version: typing.Optional[str] = None,
        cache_root: typing.Optional[str] = None,
        num_workers: typing.Optional[int] = 1,
        verbose: bool = True,
) -> audformat.Database:
    r"""Load database to directory.

    Loads the original state of the database
    to a custom directory.
    No conversion or filtering will be applied.
    If the target folder already contains
    some version of the database,
    it will upgrade to the requested version.
    Unchanged files will be skipped.

    Args:
        root: target directory
        name: name of database
        version: version string, latest if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used.
            Only used to read the dependencies of the requested version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        database object

    Raises:
        RuntimeError: if the temporary directory
            cannot be removed at the end

    """
    if version is None:
        version = latest_version(name)

    db_root = audeer.safe_path(root)
    db_root_tmp = database_tmp_folder(db_root)

    # remove files with a wrong checksum
    # to ensure we load correct version
    update = os.path.exists(db_root) and os.listdir(db_root)
    audeer.mkdir(db_root)
    deps = dependencies(name, version=version, cache_root=cache_root)
    if update:
        for file in deps.files:
            full_file = os.path.join(db_root, file)
            if os.path.exists(full_file):
                checksum = audbackend.md5(full_file)
                if checksum != deps.checksum(file):
                    os.remove(full_file)

    # load database header without tables from backend
    db_header, backend = load_header(
        db_root_tmp,
        name,
        version,
        overwrite=True,
    )

    # get altered and new tables
    db_header.save(db_root_tmp, header_only=True)
    tables = _find_tables(db_header, db_root, deps, num_workers, verbose)
    _get_tables(tables, db_root, db_root_tmp, name, deps, backend,
                num_workers, verbose)

    # load database
    # move header to root and load database ...
    _move_file(db_root_tmp, db_root, define.HEADER_FILE)
    header_path = os.path.join(db_root, define.HEADER_FILE)
    try:
        db = audformat.Database.load(
            db_root,
            num_workers=num_workers,
            verbose=verbose,
        )
    finally:
        # always remove the header again,
        # on failure or user interrupt as well as on success,
        # to avoid the database
        # can be loaded before download is complete
        os.remove(header_path)

    # get altered and new media files
    media = _find_media(db, db_root, deps, num_workers, verbose)
    _get_media(media, db_root, db_root_tmp, name, deps, backend,
               num_workers, verbose)

    # save dependencies
    dep_path_tmp = os.path.join(db_root_tmp, define.DEPENDENCIES_FILE)
    deps.save(dep_path_tmp)
    _move_file(db_root_tmp, db_root, define.DEPENDENCIES_FILE)

    # save database and remove the temporal directory
    # to signal all files were correctly loaded
    _save_database(db, db_root, db_root_tmp, num_workers, verbose)
    try:
        _remove_empty_dirs(db_root_tmp)
    except OSError as err:  # pragma: no cover
        raise RuntimeError(
            'Could not remove temporary directory, '
            'probably there are some leftover files. '
            'This should not happen.'
        ) from err

    return db
def test_publish(version):
    r"""Publish a version from scratch and check the published result.

    Args:
        version: version string,
            used to look up the database folder in ``DB_ROOT_VERSION``

    """
    db_root = DB_ROOT_VERSION[version]
    db = audformat.Database.load(db_root)
    print(db.is_portable)
    print(db.files)

    # as long as nothing is published
    # requesting the latest version has to fail
    if not audb.versions(DB_NAME):
        with pytest.raises(RuntimeError):
            audb.latest_version(DB_NAME)

    # use the speaker entry of each file as its archive name
    archives = db['files']['speaker'].get().dropna().to_dict()
    deps = audb.publish(
        db_root,
        version,
        pytest.PUBLISH_REPOSITORY,
        archives=archives,
        previous_version=None,
        num_workers=pytest.NUM_WORKERS,
        verbose=False,
    )
    backend = audb.core.utils.lookup_backend(DB_NAME, version)

    # bundling files into archives
    # reduces the number of archives by the same amount
    # in the dependencies
    archive_names = set(archives.values())
    n_files = len(set(archives.keys()))
    n_archives = len(archive_names)
    assert len(deps.files) - len(deps.archives) == (n_files - n_archives)
    for archive in archive_names:
        assert archive in deps.archives

    db = audb.load(
        DB_NAME,
        version=version,
        full_path=False,
        num_workers=pytest.NUM_WORKERS,
    )
    assert db.name == DB_NAME

    versions = audb.versions(DB_NAME)
    latest_version = audb.latest_version(DB_NAME)
    assert version in versions
    assert latest_version == versions[-1]

    df = audb.available(only_latest=False)
    assert DB_NAME in df.index
    assert set(df[df.index == DB_NAME]['version']) == set(versions)

    df = audb.available(only_latest=True)
    assert DB_NAME in df.index
    assert df[df.index == DB_NAME]['version'][0] == latest_version

    for file in db.files:
        name = archives.get(file, file)
        file_path = backend.join(db.name, 'media', name)
        # NOTE(review): the result of this call is not asserted,
        # confirm if that is intended
        backend.exists(file_path, version)

        path = os.path.join(db_root, file)
        assert deps.checksum(file) == audbackend.md5(path)

        # audio properties are only compared for WAV and FLAC files
        is_audio = deps.format(file) in [
            audb.core.define.Format.WAV,
            audb.core.define.Format.FLAC,
        ]
        if is_audio:
            assert deps.bit_depth(file) == audiofile.bit_depth(path)
            assert deps.channels(file) == audiofile.channels(path)
            assert deps.duration(file) == audiofile.duration(path)
            assert deps.sampling_rate(file) == audiofile.sampling_rate(path)