def _put_tables(
        tables: typing.List[str],
        db_root: str,
        db_name: str,
        version: str,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    def job(table: str):
        file = f'db.{table}.csv'
        archive_file = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            table,
        )
        backend.put_archive(db_root, file, archive_file, version)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Put tables',
    )
def _find_media(
        db: audformat.Database,
        db_root: str,
        deps: Dependencies,
        num_workers: typing.Optional[int],
        verbose: bool,
) -> typing.List[str]:
    r"""Find altered and new media."""
    media = []

    def job(file: str):
        if not deps.removed(file):
            full_file = os.path.join(db_root, file)
            if not os.path.exists(full_file):
                media.append(file)

    audeer.run_tasks(
        job,
        params=[([file], {}) for file in db.files],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Find media',
    )

    return media
def _find_tables(
        db_header: audformat.Database,
        db_root: str,
        deps: Dependencies,
        num_workers: typing.Optional[int],
        verbose: bool,
) -> typing.List[str]:
    tables = []

    def job(table: str):
        file = f'db.{table}.csv'
        full_file = os.path.join(db_root, file)
        if not os.path.exists(full_file):
            tables.append(file)
        else:
            checksum = audbackend.md5(full_file)
            # if the table already exists,
            # we have to compare checksums,
            # in case it was altered by a flavor
            if checksum != deps.checksum(file):  # pragma: no cover
                tables.append(file)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in db_header.tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Find tables',
    )

    return tables
def _get_media_from_cache(
        media: typing.Sequence[str],
        db_root: str,
        db_root_tmp: str,
        deps: Dependencies,
        cached_versions: typing.Sequence[
            typing.Tuple[LooseVersion, str, Dependencies]
        ],
        flavor: Flavor,
        num_workers: int,
        verbose: bool,
) -> typing.Sequence[str]:
    r"""Copy media from cache."""
    cached_media, missing_media = _cached_files(
        media,
        deps,
        cached_versions,
        flavor,
        verbose,
    )

    def job(cache_root: str, file: str):
        _copy_file(file, cache_root, db_root_tmp, db_root)

    audeer.run_tasks(
        job,
        params=[([root, file], {}) for root, file in cached_media],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Copy media',
    )

    return missing_media
def _get_tables(
        tables: typing.List[str],
        db_root: str,
        db_root_tmp: str,
        db_name: str,
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    def job(table: str):
        # If a pickled version of the table exists,
        # we have to remove it to make sure that
        # later on the new CSV tables are loaded.
        # This can happen when upgrading an existing
        # database to a different version.
        path_pkl = os.path.join(
            db_root, table
        )[:-3] + audformat.define.TableStorageFormat.PICKLE
        if os.path.exists(path_pkl):
            os.remove(path_pkl)
        archive = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            deps.archive(table),
        )
        backend.get_archive(archive, db_root_tmp, deps.version(table))
        _move_file(db_root_tmp, db_root, table)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get tables',
    )
def _fix_media_ext(
        tables: typing.Sequence[audformat.Table],
        format: str,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    def job(table):
        # Faster solution than using db.map_files()
        cur_ext = r'\.[a-zA-Z0-9]+$'  # match file extension
        new_ext = f'.{format}'
        if table.is_filewise:
            table.df.index = table.df.index.str.replace(
                cur_ext, new_ext, regex=True,
            )
        else:
            table.df.index = table.df.index.set_levels(
                table.df.index.levels[0].str.replace(
                    cur_ext, new_ext, regex=True,
                ),
                level='file',
            )

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Fix format',
    )
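The filewise/segmented distinction above decides whether the extension is rewritten directly on the index or only on its 'file' level. A minimal pandas sketch of the segmented branch, using a hypothetical index and target format:

import pandas as pd

index = pd.MultiIndex.from_tuples(
    [('audio/a.flac', pd.Timedelta(0, unit='s'), pd.Timedelta(1, unit='s'))],
    names=['file', 'start', 'end'],
)
# rewrite only the 'file' level, as in the segmented branch above
index = index.set_levels(
    index.levels[0].str.replace(r'\.[a-zA-Z0-9]+$', '.wav', regex=True),
    level='file',
)
# index.get_level_values('file') now contains 'audio/a.wav'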
def _put_media(
        media: typing.Set[str],
        db_root: str,
        db_name: str,
        version: str,
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    # create a mapping from archives to media and
    # select archives with new or altered files for upload
    map_media_to_files = collections.defaultdict(list)
    for file in deps.media:
        if not deps.removed(file):
            map_media_to_files[deps.archive(file)].append(file)
            if deps.version(file) == version:
                media.add(deps.archive(file))

    lock = threading.Lock()

    def job(archive):
        if archive in map_media_to_files:
            for file in map_media_to_files[archive]:
                with lock:
                    deps._add_media(db_root, file, version)
            archive_file = backend.join(
                db_name,
                define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
                archive,
            )
            backend.put_archive(
                db_root,
                map_media_to_files[archive],
                archive_file,
                version,
            )

    # upload new and altered archives if they contain at least one file
    audeer.run_tasks(
        job,
        params=[([archive], {}) for archive in media],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Put media',
    )
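_put_media guards the shared dependency table with a lock because audeer.run_tasks executes its jobs in worker threads when multiprocessing is not enabled. A minimal sketch of that pattern with a hypothetical shared counter:

import threading

import audeer

counter = {'files': 0}
lock = threading.Lock()

def job(file):
    # mutation of shared state must be serialized across worker threads
    with lock:
        counter['files'] += 1

audeer.run_tasks(
    job,
    params=[([f'file-{n}.wav'], {}) for n in range(100)],
    num_workers=8,
)
assert counter['files'] == 100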
def _get_tables_from_backend(
        db: audformat.Database,
        tables: typing.Sequence[str],
        db_root: str,
        db_root_tmp: str,
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    r"""Load tables from backend."""
    def job(table: str):
        archive = backend.join(
            db.name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            deps.archive(table),
        )
        backend.get_archive(
            archive,
            db_root_tmp,
            deps.version(table),
        )
        table_id = table[3:-4]
        table_path = os.path.join(db_root_tmp, f'db.{table_id}')
        db[table_id].load(table_path)
        db[table_id].save(
            table_path,
            storage_format=audformat.define.TableStorageFormat.PICKLE,
        )
        for storage_format in [
            audformat.define.TableStorageFormat.PICKLE,
            audformat.define.TableStorageFormat.CSV,
        ]:
            _move_file(db_root_tmp, db_root, f'db.{table_id}.{storage_format}')

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Load tables',
    )
def test_run_tasks(multiprocessing, num_workers, task_fun, params):
    expected = [
        task_fun(*param[0], **param[1]) for param in params
    ]
    results = audeer.run_tasks(
        task_fun,
        params,
        num_workers=num_workers,
        multiprocessing=multiprocessing,
    )
    assert expected == results
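The test above also documents the contract used throughout these helpers: params is a list of (args, kwargs) pairs, and the results come back in the same order as params. A stand-alone sketch with a toy task function:

import audeer

def scale(x, *, factor=1):
    return x * factor

params = [([1], {'factor': 10}), ([2], {}), ([3], {'factor': 2})]
results = audeer.run_tasks(scale, params, num_workers=2)
assert results == [10, 2, 6]  # order follows params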
def _get_media(
        media: typing.List[str],
        db_root: str,
        db_root_tmp: str,
        db_name: str,
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    # figure out archives
    archives = set()
    for file in media:
        archives.add((deps.archive(file), deps.version(file)))

    def job(archive: str, version: str):
        archive = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get media',
    )
def _get_tables_from_cache(
        tables: typing.Sequence[str],
        db_root: str,
        db_root_tmp: str,
        deps: Dependencies,
        cached_versions: typing.Sequence[
            typing.Tuple[LooseVersion, str, Dependencies]
        ],
        num_workers: int,
        verbose: bool,
) -> typing.Sequence[str]:
    r"""Copy tables from cache."""
    cached_tables, missing_tables = _cached_files(
        tables,
        deps,
        cached_versions,
        None,
        verbose,
    )

    def job(cache_root: str, file: str):
        file_pkl = audeer.replace_file_extension(
            file,
            audformat.define.TableStorageFormat.PICKLE,
        )
        _copy_file(file, cache_root, db_root_tmp, db_root)
        _copy_file(file_pkl, cache_root, db_root_tmp, db_root)

    audeer.run_tasks(
        job,
        params=[([root, file], {}) for root, file in cached_tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Copy tables',
    )

    return missing_tables
def process_files(
        self,
        files: typing.Sequence[str],
        *,
        starts: typing.Sequence[pd.Timedelta] = None,
        ends: typing.Sequence[pd.Timedelta] = None,
        channel: int = None,
) -> pd.Series:
    r"""Process a list of files.

    Args:
        files: list of file paths
        channel: channel number
        starts: list with start positions
        ends: list with end positions

    Returns:
        Series with processed files

    """
    if starts is None:
        starts = [None] * len(files)
    if ends is None:
        ends = [None] * len(files)
    params = [
        (
            (file, ),
            {'start': start, 'end': end, 'channel': channel},
        )
        for file, start, end in zip(files, starts, ends)
    ]
    y = audeer.run_tasks(
        self.process_file,
        params,
        num_workers=self.num_workers,
        multiprocessing=self.multiprocessing,
        progress_bar=self.verbose,
        task_description=f'Process {len(files)} files',
    )
    return pd.concat(y)
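A short usage sketch of this method, assuming audinterface.Process as the hosting interface class and hypothetical file paths; process_func receives the signal and sampling rate of each file:

import audinterface
import numpy as np

interface = audinterface.Process(
    process_func=lambda signal, sampling_rate: np.abs(signal).mean(),
    num_workers=4,
    verbose=False,
)
# hypothetical paths; starts and ends default to the whole files
y = interface.process_files(['speech-01.wav', 'speech-02.wav'])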
def process_signal_from_index(
        self,
        signal: np.ndarray,
        sampling_rate: int,
        index: pd.Index,
) -> pd.Index:
    r"""Segment parts of a signal.

    Args:
        signal: signal values
        sampling_rate: sampling rate in Hz
        index: a segmented index conform to audformat_
            or a :class:`pandas.MultiIndex` with two levels
            named `start` and `end`
            that hold start and end positions
            as :class:`pandas.Timedelta` objects.
            See also :func:`audinterface.utils.signal_index`

    Returns:
        Segmented index conform to audformat_

    Raises:
        RuntimeError: if sampling rates do not match
        RuntimeError: if channel selection is invalid
        ValueError: if index contains duplicates

    .. _audformat: https://audeering.github.io/audformat/data-format.html

    """
    utils.assert_index(index)

    if index.empty:
        return index

    if isinstance(index, pd.MultiIndex) and len(index.levels) == 2:
        params = [
            (
                (signal, sampling_rate),
                {'start': start, 'end': end},
            )
            for start, end in index
        ]
    else:
        index = audformat.utils.to_segmented_index(index)
        params = [
            (
                (signal, sampling_rate),
                {'file': file, 'start': start, 'end': end},
            )
            for file, start, end in index
        ]

    y = audeer.run_tasks(
        self.process_signal,
        params,
        num_workers=self.process.num_workers,
        multiprocessing=self.process.multiprocessing,
        progress_bar=self.process.verbose,
        task_description=f'Process {len(index)} segments',
    )

    index = y[0]
    for obj in y[1:]:
        index = index.union(obj)

    return index
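The two index layouts the method distinguishes can be built directly with pandas. A minimal sketch of the plain (start, end) variant, the same shape the docstring attributes to audinterface.utils.signal_index:

import pandas as pd

index = pd.MultiIndex.from_arrays(
    [
        pd.to_timedelta([0, 2], unit='s'),
        pd.to_timedelta([1, 3], unit='s'),
    ],
    names=['start', 'end'],
)
# a (file, start, end) index would instead take the segmented-index branch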
def _get_media_from_backend(
        name: str,
        media: typing.Sequence[str],
        db_root: str,
        db_root_tmp: str,
        flavor: typing.Optional[Flavor],
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    r"""Load media from backend."""
    # figure out archives
    archives = set()
    archive_names = set()
    for file in media:
        archive_name = deps.archive(file)
        archive_version = deps.version(file)
        archives.add((archive_name, archive_version))
        archive_names.add(archive_name)
    # collect all files that will be extracted,
    # if we have more files than archives
    if len(deps.files) > len(deps.archives):
        files = list()
        for file in deps.media:
            archive = deps.archive(file)
            if archive in archive_names:
                files.append(file)
        media = files

    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    # using multi-processing
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    def job(archive: str, version: str):
        archive = backend.join(
            name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        # extract and move all files that are stored in the archive,
        # even if only a single file from the archive was requested
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            if flavor is not None:
                bit_depth = deps.bit_depth(file)
                channels = deps.channels(file)
                sampling_rate = deps.sampling_rate(file)
                src_path = os.path.join(db_root_tmp, file)
                file = flavor.destination(file)
                dst_path = os.path.join(db_root_tmp, file)
                flavor(
                    src_path,
                    dst_path,
                    src_bit_depth=bit_depth,
                    src_channels=channels,
                    src_sampling_rate=sampling_rate,
                )
                if src_path != dst_path:
                    os.remove(src_path)
            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Load media',
    )
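The flavor(...) call above converts each extracted file to the requested format on the fly. A hedged sketch of constructing such a flavor via the public audb.Flavor class; the exact keyword set is an assumption about the installed library version:

import audb

# request 16 kHz, single-channel WAV files (assumed keyword arguments)
flavor = audb.Flavor(
    format='wav',
    sampling_rate=16000,
    channels=[0],
)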
    'b01': 'Was sind denn das für Tüten, die da unter dem Tisch '
           'stehen.',
    'b02': 'Sie haben es gerade hochgetragen und jetzt gehen sie '
           'wieder runter.',
    'b03': 'An den Wochenenden bin ich jetzt immer nach Hause '
           'gefahren und habe Agnes besucht.',
    'b09': 'Ich will das eben wegbringen und dann mit Karl was '
           'trinken gehen.',
    'b10': 'Die wird auf dem Platz sein, wo wir sie immer hinlegen.',
}
transcriptions = list(parse_names(names, from_i=2, to_i=5))
durations = audeer.run_tasks(
    task_func=lambda x: pd.to_timedelta(
        af.duration(os.path.join(src_dir, x)),
        unit='s',
    ),
    params=[([f], {}) for f in files],
    num_workers=12,
)

# Convert to audformat
db = audformat.Database(
    name='emodb',
    author=(
        'Felix Burkhardt, '
        'Astrid Paeschke, '
        'Miriam Rolfes, '
        'Walter Sendlmeier, '
        'Benjamin Weiss'
    ),
    organization='audEERING',