Example #1
def _put_tables(
    tables: typing.List[str],
    db_root: str,
    db_name: str,
    version: str,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    def job(table: str):
        file = f'db.{table}.csv'
        archive_file = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            table,
        )
        backend.put_archive(db_root, file, archive_file, version)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Put tables',
    )
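
All of the examples on this page share the same calling convention: audeer.run_tasks takes a task function plus a list of (args, kwargs) tuples, one tuple per task, and calls the function once per tuple. A minimal, self-contained sketch of that convention (the square task and its inputs are made up for illustration, not taken from audb):

import audeer

def square(x: int, offset: int = 0) -> int:
    # toy task function, standing in for the job() closures above
    return x * x + offset

# one ([positional args], {keyword args}) tuple per task,
# mirroring [([table], {}) for table in tables] above
params = [([x], {'offset': 1}) for x in range(4)]

results = audeer.run_tasks(
    square,
    params,
    num_workers=2,  # run up to two tasks concurrently
)
print(results)  # [1, 2, 5, 10]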
Example #2
def _find_media(
    db: audformat.Database,
    db_root: str,
    deps: Dependencies,
    num_workers: typing.Optional[int],
    verbose: bool,
) -> typing.List[str]:
    r"""Find altered and new media."""

    media = []

    def job(file: str):
        if not deps.removed(file):
            full_file = os.path.join(db_root, file)
            if not os.path.exists(full_file):
                media.append(file)

    audeer.run_tasks(
        job,
        params=[([file], {}) for file in db.files],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Find media',
    )

    return media
Example #3
def _find_tables(
    db_header: audformat.Database,
    db_root: str,
    deps: Dependencies,
    num_workers: typing.Optional[int],
    verbose: bool,
) -> typing.List[str]:

    tables = []

    def job(table: str):
        file = f'db.{table}.csv'
        full_file = os.path.join(db_root, file)
        if not os.path.exists(full_file):
            tables.append(file)
        else:
            checksum = audbackend.md5(full_file)
            # if the table already exists
            # we have to compare checksums,
            # in case it was altered by a flavor
            if checksum != deps.checksum(file):  # pragma: no cover
                tables.append(file)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in db_header.tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Find tables',
    )

    return tables
Example #4
def _get_media_from_cache(
    media: typing.Sequence[str],
    db_root: str,
    db_root_tmp: str,
    deps: Dependencies,
    cached_versions: typing.Sequence[typing.Tuple[LooseVersion, str,
                                                  Dependencies]],
    flavor: Flavor,
    num_workers: int,
    verbose: bool,
) -> typing.Sequence[str]:
    r"""Copy media from cache."""

    cached_media, missing_media = _cached_files(
        media,
        deps,
        cached_versions,
        flavor,
        verbose,
    )

    def job(cache_root: str, file: str):
        _copy_file(file, cache_root, db_root_tmp, db_root)

    audeer.run_tasks(
        job,
        params=[([root, file], {}) for root, file in cached_media],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Copy media',
    )

    return missing_media
Example #5
def _get_tables(
    tables: typing.List[str],
    db_root: str,
    db_root_tmp: str,
    db_name: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    def job(table: str):
        # If a pickled version of the table exists,
        # we have to remove it to make sure that
        # later on the new CSV tables are loaded.
        # This can happen when upgrading an existing
        # database to a different version.
        path_pkl = os.path.join(
            db_root, table)[:-3] + audformat.define.TableStorageFormat.PICKLE
        if os.path.exists(path_pkl):
            os.remove(path_pkl)
        archive = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            deps.archive(table),
        )
        backend.get_archive(archive, db_root_tmp, deps.version(table))
        _move_file(db_root_tmp, db_root, table)

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get tables',
    )
Example #6
def _fix_media_ext(
    tables: typing.Sequence[audformat.Table],
    format: str,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    def job(table):
        # Faster solution than using db.map_files()
        cur_ext = r'\.[a-zA-Z0-9]+$'  # match file extension
        new_ext = f'.{format}'
        if table.is_filewise:
            table.df.index = table.df.index.str.replace(
                cur_ext,
                new_ext,
                regex=True,
            )
        else:
            table.df.index = table.df.index.set_levels(
                table.df.index.levels[0].str.replace(
                    cur_ext,
                    new_ext,
                    regex=True,
                ),
                level='file',
            )

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Fix format',
    )
Example #7
def _put_media(
    media: typing.Set[str],
    db_root: str,
    db_name: str,
    version: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    # create a mapping from archives to media and
    # select archives with new or altered files for upload
    map_media_to_files = collections.defaultdict(list)
    for file in deps.media:
        if not deps.removed(file):
            map_media_to_files[deps.archive(file)].append(file)
            if deps.version(file) == version:
                media.add(deps.archive(file))

    lock = threading.Lock()

    def job(archive):
        if archive in map_media_to_files:
            for file in map_media_to_files[archive]:
                with lock:
                    deps._add_media(db_root, file, version)
            archive_file = backend.join(
                db_name,
                define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
                archive,
            )
            backend.put_archive(
                db_root,
                map_media_to_files[archive],
                archive_file,
                version,
            )

    # upload new and altered archives if they contain at least one file
    audeer.run_tasks(
        job,
        params=[([archive], {}) for archive in media],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Put media',
    )
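
Note the threading.Lock above: with num_workers > 1, run_tasks may execute job from several worker threads at once, and deps._add_media mutates shared state, so that call is serialized. The same idiom applies whenever a job writes to a shared structure; a minimal sketch (the job and the shared results list are illustrative, not part of audb):

import threading

import audeer

results = []  # shared across all worker threads
lock = threading.Lock()

def job(x: int):
    y = x * x  # thread-safe work stays outside the lock
    with lock:  # serialize only the shared-state mutation
        results.append(y)

audeer.run_tasks(
    job,
    params=[([x], {}) for x in range(10)],
    num_workers=4,
)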
Example #8
def _get_tables_from_backend(
    db: audformat.Database,
    tables: typing.Sequence[str],
    db_root: str,
    db_root_tmp: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    r"""Load tables from backend."""
    def job(table: str):
        archive = backend.join(
            db.name,
            define.DEPEND_TYPE_NAMES[define.DependType.META],
            deps.archive(table),
        )
        backend.get_archive(
            archive,
            db_root_tmp,
            deps.version(table),
        )
        table_id = table[3:-4]
        table_path = os.path.join(db_root_tmp, f'db.{table_id}')
        db[table_id].load(table_path)
        db[table_id].save(
            table_path,
            storage_format=audformat.define.TableStorageFormat.PICKLE,
        )
        for storage_format in [
                audformat.define.TableStorageFormat.PICKLE,
                audformat.define.TableStorageFormat.CSV,
        ]:
            _move_file(db_root_tmp, db_root, f'db.{table_id}.{storage_format}')

    audeer.run_tasks(
        job,
        params=[([table], {}) for table in tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Load tables',
    )
Example #9
def test_run_tasks(multiprocessing, num_workers, task_fun, params):
    expected = [
        task_fun(*param[0], **param[1]) for param in params
    ]
    results = audeer.run_tasks(
        task_fun,
        params,
        num_workers=num_workers,
        multiprocessing=multiprocessing,
    )
    assert expected == results
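
Besides exercising the API, this test pins down a property the other examples rely on: run_tasks returns its results in the same order as params, independent of num_workers and of whether multiprocessing is used, which is what makes the assert against the sequentially computed expected list valid.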
Example #10
def _get_media(
    media: typing.List[str],
    db_root: str,
    db_root_tmp: str,
    db_name: str,
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):

    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    # figure out archives
    archives = set()
    for file in media:
        archives.add((deps.archive(file), deps.version(file)))

    def job(archive: str, version: str):
        archive = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get media',
    )
Example #11
def _get_tables_from_cache(
    tables: typing.Sequence[str],
    db_root: str,
    db_root_tmp: str,
    deps: Dependencies,
    cached_versions: typing.Sequence[typing.Tuple[LooseVersion, str,
                                                  Dependencies]],
    num_workers: int,
    verbose: bool,
) -> typing.Sequence[str]:
    r"""Copy tables from cache."""

    cached_tables, missing_tables = _cached_files(
        tables,
        deps,
        cached_versions,
        None,
        verbose,
    )

    def job(cache_root: str, file: str):
        file_pkl = audeer.replace_file_extension(
            file,
            audformat.define.TableStorageFormat.PICKLE,
        )
        _copy_file(file, cache_root, db_root_tmp, db_root)
        _copy_file(file_pkl, cache_root, db_root_tmp, db_root)

    audeer.run_tasks(
        job,
        params=[([root, file], {}) for root, file in cached_tables],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Copy tables',
    )

    return missing_tables
Example #12
    def process_files(
            self,
            files: typing.Sequence[str],
            *,
            starts: typing.Sequence[pd.Timedelta] = None,
            ends: typing.Sequence[pd.Timedelta] = None,
            channel: int = None,
    ) -> pd.Series:
        r"""Process a list of files.

        Args:
            files: list of file paths
            channel: channel number
            starts: list with start positions
            ends: list with end positions

        Returns:
            Series with processed files

        """
        if starts is None:
            starts = [None] * len(files)
        if ends is None:
            ends = [None] * len(files)

        params = [
            (
                (file, ),
                {'start': start, 'end': end, 'channel': channel},
            ) for file, start, end in zip(files, starts, ends)
        ]
        y = audeer.run_tasks(
            self.process_file,
            params,
            num_workers=self.num_workers,
            multiprocessing=self.multiprocessing,
            progress_bar=self.verbose,
            task_description=f'Process {len(files)} files',
        )
        return pd.concat(y)
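
Since process_file returns a pandas.Series per file, run_tasks yields a list of Series here, and pd.concat joins them into the single Series promised by the docstring; the result keeps the order of files because run_tasks preserves the order of params.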
Example #13
    def process_signal_from_index(
        self,
        signal: np.ndarray,
        sampling_rate: int,
        index: pd.Index,
    ) -> pd.Index:
        r"""Segment parts of a signal.

        Args:
            signal: signal values
            sampling_rate: sampling rate in Hz
            index: a segmented index conform to audformat_
                or a :class:`pandas.MultiIndex` with two levels
                named `start` and `end` that hold start and end
                positions as :class:`pandas.Timedelta` objects.
                See also :func:`audinterface.utils.signal_index`

        Returns:
            Segmented index conform to audformat_

        Raises:
            RuntimeError: if sampling rates do not match
            RuntimeError: if channel selection is invalid
            ValueError: if index contains duplicates

        .. _audformat: https://audeering.github.io/audformat/data-format.html

        """
        utils.assert_index(index)

        if index.empty:
            return index

        if isinstance(index, pd.MultiIndex) and len(index.levels) == 2:
            params = [(
                (signal, sampling_rate),
                {
                    'start': start,
                    'end': end
                },
            ) for start, end in index]
        else:
            index = audformat.utils.to_segmented_index(index)
            params = [(
                (signal, sampling_rate),
                {
                    'file': file,
                    'start': start,
                    'end': end
                },
            ) for file, start, end in index]

        y = audeer.run_tasks(
            self.process_signal,
            params,
            num_workers=self.process.num_workers,
            multiprocessing=self.process.multiprocessing,
            progress_bar=self.process.verbose,
            task_description=f'Process {len(index)} segments',
        )

        index = y[0]
        for obj in y[1:]:
            index = index.union(obj)

        return index
Example #14
def _get_media_from_backend(
    name: str,
    media: typing.Sequence[str],
    db_root: str,
    db_root_tmp: str,
    flavor: typing.Optional[Flavor],
    deps: Dependencies,
    backend: audbackend.Backend,
    num_workers: typing.Optional[int],
    verbose: bool,
):
    r"""Load media from backend."""

    # figure out archives
    archives = set()
    archive_names = set()
    for file in media:
        archive_name = deps.archive(file)
        archive_version = deps.version(file)
        archives.add((archive_name, archive_version))
        archive_names.add(archive_name)
    # collect all files that will be extracted,
    # if we have more files than archives
    if len(deps.files) > len(deps.archives):
        files = list()
        for file in deps.media:
            archive = deps.archive(file)
            if archive in archive_names:
                files.append(file)
        media = files

    # create folder tree to avoid race condition
    # in os.makedirs when files are unpacked
    # using multi-processing
    for file in media:
        audeer.mkdir(os.path.dirname(os.path.join(db_root, file)))
        audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file)))

    def job(archive: str, version: str):
        archive = backend.join(
            name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        # extract and move all files that are stored in the archive,
        # even if only a single file from the archive was requested
        files = backend.get_archive(archive, db_root_tmp, version)
        for file in files:
            if flavor is not None:
                bit_depth = deps.bit_depth(file)
                channels = deps.channels(file)
                sampling_rate = deps.sampling_rate(file)
                src_path = os.path.join(db_root_tmp, file)
                file = flavor.destination(file)
                dst_path = os.path.join(db_root_tmp, file)
                flavor(
                    src_path,
                    dst_path,
                    src_bit_depth=bit_depth,
                    src_channels=channels,
                    src_sampling_rate=sampling_rate,
                )
                if src_path != dst_path:
                    os.remove(src_path)

            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Load media',
    )
Example #15
    'b01': 'Was sind denn das für Tüten, die da unter dem Tisch '
           'stehen.',
    'b02': 'Sie haben es gerade hochgetragen und jetzt gehen sie '
           'wieder runter.',
    'b03': 'An den Wochenenden bin ich jetzt immer nach Hause '
           'gefahren und habe Agnes besucht.',
    'b09': 'Ich will das eben wegbringen und dann mit Karl was '
           'trinken gehen.',
    'b10': 'Die wird auf dem Platz sein, wo wir sie immer hinlegen.',
}
transcriptions = list(parse_names(names, from_i=2, to_i=5))

durations = audeer.run_tasks(
    task_func=lambda x: pd.to_timedelta(
        af.duration(os.path.join(src_dir, x)),
        unit='s',
    ),
    params=[([f], {}) for f in files],
    num_workers=12,
)

# Convert to audformat
db = audformat.Database(
    name='emodb',
    author=(
        'Felix Burkhardt, '
        'Astrid Paeschke, '
        'Miriam Rolfes, '
        'Walter Sendlmeier, '
        'Benjamin Weiss'
    ),
    organization='audEERING',