def main():
    """Prompt for a folder of ELAN files, collect every tier name found
    across them, and hand the set to select_tier for the user to pick from."""
    # Ask where the Elan files live.
    # Default is a folder named "input" in the same directory as this script.
    questions = [{
        'type': 'input',
        'name': 'input_dir',
        'message': 'Name of folder with Elan files?',
        'default': 'input'
    }]
    answers = prompt(questions)
    input_dir = answers["input_dir"]

    # Recursively gather every path under the input dir, then keep .eaf files.
    extensions = {"*.eaf"}
    candidates = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))
    input_files = find_files_by_ext(candidates, extensions)

    # Union of tier names across all files — using pympi-ling's Eaf reader.
    tier_names = set()
    for eaf_path in input_files:
        tier_names.update(Eaf(eaf_path).get_tier_names())

    # NOTE(review): looks like a leftover debug print — consider removing.
    print(type(tier_names))
    select_tier(tier_names)
def get_elan_tier_attributes(input_eafs_files):
    """Collect tier metadata from a collection of ELAN files.

    Scans every file and gathers the unique linguistic tier types, the
    unique tier names (ids), and the largest number of tiers seen in any
    single file.

    :param input_eafs_files: iterable of paths to .eaf files
    :return: tuple of (tier_types, tier_names, tier_max_count)
    """
    # Sets give uniqueness for free; converted to lists on return.
    types_seen = set()
    names_seen = set()
    max_tiers = 0
    for path in input_eafs_files:
        eaf = Eaf(path)
        for linguistic_type in eaf.get_linguistic_type_names():
            types_seen.add(linguistic_type)
            names_seen.update(
                eaf.get_tier_ids_for_linguistic_type(linguistic_type))
        # Track the largest tier count found in any one file.
        max_tiers = max(max_tiers, len(list(eaf.get_tier_names())))
    return (list(types_seen), list(names_seen), max_tiers)
def update_ui(file_paths: List[Path], ui): """ Iterate a dir of elan files and compiles info about all the files' tiers: unique tier types, unique tier names, and the num of tiers """ # Use sets internally for easy uniqueness, conver to lists when done _tier_types: Set[str] = set(ui['data']['tier_type']['options']) _tier_names: Set[str] = set(ui['data']['tier_name']['options']) tier_max_count = 0 print('**** ui data') print(ui['data']) print('**** _tier_types') print(_tier_types) eaf_paths = [p for p in file_paths if f'{p}'.endswith('.eaf')] for eaf_path in eaf_paths: input_eaf = Eaf(eaf_path) for tier_type in list(input_eaf.get_linguistic_type_names()): _tier_types.add(tier_type) tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type( tier_type) for tier_id in tier_ids: _tier_names.add(tier_id) # count the number of tiers, use the max from all files tier_count = len(list(input_eaf.get_tier_names())) if tier_count > tier_max_count: tier_max_count = tier_count ui['data']['tier_type']['options'] = list(_tier_types) ui['data']['tier_name']['options'] = list(_tier_names) ui['data']['tier_order']['options'] = [i for i in range(tier_max_count)] return ui
def main() -> None:
    """
    A command line utility to silence the audio files in a given directory

    Usage: python3 silence_audio.py [-h] -c CORPUS [-s SILENCE_TIER] [-o]
    """
    global silence_mono
    global silence_stereo
    global do_not_publish
    parser = argparse.ArgumentParser(
        description=
        "This script will silence a wave file based on annotations in an Elan tier"
    )
    parser.add_argument('-c',
                        '--corpus',
                        help='Directory of audio and eaf files',
                        type=str,
                        required=True)
    parser.add_argument(
        '-s',
        '--silence_tier',
        help='Silence audio when annotations are found on this tier',
        type=str,
        default='Silence')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Write over existing files',
                        action="store_true")
    arguments = parser.parse_args()

    # Reset the module-level accumulators before processing.
    silence_mono = 0
    silence_stereo = [0, 0]
    # Appended to the basename when not overwriting the source wav.
    suffix = 'S'

    # Look for .eaf files in the corpus dir (non-recursive). To recurse:
    # glob.iglob(arguments.corpus + '/**/*.eaf', recursive=True)
    for file_path in glob.iglob(arguments.corpus + '/*.eaf'):
        eaf_file = Eaf(file_path)
        names = eaf_file.get_tier_names()
        # Check for existence of silence tier
        if arguments.silence_tier in names:
            print("Have tier %s in %s" % (arguments.silence_tier, file_path))
            basename, extension = os.path.splitext(file_path)
            # Source wav sits next to the eaf. Renamed from `input`, which
            # shadowed the builtin; it was also never used.
            # NOTE(review): silence_audio presumably derives the wav path
            # itself — confirm it doesn't need input_wav passed in.
            input_wav = basename + ".wav"
            if arguments.overwrite:
                output = basename + ".wav"
            else:
                output = basename + suffix + ".wav"
            silence_audio(eaf_file, output)
def explore_elan_files(elan_paths):
    """Interactively dump the tier structure of ELAN files.

    For each file, prints its path, every tier name, and every annotation
    on each readable tier, then waits for Enter before the next file.
    """
    for path in elan_paths:
        print(path)
        eaf = Eaf(path)
        for tier_name in eaf.get_tier_names():
            print("\t", tier_name)
            # Some tiers have no readable annotation data; skip those.
            try:
                annotations = eaf.get_annotation_data_for_tier(tier_name)
            except KeyError:
                continue
            for annotation in annotations:
                print("\t\t", annotation)
        # Pause so the user can inspect this file's output.
        input()
def read_eaf(ie, tier, silence_tier, silence_marker, json_data,
             output_text_dir, output_audio_dir):
    """
    Slice an Elan file's annotations into audio clips and transcript files.

    For each annotation on `tier`, writes a wav clip and a txt transcript,
    and appends a metadata dict to `json_data` (mutated in place). An
    annotation is skipped if its value equals `silence_marker`, or if a
    `silence_tier` ref annotation exists at its start time.

    :param ie: path to the input .eaf file; a matching .wav is assumed to
        sit alongside it
    :param tier: name of the tier to read annotations from
    :param silence_tier: name of a reference tier marking spans to skip
    :param silence_marker: annotation value that marks a span to skip
    :param json_data: list collecting one metadata dict per kept clip
    :param output_text_dir: directory for the .txt transcript files
    :param output_audio_dir: directory for the .wav clip files
    :return: False if `tier` is missing from the file, otherwise None
    """
    input_eaf = Eaf(ie)
    # Check if the tiers we have been given exist
    tier_names = input_eaf.get_tier_names()
    if tier not in tier_names:
        print('missing tier: ' + tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        # Not fatal: processing continues without ref-tier silencing
        print('missing silence tier: ' + silence_tier, file=sys.stderr)
    # get the input_scripts audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)
    # We can pass in an arg for a ref tier that has silence labels;
    # only honoured when it is actually a child (PARENT_REF) of `tier`
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True
    # Get annotation values, start and end times, and speaker id
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']
    i = 0
    for ann in annotations:
        skip = False
        start = ann[0]
        end = ann[1]
        # output_scripts new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]
        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True
        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier and len(
                input_eaf.get_ref_annotation_at_time(silence_tier, start)):
            skip = True
        if skip is True:
            # print('skipping annotation: ' + annotation, start, end)
            print("skipping" + str(i))
        else:
            print("processing" + str(i))
            # print('processing annotation: ' + annotation, start, end)
            # build the output_scripts audio/text filename
            fname = basename + "_" + str(i)
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            json_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname, ".wav",
                                     output_audio_dir)
            write_text(annotation, fname, ".txt", output_text_dir)
            # NOTE(review): i only advances for kept clips, so skipped
            # annotations leave no gaps in the output numbering
            i += 1
def import_eaf_file(eaf_paths: List[str], context: Dict[str, str],
                    reset_annotations: Callable, add_annotation: Callable,
                    tmp_dir):
    """
    Import handler for processing .eaf files.

    :param eaf_paths: List of string paths to Elan files.
    :param context: The settings that will be used to process data from the
        Elan files.
    :param reset_annotations: Callback to wipe all annotations that have
        been previously read. Settings such as the tier type/name/order
        determine which annotations are read into the dataset
        _annotation_store. When the user changes settings,
        reset_annotations resets _annotation_store to {}, ready for
        annotations derived from the new settings. Without it, changing
        settings would append new-setting annotations onto old ones.
    :param add_annotation: Callback to append an annotation from the
        selected tier.
    :param tmp_dir: unused in this function — presumably supplied by the
        import framework; confirm before removing.
    """
    # Unpack the user-selected settings
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Special cases / translation tags arrive as newline-separated strings
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())
    # Start from a clean annotation store (see docstring)
    reset_annotations()
    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)
        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())
        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []
        # Try using tier_order. Watch out for mixed type, empty str if not
        # selected, int if selected
        if isinstance(tier_order, int):
            try:
                # NOTE(review): indexes tier_names directly (0-based); the
                # sibling import handler uses tier_order - 1 (1-based) —
                # confirm which convention the caller sends.
                tier_name = tier_names[tier_order]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
                pass
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                # tier_names is re-bound to the ids of the matching type;
                # the first id becomes the target tier
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")
        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")
            for annotation in annotations:
                start = annotation[0]
                end = annotation[1]
                # Re-bind to the annotation text (third tuple element)
                annotation = annotation[2]
                utterance = {
                    "audio_file_name": f"{file_name}.wav",
                    "transcript": annotation,
                    "start_ms": start,
                    "stop_ms": end,
                    "speaker_id": speaker_id
                }
                # Normalise punctuation/special cases before storing
                utterance_cleaned = clean_json_utterance(
                    utterance=utterance,
                    punctuation_to_collapse_by=punctuation_to_collapse_by,
                    punctuation_to_explode_by=punctuation_to_explode_by,
                    special_cases=special_cases,
                    translation_tags=translation_tags,
                    remove_english=False,
                    use_langid=False)
                add_annotation(file_name, utterance_cleaned)
# NOTE(review): this fragment appears to be the tail of a silencing routine
# followed by script-level driver code; `output`, `samples`, `params`,
# `num_channels`, `annotations` and `num_samples` are bound earlier —
# confirm against the enclosing context.
with wave.open(output, 'wb') as audio:
    # Mono data holds nframes samples; stereo is interleaved, so twice as many
    samples.shape = params.nframes if num_channels == 1 else (
        params.nframes * 2)
    audio.setparams(params)
    audio.writeframesraw(samples)
print("Silenced {} intervals ({:.1f}s)".format(
    len(annotations), num_samples / params.framerate))

# look for .eaf files, recursively from the passed corpus dir
# for fpath in glob.iglob(corpus + '/**/*.eaf', recursive=True):
for fpath in glob.iglob(corpus + '/*.eaf'):
    print(fpath)
    eaffile = Eaf(fpath)
    names = eaffile.get_tier_names()
    # print(names)
    # check for existence of silence tier
    #
    if DO_NOT_PUBLISH in names:
        print("have tier %s in %s" % (DO_NOT_PUBLISH, fpath))
        basename, extn = os.path.splitext(fpath)
        # NOTE(review): `input` shadows the builtin and is never used below
        input = basename + ".wav"
        if overwrite == 'yes':
            output = basename + ".wav"
        else:
            output = basename + SUFFIX + ".wav"
def import_eaf_file(eaf_paths, context, add_annotation, tmp_dir):
    """
    Import handler for processing .eaf (ELAN Annotation Format) files.

    Reads a chosen tier from each Elan file and stores each transcription
    in the following format:
    {'speaker_id': <speaker_id>, 'audio_file_name': <file_name>,
     'transcript': <transcription_label>,
     'start_ms': <start_time_in_milliseconds>,
     'stop_ms': <stop_time_in_milliseconds>}

    :param eaf_paths: List of string paths to Elan files.
    :param context: settings dict (tier order/name/type, punctuation,
        special cases, translation tags) used to process the files
    :param add_annotation: callback to append an annotation for a file
    :param tmp_dir: unused in this function — presumably supplied by the
        import framework; confirm before removing.
    """
    # Unpack the user-selected settings
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Convert dirty words and tokens from str to set, split by '\n'
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())
    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)
        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())
        # TODO: Check if this is necessary? It is possible to process transcription and audio file separately.
        # # Look for wav file matching the eaf file in same directory
        # if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        #     print("WAV file found for " + file_name, file=sys.stderr)
        # else:
        #     raise ValueError(f"WAV file not found for {full_file_name}. "
        #                      f"Please put it next to the eaf file in {input_directory}.")
        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []
        # Determine tier_name
        # First try using tier order to get tier name
        if tier_order:
            # Watch out for files that may not have this many tiers
            # tier_order is 1-index but List indexing is 0-index
            try:
                tier_name = tier_names[tier_order - 1]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
                pass
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                # tier_names is re-bound to the ids of the matching type;
                # the first id becomes the target tier
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")
        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")
            for annotation in annotations:
                start = annotation[0]
                end = annotation[1]
                # Re-bind to the annotation text (third tuple element)
                annotation = annotation[2]
                utterance = {
                    "audio_file_name": f"{file_name}.wav",
                    "transcript": annotation,
                    "start_ms": start,
                    "stop_ms": end
                }
                # TODO: re-enable later
                # if "PARTICIPANT" in parameters:
                #     obj["speaker_id"] = speaker_id
                utterance_cleaned = clean_json_utterance(
                    utterance=utterance,
                    punctuation_to_collapse_by=punctuation_to_collapse_by,
                    punctuation_to_explode_by=punctuation_to_explode_by,
                    special_cases=special_cases,
                    translation_tags=translation_tags,
                    remove_english=False,
                    use_langid=False)
                add_annotation(file_name, utterance_cleaned)
def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation
    Format). Transcriptions are read from an elan file tier. Tiers are
    nodes from the tree structure in the .eaf file. The tier to read from
    is determined by tier order (eg top tier would be order 1), tier type
    (eg default-lt) or tier name (eg Phrase). If tier type is used, the
    first tier matching this type is used. Elan can have multiple tiers of
    same type; future work would support reading data from multiple tiers
    of the selected type.

    It stores the transcriptions in the following format:
    {'speaker_id': <speaker_id>, 'audio_file_name': <file_name>,
     'transcript': <transcription_label>,
     'start_ms': <start_time_in_milliseconds>,
     'stop_ms': <stop_time_in_milliseconds>}

    :param input_elan_file: name of input elan file
    :param tier_order: index of the elan tier to process (1-based; 0 means
        "not selected", falling back to tier type/name)
    :param tier_name: name of the elan tier to process
    :param tier_type: type of the elan tier to process
    :param corpus_tiers_file: currently unused — reserved for the
        commented-out save_tier_info call below; confirm before removing
    :return: a list of dictionaries, where each dictionary is an annotation
    :raises ValueError: if no .wav file with a matching name sits next to
        the eaf file
    """
    print(
        f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}"
    )
    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)
    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")
    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())
    # Keep this data handy for future corpus analysis
    # save_tier_info(input_eaf=input_eaf,
    #                tier_types=tier_types,
    #                file_name=file_name,
    #                corpus_tiers_file=corpus_tiers_file)
    # Get annotations and parameters (things like speaker id) on the target tier
    annotations: List[Tuple[str, str, str]] = []
    annotations_data: List[dict] = []
    # First try using tier order to get tier name
    if tier_order:
        # Watch out for files that may not have this many tiers
        # tier_order is 1-index but List indexing is 0-index
        try:
            tier_name = tier_names[tier_order - 1]
            print(
                f"using tier order {tier_order} to get tier name {tier_name}")
        except IndexError:
            print("couldn't find a tier")
            pass
    else:
        # else use tier type to get a tier name
        if tier_type in tier_types:
            print(f"found tier type {tier_type}")
            # tier_names is re-bound to the ids of the matching type;
            # the first id becomes the target tier
            tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            print("tier type not found in this file")
    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)
    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        parameters: Dict[str,
                         str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")
        for annotation in annotations:
            start: str = annotation[0]
            end: str = annotation[1]
            annotation_text: str = annotation[2]
            print(f"annotation {annotation} {start} {end}")
            obj = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation_text,
                "start_ms": start,
                "stop_ms": end
            }
            if "PARTICIPANT" in parameters:
                obj["speaker_id"] = speaker_id
            annotations_data.append(obj)
    return annotations_data
def read_eaf(ie):
    """
    Slice one Elan file's annotations into audio clips and text files.

    Reads annotations from the module-level `text_tier`, skips spans
    labelled with `silence_marker` or covered by a `silence_tier` ref
    annotation, writes a wav clip and txt transcript per kept annotation,
    and writes the collected metadata as json.

    Relies on module-level config: verbose, slice_tier, tier_order,
    silence_tier, tier, text_tier, silence_marker, name_with_annotation,
    prefix.

    :param ie: path to the input .eaf file; a matching .wav is assumed to
        sit alongside it
    :return: False on a missing/invalid tier, otherwise None
    """
    if verbose:
        print("input file is", ie)
    input_eaf = Eaf(ie)
    # Check if the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)
    # Are we working by slice_tier name or order?
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:
        # Sanity check that the slice_tier num is not greater than the num of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers",
                  file=sys.stderr)
            return False
        if verbose:
            print("using slice_tier by number:",
                  tier_names[tier_order - 1],
                  file=sys.stderr)
    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        # Not fatal: processing continues without ref-tier silencing
        if verbose:
            print('silence tier not found: ' + silence_tier, file=sys.stderr)
    # get the input audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    # we can write out mp3 or whatever, still require wav input
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)
    # We can pass in an arg for a ref tier that has silence labels;
    # only honoured when it is actually a child (PARENT_REF) of `tier`
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True
    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier')
        return False
    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))
    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']
    annotations_data = []
    i = 0
    for ann in annotations:
        skip = False
        ref_annotation = []
        start = ann[0]
        end = ann[1]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]
        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True
        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier:
            ref_annotation = input_eaf.get_ref_annotation_at_time(
                silence_tier, start)
            # BUG FIX: was `if len(ref_annotation) is True:` — an int is
            # never the object True, so ref-tier silencing never fired.
            if len(ref_annotation) > 0:
                skip = True
        if skip is True:
            print('skipping annotation: ' + annotation, start, end)
        else:
            print('processing annotation: ' + annotation, start, end)
            # build the output audio/text filename
            fname = basename + "_" + str(i)
            if name_with_annotation:
                fname = slugify(annotation)
            if prefix != '':
                fname = prefix + '_' + fname
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            annotations_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname)
            write_text(annotation, fname)
            i += 1
    # output the json data for the next step in kaldi pipeline
    write_json(annotations_data)
    if verbose:
        print(annotations_data)