Example #1
def get_elan_tier_attributes(input_eafs_files):
    """
    Iterate a dir of elan files and compiles info about all the files' tiers:
    unique tier types, unique tier names, and the num of tiers
    """
    # Use sets internally for easy uniqueness, conver to lists when done
    _tier_types: Set[str] = set()
    _tier_names: Set[str] = set()
    _tier_max_count: int = 0
    for file_ in input_eafs_files:
        input_eaf = Eaf(file_)
        for tier_type in list(input_eaf.get_linguistic_type_names()):
            _tier_types.add(tier_type)
            tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type(
                tier_type)
            for tier_id in tier_ids:
                _tier_names.add(tier_id)
        # count the number of tiers, use the max from all files
        tier_count = len(list(input_eaf.get_tier_names()))
        if tier_count > _tier_max_count:
            _tier_max_count = tier_count
    tier_types = list(_tier_types)
    tier_names = list(_tier_names)
    tier_max_count = _tier_max_count
    return (tier_types, tier_names, tier_max_count)
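A minimal usage sketch for the helper above, assuming `Eaf` is pympi's `pympi.Elan.Eaf` reader and the `List`/`Set` hints come from `typing` (the example omits its imports, so both are assumptions):

# Assumed imports for the example above (not shown in the original snippet).
from pathlib import Path
from typing import List, Set

from pympi.Elan import Eaf  # assumption: Eaf is pympi's ELAN reader

eaf_files = [str(p) for p in Path('corpus').glob('*.eaf')]  # hypothetical corpus directory
tier_types, tier_names, tier_max_count = get_elan_tier_attributes(eaf_files)
print(tier_types, tier_names, tier_max_count)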
Example #2
def update_ui(file_paths: List[Path], ui):
    """
    Iterate a dir of elan files and compiles info about all the files' tiers:
    unique tier types, unique tier names, and the num of tiers
    """
    # Use sets internally for easy uniqueness, conver to lists when done
    _tier_types: Set[str] = set(ui['data']['tier_type']['options'])
    _tier_names: Set[str] = set(ui['data']['tier_name']['options'])
    tier_max_count = 0

    print('**** ui data')
    print(ui['data'])

    print('**** _tier_types')
    print(_tier_types)

    eaf_paths = [p for p in file_paths if p.suffix == '.eaf']
    for eaf_path in eaf_paths:
        input_eaf = Eaf(eaf_path)
        for tier_type in list(input_eaf.get_linguistic_type_names()):
            _tier_types.add(tier_type)
            tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type(
                tier_type)
            for tier_id in tier_ids:
                _tier_names.add(tier_id)
        # count the number of tiers, use the max from all files
        tier_count = len(list(input_eaf.get_tier_names()))
        if tier_count > tier_max_count:
            tier_max_count = tier_count

    ui['data']['tier_type']['options'] = list(_tier_types)
    ui['data']['tier_name']['options'] = list(_tier_names)
    ui['data']['tier_order']['options'] = list(range(tier_max_count))
    return ui
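For reference, a hypothetical shape of the `ui` argument, inferred only from the keys this function reads and writes (the real UI object in the source project may differ):

# Hypothetical ui structure inferred from the keys accessed above.
ui = {
    'data': {
        'tier_type': {'options': []},
        'tier_name': {'options': []},
        'tier_order': {'options': []},
    }
}
ui = update_ui([Path('corpus/recording.eaf')], ui)  # hypothetical path
print(ui['data']['tier_type']['options'])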
Example #3
def save_tier_info(input_eaf: Eaf = None,
                   file_name: str = '',
                   tier_types: Optional[List[str]] = None,
                   corpus_tiers_file: str = 'corpus_tiers.json'):
    """
    Collect the tier names for each given tier type and append the result,
    keyed by file name, to the corpus tiers JSON file.
    """
    tiers = []
    for tier_type in tier_types or []:
        tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
        tiers.append({tier_type: tier_names})
    file_data = {"file": file_name, "tiers": tiers}
    corpus_tiers = load_json_file(corpus_tiers_file)
    corpus_tiers.append(file_data)
    write_data_to_json_file(data=corpus_tiers, output=corpus_tiers_file)
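The function above relies on two JSON helpers that are not shown in these examples. Below is a sketch of what they might look like; the signatures are inferred from the call sites, so these are hypothetical stand-ins rather than the project's originals:

# Hypothetical stand-ins for the JSON helpers used by save_tier_info above;
# signatures are inferred from the call sites, not taken from the source project.
import json
import os

def load_json_file(file_name: str) -> list:
    # Start with an empty list if the corpus tiers file does not exist yet.
    if not os.path.isfile(file_name):
        return []
    with open(file_name, encoding='utf-8') as f:
        return json.load(f)

def write_data_to_json_file(data=None, output: str = ''):
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)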
Example #4
def import_eaf_file(eaf_paths: List[str], context: Dict[str, str],
                    reset_annotations: Callable, add_annotation: Callable,
                    tmp_dir):
    """
    Import handler for processing .eaf files.

    :param eaf_paths: List of string paths to Elan files.
    :param context: The settings that will be used to process data from the Elan files.
    :param reset_annotations: Callback to wipe all annotations that have been previously read.
        Settings such as the tier type/name/order determine which annotations are read into the
        dataset _annotation_store. When settings are changed, reset_annotations resets the dataset
        _annotation_store to {}, ready for annotations derived from the new settings to be added.
        Without this, annotations produced by the new settings would be appended to the previous ones.
    :param add_annotation: Callback to append an annotation from the selected tier.
    :param tmp_dir: Temporary directory passed by the caller; unused in this handler.
    """

    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    reset_annotations()

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []

        # Try using tier_order. Watch out for mixed type, empty str if not selected, int if selected
        if isinstance(tier_order, int):
            try:
                tier_name = tier_names[tier_order]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation_text = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation_text,
                "start_ms": start,
                "stop_ms": end,
                "speaker_id": speaker_id
            }

            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
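An illustrative caller for this handler, assuming a plain dict as the annotation store; the keys in `context` mirror exactly what `import_eaf_file` reads above, but all values and paths are made up for the example:

# Illustrative caller; the annotation store and context values are hypothetical.
_annotation_store = {}

def reset_annotations():
    _annotation_store.clear()

def add_annotation(file_name, utterance):
    _annotation_store.setdefault(file_name, []).append(utterance)

context = {
    'tier_order': '',              # empty string when not selected, int when selected
    'tier_name': 'Phrase',
    'tier_type': 'default-lt',
    'punctuation_to_collapse_by': ':-',
    'punctuation_to_explode_by': ';',
    'special_cases': '',
    'translation_tags': '',
}
import_eaf_file(['corpus/recording.eaf'], context,
                reset_annotations, add_annotation, tmp_dir=None)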
Example #5
def import_eaf_file(eaf_paths, context, add_annotation, tmp_dir):
    """
    Import handler for processing all .wav and .eaf files.

    :param wav_paths: List of string paths to Wave files.
    :param eaf_paths: List of string paths to Elan files.
    """
    """
    Import handler for processing all .eaf files.

    Method to process a particular tier in an eaf file (ELAN Annotation Format). It stores the transcriptions in the 
    following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}

    :param eaf_paths: List of string paths to Elan files.
    :return: a list of dictionaries, where each dictionary is an annotation
    """
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Convert dirty words and tokens from str to set, split by '\n'
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # TODO: Check if this is necessary? It is possible to process transcription and audio file separately.
        # # Look for wav file matching the eaf file in same directory
        # if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        #     print("WAV file found for " + file_name, file=sys.stderr)
        # else:
        #     raise ValueError(f"WAV file not found for {full_file_name}. "
        #                     f"Please put it next to the eaf file in {input_directory}.")

        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []

        # Determine tier_name
        # First try using tier order to get tier name
        if tier_order:
            # Watch out for files that may not have this many tiers
            # tier_order is 1-index but List indexing is 0-index
            try:
                tier_name = tier_names[tier_order - 1]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation_text = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation_text,
                "start_ms": start,
                "stop_ms": end
            }
            # TODO: re-enable later
            # if "PARTICIPANT" in parameters:
            #     obj["speaker_id"] = speaker_id

            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
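Both this handler and the previous one depend on `clean_json_utterance`, which is not included in these examples. A pass-through stub like the one below, with keyword arguments copied from the call sites, is enough to exercise the handlers in isolation; it is a hypothetical placeholder, and the real function presumably normalises the transcript text:

# Hypothetical stub for clean_json_utterance; keyword arguments are copied from
# the call sites above. The real function presumably cleans the transcript.
def clean_json_utterance(utterance=None,
                         punctuation_to_collapse_by='',
                         punctuation_to_explode_by='',
                         special_cases=None,
                         translation_tags=None,
                         remove_english=False,
                         use_langid=False) -> dict:
    # Pass-through placeholder: return the utterance unchanged.
    return utterance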
Example #6
def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation Format).
    Transcriptions are read from an elan file tier.
    Tiers are nodes from the tree structure in the .eaf file.
    The tier to read from is determined by tier order (eg top tier would be order 1),
    tier type (eg default-lt) or tier name (eg Phrase).
    If tier type is used, the first tier matching this type is used.
    Elan can have multiple tiers of same type, future work would support reading data
    from multiple tiers of the selected type.

    It stores the transcriptions in the following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}

    :param input_elan_file: name of the input elan file
    :param tier_order: index of the elan tier to process (1-indexed; 0 means not selected)
    :param tier_type: type of the elan tier to process
    :param tier_name: name of the elan tier to process
    :param corpus_tiers_file: JSON file for saving corpus tier info (only used by the
        commented-out save_tier_info call below)
    :return: a list of dictionaries, where each dictionary is an annotation
    """

    print(
        f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}"
    )

    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)

    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")

    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())

    # Keep this data handy for future corpus analysis
    # save_tier_info(input_eaf=input_eaf,
    #               tier_types=tier_types,
    #               file_name=file_name,
    #               corpus_tiers_file=corpus_tiers_file)

    # Get annotations and parameters (things like speaker id) on the target tier
    annotations: List[Tuple[str, str, str]] = []
    annotations_data: List[dict] = []

    # First try using tier order to get tier name
    if tier_order:
        # Watch out for files that may not have this many tiers
        # tier_order is 1-index but List indexing is 0-index
        try:
            tier_name = tier_names[tier_order - 1]
            print(
                f"using tier order {tier_order} to get tier name {tier_name}")
        except IndexError:
            print("couldn't find a tier")
    else:
        # else use tier type to get a tier name
        if tier_type in tier_types:
            print(f"found tier type {tier_type}")
            tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            print("tier type not found in this file")

    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)

    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")

    for annotation in annotations:
        start: str = annotation[0]
        end: str = annotation[1]
        annotation_text: str = annotation[2]
        print(f"annotation {annotation} {start} {end}")
        obj = {
            "audio_file_name": f"{file_name}.wav",
            "transcript": annotation_text,
            "start_ms": start,
            "stop_ms": end
        }
        if "PARTICIPANT" in parameters:
            obj["speaker_id"] = speaker_id
        annotations_data.append(obj)

    return annotations_data
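A short usage sketch for `process_eaf` (file paths are illustrative); note that the function raises ValueError unless a .wav file with the same base name sits next to the .eaf file:

# Illustrative call; file paths are hypothetical.
annotations_data = process_eaf(input_elan_file='corpus/recording.eaf',
                               tier_order=0,
                               tier_name='Phrase',
                               tier_type='default-lt',
                               corpus_tiers_file='corpus_tiers.json')
for utterance in annotations_data:
    print(utterance['start_ms'], utterance['stop_ms'], utterance['transcript'])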