예제 #1
0
def process_eaf(input_elan_file: str, tier_name: str) -> List[dict]:
    """
    Extract the annotations on one tier of an ELAN (.eaf) file.

    Each annotation is returned as a dictionary of the form:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}
    The 'speaker_id' key is present only when the tier declares a PARTICIPANT.

    :param input_elan_file: name of input_scripts elan file
    :param tier_name: name of the elan tier to process. these tiers are nodes from the tree structure in the .eaf file.
    :return: a list of dictionaries, where each dictionary is an annotation
    """
    # Split the path into directory / basename / extension parts
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)

    input_eaf = Eaf(input_elan_file)

    # A sibling WAV file with the same basename must sit next to the .eaf
    wav_path = os.path.join(input_directory, file_name + ".wav")
    if not os.path.isfile(wav_path):
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")
    print("WAV file found for " + file_name, file=sys.stderr)

    # Pull the annotation tuples and the tier parameters (e.g. speaker id)
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier_name))
    parameters = input_eaf.get_parameters_for_tier(tier_name)
    speaker_id = parameters.get("PARTICIPANT", "")
    has_speaker = "PARTICIPANT" in parameters

    annotations_data = []
    for entry in annotations:
        # entry is (start_ms, end_ms, label, ...) — index rather than unpack
        record = {
            "audio_file_name": f"{file_name}.wav",
            "transcript": entry[2],
            "start_ms": entry[0],
            "stop_ms": entry[1]
        }
        if has_speaker:
            record["speaker_id"] = speaker_id
        annotations_data.append(record)

    return annotations_data
예제 #2
0
def read_eaf(ie):
    """
    Read annotations from the target tier of an ELAN .eaf file.

    Requires a .wav file with the same basename next to the .eaf file.
    NOTE(review): reads the module-level global `tier` to select the tier —
    confirm it is defined before this is called.

    :param ie: path to the input .eaf file
    :return: list of annotation dicts with keys audioFileName/transcript/
        startMs/stopMs (plus speakerId when the tier declares a PARTICIPANT)
    """
    # Get paths to files
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)

    input_eaf = Eaf(ie)

    # I want the media in the same folder as the eaf. error if not found
    # We could also parse the linked media.. let try this later
    # files = input_eaf.get_linked_files()

    # look for wav file matching the eaf file
    if os.path.isfile(os.path.join(inDir, basename + ".wav")):
        print("WAV file found for " + basename, file=sys.stderr)
    else:
        raise ValueError('Eeeek! WAV file not found for ' + basename +
                         '. Please put it next to the eaf file in ' + inDir)

    # Get annotations and params (thigs like speaker id) on the target tier
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    # BUG FIX: annotations_data was never initialised, so the append below
    # raised NameError; it is now created locally and returned to the caller.
    annotations_data = []
    for ann in annotations:
        start = ann[0]
        end = ann[1]
        annotation = ann[2]

        # print('processing annotation: ' + annotation, start, end)
        obj = {
            'audioFileName': basename + ".wav",
            'transcript': annotation,
            'startMs': start,
            'stopMs': end
        }
        if 'PARTICIPANT' in params:
            obj["speakerId"] = speaker_id
        annotations_data.append(obj)

    return annotations_data
예제 #3
0
def read_eaf(ie, tier, silence_tier, silence_marker, json_data, output_text_dir, output_audio_dir):
    """
    Slice an ELAN file's target tier into per-annotation audio and text files.

    Annotations whose label equals silence_marker, or which are covered by an
    annotation on the silence reference tier, are skipped. For every kept
    annotation an entry is appended to json_data and matching .wav / .txt
    files are written to the output directories.
    """
    input_eaf = Eaf(ie)

    # Bail out if the requested tier is absent; a missing silence tier is
    # only reported, not fatal.
    available_tiers = input_eaf.get_tier_names()
    if tier not in available_tiers:
        print('missing tier: ' + tier, file=sys.stderr)
        return False
    if silence_tier not in available_tiers:
        print('missing silence tier: ' + silence_tier, file=sys.stderr)

    # Locate the companion audio file next to the .eaf
    source_dir, source_name = os.path.split(ie)
    stem, _ext = os.path.splitext(source_name)
    input_audio = AudioSegment.from_wav(os.path.join(source_dir, stem + ".wav"))

    # A reference tier whose parent is the target tier can mark silences
    use_silence_ref = False
    if silence_tier in available_tiers:
        if input_eaf.get_parameters_for_tier(silence_tier).get("PARENT_REF") == tier:
            use_silence_ref = True

    # Annotation tuples plus tier parameters (e.g. speaker id)
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    clip_index = 0
    for ann in annotations:
        start = ann[0]
        end = ann[1]
        label = ann[2]
        # Clip-relative times: each output file starts at zero
        rel_start = 0
        rel_end = end - start

        # Skip explicit silence labels on the main tier, and anything the
        # silence ref tier covers at this start time (value doesn't matter)
        skip = label == silence_marker
        if use_silence_ref and len(input_eaf.get_ref_annotation_at_time(silence_tier, start)):
            skip = True

        if skip:
            print("skipping" + str(clip_index))
            continue

        print("processing" + str(clip_index))
        # Build the output audio/text filename from the eaf basename
        fname = stem + "_" + str(clip_index)
        entry = {
            'audioFileName': os.path.join(".", fname + ".wav"),
            'transcript': label,
            'startMs': rel_start,
            'stopMs': rel_end
        }
        if 'PARTICIPANT' in params:
            entry["speakerId"] = speaker_id
        json_data.append(entry)
        split_audio_by_start_end(input_audio, start, end, fname, ".wav", output_audio_dir)
        write_text(label, fname, ".txt", output_text_dir)
        # Index only advances for clips that were actually written
        clip_index += 1
예제 #4
0
def import_eaf_file(eaf_paths: List[str], context: Dict[str, str],
                    reset_annotations: Callable, add_annotation: Callable,
                    tmp_dir):
    """
    Import handler for processing .eaf files.

    :param eaf_paths: List of string paths to Elan files.
    :param context: The settings that will be used to process data from the Elan files.
    :param reset_annotations: Callback to wipe all annotations that have been previously read.
        Settings such as the tier type/name/order determine which annotations are read
        into the dataset _annotation_store. When settings change, reset_annotations
        resets the dataset _annotation_store to {}, ready for annotations derived from
        the new settings. Without this, changing settings would append annotations
        derived from the new settings to the previous ones.
    :param add_annotation: Callback to append an annotation from the selected tier.
    :param tmp_dir: Unused within this function; purpose unclear — TODO confirm.
    """

    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Special cases / translation tags arrive as newline-separated strings
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    reset_annotations()

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # Annotations collected for the target tier of this file
        annotations: List[Tuple[str, str, str]] = []

        # Try using tier_order. Watch out for mixed type: empty str if not
        # selected, int if selected.
        # NOTE(review): indexes tier_names[tier_order] directly (0-based);
        # a sibling handler treats tier_order as 1-based — confirm intent.
        if isinstance(tier_order, int):
            try:
                tier_name = tier_names[tier_order]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        # Parameters (e.g. speaker id) only matter when there is data to tag
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation,
                "start_ms": start,
                "stop_ms": end,
                "speaker_id": speaker_id
            }

            # Normalise punctuation/special tokens before storing
            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
예제 #5
0
def import_eaf_file(eaf_paths, context, add_annotation, tmp_dir):
    """
    Import handler for processing .eaf files.

    Reads a particular tier in each eaf file (ELAN Annotation Format) and
    passes each cleaned transcription to add_annotation in the format:
                    {'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}
    (speaker_id is currently not emitted — see the TODO below.)

    :param eaf_paths: List of string paths to Elan files.
    :param context: Settings used to select the tier and clean transcriptions
        (tier order/name/type, punctuation, special cases, translation tags).
    :param add_annotation: Callback to append a cleaned annotation for a file.
    :param tmp_dir: Unused within this function; purpose unclear — TODO confirm.
    """
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Convert dirty words and tokens from str to set, split by '\n'
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # TODO: Check if this is necessary? It is possible to process transcription and audio file separately.
        # # Look for wav file matching the eaf file in same directory
        # if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        #     print("WAV file found for " + file_name, file=sys.stderr)
        # else:
        #     raise ValueError(f"WAV file not found for {full_file_name}. "
        #                     f"Please put it next to the eaf file in {input_directory}.")

        # Annotations collected for the target tier of this file
        annotations: List[Tuple[str, str, str]] = []

        # Determine tier_name
        # First try using tier order to get tier name
        if tier_order:
            # Watch out for files that may not have this many tiers
            # tier_order is 1-index but List indexing is 0-index
            try:
                tier_name = tier_names[tier_order - 1]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        # Parameters (e.g. speaker id) only matter when there is data to tag
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation,
                "start_ms": start,
                "stop_ms": end
            }
            # TODO: re-enable later
            # if "PARTICIPANT" in parameters:
            #     obj["speaker_id"] = speaker_id

            # Normalise punctuation/special tokens before storing
            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
예제 #6
0
def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation Format).
    Transcriptions are read from an elan file tier.
    Tiers are nodes from the tree structure in the .eaf file.
    The tier to read from is determined by tier order (eg top tier would be order 1),
    tier type (eg default-lt) or tier name (eg Phrase).
    If tier type is used, the first tier matching this type is used.
    Elan can have multiple tiers of same type, future work would support reading data
    from multiple tiers of the selected type.

    It stores the transcriptions in the following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}

    :param input_elan_file: name of input elan file
    :param tier_order: index of the elan tier to process (1-indexed; 0 disables)
    :param tier_name: name of the elan tier to process
    :param tier_type: type of the elan tier to process
    :param corpus_tiers_file: currently unused; reserved for saving tier info
        for corpus analysis (see the commented-out save_tier_info call)
    :return: a list of dictionaries, where each dictionary is an annotation
    """

    print(
        f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}"
    )

    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)

    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")

    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())

    # Keep this data handy for future corpus analysis
    # save_tier_info(input_eaf=input_eaf,
    #               tier_types=tier_types,
    #               file_name=file_name,
    #               corpus_tiers_file=corpus_tiers_file)

    # Get annotations and parameters (things like speaker id) on the target tier
    annotations: List[Tuple[str, str, str]] = []
    annotations_data: List[dict] = []

    # First try using tier order to get tier name
    if tier_order:
        # Watch out for files that may not have this many tiers
        # tier_order is 1-index but List indexing is 0-index
        try:
            tier_name = tier_names[tier_order - 1]
            print(
                f"using tier order {tier_order} to get tier name {tier_name}")
        except IndexError:
            print("couldn't find a tier")
    else:
        # else use tier type to get a tier name
        if tier_type in tier_types:
            print(f"found tier type {tier_type}")
            tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            print("tier type not found in this file")

    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)

    # Parameters (e.g. speaker id) only matter when there is data to tag
    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        parameters: Dict[str,
                         str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")

    for annotation in annotations:
        # Start/end are times in milliseconds (previously mis-annotated as str)
        start = annotation[0]
        end = annotation[1]
        annotation_text: str = annotation[2]
        print(f"annotation {annotation} {start} {end}")
        obj = {
            "audio_file_name": f"{file_name}.wav",
            "transcript": annotation_text,
            "start_ms": start,
            "stop_ms": end
        }
        if "PARTICIPANT" in parameters:
            obj["speaker_id"] = speaker_id
        annotations_data.append(obj)

    return annotations_data
예제 #7
0
def read_eaf(ie):
    """
    Slice audio and text out of an ELAN file, driven by module-level settings.

    NOTE(review): relies on module globals (verbose, slice_tier, tier_order,
    silence_tier, silence_marker, text_tier, tier, name_with_annotation,
    prefix) and helpers (slugify, split_audio_by_start_end, write_text,
    write_json) — confirm they are defined before calling.

    :param ie: path to the input .eaf file (a .wav with the same basename
        must sit next to it)
    :return: False when a required tier is missing, otherwise None after
        writing audio/text clips and the JSON manifest
    """
    if verbose:
        print("input file is", ie)

    input_eaf = Eaf(ie)

    # Check if the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)

    # Are we working by slice_tier name or order?
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:

        # Sanity check that the slice_tier num is not greater than the num of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers",
                  file=sys.stderr)
            return False
        if verbose:
            print("using slice_tier by number:",
                  tier_names[tier_order - 1],
                  file=sys.stderr)

    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False

    if silence_tier not in tier_names:
        if verbose:
            print('silence tier not found: ' + silence_tier, file=sys.stderr)

    # get the input audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)

    # we can write out mp3 or whatever, still require wav input
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)

    # We can pass in an arg for a ref tier that has silence labels
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier')
        return False

    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))

    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    annotations_data = []
    i = 0
    for ann in annotations:
        skip = False
        ref_annotation = []
        start = ann[0]
        end = ann[1]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]

        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True

        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier:
            ref_annotation = input_eaf.get_ref_annotation_at_time(
                silence_tier, start)
            # BUG FIX: was `len(ref_annotation) is True`, which is always
            # False (an int is never the object True), so ref-tier silencing
            # never triggered. Any non-empty result now marks a skip.
            if ref_annotation:
                skip = True

        if skip is True:
            print('skipping annotation: ' + annotation, start, end)
        else:
            print('processing annotation: ' + annotation, start, end)
            # build the output audio/text filename
            fname = basename + "_" + str(i)
            if name_with_annotation:
                fname = slugify(annotation)

            if prefix != '':
                fname = prefix + '_' + fname
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            annotations_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname)
            write_text(annotation, fname)
            i += 1
    # output the json data for the next step in kaldi pipeline
    write_json(annotations_data)

    if verbose:
        print(annotations_data)