def main():
    """Prompt for a folder of ELAN files, collect every tier name found
    across them, and hand the set to select_tier for the user to pick from."""
    # Ask where the Elan files live.
    # Default is a folder named "input" in the same directory as this script.
    questions = [{
        'type': 'input',
        'name': 'input_dir',
        'message': 'Name of folder with Elan files?',
        'default': 'input'
    }]
    answers = prompt(questions)
    input_dir = answers["input_dir"]

    # Recursively gather every path under the input dir, then keep .eaf files.
    extensions = {"*.eaf"}
    candidates = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))
    input_files = find_files_by_ext(candidates, extensions)

    # Union of tier names across all files — using pympi-ling's Eaf reader.
    tier_names = set()
    for eaf_path in input_files:
        tier_names.update(Eaf(eaf_path).get_tier_names())

    # NOTE(review): looks like a leftover debug print — consider removing.
    print(type(tier_names))
    select_tier(tier_names)
def get_elan_tier_attributes(input_eafs_files):
    """Collect tier metadata from a collection of ELAN files.

    Scans every file and gathers the unique linguistic tier types, the
    unique tier names (ids), and the largest number of tiers seen in any
    single file.

    :param input_eafs_files: iterable of paths to .eaf files
    :return: tuple of (tier_types, tier_names, tier_max_count)
    """
    # Sets give uniqueness for free; converted to lists on return.
    types_seen = set()
    names_seen = set()
    max_tiers = 0
    for path in input_eafs_files:
        eaf = Eaf(path)
        for linguistic_type in eaf.get_linguistic_type_names():
            types_seen.add(linguistic_type)
            names_seen.update(
                eaf.get_tier_ids_for_linguistic_type(linguistic_type))
        # Track the largest tier count found in any one file.
        max_tiers = max(max_tiers, len(list(eaf.get_tier_names())))
    return (list(types_seen), list(names_seen), max_tiers)
def update_ui(file_paths: List[Path], ui): """ Iterate a dir of elan files and compiles info about all the files' tiers: unique tier types, unique tier names, and the num of tiers """ # Use sets internally for easy uniqueness, conver to lists when done _tier_types: Set[str] = set(ui['data']['tier_type']['options']) _tier_names: Set[str] = set(ui['data']['tier_name']['options']) tier_max_count = 0 print('**** ui data') print(ui['data']) print('**** _tier_types') print(_tier_types) eaf_paths = [p for p in file_paths if f'{p}'.endswith('.eaf')] for eaf_path in eaf_paths: input_eaf = Eaf(eaf_path) for tier_type in list(input_eaf.get_linguistic_type_names()): _tier_types.add(tier_type) tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type( tier_type) for tier_id in tier_ids: _tier_names.add(tier_id) # count the number of tiers, use the max from all files tier_count = len(list(input_eaf.get_tier_names())) if tier_count > tier_max_count: tier_max_count = tier_count ui['data']['tier_type']['options'] = list(_tier_types) ui['data']['tier_name']['options'] = list(_tier_names) ui['data']['tier_order']['options'] = [i for i in range(tier_max_count)] return ui
def main() -> None:
    """
    A command line utility to silence the audio files in a given directory

    Usage: python3 silence_audio.py [-h] -c CORPUS [-s SILENCE_TIER] [-o]
    """
    global silence_mono
    global silence_stereo
    global do_not_publish
    parser = argparse.ArgumentParser(
        description=
        "This script will silence a wave file based on annotations in an Elan tier"
    )
    parser.add_argument('-c',
                        '--corpus',
                        help='Directory of audio and eaf files',
                        type=str,
                        required=True)
    parser.add_argument(
        '-s',
        '--silence_tier',
        help='Silence audio when annotations are found on this tier',
        type=str,
        default='Silence')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Write over existing files',
                        action="store_true")
    arguments = parser.parse_args()

    # Reset the module-level accumulators before processing.
    silence_mono = 0
    silence_stereo = [0, 0]
    # Appended to the basename when not overwriting the source wav.
    suffix = 'S'

    # Look for .eaf files in the corpus dir (non-recursive). To recurse:
    # glob.iglob(arguments.corpus + '/**/*.eaf', recursive=True)
    for file_path in glob.iglob(arguments.corpus + '/*.eaf'):
        eaf_file = Eaf(file_path)
        names = eaf_file.get_tier_names()
        # Check for existence of silence tier
        if arguments.silence_tier in names:
            print("Have tier %s in %s" % (arguments.silence_tier, file_path))
            basename, extension = os.path.splitext(file_path)
            # Source wav sits next to the eaf. Renamed from `input`, which
            # shadowed the builtin; it was also never used.
            # NOTE(review): silence_audio presumably derives the wav path
            # itself — confirm it doesn't need input_wav passed in.
            input_wav = basename + ".wav"
            if arguments.overwrite:
                output = basename + ".wav"
            else:
                output = basename + suffix + ".wav"
            silence_audio(eaf_file, output)
def explore_elan_files(elan_paths):
    """Interactively dump the tier structure of ELAN files.

    For each file, prints its path, every tier name, and every annotation
    on each readable tier, then waits for Enter before the next file.
    """
    for path in elan_paths:
        print(path)
        eaf = Eaf(path)
        for tier_name in eaf.get_tier_names():
            print("\t", tier_name)
            # Some tiers have no readable annotation data; skip those.
            try:
                annotations = eaf.get_annotation_data_for_tier(tier_name)
            except KeyError:
                continue
            for annotation in annotations:
                print("\t\t", annotation)
        # Pause so the user can inspect this file's output.
        input()
def read_eaf(ie, tier, silence_tier, silence_marker, json_data,
             output_text_dir, output_audio_dir):
    """
    Slice an Elan file's annotations into audio clips and transcript files.

    For each annotation on `tier`, writes a wav clip and a txt transcript,
    and appends a metadata dict to `json_data` (mutated in place). An
    annotation is skipped if its value equals `silence_marker`, or if a
    `silence_tier` ref annotation exists at its start time.

    :param ie: path to the input .eaf file; a matching .wav is assumed to
        sit alongside it
    :param tier: name of the tier to read annotations from
    :param silence_tier: name of a reference tier marking spans to skip
    :param silence_marker: annotation value that marks a span to skip
    :param json_data: list collecting one metadata dict per kept clip
    :param output_text_dir: directory for the .txt transcript files
    :param output_audio_dir: directory for the .wav clip files
    :return: False if `tier` is missing from the file, otherwise None
    """
    input_eaf = Eaf(ie)
    # Check if the tiers we have been given exist
    tier_names = input_eaf.get_tier_names()
    if tier not in tier_names:
        print('missing tier: ' + tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        # Not fatal: processing continues without ref-tier silencing
        print('missing silence tier: ' + silence_tier, file=sys.stderr)
    # get the input_scripts audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)
    # We can pass in an arg for a ref tier that has silence labels;
    # only honoured when it is actually a child (PARENT_REF) of `tier`
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True
    # Get annotation values, start and end times, and speaker id
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']
    i = 0
    for ann in annotations:
        skip = False
        start = ann[0]
        end = ann[1]
        # output_scripts new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]
        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True
        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier and len(
                input_eaf.get_ref_annotation_at_time(silence_tier, start)):
            skip = True
        if skip is True:
            # print('skipping annotation: ' + annotation, start, end)
            print("skipping" + str(i))
        else:
            print("processing" + str(i))
            # print('processing annotation: ' + annotation, start, end)
            # build the output_scripts audio/text filename
            fname = basename + "_" + str(i)
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            json_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname, ".wav",
                                     output_audio_dir)
            write_text(annotation, fname, ".txt", output_text_dir)
            # NOTE(review): i only advances for kept clips, so skipped
            # annotations leave no gaps in the output numbering
            i += 1
def import_eaf_file(eaf_paths: List[str], context: Dict[str, str],
                    reset_annotations: Callable, add_annotation: Callable,
                    tmp_dir):
    """
    Import handler for processing .eaf files.

    :param eaf_paths: List of string paths to Elan files.
    :param context: The settings that will be used to process data from the
        Elan files.
    :param reset_annotations: Callback to wipe all annotations that have
        been previously read. Settings such as the tier type/name/order
        determine which annotations are read into the dataset
        _annotation_store. When the user changes settings,
        reset_annotations resets _annotation_store to {}, ready for
        annotations derived from the new settings. Without it, changing
        settings would append new-setting annotations onto old ones.
    :param add_annotation: Callback to append an annotation from the
        selected tier.
    :param tmp_dir: unused in this function — presumably supplied by the
        import framework; confirm before removing.
    """
    # Unpack the user-selected settings
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Special cases / translation tags arrive as newline-separated strings
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())
    # Start from a clean annotation store (see docstring)
    reset_annotations()
    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)
        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())
        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []
        # Try using tier_order. Watch out for mixed type, empty str if not
        # selected, int if selected
        if isinstance(tier_order, int):
            try:
                # NOTE(review): indexes tier_names directly (0-based); the
                # sibling import handler uses tier_order - 1 (1-based) —
                # confirm which convention the caller sends.
                tier_name = tier_names[tier_order]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
                pass
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                # tier_names is re-bound to the ids of the matching type;
                # the first id becomes the target tier
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")
        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")
            for annotation in annotations:
                start = annotation[0]
                end = annotation[1]
                # Re-bind to the annotation text (third tuple element)
                annotation = annotation[2]
                utterance = {
                    "audio_file_name": f"{file_name}.wav",
                    "transcript": annotation,
                    "start_ms": start,
                    "stop_ms": end,
                    "speaker_id": speaker_id
                }
                # Normalise punctuation/special cases before storing
                utterance_cleaned = clean_json_utterance(
                    utterance=utterance,
                    punctuation_to_collapse_by=punctuation_to_collapse_by,
                    punctuation_to_explode_by=punctuation_to_explode_by,
                    special_cases=special_cases,
                    translation_tags=translation_tags,
                    remove_english=False,
                    use_langid=False)
                add_annotation(file_name, utterance_cleaned)
# NOTE(review): this fragment appears to be the tail of a silencing routine
# followed by script-level driver code; `output`, `samples`, `params`,
# `num_channels`, `annotations` and `num_samples` are bound earlier —
# confirm against the enclosing context.
with wave.open(output, 'wb') as audio:
    # Mono data holds nframes samples; stereo is interleaved, so twice as many
    samples.shape = params.nframes if num_channels == 1 else (
        params.nframes * 2)
    audio.setparams(params)
    audio.writeframesraw(samples)
print("Silenced {} intervals ({:.1f}s)".format(
    len(annotations), num_samples / params.framerate))

# look for .eaf files, recursively from the passed corpus dir
# for fpath in glob.iglob(corpus + '/**/*.eaf', recursive=True):
for fpath in glob.iglob(corpus + '/*.eaf'):
    print(fpath)
    eaffile = Eaf(fpath)
    names = eaffile.get_tier_names()
    # print(names)
    # check for existence of silence tier
    #
    if DO_NOT_PUBLISH in names:
        print("have tier %s in %s" % (DO_NOT_PUBLISH, fpath))
        basename, extn = os.path.splitext(fpath)
        # NOTE(review): `input` shadows the builtin and is never used below
        input = basename + ".wav"
        if overwrite == 'yes':
            output = basename + ".wav"
        else:
            output = basename + SUFFIX + ".wav"
def import_eaf_file(eaf_paths, context, add_annotation, tmp_dir):
    """
    Import handler for processing .eaf (ELAN Annotation Format) files.

    Reads a chosen tier from each Elan file and stores each transcription
    in the following format:
    {'speaker_id': <speaker_id>, 'audio_file_name': <file_name>,
     'transcript': <transcription_label>,
     'start_ms': <start_time_in_milliseconds>,
     'stop_ms': <stop_time_in_milliseconds>}

    :param eaf_paths: List of string paths to Elan files.
    :param context: settings dict (tier order/name/type, punctuation,
        special cases, translation tags) used to process the files
    :param add_annotation: callback to append an annotation for a file
    :param tmp_dir: unused in this function — presumably supplied by the
        import framework; confirm before removing.
    """
    # Unpack the user-selected settings
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Convert dirty words and tokens from str to set, split by '\n'
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())
    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)
        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())
        # TODO: Check if this is necessary? It is possible to process transcription and audio file separately.
        # # Look for wav file matching the eaf file in same directory
        # if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        #     print("WAV file found for " + file_name, file=sys.stderr)
        # else:
        #     raise ValueError(f"WAV file not found for {full_file_name}. "
        #                      f"Please put it next to the eaf file in {input_directory}.")
        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []
        # Determine tier_name
        # First try using tier order to get tier name
        if tier_order:
            # Watch out for files that may not have this many tiers
            # tier_order is 1-index but List indexing is 0-index
            try:
                tier_name = tier_names[tier_order - 1]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
                pass
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                # tier_names is re-bound to the ids of the matching type;
                # the first id becomes the target tier
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")
        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file
        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")
            for annotation in annotations:
                start = annotation[0]
                end = annotation[1]
                # Re-bind to the annotation text (third tuple element)
                annotation = annotation[2]
                utterance = {
                    "audio_file_name": f"{file_name}.wav",
                    "transcript": annotation,
                    "start_ms": start,
                    "stop_ms": end
                }
                # TODO: re-enable later
                # if "PARTICIPANT" in parameters:
                #     obj["speaker_id"] = speaker_id
                utterance_cleaned = clean_json_utterance(
                    utterance=utterance,
                    punctuation_to_collapse_by=punctuation_to_collapse_by,
                    punctuation_to_explode_by=punctuation_to_explode_by,
                    special_cases=special_cases,
                    translation_tags=translation_tags,
                    remove_english=False,
                    use_langid=False)
                add_annotation(file_name, utterance_cleaned)
def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation
    Format). Transcriptions are read from an elan file tier. Tiers are
    nodes from the tree structure in the .eaf file. The tier to read from
    is determined by tier order (eg top tier would be order 1), tier type
    (eg default-lt) or tier name (eg Phrase). If tier type is used, the
    first tier matching this type is used. Elan can have multiple tiers of
    same type; future work would support reading data from multiple tiers
    of the selected type.

    It stores the transcriptions in the following format:
    {'speaker_id': <speaker_id>, 'audio_file_name': <file_name>,
     'transcript': <transcription_label>,
     'start_ms': <start_time_in_milliseconds>,
     'stop_ms': <stop_time_in_milliseconds>}

    :param input_elan_file: name of input elan file
    :param tier_order: index of the elan tier to process (1-based; 0 means
        "not selected", falling back to tier type/name)
    :param tier_name: name of the elan tier to process
    :param tier_type: type of the elan tier to process
    :param corpus_tiers_file: currently unused — reserved for the
        commented-out save_tier_info call below; confirm before removing
    :return: a list of dictionaries, where each dictionary is an annotation
    :raises ValueError: if no .wav file with a matching name sits next to
        the eaf file
    """
    print(
        f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}"
    )
    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)
    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")
    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())
    # Keep this data handy for future corpus analysis
    # save_tier_info(input_eaf=input_eaf,
    #                tier_types=tier_types,
    #                file_name=file_name,
    #                corpus_tiers_file=corpus_tiers_file)
    # Get annotations and parameters (things like speaker id) on the target tier
    annotations: List[Tuple[str, str, str]] = []
    annotations_data: List[dict] = []
    # First try using tier order to get tier name
    if tier_order:
        # Watch out for files that may not have this many tiers
        # tier_order is 1-index but List indexing is 0-index
        try:
            tier_name = tier_names[tier_order - 1]
            print(
                f"using tier order {tier_order} to get tier name {tier_name}")
        except IndexError:
            print("couldn't find a tier")
            pass
    else:
        # else use tier type to get a tier name
        if tier_type in tier_types:
            print(f"found tier type {tier_type}")
            # tier_names is re-bound to the ids of the matching type;
            # the first id becomes the target tier
            tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            print("tier type not found in this file")
    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)
    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        parameters: Dict[str,
                         str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")
        for annotation in annotations:
            start: str = annotation[0]
            end: str = annotation[1]
            annotation_text: str = annotation[2]
            print(f"annotation {annotation} {start} {end}")
            obj = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation_text,
                "start_ms": start,
                "stop_ms": end
            }
            if "PARTICIPANT" in parameters:
                obj["speaker_id"] = speaker_id
            annotations_data.append(obj)
    return annotations_data
def read_eaf(ie):
    """
    Slice one Elan file's annotations into audio clips and text files.

    Reads annotations from the module-level `text_tier`, skips spans
    labelled with `silence_marker` or covered by a `silence_tier` ref
    annotation, writes a wav clip and txt transcript per kept annotation,
    and writes the collected metadata as json.

    Relies on module-level config: verbose, slice_tier, tier_order,
    silence_tier, tier, text_tier, silence_marker, name_with_annotation,
    prefix.

    :param ie: path to the input .eaf file; a matching .wav is assumed to
        sit alongside it
    :return: False on a missing/invalid tier, otherwise None
    """
    if verbose:
        print("input file is", ie)
    input_eaf = Eaf(ie)
    # Check if the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)
    # Are we working by slice_tier name or order?
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:
        # Sanity check that the slice_tier num is not greater than the num of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers",
                  file=sys.stderr)
            return False
        if verbose:
            print("using slice_tier by number:",
                  tier_names[tier_order - 1],
                  file=sys.stderr)
    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        # Not fatal: processing continues without ref-tier silencing
        if verbose:
            print('silence tier not found: ' + silence_tier, file=sys.stderr)
    # get the input audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    # we can write out mp3 or whatever, still require wav input
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)
    # We can pass in an arg for a ref tier that has silence labels;
    # only honoured when it is actually a child (PARENT_REF) of `tier`
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True
    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier')
        return False
    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))
    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']
    annotations_data = []
    i = 0
    for ann in annotations:
        skip = False
        ref_annotation = []
        start = ann[0]
        end = ann[1]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]
        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True
        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier:
            ref_annotation = input_eaf.get_ref_annotation_at_time(
                silence_tier, start)
            # BUG FIX: was `if len(ref_annotation) is True:` — an int is
            # never the object True, so ref-tier silencing never fired.
            if len(ref_annotation) > 0:
                skip = True
        if skip is True:
            print('skipping annotation: ' + annotation, start, end)
        else:
            print('processing annotation: ' + annotation, start, end)
            # build the output audio/text filename
            fname = basename + "_" + str(i)
            if name_with_annotation:
                fname = slugify(annotation)
            if prefix != '':
                fname = prefix + '_' + fname
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            annotations_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname)
            write_text(annotation, fname)
            i += 1
    # output the json data for the next step in kaldi pipeline
    write_json(annotations_data)
    if verbose:
        print(annotations_data)