def make_elan(source_parent_dir, target_parent_dir):
    """
    Make ELAN files based on the filenames of WAV files.

    Written for the TIDIGITS corpus, so some things are specific to the name
    formats of that corpus: the file basename (minus its final character) is
    read character by character, with "o" expanded to "oh", "z" expanded to
    "zero" and digits spelled out as words.

    The directory layout found below ``source_parent_dir`` is mirrored below
    ``target_parent_dir``. Every ``<name>.wav`` produces a ``<name>.eaf``
    holding a single annotation on tier ``tx`` that spans the whole recording.

    :param source_parent_dir: Directory that is walked recursively for WAV files
    :param target_parent_dir: Directory under which the EAF files are written
    """
    for dirname, dirnames, filenames in os.walk(source_parent_dir):
        # Print path to all subdirectories first.
        for subdirname in dirnames:
            print(os.path.join(dirname, subdirname))

        # Position of this directory relative to the source root. Working from
        # the relative path (rather than unpacking the full path into exactly
        # three parts) keeps this working for multi-component source roots and
        # for WAV files that are not exactly two levels deep.
        relative_dir = os.path.relpath(dirname, source_parent_dir)
        if relative_dir == os.curdir:
            relative_parts = []
        else:
            relative_parts = relative_dir.split(os.path.sep)

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension: a substring test would also pick up
            # names such as "notes.wav.txt".
            if ext != '.wav':
                continue

            print(source_parent_dir, *relative_parts, filename)

            target_path = os.path.join(target_parent_dir, *relative_parts)
            if not os.path.exists(target_path):
                print(target_path)
                os.makedirs(target_path)

            # Audio file duration (ms) - use this as end timeslot
            duration = int(librosa.get_duration(
                filename=os.path.join(dirname, filename)) * 1000)

            # Make file annotation from filename (minus the suffix character),
            # one space-separated token per character
            annotation = " ".join(basename[:-1])

            # These are specific to the TIDIGITS naming convention
            annotation = annotation.replace("o", "oh")
            annotation = annotation.replace("z", "zero")
            text = re.sub(r"(\d+)",
                          lambda x: num2words.num2words(int(x.group(0))),
                          annotation)
            print(filename, duration, annotation, text)

            # Make elan
            output_eaf = Eaf()
            output_eaf.add_tier('tx')
            output_eaf.insert_annotation('tx', 0, duration, text)
            output_eaf.add_linked_file(
                os.path.join(target_path, f'{basename}.wav'))
            output_eaf.to_file(os.path.join(target_path, f'{basename}.eaf'))
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file

    ``input_dir`` is walked recursively. Each ``<name>.wav`` produces
    ``<output_dir>/<name>.eaf`` with one annotation on the ``default`` tier
    running from 0 to the audio duration in milliseconds.

    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param output_dir: Directory name to save EAF files into (created if missing)
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Writing the EAF (and copying the WAV) needs the folder to exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each file
    for dirpath, _, filenames in os.walk(input_dir):
        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension: a substring test would also pick up
            # names such as "take1.wav.txt".
            if ext != '.wav':
                continue
            print(basename)

            # os.walk descends into subfolders, so build paths from the folder
            # the file was actually found in rather than from input_dir.
            wav_path = os.path.join(dirpath, filename)

            # Get audio file duration - use this as the EAF annotation's end timeslot
            duration = int(librosa.get_duration(filename=wav_path) * 1000)

            # Get annotation from the text file matching on file basename.
            # NOTE(review): assumes get_annotation looks for the text file in
            # the directory it is given - confirm for files in subfolders.
            annotation = get_annotation(dirpath, basename)

            # Add any annotation cleaning here
            # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)
            print(duration, annotation)

            # Make EAF file. No add_tier call here: the annotation goes onto
            # the tier named 'default'.
            # NOTE(review): presumably Eaf() creates that tier itself - confirm.
            output_eaf = Eaf()
            output_eaf.insert_annotation('default', 0, duration, annotation)
            output_eaf.add_linked_file(
                os.path.join(output_dir, f'{basename}.wav'))
            output_eaf.to_file(os.path.join(output_dir, f'{basename}.eaf'))

            # Copy WAV?
            if copy_wavs:
                shutil.copyfile(wav_path, os.path.join(output_dir, filename))
    print('>>> Done')
def make_elans(spreadsheet: str, source: str, target: str):
    """
    Make ELAN files based on filenames of WAV files

    ``source`` is walked recursively. Each ``<name>.wav`` produces
    ``<target>/<name>.eaf`` with one annotation on tier ``tx`` running from 0
    to the audio duration in milliseconds. The annotation value is looked up
    by WAV filename in the data loaded from the spreadsheet.

    :param spreadsheet: Path and file name of the spreadsheet containing WAV filenames and matching annotations
    :param source: Directory name of folder containing WAV audio files
    :param target: Directory name to save EAF files into (created if missing)
    """
    # Read spreadsheet data
    print('Loading data from spreadsheet')
    annotations = get_annotations(spreadsheet)

    # Writing the EAF needs the folder to exist
    os.makedirs(target, exist_ok=True)

    # Process each file
    print('Processing WAVs')
    for dirpath, _, filenames in os.walk(source):
        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension: a substring test would also pick up
            # names such as "take1.wav.txt".
            if ext != '.wav':
                continue

            # Get audio file duration - use this as the EAF annotation's end
            # timeslot. os.walk descends into subfolders, so the path is built
            # from the folder the file was actually found in.
            duration = int(
                librosa.get_duration(
                    filename=os.path.join(dirpath, filename)) * 1000)

            # Get annotation from the source data matching on filename
            annotation = get_annotation(annotations, filename)

            # Add any annotation cleaning here
            # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)
            print(filename, duration, annotation)

            # Make EAF file
            output_eaf = Eaf()
            output_eaf.add_tier('tx')
            output_eaf.insert_annotation('tx', 0, duration, annotation)
            output_eaf.add_linked_file(
                os.path.join(target, f'{basename}.wav'))
            output_eaf.to_file(os.path.join(target, f'{basename}.eaf'))
    print('>>> Done')
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool,
               sample_rate: int = 16000):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file

    Every ``*.txt`` file found recursively under ``input_dir`` is read as
    ``<start> <end> <text...>``, where the two numbers are offsets in audio
    samples. Each one produces ``<output_dir>/<subdir>-<name>.eaf`` with a
    single annotation on the ``default`` tier, and the WAV next to the text
    file is copied to ``<output_dir>/<subdir>-<name>.wav``.

    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param output_dir: Directory name to save EAF files into (created if missing)
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir.
        NOTE(review): currently not consulted - the WAV is always copied
        because the guard is disabled below. Confirm whether it should be restored.
    :param sample_rate: Samples per second used to convert the sample offsets
        in the text file to milliseconds
    """
    # Writing the EAF and copying the WAV needs the folder to exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each file
    files = glob.glob(f'{input_dir}/**/*.txt', recursive=True)
    print(files)
    for filename in files:
        # e.g. filename   input/dr1/fmem0/sa2.txt
        #      filepath   input/dr1/fmem0/sa2
        #      subdirname fmem0
        #      basename   sa2
        filepath = os.path.splitext(filename)[0]
        basename = os.path.splitext(os.path.basename(filepath))[0]
        subdirname = os.path.basename(os.path.dirname(filepath))

        # Folder names are <SEX><SPEAKER_ID>
        # SEX        :== m | f
        # SPEAKER_ID :== <INITIALS><DIGIT>
        # INITIALS   :== speaker initials, 3 letters
        # DIGIT      :== number 0-9 to differentiate speakers with identical initials
        participant = subdirname[1:]

        # Get annotation from the text file matching on file basename
        with open(filename, 'r', encoding='utf-8') as text_file:
            fields = text_file.read().split()

        # An empty or truncated transcript has no usable offsets; report it
        # and carry on with the remaining files instead of raising IndexError.
        if len(fields) < 2:
            print(f'Skipping {filename}: expected "<start> <end> <text>"')
            continue

        # Convert audio samples to seconds to ms. Both offsets are converted
        # so that start and end reach the EAF in the same unit.
        start = int(int(fields[0]) / sample_rate * 1000)
        duration = int(int(fields[1]) / sample_rate * 1000)
        annotation_text = " ".join(fields[2:])

        # Add any annotation cleaning here
        # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)
        print(start, duration, annotation_text)

        # Make EAF file
        output_eaf = Eaf()
        output_eaf.add_tier('default', part=participant)
        output_eaf.add_annotation('default', start, duration, annotation_text)
        output_eaf.add_linked_file(
            os.path.join(output_dir, f'{subdirname}-{basename}.wav'))
        output_eaf.to_file(
            os.path.join(output_dir, f'{subdirname}-{basename}.eaf'))

        # Copy WAV?
        # if copy_wavs:
        shutil.copyfile(
            f'{filepath}.wav',
            os.path.join(output_dir, f'{subdirname}-{basename}.wav'))
    print('>>> Done')