import os
import sys

from pydub import AudioSegment
from pympi.Elan import Eaf
from slugify import slugify  # python-slugify; used by the second variant below


def read_eaf(ie, tier, silence_tier, silence_marker, json_data, output_text_dir, output_audio_dir):
    input_eaf = Eaf(ie)

    # Check that the tiers we have been given exist
    tier_names = input_eaf.get_tier_names()
    if tier not in tier_names:
        print('missing tier: ' + tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        print('missing silence tier: ' + silence_tier, file=sys.stderr)

    # Get the input audio file that sits alongside the .eaf
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)

    # We can pass in an arg for a ref tier that has silence labels
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    i = 0
    for ann in annotations:
        skip = False
        start = ann[0]
        end = ann[1]
        # Output new values, not the original clip start/end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]

        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True

        # Check for the existence of an annotation in the ref tier to silence;
        # the annotation value doesn't matter
        if check_silence_ref_tier and len(input_eaf.get_ref_annotation_at_time(silence_tier, start)):
            skip = True

        if skip:
            # print('skipping annotation: ' + annotation, start, end)
            print("skipping " + str(i))
        else:
            print("processing " + str(i))
            # print('processing annotation: ' + annotation, start, end)

            # Build the output audio/text filename
            fname = basename + "_" + str(i)
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            json_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname, ".wav", output_audio_dir)
            write_text(annotation, fname, ".txt", output_text_dir)
        i += 1
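
# Both variants call helper functions that are defined elsewhere in the
# pipeline. Below is a minimal sketch of the two used above, assuming pydub
# millisecond slicing and plain UTF-8 text output. The signatures follow the
# call sites in the first variant; the second variant calls them with fewer
# arguments, presumably closing over the output directories. The bodies are
# illustrative, not the pipeline's actual implementation.
def split_audio_by_start_end(input_audio, start, end, fname, ext, output_audio_dir):
    # EAF annotation times are in milliseconds, and pydub AudioSegments
    # slice by milliseconds, so the clip can be cut directly.
    clip = input_audio[start:end]
    clip.export(os.path.join(output_audio_dir, fname + ext), format=ext.lstrip("."))


def write_text(annotation, fname, ext, output_text_dir):
    # One transcript file per clip, named to match the audio clip
    with open(os.path.join(output_text_dir, fname + ext), "w", encoding="utf-8") as f:
        f.write(annotation + "\n")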

# Extended variant of read_eaf: selects the slicing tier by name or by 1-based
# order, can name output clips from the annotation text, and writes the JSON
# manifest itself. Reads its options (verbose, slice_tier, tier_order,
# text_tier, silence_tier, silence_marker, name_with_annotation, prefix)
# from module level.
def read_eaf(ie):
    global slice_tier  # reassigned when the tier is selected by order

    if verbose:
        print("input file is", ie)
    input_eaf = Eaf(ie)

    # Check that the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)

    # Are we working by slice_tier name or by order?
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:
        # Sanity check that the tier number is not greater than the number of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers", file=sys.stderr)
            return False
        # Resolve the 1-based tier order to a tier name
        slice_tier = tier_names[tier_order - 1]
        if verbose:
            print("using slice_tier by number:", slice_tier, file=sys.stderr)

    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        if verbose:
            print('silence tier not found: ' + silence_tier, file=sys.stderr)

    # Get the input audio file that sits alongside the .eaf.
    # We can write out mp3 or whatever, but we still require wav input.
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)

    # We can pass in an arg for a ref tier that has silence labels;
    # it must be a child (PARENT_REF) of the text tier
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == text_tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier ' + text_tier, file=sys.stderr)
        return False
    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))
    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    annotations_data = []
    i = 0
    for ann in annotations:
        skip = False
        ref_annotation = []
        start = ann[0]
        end = ann[1]
        # Output new values, not the original clip start/end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]

        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True

        # Check for the existence of an annotation in the ref tier to silence;
        # any annotation at this time means skip, its value doesn't matter
        if check_silence_ref_tier:
            ref_annotation = input_eaf.get_ref_annotation_at_time(silence_tier, start)
            if len(ref_annotation) > 0:
                skip = True

        if skip:
            print('skipping annotation: ' + annotation, start, end)
        else:
            print('processing annotation: ' + annotation, start, end)

            # Build the output audio/text filename
            fname = basename + "_" + str(i)
            if name_with_annotation:
                fname = slugify(annotation)
            if prefix != '':
                fname = prefix + '_' + fname
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            annotations_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname)
            write_text(annotation, fname)
        i += 1

    # Output the json data for the next step in the kaldi pipeline
    write_json(annotations_data)
    if verbose:
        print(annotations_data)
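
# The second variant reads its options from module-level globals and ends by
# calling write_json, which is not shown above. A sketch under assumed names:
# the option values and the output_json path below are illustrative only (in
# the real script they would likely come from argparse), and the tier names
# and input path in the usage example are hypothetical.
import json

verbose = True
slice_tier = "default"            # "default" means: select the tier by order instead
tier_order = 1                    # 1-based tier index
text_tier = "Phrase"              # hypothetical tier name
silence_tier = "Silence"          # hypothetical tier name
silence_marker = "*SIL"           # hypothetical silence label
name_with_annotation = False
prefix = ''
output_json = "annotations.json"  # assumed output path


def write_json(annotations_data):
    # Persist the manifest of clips for the next step in the kaldi pipeline
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(annotations_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    read_eaf("recordings/session1.eaf")  # hypothetical input file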