def test_convert_xml_invalid(self):
    """test readalongs.text.convert_xml.convert_xml() with invalid input"""
    xml = etree.fromstring('<s><w ARPABET="V AA L IY D">valid</w></s>')
    c_xml, valid = convert_xml(xml)
    self.assertEqual(
        etree.tounicode(c_xml), '<s><w ARPABET="V AA L IY D">valid</w></s>'
    )
    self.assertTrue(valid, "convert_xml with valid pre-g2p'd text")

    xml = etree.fromstring('<s><w ARPABET="invalid">invalid</w></s>')
    c_xml, valid = convert_xml(xml)
    self.assertEqual(
        etree.tounicode(c_xml), '<s><w ARPABET="invalid">invalid</w></s>'
    )
    self.assertFalse(valid, "convert_xml with invalid pre-g2p'd text")
def end_to_end(xml, input_filename, unit, word_unit, out_orth):
    """Run the full text pipeline: tokenize, add IDs, g2p, then build the FSG
    and the pronouncing dictionary."""
    xml = tokenize_xml(xml)
    xml = add_ids(xml)
    converted_xml, valid = convert_xml(xml, word_unit, out_orth)
    # save_xml("test.xml", converted_xml)
    fsg = make_fsg(converted_xml, input_filename, unit)
    pronouncing_dictionary = make_dict(converted_xml, input_filename, unit)
    return xml, fsg, pronouncing_dictionary
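# Hedged usage sketch (not part of the original tests): one plausible way to
# drive the end_to_end() helper above on a tiny inline document. The sample
# sentence, file name, and the unit/word_unit/out_orth values are illustrative
# assumptions, not values taken from the real test data.
def example_end_to_end_usage():
    from lxml import etree  # same etree used throughout these snippets

    xml = etree.fromstring(
        '<document><s xml:lang="fra">bonjour le monde</s></document>'
    )
    xml, fsg, pronouncing_dictionary = end_to_end(
        xml, "bonjour.xml", unit="w", word_unit="w", out_orth="eng-arpabet"
    )
    # Inspect the generated finite-state grammar and pronouncing dictionary.
    print(fsg)
    print(pronouncing_dictionary)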
def test_invalid_langs_in_xml(self):
    xml = etree.fromstring(
        """
        <s>
        <w lang="eng" fallback-langs="foo">français falls back to invalid foo</w>
        <w lang="crx-syl">no path to arpabet</w>
        </s>
        """
    )
    with self.assertLogs(LOGGER, level="WARNING") as cm:
        c_xml, valid = convert_xml(xml)
    self.assertFalse(valid)
    logger_output = "\n".join(cm.output)
    self.assertIn('"foo": invalid language code', logger_output)
    self.assertIn('"crx-syl": no path to "eng-arpabet"', logger_output)
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files with this prefix, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: a results dictionary whose "words" entry lists the aligned
        units as {"id", "start", "end"} dicts, whose "tokenized" entry holds the
        tokenized XML, and whose "audio" entry holds the audio modified by silence
        insertion, or None if no silence was inserted.

    Raises:
        RuntimeError: if the XML input cannot be parsed, if some words could not
        be g2p'd, or if alignment produces no usable segments.
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError(
            "Error parsing XML input file %s: %s." % (xml_path, e)
        ) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs."
        )

    # Prepare the SoundSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    audio_length_in_ms = len(audio.raw_data)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically
    # 100), while the audio segments manipulated using pydub are sliced and accessed
    # in millisecond intervals. For audio segments, the ms slice assumption is
    # hard-coded all over, while frames_to_time() is used to convert segment
    # boundaries returned by soundswallower, which are indexes in frames, into
    # durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(
                prefix="readalongs_dict_", delete=False
            )
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(
                prefix="readalongs_fsg_", delete=False
            )
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(
            audio_data, word_sequence.start, word_sequence.end
        )
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)

        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text."
            )

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(
            word_sequence.start,
            word_sequence.end,
            audio_length_in_ms,
            removed_segments,
        )

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms, curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(
                    start_ms, end_ms, curr_removed_segments
                )
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
            results["words"].append({"id": seg.word, "start": start, "end": end})
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text."
            )
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append(
                    {"begin": (last_end or seq.start), "end": (seq.start or last_end)}
                )
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({"begin": last_end, "end": last_end})
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting
        )
        split_silences(results["words"], final_end, dna_for_silence_splitting)

    words_dict = {
        x["id"]: {"start": x["start"], "end": x["end"]} for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}'
                    )
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms
                )  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (
                    audio[:endpoint] + silence_segment + audio[endpoint:]
                )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (
                    words_dict[el.attrib["id"]]["end"] * 1000
                ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in "
                f"{xml_path}, please make sure each silence element is properly "
                'formatted, e.g., <silence dur="1.5s"/>. Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
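# Hedged usage sketch (not part of the library code): one plausible way to call
# align_audio() above. "story.xml" and "story.mp3" are placeholder paths; the
# "words" key and its "id"/"start"/"end" fields come from the results dict the
# function itself builds.
def example_align_audio_usage():
    results = align_audio(
        "story.xml",  # prepared TEI-like XML input
        "story.mp3",  # any audio format ffmpeg can read
        unit="w",
        bare=False,
        verbose_g2p_warnings=False,
    )
    # Print one line per aligned word, with start/end times in seconds.
    for word in results["words"]:
        print(f'{word["id"]}: {word["start"]:.3f} -> {word["end"]:.3f}')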
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None
):
    """
    Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        ReadAlong-Studio configuration to use
    save_temps : Union[str, None], optional
        Save temporary files with this prefix, by default None

    Returns
    -------
    Dict[str, List]
        A results dictionary whose "words" entry lists the aligned units as
        {"id", "start", "end"} dicts and whose "tokenized" entry holds the
        tokenized XML.

    Raises
    ------
    RuntimeError
        If the XML input file cannot be parsed.
    RuntimeError
        If alignment produces no segments.
    RuntimeError
        If alignment produces only noise or silence segments.
    RuntimeError
        If alignment produces a different number of segments than there are
        tokens in the input.
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError(
            "Error parsing XML input file %s: %s." % (xml_path, e)
        ) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()

    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
        # change back to seconds to write to smil
        start = start_ms / 1000
        end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )
    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2

    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
def run_convert_xml(self, input_string):
    """wrap convert_xml to make unit testing easier"""
    return etree.tounicode(convert_xml(etree.fromstring(input_string))[0])
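# Hedged sketch (not from the test suite): how a test in the same TestCase could
# use the run_convert_xml() wrapper above. The input fragment and the expected
# substring are assumptions chosen only to show the call pattern; as in
# test_convert_xml_invalid() above, a word that already carries valid ARPABET is
# expected to pass through unchanged.
def example_run_convert_xml_usage(self):
    converted = self.run_convert_xml('<s><w ARPABET="HH EH L OW">hello</w></s>')
    self.assertIn('ARPABET="HH EH L OW"', converted)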
def g2p(**kwargs):
    """Apply g2p mappings to TOKFILE into G2PFILE.

    TOKFILE should have been produced by 'readalongs tokenize'. G2PFILE can then
    be modified to adjust the phonetic representation as needed. 'readalongs
    align' can be called with G2PFILE instead of TOKFILE as XML input.

    The g2p cascade will be enabled whenever an XML element or any of its
    ancestors in TOKFILE has the attribute "fallback-langs" containing a comma-
    or colon-separated list of language codes. Provide multiple language codes
    to "readalongs prepare" via its -l option to generate this attribute
    globally, or add it manually where needed. Undetermined, "und", is
    automatically added at the end of the language list provided via -l.

    With the g2p cascade, if a word cannot be mapped to valid ARPABET with the
    language found in the "xml:lang" attribute, the languages in
    "fallback-langs" are tried in order until a valid ARPABET mapping is
    generated.

    The output XML file can be used as input to align.

    TOKFILE: Path to the input tokenized XML file, or - for stdin

    G2PFILE: Output path for the g2p'd XML, or - for stdout [default: TOKFILE
    with .g2p. inserted]
    """
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
        LOGGER.info(
            "Running readalongs g2p(tokfile={}, g2pfile={}, force-overwrite={}).".format(
                kwargs["tokfile"],
                kwargs["g2pfile"],
                kwargs["force_overwrite"],
            )
        )

    input_file = kwargs["tokfile"]

    if not kwargs["g2pfile"]:
        output_path = get_click_file_name(input_file)
        if output_path != "-":
            if output_path.endswith(".xml"):
                output_path = output_path[:-4]
            if output_path.endswith(".tokenized"):
                output_path = output_path[: -len(".tokenized")]
            output_path += ".g2p.xml"
    else:
        output_path = kwargs["g2pfile"]
        if not output_path.endswith(".xml") and not output_path == "-":
            output_path += ".xml"

    if os.path.exists(output_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % output_path
        )

    try:
        xml = etree.parse(input_file).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (get_click_file_name(input_file), e)
        ) from e

    # Add the IDs to paragraph, sentences, word, etc.
    xml = add_ids(xml)
    # Apply the g2p mappings.
    xml, valid = convert_xml(xml, verbose_warnings=kwargs["g2p_verbose"])

    if output_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_path, xml)
        LOGGER.info("Wrote {}".format(output_path))

    if not valid:
        LOGGER.error(
            "Some word(s) could not be g2p'd correctly."
            + (
                " Run again with --g2p-verbose to get more detailed error messages."
                if not kwargs["g2p_verbose"]
                else ""
            )
        )
        sys.exit(1)
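# Hedged sketch (not part of the CLI module): exercising the g2p command above
# with click's test runner. It assumes g2p is registered as a click command
# (its decorators are not shown here). The file names are placeholders; the
# TOKFILE/G2PFILE argument order follows the docstring, and "-f" matches the
# overwrite flag referenced in the error message above.
def example_g2p_cli_usage():
    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(g2p, ["story.tokenized.xml", "story.g2p.xml", "-f"])
    # Exit code 0 means every word was g2p'd successfully; 1 means some failed.
    print(result.exit_code)
    print(result.output)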