def go(input_filename, output_filename, word_unit="w", output_orthography="eng-arpabet"):
    """Load an XML file, run g2p conversion on it, and save the result.

    Args:
        input_filename: path to the XML input file
        output_filename: path where the converted XML is written
        word_unit: element type to treat as a word (default "w")
        output_orthography: target orthography for conversion (default "eng-arpabet")
    """
    source_tree = load_xml(input_filename)
    save_xml(output_filename, convert_xml(source_tree, word_unit, output_orthography))
def tokenize(**kwargs):
    """Tokenize XMLFILE for 'readalongs align' into TOKFILE.

    XMLFILE should have been produced by 'readalongs prepare'. TOKFILE can be
    augmented with word-specific language codes. 'readalongs align' can be
    called with either XMLFILE or TOKFILE as XML input.

    XMLFILE: Path to the XML file to tokenize, or - for stdin

    TOKFILE: Output path for the tok'd XML, or - for stdout [default: XMLFILE.tokenized.xml]
    """
    # kwargs come from the click option/argument declarations on this command.
    xmlfile = kwargs["xmlfile"]
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    LOGGER.info(
        "Running readalongs tokenize(xmlfile={}, tokfile={}, force-overwrite={})."
        .format(
            kwargs["xmlfile"],
            kwargs["tokfile"],
            kwargs["force_overwrite"],
        ))
    if not kwargs["tokfile"]:
        # No output path given: derive it from the input file's name.
        # xmlfile is a click-opened file object; stdin has no usable .name.
        try:
            output_tok_path = xmlfile.name
        except Exception:
            output_tok_path = "<stdin>"
        if output_tok_path == "<stdin>":
            output_tok_path = "-"  # input from stdin => output to stdout
        else:
            # Replace a trailing ".xml" with ".tokenized.xml".
            if output_tok_path.endswith(".xml"):
                output_tok_path = output_tok_path[:-4]
            output_tok_path += ".tokenized.xml"
    else:
        output_tok_path = kwargs["tokfile"]
        # Ensure the explicit output name has an .xml extension (unless stdout).
        if not output_tok_path.endswith(".xml") and not output_tok_path == "-":
            output_tok_path += ".xml"
    # Refuse to clobber an existing file unless -f/--force-overwrite was given.
    if os.path.exists(output_tok_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % output_tok_path)
    try:
        xml = etree.parse(xmlfile).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (xmlfile, e))
    # The actual work: split text into word tokens in the XML tree.
    xml = tokenize_xml(xml)
    if output_tok_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_tok_path, xml)
    LOGGER.info("Wrote {}".format(output_tok_path))
def testConvert(self):
    """Convert ej-fra-converted.xml to XHTML and compare against the reference file."""
    source_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
    doc = etree.parse(source_path).getroot()
    convert_to_xhtml(doc)
    with PortableNamedTemporaryFile(suffix=".xml") as tf:
        # Round-trip through a temp file so we compare serialized text, not trees.
        save_xml(tf.name, doc)
        produced = load_txt(tf.name)
    self.maxDiff = None
    expected = load_txt(os.path.join(self.data_dir, "ej-fra-converted.xhtml"))
    self.assertEqual(produced, expected)
def go(
    input_filename,
    output_xml_filename,
    output_fsg_filename,
    output_dict_filename,
    unit,
    word_unit,
    out_orth,
):
    """Run the end-to-end pipeline on an XML file and write all three outputs.

    Args:
        input_filename: path to the XML input file
        output_xml_filename: path for the processed XML output
        output_fsg_filename: path for the FSG text output
        output_dict_filename: path for the pronunciation dictionary output
        unit: alignment unit element
        word_unit: word element type
        out_orth: target output orthography
    """
    source_tree = load_xml(input_filename)
    converted_tree, fsg_text, dict_text = end_to_end(
        source_tree, input_filename, unit, word_unit, out_orth
    )
    save_xml(output_xml_filename, converted_tree)
    save_txt(output_fsg_filename, fsg_text)
    save_txt(output_dict_filename, dict_text)
def main(input_path, output_path, unit="p"):
    """Add language ids to the XML at input_path and save the result to output_path.

    Args:
        input_path: path to the XML input file
        output_path: path where the annotated XML is written
        unit: element type at which language ids are added (default "p")
    """
    tree = load_xml(input_path)
    add_lang_ids(tree, unit)
    save_xml(output_path, tree)
def save_readalong(
    # noqa C901 - ignore the complexity of this function
    # this * forces all arguments to be passed by name, because I don't want any
    # code to depend on their order in the future
    *,
    align_results: Dict[str, List],
    output_dir: str,
    output_basename: str,
    config=None,
    audiofile: str,
    audiosegment: AudioSegment = None,
    output_formats=(),
):
    """Save the results from align_audio() into the output files required for a readalong

    Args:
        align_results(Dict[str,List]): return value from align_audio()
        output_dir (str): directory where to save the readalong,
            output_dir should already exist, files it contains may be overwritten
        output_basename (str): basename of the files to save in output_dir
        config ([type TODO], optional): alignment configuration loaded from the json
        audiofile (str): path to the audio file passed to align_audio()
        output_formats (List[str], optional): list of desired output formats
        audiosegment (AudioSegment): a pydub.AudioSegment object of processed audio.
            if None, then original audio will be saved at `audiofile`

    Returns:
        None

    Raises:
        [TODO]
    """
    # Round all times to three digits, anything more is excess precision
    # polluting the output files, and usually due to float rounding errors anyway.
    for w in align_results["words"]:
        w["start"] = round(w["start"], 3)
        w["end"] = round(w["end"], 3)

    output_base = os.path.join(output_dir, output_basename)

    # Create textgrid object if outputting to TextGrid or eaf
    if "TextGrid" in output_formats or "eaf" in output_formats:
        audio = read_audio_from_file(audiofile)
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(align_results)
        textgrid = write_to_text_grid(words, sentences, duration)
        if "TextGrid" in output_formats:
            textgrid.to_file(output_base + ".TextGrid")
        if "eaf" in output_formats:
            textgrid.to_eaf().to_file(output_base + ".eaf")

    # Create webvtt object if outputting to vtt or srt
    if "srt" in output_formats or "vtt" in output_formats:
        words, sentences = return_words_and_sentences(align_results)
        cc_sentences = write_to_subtitles(sentences)
        cc_words = write_to_subtitles(words)
        if "srt" in output_formats:
            cc_sentences.save_as_srt(output_base + "_sentences.srt")
            cc_words.save_as_srt(output_base + "_words.srt")
        if "vtt" in output_formats:
            cc_words.save(output_base + "_words.vtt")
            cc_sentences.save(output_base + "_sentences.vtt")

    # The tokenized XML is always saved, regardless of requested formats.
    tokenized_xml_path = output_base + ".xml"
    save_xml(tokenized_xml_path, align_results["tokenized"])

    if "xhtml" in output_formats:
        # NOTE: convert_to_xhtml mutates align_results["tokenized"] in place.
        convert_to_xhtml(align_results["tokenized"])
        tokenized_xhtml_path = output_base + ".xhtml"
        save_xml(tokenized_xhtml_path, align_results["tokenized"])

    _, audio_ext = os.path.splitext(audiofile)
    audio_path = output_base + audio_ext
    audio_format = audio_ext[1:]
    if audiosegment:
        # ffmpeg knows the m4a/aac containers by the name "ipod".
        if audio_format in ["m4a", "aac"]:
            audio_format = "ipod"
        try:
            audiosegment.export(audio_path, format=audio_format)
        except CouldntEncodeError:
            # Fall back to wav when the requested codec is unavailable.
            LOGGER.warning(
                f"The audio file at {audio_path} could \
not be exported in the {audio_format} format. \
Please ensure your installation of ffmpeg has \
the necessary codecs."
            )
            audio_path = output_base + ".wav"
            audiosegment.export(audio_path, format="wav")
    else:
        # No processed audio: just copy the original file alongside the outputs.
        shutil.copy(audiofile, audio_path)

    # The SMIL file links the tokenized XML to timestamps in the audio.
    smil_path = output_base + ".smil"
    smil = make_smil(
        os.path.basename(tokenized_xml_path),
        os.path.basename(audio_path),
        align_results,
    )
    save_txt(smil_path, smil)

    if "html" in output_formats:
        html_out_path = output_base + ".html"
        html_out = create_web_component_html(tokenized_xml_path, smil_path, audio_path)
        with open(html_out_path, "w") as f:
            f.write(html_out)

    # index.html is always written so the output folder is directly servable.
    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    # Copy the image files to the output's asset directory, if any are found
    if config and "images" in config:
        assets_dir = os.path.join(output_dir, "assets")
        try:
            os.mkdir(assets_dir)
        except FileExistsError:
            # An existing directory is fine; anything else (e.g. a file by
            # that name) is a real error and is re-raised.
            if not os.path.isdir(assets_dir):
                raise
        for _, image in config["images"].items():
            if image[0:4] == "http":
                # Remote images are not copied; the user must host them.
                LOGGER.warning(
                    f"Please make sure {image} is accessible to clients using your read-along."
                )
            else:
                try:
                    shutil.copy(image, assets_dir)
                except Exception as e:
                    LOGGER.warning(
                        f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})"
                    )
                if os.path.basename(image) != image:
                    LOGGER.warning(
                        f"Read-along images were tested with absolute urls (starting with http(s):// "
                        f"and filenames without a path. {image} might not work as specified."
                    )
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional;
            display all g2p errors and warnings iff True

    Returns:
        Dict[str, List]: TODO

    Raises:
        TODO
    """
    # "audio" is only filled in when <silence> elements force audio edits.
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    # Keep the id-annotated tree in the results; later stages keep mutating xml.
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    # Very wide beams: we force-align against a known FSG, so prune little.
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    # Mono, 16-bit samples, as the acoustic model expects.
    audio = audio.set_channels(1).set_sample_width(2)
    # NOTE(review): len(audio.raw_data) is a byte count, not milliseconds,
    # despite the variable name -- confirm this is what dna_union expects.
    audio_length_in_ms = len(audio.raw_data)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA (do-not-align) segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio, int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    # Best effort: remove the partial file, then fall back to wav.
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    # FFT size must be the smallest power of two >= points per analysis window.
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")
    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.

    def frames_to_time(frames):
        # Convert a decoder frame index into a duration in seconds.
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):
        # Suffix for per-sequence temp files: "", ".2", ".3", ...
        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()
        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)

        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()
        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms, removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            # Skip decoder-internal silence/noise tokens.
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            # Shift times to account for audio that was removed before decoding.
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms, curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)
        split_silences(results["words"], final_end, dna_for_silence_splitting)

    # Index word timings by id for the <silence> pass below.
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    # Insert user-requested <silence dur="..."/> gaps into the audio and shift
    # all subsequent word timings accordingly.
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        # Document-order traversal so inserted silences accumulate correctly.
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>. Aborting.'
            )
    if silence:
        # Shift every word by the total silence inserted before it.
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """
    Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    #TODO: document return
    Returns
    -------
    [type]
        [description]

    #TODO: document exceptions
    Raises
    ------
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        # NOTE(review): consider "raise ... from e" to keep the original traceback.
        raise RuntimeError("Error parsing XML input file %s: %s."
                           % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    # Keep the id-annotated tree in the results; later stages keep mutating xml.
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                               delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                              delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    # Very wide beams: we force-align against a known FSG, so prune little.
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)
    audio = read_audio_from_file(audio_path)
    # Mono, 16-bit samples, as the acoustic model expects.
    audio = audio.set_channels(1).set_sample_width(2)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments so removal doesn't shift later offsets.
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    # Fall back to wav when the codec is unavailable.
                    os.remove(save_temps + "_processed" + ext)
                    # NOTE(review): LOGGER.warn is deprecated in the logging
                    # module; LOGGER.warning is the canonical spelling.
                    LOGGER.warn(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    # FFT size must be the smallest power of two >= points per analysis window.
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        # Convert a decoder frame index into a duration in seconds.
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        # Shift times to account for audio removed before decoding.
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
        # change back to seconds to write to smil
        start = start_ms / 1000
        end = end_ms / 1000
        # Skip decoder-internal silence/noise tokens.
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                # Give each word half of the silence on either side of it.
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2

    # Clean up the temporary dict/fsg files unless the caller asked to keep them.
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)
    return results
def go(input_filename, output_filename):
    """Tokenize the XML at input_filename and write the result to output_filename."""
    tree = load_xml(input_filename)
    save_xml(output_filename, tokenize_xml(tree))
def go(input_filename: str, output_filename: str) -> None:
    """Assign ids to the elements of the XML at input_filename and save it."""
    annotated = add_ids(load_xml(input_filename))
    save_xml(output_filename, annotated)
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in directory OUTPUT_BASE/.

    TEXTFILE: Input text file path (in XML, or plain text with -i)

    AUDIOFILE: Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Base name for output files
    """
    # kwargs come from the click option/argument declarations on this command.
    config = kwargs.get("config", None)
    if config:
        if config.endswith("json"):
            try:
                with open(config) as f:
                    config = json.load(f)
            except json.decoder.JSONDecodeError:
                LOGGER.error(f"Config file at {config} is not valid json.")
        else:
            raise click.BadParameter(f"Config file '{config}' must be in JSON format")

    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is a not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and user
    # friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        )

    # All outputs share the directory's basename: OUTPUT_BASE/OUTPUT_BASE.*
    output_basename = os.path.basename(output_dir)
    output_base = os.path.join(output_dir, output_basename)
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            # A plain file named "tempfiles" is replaced when -f is given.
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    if kwargs["text_input"]:
        # Plain-text input: wrap it in a TEI XML skeleton first.
        if not kwargs["language"]:
            LOGGER.warn("No input language provided, using undetermined mapping")
        tempfile, kwargs["textfile"] = create_input_tei(
            input_file_name=kwargs["textfile"],
            text_language=kwargs["language"],
            save_temps=temp_base,
        )
    if kwargs["output_xhtml"]:
        tokenized_xml_path = "%s.xhtml" % output_base
    else:
        _, input_ext = os.path.splitext(kwargs["textfile"])
        tokenized_xml_path = "%s%s" % (output_base, input_ext)
    # Check every output path up front so we fail before doing the slow alignment.
    if os.path.exists(tokenized_xml_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % tokenized_xml_path
        )
    smil_path = output_base + ".smil"
    if os.path.exists(smil_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % smil_path
        )
    _, audio_ext = os.path.splitext(kwargs["audiofile"])
    audio_path = output_base + audio_ext
    if os.path.exists(audio_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % audio_path
        )
    unit = kwargs.get("unit", "w")
    bare = kwargs.get("bare", False)
    if (
        not unit
    ):  # .get() above should handle this but apparently the way kwargs is implemented
        unit = "w"  # unit could still be None here.
    try:
        results = align_audio(
            kwargs["textfile"],
            kwargs["audiofile"],
            unit=unit,
            bare=bare,
            config=config,
            save_temps=temp_base,
        )
    except RuntimeError as e:
        LOGGER.error(e)
        exit(1)

    if kwargs["text_grid"]:
        # Praat TextGrid and ELAN eaf outputs.
        audio = read_audio_from_file(kwargs["audiofile"])
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(results)
        textgrid = write_to_text_grid(words, sentences, duration)
        textgrid.to_file(output_base + ".TextGrid")
        textgrid.to_eaf().to_file(output_base + ".eaf")

    if kwargs["closed_captioning"]:
        # WebVTT and SRT subtitle outputs, at both sentence and word granularity.
        words, sentences = return_words_and_sentences(results)
        webvtt_sentences = write_to_subtitles(sentences)
        webvtt_sentences.save(output_base + "_sentences.vtt")
        webvtt_sentences.save_as_srt(output_base + "_sentences.srt")
        webvtt_words = write_to_subtitles(words)
        webvtt_words.save(output_base + "_words.vtt")
        webvtt_words.save_as_srt(output_base + "_words.srt")

    if kwargs["output_xhtml"]:
        # Mutates results["tokenized"] in place.
        convert_to_xhtml(results["tokenized"])

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    save_xml(tokenized_xml_path, results["tokenized"])
    smil = make_smil(
        os.path.basename(tokenized_xml_path), os.path.basename(audio_path), results
    )
    shutil.copy(kwargs["audiofile"], audio_path)
    save_txt(smil_path, smil)
def g2p(**kwargs):
    """Apply g2p mappings to TOKFILE into G2PFILE.

    TOKFILE should have been produced by 'readalongs tokenize'. G2PFILE can
    then be modified to adjust the phonetic representation as needed.
    'readalongs align' can be called with G2PFILE instead of TOKFILE as XML input.

    The g2p cascade will be enabled whenever an XML element or any of its
    ancestors in TOKFILE has the attribute "fallback-langs" containing a comma-
    or colon-separated list of language codes. Provide multiple language codes
    to "readalongs prepare" via its -l option to generate this attribute
    globally, or add it manually where needed. Undetermined, "und", is
    automatically added at the end of the language list provided via -l.

    With the g2p cascade, if a word cannot be mapped to valid ARPABET with the
    language found in the "xml:lang" attribute, the languages in
    "fallback-langs" are tried in order until a valid ARPABET mapping is
    generated. The output XML file can be used as input to align.

    TOKFILE: Path to the input tokenized XML file, or - for stdin

    G2PFILE: Output path for the g2p'd XML, or - for stdout [default: TOKFILE
    with .g2p. inserted]
    """
    # kwargs come from the click option/argument declarations on this command.
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    LOGGER.info(
        "Running readalongs g2p(tokfile={}, g2pfile={}, force-overwrite={})."
        .format(
            kwargs["tokfile"],
            kwargs["g2pfile"],
            kwargs["force_overwrite"],
        ))
    input_file = kwargs["tokfile"]
    if not kwargs["g2pfile"]:
        # No output path given: derive it from the input file's name,
        # replacing a ".tokenized.xml" / ".xml" ending with ".g2p.xml".
        output_path = get_click_file_name(input_file)
        if output_path != "-":
            if output_path.endswith(".xml"):
                output_path = output_path[:-4]
            if output_path.endswith(".tokenized"):
                output_path = output_path[:-len(".tokenized")]
            output_path += ".g2p.xml"
    else:
        output_path = kwargs["g2pfile"]
        # Ensure the explicit output name has an .xml extension (unless stdout).
        if not output_path.endswith(".xml") and not output_path == "-":
            output_path += ".xml"
    # Refuse to clobber an existing file unless -f/--force-overwrite was given.
    if os.path.exists(output_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite."
            % output_path)
    try:
        xml = etree.parse(input_file).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (get_click_file_name(input_file), e))

    # Add the IDs to paragraph, sentences, word, etc.
    xml = add_ids(xml)
    # Apply the g2p mappings.
    xml, valid = convert_xml(
        xml,
        verbose_warnings=kwargs["g2p_verbose"],
    )

    if output_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_path, xml)
        LOGGER.info("Wrote {}".format(output_path))

    # The output is written even on g2p failure (it's useful for debugging),
    # but exit non-zero so callers/scripts can detect the problem.
    if not valid:
        LOGGER.error("Some word(s) could not be g2p'd correctly." + (
            " Run again with --g2p-verbose to get more detailed error messages."
            if not kwargs["g2p_verbose"] else ""))
        sys.exit(1)