def test_align_with_invalid_preg2p(self):
    """Check that g2p and align_audio both report a word with an invalid ARPABET attribute."""
    xml_text = """<document><s xml:lang="und"> <w>word</w> <w ARPABET="G OW D">good</w> <w ARPABET="NOT ARPABET">error</w> </s></document>"""
    xml_path = os.path.join(self.tempdir, "pre-g2p.xml")
    with open(xml_path, "w", encoding="utf8") as xml_out:
        xml_out.write(xml_text + "\n")

    # The g2p command must exit with an error while its output still
    # contains the error message and the three <w> elements.
    cli_result = self.runner.invoke(g2p, [xml_path, "-"])
    self.assertNotEqual(cli_result.exit_code, 0)
    cli_output = cli_result.output
    expected_fragments = (
        "could not be g2p",
        '<w id="s0w0" ARPABET="W OW D D">word</w>',
        '<w ARPABET="G OW D" id="s0w1">good</w>',
        '<w ARPABET="NOT ARPABET" id="s0w2">error</w>',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, cli_output)

    # Aligning the same input must raise a RuntimeError carrying the same diagnostic.
    audio_path = os.path.join(self.data_dir, "ej-fra.m4a")
    with self.assertRaises(RuntimeError) as raised:
        align_audio(xml_path, audio_path)
    self.assertIn("could not be g2p'd", str(raised.exception))
def testAlign(self):
    """Align ej-fra.xml with ej-fra.m4a and compare word IDs with ej-fra-converted.xml."""
    data_dir = self.data_dir
    alignment = align_audio(
        os.path.join(data_dir, "ej-fra.xml"),
        os.path.join(data_dir, "ej-fra.m4a"),
        unit="w",
    )

    # The aligned words must carry the same IDs, in the same order, as the
    # <w> elements of the reference file.
    reference_root = etree.parse(
        os.path.join(data_dir, "ej-fra-converted.xml")
    ).getroot()
    reference_words = reference_root.xpath(".//w")
    aligned_words = alignment["words"]
    self.assertEqual(len(aligned_words), len(reference_words))
    for aligned, reference in zip(aligned_words, reference_words):
        self.assertEqual(reference.attrib["id"], aligned["id"])
def test_align_with_preg2p(self):
    """Check the dictionary written by align_audio for previously g2p'd, mixed-language text."""
    xml_path = os.path.join(self.data_dir, "mixed-langs.tokenized.xml")
    audio_path = os.path.join(self.data_dir, "ej-fra.m4a")
    temps_prefix = os.path.join(self.tempdir, "foo")

    with SoundSwallowerStub("t0b0d0p0s0w0:920:1620", "t0b0d0p0s1w0:1620:1690"):
        align_audio(xml_path, audio_path, save_temps=temps_prefix)

    # Inspect the pronunciation dictionary saved next to the other temp files.
    with open(temps_prefix + ".dict", "r", encoding="utf8") as dict_in:
        dict_contents = dict_in.read()
    expected_pronunciations = (
        "S AH S IY",  # "ceci" in fra
        "DH IH S",  # "this" in eng
        "HH EH Y",  # "Hej" in dan
        "D G IY T UW P IY D",  # pre-g2p'd OOV
    )
    for pronunciation in expected_pronunciations:
        self.assertIn(pronunciation, dict_contents)
def testAlignText(self):
    """Align plain-text input (converted with create_input_tei) and compare word IDs."""
    data_dir = self.data_dir
    # Keep the first return value bound for the whole test, as the original
    # did; presumably it is the handle that owns the generated file — TODO confirm.
    tei_handle, tei_path = create_input_tei(
        input_file_name=os.path.join(data_dir, "ej-fra.txt"),
        text_language="fra",
        save_temps=None,
    )
    alignment = align_audio(
        tei_path, os.path.join(data_dir, "ej-fra.m4a"), unit="w", save_temps=None
    )

    # The aligned words must carry the same IDs as the reference file's <w> elements.
    reference_root = etree.parse(
        os.path.join(data_dir, "ej-fra-converted.xml")
    ).getroot()
    reference_words = reference_root.xpath(".//w")
    aligned_words = alignment["words"]
    self.assertEqual(len(aligned_words), len(reference_words))
    for aligned, reference in zip(aligned_words, reference_words):
        self.assertEqual(reference.attrib["id"], aligned["id"])
def test_align_text(self):
    """Align plain-text input declared as "fra" and compare word IDs with the reference."""
    data_dir = self.data_dir
    # The first return value is unused but stays bound to "_" for the whole test.
    _, tei_path = create_input_tei(
        input_file_name=os.path.join(data_dir, "ej-fra.txt"),
        text_languages=("fra",),
        save_temps=None,
    )
    alignment = align_audio(
        tei_path, os.path.join(data_dir, "ej-fra.m4a"), unit="w", save_temps=None
    )

    # The aligned words must carry the same IDs as the reference file's <w> elements.
    reference_root = etree.parse(
        os.path.join(data_dir, "ej-fra-converted.xml")
    ).getroot()
    reference_words = reference_root.xpath(".//w")
    aligned_words = alignment["words"]
    self.assertEqual(len(aligned_words), len(reference_words))
    for aligned, reference in zip(aligned_words, reference_words):
        self.assertEqual(reference.attrib["id"], aligned["id"])
def test_anchors_inner_only(self):
    """Align ej-fra-anchors.xml and check the words stay on the right side of each anchor."""
    # ej-fra-anchors has anchors between words/sentences only
    alignment = align_audio(
        os.path.join(self.data_dir, "ej-fra-anchors.xml"),
        os.path.join(self.data_dir, "ej-fra.m4a"),
    )
    words = alignment["words"]

    # The input text file has 99 words, so should the aligned segments.
    self.assertEqual(len(words), 99)

    # Each row is (word index, time key, assertion, anchor time): the word's
    # "end" must not exceed the anchor, or its "start" must not precede it.
    anchor_checks = (
        (0, "end", self.assertLessEqual, 1.62),
        (1, "start", self.assertGreaterEqual, 1.62),
        (8, "end", self.assertLessEqual, 3.81),
        (9, "start", self.assertGreaterEqual, 3.82),
        (21, "end", self.assertLessEqual, 6.74),
        (22, "start", self.assertGreaterEqual, 6.74),
    )
    for index, key, check, anchor_time in anchor_checks:
        check(words[index][key], anchor_time)
def test_anchors_outer_too(self):
    """Align ej-fra-anchors2.xml, whose anchors define DNA segments at start and end too."""
    # ej-fra-anchors2 also has anchors before the first word and after the last word
    temps_prefix = os.path.join(self.tempdir, "anchors2-temps")
    alignment = align_audio(
        os.path.join(self.data_dir, "ej-fra-anchors2.xml"),
        os.path.join(self.data_dir, "ej-fra.m4a"),
        save_temps=temps_prefix,
    )
    words = alignment["words"]

    # The input text file has 99 words, so should the aligned segments.
    self.assertEqual(len(words), 99)

    # Each row is (word index, time key, assertion, anchor time), including
    # the initial and final anchors inserted into anchors2.xml.
    anchor_checks = (
        (0, "start", self.assertGreaterEqual, 0.5),
        (0, "end", self.assertLessEqual, 1.2),
        (1, "start", self.assertGreaterEqual, 1.2),
        (8, "end", self.assertLessEqual, 3.6),
        (9, "start", self.assertGreaterEqual, 3.9),
        (21, "end", self.assertLessEqual, 7.0),
        (22, "start", self.assertGreaterEqual, 7.0),
        (-1, "end", self.assertLessEqual, 33.2),
    )
    for index, key, check, anchor_time in anchor_checks:
        check(words[index][key], anchor_time)

    # Make sure the audio segment temp files were written and are not empty
    for suffix in ("", ".2", ".3", ".4"):
        segment_path = temps_prefix + ".wav" + suffix
        self.assertTrue(
            os.path.exists(segment_path), f"{segment_path} should exist"
        )
        self.assertGreater(
            os.path.getsize(segment_path),
            0,
            f"{segment_path} should not be empty",
        )
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in directory
    OUTPUT_BASE/.

    TEXTFILE:    Input text file path (in XML, or plain text with -i)

    AUDIOFILE:   Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Base name for output files
    """
    # NOTE(review): the docstring above presumably doubles as the CLI help text
    # (this function reports errors through click exceptions), so its wording
    # is intentionally left untouched — confirm against the decorators.

    # Load the optional JSON configuration. An unreadable config is a usage
    # error: continuing would hand the file *path* to align_audio as if it
    # were the parsed configuration.
    config = kwargs.get("config", None)
    if config:
        config_file = config
        if not config_file.endswith("json"):
            raise click.BadParameter(
                f"Config file '{config_file}' must be in JSON format"
            )
        try:
            with open(config_file, encoding="utf8") as f:
                config = json.load(f)
        except json.decoder.JSONDecodeError as e:
            raise click.BadParameter(
                f"Config file at {config_file} is not valid json."
            ) from e

    # Create the output directory, or validate an existing one.
    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is a not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and user
    # friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception as e:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        ) from e

    # All outputs are named OUTPUT_BASE/<basename of OUTPUT_BASE>.<extension>.
    output_basename = os.path.basename(output_dir)
    output_base = os.path.join(output_dir, output_basename)

    # Temporary files, when requested, go under OUTPUT_BASE/tempfiles/.
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            # A non-directory entry in the way is only removed with -f;
            # otherwise os.mkdir() below raises.
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")

    # Plain-text input is first converted to XML; alignment then runs on the
    # converted file.
    textfile = kwargs["textfile"]
    if kwargs["text_input"]:
        if not kwargs["language"]:
            LOGGER.warning("No input language provided, using undetermined mapping")
        # Keep the first return value bound until this function returns, as
        # the original did; presumably it is the handle that owns the
        # converted file — TODO confirm.
        _tei_handle, textfile = create_input_tei(
            input_file_name=textfile,
            text_language=kwargs["language"],
            save_temps=temp_base,
        )

    # Work out the output paths and refuse to clobber existing files without -f.
    if kwargs["output_xhtml"]:
        tokenized_xml_path = f"{output_base}.xhtml"
    else:
        _, input_ext = os.path.splitext(textfile)
        tokenized_xml_path = f"{output_base}{input_ext}"
    smil_path = output_base + ".smil"
    _, audio_ext = os.path.splitext(kwargs["audiofile"])
    audio_path = output_base + audio_ext
    for output_path in (tokenized_xml_path, smil_path, audio_path):
        if os.path.exists(output_path) and not kwargs["force_overwrite"]:
            raise click.BadParameter(
                f"Output file {output_path} exists already, use -f to overwrite."
            )

    # "unit" may be present in kwargs but set to None, so fall back explicitly.
    unit = kwargs.get("unit") or "w"
    bare = kwargs.get("bare", False)

    try:
        results = align_audio(
            textfile,
            kwargs["audiofile"],
            unit=unit,
            bare=bare,
            config=config,
            save_temps=temp_base,
        )
    except RuntimeError as e:
        LOGGER.error(e)
        # Same effect as exit(1), without relying on the interactive "site" builtin.
        raise SystemExit(1) from e

    # Optional TextGrid and EAF exports.
    if kwargs["text_grid"]:
        audio = read_audio_from_file(kwargs["audiofile"])
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(results)
        textgrid = write_to_text_grid(words, sentences, duration)
        textgrid.to_file(output_base + ".TextGrid")
        textgrid.to_eaf().to_file(output_base + ".eaf")

    # Optional subtitle exports, at sentence and word granularity.
    if kwargs["closed_captioning"]:
        words, sentences = return_words_and_sentences(results)
        webvtt_sentences = write_to_subtitles(sentences)
        webvtt_sentences.save(output_base + "_sentences.vtt")
        webvtt_sentences.save_as_srt(output_base + "_sentences.srt")
        webvtt_words = write_to_subtitles(words)
        webvtt_words.save(output_base + "_words.vtt")
        webvtt_words.save_as_srt(output_base + "_words.srt")

    if kwargs["output_xhtml"]:
        convert_to_xhtml(results["tokenized"])

    # Write the main outputs: index.html, tokenized text, audio copy and SMIL.
    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )
    save_xml(tokenized_xml_path, results["tokenized"])
    smil = make_smil(
        os.path.basename(tokenized_xml_path), os.path.basename(audio_path), results
    )
    shutil.copy(kwargs["audiofile"], audio_path)
    save_txt(smil_path, smil)
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in directory
    OUTPUT_BASE/.

    TEXTFILE:    Input text file path (in XML or plain text)

    \b
    If TEXTFILE has a .xml extension or starts with an XML declaration line,
    it is parsed as XML and can be in one of three formats:
     - the output of 'readalongs prepare',
     - the output of 'readalongs tokenize', or
     - the output of 'readalongs g2p'.

    \b
    If TEXTFILE has a .txt extension or does not start with an XML declaration
    line, is it read as plain text with the following conventions:
     - The text should be plain UTF-8 text without any markup.
     - Paragraph breaks are indicated by inserting one blank line.
     - Page breaks are indicated by inserting two blank lines.

    One can add the known ARPABET phonetics in the XML for words (<w> elements)
    that are not correctly handled by g2p in the output of 'readalongs tokenize'
    or 'readalongs g2p', via the ARPABET attribute.

    One can add anchor elements in the XML, e.g., '<anchor time="2.345s"/>',
    to mark known anchor points between the audio and text stream.

    AUDIOFILE:   Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Output files will be saved as OUTPUT_BASE/OUTPUT_BASE.*
    """
    # NOTE(review): the docstring above presumably doubles as the CLI help text
    # (this function reports errors through click exceptions), so its wording
    # is intentionally left untouched — confirm against the decorators.

    # Load the optional JSON configuration; anything unreadable is a usage error.
    config_file = kwargs.get("config", None)
    config = None
    if config_file:
        if not config_file.endswith("json"):
            raise click.BadParameter(
                f"Config file '{config_file}' must be in JSON format"
            )
        try:
            with open(config_file, encoding="utf8") as f:
                config = json.load(f)
        except json.decoder.JSONDecodeError as e:
            raise click.BadParameter(
                f"Config file at {config_file} is not in valid JSON format."
            ) from e

    # Create the output directory, or validate an existing one.
    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is a not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and user
    # friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception as e:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        ) from e

    if kwargs["g2p_fallback"] is not None:
        raise click.BadParameter(
            "The --g2p-fallback option is obsolete.\n"
            "Specify multiple languages with the -l/--language option instead,\n"
            "or by adding the 'fallback-langs' attribute where relevant in your XML input."
        )

    output_basename = os.path.basename(output_dir)

    # Temporary files, when requested, go under OUTPUT_BASE/tempfiles/.
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            # A non-directory entry in the way is only removed with -f;
            # otherwise os.mkdir() below raises.
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")

    if kwargs["text_input"] is not None:
        raise click.BadParameter(
            "The -i option is obsolete. .txt files are now read as plain text, "
            ".xml as XML, and other files based on whether they start with <?xml or not."
        )

    # Determine if the file is plain text or XML
    textfile_name = kwargs["textfile"]
    if textfile_name.endswith(".xml"):
        textfile_is_plaintext = False  # .xml is XML
    elif textfile_name.endswith(".txt"):
        textfile_is_plaintext = True  # .txt is plain text
    else:
        # Files other than .xml or .txt are parsed using etree. If the parse is
        # successful or the first syntax error is past the first line, the file
        # is assumed to be XML. Plain text files will yield an error in the
        # first few characters of line 1, typically complaining about not
        # finding "<" at the start.
        # There are many valid "magic numbers" for XML files, depending on
        # their encoding (utf8, utf16, endianness, etc). If we looked for
        # "<?xml " at the beginning, that would only catch some of the valid
        # XML encodings that etree can parse.
        # We could also use python-magic or filetype, but why introduce another
        # dependency when we can ask the library we're already using!?
        try:
            etree.parse(textfile_name)
            textfile_is_plaintext = False
        except etree.XMLSyntaxError as e:
            # e.position is compared as a (line, column) pair.
            textfile_is_plaintext = e.position <= (1, 10)

    if textfile_is_plaintext:
        if not kwargs["language"]:
            raise click.BadParameter(
                "No input language specified for plain text input. "
                "Please provide the -l/--language switch."
            )
        # Work on a copy: appending "und" must not mutate the caller's value,
        # and the copy also accepts a tuple.
        languages = list(kwargs["language"])
        if not kwargs["lang_no_append_und"] and "und" not in languages:
            languages.append("und")
        try:
            # The first return value is unused but stays bound to "_" until
            # this function returns.
            _, xml_textfile = create_input_tei(
                input_file_name=kwargs["textfile"],
                text_languages=languages,
                save_temps=temp_base,
            )
        except RuntimeError as e:
            raise click.UsageError(str(e)) from e
    else:
        xml_textfile = kwargs["textfile"]

    bare = kwargs.get("bare", False)

    try:
        results = align_audio(
            xml_textfile,
            kwargs["audiofile"],
            bare=bare,
            config=config,
            save_temps=temp_base,
            verbose_g2p_warnings=kwargs["g2p_verbose"],
        )
    except RuntimeError as e:
        # Alignment failures are reported as usage errors.
        raise click.UsageError(str(e)) from e

    save_readalong(
        align_results=results,
        output_dir=output_dir,
        output_basename=output_basename,
        config=config,
        audiofile=kwargs["audiofile"],
        audiosegment=results["audio"],
        output_formats=kwargs["output_formats"],
    )