def setUp(self):
    self.data_path = os.path.join(os.path.dirname(__file__), "data")
    self.audio_segment = read_audio_from_file(
        os.path.join(self.data_path, "audio_sample.ogg")
    )
    self.noisy_segment = read_audio_from_file(
        os.path.join(self.data_path, "noise_at_1500.mp3")
    )
    # Use a TemporaryDirectory object for temp outputs, so they get cleaned up
    # automatically, even in case of errors or aborted runs.
    self.tempdirobj = tempfile.TemporaryDirectory(
        prefix="test_audio_tmpdir", dir="."
    )
    self.tempdir = self.tempdirobj.name
def test_write_audio_to_file(self):
    """Minimal unit testing for write_audio_to_file"""
    section = extract_section(self.audio_segment, 1000, 2000)
    output_path = os.path.join(self.tempdir, "section_output.mp3")
    write_audio_to_file(section, output_path)
    self.assertTrue(os.path.exists(output_path))
    reloaded_section = read_audio_from_file(output_path)
    self.assertAlmostEqual(
        len(section),
        len(reloaded_section),
        msg="reloaded audio file is more than 50ms shorter or longer",
        delta=50,
    )
def save_readalong(  # noqa: C901 - ignore the complexity of this function
    # this * forces all arguments to be passed by name, because I don't want any
    # code to depend on their order in the future
    *,
    align_results: Dict[str, List],
    output_dir: str,
    output_basename: str,
    config=None,
    audiofile: str,
    audiosegment: AudioSegment = None,
    output_formats=(),
):
    """Save the results from align_audio() into the output files required for a
    readalong

    Args:
        align_results (Dict[str, List]): return value from align_audio()
        output_dir (str): directory where to save the readalong;
            output_dir should already exist, and files it contains may be overwritten
        output_basename (str): basename of the files to save in output_dir
        config ([type TODO], optional): alignment configuration loaded from the JSON file
        audiofile (str): path to the audio file passed to align_audio()
        output_formats (List[str], optional): list of desired output formats
        audiosegment (AudioSegment, optional): a pydub.AudioSegment object of processed
            audio; if None, the original audio file at `audiofile` is copied instead

    Returns:
        None

    Raises:
        [TODO]
    """
    # Round all times to three digits; anything more is excess precision
    # polluting the output files, and usually due to float rounding errors anyway.
    for w in align_results["words"]:
        w["start"] = round(w["start"], 3)
        w["end"] = round(w["end"], 3)

    output_base = os.path.join(output_dir, output_basename)

    # Create a TextGrid object if outputting to TextGrid or eaf
    if "TextGrid" in output_formats or "eaf" in output_formats:
        audio = read_audio_from_file(audiofile)
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(align_results)
        textgrid = write_to_text_grid(words, sentences, duration)

        if "TextGrid" in output_formats:
            textgrid.to_file(output_base + ".TextGrid")

        if "eaf" in output_formats:
            textgrid.to_eaf().to_file(output_base + ".eaf")

    # Create a WebVTT object if outputting to vtt or srt
    if "srt" in output_formats or "vtt" in output_formats:
        words, sentences = return_words_and_sentences(align_results)
        cc_sentences = write_to_subtitles(sentences)
        cc_words = write_to_subtitles(words)

        if "srt" in output_formats:
            cc_sentences.save_as_srt(output_base + "_sentences.srt")
            cc_words.save_as_srt(output_base + "_words.srt")

        if "vtt" in output_formats:
            cc_words.save(output_base + "_words.vtt")
            cc_sentences.save(output_base + "_sentences.vtt")

    tokenized_xml_path = output_base + ".xml"
    save_xml(tokenized_xml_path, align_results["tokenized"])

    if "xhtml" in output_formats:
        convert_to_xhtml(align_results["tokenized"])
        tokenized_xhtml_path = output_base + ".xhtml"
        save_xml(tokenized_xhtml_path, align_results["tokenized"])

    _, audio_ext = os.path.splitext(audiofile)
    audio_path = output_base + audio_ext
    audio_format = audio_ext[1:]
    if audiosegment:
        if audio_format in ["m4a", "aac"]:
            audio_format = "ipod"
        try:
            audiosegment.export(audio_path, format=audio_format)
        except CouldntEncodeError:
            LOGGER.warning(
                f"The audio file at {audio_path} could not be exported in the "
                f"{audio_format} format. Please ensure your installation of ffmpeg "
                "has the necessary codecs."
            )
            audio_path = output_base + ".wav"
            audiosegment.export(audio_path, format="wav")
    else:
        shutil.copy(audiofile, audio_path)

    smil_path = output_base + ".smil"
    smil = make_smil(
        os.path.basename(tokenized_xml_path),
        os.path.basename(audio_path),
        align_results,
    )
    save_txt(smil_path, smil)

    if "html" in output_formats:
        html_out_path = output_base + ".html"
        html_out = create_web_component_html(tokenized_xml_path, smil_path, audio_path)
        with open(html_out_path, "w") as f:
            f.write(html_out)

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    # Copy the image files to the output's asset directory, if any are found
    if config and "images" in config:
        assets_dir = os.path.join(output_dir, "assets")
        try:
            os.mkdir(assets_dir)
        except FileExistsError:
            if not os.path.isdir(assets_dir):
                raise
        for _, image in config["images"].items():
            if image[0:4] == "http":
                LOGGER.warning(
                    f"Please make sure {image} is accessible to clients using your read-along."
                )
            else:
                try:
                    shutil.copy(image, assets_dir)
                except Exception as e:
                    LOGGER.warning(
                        f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})"
                    )
                if os.path.basename(image) != image:
                    LOGGER.warning(
                        f"Read-along images were tested with absolute urls (starting with http(s)://) "
                        f"and filenames without a path. {image} might not work as specified."
                    )
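# Illustrative sketch, not part of the library: how align_audio() and
# save_readalong() are expected to compose, based on the signatures above.
# The file names, output directory, and format list below are hypothetical.
def _example_align_and_save():
    results = align_audio(
        "story.xml",  # hypothetical tokenizable XML input
        "story.mp3",  # hypothetical audio recording
    )
    save_readalong(
        align_results=results,
        output_dir="output",  # assumed to already exist
        output_basename="story",
        audiofile="story.mp3",
        audiosegment=results["audio"],  # processed audio, or None to copy the original
        output_formats=("srt", "vtt", "TextGrid"),
    )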
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; element to create alignments for, by default 'w'
        bare (boolean): Optional; if False, split silence into adjoining tokens (default).
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: TODO

    Raises:
        TODO
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs."
        )

    # Prepare the SoundSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    audio_length_in_ms = len(audio.raw_data)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(
                prefix="readalongs_dict_", delete=False
            )
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(
                prefix="readalongs_fsg_", delete=False
            )
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(
            audio_data, word_sequence.start, word_sequence.end
        )
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)

        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text."
            )

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(
            word_sequence.start, word_sequence.end, audio_length_in_ms, removed_segments
        )

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms, curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(
                    start_ms, end_ms, curr_removed_segments
                )
                # change back to seconds to write to smil
                start = start_ms / 1000
                end = end_ms / 1000
            results["words"].append({"id": seg.word, "start": start, "end": end})
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text."
            )
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append(
                    {"begin": (last_end or seq.start), "end": (seq.start or last_end)}
                )
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({"begin": last_end, "end": last_end})
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting
        )
        split_silences(results["words"], final_end, dna_for_silence_splitting)

    words_dict = {
        x["id"]: {"start": x["start"], "end": x["end"]} for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}'
                    )
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms
                )  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (
                    audio[:endpoint] + silence_segment + audio[endpoint:]
                )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (
                    words_dict[el.attrib["id"]]["end"] * 1000
                ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in "
                f"{xml_path}, please make sure each silence element is properly "
                'formatted, e.g., <silence dur="1.5s"/>. Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
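# Illustrative sketch, not part of the library: the shape of the "do-not-align"
# configuration consumed above. The keys shown are the ones this function reads;
# the concrete values are made up, and the begin/end units are assumed to be
# milliseconds since the segments are sliced with pydub.
_example_dna_config = {
    "do-not-align": {
        "method": "remove",  # or "mute"; anything else logs an error
        "segments": [
            {"begin": 1000, "end": 2000},  # hypothetical segment to skip
            {"begin": 5500, "end": 6000},
        ],
    },
}
# e.g.: align_audio("story.xml", "story.mp3", config=_example_dna_config)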
def align_audio(
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
):
    """
    Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        Save temporary files, by default None

    #TODO: document return
    Returns
    -------
    [type]
        [description]

    #TODO: document exceptions
    Raises
    ------
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()

    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warn(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
        # change back to seconds to write to smil
        start = start_ms / 1000
        end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2

    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
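# Worked example (hypothetical numbers) of the silence splitting above when
# bare=False: the silence between two words is split at its midpoint, which
# becomes the previous word's new end and the next word's new start.
#
#   before: {"id": "w1", "start": 0.2, "end": 1.0}, {"id": "w2", "start": 1.4, "end": 2.0}
#   silence from 1.0 to 1.4 -> midpoint 1.2
#   after:  {"id": "w1", "start": 0.2, "end": 1.2}, {"id": "w2", "start": 1.2, "end": 2.0}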
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in
    directory OUTPUT_BASE/.

    TEXTFILE:    Input text file path (in XML, or plain text with -i)

    AUDIOFILE:   Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Base name for output files
    """
    config = kwargs.get("config", None)
    if config:
        if config.endswith("json"):
            try:
                with open(config) as f:
                    config = json.load(f)
            except json.decoder.JSONDecodeError:
                LOGGER.error(f"Config file at {config} is not valid json.")
        else:
            raise click.BadParameter(f"Config file '{config}' must be in JSON format")

    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and
    # user-friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        )

    output_basename = os.path.basename(output_dir)
    output_base = os.path.join(output_dir, output_basename)
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    if kwargs["text_input"]:
        if not kwargs["language"]:
            LOGGER.warn("No input language provided, using undetermined mapping")
        tempfile, kwargs["textfile"] = create_input_tei(
            input_file_name=kwargs["textfile"],
            text_language=kwargs["language"],
            save_temps=temp_base,
        )
    if kwargs["output_xhtml"]:
        tokenized_xml_path = "%s.xhtml" % output_base
    else:
        _, input_ext = os.path.splitext(kwargs["textfile"])
        tokenized_xml_path = "%s%s" % (output_base, input_ext)
    if os.path.exists(tokenized_xml_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % tokenized_xml_path
        )
    smil_path = output_base + ".smil"
    if os.path.exists(smil_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % smil_path
        )
    _, audio_ext = os.path.splitext(kwargs["audiofile"])
    audio_path = output_base + audio_ext
    if os.path.exists(audio_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % audio_path
        )
    unit = kwargs.get("unit", "w")
    bare = kwargs.get("bare", False)
    if not unit:
        # .get() above should handle this, but apparently not, given how kwargs is
        # implemented: unit could still be None here.
        unit = "w"
    try:
        results = align_audio(
            kwargs["textfile"],
            kwargs["audiofile"],
            unit=unit,
            bare=bare,
            config=config,
            save_temps=temp_base,
        )
    except RuntimeError as e:
        LOGGER.error(e)
        exit(1)

    if kwargs["text_grid"]:
        audio = read_audio_from_file(kwargs["audiofile"])
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(results)
        textgrid = write_to_text_grid(words, sentences, duration)
        textgrid.to_file(output_base + ".TextGrid")
        textgrid.to_eaf().to_file(output_base + ".eaf")

    if kwargs["closed_captioning"]:
        words, sentences = return_words_and_sentences(results)
        webvtt_sentences = write_to_subtitles(sentences)
        webvtt_sentences.save(output_base + "_sentences.vtt")
        webvtt_sentences.save_as_srt(output_base + "_sentences.srt")
        webvtt_words = write_to_subtitles(words)
        webvtt_words.save(output_base + "_words.vtt")
        webvtt_words.save_as_srt(output_base + "_words.srt")

    if kwargs["output_xhtml"]:
        convert_to_xhtml(results["tokenized"])

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    save_xml(tokenized_xml_path, results["tokenized"])
    smil = make_smil(
        os.path.basename(tokenized_xml_path), os.path.basename(audio_path), results
    )
    shutil.copy(kwargs["audiofile"], audio_path)
    save_txt(smil_path, smil)
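# Illustrative sketch (hypothetical names) of what align() writes, based on the
# paths constructed above, when OUTPUT_BASE is "story", the text input is XML, and
# the audio is an mp3. The option names are inferred from the kwargs keys and may
# differ from the actual CLI flags.
#
#   story/
#       index.html            # minimal index page
#       story.xml             # tokenized text (story.xhtml with output_xhtml)
#       story.smil            # alignment written as SMIL
#       story.mp3             # copy of the input audio
#       story_sentences.vtt   # with closed_captioning, plus .srt and _words variants
#       story.TextGrid        # with text_grid, plus story.eaf
#       tempfiles/            # only with save_temps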
def setUp(self):
    super().setUp()
    self.audio_segment = read_audio_from_file(
        os.path.join(self.data_dir, "audio_sample.ogg")
    )
    self.noisy_segment = read_audio_from_file(
        os.path.join(self.data_dir, "noise_at_1500.mp3")
    )