Example No. 1
    def test_tok_all_words(self):
        """By default, all words should get tokenized"""

        txt = """<document xml:lang="fra">
<s>Bonjour! Comment ça va?</s>
<s>Voici une deuxième phrase.</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        as_txt = etree.tounicode(tokenized)
        # print(etree.tounicode(tokenized))

        ref = """<document xml:lang="fra">
<s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s>
</document>"""
        # print('as_txt="' + as_txt +'"')
        # print('ref="' + ref +'"')
        self.assertEqual(as_txt, ref)

        with_ids = add_ids(tokenized)
        ids_as_txt = etree.tounicode(with_ids)
        # print('with ids="' + ids_as_txt + '"')
        ref_with_ids = """<document xml:lang="fra">
<s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s>
<s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s>
</document>"""
        self.assertEqual(ids_as_txt, ref_with_ids)
Example No. 2
    def test_dna_word_nested(self):
        """You also can't have a <w> element inside a DNA element"""

        txt = """<s xml:lang="fra">Une <foo do-not-align="true"><bar><w>exclude</w></bar></foo> phrase.</s>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        self.assertRaises(RuntimeError, add_ids, tokenized)
Example No. 3
    def test_tok_div_p_s(self):
        """Text inside a DNA div, p or s does not get tokenized"""

        txt = """<document xml:lang="fra">
<div>
<p> <s>Une phrase.</s> </p>
<p> <s>Deux phrases.</s> </p>
</div>
<div do-not-align="TRUE">
<p> <s>Une phrase.</s> </p>
<p> <s>Deux phrases.</s> </p>
</div>
<div>
<p do-not-align="1"> <s>Une phrase.</s> </p>
<p> <s do-not-align="true">Deux phrases.</s> </p>
<p> <s>Trois phrases.</s> </p>
</div>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        as_txt = etree.tounicode(tokenized)
        # print('as_txt="' + as_txt +'"')

        ref = """<document xml:lang="fra">
<div>
<p> <s><w>Une</w> <w>phrase</w>.</s> </p>
<p> <s><w>Deux</w> <w>phrases</w>.</s> </p>
</div>
<div do-not-align="TRUE">
<p> <s>Une phrase.</s> </p>
<p> <s>Deux phrases.</s> </p>
</div>
<div>
<p do-not-align="1"> <s>Une phrase.</s> </p>
<p> <s do-not-align="true">Deux phrases.</s> </p>
<p> <s><w>Trois</w> <w>phrases</w>.</s> </p>
</div>
</document>"""
        self.assertEqual(as_txt, ref)

        with_ids = add_ids(tokenized)
        ids_as_txt = etree.tounicode(with_ids)
        # print('with ids="' + ids_as_txt + '"')

        ref_with_ids = """<document xml:lang="fra">
<div id="d0">
<p id="d0p0"> <s id="d0p0s0"><w id="d0p0s0w0">Une</w> <w id="d0p0s0w1">phrase</w>.</s> </p>
<p id="d0p1"> <s id="d0p1s0"><w id="d0p1s0w0">Deux</w> <w id="d0p1s0w1">phrases</w>.</s> </p>
</div>
<div do-not-align="TRUE">
<p> <s>Une phrase.</s> </p>
<p> <s>Deux phrases.</s> </p>
</div>
<div id="d1">
<p do-not-align="1"> <s>Une phrase.</s> </p>
<p id="d1p0"> <s do-not-align="true">Deux phrases.</s> </p>
<p id="d1p1"> <s id="d1p1s0"><w id="d1p1s0w0">Trois</w> <w id="d1p1s0w1">phrases</w>.</s> </p>
</div>
</document>"""
        self.assertEqual(ids_as_txt, ref_with_ids)
Example No. 4
    def test_tok_some_words(self):
        """do-not-align text is excluded from tokenization"""

        txt = """<document xml:lang="fra">
<p><s>Bonjour! Comment ça va?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s>Un <foo do-not-align="1">mot ou deux</foo> à exclure.</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        as_txt = etree.tounicode(tokenized)
        # print('as_txt="' + as_txt +'"')

        ref = """<document xml:lang="fra">
<p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w>à</w> <w>exclure</w>.</s>
</document>"""
        self.assertEqual(as_txt, ref)

        with_ids = add_ids(tokenized)
        ids_as_txt = etree.tounicode(with_ids)
        # print('with ids="' + ids_as_txt + '"')
        ref_with_ids = """<document xml:lang="fra">
<p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1">à</w> <w id="s0w2">exclure</w>.</s>
</document>"""
        self.assertEqual(ids_as_txt, ref_with_ids)
Example No. 5
    def test_dna_word(self):
        """You can't have a DNA <w> element, that's reserved for tokens to align"""

        txt = """<s xml:lang="fra">Une <w do-not-align="true">exclude</w> phrase.</s>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        self.assertRaises(RuntimeError, add_ids, tokenized)
Example No. 6
    def test_simple(self):
        txt = """<document>
<s xml:lang="atj">Kwei! Tan e ici matisihin?</s>
</document>
"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        print(etree.tounicode(tokenized))
Example No. 7
def end_to_end(xml, input_filename, unit, word_unit, out_orth):
    xml = tokenize_xml(xml)
    xml = add_ids(xml)
    converted_xml, valid = convert_xml(xml, word_unit, out_orth)
    # save_xml("test.xml", converted_xml)
    fsg = make_fsg(converted_xml, input_filename, unit)
    pronouncing_dictionary = make_dict(converted_xml, input_filename, unit)
    return xml, fsg, pronouncing_dictionary
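
The end_to_end helper above chains the whole text-side pipeline: tokenization, id assignment, g2p conversion, then FSG and pronouncing-dictionary generation. A minimal sketch of driving it directly, assuming the same imports as the snippet; the file name, unit values and output orthography below are illustrative assumptions, not taken from the source:

from lxml import etree

# Hypothetical driver for the end_to_end() helper defined above.
xml = etree.parse("story.xml").getroot()
xml_with_ids, fsg, pronouncing_dictionary = end_to_end(
    xml,
    input_filename="story.xml",   # placeholder path, reused in generated headers/messages
    unit="w",                     # element level the FSG and dictionary are built for
    word_unit="w",                # element treated as a word token by convert_xml
    out_orth="eng-arpabet",       # assumed output orthography name
)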
Example No. 8
def tokenize(**kwargs):
    """Tokenize XMLFILE for 'readalongs align' into TOKFILE.
    XMLFILE should have been produced by 'readalongs prepare'.
    TOKFILE can be augmented with word-specific language codes.
    'readalongs align' can be called with either XMLFILE or TOKFILE as XML input.

    XMLFILE: Path to the XML file to tokenize, or - for stdin

    TOKFILE: Output path for the tok'd XML, or - for stdout [default: XMLFILE.tokenized.xml]
    """
    xmlfile = kwargs["xmlfile"]

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
        LOGGER.info(
            "Running readalongs tokenize(xmlfile={}, tokfile={}, force-overwrite={})."
            .format(
                kwargs["xmlfile"],
                kwargs["tokfile"],
                kwargs["force_overwrite"],
            ))

    if not kwargs["tokfile"]:
        try:
            output_tok_path = xmlfile.name
        except Exception:
            output_tok_path = "<stdin>"
        if output_tok_path == "<stdin>":
            output_tok_path = "-"
        else:
            if output_tok_path.endswith(".xml"):
                output_tok_path = output_tok_path[:-4]
            output_tok_path += ".tokenized.xml"
    else:
        output_tok_path = kwargs["tokfile"]
        if not output_tok_path.endswith(".xml") and not output_tok_path == "-":
            output_tok_path += ".xml"

    if os.path.exists(output_tok_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." %
            output_tok_path)

    try:
        xml = etree.parse(xmlfile).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (xmlfile, e))

    xml = tokenize_xml(xml)

    if output_tok_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_tok_path, xml)
    LOGGER.info("Wrote {}".format(output_tok_path))
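
As listed here, tokenize() is the body of the 'readalongs tokenize' command and receives its options as keyword arguments. A hedged sketch of calling it directly, without the click option parsing that normally fills in those arguments; the file names are placeholders:

# Hypothetical direct call; in practice these arguments come from the
# 'readalongs tokenize' command line.
with open("story.xml", "rb") as xmlfile:
    tokenize(
        xmlfile=xmlfile,
        tokfile="story.tokenized",  # ".xml" is appended when missing
        force_overwrite=True,       # skip the existing-output-file check
        debug=False,
    )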
Example No. 9
    def test_comments(self):
        txt = """<document>
<s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s>
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>
"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        print(etree.tounicode(tokenized))
Example No. 10
    def test_mixed_lang(self):
        txt = """<document>
<s xml:lang="atj">Kwei! Tan e ici matisihin?</s>
<s xml:lang="fra">Bonjour! Comment ça va?</s>
</document>
"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        print(etree.tounicode(tokenized))
Example No. 11
    def test_simple(self):
        txt = """<document>
<s xml:lang="atj">Kwei! Tan e ici matisihin?</s>
</document>
"""
        ref = """<document>
<s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        # print(etree.tounicode(tokenized))
        self.assertEqual(etree.tounicode(tokenized), ref)
Example No. 12
    def test_mixed_words(self):
        txt = """<document>
<s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s>
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>
"""
        ref = """<document>
<s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s>
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        # print(etree.tounicode(tokenized))
        self.assertEqual(etree.tounicode(tokenized), ref)
Example No. 13
    def test_mixed_lang(self):
        txt = """<document>
<s xml:lang="atj">Kwei! Tan e ici matisihin?</s>
<s xml:lang="fra">Bonjour! Comment ça va?</s>
</document>
"""
        ref = """<document>
<s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s>
<s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        # print(etree.tounicode(tokenized))
        self.assertEqual(etree.tounicode(tokenized), ref)
Example No. 14
    def test_mixed_words(self):
        """Tokenization should be bypassed when <w> elements are already found in the input"""
        txt = """<document>
<s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s>
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>
"""
        ref = """<document>
<s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s>
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        # print(etree.tounicode(tokenized))
        self.assertEqual(etree.tounicode(tokenized), ref)
Example No. 15
    def test_comments(self):
        """Make sure tokenize_xml ignores stuff inside comments"""
        txt = """<document>
<s xml:lang="atj">Kwei! (<subsent xml:lang="fra">Bonjour</subsent>!)</s>
<!--<s>comments</s> <w>should</w> <p>be ignored</p>-->
<s xml:lang="atj">Tan e ici matisihin?</s>
</document>
"""
        ref = """<document>
<s xml:lang="atj"><w>Kwei</w>! (<subsent xml:lang="fra"><w>Bonjour</w></subsent>!)</s>
<!--<s>comments</s> <w>should</w> <p>be ignored</p>-->
<s xml:lang="atj"><w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s>
</document>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        # print(etree.tounicode(tokenized))
        self.assertEqual(etree.tounicode(tokenized), ref)
Example No. 16
    def test_dna_word(self):
        txt = """<s xml:lang="fra">Une <w do-not-align="true">exclude</w> phrase.</s>"""
        xml = etree.fromstring(txt)
        tokenized = tokenize_xml.tokenize_xml(xml)
        self.assertRaises(RuntimeError, add_ids, tokenized)
Example No. 17
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """ Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    #TODO: document return
    Returns
    -------
    [type]
        [description]

    #TODO: document exceptions
    Raises
    ------
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warn(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
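
A short sketch of invoking this version of align_audio and reading back the word timings it returns; the paths are placeholders and the import location is an assumption:

# Hypothetical invocation; "story.xml" / "story.mp3" are placeholder paths.
results = align_audio("story.xml", "story.mp3", unit="w", bare=False)
for word in results["words"]:
    # each entry holds the token id and its start/end time in seconds
    print(word["id"], word["start"], word["end"])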
Example No. 18
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: TODO

    Raises:
        TODO
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." %
                           (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    audio_length_in_ms = len(audio.raw_data)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio,
                                             int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
            removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)
        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms, removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms,
                                                 curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
                # change back to seconds to write to smil
                start = start_ms / 1000
                end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)

        split_silences(results["words"], final_end, dna_for_silence_splitting)
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>.  Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
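
This later revision aligns one word sequence per anchor-delimited section and can insert <silence dur="..."/> pauses into the audio. A sketch of consuming its richer result dict, with placeholder paths and a placeholder save_temps prefix:

# Hypothetical invocation of the revised align_audio.
results = align_audio(
    "story.xml",
    "story.mp3",
    unit="w",
    save_temps="/tmp/story",       # placeholder prefix for the temporary files
    verbose_g2p_warnings=True,
)
word_timings = results["words"]        # list of {"id", "start", "end"} dicts
tokenized_xml = results["tokenized"]   # tokenized XML tree with ids added
patched_audio = results["audio"]       # AudioSegment with inserted silences, or None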