Example #1
    def test_convert_xml_invalid(self):
        """test readalongs.text.convert_xml.convert_xml() with invalid input"""
        xml = etree.fromstring('<s><w ARPABET="V AA L IY D">valid</w></s>')
        c_xml, valid = convert_xml(xml)
        self.assertEqual(etree.tounicode(c_xml),
                         '<s><w ARPABET="V AA L IY D">valid</w></s>')
        self.assertTrue(valid, "convert_xml with valid pre-g2p'd text")

        xml = etree.fromstring('<s><w ARPABET="invalid">invalid</w></s>')
        c_xml, valid = convert_xml(xml)
        self.assertEqual(etree.tounicode(c_xml),
                         '<s><w ARPABET="invalid">invalid</w></s>')
        self.assertFalse(valid, "convert_xml with invalid pre-g2p'd text")
Example #2
def end_to_end(xml, input_filename, unit, word_unit, out_orth):
    """Tokenize the XML, add ids, g2p it, then build the FSG and pronouncing dictionary."""
    xml = tokenize_xml(xml)
    xml = add_ids(xml)
    converted_xml, valid = convert_xml(xml, word_unit, out_orth)
    # save_xml("test.xml", converted_xml)
    fsg = make_fsg(converted_xml, input_filename, unit)
    pronouncing_dictionary = make_dict(converted_xml, input_filename, unit)
    return xml, fsg, pronouncing_dictionary
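
A hedged usage sketch of this helper: the file name, unit values and output orthography below are illustrative assumptions, and end_to_end plus the tokenize_xml/add_ids/convert_xml/make_fsg/make_dict helpers are assumed to be importable from the readalongs package as in the surrounding examples.

from lxml import etree

# Illustrative values only; "story.xml", "s", "w" and "eng-arpabet" are assumptions.
xml = etree.parse("story.xml").getroot()
ids_xml, fsg, pronouncing_dictionary = end_to_end(
    xml,            # parsed TEI-like XML
    "story.xml",    # input filename, used for error reporting
    "s",            # unit the FSG is built over
    "w",            # word-level unit handed to convert_xml
    "eng-arpabet",  # target orthography for the g2p conversion
)
print(fsg)
print(pronouncing_dictionary)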
Example #3
    def test_invalid_langs_in_xml(self):
        xml = etree.fromstring("""
            <s>
            <w lang="eng" fallback-langs="foo">français falls back to invalid foo</w>
            <w lang="crx-syl">no path to arpabet</w>
            </s>
        """)
        with self.assertLogs(LOGGER, level="WARNING") as cm:
            c_xml, valid = convert_xml(xml)
        self.assertFalse(valid)
        logger_output = "\n".join(cm.output)
        self.assertIn('"foo": invalid language code', logger_output)
        self.assertIn('"crx-syl": no path to "eng-arpabet"', logger_output)
Example #4
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: results dictionary with the keys
            "words": list of dicts with "id", "start" and "end" (in seconds)
            for each aligned unit;
            "tokenized": the tokenized XML with ids added;
            "audio": the audio with requested silences inserted, or None if
            the XML contains no silence elements

    Raises:
        RuntimeError: if the XML input cannot be parsed, if some words could
            not be g2p'd, if alignment produces no segments or only noise and
            silence, or if a silence element has an invalid duration
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." %
                           (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    audio_length_in_ms = len(audio)  # pydub AudioSegment length is in ms
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
            # Fall back to the unmodified audio so audio_data below is defined
            processed_audio = audio
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio,
                                             int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
            removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
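    # For example (illustrative numbers): with the typical -frate of 100, frame_size
    # is 0.01 s, so a boundary reported at frame 250 corresponds to 2.5 s, i.e.
    # 2500 ms in pydub's millisecond-sliced view of the audio.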
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)
        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms, removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms,
                                                 curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
                # change back to seconds to write to smil
                start = start_ms / 1000
                end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)

        split_silences(results["words"], final_end, dna_for_silence_splitting)
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "dur" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>.  Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
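
For reference, a minimal hedged sketch of how align_audio might be invoked and its results consumed; the file paths, the config dictionary and the assumption that do-not-align boundaries are given in milliseconds are illustrative, inferred from the code above rather than documented behaviour.

# Hedged usage sketch; file paths and config values are assumptions.
config = {
    "do-not-align": {
        "method": "remove",  # or "mute"
        "segments": [{"begin": 1000, "end": 2000}],  # assumed milliseconds
    }
}
results = align_audio(
    "story.xml",   # TEI-like XML input
    "story.mp3",   # audio in any format ffmpeg can decode
    unit="w",
    config=config,
    verbose_g2p_warnings=True,
)
for word in results["words"]:
    # Each entry pairs an XML element id with start/end times in seconds.
    print(word["id"], word["start"], word["end"])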
Example #5
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """ Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    Returns
    -------
    Dict[str, List]
        Results dictionary with the keys "words" (a list of dicts with "id",
        "start" and "end" in seconds for each aligned unit) and "tokenized"
        (the tokenized XML with ids added).

    Raises
    ------
    RuntimeError
        If the XML input cannot be parsed, if alignment produces no segments
        or only noise/silence segments, or if the number of aligned segments
        does not match the number of tokens in the input.
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
            # Fall back to the unmodified audio so raw_data below is defined
            processed_audio = audio
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
Example #6
    def run_convert_xml(self, input_string):
        """wrap convert_xml to make unit testing easier"""
        return etree.tounicode(convert_xml(etree.fromstring(input_string))[0])
Example #7
File: cli.py  Project: ReadAlongs/Studio
def g2p(**kwargs):
    """Apply g2p mappings to TOKFILE into G2PFILE.

    TOKFILE should have been produced by 'readalongs tokenize'.
    G2PFILE can then be modified to adjust the phonetic representation as needed.
    'readalongs align' can be called with G2PFILE instead of TOKFILE as XML input.

    The g2p cascade will be enabled whenever an XML element or any of its
    ancestors in TOKFILE has the attribute "fallback-langs" containing a comma-
    or colon-separated list of language codes. Provide multiple language codes to
    "readalongs prepare" via its -l option to generate this attribute globally,
    or add it manually where needed. Undetermined, "und", is automatically
    added at the end of the language list provided via -l.

    With the g2p cascade, if a word cannot be mapped to valid ARPABET with the
    language found in the "xml:lang" attribute, the languages in
    "fallback-langs" are tried in order until a valid ARPABET mapping is
    generated.

    The output XML file can be used as input to align.

    TOKFILE: Path to the input tokenized XML file, or - for stdin

    G2PFILE: Output path for the g2p'd XML, or - for stdout [default: TOKFILE
    with .g2p. inserted]
    """
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
        LOGGER.info(
            "Running readalongs g2p(tokfile={}, g2pfile={}, force-overwrite={})."
            .format(
                kwargs["tokfile"],
                kwargs["g2pfile"],
                kwargs["force_overwrite"],
            ))

    input_file = kwargs["tokfile"]

    if not kwargs["g2pfile"]:
        output_path = get_click_file_name(input_file)
        if output_path != "-":
            if output_path.endswith(".xml"):
                output_path = output_path[:-4]
            if output_path.endswith(".tokenized"):
                output_path = output_path[:-len(".tokenized")]
            output_path += ".g2p.xml"
    else:
        output_path = kwargs["g2pfile"]
        if not output_path.endswith(".xml") and not output_path == "-":
            output_path += ".xml"

    if os.path.exists(output_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." %
            output_path)

    try:
        xml = etree.parse(input_file).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (get_click_file_name(input_file), e))

    # Add the IDs to paragraph, sentences, word, etc.
    xml = add_ids(xml)

    # Apply the g2p mappings.
    xml, valid = convert_xml(
        xml,
        verbose_warnings=kwargs["g2p_verbose"],
    )

    if output_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_path, xml)
        LOGGER.info("Wrote {}".format(output_path))

    if not valid:
        LOGGER.error("Some word(s) could not be g2p'd correctly." + (
            " Run again with --g2p-verbose to get more detailed error messages."
            if not kwargs["g2p_verbose"] else ""))
        sys.exit(1)
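
The g2p cascade described in the docstring can also be exercised directly with convert_xml; here is a minimal hedged sketch, where the language codes, the fallback list and the word itself are made up for illustration and the attribute names follow the tests shown earlier.

from lxml import etree

# Hypothetical fragment: "fra" with a fallback cascade to "eng" and then "und".
xml = etree.fromstring(
    '<s><w lang="fra" fallback-langs="eng,und">bonjour</w></s>'
)
xml = add_ids(xml)
converted, valid = convert_xml(xml, verbose_warnings=True)
print(etree.tounicode(converted))
print("g2p succeeded" if valid else "some word(s) could not be g2p'd")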