Example #1
# End-to-end text pipeline: language-ID tagging, tokenization, ID assignment,
# G2P conversion, then FSG and pronouncing-dictionary generation; the helper
# functions are assumed to come from the readalongs package.
def end_to_end(xml,
               input_filename,
               unit,
               word_unit,
               out_orth,
               mapping_dir=None):
    xml = add_lang_ids(xml, mapping_dir, unit="p")
    xml = tokenize_xml(xml, mapping_dir)
    xml = add_ids(xml)
    converted_xml = convert_xml(xml,
                                word_unit,
                                out_orth,
                                mapping_dir=mapping_dir)
    fsg = make_fsg(converted_xml, input_filename, unit)
    pronouncing_dictionary = make_dict(converted_xml, input_filename, unit)
    return xml, fsg, pronouncing_dictionary
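A minimal usage sketch for end_to_end, assuming an lxml-parsed input document; the file name, unit names, and the eng-arpabet output orthography are illustrative, not prescribed by the snippet above:

from lxml import etree

xml = etree.parse("story.xml").getroot()          # hypothetical input file
xml, fsg, pronouncing_dictionary = end_to_end(
    xml,
    input_filename="story.xml",
    unit="s",                 # sentence-level alignment unit (assumed)
    word_unit="w",            # word-level element name (assumed)
    out_orth="eng-arpabet",   # illustrative output orthography
)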
Example #2
def testConvert(self):
    xml = tokenize_xml(self.xml)
    xml = add_ids(xml)
    xml = convert_xml(xml)
    self.assertEqual(etree.tounicode(xml), EXPECTED_CONVERTED)
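For context, a fixture like self.xml would typically be built in setUp; a minimal sketch with an assumed TEI-like snippet (the element names and the EXPECTED_CONVERTED constant are placeholders):

import unittest
from lxml import etree

class TestConvert(unittest.TestCase):
    def setUp(self):
        # Hypothetical minimal document; real tests would load a fuller fixture
        self.xml = etree.fromstring(
            '<document><s xml:lang="eng">hello world</s></document>')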
Example #3
# Imports needed by this snippet; the readalongs helpers used below
# (add_lang_ids, tokenize_xml, add_ids, convert_xml, make_dict, make_fsg,
# save_xml) are assumed to be importable from that package.
import io
import logging
import os
import wave
from tempfile import NamedTemporaryFile

import pocketsphinx
import pydub
from lxml import etree


def align_audio(xml_path, wav_path, unit='w', save_temps=None,
                mapping_dir=None):
    """
    Align an XML input file to an audio file.

    Args:
      xml_path: Path to XML input file in TEI-like format
      wav_path: Path to audio input (WAV or MP3)
      unit: Element to create alignments for.
      save_temps: Basename for intermediate output files (or
    None if they won't be saved)
  mapping_dir: Optional directory of language-mapping files passed
    to add_lang_ids
    """
    results = {"words": []}

    # First do G2P
    xml = etree.parse(xml_path).getroot()
    xml = add_lang_ids(xml, mapping_dir, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + '.tokenized.xml', xml)
    results['tokenized'] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + '.ids.xml', xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + '.g2p.xml', xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + '.dict', 'wb')
    else:
        dict_file = NamedTemporaryFile(prefix='readalongs_dict_', delete=False)
    dict_file.write(dict_data.encode('utf-8'))
    dict_file.flush()

    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + '.fsg', 'wb')
    else:
        fsg_file = NamedTemporaryFile(prefix='readalongs_fsg_', delete=False)
    fsg_file.write(fsg_data.encode('utf-8'))
    fsg_file.flush()

    # Now do alignment
    cfg = pocketsphinx.Decoder.default_config()
    model_path = pocketsphinx.get_model_path()
    cfg.set_boolean('-remove_noise', False)
    cfg.set_boolean('-remove_silence', False)
    cfg.set_string('-hmm', os.path.join(model_path, 'en-us'))
    cfg.set_string('-dict', dict_file.name)
    cfg.set_string('-fsg', fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float('-beam', 1e-100)
    cfg.set_float('-wbeam', 1e-80)

    _, wav_ext = os.path.splitext(wav_path)
    if wav_ext == '.wav':
        with wave.open(wav_path) as wav:
            logging.info("Read %s: %d frames (%f seconds) audio"
                         % (wav_path, wav.getnframes(), wav.getnframes()
                            / wav.getframerate()))
            raw_data = wav.readframes(wav.getnframes())
            # Downsampling is (probably) not necessary
            cfg.set_float('-samprate', wav.getframerate())
    else:  # Try pydub, it might fail
        audio = pydub.AudioSegment.from_file(wav_path)
        audio = audio.set_channels(1).set_sample_width(2)
        # Downsampling is (probably) not necessary
        cfg.set_float('-samprate', audio.frame_rate)
        raw_data = audio.raw_data

    # Round the FFT size up to the next power of two at least as large as
    # one analysis window (window length in seconds times the sample rate)
    frame_points = int(cfg.get_float('-samprate')
                       * cfg.get_float('-wlen'))
    fft_size = 1
    while fft_size < frame_points:
        fft_size <<= 1
    cfg.set_int('-nfft', fft_size)
    ps = pocketsphinx.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int('-frate')

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    # Collect word segments, skipping silence and noise entries; `end` is
    # initialized so an empty segmentation cannot leave it undefined below
    end = 0.0
    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        if seg.word in ('<sil>', '[NOISE]'):
            continue
        results["words"].append({
            "id": seg.word,
            "start": start,
            "end": end
        })
        logging.info("Segment: %s (%.3f : %.3f)",
                     seg.word, start, end)
    final_end = end

    # FIXME: should have the same number of outputs as inputs
    if len(results['words']) == 0:
        raise RuntimeError("Alignment failed; please examine the "
                           "dictionary and the input audio and text.")

    # Split adjoining silence/noise between words: move each word's start
    # back to the midpoint of the preceding gap, and extend the previous
    # word's end forward to the same midpoint
    last_end = 0.0
    last_word = None
    for word in results['words']:
        silence = word['start'] - last_end
        midpoint = last_end + silence / 2
        if silence > 0:
            if last_word is not None:
                last_word['end'] = midpoint
            word['start'] = midpoint
        last_word = word
        last_end = word['end']
    # Give the last word half of any trailing silence
    silence = final_end - last_end
    if silence > 0 and last_word is not None:
        last_word['end'] += silence / 2

    dict_file.close()
    os.unlink(dict_file.name)
    fsg_file.close()
    os.unlink(fsg_file.name)

    return results
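A usage sketch for align_audio, with hypothetical input paths; it prints each aligned word's element ID and its time span in seconds:

import logging

logging.basicConfig(level=logging.INFO)
results = align_audio("story.xml", "story.wav", unit="w")  # hypothetical paths
for word in results["words"]:
    print("%s: %.3f .. %.3f" % (word["id"], word["start"], word["end"]))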
Example #4
def testAddIDs(self):
    xml = tokenize_xml(self.xml)
    xml = add_ids(xml)
    self.assertEqual(etree.tounicode(xml), EXPECTED_IDS)