def end_to_end(xml, input_filename, unit, word_unit, out_orth, mapping_dir=None):
    """Run the full text-processing pipeline on an XML document.

    Adds language ids, tokenizes, assigns element ids, G2P-converts the
    text, then builds the FSG and pronouncing dictionary needed for
    alignment.

    Args:
        xml: Parsed XML root element (TEI-like format).
        input_filename: Name of the original input file (used when
            generating the FSG and dictionary).
        unit: Element unit for FSG/dictionary generation (e.g. 'w').
        word_unit: Element unit passed to the G2P conversion.
        out_orth: Output orthography for the G2P conversion.
        mapping_dir: Optional directory of G2P mappings; None uses the
            default mappings.

    Returns:
        Tuple of (id-tagged XML, FSG text, pronouncing dictionary text).
        Note the returned XML is the pre-conversion (id-tagged) tree; the
        converted tree is only used internally for FSG/dict generation.
    """
    xml = add_lang_ids(xml, mapping_dir, unit="p")
    xml = tokenize_xml(xml, mapping_dir)
    xml = add_ids(xml)
    converted_xml = convert_xml(xml, word_unit, out_orth, mapping_dir=mapping_dir)
    fsg = make_fsg(converted_xml, input_filename, unit)
    pronouncing_dictionary = make_dict(converted_xml, input_filename, unit)
    return xml, fsg, pronouncing_dictionary
def testConvert(self):
    """G2P conversion of tokenized, id-tagged XML matches the expected output."""
    tokenized = tokenize_xml(self.xml)
    with_ids = add_ids(tokenized)
    converted = convert_xml(with_ids)
    self.assertEqual(etree.tounicode(converted), EXPECTED_CONVERTED)
def align_audio(xml_path, wav_path, unit='w', save_temps=None, mapping_dir=None):
    """Align an XML input file to an audio file.

    Args:
        xml_path: Path to XML input file in TEI-like format
        wav_path: Path to audio input (WAV or MP3)
        unit: Element to create alignments for.
        save_temps: Basename for intermediate output files (or None if
            they won't be saved)
        mapping_dir: Optional directory of G2P mappings; None uses the
            default mappings.

    Returns:
        Dict with a "words" list of {"id", "start", "end"} entries (times
        in seconds), plus a "tokenized" entry holding the id-tagged XML.

    Raises:
        RuntimeError: If alignment produced no words.
    """
    results = {"words": []}

    # First do G2P.
    # BUG FIX: `mapping_dir` was previously an undefined name here (it is a
    # parameter of end_to_end, not a global), causing a NameError; it is now
    # an explicit keyword parameter defaulting to None.
    xml = etree.parse(xml_path).getroot()
    xml = add_lang_ids(xml, mapping_dir, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + '.tokenized.xml', xml)
    results['tokenized'] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + '.ids.xml', xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + '.g2p.xml', xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + '.dict', 'wb')
    else:
        dict_file = NamedTemporaryFile(prefix='readalongs_dict_', delete=False)
    dict_file.write(dict_data.encode('utf-8'))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + '.fsg', 'wb')
    else:
        fsg_file = NamedTemporaryFile(prefix='readalongs_fsg_', delete=False)
    fsg_file.write(fsg_data.encode('utf-8'))
    fsg_file.flush()

    # Now do alignment
    cfg = pocketsphinx.Decoder.default_config()
    model_path = pocketsphinx.get_model_path()
    cfg.set_boolean('-remove_noise', False)
    cfg.set_boolean('-remove_silence', False)
    cfg.set_string('-hmm', os.path.join(model_path, 'en-us'))
    cfg.set_string('-dict', dict_file.name)
    cfg.set_string('-fsg', fsg_file.name)
    # Very wide beams so the decoder follows the FSG rather than pruning.
    cfg.set_float('-beam', 1e-100)
    cfg.set_float('-wbeam', 1e-80)

    _, wav_ext = os.path.splitext(wav_path)
    if wav_ext == '.wav':
        with wave.open(wav_path) as wav:
            logging.info("Read %s: %d frames (%f seconds) audio"
                         % (wav_path, wav.getnframes(),
                            wav.getnframes() / wav.getframerate()))
            raw_data = wav.readframes(wav.getnframes())
            # Downsampling is (probably) not necessary
            cfg.set_float('-samprate', wav.getframerate())
    else:
        # Try pydub, it might fail
        audio = pydub.AudioSegment.from_file(wav_path)
        audio = audio.set_channels(1).set_sample_width(2)
        # Downsampling is (probably) not necessary
        cfg.set_float('-samprate', audio.frame_rate)
        raw_data = audio.raw_data

    # Round the FFT size up to the next power of two >= samples per window.
    frame_points = int(cfg.get_float('-samprate') * cfg.get_float('-wlen'))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int('-nfft', fft_size)

    ps = pocketsphinx.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int('-frate')

    def frames_to_time(frames):
        # Convert decoder frame indices to seconds.
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        if seg.word in ('<sil>', '[NOISE]'):
            continue
        else:
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
        logging.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        final_end = end

    # FIXME: should have the same number of outputs as inputs
    if len(results['words']) == 0:
        raise RuntimeError("Alignment Failed, please examine "
                           "dictionary and input audio and text.")

    # Split adjoining silence/noise between words
    last_end = 0.0
    last_word = None  # previously dict(); None makes the sentinel checks honest
    for word in results['words']:
        silence = word['start'] - last_end
        midpoint = last_end + silence / 2
        if silence > 0:
            if last_word:
                last_word['end'] = midpoint
            word['start'] = midpoint
        last_word = word
        last_end = word['end']
    # Extend the last word halfway into any trailing silence.
    silence = final_end - last_end
    if silence > 0:
        if last_word is not None:
            last_word['end'] += silence / 2

    dict_file.close()
    os.unlink(dict_file.name)
    fsg_file.close()
    os.unlink(fsg_file.name)

    return results
def testAddIDs(self):
    """Tokenized XML gains the expected id attributes."""
    tagged = add_ids(tokenize_xml(self.xml))
    self.assertEqual(etree.tounicode(tagged), EXPECTED_IDS)