def frontend():
    """Build a front-end-only LwaziVoice (no synthesizer) and pickle it."""
    from ttslab.defaultvoice import LwaziVoice
    resources = dict(phoneset=ttslab.fromfile(PHONESET_FILE),
                     g2p=ttslab.fromfile(G2P_FILE),
                     pronundict=ttslab.fromfile(PRONUNDICT_FILE),
                     pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE))
    voice = LwaziVoice(**resources)
    ttslab.tofile(voice, "frontend.voice.pickle")
def us():
    """Build a LwaziUSVoice with a unit-selection synthesizer and pickle it."""
    from ttslab.defaultvoice import LwaziUSVoice
    from ttslab.synthesizer_us import SynthesizerUS
    synth = SynthesizerUS(voice=None, unitcatalogue=ttslab.fromfile(USCATALOGUE_FILE))
    voice = LwaziUSVoice(phoneset=ttslab.fromfile(PHONESET_FILE),
                         g2p=ttslab.fromfile(G2P_FILE),
                         pronundict=ttslab.fromfile(PRONUNDICT_FILE),
                         pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE),
                         synthesizer=synth)
    ttslab.tofile(voice, "us.voice.pickle")
def htsfrontend():
    """Build an HTS front-end voice (no acoustic models attached) and pickle it."""
    from ttslab.defaultvoice import LwaziHTSVoice
    from ttslab.synthesizer_htsme import SynthesizerHTSME
    resources = dict(phoneset=ttslab.fromfile(PHONESET_FILE),
                     g2p=ttslab.fromfile(G2P_FILE),
                     pronundict=ttslab.fromfile(PRONUNDICT_FILE),
                     pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE))
    voice = LwaziHTSVoice(synthesizer=SynthesizerHTSME(voice=None, models_dir=None),
                          **resources)
    ttslab.tofile(voice, "frontend.hts.voice.pickle")
def hts():
    """Build a LwaziHTSVoice with HTS models from HTSMODELS_DIR and pickle it."""
    from ttslab.defaultvoice import LwaziHTSVoice
    from ttslab.voices.yoruba_default import SynthesizerHTSME_Tone_NoTone
    modelsdir = os.path.join(os.getcwd(), HTSMODELS_DIR)
    voice = LwaziHTSVoice(phoneset=ttslab.fromfile(PHONESET_FILE),
                          g2p=ttslab.fromfile(G2P_FILE),
                          pronundict=ttslab.fromfile(PRONUNDICT_FILE),
                          pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE),
                          synthesizer=SynthesizerHTSME_Tone_NoTone(voice=None,
                                                                   models_dir=modelsdir))
    ttslab.tofile(voice, "hts.voice.pickle")
def uttdtwdistcalc(args):
    """Synthesize the utterance's text with the given voice and attach a DTW
    distance track under u["dtwdists"]; saves the updated utt into UTTDIR2.

    args: (voicefilename, uttfilename) -- packed for use with map().
    """
    voicefname, uttfname = args
    voice = ttslab.fromfile(voicefname)
    utt = ttslab.fromfile(uttfname)
    print(utt["file_id"], end=" ")
    synthutt = voice.synthesize(utt["text"], "text-to-wave")
    disttrack = utt.utt_distance(synthutt)
    disttrack.name = utt["file_id"]
    utt["dtwdists"] = {"utt": synthutt, "track": disttrack}
    ttslab.tofile(utt, os.path.join(UTTDIR2, utt["file_id"] + ".utt.pickle"))
def wordus():
    """Build a word-level unit-selection voice (WordUSVoice) and pickle it."""
    from ttslab.defaultvoice import WordUSVoice
    from ttslab.synthesizer_us import SynthesizerUSWordUnits
    synth = SynthesizerUSWordUnits(voice=None,
                                   unitcatalogue=ttslab.fromfile(USCATALOGUE_FILE))
    voice = WordUSVoice(phoneset=ttslab.fromfile(PHONESET_FILE),
                        g2p=ttslab.fromfile(G2P_FILE),
                        pronundict=ttslab.fromfile(PRONUNDICT_FILE),
                        pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE),
                        synthesizer=synth,
                        silword="PAUSE")
    ttslab.tofile(voice, "wordus.voice.pickle")
def uttlindistcalc(args):
    """Resynthesize an utterance (utt-to-wave, htsparms {"-vp": True}) and
    attach a linear distance track under u["lindists"]; saves into UTTDIR2.

    args: (voicefilename, uttfilename) -- packed for use with map().
    """
    voicefname, uttfname = args
    voice = ttslab.fromfile(voicefname)
    utt = ttslab.fromfile(uttfname)
    print(utt["file_id"], end=" ")
    resynth = copy.deepcopy(utt)
    resynth.voice = voice
    resynth = voice.resynthesize(resynth, processname="utt-to-wave",
                                 htsparms={"-vp": True})
    disttrack = utt.utt_distance(resynth, method="linear")
    disttrack.name = utt["file_id"]
    utt["lindists"] = {"utt": resynth, "track": disttrack}
    ttslab.tofile(utt, os.path.join(UTTDIR2, utt["file_id"] + ".utt.pickle"))
def prev(self):
    """Persist current edits, then step back one item in the worklist."""
    self.save_data()
    if self.current_index <= 0:
        return
    self.current_index -= 1
    entry = self.worklist[self.current_index]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(entry[0])
    self.current_utt.fill_startendtimes()
def train_standard(parms):
    """Set up an HTS training tree from a template tarball and run training.

    parms: dict with keys "workingdir", "template", "questionsfile",
    "uttquestionsfile", "utts", "pitchmin", "pitchmax", "voice".
    NOTE(review): changes the process working directory and shells out to
    configure/make via os.system; the previous cwd is not restored.
    """
    #setup dirs...
    os.makedirs(parms["workingdir"])
    t = tarfile.open(parms["template"], "r:*")
    t.extractall(parms["workingdir"])
    #SETUP FILES
    shutil.copy(parms["questionsfile"], os.path.join(parms["workingdir"], QUESTIONS_SUBDIR))
    shutil.copy(parms["uttquestionsfile"], os.path.join(parms["workingdir"], QUESTIONS_SUBDIR))
    print(os.getcwd())
    for fn in sorted(glob(os.path.join(parms["utts"], "*." + UTT_EXT))):
        print("PROCESSING: %s" % (fn))
        #copy utt with DATASET_SPEAKER_bname to HTS tree:
        shutil.copy(fn, os.path.join(parms["workingdir"], UTT_SUBDIR, "_".join([DATASET, SPEAKER, os.path.basename(fn)])))
        #get raw audio files from utts:
        u = ttslab.fromfile(fn)
        waveform = u["waveform"]
        #write both raw and wav forms, with the extension swapped on the renamed basename
        waveform.write(os.path.join(parms["workingdir"], RAW_SUBDIR, "_".join([DATASET, SPEAKER, os.path.basename(fn)])[:-len(UTT_EXT)] + RAW_EXT))
        waveform.write(os.path.join(parms["workingdir"], WAV_SUBDIR, "_".join([DATASET, SPEAKER, os.path.basename(fn)])[:-len(UTT_EXT)] + WAV_EXT))
    #TRAIN...
    os.chdir(parms["workingdir"])
    os.system(CONFIGURE % (WITH_SPTK_SEARCH_PATH, WITH_HTS_SEARCH_PATH, WITH_HTS_ENGINE_SEARCH_PATH, SPEAKER, DATASET, parms["pitchmin"], parms["pitchmax"], parms["voice"]))
    os.system(MAKE)
def main():
    """CLI entry: load a voice and dispatch to an alignment subcommand."""
    try:
        try:
            voicefile = sys.argv[1]
            proc = sys.argv[2]
        except IndexError:
            raise CLIException
        voice = ttslab.fromfile(voicefile)
        dispatch = {"auto": auto,
                    "to_textgrid": to_textgrid,
                    "from_textgrid": from_textgrid,
                    "alignments_from_textgrid": alignments_from_textgrid}
        if proc not in dispatch:
            raise CLIException
        dispatch[proc](voice)
    except CLIException:
        print("USAGE: ttslab_align.py [VOICEFILE] [auto | to_textgrid | from_textgrid | alignments_from_textgrid]")
def main():
    """CLI entry: build word-unit features/catalogue for a voice.

    argv: VOICEFILE FEATSCONF [auto | make_features | make_catalogue]
    """
    try:
        voicefile = sys.argv[1]
        featconfpath = sys.argv[2]
        switch = sys.argv[3]
    except IndexError:
        print("USAGE: ttslab_make_wordunits.py VOICEFILE FEATSCONF [auto | make_features | make_catalogue]")
        sys.exit()
    voice = ttslab.fromfile(voicefile)
    with open(featconfpath) as conffh:
        featconfig = ConfigParser()
        #NOTE(review): readfp() is deprecated (removed in Python 3.12);
        #read_file() is the Python 3 replacement -- confirm target version.
        featconfig.readfp(conffh)
    try:
        if switch == "auto":
            auto(featconfig, voice)
        elif switch == "make_features":
            make_features(featconfig)
        elif switch == "make_catalogue":
            make_catalogue(voice)
        else:
            raise CLIException
    except CLIException:
        print("USAGE: ttslab_make_wordunits.py VOICEFILE FEATSCONF [auto | make_features | make_catalogue]")
def main():
    """CLI entry: build word-unit features/catalogue for a voice.

    argv: VOICEFILE FEATSCONF [auto | make_features | make_catalogue]
    """
    try:
        voicefile = sys.argv[1]
        featconfpath = sys.argv[2]
        switch = sys.argv[3]
    except IndexError:
        print(
            "USAGE: ttslab_make_wordunits.py VOICEFILE FEATSCONF [auto | make_features | make_catalogue]"
        )
        sys.exit()
    voice = ttslab.fromfile(voicefile)
    with open(featconfpath) as conffh:
        featconfig = ConfigParser()
        #NOTE(review): readfp() is deprecated (removed in Python 3.12);
        #read_file() is the Python 3 replacement -- confirm target version.
        featconfig.readfp(conffh)
    try:
        if switch == "auto":
            auto(featconfig, voice)
        elif switch == "make_features":
            make_features(featconfig)
        elif switch == "make_catalogue":
            make_catalogue(voice)
        else:
            raise CLIException
    except CLIException:
        print(
            "USAGE: ttslab_make_wordunits.py VOICEFILE FEATSCONF [auto | make_features | make_catalogue]"
        )
def add_feats_to_utt(args):
    """Attach word-level acoustic features (LPC coefs, residuals, join
    coefficients) to each Unit in the utterance.

    args: (utt, lpc_dir, joincoef_dir, f0_dir) -- packed for use with map().
    Returns the updated utterance.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    #units are word-sized here: copy each word's timings onto its Unit
    for unit, word in zip(u.gr("Unit"), u.gr("Word")):
        assert unit["name"] == word["name"]
        unit["start"] = word["start"]
        unit["end"] = word["end"]
    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))
    #get boundarytimes:
    boundarytimes = []
    for i, unit in enumerate(u.gr("Unit")):
        if i == 0:
            boundarytimes.append(unit["start"])
        boundarytimes.append(unit["end"])
    #convert boundtimes into sample ranges:
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    for bound in boundarytimes:
        lpcsampleranges.append(lpctrack.index_at(bound))
        f0sampleranges.append(f0track.index_at(bound))
        joinsamples.append(jointrack.values[jointrack.index_at(bound)])
    #get pitchperiods at lpc indices
    lpctimes = np.concatenate(([0.0], lpctrack.times))
    pitchperiod = np.diff(lpctimes)
    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    #walk consecutive boundary pairs in lockstep with the units
    for jc0, jc1, lti0, lti1, fti0, fti1, i in zip(joinsamples[:-1], joinsamples[1:],
                                                   lpcsampleranges[:-1], lpcsampleranges[1:],
                                                   f0sampleranges[:-1], f0sampleranges[1:],
                                                   units):
#        print(i["name"], "lpctrack[%s:%s]" % (lti0, lti1), "len(lpctrack)=%s" % len(lpctrack))
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True) #like python indexing/slicing
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        #For windowfactor=2 (save only samples and assume 16kHz)
        #NOTE(review): pitchperiod[lti0] is used on both slice edges; the
        #right edge presumably intends pitchperiod[lti1] -- confirm.
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
def __init__(self, worklist, phmap):
    """Initialise browsing state on the first worklist entry.

    worklist: sequence of (uttfilename, wordindex) pairs.
    phmap: phone mapping used when rendering pronunciations.
    """
    self.phmap = phmap
    self.worklist = worklist
    self.current_index = 0
    entry = self.worklist[self.current_index]
    firstfname = entry[0]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(firstfname)
    self.current_utt.fill_startendtimes()
    #seed per-utt state keyed by utt filename
    self.transcriptions = {firstfname: self.current_utt["text"]}
    self.comments = {firstfname: ""}
    self.pronuns = {firstfname: [" ".join(getpronun(w, self.phmap))
                                 for w in self.current_utt.gr("SylStructure")]}
def __init__(self, worklist, voice):
    """Initialise browsing state on the first worklist entry.

    worklist: sequence of (uttfilename, wordindex) pairs.
    voice: loaded voice used when rendering pronunciations.
    """
    self.voice = voice
    self.worklist = worklist
    self.current_index = 0
    entry = self.worklist[self.current_index]
    firstfname = entry[0]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(firstfname)
    self.current_utt.fill_startendtimes()
    #seed per-utt state keyed by utt filename
    self.transcriptions = {firstfname: self.current_utt["inputtext"]}
    self.comments = {firstfname: ""}
    self.pronuns = {firstfname: [" ".join(getpronun(w, self.voice))
                                 for w in self.current_utt.gr("SylStructure")]}
def make_units(voice, utt_dir):
    """ Run 'maketargetunits' process on Utterances to create Unit
        level to generate structure for adding acoustic features...
    """
    print("MAKING UNITS..")
    pattern = os.path.join(utt_dir, ".".join(["*", UTT_EXT]))
    utts = []
    for uttfilename in sorted(glob(pattern)):
        print(uttfilename)
        #DEMITASSE voice needs resynth method..
        utts.append(voice.synthesizer(ttslab.fromfile(uttfilename), "targetunits"))
    return utts
def train_standard(parms):
    """Set up an HTS training tree from a template tarball, run training and
    copy the resulting filters for the htsme engine.

    parms: dict with keys "workingdir", "template", "questionsfile",
    "uttquestionsfile", "utts", "pitchmin", "pitchmax", "voice".
    NOTE(review): changes the process working directory and shells out to
    configure/make via os.system; the previous cwd is not restored.
    """
    #setup dirs...
    os.makedirs(parms["workingdir"])
    t = tarfile.open(parms["template"], "r:*")
    t.extractall(parms["workingdir"])
    #SETUP FILES
    shutil.copy(parms["questionsfile"], os.path.join(parms["workingdir"], QUESTIONS_SUBDIR))
    shutil.copy(parms["uttquestionsfile"], os.path.join(parms["workingdir"], QUESTIONS_SUBDIR))
    print(os.getcwd())
    for fn in sorted(glob(os.path.join(parms["utts"], "*." + UTT_EXT))):
        print("PROCESSING: %s" % (fn))
        #copy utt with DATASET_SPEAKER_bname to HTS tree:
        shutil.copy(
            fn,
            os.path.join(parms["workingdir"], UTT_SUBDIR, "_".join([DATASET, SPEAKER, os.path.basename(fn)])))
        #get raw audio files from utts:
        u = ttslab.fromfile(fn)
        waveform = u["waveform"]
        #write both raw and wav forms, with the extension swapped on the renamed basename
        waveform.write(
            os.path.join(
                parms["workingdir"], RAW_SUBDIR,
                "_".join([DATASET, SPEAKER, os.path.basename(fn)])[:-len(UTT_EXT)] + RAW_EXT))
        waveform.write(
            os.path.join(
                parms["workingdir"], WAV_SUBDIR,
                "_".join([DATASET, SPEAKER, os.path.basename(fn)])[:-len(UTT_EXT)] + WAV_EXT))
    #TRAIN...
    os.chdir(parms["workingdir"])
    os.system(CONFIGURE % (WITH_SPTK_SEARCH_PATH, WITH_HTS_SEARCH_PATH, WITH_HTS_ENGINE_SEARCH_PATH, SPEAKER, DATASET, parms["pitchmin"], parms["pitchmax"], parms["voice"]))
    os.system(MAKE)
    #COPY FILTERS FOR HTSME_ENGINE...
    print("COPYING FILTERS")
    for fn in glob(
            os.path.join(FILTERS_SUBDIR, ".".join(["*", HTSME_ENGINE_EXT]))):
        #strip the trailing ".<ext>" from the destination basename
        destfn = os.path.join(
            MODELS_SUBDIR,
            os.path.basename(fn)[:-len(HTSME_ENGINE_EXT) - 1])
        print(fn, destfn)
        shutil.copy(fn, destfn)
def next(self):
    """Persist current edits, then advance one item in the worklist."""
    self.save_data()
    if self.current_index >= len(self.worklist) - 1:
        return
    self.current_index += 1
    entry = self.worklist[self.current_index]
    key = entry[0]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(key)
    self.current_utt.fill_startendtimes()
    #seed per-utt state for utts visited for the first time
    if key not in self.transcriptions:
        self.transcriptions[key] = self.current_utt["inputtext"]
    if key not in self.comments:
        self.comments[key] = ""
    if key not in self.pronuns:
        self.pronuns[key] = [" ".join(getpronun(w, self.voice))
                             for w in self.current_utt.gr("SylStructure")]
def next(self):
    """Persist current edits, then advance one item in the worklist."""
    self.save_data()
    if self.current_index >= len(self.worklist) - 1:
        return
    self.current_index += 1
    entry = self.worklist[self.current_index]
    key = entry[0]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(key)
    self.current_utt.fill_startendtimes()
    #seed per-utt state for utts visited for the first time
    if key not in self.transcriptions:
        self.transcriptions[key] = self.current_utt["text"]
    if key not in self.comments:
        self.comments[key] = ""
    if key not in self.pronuns:
        self.pronuns[key] = [" ".join(getpronun(w, self.phmap))
                             for w in self.current_utt.gr("SylStructure")]
def make_units(voice, utt_dir):
    """Run synthesizer "feats" process on Utterances to create Unit level
       to generate structure for adding acoustic features...
    """
    print("MAKING UNITS..")
    pattern = os.path.join(utt_dir, ".".join(["*", UTT_EXT]))
    utts = []
    for fname in sorted(glob(pattern)):
        print(fname)
        utts.append(voice.synthesizer(ttslab.fromfile(fname), ("feats", None)))
    return utts
def make_units(voice, utt_dir):
    """ Run 'maketargetunits' process on Utterances to create Unit
        level to generate structure for adding acoustic features...
    """
    print("MAKING UNITS..")
    pattern = os.path.join(utt_dir, ".".join(["*", UTT_EXT]))
    utts = []
    for fname in sorted(glob(pattern)):
        print(fname)
        #DEMITASSE voice needs resynth method..
        utts.append(voice.synthesizer(ttslab.fromfile(fname), "targetunits"))
    return utts
def make_voice(langs, synthfile="frontend"):
    """Assemble a multi-language Voice from per-language pronunciation
    resources and pickle it.

    langs: language codes; the first is treated as "main" and selects the
    Voice implementation that gets imported.
    synthfile: "frontend" for a synthesizer-less voice (saved as
    frontend.voice.pickle), otherwise a path to a pickled synthesizer
    (saved as voice.pickle).
    """
    pronun = {}
    for i, lang in enumerate(langs):
        if i == 0:
            #NOTE(review): exec-based dynamic import; under Python 3 exec()
            #cannot bind the local name `Voice` -- this relies on Python 2
            #semantics. Also assumes `lang` is a trusted module name.
            exec("from ttslab.lang.%(lang)s import Voice" % {"lang": lang})
            langpref = "main"
        else:
            langpref = lang
        pronun[langpref] = {}
        pronun[langpref]["phoneset"] = ttslab.fromfile(langpref + PHONESET_FILESUFFIX)
        pronun[langpref]["pronundict"] = ttslab.fromfile(langpref + PRONUNDICT_FILESUFFIX)
        pronun[langpref]["pronunaddendum"] = ttslab.fromfile(
            langpref + PRONUNADDENDUM_FILESUFFIX)
        pronun[langpref]["g2p"] = ttslab.fromfile(langpref + G2P_FILESUFFIX)
    if synthfile == "frontend":
        voice = Voice(pronun=pronun, synthesizer=None)
        ttslab.tofile(voice, "frontend.voice.pickle")
    else:
        synthesizer = ttslab.fromfile(synthfile)
        voice = Voice(pronun=pronun, synthesizer=synthesizer)
        ttslab.tofile(voice, "voice.pickle")
def __init__(self, worklist, voice, previous):
    """Resume a browsing session from previously saved state.

    previous: (transcriptions, pronuns, comments) as saved by an earlier
    session.
    """
    self.transcriptions, self.pronuns, self.comments = previous
    self.voice = voice
    self.worklist = worklist
    self.current_index = 0
    entry = self.worklist[self.current_index]
    key = entry[0]
    self.current_wordindex = entry[1]
    self.current_utt = ttslab.fromfile(key)
    self.current_utt.fill_startendtimes()
    #SET STATE: seed entries for this utt if not carried over
    if key not in self.transcriptions:
        self.transcriptions[key] = self.current_utt["inputtext"]
    if key not in self.comments:
        self.comments[key] = ""
    if key not in self.pronuns:
        self.pronuns[key] = [" ".join(getpronun(w, self.voice))
                             for w in self.current_utt.gr("SylStructure")]
def scores(vfname, method="dtw"):
    """Compute per-utterance comparison scores against the voice in `vfname`.

    method: "linear", "dtw" or "alignlogl". Results are written as updated
    utt pickles into UTTDIR2; input utts come from UTTDIR on the first run,
    from UTTDIR2 on subsequent runs (detected via the makedirs OSError).

    Fixes: the per-utterance calculations now run in explicit for-loops
    (the previous bare map() call is never consumed under Python 3, so the
    work silently did not happen); the "alignlogl" branch referenced an
    undefined name `v`, which is now loaded from `vfname`.
    """
    try:
        os.makedirs(UTTDIR2)
        indirname = UTTDIR
        print("Using utts in %s as input..." % UTTDIR)
    except OSError:
        indirname = UTTDIR2
        print("Using utts in %s as input..." % UTTDIR2)
    ufnames = sorted(glob(os.path.join(indirname, "*")))
    if method == "linear":
        for ufname in ufnames:
            uttlindistcalc([vfname, ufname])
    elif method == "dtw":
        for ufname in ufnames:
            uttdtwdistcalc([vfname, ufname])
    elif method == "alignlogl":
        v = ttslab.fromfile(vfname)  #was an undefined name in this branch
        for uttfn in ufnames:
            print(uttfn)
            u = ttslab.fromfile(uttfn)
            ul = sl.Utterance(os.path.join(RECDIR, u["file_id"] + ".rec"))
            u = parse_logl_from_recs(u, ul, v.phoneset)
            ttslab.tofile(u, os.path.join(UTTDIR2, u["file_id"] + ".utt.pickle"))
def scores(vfname, method="dtw"):
    """Compute per-utterance comparison scores against the voice in `vfname`.

    method: "linear", "dtw" or "alignlogl". Results are written as updated
    utt pickles into UTTDIR2; input utts come from UTTDIR on the first run,
    from UTTDIR2 on subsequent runs (detected via the makedirs OSError).

    Fixes: the per-utterance calculations now run in explicit for-loops
    (the previous bare map() call is never consumed under Python 3, so the
    work silently did not happen); the "alignlogl" branch referenced an
    undefined name `v`, which is now loaded from `vfname`.
    """
    try:
        os.makedirs(UTTDIR2)
        indirname = UTTDIR
        print("Using utts in %s as input..." % UTTDIR)
    except OSError:
        indirname = UTTDIR2
        print("Using utts in %s as input..." % UTTDIR2)
    ufnames = sorted(glob(os.path.join(indirname, "*")))
    if method == "linear":
        for ufname in ufnames:
            uttlindistcalc([vfname, ufname])
    elif method == "dtw":
        for ufname in ufnames:
            uttdtwdistcalc([vfname, ufname])
    elif method == "alignlogl":
        v = ttslab.fromfile(vfname)  #was an undefined name in this branch
        for uttfn in ufnames:
            print(uttfn)
            u = ttslab.fromfile(uttfn)
            ul = sl.Utterance(os.path.join(RECDIR, u["file_id"] + ".rec"))
            u = parse_logl_from_recs(u, ul,
                                     v.pronun["main"]["phoneset"].features["closure_phone"],
                                     v.phonemap)
            ttslab.tofile(u, os.path.join(UTTDIR2, u["file_id"] + ".utt.pickle"))
def main():
    """Command-line driver: load a voice and run the selected alignment step."""
    try:
        if len(sys.argv) < 3:
            raise CLIException
        voicefile, proc = sys.argv[1], sys.argv[2]
        voice = ttslab.fromfile(voicefile)
        if proc == "auto":
            auto(voice)
        elif proc == "to_textgrid":
            to_textgrid(voice)
        elif proc == "from_textgrid":
            from_textgrid(voice)
        elif proc == "alignments_from_textgrid":
            alignments_from_textgrid(voice)
        else:
            raise CLIException
    except CLIException:
        print("USAGE: ttslab_align.py [VOICEFILE] [auto | to_textgrid | from_textgrid | alignments_from_textgrid]")
def make_voice(synthfile=SYNTHESIZER_FILE, pitchmodelfile=PITCHMODEL_FILE):
    """Assemble a Voice with a synthesizer and pitch model and pickle it to
    VOICE_FILE.

    The single language code is taken from the current directory's basename.
    synthfile / pitchmodelfile: paths to pickled synthesizer / pitch model.
    """
    #the directory name doubles as the (single, "main") language code
    langs = [os.path.basename(os.getcwd())]
    pronun = {}
    for i, lang in enumerate(langs):
        if i == 0:
            #NOTE(review): exec-based dynamic import; under Python 3 exec()
            #cannot bind the local name `Voice` -- this relies on Python 2
            #semantics. Also assumes `lang` is a trusted module name.
            exec("from ttslab.lang.%(lang)s import Voice" % {"lang": lang})
            langpref = "main"
        else:
            langpref = lang
        pronun[langpref] = {}
        pronun[langpref]["phoneset"] = ttslab.fromfile(langpref + PHONESET_FILESUFFIX)
        pronun[langpref]["pronundict"] = ttslab.fromfile(langpref + PRONUNDICT_FILESUFFIX)
        pronun[langpref]["pronunaddendum"] = ttslab.fromfile(
            langpref + PRONUNADDENDUM_FILESUFFIX)
        pronun[langpref]["g2p"] = ttslab.fromfile(langpref + G2P_FILESUFFIX)
    synthesizer = ttslab.fromfile(synthfile)
    pitchmodel = ttslab.fromfile(pitchmodelfile)
    voice = Voice(pronun=pronun, synthesizer=synthesizer)
    voice.pitchmodel = pitchmodel
    ttslab.tofile(voice, VOICE_FILE)
output.close() output = StringIO() mpld3.save_html(fig2, output) pitch_html = output.getvalue() output.close() output = StringIO() mpld3.save_html(fig3, output) wave_html = output.getvalue() output.close() plt.close(fig1) plt.close(fig2) plt.close(fig3) return syl_html, pitch_html, wave_html if __name__ == '__main__': try: uttfname = sys.argv[1] except IndexError: print("USAGE: uttviz_d3.py UTTFNAME") sys.exit() utt = ttslab.fromfile(uttfname) fig1, fig2, fig3 = draw_sylstruct_graph_pitch_waveform(utt) mpld3.save_html( fig1, open(os.path.basename(uttfname) + "_sylstructure.html", "w")) mpld3.save_html(fig2, open(os.path.basename(uttfname) + "_pitch.html", "w")) mpld3.save_html(fig3, open(os.path.basename(uttfname) + "_wave.html", "w"))
default=DEFSTRESSTONE, help="default stress/tone") args = parser.parse_args() phonemap = None if args.outphonemapfn is not None: phonemap = {} with codecs.open(args.outphonemapfn, encoding="utf-8") as infh: for line in infh: a, b = line.split() if args.mapreverse: a, b = (b, a) phonemap[a] = b defstresstone = args.defstresstone phset = ttslab.fromfile(args.phonesetfn) inphmap = dict([(v, k) for k, v in phset.map.iteritems()]) for line in sys.stdin: fields = unicode(line, encoding="utf-8").split() word, pos, stresspat, sylspec = fields[:4] assert len(stresspat) == len(sylspec) phones = map(lambda x: inphmap[x], fields[4:]) #print(word, pos, stresspat, sylspec) #print(phones) i = 0 syls = [] for n, stress in zip([int(slen) for slen in sylspec], stresspat): syl = phones[i:i + n] i += n
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Generates a list of transcriptions that changed during a speechbrowser session. """ from __future__ import unicode_literals, division, print_function #Py2 __author__ = "Daniel van Niekerk" __email__ = "*****@*****.**" import os import sys import codecs import ttslab if __name__ == "__main__": transcrlist, pronunlist, commentlist = ttslab.fromfile(sys.argv[1]) transcr = {} pronun = {} for k in sorted(transcrlist): u = ttslab.fromfile(k) #print(u["text"], transcrlist[k]) if u["text"] != transcrlist[k]: transcr[os.path.basename(k)[:-len(".utt.pickle")]] = transcrlist[k] with codecs.open("newutts.data", "w", encoding="utf-8") as outfh: for k in sorted(transcr): outfh.write('( %s "%s" )\n' % (k, transcr[k]))
def loadvoice(self, name, voice_location):
    """Deserialise a voice from `voice_location` and register it as `name`."""
    log.info("Loading voice from file '%s'" % (voice_location))
    voice = ttslab.fromfile(voice_location)
    self.voices[name] = voice
    log.info("Voice '%s' loaded." % (name))
type=str, help="aligned Utterance file (.utt.pickle)") parser.add_argument('f0fn', metavar='F0FN', type=str, help="corresponding F0 file (.track.pickle)") parser.add_argument( '--qtaspecsfn', metavar='QTASPECSFN', type=str, help="qTA parameter search config: ranges and quantisation (.json)") parser.add_argument( '--extract', action="store_true", help= "extract new parameters and plot instead of using existing annotations." ) args = parser.parse_args() utt = ttslab.fromfile(args.uttfn) f0 = ttslab.fromfile(args.f0fn) if args.extract: utt.fill_startendtimes() if args.qtaspecsfn: with open(args.qtaspecsfn) as infh: qtaspecs = json.load(infh) ttslab.pitchsynth.qta.utt_plot(utt, f0, qtaspecs, args.extract) else: ttslab.pitchsynth.qta.utt_plot(utt, f0, annotate=args.extract)
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Print utterance structure... """ from __future__ import unicode_literals, division, print_function #Py2 __author__ = "Daniel van Niekerk" __email__ = "*****@*****.**" import sys import ttslab if __name__ == '__main__': try: uttfn = sys.argv[1] except IndexError: print("USAGE: uttplay.py UTTFNAME") sys.exit(1) print(ttslab.fromfile(uttfn))
def add_feats_to_utt(args):
    """Attach halfphone unit-selection features (LPC coefs, residuals, join
    coefficients, durations) to each Unit in the utterance.

    args: (utt, lpc_dir, joincoef_dir, f0_dir) -- packed for use with map().
    Returns the updated utterance. Each Segment is split in two at "cl_end"
    when present, else at its midpoint; the first and last halfphones
    (leading/trailing pau) are pruned.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))
    #get boundarytimes:
    boundarytimes = []
    durations = []
    starttime = 0.0
    for seg in u.get_relation("Segment"):
        endtime = float(seg["end"])
        if "cl_end" in seg:
            splittime = float(seg["cl_end"])
        else:
            splittime = (endtime + starttime) / 2
            #TODO: should still add 25% split if diphthong...
        boundarytimes.append([starttime, splittime, endtime])
        durations.extend([splittime - starttime, endtime - splittime])
        starttime = endtime
    #convert boundtimes into sample ranges (and flatten):
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    #DEMITASSE: If not pruning pau halfphones:
    # for bounds in boundarytimes:
    #     lpcsampleranges.extend([lpctrack.get_index_at(bounds[0]),
    #                             lpctrack.get_index_at(bounds[1])])
    #     joinsamples.extend([jointrack.get_sample_at(bounds[0]),
    #                         jointrack.get_sample_at(bounds[1])])
    # lpcsampleranges.append(len(lpctrack))
    # joinsamples.append(jointrack.get_sample_at(len(jointrack)))
    #DEMITASSE: If pruning pau halfphones:
    durations = durations[1:-1]
    for i, bounds in enumerate(boundarytimes):
        if i == 0:
            lpcsampleranges.append(lpctrack.index_at(bounds[1]))
            f0sampleranges.append(f0track.index_at(bounds[1]))
            #NOTE(review): jointrack.values is indexed with a *time*
            #(bounds[...]) here rather than jointrack.index_at(...) as in
            #related code -- confirm this is intended.
            joinsamples.append(jointrack.values[bounds[1]])
        else:
            lpcsampleranges.extend([lpctrack.index_at(bounds[0]), lpctrack.index_at(bounds[1])])
            f0sampleranges.extend([f0track.index_at(bounds[0]), f0track.index_at(bounds[1])])
            joinsamples.extend([jointrack.values[bounds[0]], jointrack.values[bounds[1]]])
    #get pitchperiods at lpc indices
    lpctimes = np.concatenate(([0.0],
                               lpctrack.times))
    pitchperiod = np.diff(lpctimes)
    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    #walk consecutive halfphone boundary pairs in lockstep with the units
    for jc0, jc1, lti0, lti1, fti0, fti1, dur, i in zip(joinsamples[:-1], joinsamples[1:],
                                                        lpcsampleranges[:-1], lpcsampleranges[1:],
                                                        f0sampleranges[:-1], f0sampleranges[1:],
                                                        durations, units):
#        print(i["name"], "lpctrack[%s:%s]" % (lti0, lti1), "len(lpctrack)=%s" % len(lpctrack))
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True) #like python indexing/slicing
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        i["dur"] = dur
        #For windowfactor=2 (save only samples and assume 16kHz)
        #NOTE(review): pitchperiod[lti0] is used on both slice edges; the
        #right edge presumably intends pitchperiod[lti1] -- confirm.
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
starttime = endtime return lab if __name__ == "__main__": try: switch = sys.argv[1] voicefile = sys.argv[2] infilename = sys.argv[3] except IndexError: print("usage: utt2lab.py [mono|full] [VOICEFILE] [INFILENAME]") sys.exit(1) #Load voice and utt and link... voice = ttslab.fromfile(voicefile) utt = ttslab.fromfile(infilename) utt.voice = voice if switch == "mono": #t1 = time.time() lab = utt2lab_mono(utt) #print("Time: " + str(time.time() - t1)) elif switch == "full": #t1 = time.time() lab = utt2lab_full(utt) #print("Time: " + str(time.time() - t1)) else: print("Invalid switch: %s" % (switch)) sys.exit(1)
"""Collect phone feature categories from a pickled phoneset for building
HTS context questions."""
import sys

import ttslab

PHONESETFN = "phoneset.pickle"

#HTS full-context label patterns keyed by position relative to the centre phone
ALL_CONTEXTS = {"LL": "%s^*",
                "L": "*^%s-*",
                "C": "*-%s+*",
                "R": "*+%s=*",
                "RR": "*=%s@*"}
VOWEL_CONTEXTS = {"C-Syl": "*|%s/C:*"}

if __name__ == "__main__":
    try:
        phset = ttslab.fromfile(PHONESETFN)
    except IOError:
        #NOTE(review): execution continues after this message, so `phset`
        #would be unbound below -- a sys.exit() here is probably intended.
        print("Could not find file: '%s'" % (PHONESETFN))
    #get all feature categories:
    categories = set()
    for phn in phset.phones:
        categories.update(phset.phones[phn])
    #get feature categories involving vowels:
    vcategories = set()
    for phn in phset.phones:
        if "vowel" in phset.phones[phn]:
            vcategories.update(phset.phones[phn])
    #do all contexts:
def multihtsfrontend():
    """Build a bilingual (main + English) HTS front-end voice and pickle it.

    Falls back to an empty English pronunciation addendum when the addendum
    (or another resource read in the first attempt) is not found.
    """
    from ttslab.defaultvoice import LwaziMultiHTSVoice
    from ttslab.synthesizer_htsme import SynthesizerHTSME
    try:
        voice = LwaziMultiHTSVoice(
            phoneset=ttslab.fromfile(PHONESET_FILE),
            g2p=ttslab.fromfile(G2P_FILE),
            pronundict=ttslab.fromfile(PRONUNDICT_FILE),
            pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE),
            engphoneset=ttslab.fromfile(ENGPHONESET_FILE),
            engg2p=ttslab.fromfile(ENGG2P_FILE),
            engpronundict=ttslab.fromfile(ENGPRONUNDICT_FILE),
            engpronunaddendum=ttslab.fromfile(ENGPRONUNADDENDUM_FILE),
            synthesizer=SynthesizerHTSME(voice=None, models_dir=None))
    except IOError:
        #retry with an empty English addendum
        voice = LwaziMultiHTSVoice(
            phoneset=ttslab.fromfile(PHONESET_FILE),
            g2p=ttslab.fromfile(G2P_FILE),
            pronundict=ttslab.fromfile(PRONUNDICT_FILE),
            pronunaddendum=ttslab.fromfile(PRONUNADDENDUM_FILE),
            engphoneset=ttslab.fromfile(ENGPHONESET_FILE),
            engg2p=ttslab.fromfile(ENGG2P_FILE),
            engpronundict=ttslab.fromfile(ENGPRONUNDICT_FILE),
            engpronunaddendum={},
            synthesizer=SynthesizerHTSME(voice=None, models_dir=None))
    ttslab.tofile(voice, "frontend.multihts.voice.pickle")
currentphrase["name"] = "BB" currentphrase.add_daughter(word) elif prevseg["name"] == "pau" and (prevseg["end"] - prevseg["start"]) < thresh: prevseg.remove_content() currentphrase.add_daughter(word) else: currentphrase.add_daughter(word) for phrase in phraserel: phrase["start"] = phrase.first_daughter["start"] phrase["end"] = phrase.last_daughter["end"] return u if __name__ == "__main__": uttin = sys.argv[1] try: thresh = float(sys.argv[2]) # in seconds except IndexError: thresh = PAUSE_LEN_THRESH try: uttoutdir = sys.argv[3] except IndexError: uttoutdir = os.getcwd() u = ttslab.fromfile(uttin) u.fill_startendtimes() u = remphraserel(u) u = phraserelfrompauses(u, thresh) ttslab.tofile(u, os.path.join(uttoutdir, u["file_id"] + ".utt.pickle"))
vectors = voice.pitchmodel(utt, ("feats", None))["sylpitchfeats"] for vector, syl in zip(vectors, utt.gr("Syllable")): vector.extend([syl["qta_endheight"], syl["qta_slope"]]) return vectors if __name__ == "__main__": import argparse parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( 'voicefn', metavar='VOICEFN', type=str, help="Voice containing PitchModel implementation (.voice.pickle)") parser.add_argument( 'uttfn', metavar='UTTFN', type=str, help= "annotated Utterance file, i.e. containing qTA parameters (.utt.pickle)" ) args = parser.parse_args() voice = ttslab.fromfile(args.voicefn) utt = ttslab.fromfile(args.uttfn) for vector in process_utt(voice, utt): print(" ".join(map(str, vector)))
def prepredict(wordsfn, g2p, skipwords): with codecs.open(wordsfn, encoding="utf-8") as infh: words = [ word.strip() for word in infh.readlines() if word.strip() not in skipwords ] pronundict = {} numwords = len(words) for i, word in enumerate(words): print(("%s/%s: %s" % (i + 1, numwords, word)).encode("utf-8")) pronundict[word] = g2p.predict_word(word) return pronundict if __name__ == "__main__": phset = ttslab.fromfile(PHSET_FILE) phmap = dict([(v, k) for k, v in phset.map.items()]) assert len(phmap) == len(phset.map), "mapping not one-to-one..." #load #MAIN try: pronundict = PronunciationDictionary().fromtextfile(PRONUNDICT_INFN, phonemap=phmap) except IOError: print("WARNING: Could not find '%s'" % PRONUNDICT_INFN) pronundict = PronunciationDictionary().fromsimpletextfile( DICT_INFN, phonemap=phmap) #ADDENDUM try: addendum = PronunciationDictionary().fromtextfile(ADDENDUM_INFN, phonemap=phmap)
def _load_unitcatalogue(self, unitcataloguefile):
    """Deserialise the unit catalogue pickle and attach it to this instance."""
    catalogue = ttslab.fromfile(unitcataloguefile)
    self.unitcatalogue = catalogue
def on_button_playwordorig_clicked(self, obj):
    #play the original recording context for the current word
    self.origwordcontextwav.play()

def on_button_playwordsynth_clicked(self, obj):
    #play the synthesised context for the current word
    self.synthwordcontextwav.play()

def on_toolbutton_open_clicked(self, obj):
    """Show a file chooser and load the selected worklist into a CorpusView."""
    chooser = gtk.FileChooserDialog(title=None,
                                    action=gtk.FILE_CHOOSER_ACTION_OPEN,
                                    buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                             gtk.STOCK_OPEN, gtk.RESPONSE_OK))
    chooser.set_current_folder(os.getcwd())
    response = chooser.run()
    if response == gtk.RESPONSE_OK:
        filename = chooser.get_filename()
        worklist = loadworklist(filename)
        self.corpusview = CorpusView(worklist, self.phmap)
    elif response == gtk.RESPONSE_CANCEL:
        print('Closed, no files selected')
    chooser.destroy()
    #refresh both panes after (possibly) loading a new corpus
    self.update_uttview()
    self.update_wordview()

if __name__ == "__main__":
    #argv[1]: pickled voice; the browser only needs its phone map
    voice = ttslab.fromfile(sys.argv[1])
    app = SpeechbrowserApp(voice.phonemap)
    gtk.main()
# encoding: utf-8 import ttslab voice = ttslab.fromfile("wordus.voice.pickle") utt = voice.synthesize(u'Mea nsia na hunu ewiem nsakrayɛ aa ɛwɔ Kumasi', "text-to-wave") utt["waveform"].write("test.wav")
currentphrase.add_daughter(word) elif prevseg["name"] == "pau" and (prevseg["end"] - prevseg["start"]) < thresh: prevseg.remove_content() currentphrase.add_daughter(word) else: currentphrase.add_daughter(word) for phrase in phraserel: phrase["start"] = phrase.first_daughter["start"] phrase["end"] = phrase.last_daughter["end"] return u if __name__ == "__main__": uttin = sys.argv[1] try: thresh = float(sys.argv[2]) #in seconds except IndexError: thresh = PAUSE_LEN_THRESH try: uttoutdir = sys.argv[3] except IndexError: uttoutdir = os.getcwd() u = ttslab.fromfile(uttin) u.fill_startendtimes() u = remphraserel(u) u = phraserelfrompauses(u, thresh) ttslab.tofile(u, os.path.join(uttoutdir, u["file_id"] + ".utt.pickle"))
#!/usr/bin/env python from __future__ import division __author__ = "Daniel van Niekerk" __email__ = "*****@*****.**" import sys import ttslab from qta3 import plotstuff if __name__ == "__main__": prefix = sys.argv[1] utt = ttslab.fromfile(sys.argv[2]) reff0 = ttslab.fromfile(sys.argv[3]) qtaf0 = ttslab.fromfile(sys.argv[4]) plotstuff(utt, reff0, qtaf0, prefix=prefix)
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Print utterance structure... """ from __future__ import unicode_literals, division, print_function #Py2 __author__ = "Daniel van Niekerk" __email__ = "*****@*****.**" import sys import numpy as np import ttslab import ufuncs_analysis import ttslab_dtw if __name__ == '__main__': uttfn = sys.argv[1] u = ttslab.fromfile(uttfn) t = ufuncs_analysis.utt_mceps(u, shift=0.001) dtwalignment = ttslab_dtw.dtw_align(t.values, t.values) # for i, e in enumerate(dtwalignment): # np.savetxt("pyx.%s.out" % i, e)
def on_button_playwordorig_clicked(self, obj):
    """Handler: play the recorded word-in-context audio."""
    self.origwordcontextwav.play()

def on_button_playwordsynth_clicked(self, obj):
    """Handler: play the synthesised word-in-context audio."""
    self.synthwordcontextwav.play()

def on_toolbutton_open_clicked(self, obj):
    """Handler: let the user pick a worklist file, then reload all views."""
    fc = gtk.FileChooserDialog(title=None,
                               action=gtk.FILE_CHOOSER_ACTION_OPEN,
                               buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                        gtk.STOCK_OPEN, gtk.RESPONSE_OK))
    fc.set_current_folder(os.getcwd())
    answer = fc.run()
    if answer == gtk.RESPONSE_OK:
        wlist = loadworklist(fc.get_filename())
        self.corpusview = CorpusView(wlist, self.voice)
    elif answer == gtk.RESPONSE_CANCEL:
        print('Closed, no files selected')
    fc.destroy()
    self.update_uttview()
    self.update_wordview()

if __name__ == "__main__":
    voice = ttslab.fromfile(sys.argv[1])
    app = SpeechbrowserApp(voice)
    gtk.main()
# Articulatory feature categories used to classify phones below.
NASAL = set(["manner_nasal"])
APPROXIMANT = set(["manner_approximant", "manner_trill"])
SHORT = set(["duration_short"])
LONG = set(["duration_long"])
DIPH = set(["duration_diphthong"])
VOICED = set(["vowel", "voiced"])

if __name__ == "__main__":
    # Voice file may be given on the command line; fall back to VOICEFN.
    try:
        voicefn = sys.argv[1]
    except IndexError:
        voicefn = None
    try:
        voice = ttslab.fromfile(voicefn or VOICEFN)
    except IOError:
        # BUGFIX: report the path actually attempted — previously this always
        # printed VOICEFN even when a different path came from the command line.
        print("Could not find file: '%s'" % (voicefn or VOICEFN))
        sys.exit(1)
    # Iterate "main" language first, then any additional pronun languages.
    for lang in ["main"] + [k for k in voice.pronun if k != "main"]:
        phset = voice.pronun[lang]["phoneset"]
        for phn in phset.phones:
            phnfeats = phset.phones[phn]
            # Non-main languages are namespaced in the phonemap as "lang_phone".
            if lang == "main":
                p = voice.phonemap[phn]
            else:
                p = voice.phonemap[lang + "_" + phn]
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Save waveform embedded in utt... """ from __future__ import unicode_literals, division, print_function #Py2 __author__ = "Daniel van Niekerk" __email__ = "*****@*****.**" import sys import ttslab WAV_EXT = "wav" if __name__ == '__main__': try: uttfn = sys.argv[1] except IndexError: print("USAGE: utt2textgrid.py UTTFNAME [WAVEFNAME]") sys.exit() utt = ttslab.fromfile(uttfn) try: wavfn = sys.argv[2] except IndexError: wavfn = ".".join([utt["file_id"], WAV_EXT]) utt["waveform"].write(wavfn)
def add_feats_to_utt(args):
    """Attach word-unit selection features (LPC coefs, residuals, join coefs)
    to each item in the utterance's Unit relation.

    args is a tuple: (utt, lpc_dir, joincoef_dir, f0_dir).  Returns the
    modified utterance.  Packed as a single tuple so it can be mapped over
    a pool of workers.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    # Units are word-sized here: copy each word's time span onto its unit.
    for unit, word in zip(u.gr("Unit"), u.gr("Word")):
        assert unit["name"] == word["name"]
        unit["start"] = word["start"]
        unit["end"] = word["end"]
    # Load the per-utterance acoustic artefacts (paths: <dir>/<file_id>.<ext>).
    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))
    #get boundarytimes:
    # One boundary per unit edge: first unit contributes its start, every
    # unit contributes its end (len(boundarytimes) == len(units) + 1).
    boundarytimes = []
    for i, unit in enumerate(u.gr("Unit")):
        if i == 0:
            boundarytimes.append(unit["start"])
        boundarytimes.append(unit["end"])
    #convert boundtimes into sample ranges:
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    for bound in boundarytimes:
        lpcsampleranges.append(lpctrack.index_at(bound))
        f0sampleranges.append(f0track.index_at(bound))
        joinsamples.append(jointrack.values[jointrack.index_at(bound)])
    #get pitchperiods at lpc indices
    # LPC frames are pitch-synchronous: frame-to-frame time deltas are the
    # local pitch periods (prepend 0.0 so diff lines up with frame indices).
    lpctimes = np.concatenate(([0.0],
                               lpctrack.times))
    pitchperiod = np.diff(lpctimes)
    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    # Walk consecutive boundary pairs; each pair delimits one unit.
    for jc0, jc1, lti0, lti1, fti0, fti1, i in zip(joinsamples[:-1],
                                                   joinsamples[1:],
                                                   lpcsampleranges[:-1],
                                                   lpcsampleranges[1:],
                                                   f0sampleranges[:-1],
                                                   f0sampleranges[1:],
                                                   units):
        # print(i["name"], "lpctrack[%s:%s]" % (lti0, lti1), "len(lpctrack)=%s" % len(lpctrack))
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0,
                                        lti1,
                                        copy=True) #like python indexing/slicing
        # Start time of the sliced track is the previous frame's time (frame
        # times mark frame ends), then rebased to zero.
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        #For windowfactor=2 (save only samples and assume 16kHz)
        # NOTE(review): the end padding also uses pitchperiod[lti0] rather
        # than pitchperiod[lti1] — confirm this is intentional.
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
def add_feats_to_utt(args):
    """Attach halfphone unit-selection features (LPC coefs, residuals, join
    coefs, durations) to each item in the utterance's Unit relation.

    args is a tuple: (utt, lpc_dir, joincoef_dir, f0_dir).  Returns the
    modified utterance.  Packed as a single tuple so it can be mapped over
    a pool of workers.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    # Load the per-utterance acoustic artefacts (paths: <dir>/<file_id>.<ext>).
    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))
    #get boundarytimes:
    # Each segment is split into two halfphones at cl_end (closure end) if
    # present, otherwise at its midpoint.
    boundarytimes = []
    durations = []
    starttime = 0.0
    for seg in u.get_relation("Segment"):
        endtime = float(seg["end"])
        if "cl_end" in seg:
            splittime = float(seg["cl_end"])
        else:
            splittime = (endtime + starttime) / 2
        #TODO: should still add 25% split if diphthong...
        boundarytimes.append([starttime, splittime, endtime])
        durations.extend([splittime - starttime, endtime - splittime])
        starttime = endtime
    #convert boundtimes into sample ranges (and flatten):
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    #DEMITASSE: If not pruning pau halfphones:
    # for bounds in boundarytimes:
    #     lpcsampleranges.extend([lpctrack.get_index_at(bounds[0]),
    #                             lpctrack.get_index_at(bounds[1])])
    #     joinsamples.extend([jointrack.get_sample_at(bounds[0]),
    #                         jointrack.get_sample_at(bounds[1])])
    # lpcsampleranges.append(len(lpctrack))
    # joinsamples.append(jointrack.get_sample_at(len(jointrack)))
    #DEMITASSE: If pruning pau halfphones:
    # Drop the leading/trailing pau halfphone durations to match the pruned
    # boundary list built below.
    durations = durations[1:-1]
    for i, bounds in enumerate(boundarytimes):
        if i == 0:
            lpcsampleranges.append(lpctrack.index_at(bounds[1]))
            f0sampleranges.append(f0track.index_at(bounds[1]))
            # BUGFIX: bounds[*] are times in seconds and must be mapped to
            # sample indices via index_at() before indexing jointrack.values
            # (previously the float time was used directly as an index; cf.
            # the commented get_sample_at() calls above).
            joinsamples.append(jointrack.values[jointrack.index_at(bounds[1])])
        else:
            lpcsampleranges.extend([lpctrack.index_at(bounds[0]),
                                    lpctrack.index_at(bounds[1])])
            f0sampleranges.extend([f0track.index_at(bounds[0]),
                                   f0track.index_at(bounds[1])])
            joinsamples.extend([jointrack.values[jointrack.index_at(bounds[0])],
                                jointrack.values[jointrack.index_at(bounds[1])]])
    #get pitchperiods at lpc indices
    # LPC frames are pitch-synchronous: frame-to-frame time deltas are the
    # local pitch periods (prepend 0.0 so diff lines up with frame indices).
    lpctimes = np.concatenate(([0.0],
                               lpctrack.times))
    pitchperiod = np.diff(lpctimes)
    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    # Walk consecutive boundary pairs; each pair delimits one halfphone unit.
    for jc0, jc1, lti0, lti1, fti0, fti1, dur, i in zip(
            joinsamples[:-1], joinsamples[1:],
            lpcsampleranges[:-1], lpcsampleranges[1:],
            f0sampleranges[:-1], f0sampleranges[1:],
            durations, units):
        # print(i["name"], "lpctrack[%s:%s]" % (lti0, lti1), "len(lpctrack)=%s" % len(lpctrack))
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0,
                                        lti1,
                                        copy=True) #like python indexing/slicing
        # Start time of the sliced track is the previous frame's time (frame
        # times mark frame ends), then rebased to zero.
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        i["dur"] = dur
        #For windowfactor=2 (save only samples and assume 16kHz)
        # NOTE(review): the end padding also uses pitchperiod[lti0] rather
        # than pitchperiod[lti1] — confirm this is intentional.
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
except IOError: pass return pronundict def prepredict(wordsfn, g2p, skipwords): with codecs.open(wordsfn, encoding="utf-8") as infh: words = [word.strip() for word in infh.readlines() if word.strip() not in skipwords] pronundict = {} numwords = len(words) for i, word in enumerate(words): print("%s/%s: %s" % (i+1, numwords, word)) pronundict[word] = g2p.predict_word(word) return pronundict if __name__ == "__main__": phset = ttslab.fromfile(PHSET_FILE) phmap = dict([(v, k) for k, v in phset.map.items()]) assert len(phmap) == len(phset.map), "mapping not one-to-one..." g2p = ttslab.fromfile(G2P_FILE) #load try: pronundict = PronunciationDictionary() pronundict.fromtextfile(PRONUNDICT_INFN, phmap) except IOError: pronundict = load_simplepronundict(DICT_INFN, phmap) addendum = load_simplepronundict(ADDENDUM_INFN, phmap) #pre-predict from wordlist and add to addendum try: skipwords = set(list(pronundict) + list(addendum)) addendum.update(prepredict(WORDLIST_INFN, g2p, skipwords)) except IOError: