def relative_local_speechrate(reft, targett, s=0.03, realigntimes=False, debug=False):
    """Calculate the relative local speech rate between targett and reft.

    The tracks are DTW aligned, a smoothing spline (smoothing factor
    's') is fitted to the frame time difference function and the first
    derivative of that spline is returned as the rate contour.

    reft, targett -- tracks accepted by dtw_align (their 'times' are
        used here).
    s -- smoothing factor handed to UnivariateSpline.
    realigntimes -- if True, fit the spline against a regenerated time
        axis (frame number times the first timestep of the time
        difference track) instead of that track's own times.
    debug -- if True, plot the raw and the smoothed time differences
        with pylab.

    Returns a new Track whose values have shape (n, 1).
    """
    # Explicit comparison rather than assert/except AssertionError, so
    # the warning is still issued when asserts are stripped (python -O).
    if (targett.times[1] - targett.times[0]) != (reft.times[1] - reft.times[0]):
        print(
            "WARNING: timesteps must be equal.... (this may be spurious if 'remove_pau' was used)"
        )
    # dtw_align returns the alignment path as its last element.
    path = dtw_align(reft, targett)[-1]
    ltd = _local_timediff(path, reft.times, targett.times)
    if realigntimes:
        newreftimes = np.arange(1, len(ltd) + 1) * (ltd.times[1] - ltd.times[0])
    else:
        newreftimes = ltd.times.copy()
    spline = UnivariateSpline(newreftimes, ltd.values, k=5, s=s)
    if debug:
        import pylab as pl
        pl.plot(ltd.times, ltd.values)
        pl.plot(ltd.times, spline(ltd.times))
    rlstrack = Track()
    rlstrack.times = newreftimes
    # Second argument 1 -> evaluate the first derivative of the spline.
    rlstrack.values = spline(rlstrack.times, 1).reshape((-1, 1))
    return rlstrack
def relative_local_x(reft, targett, dtwpath, s=100, debug=False):
    """Determine the rate of change of target relative to ref, using the
    alignment provided by dtwpath.

    For every (ref index, target index) pair in dtwpath the difference
    target - ref is stored at the ref index (a later pair for the same
    ref index overwrites an earlier one; ref frames not on the path
    stay 0). A degree 5 smoothing spline (smoothing factor 's') is
    fitted to these differences over the reference times and its first
    derivative is returned as a Track with values of shape (n, 1).
    """
    reftimes = reft.times.copy()
    diffs = np.zeros((len(reftimes)), dtype=np.float64)
    for ref_idx, tgt_idx in dtwpath:
        diffs[ref_idx] = targett.values[tgt_idx] - reft.values[ref_idx]
    fitted = UnivariateSpline(reftimes, diffs, k=5, s=s)
    if debug:
        import pylab as pl
        # top panel: reference against the target mapped onto ref frames
        pl.subplot(211)
        pl.plot(reftimes, reft.values, label="ref")
        aligned = np.zeros((len(reftimes)), dtype=np.float64)
        for ref_idx, tgt_idx in dtwpath:
            aligned[ref_idx] = targett.values[tgt_idx]
        pl.plot(reftimes, aligned, label="tgt")
        pl.legend()
        # bottom panel: raw difference and its spline fit
        pl.subplot(212)
        pl.plot(reftimes, diffs)
        pl.plot(reftimes, fitted(reftimes))
    result = Track()
    result.times = reftimes
    result.values = fitted(result.times, 1).reshape((-1, 1))
    return result
def get_f0(args):
    """Extract the f0 track of one audio file and save it with ttslab.

    args -- tuple (fn, f0_path, f0min, f0max, tstep, semitones,
        outf0dir); f0_path is unpacked but not used here.

    The output file is named after the part of the input basename
    before its first "." plus TRACK_EXT, and is written to outf0dir.
    """
    audiofn, _f0_path, minf0, maxf0, timestep, use_semitones, outdir = args
    stem = os.path.basename(audiofn).split(".")[0]
    print("PROCESSING: " + stem)
    track = Track()
    track.name = stem
    track.get_f0(audiofn, minf0, maxf0, timestep=timestep, semitones=use_semitones)
    outfn = os.path.join(outdir, stem + "." + TRACK_EXT)
    ttslab.tofile(track, outfn)
def _local_timediff(path, reftimes, targettimes):
    """Determine the local time difference mapped onto reftimes.

    path -- sequence of (ref index, target index) pairs.

    Returns a Track covering the reference times up to and including
    the ref index of the last path entry. Its values are a 1-D array of
    target time minus reference time per ref index (a later pair for
    the same ref index overwrites an earlier one; unvisited indices
    stay 0).
    """
    lastref = path[-1][0]
    mappedtimes = reftimes[:lastref + 1]
    diffs = np.zeros((len(mappedtimes)), dtype=np.float64)
    for ref_idx, tgt_idx in path:
        diffs[ref_idx] = targettimes[tgt_idx] - reftimes[ref_idx]
    difftrack = Track()
    difftrack.times = mappedtimes
    difftrack.values = diffs
    return difftrack
def utt_mceps(utt, shift=0.005, remove_pau=False, resettimes=False):
    """Extract a feature track from the waveform of an utterance.

    The waveform is written to a temporary directory, the SIG2FV shell
    command template is run on it (via os.system) and the resulting
    feature file is loaded into a Track.

    utt -- utterance with a "waveform" entry.
    shift -- frame shift substituted into the SIG2FV command; also the
        step used when times are reset.
    remove_pau -- if True, keep only the frames selected by
        Track.mask_indices for the intervals of segments whose name is
        not "pau" (computed on a deep copy of utt).
    resettimes -- if True (together with remove_pau), replace the times
        of the remaining frames with (1..n) * shift.

    Returns the Track.
    """
    temppath = mkdtemp()
    try:
        #wavs
        wfn1 = os.path.join(temppath, "1." + WAV_EXT)
        utt["waveform"].write(wfn1)
        #feats
        ffn1 = os.path.join(temppath, "1." + FEAT_EXT)
        cmds = SIG2FV % {"inputfile": wfn1, "outputfile": ffn1, "shift": shift}
        os.system(cmds)
        #tracks
        t1 = Track()
        t1.load_track(ffn1)
    finally:
        #cleanup, also when writing/extracting/loading fails
        shutil.rmtree(temppath)

    if remove_pau:
        u = deepcopy(utt)
        fill_startendtimes(u)
        keep_intervals = [(seg["start"], seg["end"])
                          for seg in u.gr("Segment")
                          if seg["name"] != "pau"]
        indices = t1.mask_indices(keep_intervals)
        t1.times = t1.times[indices]
        t1.values = t1.values[indices]
        # NOTE(review): resettimes is treated as part of the remove_pau
        # branch (the layout of the original source was ambiguous) --
        # confirm against callers.
        if resettimes:
            # np.float64 replaces the np.float alias, which no longer
            # exists in current NumPy releases.
            t1.times = np.arange(1, len(t1.times) + 1, dtype=np.float64) * shift
    return t1
def linearpath_distances(track, track2, metric="euclidean", VI=None):
    """Frame-by-frame distances between two tracks along the linear path.

    Frame i of track is compared with frame i of track2 (the diagonal of
    the cdist matrix), for as many frames as both tracks (and
    track.times) provide.

    metric -- metric name passed to scipy's cdist.
    VI -- optional inverse covariance matrix; only forwarded to cdist
        when it is not None.

    Returns a Track with values of shape (n, 1) and the first n times of
    track. A warning is printed if the tracks differ in number of
    frames.
    """
    # Forward VI only when supplied: metrics that do not use it need not
    # accept the keyword (newer SciPy releases may reject unknown
    # keyword arguments for such metrics).
    extra = {} if VI is None else {"VI": VI}
    dist = cdist(track.values, track2.values, metric=str(metric), **extra)
    # Length of the usable diagonal, computed explicitly instead of
    # looping until an IndexError occurs.
    numframes = min(len(track.times), dist.shape[0], dist.shape[1])
    diag = np.arange(numframes)
    t = Track()
    t.values = dist[diag, diag].reshape(-1, 1)
    t.times = np.array(track.times[:numframes])
    if track2.numframes != track.numframes:
        print("linearpath_distances: WARNING: num frames difference is %s" % (track2.numframes - track.numframes))
    return t
def dtw_distances(track, track2, metric="euclidean", VI=None):
    """Frame distances between two tracks along their DTW alignment path.

    track.dtw_align(track2, ...) provides the local distance matrix and
    the path. For every (x, y) on the path the local distance
    dist[x][y] is collected together with the time of frame x of track.

    Returns a Track with values of shape (n, 1), one row per path entry.
    """
    _cumdist, dist, path = track.dtw_align(track2, metric=str(metric), VI=VI)
    distances = [dist[x][y] for x, y in path]
    times = [track.times[x] for x, _y in path]
    result = Track()
    result.values = np.array(distances).reshape(-1, 1)
    result.times = np.array(times)
    return result
def qta_synth_utt(utt, synthfunc=synth):
    """Synthesise an f0 track for a whole utterance, phrase by phrase.

    For each item of the "Phrase" relation, one parameter list
    [start, end, endheight, slope, lambd] is collected per syllable
    (the SylStructure daughters of the phrase's words) and passed,
    together with the phrase start pitch, to synthfunc. The times and
    flattened values of the returned tracks are joined in phrase order.

    Returns a Track with values of shape (n, 1).
    """
    # Start with empty arrays so that the result is also well defined
    # when the utterance has no phrases.
    timechunks = [np.array([])]
    valuechunks = [np.array([])]
    for phrase in utt.gr("Phrase"):
        sylparms = [
            [syl[STARTLAB],
             syl[ENDLAB],
             syl[QTAPREFIX + "_endheight"],
             syl[QTAPREFIX + "_slope"],
             syl[QTAPREFIX + "_lambd"]]
            for word in phrase.get_daughters()
            for syl in word.gir("SylStructure").get_daughters()
        ]
        phrasetrack = synthfunc(phrase[QTAPREFIX + "_startpitch"], sylparms)
        timechunks.append(phrasetrack.times)
        valuechunks.append(phrasetrack.values.flatten())
    f0track = Track()
    f0track.times = np.concatenate(timechunks)
    f0track.values = np.concatenate(valuechunks).reshape((-1, 1))
    return f0track
def gradient(track, h=2):
    """Estimate the gradient of a track over a window of length 'h'.

    'h' must be even; each estimate spans h+1 points and is the
    difference between the two outermost values divided by h timesteps.
    The timestep is taken from the first two times, so timesteps must
    be constant.

    Returns a Track holding the times with h/2 frames trimmed from each
    end and a 1-D values array with the dtype of track.values.
    """
    assert h % 2 == 0
    halfwin = h // 2
    #timesteps must be constant
    step = track.times[1] - track.times[0]
    centretimes = track.times[halfwin:-halfwin].copy()
    slopes = np.zeros(len(centretimes), dtype=track.values.dtype)
    for k in range(len(slopes)):
        rise = track.values[k + h] - track.values[k]
        slopes[k] = rise / (h * step)
    result = Track()
    result.times = centretimes
    result.values = slopes
    return result
def synth(self, voice, utt, args):
    """Synthesise utt in two passes through the HTS engine.

    The first pass yields segment times and an f0 contour (converted to
    semitones), from which a start pitch is set on every phrase. The
    voice's pitch model then produces utt["f0track"], which is
    resampled at the engine's frame times, rescaled and fed back as lf0
    for the second pass. The waveform of the second pass is stored in
    utt["waveform"].

    args -- optional container; if it holds "use_labalignments" the
        engine is asked to use the label alignments.

    Returns the utterance returned by voice.pitchmodel.
    """
    synthparms = args  #not yet implemented...
    #to utf-8 bytestring
    htslabel = "\n".join(utt["hts_label"]).encode("utf-8").splitlines()
    use_labalignments = bool(synthparms and "use_labalignments" in synthparms)
    with HTS_EngineME(self.htsvoice_bin, self.mixfilter_bin, self.pdfilter_bin) as htsengine:
        # first pass: segment times and engine f0
        htsengine.synth(htslabel, use_labalignments=use_labalignments)
        for segt, seg in zip(htsengine.get_segtimes(), utt.gr("Segment")):
            segstart, segend = segt
            seg["start"] = segstart
            seg["end"] = segend
        # semitones; frames where log2 gives -inf are set to 0.0
        f0st = 12.0 * np.log2(htsengine.get_f0())
        f0st[f0st == -np.inf] = 0.0
        f0track = Track()
        f0track.times = np.arange(len(f0st)) * STEPSIZE
        f0track.values = f0st.reshape((-1, 1))
        utt.fill_startendtimes()
        #add qta_startpitch
        for phr in utt.gr("Phrase"):
            syl = phr.first_daughter.gir("SylStructure").first_daughter
            firstframe = f0track.index_at(syl["start"])
            lastframe = f0track.index_at(syl["end"])
            sylvals = f0track.slice(firstframe, lastframe).values.flatten()
            nonzerovals = sylvals[sylvals.nonzero()]
            if len(nonzerovals) > 3:
                # mean over the first quarter of the non-zero values
                startpitch = np.mean(nonzerovals[:len(nonzerovals) // 4])
            else:
                startpitch = BACKOFF_STARTPITCH
            phr["qta_startpitch"] = startpitch
        utt = voice.pitchmodel(utt, ("synth", None))
        modeltrack = utt["f0track"]
        f0spline = InterpolatedUnivariateSpline(modeltrack.times, modeltrack.values)
        newf0 = f0spline(f0track.times)
        #HEURISTIC ADJUSTMENT CLOSER TO HTS DYNAMICS
        enginemean = np.mean(f0track.values[f0track.values.nonzero()])
        newf0 *= 1.3  #more dynamic
        # shift so that the mean equals the mean of the non-zero engine f0
        newf0 += enginemean - np.mean(newf0)
        # (transfer of unvoiced sections from the engine f0 is disabled)
        # semitones back to the linear scale, then to lf0 for the engine
        newf0 = 2**(newf0 / 12.0)
        newf0 = tolf0(newf0)
        # second pass with the new f0
        htsengine.synth(htslabel, lf0=newf0, use_labalignments=use_labalignments)
        #populate utt with waveform and segment alignments
        utt["waveform"] = htsengine.get_wav()
    return utt
def synth(startpitch, synthparms, numpoints=100, plot=False):
    """Synthesise a pitch contour from per-syllable target parameters.

    startpitch -- initial pitch of the first syllable.
    synthparms -- one sequence per syllable; entries 0 and 1 are the
        start and end time, 2 and 3 are passed to get_intercept (3 is
        also the slope of the target line), and 4 is passed to
        sylcontour as its last argument. A syllable with any None entry
        is skipped.
    numpoints -- number of points synthesised per syllable.
    plot -- if True, plot the target line and contour of each syllable
        with 'pl'.

    Every syllable starts from the value, first and second derivative
    at the end of the previously synthesised syllable; the derivatives
    are reset to 0 when the syllable does not start where the previous
    one ended and after a skipped syllable.

    Returns a Track with values of shape (n, 1). Points whose contour
    value is exactly 0 (such as those of skipped syllables) are dropped.
    """
    times = np.zeros(len(synthparms) * numpoints)
    contour = np.zeros(len(synthparms) * numpoints)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        elif synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any([e is None for e in synthparm]):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0], synthparm[1], numpoints, endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0, ddp0, synthparm[4])
        if plot:
            # The target line is given by (m, c); the original code
            # referenced an undefined name 'coefs' here.
            pl.plot(syl_t + synthparm[0],
                    np.polyval([syltarget_m, syltarget_c], syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        spline = InterpolatedUnivariateSpline(syl_t, scontour)
        contour[lo:hi] = scontour
        # state at the last point carries over to the next syllable
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    synthtrack = Track()
    synthtrack.times = times[contour.nonzero()].copy()
    synthtrack.values = contour[contour.nonzero()].reshape((-1, 1)).copy()
    return synthtrack
def make_joincoefs(featconfig, wav_dir):
    """Make joincoefs: normalised mceps with normalised f0 appended.

    Mceps are extracted (extract_mceps) for every wav file in wav_dir
    into a newly created MCEP_DIR below the current working directory.
    Mcep and f0 tracks are then normalised with the mean and standard
    deviation over all files (f0 statistics use non-zero values only),
    the f0 values are appended as extra columns to the mcep values, and
    one file per utterance is written to a newly created JOIN_DIR.

    featconfig -- config object; options are read from its
        "SIG2FV_MCEP" section.
    wav_dir -- directory holding the input wav files.

    os.mkdir raises if MCEP_DIR or JOIN_DIR already exists.
    """
    mcep_dir = os.path.join(os.getcwd(), MCEP_DIR)
    os.mkdir(mcep_dir)
    join_dir = os.path.join(os.getcwd(), JOIN_DIR)
    os.mkdir(join_dir)
    pm_dir = os.path.join(os.getcwd(), PM_DIR)
    f0_dir = os.path.join(os.getcwd(), F0_DIR)

    fbank_order = featconfig.get("SIG2FV_MCEP", "FBANK_ORDER")
    melcep_order = featconfig.get("SIG2FV_MCEP", "MELCEP_ORDER")
    melcep_coefs = featconfig.get("SIG2FV_MCEP", "MELCEP_COEFS")
    preemph_coef = featconfig.get("SIG2FV_MCEP", "PREEMPH_COEF")
    window_factor = featconfig.get("SIG2FV_MCEP", "WINDOW_FACTOR")
    window_type = featconfig.get("SIG2FV_MCEP", "WINDOW_TYPE")

    print("MAKING JOINCOEFS...")
    # Explicit loop: a bare map() is lazy on Python 3 and would never
    # call extract_mceps.
    wavfilenames = sorted(glob(os.path.join(wav_dir, ".".join(["*", WAV_EXT]))))
    for wavfilename in wavfilenames:
        extract_mceps((wavfilename, fbank_order, window_factor,
                       preemph_coef, melcep_order, window_type,
                       melcep_coefs, mcep_dir, pm_dir))

    print("NORMALISING AND JOINING F0 AND MCEPS...")
    #Normalising mceps and f0s:
    upper = +1.0
    lower = -1.0

    mceptracks = {}
    for fn in glob(os.path.join(mcep_dir, ".".join(["*", MCEP_EXT]))):
        t = Track()
        t.load_track(fn)
        mceptracks[os.path.basename(fn)] = t
    allmcepvecs = np.concatenate([mceptracks[tn].values for tn in sorted(mceptracks)])
    mcepmean = allmcepvecs.mean(0)
    mcepstd = allmcepvecs.std(0)
    for k in mceptracks:
        mceptracks[k].values = (mceptracks[k].values - mcepmean) / (4 * mcepstd) * (upper - lower)

    f0tracks = {}
    for fn in glob(os.path.join(f0_dir, ".".join(["*", F0_EXT]))):
        t = Track()
        t.load_track(fn)
        f0tracks[os.path.basename(fn)] = t
    # statistics over non-zero f0 values only
    allf0vecs = np.concatenate([f0tracks[tn].values[f0tracks[tn].values.nonzero()]
                                for tn in sorted(f0tracks)])
    f0mean = allf0vecs.mean(0)
    f0std = allf0vecs.std(0)
    for k in f0tracks:
        f0tracks[k].values = (f0tracks[k].values - f0mean) / (4 * f0std) * (upper - lower)

    #Add f0 to mcep track:
    # Tracks are paired by sorted file name, so both directories are
    # expected to hold the same utterances.
    for k1, k2 in zip(sorted(mceptracks), sorted(f0tracks)):
        mceptracks[k1].values = np.concatenate((mceptracks[k1].values, f0tracks[k2].values), 1)

    for fn in mceptracks:
        basename = os.path.splitext(os.path.basename(fn))[0]
        ttslab.tofile(mceptracks[fn], os.path.join(join_dir, basename + "." + JOIN_EXT))
def timenorm_tonecontour_dtw(reftrack, track):
    """Time-normalise 'track' onto the time axis of 'reftrack' using DTW.

    Both contours are mean-normalised (on deep copies) and the copy of
    'track' is smoothed with a spline before alignment. Each reference
    frame then receives the mean of the original 'track' values that
    the DTW path maps onto it, and the result is smoothed with
    newtrack_from_sspline (s = number of frames / 10).

    Returns the smoothed Track on the reference times.
    """
    #normalise location:
    tmean = deepcopy(reftrack)
    tmean.values = tmean.values - tmean.values.mean()
    #normalise location:
    t1 = deepcopy(track)
    t1.values = t1.values - t1.values.mean()
    #smooth function to facilitate smoother warping:
    s = UnivariateSpline(t1.times, t1.values)
    t1.values = s(t1.times).reshape((-1, 1))
    #align:
    dtw = dtw_align(tmean, t1)
    # Group the aligned track indices per reference frame in one pass
    # over the path, instead of rescanning the path for every frame.
    aligned = {}
    for e in dtw[2]:
        aligned.setdefault(e[0], []).append(e[1])
    #construct new track using mapping:
    newtrack = Track()
    newtrack.times = np.copy(tmean.times)
    values = np.zeros(len(tmean), np.float64)
    for i in range(len(tmean)):
        values[i] = np.mean(track.values[aligned.get(i, [])])
    newtrack.values = values.reshape((-1, 1))
    smoothtrack = newtrack.newtrack_from_sspline(newtrack.times, s=len(newtrack.times) / 10.0)
    return smoothtrack
def utt_distance(utt, utt2, method="dtw", metric="euclidean", sig2fv=SIG2FV, VI=None):
    """Uses Trackfile class' distance measurements to compare utts...

    See docstring in tfuncs_analysis.py for more details...

    The waveforms of both utterances are written to a temporary
    directory, features are extracted by running the 'sig2fv' shell
    command template (with keys "inputfile" and "outputfile") and the
    resulting tracks are compared with Track.distances.

    method, metric, VI -- passed on to Track.distances.
    sig2fv -- command template used for feature extraction.

    Returns the Track produced by Track.distances.
    """
    temppath = mkdtemp()
    try:
        #wavs
        wfn1 = os.path.join(temppath, "1." + WAV_EXT)
        wfn2 = os.path.join(temppath, "2." + WAV_EXT)
        utt["waveform"].write(wfn1)
        utt2["waveform"].write(wfn2)
        #feats
        ffn1 = os.path.join(temppath, "1." + FEAT_EXT)
        ffn2 = os.path.join(temppath, "2." + FEAT_EXT)
        # Honour the 'sig2fv' argument; the module-level template was
        # used unconditionally before.
        for wfn, ffn in ((wfn1, ffn1), (wfn2, ffn2)):
            cmds = sig2fv % {"inputfile": wfn, "outputfile": ffn}
            os.system(cmds)
        #tracks
        t1 = Track()
        t1.load_track(ffn1)
        t2 = Track()
        t2.load_track(ffn2)
        #compare
        t3 = t1.distances(t2, method=method, metric=metric, VI=VI)
    finally:
        # remove the temporary files also when a step above fails
        shutil.rmtree(temppath)
    return t3
def add_feats_to_utt(args):
    """Attach join coefficients, LPC coefficients and residuals to the
    units of an utterance.

    args -- tuple (u, lpc_dir, joincoef_dir, f0_dir). Feature files are
        located in these directories via u["file_id"].

    Units take their start/end times from the words they are paired
    with (names must match). For every unit the join coefficient
    vectors at its left and right boundary, the slice of the LPC track
    between the boundaries and the matching residual samples are
    stored on the unit.

    Returns the utterance.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    for unit, word in zip(u.gr("Unit"), u.gr("Word")):
        assert unit["name"] == word["name"]
        unit["start"] = word["start"]
        unit["end"] = word["end"]

    lpctrack = Track()
    lpctrack.load_track(os.path.join(lpc_dir, file_id) + "." + LPC_EXT)
    restrack = Track()
    restrack.load_wave(os.path.join(lpc_dir, file_id) + "." + RES_EXT)
    jointrack = ttslab.fromfile(os.path.join(joincoef_dir, file_id) + "." + JOIN_EXT)
    f0track = Track()
    f0track.load_track(os.path.join(f0_dir, file_id) + "." + F0_EXT)

    #get boundarytimes: start of the first unit, then every unit end
    boundarytimes = []
    for unit in u.gr("Unit"):
        if not boundarytimes:
            boundarytimes.append(unit["start"])
        boundarytimes.append(unit["end"])

    #convert boundtimes into sample ranges:
    lpcsampleranges = [lpctrack.index_at(bound) for bound in boundarytimes]
    f0sampleranges = [f0track.index_at(bound) for bound in boundarytimes]
    joinsamples = [jointrack.values[jointrack.index_at(bound)] for bound in boundarytimes]

    #get pitchperiods at lpc indices (time elapsed since the previous
    #lpc frame, with 0.0 before the first frame)
    lpctimes = np.concatenate(([0.0], lpctrack.times))
    pitchperiod = np.diff(lpctimes)

    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    perunit = zip(units,
                  joinsamples[:-1], joinsamples[1:],
                  lpcsampleranges[:-1], lpcsampleranges[1:],
                  f0sampleranges[:-1], f0sampleranges[1:])
    for unit, jc0, jc1, lti0, lti1, _fti0, _fti1 in perunit:
        unit["left-joincoef"] = jc0
        unit["right-joincoef"] = jc1
        unit["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True)  #like python indexing/slicing
        if lti0 == 0:
            unit["lpc-coefs"].starttime = 0.0
        else:
            unit["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        unit["lpc-coefs"].zero_starttime()
        #For windowfactor=2 (save only samples and assume 16kHz)
        # the window is widened on both sides by the period at lti0
        period = pitchperiod[lti0]
        resstart = restrack.index_at(lpctrack.times[lti0] - period)
        resend = restrack.index_at(lpctrack.times[lti1] + period)
        unit["residuals"] = restrack.slice(resstart, resend).values
    return u
def synth2(startpitch, synthparms, numpoints=100, plot=False, minlambd=10.0, dlambd=5.0):
    """Synthesise a pitch contour, limiting the strength of articulation
    to avoid acceleration in the opposite direction of the endheight
    target.

    startpitch -- initial pitch of the first syllable.
    synthparms -- one mutable sequence per syllable; entries 0 and 1 are
        the start and end time, 2 and 3 are passed to get_intercept (3
        is also the slope of the target line), and 4 is the strength
        passed to sylcontour. A syllable with any None entry is
        skipped. NOTE: entry 4 is lowered in place when the constraint
        requires it, so the caller's lists are modified.
    numpoints -- number of points synthesised per syllable.
    plot -- if True, plot the target line and contour of each syllable
        with 'pl'.
    minlambd -- lower limit for the strength.
    dlambd -- amount by which the strength is lowered per attempt.

    Returns a Track with values of shape (n, 1). Points whose contour
    value is exactly 0 (such as those of skipped syllables) are dropped.
    """
    times = np.zeros(len(synthparms) * numpoints)
    contour = np.zeros(len(synthparms) * numpoints)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        elif synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any([e is None for e in synthparm]):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0], synthparm[1], numpoints, endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        while True:
            #resynthesise with lower strength until constraint met
            scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0, ddp0, synthparm[4])
            spline = InterpolatedUnivariateSpline(syl_t, scontour)
            #check acceleration
            if synthparm[4] <= minlambd:
                break
            accels = spline(syl_t, 2)
            if synthparm[2] > p0:
                if np.all(accels > 0.0):
                    break
            elif synthparm[2] < p0:
                if np.all(accels < 0.0):
                    break
            else:
                break
            synthparm[4] -= dlambd
            if synthparm[4] < minlambd:
                synthparm[4] = minlambd
        if plot:
            # The target line is given by (m, c); the original code
            # referenced an undefined name 'coefs' here.
            pl.plot(syl_t + synthparm[0],
                    np.polyval([syltarget_m, syltarget_c], syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        contour[lo:hi] = scontour
        # state at the last point carries over to the next syllable
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    synthtrack = Track()
    synthtrack.times = times[contour.nonzero()].copy()
    synthtrack.values = contour[contour.nonzero()].reshape((-1, 1)).copy()
    return synthtrack
def draw_sylstruct_graph_pitch_waveform(u):
    """Draw the syllable structure, pitch and waveform of an utterance.

    Three figures are created: a graph of the words, syllables and
    segments of the "SylStructure" relation (each node placed at the
    temporal midpoint of its item, one row per level), the f0 contour
    in semitones, and the decimated waveform. The pitch and waveform
    plots have one x tick per syllable end.

    Returns (fig1, fig2, fig3).
    """
    #use seg end times to calculate start and end times for all
    #items...
    u.fill_startendtimes()
    g = nx.Graph()
    posdict = {}
    nodelist = []
    nodesizelist = []

    def add_node(item, size, level):
        # Register the item as a node at its temporal midpoint and link
        # it to its neighbours. (The original expression
        # end + start / 2 was not the midpoint due to operator
        # precedence.)
        nodelist.append(item)
        nodesizelist.append(size)
        posdict[item] = [(item["start"] + item["end"]) / 2.0, level]
        if item.prev_item:
            g.add_edge(item.prev_item, item)
        if item.next_item:
            g.add_edge(item.next_item, item)

    for word in u.get_relation("SylStructure"):
        add_node(word, 300 * len(str(word)), 3)
        g.add_edge(word.first_daughter, word)
        g.add_edge(word.last_daughter, word)
        for syl in word.get_daughters():
            add_node(syl, 400, 2)
            g.add_edge(syl.first_daughter, syl)
            g.add_edge(syl.last_daughter, syl)
            for seg in syl.get_daughters():
                add_node(seg, 350, 1)

    #get the pitch:
    d = mkdtemp()
    try:
        wavfn = os.path.join(d, "utt.wav")
        u["waveform"].write(wavfn)
        f0t = Track()
        f0t.get_f0(wavfn, semitones=True)
    finally:
        # remove the temporary directory also if extraction fails
        shutil.rmtree(d)

    sylends = [syl["end"] for syl in u.gr("Syllable")]
    syllabels = [getsylsegstr(syl) for syl in u.gr("Syllable")]

    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    ax.set_title("Utterance")
    nx.draw(g, pos=posdict, ax=ax, nodelist=nodelist, node_size=nodesizelist)
    plt.xticks([], [])
    plt.yticks([1.0, 2.0, 3.0], ["segment", "syllable", "word"])

    fig2 = plt.figure()
    ax1 = fig2.add_subplot(111)
    ax1.set_title("Pitch")
    ax1.set_ylabel("Semitones (relative to 1 Hz)")
    ax1.set_xlabel("Syllables")
    plt.plot(f0t.times, f0t.values, color='green')
    ax1.set_ylim(bottom=75.0)
    plt.xticks(sylends, syllabels)
    ax1.grid()

    fig3 = plt.figure()
    ax2 = fig3.add_subplot(111)
    decimate_factor = 10
    ax2.set_title("Waveform (decimation factor: %s)" % decimate_factor)
    ax2.set_ylabel("Amplitude")
    ax2.set_xlabel("Syllables")
    waveform = ss.decimate(u["waveform"].samples, decimate_factor)
    # time axis in seconds for the decimated samples
    plt.plot(np.arange(len(waveform)) * (1.0 / u["waveform"].samplerate * decimate_factor),
             waveform,
             color='b')
    plt.xticks(sylends, syllabels)
    ax2.grid()
    #plt.show()
    return fig1, fig2, fig3
def draw_sylstruct_graph_pitch_waveform(u):
    """Draw the syllable structure, pitch and waveform of an utterance
    as three stacked subplots of one figure.

    The top plot is a graph of the words, syllables and segments of the
    "SylStructure" relation (each node placed at the temporal midpoint
    of its item, one row per level), the middle plot is the f0 contour
    with one x tick per syllable end labelled with the syllable's
    "tone", and the bottom plot is the waveform with ticks at the word
    ends.

    Returns the figure.
    """
    #use seg end times to calculate start and end times for all
    #items...
    u.fill_startendtimes()
    g = nx.Graph()
    posdict = {}
    nodelist = []
    nodesizelist = []

    def add_node(item, size, level):
        # Register the item as a node at its temporal midpoint and link
        # it to its neighbours. (The original expression
        # end + start / 2 was not the midpoint due to operator
        # precedence.)
        nodelist.append(item)
        nodesizelist.append(size)
        posdict[item] = [(item["start"] + item["end"]) / 2.0, level]
        if item.prev_item:
            g.add_edge(item.prev_item, item)
        if item.next_item:
            g.add_edge(item.next_item, item)

    for word in u.get_relation("SylStructure"):
        add_node(word, 300 * len(str(word)), 3)
        g.add_edge(word.first_daughter, word)
        g.add_edge(word.last_daughter, word)
        for syl in word.get_daughters():
            add_node(syl, 400, 2)
            g.add_edge(syl.first_daughter, syl)
            g.add_edge(syl.last_daughter, syl)
            for seg in syl.get_daughters():
                add_node(seg, 350, 1)

    uttendtime = u.get_relation("Segment").tail_item["end"]
    bounds = np.array([word["end"] for word in u.get_relation("Word")])

    #get the pitch:
    d = mkdtemp()
    try:
        wavfn = os.path.join(d, "utt.wav")
        u["waveform"].write(wavfn)
        f0t = Track()
        f0t.get_f0(wavfn)
    finally:
        # remove the temporary directory also if extraction fails
        shutil.rmtree(d)

    fig = pl.figure()
    ax = fig.add_subplot(311)
    ax.set_title("Utterance")
    nx.draw(g, pos=posdict, ax=ax, nodelist=nodelist, node_size=nodesizelist)

    ax1 = fig.add_subplot(312)
    ax1.set_title("Pitch")
    ax1.set_ylim(20.0, 300.0)
    ax1.set_ylabel("Hertz")
    pl.plot(f0t.times, f0t.values, color='green')
    pl.xticks([syl["end"] for syl in u.gr("Syllable")],
              [syl["tone"] for syl in u.gr("Syllable")])
    ax1.grid()

    ax2 = fig.add_subplot(313)
    ax2.set_title("Waveform")
    # x axis of the waveform plot is in samples
    ax2.set_xlim(0, uttendtime * u["waveform"].samplerate)
    pl.plot(u["waveform"].samples, color='b')
    ax2.set_xticks(bounds * u["waveform"].samplerate, [''] * len(bounds))
    fig.set_facecolor((0.921568627451, 0.921568627451, 0.921568627451))
    # pl.show()
    return fig
def add_feats_to_utt(args):
    """Attach join coefficients, LPC coefficients and residuals to the
    units of an utterance.

    args -- tuple (u, lpc_dir, joincoef_dir, f0_dir). Feature files are
        located in these directories via u["file_id"].

    Units take their start/end times from the words they are paired
    with (names must match). For every unit the join coefficient
    vectors at its left and right boundary, the slice of the LPC track
    between the boundaries and the matching residual samples are
    stored on the unit.

    Returns the utterance.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()
    for unit, word in zip(u.gr("Unit"), u.gr("Word")):
        assert unit["name"] == word["name"]
        unit["start"] = word["start"]
        unit["end"] = word["end"]

    lpctrack = Track()
    lpctrack.load_track(os.path.join(lpc_dir, file_id) + "." + LPC_EXT)
    restrack = Track()
    restrack.load_wave(os.path.join(lpc_dir, file_id) + "." + RES_EXT)
    jointrack = ttslab.fromfile(os.path.join(joincoef_dir, file_id) + "." + JOIN_EXT)
    f0track = Track()
    f0track.load_track(os.path.join(f0_dir, file_id) + "." + F0_EXT)

    #get boundarytimes: start of the first unit, then every unit end
    boundarytimes = []
    for unit in u.gr("Unit"):
        if not boundarytimes:
            boundarytimes.append(unit["start"])
        boundarytimes.append(unit["end"])

    #convert boundtimes into sample ranges:
    lpcsampleranges = [lpctrack.index_at(bound) for bound in boundarytimes]
    f0sampleranges = [f0track.index_at(bound) for bound in boundarytimes]
    joinsamples = [jointrack.values[jointrack.index_at(bound)] for bound in boundarytimes]

    #get pitchperiods at lpc indices (time elapsed since the previous
    #lpc frame, with 0.0 before the first frame)
    lpctimes = np.concatenate(([0.0], lpctrack.times))
    pitchperiod = np.diff(lpctimes)

    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    perunit = zip(units,
                  joinsamples[:-1], joinsamples[1:],
                  lpcsampleranges[:-1], lpcsampleranges[1:],
                  f0sampleranges[:-1], f0sampleranges[1:])
    for unit, jc0, jc1, lti0, lti1, _fti0, _fti1 in perunit:
        unit["left-joincoef"] = jc0
        unit["right-joincoef"] = jc1
        unit["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True)  #like python indexing/slicing
        if lti0 == 0:
            unit["lpc-coefs"].starttime = 0.0
        else:
            unit["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        unit["lpc-coefs"].zero_starttime()
        #For windowfactor=2 (save only samples and assume 16kHz)
        # the window is widened on both sides by the period at lti0
        period = pitchperiod[lti0]
        resstart = restrack.index_at(lpctrack.times[lti0] - period)
        resend = restrack.index_at(lpctrack.times[lti1] + period)
        unit["residuals"] = restrack.slice(resstart, resend).values
    return u
import sys
import array
import math

import numpy as np

import ttslab
from ttslab.trackfile import Track
ttslab.extend(Track, "ttslab.trackfile.funcs.tfuncs_praat")


def friendly_log(f):
    """Return the natural log of f, or -1e10 where math.log raises
    ValueError (zero or negative input).
    """
    try:
        return math.log(f)
    except ValueError:
        return float('-1e+10')


if __name__ == "__main__":
    # usage: <script> INFILE OUTFILE MINF0 MAXF0
    fn = sys.argv[1]
    outfn = sys.argv[2]
    minf0 = float(sys.argv[3])
    maxf0 = float(sys.argv[4])

    t = Track()
    #timestep hardcoded here because of hack below...
    t.get_f0(fn, minpitch=minf0, maxpitch=maxf0, timestep=0.005, fixocterrs=True)
    #hack aligns samples with equiv from HTS script:
    # two zero frames are added at each end
    pad = np.array([0.0, 0.0]).reshape(-1, 1)
    f0hzvalues = np.concatenate([pad, t.values, pad])
    # str("f") is a valid typecode on Python 2 and 3 (a bytes typecode
    # is rejected on Python 3); flatten() hands scalars rather than
    # 1-element arrays to math.log.
    lf0 = array.array(str("f"), map(friendly_log, f0hzvalues.flatten()))
    with open(outfn, "wb") as outfh:
        lf0.tofile(outfh)
def add_feats_to_utt(args):
    """Attach join coefficients, LPC coefficients, durations and
    residuals to the (halfphone) units of an utterance.

    args -- tuple (u, lpc_dir, joincoef_dir, f0_dir). Feature files are
        located in these directories via u["file_id"].

    Every segment is split in two at its "cl_end" time if present,
    otherwise at its midpoint. The first half of the first segment and
    the second half of the last segment are pruned, so the remaining
    halves must correspond one-to-one to the items of the "Unit"
    relation.

    Returns the utterance.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()

    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))

    #get boundarytimes: [start, split, end] per segment
    boundarytimes = []
    durations = []
    starttime = 0.0
    for seg in u.get_relation("Segment"):
        endtime = float(seg["end"])
        if "cl_end" in seg:
            splittime = float(seg["cl_end"])
        else:
            splittime = (endtime + starttime) / 2
        #TODO: should still add 25% split if diphthong...
        boundarytimes.append([starttime, splittime, endtime])
        durations.extend([splittime - starttime, endtime - splittime])
        starttime = endtime

    #convert boundtimes into sample ranges (and flatten):
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    #DEMITASSE: pruning pau halfphones (first and last half):
    durations = durations[1:-1]
    for i, bounds in enumerate(boundarytimes):
        # The first segment only contributes its split point; all
        # others contribute their start and split point.
        boundtimes = bounds[1:2] if i == 0 else bounds[0:2]
        for bound in boundtimes:
            lpcsampleranges.append(lpctrack.index_at(bound))
            f0sampleranges.append(f0track.index_at(bound))
            # Look the join coefficients up by frame index; the
            # original indexed the values directly with the float time.
            joinsamples.append(jointrack.values[jointrack.index_at(bound)])

    #get pitchperiods at lpc indices (time elapsed since the previous
    #lpc frame, with 0.0 before the first frame)
    lpctimes = np.concatenate(([0.0], lpctrack.times))
    pitchperiod = np.diff(lpctimes)

    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    for jc0, jc1, lti0, lti1, fti0, fti1, dur, i in zip(joinsamples[:-1], joinsamples[1:],
                                                        lpcsampleranges[:-1], lpcsampleranges[1:],
                                                        f0sampleranges[:-1], f0sampleranges[1:],
                                                        durations,
                                                        units):
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True)  #like python indexing/slicing
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        i["dur"] = dur
        #For windowfactor=2 (save only samples and assume 16kHz)
        # the window is widened on both sides by the period at lti0
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
def make_joincoefs(featconfig, wav_dir):
    """Make joincoefs: normalised mceps with normalised f0 appended.

    Mceps are extracted (extract_mceps) for every wav file in wav_dir
    into a newly created MCEP_DIR below the current working directory.
    Mcep and f0 tracks are then normalised with the mean and standard
    deviation over all files (f0 statistics use non-zero values only),
    the f0 values are appended as extra columns to the mcep values, and
    one file per utterance is written to a newly created JOIN_DIR.

    featconfig -- config object; options are read from its
        "SIG2FV_MCEP" section.
    wav_dir -- directory holding the input wav files.

    os.mkdir raises if MCEP_DIR or JOIN_DIR already exists.
    """
    mcep_dir = os.path.join(os.getcwd(), MCEP_DIR)
    os.mkdir(mcep_dir)
    join_dir = os.path.join(os.getcwd(), JOIN_DIR)
    os.mkdir(join_dir)
    pm_dir = os.path.join(os.getcwd(), PM_DIR)
    f0_dir = os.path.join(os.getcwd(), F0_DIR)

    fbank_order = featconfig.get("SIG2FV_MCEP", "FBANK_ORDER")
    melcep_order = featconfig.get("SIG2FV_MCEP", "MELCEP_ORDER")
    melcep_coefs = featconfig.get("SIG2FV_MCEP", "MELCEP_COEFS")
    preemph_coef = featconfig.get("SIG2FV_MCEP", "PREEMPH_COEF")
    window_factor = featconfig.get("SIG2FV_MCEP", "WINDOW_FACTOR")
    window_type = featconfig.get("SIG2FV_MCEP", "WINDOW_TYPE")

    print("MAKING JOINCOEFS...")
    # Explicit loop: a bare map() is lazy on Python 3 and would never
    # call extract_mceps.
    wavfilenames = sorted(glob(os.path.join(wav_dir, ".".join(["*", WAV_EXT]))))
    for wavfilename in wavfilenames:
        extract_mceps((wavfilename, fbank_order, window_factor,
                       preemph_coef, melcep_order, window_type,
                       melcep_coefs, mcep_dir, pm_dir))

    print("NORMALISING AND JOINING F0 AND MCEPS...")
    #Normalising mceps and f0s:
    upper = +1.0
    lower = -1.0

    mceptracks = {}
    for fn in glob(os.path.join(mcep_dir, ".".join(["*", MCEP_EXT]))):
        t = Track()
        t.load_track(fn)
        mceptracks[os.path.basename(fn)] = t
    allmcepvecs = np.concatenate([mceptracks[tn].values for tn in sorted(mceptracks)])
    mcepmean = allmcepvecs.mean(0)
    mcepstd = allmcepvecs.std(0)
    for k in mceptracks:
        mceptracks[k].values = (mceptracks[k].values - mcepmean) / (4 * mcepstd) * (upper - lower)

    f0tracks = {}
    for fn in glob(os.path.join(f0_dir, ".".join(["*", F0_EXT]))):
        t = Track()
        t.load_track(fn)
        f0tracks[os.path.basename(fn)] = t
    # statistics over non-zero f0 values only
    allf0vecs = np.concatenate([f0tracks[tn].values[f0tracks[tn].values.nonzero()]
                                for tn in sorted(f0tracks)])
    f0mean = allf0vecs.mean(0)
    f0std = allf0vecs.std(0)
    for k in f0tracks:
        f0tracks[k].values = (f0tracks[k].values - f0mean) / (4 * f0std) * (upper - lower)

    #Add f0 to mcep track:
    # Tracks are paired by sorted file name, so both directories are
    # expected to hold the same utterances.
    for k1, k2 in zip(sorted(mceptracks), sorted(f0tracks)):
        mceptracks[k1].values = np.concatenate((mceptracks[k1].values, f0tracks[k2].values), 1)

    for fn in mceptracks:
        basename = os.path.splitext(os.path.basename(fn))[0]
        ttslab.tofile(mceptracks[fn], os.path.join(join_dir, basename + "." + JOIN_EXT))
def add_feats_to_utt(args):
    """Attach join coefficients, LPC coefficients, durations and
    residuals to the (halfphone) units of an utterance.

    args -- tuple (u, lpc_dir, joincoef_dir, f0_dir). Feature files are
        located in these directories via u["file_id"].

    Every segment is split in two at its "cl_end" time if present,
    otherwise at its midpoint. The first half of the first segment and
    the second half of the last segment are pruned, so the remaining
    halves must correspond one-to-one to the items of the "Unit"
    relation.

    Returns the utterance.
    """
    u, lpc_dir, joincoef_dir, f0_dir = args
    file_id = u["file_id"]
    print("Processing:", file_id)
    u.fill_startendtimes()

    lpctrack = Track()
    lpctrack.load_track(".".join([os.path.join(lpc_dir, file_id), LPC_EXT]))
    restrack = Track()
    restrack.load_wave(".".join([os.path.join(lpc_dir, file_id), RES_EXT]))
    jointrack = ttslab.fromfile(".".join([os.path.join(joincoef_dir, file_id), JOIN_EXT]))
    f0track = Track()
    f0track.load_track(".".join([os.path.join(f0_dir, file_id), F0_EXT]))

    #get boundarytimes: [start, split, end] per segment
    boundarytimes = []
    durations = []
    starttime = 0.0
    for seg in u.get_relation("Segment"):
        endtime = float(seg["end"])
        if "cl_end" in seg:
            splittime = float(seg["cl_end"])
        else:
            splittime = (endtime + starttime) / 2
        #TODO: should still add 25% split if diphthong...
        boundarytimes.append([starttime, splittime, endtime])
        durations.extend([splittime - starttime, endtime - splittime])
        starttime = endtime

    #convert boundtimes into sample ranges (and flatten):
    lpcsampleranges = []
    f0sampleranges = []
    joinsamples = []
    #DEMITASSE: pruning pau halfphones (first and last half):
    durations = durations[1:-1]
    for i, bounds in enumerate(boundarytimes):
        # The first segment only contributes its split point; all
        # others contribute their start and split point.
        boundtimes = bounds[1:2] if i == 0 else bounds[0:2]
        for bound in boundtimes:
            lpcsampleranges.append(lpctrack.index_at(bound))
            f0sampleranges.append(f0track.index_at(bound))
            # Look the join coefficients up by frame index; the
            # original indexed the values directly with the float time.
            joinsamples.append(jointrack.values[jointrack.index_at(bound)])

    #get pitchperiods at lpc indices (time elapsed since the previous
    #lpc frame, with 0.0 before the first frame)
    lpctimes = np.concatenate(([0.0], lpctrack.times))
    pitchperiod = np.diff(lpctimes)

    units = u.get_relation("Unit").as_list()
    assert len(units) == len(lpcsampleranges) - 1
    for jc0, jc1, lti0, lti1, fti0, fti1, dur, i in zip(joinsamples[:-1], joinsamples[1:],
                                                        lpcsampleranges[:-1], lpcsampleranges[1:],
                                                        f0sampleranges[:-1], f0sampleranges[1:],
                                                        durations,
                                                        units):
        i["left-joincoef"] = jc0
        i["right-joincoef"] = jc1
        i["lpc-coefs"] = lpctrack.slice(lti0, lti1, copy=True)  #like python indexing/slicing
        if lti0 == 0:
            i["lpc-coefs"].starttime = 0.0
        else:
            i["lpc-coefs"].starttime = lpctrack.times[lti0 - 1]
        i["lpc-coefs"].zero_starttime()
        i["dur"] = dur
        #For windowfactor=2 (save only samples and assume 16kHz)
        # the window is widened on both sides by the period at lti0
        i["residuals"] = restrack.slice(restrack.index_at(lpctrack.times[lti0] - pitchperiod[lti0]),
                                        restrack.index_at(lpctrack.times[lti1] + pitchperiod[lti0])).values
    return u
def hts_synth(self, utt, processname):
    """Synthesise utt by running the external HTS engine binary.

    The engine parameters (self.engine_parms, overridden by
    utt["htsparms"] if present) are turned into a command line, the
    labels in utt["hts_label"] are written to a temporary file and the
    command is run via os.system. Afterwards segment end times, the
    waveform and the f0 contour are loaded into utt:

      seg["end"]      -- from the label file written by the engine
      utt["waveform"] -- Waveform loaded from the synthesised wav file
      utt["f0"]       -- Track (0.005 step); non-zero values are in
                         semitones relative to 1Hz

    processname is not used. Returns utt.
    """
    htsparms = self.engine_parms.copy()
    htsparms["-of"] = "%(tempolf0_file)s"
    if "htsparms" in utt:
        htsparms.update(utt["htsparms"])  #parm overrides for this utt...
    #build command string and execute:
    cmds = self.hts_bin
    for k in htsparms:
        if htsparms[k]:
            if htsparms[k] is True:
                # bare flag
                cmds += " " + k
            else:
                cmds += " " + k + " " + str(htsparms[k])
    cmds += " %(tempilab_file)s"

    tempfiles = []
    try:
        # wav, input label, output label and output lf0 file, in that
        # order. Descriptors are closed right away: the files are only
        # accessed by name from here on.
        for suffix in (".wav", "", "", ""):
            fd, path = mkstemp(prefix="ttslab_", suffix=suffix)
            tempfiles.append(path)
            os.close(fd)
        tempwav_file, tempilab_file, tempolab_file, tempolf0_file = tempfiles

        cmds = cmds % {'models_dir': self.models_dir,
                       'tempwav_file': tempwav_file,
                       'tempilab_file': tempilab_file,
                       'tempolab_file': tempolab_file,
                       'tempolf0_file': tempolf0_file}
        with codecs.open(tempilab_file, "w", encoding="utf-8") as outfh:
            outfh.write("\n".join(utt["hts_label"]))
        os.system(cmds)

        #load seg endtimes into utt:
        with open(tempolab_file) as infh:
            lines = infh.readlines()
        segs = utt.get_relation("Segment").as_list()
        assert len(segs) == len(lines)
        for line, seg in zip(lines, segs):
            seg["end"] = hts_labels_tone.htk_int_to_float(line.split()[1])

        #load audio:
        utt["waveform"] = Waveform(tempwav_file)

        #load lf0 and convert to hertz:
        f0 = np.exp(np.fromfile(tempolf0_file, "float32"))
        #to semitones relative to 1Hz:
        # 12 * log2 (F0 / F0reference) where F0reference = 1
        nonzero = f0.nonzero()
        f0[nonzero] = 12.0 * np.log2(f0[nonzero])
        f0t = Track()
        f0t.values = f0
        f0t.times = np.arange(len(f0), dtype=np.float64) * 0.005
        utt["f0"] = f0t
    finally:
        #cleanup tempfiles, also when a step above fails:
        for path in tempfiles:
            if os.path.exists(path):
                os.remove(path)
    return utt