def utt_mceps(utt, shift=0.005, remove_pau=False, resettimes=False):
    """Extract a feature track from the waveform of `utt`.

    The waveform stored in ``utt["waveform"]`` is written to a temporary
    directory, the ``SIG2FV`` command template is run on it through the
    shell, and the resulting feature file is loaded into a ``Track``
    (presumably mel-cepstra, going by the function name).

    Parameters:
        utt: utterance providing a "waveform" item and, when `remove_pau`
            is set, a "Segment" relation with "name" items.
        shift: value substituted for "shift" in ``SIG2FV``; also the time
            step used when `resettimes` is set.
        remove_pau: keep only the frames that fall inside segments whose
            name is not "pau".
        resettimes: replace the frame times by ``shift, 2*shift, ...``.

    Returns:
        The loaded (and optionally masked / re-timed) ``Track``.
    """
    temppath = mkdtemp()
    try:
        #wavs
        wfn1 = os.path.join(temppath, "1." + WAV_EXT)
        utt["waveform"].write(wfn1)
        #feats
        ffn1 = os.path.join(temppath, "1." + FEAT_EXT)
        cmds = SIG2FV % {"inputfile": wfn1, "outputfile": ffn1, "shift": shift}
        os.system(cmds)
        #tracks
        t1 = Track()
        t1.load_track(ffn1)
    finally:
        #cleanup, also when writing, extraction or loading fails
        shutil.rmtree(temppath)
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; masking is assumed to belong to the
    # `remove_pau` branch and `resettimes` to be independent -- confirm.
    if remove_pau:
        #work on a copy so that filling in times does not touch `utt`
        u = deepcopy(utt)
        fill_startendtimes(u)
        keep_intervals = [(seg["start"], seg["end"])
                          for seg in u.gr("Segment")
                          if seg["name"] != "pau"]
        indices = t1.mask_indices(keep_intervals)
        t1.times = t1.times[indices]
        t1.values = t1.values[indices]
    if resettimes:
        #np.float64 instead of the removed `np.float` alias
        t1.times = np.arange(1, len(t1.times) + 1, dtype=np.float64) * shift
    return t1
def relative_local_x(reft, targett, dtwpath, s=100, debug=False):
    """Rate of change of `targett` relative to `reft` along `dtwpath`.

    For every (ref index, target index) pair in `dtwpath` the value
    difference target - ref is stored at the ref index. A degree 5
    smoothing spline (smoothing factor `s`) is fitted to these
    differences over the reference times, and its first derivative at
    the reference times is returned.

    Parameters:
        reft: reference track (``times`` and ``values`` are used).
        targett: target track (``values`` is used).
        dtwpath: iterable of (ref index, target index) pairs.
        s: smoothing factor passed to ``UnivariateSpline``.
        debug: plot the aligned tracks, the differences and the spline
            with pylab.

    Returns:
        A ``Track`` with the reference times and the spline derivative
        as a single-column ``values`` array.
    """
    reftimes = reft.times.copy()
    numframes = len(reftimes)
    diffs = np.zeros((numframes), dtype=np.float64)
    for refidx, tgtidx in dtwpath:
        diffs[refidx] = targett.values[tgtidx] - reft.values[refidx]
    smoothed = UnivariateSpline(reftimes, diffs, k=5, s=s)
    if debug:
        import pylab as pl
        #top panel: reference against the target mapped onto ref frames
        pl.subplot(211)
        pl.plot(reftimes, reft.values, label="ref")
        mapped = np.zeros((numframes), dtype=np.float64)
        for refidx, tgtidx in dtwpath:
            mapped[refidx] = targett.values[tgtidx]
        pl.plot(reftimes, mapped, label="tgt")
        pl.legend()
        #bottom panel: raw differences and their spline fit
        pl.subplot(212)
        pl.plot(reftimes, diffs)
        pl.plot(reftimes, smoothed(reftimes))
    result = Track()
    result.times = reftimes
    #second argument 1 requests the first derivative of the spline
    result.values = smoothed(result.times, 1).reshape((-1, 1))
    return result
def relative_local_speechrate(reft, targett, s=0.03, realigntimes=False, debug=False):
    """Relative local speech rate between `targett` and `reft`.

    The tracks are DTW aligned, a degree 5 smoothing spline (smoothing
    factor `s`) is fitted to the frame time difference function and the
    first derivative of that spline is returned as the rate contour.

    Parameters:
        reft: reference track.
        targett: target track.
        s: smoothing factor passed to ``UnivariateSpline``.
        realigntimes: use regularly spaced times (one step per frame,
            starting at one step) instead of the time axis of the time
            difference track.
        debug: plot the time differences and the fitted spline with pylab.

    Returns:
        A ``Track`` holding the (possibly realigned) times and the spline
        derivative as a single-column ``values`` array.
    """
    #A plain comparison instead of assert/except: asserts are stripped
    #under "python -O", and exact float equality gives false alarms for
    #steps that only differ by rounding.
    refstep = reft.times[1] - reft.times[0]
    targetstep = targett.times[1] - targett.times[0]
    if not np.isclose(targetstep, refstep):
        print(
            "WARNING: timesteps must be equal.... (this may be spurious if 'remove_pau' was used)"
        )
    path = dtw_align(reft, targett)[-1]
    ltd = _local_timediff(path, reft.times, targett.times)
    if realigntimes:
        newreftimes = np.arange(1, len(ltd) + 1) * (ltd.times[1] - ltd.times[0])
    else:
        newreftimes = ltd.times.copy()
    spline = UnivariateSpline(newreftimes, ltd.values, k=5, s=s)
    if debug:
        import pylab as pl
        pl.plot(ltd.times, ltd.values)
        pl.plot(ltd.times, spline(ltd.times))
    rlstrack = Track()
    rlstrack.times = newreftimes
    #second argument 1 requests the first derivative of the spline
    rlstrack.values = spline(rlstrack.times, 1).reshape((-1, 1))
    return rlstrack
def synth(self, voice, utt, args):
    """Synthesise `utt` with the HTS engine using a model-generated f0.

    The engine is run twice: a first pass yields segment times and an
    f0 contour, from which a "qta_startpitch" is set on every phrase;
    ``voice.pitchmodel`` then supplies an "f0track" that is resampled
    at the engine frame times, rescaled and fed back as lf0 for the
    second pass.

    Parameters:
        voice: object providing ``pitchmodel(utt, ("synth", None))``.
        utt: utterance with an "hts_label" item and "Segment" and
            "Phrase" relations.
        args: synthesis parameters; only the presence of
            "use_labalignments" is inspected.

    Returns:
        The utterance returned by the pitch model, with "waveform" set
        from the second engine pass.
    """
    synthparms = args  #not yet implemented...
    #label lines as utf-8 bytestrings
    htslabel = "\n".join(utt["hts_label"]).encode("utf-8").splitlines()
    use_labalignments = bool(synthparms and "use_labalignments" in synthparms)
    with HTS_EngineME(self.htsvoice_bin,
                      self.mixfilter_bin,
                      self.pdfilter_bin) as htsengine:
        #first pass: segment timing and the engine's own f0
        htsengine.synth(htslabel, use_labalignments=use_labalignments)
        for segtimes, seg in zip(htsengine.get_segtimes(), utt.gr("Segment")):
            seg["start"], seg["end"] = segtimes
        semitones = 12.0 * np.log2(htsengine.get_f0())
        semitones[semitones == -np.inf] = 0.0  #log2(0) frames are stored as 0.0
        enginef0 = Track()
        enginef0.times = np.arange(len(semitones)) * STEPSIZE
        enginef0.values = semitones.reshape((-1, 1))
        utt.fill_startendtimes()
        #add qta_startpitch: mean over the first quarter of the nonzero
        #values in the first syllable of each phrase, else the backoff
        for phr in utt.gr("Phrase"):
            firstsyl = phr.first_daughter.gir("SylStructure").first_daughter
            sylvals = enginef0.slice(enginef0.index_at(firstsyl["start"]),
                                     enginef0.index_at(firstsyl["end"])).values.flatten()
            nonzerovals = sylvals[sylvals.nonzero()]
            if len(nonzerovals) <= 3:
                phr["qta_startpitch"] = BACKOFF_STARTPITCH
            else:
                phr["qta_startpitch"] = np.mean(nonzerovals[:len(nonzerovals) // 4])
        utt = voice.pitchmodel(utt, ("synth", None))
        f0spline = InterpolatedUnivariateSpline(utt["f0track"].times,
                                                utt["f0track"].values)
        newf0 = f0spline(enginef0.times)
        #HEURISTIC ADJUSTMENT CLOSER TO HTS DYNAMICS: scale by 1.3, then
        #shift so the mean equals the mean of the nonzero engine values
        enginemean = np.mean(enginef0.values[enginef0.values.nonzero()])
        newf0 *= 1.3  #more dynamic
        newf0 += enginemean - np.mean(newf0)
        #undo the 12*log2 mapping before converting with tolf0
        newf0 = tolf0(2**(newf0 / 12.0))
        #second pass with the replacement lf0
        htsengine.synth(htslabel, lf0=newf0, use_labalignments=use_labalignments)
        #populate utt with waveform
        utt["waveform"] = htsengine.get_wav()
    return utt
def _local_timediff(path, reftimes, targettimes):
    """Map the local time difference of an alignment onto `reftimes`.

    Parameters:
        path: sequence of (ref index, target index) pairs; the ref index
            of the last pair determines how many ref frames are covered.
        reftimes: times of the reference frames.
        targettimes: times of the target frames.

    Returns:
        A ``Track`` whose ``times`` is the covered slice of `reftimes`
        and whose ``values`` is a 1-D array holding target time minus
        ref time at each ref index on the path (0.0 elsewhere).
    """
    lastref = path[-1][0]
    covered = reftimes[:lastref + 1]
    diffs = np.zeros((len(covered)), dtype=np.float64)
    #a ref index occurring several times keeps its last difference
    for refidx, tgtidx in path:
        diffs[refidx] = targettimes[tgtidx] - reftimes[refidx]
    result = Track()
    result.times = covered
    result.values = diffs
    return result
def linearpath_distances(track, track2, metric="euclidean", VI=None):
    """Frame-by-frame distances between two tracks without time warping.

    Frame i of `track` is compared with frame i of `track2`, for as
    many frames as both tracks have in common.

    Parameters:
        track: first track; its times label the result.
        track2: second track.
        metric: metric name handed to ``cdist`` (converted with ``str``).
        VI: optional ``VI`` argument for ``cdist``; only forwarded when
            not None.

    Returns:
        A ``Track`` with one distance per common frame (single-column
        ``values``) and the matching times of `track`. A warning is
        printed when the tracks differ in ``numframes``.
    """
    #Only forward VI when it was supplied: metrics other than
    #"mahalanobis" take no VI and recent SciPy versions reject unexpected
    #keyword arguments instead of ignoring them.
    kwargs = {} if VI is None else {"VI": VI}
    dist = cdist(track.values, track2.values, metric=str(metric), **kwargs)
    #the diagonal stops at the shorter track (previously done by
    #catching IndexError)
    numcommon = min(dist.shape[0], dist.shape[1], len(track.times))
    t = Track()
    t.values = np.array([dist[i][i] for i in range(numcommon)]).reshape(-1, 1)
    t.times = np.array([track.times[i] for i in range(numcommon)])
    if track2.numframes != track.numframes:
        print("linearpath_distances: WARNING: num frames difference is %s" %
              (track2.numframes - track.numframes))
    return t
def dtw_distances(track, track2, metric="euclidean", VI=None):
    """Frame distances between two tracks along their DTW path.

    Parameters:
        track: track that performs the alignment (``track.dtw_align``);
            its times label the result.
        track2: track to align against.
        metric: metric name handed to ``dtw_align`` (converted with ``str``).
        VI: passed through to ``dtw_align``.

    Returns:
        A ``Track`` with one entry per path coordinate: the local
        distance at that coordinate (single-column ``values``) and the
        time of the corresponding frame of `track`.
    """
    cumdist, dist, path = track.dtw_align(track2, metric=str(metric), VI=VI)
    #one pass over the path, collecting (distance, time) per coordinate
    samples = [(dist[x][y], track.times[x]) for x, y in path]
    result = Track()
    result.values = np.array([d for d, _ in samples]).reshape(-1, 1)
    result.times = np.array([when for _, when in samples])
    return result
def qta_synth_utt(utt, synthfunc=synth):
    """Synthesise an f0 track for `utt`, one phrase at a time.

    For every phrase the per-syllable parameters (start, end, endheight,
    slope, lambd) are collected and handed to `synthfunc` together with
    the phrase start pitch; the resulting phrase tracks are joined.

    Parameters:
        utt: utterance with a "Phrase" relation whose words link to
            syllables through "SylStructure".
        synthfunc: callable ``(startpitch, synthparms)`` returning a
            track with ``times`` and ``values``.

    Returns:
        A ``Track`` with the concatenated times and a single-column
        ``values`` array.
    """
    #start from empty arrays so an utterance without phrases still works
    timeparts = [np.array([])]
    valueparts = [np.array([])]
    for phr in utt.gr("Phrase"):
        synthparms = []
        for word in phr.get_daughters():
            synthparms.extend(
                [syl[STARTLAB],
                 syl[ENDLAB],
                 syl[QTAPREFIX + "_endheight"],
                 syl[QTAPREFIX + "_slope"],
                 syl[QTAPREFIX + "_lambd"]]
                for syl in word.gir("SylStructure").get_daughters())
        phrtrack = synthfunc(phr[QTAPREFIX + "_startpitch"], synthparms)
        timeparts.append(phrtrack.times)
        valueparts.append(phrtrack.values.flatten())
    f0track = Track()
    f0track.times = np.concatenate(timeparts)
    f0track.values = np.concatenate(valueparts).reshape((-1, 1))
    return f0track
def gradient(track, h=2):
    """Estimate the gradient of `track` over a window of h+1 points.

    Each output value is the difference between two values `h` frames
    apart divided by ``h * period``, where ``period`` is taken from the
    first two times (the time step is assumed constant).

    Parameters:
        track: track providing ``times`` and ``values``.
        h: window length in frames; must be even.

    Returns:
        A ``Track`` whose times are the input times with ``h // 2``
        frames trimmed from each end and whose ``values`` is a 1-D
        array of the same length, using the dtype of the input values.
    """
    assert h % 2 == 0
    halfwin = h // 2
    #timesteps must be constant
    period = track.times[1] - track.times[0]
    span = h * period
    centres = track.times[halfwin:-halfwin].copy()
    slopes = np.zeros(len(centres), dtype=track.values.dtype)
    for idx in range(len(slopes)):
        #output idx sits at input frame idx + halfwin, the window centre
        slopes[idx] = (track.values[idx + h] - track.values[idx]) / span
    result = Track()
    result.times = centres
    result.values = slopes
    return result
def synth(startpitch, synthparms, numpoints=100, plot=False):
    """Generate a pitch contour from per-syllable target parameters.

    Each syllable contributes `numpoints` samples. The contour of a
    syllable starts from the value and first two derivatives at which
    the previous syllable ended; the derivatives are reset to zero at
    the first syllable, after a gap and after a syllable without
    parameters.

    Parameters:
        startpitch: initial value for the first syllable.
        synthparms: sequence of per-syllable lists
            ``[start, end, endheight, slope, lambd]``; a syllable with
            any ``None`` entry is skipped.
        numpoints: number of samples per syllable.
        plot: plot the target line (dashed red) and the contour (green)
            of every syllable.

    Returns:
        A ``Track`` with the sample times and a single-column ``values``
        array. Samples whose contour value is exactly zero (including
        all samples of skipped syllables) are left out.
    """
    total = len(synthparms) * numpoints
    times = np.zeros(total)
    contour = np.zeros(total)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        #for i == 0 this compares against the last syllable, which is
        #harmless because the derivatives are already zero
        if synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any(e is None for e in synthparm):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0], synthparm[1], numpoints, endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0, ddp0,
                              synthparm[4])
        if plot:
            #The target line is drawn from its own slope and intercept;
            #the previous code used a name ("coefs") that is never
            #assigned in this function.
            # NOTE(review): relies on `pl` (pylab) at module level -- confirm.
            pl.plot(syl_t + synthparm[0],
                    np.polyval([syltarget_m, syltarget_c], syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        spline = InterpolatedUnivariateSpline(syl_t, scontour)
        contour[lo:hi] = scontour
        #end state of this syllable is the start state of the next one
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    keep = contour.nonzero()
    synthtrack = Track()
    synthtrack.times = times[keep].copy()
    synthtrack.values = contour[keep].reshape((-1, 1)).copy()
    return synthtrack
def timenorm_tonecontour_dtw(reftrack, track):
    """Time-normalise `track` onto the frames of `reftrack` using DTW.

    Both tracks are mean-normalised on copies, the copy of `track` is
    additionally smoothed with a spline, and the two are DTW aligned.
    Every reference frame then receives the mean of the original
    `track` values aligned to it, and the result is smoothed.

    Parameters:
        reftrack: track that supplies the target time axis.
        track: track to be normalised.

    Returns:
        The track produced by ``newtrack_from_sspline`` on the mapped
        values, using a smoothing factor of a tenth of the frame count.
    """
    #normalise location of the reference:
    refnorm = deepcopy(reftrack)
    refnorm.values = refnorm.values - refnorm.values.mean()
    #normalise location of the track:
    tracknorm = deepcopy(track)
    tracknorm.values = tracknorm.values - tracknorm.values.mean()
    #smooth function to facilitate smoother warping:
    smoother = UnivariateSpline(tracknorm.times, tracknorm.values)
    tracknorm.values = smoother(tracknorm.times).reshape((-1, 1))
    #align:
    alignment = dtw_align(refnorm, tracknorm)
    #group the aligned track frames by reference frame, in path order:
    aligned = {}
    for coord in alignment[2]:
        aligned.setdefault(coord[0], []).append(coord[1])
    #construct new track using mapping:
    numref = len(refnorm)
    mapped = np.zeros(numref, np.float64)
    for refidx in range(numref):
        mapped[refidx] = np.mean(track.values[aligned.get(refidx, [])])
    newtrack = Track()
    newtrack.times = np.copy(refnorm.times)
    newtrack.values = mapped.reshape((-1, 1))
    return newtrack.newtrack_from_sspline(newtrack.times,
                                          s=len(newtrack.times) / 10.0)
def synth2(startpitch, synthparms, numpoints=100, plot=False, minlambd=10.0, dlambd=5.0):
    """Generate a pitch contour, limiting the strength of articulation.

    Works like ``synth`` but resynthesises each syllable with a lower
    strength (``synthparm[4]``) until the acceleration no longer
    opposes the direction of the endheight target, or until the
    strength has reached `minlambd`.

    Parameters:
        startpitch: initial value for the first syllable.
        synthparms: sequence of per-syllable lists
            ``[start, end, endheight, slope, lambd]``; a syllable with
            any ``None`` entry is skipped. The lambd entries are
            lowered IN PLACE when the constraint requires it.
        numpoints: number of samples per syllable.
        plot: plot the target line (dashed red) and the contour (green)
            of every syllable.
        minlambd: lower bound for the strength.
        dlambd: amount by which the strength is lowered per retry.

    Returns:
        A ``Track`` with the sample times and a single-column ``values``
        array. Samples whose contour value is exactly zero (including
        all samples of skipped syllables) are left out.
    """
    total = len(synthparms) * numpoints
    times = np.zeros(total)
    contour = np.zeros(total)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        #for i == 0 this compares against the last syllable, which is
        #harmless because the derivatives are already zero
        if synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any(e is None for e in synthparm):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0], synthparm[1], numpoints, endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        while True:
            #resynthesise with lower strength until constraint met
            scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0,
                                  ddp0, synthparm[4])
            spline = InterpolatedUnivariateSpline(syl_t, scontour)
            #check acceleration
            if synthparm[4] <= minlambd:
                break
            accels = spline(syl_t, 2)
            if synthparm[2] > p0:
                if np.all(accels > 0.0):
                    break
            elif synthparm[2] < p0:
                if np.all(accels < 0.0):
                    break
            else:
                break
            synthparm[4] -= dlambd
            if synthparm[4] < minlambd:
                synthparm[4] = minlambd
        if plot:
            #The target line is drawn from its own slope and intercept;
            #the previous code used a name ("coefs") that is never
            #assigned in this function.
            # NOTE(review): relies on `pl` (pylab) at module level -- confirm.
            pl.plot(syl_t + synthparm[0],
                    np.polyval([syltarget_m, syltarget_c], syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        contour[lo:hi] = scontour
        #end state of this syllable is the start state of the next one
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    keep = contour.nonzero()
    synthtrack = Track()
    synthtrack.times = times[keep].copy()
    synthtrack.values = contour[keep].reshape((-1, 1)).copy()
    return synthtrack
def hts_synth(self, utt, processname):
    """Synthesise `utt` by running the external HTS engine binary.

    The labels in ``utt["hts_label"]`` are written to a temporary file,
    the engine command is run through the shell, and its outputs are
    loaded back into the utterance: segment end times, the waveform
    and the f0 contour.

    Parameters:
        utt: utterance with an "hts_label" item and a "Segment"
            relation; an optional "htsparms" item overrides the engine
            parameters for this utterance.
        processname: not used here.

    Returns:
        `utt`, with "end" set on every segment, "waveform" set to the
        synthesised audio and "f0" set to a ``Track`` (values in
        12 * log2 of the exponentiated lf0, zeros kept as zero; times
        at a fixed step of 0.005).

    Raises:
        AssertionError: when the number of lines in the output label
            file differs from the number of segments.
    """
    htsparms = self.engine_parms.copy()
    htsparms["-of"] = "%(tempolf0_file)s"
    if "htsparms" in utt:
        htsparms.update(utt["htsparms"])  #parm overrides for this utt...
    #build command string: parms that are True become bare flags, other
    #truthy parms become "flag value", falsy parms are left out
    cmdparts = [self.hts_bin]
    for k, v in htsparms.items():
        if not v:
            continue
        cmdparts.append(k if v is True else k + " " + str(v))
    cmdparts.append("%(tempilab_file)s")
    cmds = " ".join(cmdparts)

    tempfiles = {}
    try:
        #Only the paths are needed, so each descriptor is closed right
        #away; the files are removed in the finally clause so that a
        #failure anywhere below does not leave them behind.
        for key, suffix in (("tempwav_file", ".wav"),
                            ("tempilab_file", ""),
                            ("tempolab_file", ""),
                            ("tempolf0_file", "")):
            fd, tempfiles[key] = mkstemp(prefix="ttslab_", suffix=suffix)
            os.close(fd)
        cmds = cmds % {'models_dir': self.models_dir,
                       'tempwav_file': tempfiles["tempwav_file"],
                       'tempilab_file': tempfiles["tempilab_file"],
                       'tempolab_file': tempfiles["tempolab_file"],
                       'tempolf0_file': tempfiles["tempolf0_file"]}
        with codecs.open(tempfiles["tempilab_file"], "w", encoding="utf-8") as outfh:
            outfh.write("\n".join(utt["hts_label"]))
        os.system(cmds)
        #load seg endtimes into utt:
        with open(tempfiles["tempolab_file"]) as infh:
            lines = infh.readlines()
        segs = utt.get_relation("Segment").as_list()
        assert len(segs) == len(lines)
        for line, seg in zip(lines, segs):
            seg["end"] = hts_labels_tone.htk_int_to_float(line.split()[1])
        #load audio:
        utt["waveform"] = Waveform(tempfiles["tempwav_file"])
        #load lf0 and convert to hertz:
        f0 = np.exp(np.fromfile(tempfiles["tempolf0_file"], "float32"))
        #to semitones relative to 1Hz, 12 * log2(F0 / F0reference) with
        #F0reference = 1; zero entries are left untouched
        nonzero = f0.nonzero()
        f0[nonzero] = 12.0 * np.log2(f0[nonzero])
        f0t = Track()
        f0t.values = f0
        f0t.times = np.arange(len(f0), dtype=np.float64) * 0.005
        utt["f0"] = f0t
    finally:
        #cleanup tempfiles:
        for path in tempfiles.values():
            if os.path.exists(path):
                os.remove(path)
    return utt