示例#1
0
def utt_mceps(utt, shift=0.005, remove_pau=False, resettimes=False):
    """Extract a feature track from the waveform of an utterance.

    The waveform stored in ``utt["waveform"]`` is written to a temporary
    directory, the external command described by the ``SIG2FV`` template
    is run on it, and the resulting feature file is loaded into a Track.
    (Judging by the function name the features are mel-cepstra; that
    depends on what SIG2FV is configured to produce.)

    Parameters:
        utt: utterance providing ``utt["waveform"]`` and, when
            ``remove_pau`` is set, a "Segment" relation.
        shift: frame shift handed to the extraction command; also used as
            the timestep when ``resettimes`` is set.
        remove_pau: drop all frames that fall outside segments whose
            name is not "pau".
        resettimes: replace the frame times with ``shift, 2*shift, ...``.

    Returns:
        The loaded (and optionally filtered / re-timed) Track.
    """
    temppath = mkdtemp()
    try:
        #wavs
        wfn1 = os.path.join(temppath, "1." + WAV_EXT)
        utt["waveform"].write(wfn1)
        #feats
        ffn1 = os.path.join(temppath, "1." + FEAT_EXT)
        cmds = SIG2FV % {"inputfile": wfn1,
                         "outputfile": ffn1,
                         "shift": shift}
        #print(cmds)
        os.system(cmds)

        #tracks
        t1 = Track()
        t1.load_track(ffn1)
    finally:
        #cleanup, also when writing/extracting/loading failed
        shutil.rmtree(temppath)

    if remove_pau:
        # work on a copy so that filling in start/end times does not
        # modify the caller's utterance
        u = deepcopy(utt)
        fill_startendtimes(u)
        keep_intervals = [(seg["start"], seg["end"])
                          for seg in u.gr("Segment")
                          if seg["name"] != "pau"]
        indices = t1.mask_indices(keep_intervals)
        t1.times = t1.times[indices]
        t1.values = t1.values[indices]
    if resettimes:
        # np.float64 instead of the removed np.float alias
        t1.times = np.arange(1, len(t1.times) + 1, dtype=np.float64) * shift
    return t1
示例#2
0
def relative_local_x(reft, targett, dtwpath, s=100, debug=False):
    """Rate of change of the target values relative to the reference.

    For every (reference index, target index) pair in dtwpath the
    difference ``target value - reference value`` is stored at the
    reference index. A degree-5 smoothing spline (smoothing factor 's')
    is fitted through these differences over the reference times and its
    first derivative is returned as a new Track on the reference times.

    With debug set, the aligned values, the raw differences and the
    fitted spline are plotted with pylab.
    """
    reftimes = reft.times.copy()
    numframes = len(reftimes)

    # reference frames that never occur in dtwpath keep a difference of 0
    diffs = np.zeros((numframes), dtype=np.float64)
    for refidx, tgtidx in dtwpath:
        diffs[refidx] = targett.values[tgtidx] - reft.values[refidx]

    smoother = UnivariateSpline(reftimes, diffs, k=5, s=s)

    if debug:
        import pylab as pl
        #top panel: reference against the target mapped onto ref frames
        pl.subplot(211)
        pl.plot(reftimes, reft.values, label="ref")
        mapped = np.zeros((numframes), dtype=np.float64)
        for refidx, tgtidx in dtwpath:
            mapped[refidx] = targett.values[tgtidx]
        pl.plot(reftimes, mapped, label="tgt")
        pl.legend()
        #bottom panel: raw differences and the smoothed fit
        pl.subplot(212)
        pl.plot(reftimes, diffs)
        pl.plot(reftimes, smoother(reftimes))

    result = Track()
    result.times = reftimes
    #second argument 1 selects the first derivative of the spline
    result.values = smoother(reftimes, 1).reshape((-1, 1))
    return result
示例#3
0
def relative_local_speechrate(reft,
                              targett,
                              s=0.03,
                              realigntimes=False,
                              debug=False):
    """ calculate the relative local speech rate between targett and
        reft by DTW aligning the tracks, fitting a smoothing spline
        to the frame time difference function (smoothing factor 's')
        and using this to calculate derivative contour.

        reft, targett: reference and target tracks; the timestep is
            read from the first two frames of each.
        s: smoothing factor of the degree-5 spline.
        realigntimes: fit and return the contour on a fresh, evenly
            spaced time axis (step taken from the first two frames of
            the time-difference track) instead of the reference times.
        debug: plot the time differences and the fitted spline.

        Returns a Track holding the first derivative of the spline.
    """
    # Explicit check instead of try/assert/except: asserts are stripped
    # under "python -O", and exact float equality flags steps that only
    # differ by rounding error.
    targetstep = targett.times[1] - targett.times[0]
    refstep = reft.times[1] - reft.times[0]
    if not np.isclose(targetstep, refstep):
        print(
            "WARNING: timesteps must be equal.... (this may be spurious if 'remove_pau' was used)"
        )

    path = dtw_align(reft, targett)[-1]
    ltd = _local_timediff(path, reft.times, targett.times)
    if realigntimes:
        newreftimes = np.arange(1,
                                len(ltd) + 1) * (ltd.times[1] - ltd.times[0])
    else:
        newreftimes = ltd.times.copy()
    spline = UnivariateSpline(newreftimes, ltd.values, k=5, s=s)
    if debug:
        import pylab as pl
        # plot on the axis the spline was fitted on, so that the curve
        # lines up with the data when realigntimes is set
        pl.plot(newreftimes, ltd.values)
        pl.plot(newreftimes, spline(newreftimes))
    rlstrack = Track()
    rlstrack.times = newreftimes
    #second argument 1 selects the first derivative of the spline
    rlstrack.values = spline(rlstrack.times, 1).reshape((-1, 1))
    return rlstrack
示例#4
0
    def synth(self, voice, utt, args):
        """Synthesise utt in two passes through the HTS engine.

        The first pass produces segment times and an f0 contour, which
        are used to derive a start pitch per phrase. voice.pitchmodel
        then generates a new f0 track, which is rescaled and passed to a
        second engine pass as the lf0 input. The waveform of the second
        pass is stored in utt["waveform"].

        voice: object providing pitchmodel(utt, ("synth", None)).
        utt: utterance with "hts_label" and "Segment"/"Phrase" relations;
            it is rebound to whatever voice.pitchmodel returns.
        args: synthesis parameters; only the presence of
            "use_labalignments" is checked.

        Returns the utterance returned by voice.pitchmodel, with
        "waveform" set.
        """
        synthparms = args  #not yet implemented...
        htslabel = "\n".join(utt["hts_label"]).encode(
            "utf-8").splitlines()  #to utf-8 bytestring
        #only the presence of the key matters, its value is never read
        if synthparms and "use_labalignments" in synthparms:
            use_labalignments = True
        else:
            use_labalignments = False
        with HTS_EngineME(self.htsvoice_bin, self.mixfilter_bin,
                          self.pdfilter_bin) as htsengine:
            #first pass: obtain segment times and the engine's own f0
            htsengine.synth(htslabel, use_labalignments=use_labalignments)
            for segt, seg in zip(htsengine.get_segtimes(), utt.gr("Segment")):
                seg["start"], seg["end"] = segt
#            utt["debug_waveform"] = htsengine.get_wav()
            #f0 to semitones (relative to 1, presumably Hz); frames with
            #f0 == 0 give -inf under log2 and are reset to 0.0
            f0st = 12.0 * np.log2(htsengine.get_f0())
            f0st[f0st == -np.inf] = 0.0
            f0times = np.arange(len(f0st)) * STEPSIZE
            f0track = Track()
            f0track.times = f0times
            f0track.values = f0st.reshape((-1, 1))
            #            utt["debug_f0track"] = f0track
            utt.fill_startendtimes()
            #add qta_startpitch
            for phr in utt.gr("Phrase"):
                #first daughter (in "SylStructure") of the phrase's first
                #daughter -- presumably the first syllable of the phrase
                syl = phr.first_daughter.gir("SylStructure").first_daughter
                syltrackvals = f0track.slice(f0track.index_at(syl["start"]),
                                             f0track.index_at(
                                                 syl["end"])).values.flatten()
                #zero-valued frames (see reset above) are ignored
                validvals = syltrackvals[syltrackvals.nonzero()]
                if len(validvals) > 3:
                    #mean over the first quarter of the non-zero frames
                    phr["qta_startpitch"] = np.mean(
                        validvals[:len(validvals) // 4])
                else:
                    phr["qta_startpitch"] = BACKOFF_STARTPITCH
            utt = voice.pitchmodel(utt, ("synth", None))
            #resample the modelled f0 onto the frame times of the first pass
            f0spline = InterpolatedUnivariateSpline(utt["f0track"].times,
                                                    utt["f0track"].values)
            newf0 = f0spline(f0track.times)
            #HEURISTIC ADJUSTMENT CLOSER TO HTS DYNAMICS
            #scale by 1.3, then shift so that the mean equals the mean of
            #the non-zero first-pass values (i.e. deviations from the mean
            #are scaled by 1.3)
            m = np.mean(f0track.values[f0track.values.nonzero()])
            newf0 *= 1.3  #more dynamic
            m2 = np.mean(newf0)
            newf0 += m - m2
            ### TRANSFER UNVOICED SECTIONS
            # newf0[f0track.values.flatten() == 0.0] = 0.0
            # import pylab as pl
            # pl.plot(f0track.times, f0track.values)
            # pl.plot(f0track.times, newf0)
            ###
            #invert the semitone conversion, then convert with tolf0
            newf0 = 2**(newf0 / 12.0)
            newf0 = tolf0(newf0)
            #second pass: synthesise with the new contour as lf0
            htsengine.synth(htslabel,
                            lf0=newf0,
                            use_labalignments=use_labalignments)
            #populate utt with waveform and segment alignments
            utt["waveform"] = htsengine.get_wav()
        return utt
示例#5
0
def _local_timediff(path, reftimes, targettimes):
    """ Local time difference (target time - reference time) for each
        reference frame along an alignment path.

        path holds (reference index, target index) pairs. The result is
        a Track covering the reference times up to and including the
        reference index of the last path entry. When a reference index
        occurs more than once, the last occurrence in path wins; indices
        that do not occur keep a difference of 0.
    """
    lastrefidx = path[-1][0]
    coveredtimes = reftimes[:lastrefidx + 1]
    offsets = np.zeros((len(coveredtimes)), dtype=np.float64)
    for refidx, tgtidx in path:
        offsets[refidx] = targettimes[tgtidx] - reftimes[refidx]
    result = Track()
    result.times = coveredtimes
    result.values = offsets
    return result
示例#6
0
def linearpath_distances(track, track2, metric="euclidean", VI=None):
    """Frame-by-frame distances between two tracks along the linear path.

    Frame i of track is compared with frame i of track2, i.e. the
    diagonal of the pairwise distance matrix is used. Only the frames
    present in both tracks are compared; a warning is printed when the
    frame counts differ.

    Parameters:
        track, track2: tracks with ``values``, ``times`` and
            ``numframes``.
        metric: distance metric name understood by scipy's cdist.
        VI: optional inverse covariance matrix (used by e.g. the
            "mahalanobis" metric).

    Returns:
        A Track with the distances as an (n, 1) array and the
        corresponding times taken from track.
    """
    # Only forward VI when it was supplied: metrics that do not take an
    # inverse covariance may reject an unexpected VI keyword in newer
    # SciPy releases, while the default for metrics that use it is None.
    extra = {} if VI is None else {"VI": VI}
    dist = cdist(track.values, track2.values, metric=str(metric), **extra)

    # number of frames for which dist[i][i] exists (previously found by
    # looping until an IndexError was raised)
    numpaired = min(len(track.times), dist.shape[0], dist.shape[1])

    t = Track()
    t.values = np.array(dist.diagonal()[:numpaired]).reshape(-1, 1)
    t.times = np.array(track.times[:numpaired])
    if track2.numframes != track.numframes:
        print("linearpath_distances: WARNING: num frames difference is %s" % (track2.numframes - track.numframes))
    return t
示例#7
0
def dtw_distances(track, track2, metric="euclidean", VI=None):
    """Distances between two tracks along their DTW alignment path.

    track.dtw_align is used to align track with track2. For every
    (x, y) coordinate on the returned path the local distance dist[x][y]
    is collected together with the time of frame x in track.

    Returns a Track with the distances as an (n, 1) array and the
    collected times (one entry per path coordinate).
    """
    cumdist, dist, path = track.dtw_align(track2, metric=str(metric), VI=VI)

    #(distance, time) for every step on the alignment path
    steps = [(dist[x][y], track.times[x]) for x, y in path]

    t = Track()
    t.values = np.array([d for d, _ in steps]).reshape(-1, 1)
    t.times = np.array([tm for _, tm in steps])

    return t
示例#8
0
def dtw_distances(track, track2, metric="euclidean", VI=None):
    """Distances between two tracks along their DTW alignment path.

    The tracks are aligned with track.dtw_align; each (x, y) coordinate
    of the resulting path contributes the local distance dist[x][y] and
    the time of frame x in track.

    Returns a Track holding the distances as an (n, 1) array and the
    matching times, one entry per path coordinate.
    """
    cumdist, dist, path = track.dtw_align(track2, metric=str(metric), VI=VI)

    #pair up distance and time for every path coordinate
    pathsteps = [(dist[x][y], track.times[x]) for x, y in path]
    stepdists = [step[0] for step in pathsteps]
    steptimes = [step[1] for step in pathsteps]

    t = Track()
    t.values = np.array(stepdists).reshape(-1, 1)
    t.times = np.array(steptimes)

    return t
示例#9
0
def qta_synth_utt(utt, synthfunc=synth):
    """Synthesise an f0 track for a whole utterance, phrase by phrase.

    For every phrase the per-syllable parameters (start, end and the
    QTAPREFIX "_endheight", "_slope" and "_lambd" features) are collected
    in syllable order and passed, together with the phrase's
    "_startpitch" feature, to synthfunc. The tracks returned for the
    phrases are joined in phrase order.

    Parameters:
        utt: utterance with a "Phrase" relation whose daughters have
            syllable daughters in the "SylStructure" relation.
        synthfunc: callable ``(startpitch, synthparms)`` returning a
            track with ``times`` and ``values``.

    Returns:
        A Track with the concatenated times and the values as an
        (n, 1) array.
    """
    # Collect the per-phrase pieces and join them once at the end;
    # concatenating inside the loop re-copies everything gathered so far
    # for every phrase. The empty seed arrays keep the result (including
    # its dtype) the same as growing from np.array([]).
    timechunks = [np.array([])]
    valuechunks = [np.array([])]
    for phr in utt.gr("Phrase"):
        synthparms = []
        for word in phr.get_daughters():
            for syl in word.gir("SylStructure").get_daughters():
                synthparms.append([
                    syl[STARTLAB], syl[ENDLAB], syl[QTAPREFIX + "_endheight"],
                    syl[QTAPREFIX + "_slope"], syl[QTAPREFIX + "_lambd"]
                ])
        phrf0track = synthfunc(phr[QTAPREFIX + "_startpitch"], synthparms)
        timechunks.append(phrf0track.times)
        valuechunks.append(phrf0track.values.flatten())
    f0track = Track()
    f0track.times = np.concatenate(timechunks)
    f0track.values = np.concatenate(valuechunks).reshape((-1, 1))
    return f0track
示例#10
0
def gradient(track, h=2):
    """Estimate the gradient of a track by a central difference.

    The difference is taken over a window of h frames (h must be even,
    so h + 1 points are spanned) and divided by the duration of the
    window. The result has h frames fewer than the input: h/2 frames
    are dropped at each end, and the returned times are those of the
    window centres. The timestep is read from the first two frames and
    assumed to be constant.
    """
    assert h % 2 == 0
    halfwin = h // 2

    #timesteps must be constant
    timestep = track.times[1] - track.times[0]
    windowdur = h * timestep

    centretimes = track.times[halfwin:-halfwin].copy()
    numout = len(centretimes)
    slopes = np.zeros(numout, dtype=track.values.dtype)

    for first in range(numout):
        last = first + h
        slopes[first] = (track.values[last] - track.values[first]) / windowdur

    result = Track()
    result.times = centretimes
    result.values = slopes
    return result
示例#11
0
def synth(startpitch, synthparms, numpoints=100, plot=False):
    """Synthesise a pitch contour from a sequence of syllable parameters.

    Each entry of synthparms is indexed as follows:
        [0] start time and [1] end time of the syllable,
        [3] slope of the linear target (y = mx + c),
        [2] value passed to get_intercept together with the slope to
            obtain the target's intercept (presumably the target height
            at the end of the syllable),
        [4] final argument of sylcontour (presumably the strength with
            which the target is approached).

    Every syllable is sampled at numpoints points (end time excluded).
    The contour of a syllable starts from the value, first and second
    derivative at the last sample of the previous syllable; the
    derivatives are reset to zero when the previous syllable does not
    end where this one starts, or after a syllable that was skipped
    because one of its parameters is None.

    Parameters:
        startpitch: starting value of the contour.
        synthparms: per-syllable parameters as described above.
        numpoints: number of samples per syllable.
        plot: plot the linear target and the contour of each syllable
            (uses the module-level name ``pl``).

    Returns:
        A Track with the samples of all synthesised syllables. Samples
        are selected with contour.nonzero(), so skipped syllables -- and
        any sample that is exactly 0.0 -- do not appear in the result.
    """
    times = np.zeros(len(synthparms) * numpoints)
    contour = np.zeros(len(synthparms) * numpoints)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        elif synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any(e is None for e in synthparm):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0],
                            synthparm[1],
                            numpoints,
                            endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0, ddp0,
                              synthparm[4])
        if plot:
            #coefficients of the linear target, highest power first
            #(this name was previously undefined in this function)
            coefs = [syltarget_m, syltarget_c]
            pl.plot(syl_t + synthparm[0],
                    np.polyval(coefs, syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        spline = InterpolatedUnivariateSpline(syl_t, scontour)
        contour[lo:hi] = scontour
        #state at the last sample becomes the start of the next syllable
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    synthtrack = Track()
    synthtrack.times = times[contour.nonzero()].copy()
    synthtrack.values = contour[contour.nonzero()].reshape((-1, 1)).copy()
    return synthtrack
示例#12
0
def timenorm_tonecontour_dtw(reftrack, track):
    """Time-normalise track onto the time axis of reftrack using DTW.

    Both tracks are mean-normalised (on copies) and the copy of track is
    smoothed with a spline before alignment. For every reference frame
    the mean of the *original* values of track at the aligned frames is
    taken; the result is smoothed with newtrack_from_sspline.

    Parameters:
        reftrack: track providing the target time axis.
        track: track to be warped onto reftrack.

    Returns:
        The smoothed, time-normalised track on the times of reftrack.
    """
    #normalise location:
    tmean = deepcopy(reftrack)
    tmean.values = tmean.values - tmean.values.mean()
    #normalise location:
    t1 = deepcopy(track)
    t1.values = t1.values - t1.values.mean()
    #smooth function to facilitate smoother warping:
    s = UnivariateSpline(t1.times, t1.values)
    t1.values = s(t1.times).reshape((-1, 1))
    #align:
    dtw = dtw_align(tmean, t1)
    #group the aligned target frames by reference frame in one pass over
    #the path (instead of rescanning the whole path for every frame):
    aligned = {}
    for e in dtw[2]:
        aligned.setdefault(e[0], []).append(e[1])
    #construct new track using mapping:
    numframes = len(tmean)
    newtrack = Track()
    newtrack.times = np.copy(tmean.times)
    values = np.zeros(numframes, np.float64)
    for i in range(numframes):
        es = aligned.get(i, [])
        values[i] = np.mean(track.values[es])
    newtrack.values = values.reshape((-1, 1))
    smoothtrack = newtrack.newtrack_from_sspline(newtrack.times,
                                                 s=len(newtrack.times) / 10.0)
    return smoothtrack
示例#13
0
def synth2(startpitch,
           synthparms,
           numpoints=100,
           plot=False,
           minlambd=10.0,
           dlambd=5.0):
    """ Limit the strength of articulation to avoid acceleration in
        opposite direction of endheight target...

        Each entry of synthparms is indexed as [0] start time, [1] end
        time, [2] endheight target, [3] slope of the linear target
        (y = mx + c) and [4] strength (lambd). Every syllable is sampled
        at numpoints points (end time excluded) and continues from the
        value and derivatives at the last sample of the previous
        syllable; the derivatives are reset to zero when syllables are
        not contiguous or after a skipped syllable (one with a None
        parameter).

        For each syllable the contour is resynthesised with the strength
        lowered by dlambd per attempt (not below minlambd) until the
        acceleration has the same sign everywhere as the direction from
        the starting value towards the endheight target, or until the
        strength has reached minlambd. NOTE: the lowered strength is
        written back into synthparm[4], i.e. the caller's synthparms
        entries are modified in place. dlambd must be positive for the
        search to terminate.

        plot: plot the linear target and the contour of each syllable
            (uses the module-level name ``pl``).

        Returns a Track with the samples of all synthesised syllables.
        Samples are selected with contour.nonzero(), so skipped
        syllables -- and any sample that is exactly 0.0 -- do not appear
        in the result.
    """
    times = np.zeros(len(synthparms) * numpoints)
    contour = np.zeros(len(synthparms) * numpoints)
    for i, synthparm in enumerate(synthparms):
        if i == 0:
            p0 = startpitch
            dp0 = 0.0
            ddp0 = 0.0
        elif synthparm[0] != synthparms[i - 1][1]:
            #not contiguous (e.g. a pause is present)
            dp0 = 0.0
            ddp0 = 0.0
        if any(e is None for e in synthparm):
            #no parameters available for this syllable, skip...
            dp0 = 0.0
            ddp0 = 0.0
            continue
        lo = i * numpoints
        hi = lo + numpoints
        utt_t = np.linspace(synthparm[0],
                            synthparm[1],
                            numpoints,
                            endpoint=False)
        times[lo:hi] = utt_t
        syl_t = utt_t - synthparm[0]  #start at 0.0
        #y = mx + c
        syltarget_m = synthparm[3]
        syltarget_c = get_intercept(syl_t[-1], synthparm[2], synthparm[3])
        while True:  #resynthesise with lower strength until constraint met
            scontour = sylcontour(syl_t, syltarget_m, syltarget_c, p0, dp0,
                                  ddp0, synthparm[4])
            spline = InterpolatedUnivariateSpline(syl_t, scontour)
            #check acceleration
            if synthparm[4] <= minlambd:
                break
            accels = spline(syl_t, 2)
            if synthparm[2] > p0:
                if np.all(accels > 0.0):
                    break
            elif synthparm[2] < p0:
                if np.all(accels < 0.0):
                    break
            else:
                break
            synthparm[4] -= dlambd
            if synthparm[4] < minlambd:
                synthparm[4] = minlambd
        if plot:
            #coefficients of the linear target, highest power first
            #(this name was previously undefined in this function)
            coefs = [syltarget_m, syltarget_c]
            pl.plot(syl_t + synthparm[0],
                    np.polyval(coefs, syl_t),
                    linestyle="dashed",
                    color="red")
            pl.plot(syl_t + synthparm[0], scontour, color="green")
        contour[lo:hi] = scontour
        #state at the last sample becomes the start of the next syllable
        p0, dp0, ddp0, temp = spline.derivatives(syl_t[-1])
    synthtrack = Track()
    synthtrack.times = times[contour.nonzero()].copy()
    synthtrack.values = contour[contour.nonzero()].reshape((-1, 1)).copy()
    return synthtrack
示例#14
0
    def hts_synth(self, utt, processname):
        """Synthesise utt by running the external HTS engine binary.

        The command line is built from self.hts_bin and the options in
        self.engine_parms (overridden by utt["htsparms"] when present);
        the "-of" option is pointed at a temporary lf0 output file. The
        labels in utt["hts_label"] are written to a temporary input
        file, the command is run, and its outputs are loaded back:

          - segment end times from the output label file into the "end"
            feature of the segments in the "Segment" relation,
          - the audio into utt["waveform"],
          - the lf0 output, converted to semitones, into utt["f0"] as a
            Track with a 5 ms timestep.

        All temporary files are removed again, also when a step fails.

        utt: utterance with "hts_label" and a "Segment" relation.
        processname: not used by this method.

        Returns utt.
        """
        htsparms = self.engine_parms.copy()
        htsparms["-of"] = "%(tempolf0_file)s"
        if "htsparms" in utt:
            htsparms.update(utt["htsparms"])   #parm overrides for this utt...

        #build command string:
        cmds = self.hts_bin
        for k in htsparms:
            if htsparms[k]:
                if htsparms[k] is True:
                    #flag without a value
                    cmds += " " + k
                else:
                    cmds += " " + k + " " + str(htsparms[k])
        cmds += " %(tempilab_file)s"

        fds = []
        paths = []
        try:
            #wav, input labels, output labels, output lf0 (in this order)
            for suffix in (".wav", "", "", ""):
                fd, path = mkstemp(prefix="ttslab_", suffix=suffix)
                fds.append(fd)
                paths.append(path)
            tempwav_file, tempilab_file, tempolab_file, tempolf0_file = paths

            cmds = cmds % {'models_dir': self.models_dir,
                           'tempwav_file': tempwav_file,
                           'tempilab_file': tempilab_file,
                           'tempolab_file': tempolab_file,
                           'tempolf0_file': tempolf0_file}
            #print(cmds)
            with codecs.open(tempilab_file, "w", encoding="utf-8") as outfh:
                outfh.write("\n".join(utt["hts_label"]))

            os.system(cmds)

            #load seg endtimes into utt:
            with open(tempolab_file) as infh:
                lines = infh.readlines()
                segs = utt.get_relation("Segment").as_list()
                assert len(segs) == len(lines)
                for line, seg in zip(lines, segs):
                    seg["end"] = hts_labels_tone.htk_int_to_float(line.split()[1])

            #load audio:
            utt["waveform"] = Waveform(tempwav_file)

            #load lf0:
            f0 = np.exp(np.fromfile(tempolf0_file, "float32")) #load and lf0 to hertz
            #to semitones relative to 1Hz (values that are exactly zero
            #after exponentiation are left at zero):
            f0[f0.nonzero()] = 12.0 * np.log2(f0[f0.nonzero()]) # 12 * log2 (F0 / F0reference) where F0reference = 1
            f0t = Track()
            f0t.values = f0
            f0t.times = np.arange(len(f0), dtype=np.float64) * 0.005
            utt["f0"] = f0t
        finally:
            #cleanup tempfiles (descriptors first, then the files):
            for fd in fds:
                os.close(fd)
            for path in paths:
                os.remove(path)

        return utt