def get_one_sentence_data(wave_file,
                          pitch_mark_file,
                          outfile,
                          rate,
                          width,
                          warp='lin',
                          debug=False):
    """Cut fixed-width waveform fragments around pitchmarks and store them.

    The waveform is resampled to `rate` with sox, a window of `width` samples
    is taken around every pitchmark (clipped at the waveform boundaries), and
    the windows are stacked into a (number of pitchmarks x width) matrix that
    is handed to `put_speech`.

    Args:
        wave_file: path of the input waveform.
        pitch_mark_file: path of the pitchmark file read by `read_pm`.
        outfile: destination passed to `put_speech`.
        rate: sample rate given to sox; pitchmark values from `read_pm` are
            multiplied by it to obtain sample indices (so they are presumably
            in seconds -- TODO confirm against `read_pm`).
        width: window length in samples; must be odd.
        warp: 'mu' passes the waveform through `mulaw2.lin2mu`; any other
            value leaves it untouched.
        debug: if true, plot the fragments with pylab and exit the process.

    Returns:
        An empty (0, 0) array when either input file is missing or the
        pitchmark data holds a single value; otherwise None (the result is
        written to `outfile`).

    Raises:
        AssertionError: if `width` is even.
        OSError: if the sox executable cannot be started.
    """
    # Local imports: only this function needs them.
    import shutil
    import subprocess
    import tempfile

    print(wave_file)
    if debug:
        print('get_one_sentence_data')

    if not (os.path.isfile(wave_file) and os.path.isfile(pitch_mark_file)):
        return np.zeros((0, 0))

    pitchmarks = read_pm(pitch_mark_file) * rate

    # NOTE(review): a single value is treated as "no usable pitchmarks" --
    # presumably what read_pm yields for an empty/bad file; confirm.
    if pitchmarks.size == 1:
        return np.zeros((0, 0))
    pitchmarks = np.array(pitchmarks, dtype='int')

    # Validate before launching sox so that a bad width costs nothing.
    assert width % 2 == 1, 'please choose odd number for width'

    # Floor division keeps this an integer even under true division
    # (Python 3 / `from __future__ import division`); a float here would
    # make the slice bounds below invalid.
    halfwidth = (width - 1) // 2

    # Resample into a private temporary directory: a fixed /tmp/<basename>
    # can collide with other runs or silently pick up a stale file when sox
    # fails. The original basename is kept so sox can infer the output format
    # from the extension.
    basename = os.path.basename(wave_file)
    tmpdir = tempfile.mkdtemp()
    try:
        resampled = os.path.join(tmpdir, basename)
        # Argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through verbatim.
        subprocess.call(['sox', wave_file, '-r', str(rate), resampled])
        wave, _ = read_wave(resampled)
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if warp == 'mu':
        wave = mulaw2.lin2mu(wave)

    n_samples = len(wave)
    starts = np.clip(pitchmarks - halfwidth, 0, n_samples)
    ends = np.clip(pitchmarks + halfwidth + 1, 0, n_samples)

    frags = [wave[s:e] for (s, e) in zip(starts, ends)]

    # Fragments clipped at a waveform boundary are shorter than `width`; they
    # are copied left-aligned and the remainder of the row stays zero.
    # NOTE(review): for fragments clipped at the *start* of the waveform this
    # shifts the pitchmark away from the window centre -- confirm intended.
    fragmat = np.zeros((len(frags), width))
    for (i, f) in enumerate(frags):
        fragmat[i, :len(f)] = f

    if debug:
        pylab.plot(fragmat.transpose())
        pylab.show()
        sys.exit('advadv')

    put_speech(fragmat, outfile)
Пример #2
0
def streams2wav(streams, outfile, config):
    """Resynthesise a waveform from parameter streams with external tools.

    Every stream is written with `put_speech` and converted with `x2x +fd`;
    the 'mgc' stream is additionally turned into a spectrum with
    `mgc2sp | sopr | x2x`, and finally `synth` is run on the converted 'lf0'
    and 'bap' streams plus that spectrum to produce `outfile`.

    Args:
        streams: mapping from stream name to the data for `put_speech`.
            The commands below read the 'mgc', 'lf0' and 'bap' entries.
        outfile: path of the waveform written by `synth`.
        config: mapping providing 'bindir' (directory holding the x2x,
            mgc2sp, sopr and synth executables), 'mcc_alpha', 'mcc_order'
            and 'sample_rate'.

    Returns:
        None. A failing `synth` command is reported on stdout, not raised.
    """
    # Local imports: only this function needs them.
    import shutil
    import tempfile
    try:
        from shlex import quote
    except ImportError:  # Python 2 keeps the same helper in `pipes`
        from pipes import quote

    bin_dir = config['bindir']

    alpha = config['mcc_alpha']
    order = config['mcc_order']
    sr = config['sample_rate']
    fftl = rate2fftlength(sr)

    # Work in a private directory instead of /tmp/tmp*: wiping /tmp/tmp*
    # destroys temporary files of unrelated processes, and fixed names
    # collide when two syntheses run at once.
    tmpdir = tempfile.mkdtemp(prefix='streams2wav_')

    def tmp_path(name):
        # Shell-quoted path of an intermediate file inside the work dir.
        return quote(os.path.join(tmpdir, name))

    try:
        for (stream, data) in streams.items():
            put_speech(data, os.path.join(tmpdir, 'tmp.%s' % (stream)))
            comm = '%s/x2x +fd %s >%s' % (bin_dir,
                                          tmp_path('tmp.%s' % (stream)),
                                          tmp_path('tmp_d.%s' % (stream)))
            print(comm)
            os.system(comm)

        # x2x is given -o to avoid
        #   "x2x : error: input data is over the range of type 'double'!"
        # (-o: clip by minimum and maximum of output data type if input data
        # is over the range of output data type).
        comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 %s | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > %s" % (
            bin_dir, alpha, order, fftl, tmp_path('tmp.mgc'), bin_dir, bin_dir,
            tmp_path('tmp.spec'))
        print(comm)
        os.system(comm)

        comm = "%s/synth %s %s %s %s %s %s" % (
            bin_dir, fftl, sr, tmp_path('tmp_d.lf0'), tmp_path('tmp.spec'),
            tmp_path('tmp_d.bap'), outfile)
        print(comm)
        res = os.system(comm)
        if res != 0:
            print('')
            print('trouble with resynth command:')
            print(comm)
            print('')
        else:
            print('Produced %s' % (outfile))
    finally:
        # Intermediate files are never needed after synthesis.
        shutil.rmtree(tmpdir, ignore_errors=True)
Пример #3
0
def main_work():
    """Extract formant frequencies and bandwidths for every wav in a directory.

    Command line:
        -i  directory searched for *.wav files
        -f  output directory for formant frequencies (created if missing)
        -b  output directory for formant bandwidths (created if missing)

    For each wav file `get_formant.tcl` (located in the directory named by
    the module-level `here`) is run with tclsh8.4; its text output is parsed
    as a matrix whose first four columns are written as <base>.formfreq and
    whose remaining columns are written as <base>.formband.
    """
    # Local import: only this function needs it.
    import subprocess

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-i', dest='indir', required=True)
    a.add_argument('-f', dest='foutdir', required=True,
                   help="Put formant freq output here: make it if it doesn't exist")
    a.add_argument('-b', dest='boutdir', required=True,
                   help="Put formant bandwidth output here: make it if it doesn't exist")
    opts = a.parse_args()

    # ===============================================

    for direc in [opts.foutdir, opts.boutdir]:
        if not os.path.isdir(direc):
            os.makedirs(direc)

    script = '%s/get_formant.tcl' % (here)

    for wavefile in glob.glob(opts.indir + '/*.wav'):
        # Strip only the trailing extension; str.replace would also remove a
        # '.wav' occurring in the middle of the name.
        base = os.path.splitext(os.path.basename(wavefile))[0]
        print(base)

        tmp_fname = '%s/%s.tmp' % (opts.foutdir, base)
        # Argument list + explicit stdout redirection instead of a shell
        # string, so paths containing spaces or metacharacters work.
        with open(tmp_fname, 'w') as tmp_out:
            subprocess.call(['tclsh8.4', script, wavefile], stdout=tmp_out)

        # ndmin=2 keeps a single-row result 2-D so the column slicing below
        # does not fail on one-frame files.
        mat = np.loadtxt(tmp_fname, ndmin=2)
        formfreq = mat[:, :4]
        formband = mat[:, 4:]
        put_speech(formfreq, '%s/%s.formfreq' % (opts.foutdir, base))
        put_speech(formband, '%s/%s.formband' % (opts.boutdir, base))

    # Remove the intermediate text dumps.
    for tmp_fname in glob.glob(opts.foutdir + '/*.tmp'):
        os.remove(tmp_fname)
Пример #4
0
def main_work():
    """Resample one feature stream of one utterance at its pitchmarks.

    Command line:
        -w    waveform file
        -f    directory holding <base>.<feature_extension>
        -p    directory holding <base>.pm
        -o    output directory (created if missing)
        -x    feature file extension / stream name
        -d    feature dimension
        -s    frame shift in seconds (default 0.005)
        -l    label directory (must be left unset for now)
        -win  windowing convention; when empty it is chosen from the
              feature extension ('HTK' for mfcc, 'snack' for formfreq and
              formband, 'world' otherwise)

    The features are passed through `pitch_synchronous_resample` and written
    to <outdir>/<base>.<feature_extension>. Streams listed in the
    module-level `vuv_stream_names` are first interpolated with
    `interp_fzero`; the voicing track is resampled with int_type='nearest'
    and positions where it is 0 are zeroed in the resampled features.

    Exits the process with a message if the waveform, pitchmark or feature
    file does not exist.
    """
    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-w', dest='wavfile', required=True)
    a.add_argument('-f', dest='feature_dir', required=True)
    a.add_argument('-p', dest='pm_dir', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-x', dest='feature_extension', required=True)
    a.add_argument('-d', dest='feature_dim', type=int, required=True)
    a.add_argument('-s',
                   dest='fshift_seconds',
                   type=float,
                   default=0.005,
                   required=False)
    a.add_argument('-l',
                   dest='labdir',
                   default=None,
                   help='not currently used')
    a.add_argument(
        '-win',
        dest='windowing_convention',
        default='',
        help=
        'How to determine locations of windows, by default, guessed based on feature_extension'
    )
    opts = a.parse_args()

    # ===============================================

    ## temporary check not to use labels:
    assert opts.labdir is None

    if not os.path.isdir(opts.outdir):
        os.makedirs(opts.outdir)

    # Strip only a trailing '.wav'; str.replace would also remove a '.wav'
    # occurring in the middle of the name.
    base = os.path.basename(opts.wavfile)
    if base.endswith('.wav'):
        base = base[:-len('.wav')]

    pm_fname = os.path.join(opts.pm_dir, base + '.pm')

    feature_fname = os.path.join(opts.feature_dir,
                                 base + '.' + opts.feature_extension)

    for fname in [opts.wavfile, pm_fname, feature_fname]:
        if not os.path.isfile(fname):
            sys.exit('File does not exist: %s' % (fname))

    ## read data from files
    wave, sample_rate = read_wave(opts.wavfile)

    # Only mfcc files have the HTK header removed.
    features = get_speech(feature_fname,
                          opts.feature_dim,
                          remove_htk_header=(opts.feature_extension == 'mfcc'))

    pms_seconds = read_pm(pm_fname)

    ## Convert seconds -> waveform sample numbers:-
    pms = np.asarray(np.round(pms_seconds * sample_rate), dtype=int)
    len_wave = len(wave)

    if opts.windowing_convention:
        windowing_convention = opts.windowing_convention
    elif opts.feature_extension == 'mfcc':
        windowing_convention = 'HTK'
    elif opts.feature_extension in ['formfreq', 'formband']:
        windowing_convention = 'snack'
    else:
        windowing_convention = 'world'

    is_vuv_stream = opts.feature_extension in vuv_stream_names
    if is_vuv_stream:
        ## then we need to handle voicing decision specially:
        features, vuv = interp_fzero(features)

    ps_features = pitch_synchronous_resample(
        len_wave,
        sample_rate,
        opts.fshift_seconds,
        pms,
        features,
        windowing_convention=windowing_convention)

    if is_vuv_stream:
        ps_vuv = pitch_synchronous_resample(
            len_wave,
            sample_rate,
            opts.fshift_seconds,
            pms,
            vuv,
            int_type='nearest',
            windowing_convention=windowing_convention)
        assert ps_features.shape == ps_vuv.shape
        ## reimpose voicing decision on resampled F0:
        ps_features[ps_vuv == 0] = 0

    put_speech(ps_features,
               os.path.join(opts.outdir, base + '.' + opts.feature_extension))

    # Label handling is unfinished and is only reachable when the assertion
    # above is disabled (python -O).
    if opts.labdir is not None:
        labfile = os.path.join(opts.labdir, base + '.lab')
        print('TODO -- labels!')
        pms_htkunit = np.asarray(np.round(pms_seconds * 10000000), dtype=int)
        label = read_label(labfile)
        assign_pm_to_labels(pms_htkunit, label)
Пример #5
0
    # Train the network on the two data providers; the model path is
    # <output_dir>/model.krs and the middle layer has opts.outdim units.
    # NOTE(review): with truncate_at_bottleneck=True the returned model is
    # presumably cut at the bottleneck layer (hence the name `encoder`) --
    # confirm against train_network_from_generators.
    encoder = train_network_from_generators(train_provider, val_provider, insize, outsize, \
           opts.output_dir + '/model.krs', architecture=[2048, 2048, opts.outdim, 2048, 2048], activation='relu', max_epoch=3, \
                    patience=5, classification=False, bottleneck=2, truncate_at_bottleneck=True)

    # encoder = truncate_model(autoencoder, 1)

    # Run the encoder over every training file and write the prediction to
    # <output_dir>/aef/<base>.aef.
    train_provider.reset()
    while train_provider.file_index < len(train_provider.filelist):
        X, Y = train_provider.get_file_data_from_one_file()
        base = train_provider.get_filename()
        if X.size == 0:
            print 'skip %s' % (base)
            # NOTE(review): this `continue` bypasses the `file_index += 1`
            # below; unless the provider advances file_index itself, an empty
            # file would make this loop spin forever -- confirm.
            continue
        encoded = encoder.predict(X)
        put_speech(encoded, opts.output_dir + '/aef/' + base + '.aef')
        train_provider.file_index += 1
    # Same pass over the validation files; output goes to the same aef dir.
    val_provider.reset()
    while val_provider.file_index < len(val_provider.filelist):
        X, Y = val_provider.get_file_data_from_one_file()
        base = val_provider.get_filename()
        if X.size == 0:
            print 'skip %s' % (base)
            # NOTE(review): same concern as in the training loop above --
            # file_index is not incremented on this path.
            continue
        encoded = encoder.predict(X)
        put_speech(encoded, opts.output_dir + '/aef/' + base + '.aef')
        val_provider.file_index += 1

    #import pylab
    # pylab.subplot(211)
    # pylab.plot(X.transpose())
Пример #6
0
# Refuse to run when the energy output directory is already present.
if os.path.isdir(energy):
    sys.exit('%s already exists'%(energy))

# Create both output directories.
os.makedirs(mfcc12)
os.makedirs(energy)


# Split every 13-dimensional .mfcc file column-wise: column 0 is written to
# the energy directory, the remaining columns to the mfcc12 directory.
for mfcc_fname in sorted(glob.glob(mfcc_dir + '/*.mfcc')):
    base = os.path.split(mfcc_fname)[1].replace('.mfcc', '')
    print(base)
    speech = get_speech(mfcc_fname, 13)

    ## zero out values outside [-100, 100]: such outliers make later
    ## standardisation of the data crazy
    speech[(speech < -100.0) | (speech > 100.0)] = 0.0

    e = speech[:, 0].reshape(-1, 1)
    m = speech[:, 1:]
    put_speech(e, os.path.join(energy, base + '.energy'))
    put_speech(m, os.path.join(mfcc12, base + '.mfcc12'))








Пример #7
0
# ## this is the training data as regenerated by LSTM trained on it (for target cost):
# streams_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/blizzard_entries/blizzard2017/hybrid_voice/data/predicted_params/train/'

# topoutdir = '/tmp/testpad'

## --------

## HTS style labels used in Blizzard:-
# Raw string: `\+` and `\=` are not valid string escapes, so a plain literal
# triggers invalid-escape warnings on newer interpreters. The pattern text
# itself is unchanged.
hts_quinphone_regex = r'([^~]+)~([^-]+)-([^\+]+)\+([^\=]+)\=([^:]+)'
stream_list = ['mgc', 'lf0']
stream_dims = {'mgc': 60, 'lf0': 1}

# For every label file, load each available parameter stream, pass it through
# reinsert_terminal_silence together with the label, and write the result to
# <topoutdir>/<stream>/<base>.<stream>.
for labfname in glob.glob(labdir + '/*.lab'):
    print(labfname)

    lab = read_label(labfname, hts_quinphone_regex)

    base = basename(labfname)
    for stream in stream_list:
        stream_file = os.path.join(streams_dir, stream, base + '.' + stream)
        if not os.path.isfile(stream_file):
            # No data for this stream/utterance pair.
            print('skip!')
            continue
        speech = get_speech(stream_file, stream_dims[stream])
        speech = reinsert_terminal_silence(speech, lab)

        outdir = topoutdir + '/' + stream
        safe_makedir(outdir)
        put_speech(speech, outdir + '/' + base + '.' + stream)