def get_one_sentence_data(wave_file, pitch_mark_file, outfile, rate, width, warp='lin', debug=False):
    print wave_file
    if debug:
        print 'get_one_sentence_data'
    if not (os.path.isfile(wave_file) and os.path.isfile(pitch_mark_file)):
        return np.zeros((0, 0))
    pitchmarks = read_pm(pitch_mark_file) * rate
    if pitchmarks.size == 1:
        return np.zeros((0, 0))
    pitchmarks = np.array(pitchmarks, dtype='int')

    ## resample the waveform to the requested rate with sox:
    _, basename = os.path.split(wave_file)
    os.system('sox %s -r %s /tmp/%s' % (wave_file, rate, basename))
    wave, sample_rate = read_wave('/tmp/%s' % (basename))

    if warp == 'mu':
        wave = mulaw2.lin2mu(wave)

    assert width % 2 == 1, 'please choose odd number for width'
    halfwidth = (width - 1) // 2

    ## fragment boundaries centred on pitchmarks, clipped at utterance edges:
    starts = np.clip(pitchmarks - halfwidth, 0, len(wave))
    ends = np.clip(pitchmarks + halfwidth + 1, 0, len(wave))
    frags = [wave[s:e] for (s, e) in zip(starts, ends)]

    ## zero-pad any fragments shorter than width (those clipped at the edges):
    fragmat = np.zeros((len(frags), width))
    for (i, f) in enumerate(frags):
        fragmat[i, :len(f)] = f

    if debug:
        pylab.plot(fragmat.transpose())
        pylab.show()
        sys.exit('advadv')

    put_speech(fragmat, outfile)
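## A minimal sketch of the put_speech / get_speech convention assumed
## throughout this file: headerless binary files of float32 values,
## reshaped to a known frame dimension on reading. This is an assumption
## for illustration, not the project's actual implementation.
def put_speech_sketch(data, fname):
    ## flatten to float32 and dump with no header:
    np.asarray(data, dtype='float32').tofile(fname)

def get_speech_sketch(fname, dim):
    ## read the flat float32 vector back and restore the frame dimension:
    data = np.fromfile(fname, dtype='float32')
    assert len(data) % dim == 0, 'file length is not a multiple of dim'
    return data.reshape((-1, dim))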
def streams2wav(streams, outfile, config):
    bin_dir = config['bindir']
    alpha = config['mcc_alpha']
    order = config['mcc_order']
    sr = config['sample_rate']
    fftl = rate2fftlength(sr)

    ## TODO -- handle tmp files more safely (e.g. with the tempfile module)
    os.system('rm /tmp/tmp*')

    ## write each stream to disk and convert float -> double for SPTK:
    for (stream, data) in streams.items():
        put_speech(data, '/tmp/tmp.%s' % (stream))
        comm = bin_dir + "/x2x +fd /tmp/tmp." + stream + " >/tmp/tmp_d." + stream
        print comm
        os.system(comm)

    ## mel-generalised cepstrum -> spectrum. The -o flag makes x2x clip
    ## values outside the output type's range, avoiding:
    ##   x2x : error: input data is over the range of type 'double'!
    comm = "%s/mgc2sp -a %s -g 0 -m %s -l %s -o 2 /tmp/tmp.mgc | %s/sopr -d 32768.0 -P | %s/x2x +fd -o > /tmp/tmp.spec" % (
        bin_dir, alpha, order, fftl, bin_dir, bin_dir)
    print comm
    os.system(comm)

    comm = "%s/synth %s %s /tmp/tmp_d.lf0 /tmp/tmp.spec /tmp/tmp_d.bap %s" % (
        bin_dir, fftl, sr, outfile)
    print comm
    res = os.system(comm)
    if res != 0:
        print
        print 'trouble with resynth command:'
        print comm
        print
    else:
        print 'Produced %s' % (outfile)
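## Hypothetical usage of streams2wav: the config values below are
## illustrative assumptions (not taken from the project's config), and
## random matrices stand in for real predicted parameters just to show
## the expected shapes (mcc_order + 1 columns for mgc):
def _demo_streams2wav():
    config = {'bindir': 'tools/bin',    ## assumed location of SPTK binaries
              'mcc_alpha': 0.58,        ## warping factor commonly used at 16kHz
              'mcc_order': 59,
              'sample_rate': 16000}
    nframes = 500
    streams = {'mgc': np.random.randn(nframes, 60),
               'lf0': np.random.randn(nframes, 1),
               'bap': np.random.randn(nframes, 1)}
    streams2wav(streams, '/tmp/resynth.wav', config)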
def main_work():
    #################################################
    # ======== Get stuff from command line ==========
    a = ArgumentParser()
    a.add_argument('-i', dest='indir', required=True)
    a.add_argument('-f', dest='foutdir', required=True,
                   help="Put formant freq output here: make it if it doesn't exist")
    a.add_argument('-b', dest='boutdir', required=True,
                   help="Put formant bandwidth output here: make it if it doesn't exist")
    opts = a.parse_args()
    # ===============================================

    for direc in [opts.foutdir, opts.boutdir]:
        if not os.path.isdir(direc):
            os.makedirs(direc)

    for wavefile in glob.glob(opts.indir + '/*.wav'):
        _, base = os.path.split(wavefile)
        base = base.replace('.wav', '')
        print base

        ## run the formant tracker ('here' is assumed to be the directory
        ## containing this script and get_formant.tcl, set at module level):
        os.system('tclsh8.4 %s/get_formant.tcl %s > %s/%s.tmp' % (here, wavefile, opts.foutdir, base))
        mat = np.loadtxt('%s/%s.tmp' % (opts.foutdir, base))

        ## first 4 columns are formant frequencies, last 4 are bandwidths:
        formfreq = mat[:, :4]
        formband = mat[:, 4:]
        put_speech(formfreq, '%s/%s.formfreq' % (opts.foutdir, base))
        put_speech(formband, '%s/%s.formband' % (opts.boutdir, base))

    ## clean up the intermediate text files:
    for tmpfile in glob.glob(opts.foutdir + '/*.tmp'):
        os.system('rm %s' % (tmpfile))
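## A sketch of a safer alternative to os.system + .tmp files for the
## tracker call above, using subprocess and a real temporary file. This
## is a suggested variant under stated assumptions, not the script's
## current behaviour:
import subprocess
import tempfile

def run_formant_tracker(wavefile, tcl_script):
    ## run the Snack formant script and parse its stdout as a matrix:
    with tempfile.NamedTemporaryFile(suffix='.tmp') as tmp:
        subprocess.check_call(['tclsh8.4', tcl_script, wavefile], stdout=tmp)
        tmp.flush()
        return np.loadtxt(tmp.name)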
def main_work():
    #################################################
    # ======== Get stuff from command line ==========
    a = ArgumentParser()
    a.add_argument('-w', dest='wavfile', required=True)
    a.add_argument('-f', dest='feature_dir', required=True)
    a.add_argument('-p', dest='pm_dir', required=True)
    a.add_argument('-o', dest='outdir', required=True)
    a.add_argument('-x', dest='feature_extension', required=True)
    a.add_argument('-d', dest='feature_dim', type=int, required=True)
    a.add_argument('-s', dest='fshift_seconds', type=float, default=0.005, required=False)
    a.add_argument('-l', dest='labdir', default=None, help='not currently used')
    a.add_argument('-win', dest='windowing_convention', default='',
                   help='How to determine locations of windows; by default, guessed based on feature_extension')
    opts = a.parse_args()
    # ===============================================

    ## temporary check not to use labels:
    assert opts.labdir is None

    if not os.path.isdir(opts.outdir):
        os.makedirs(opts.outdir)

    _, base = os.path.split(opts.wavfile)
    base = base.replace('.wav', '')

    pm_fname = os.path.join(opts.pm_dir, base + '.pm')
    feature_fname = os.path.join(opts.feature_dir, base + '.' + opts.feature_extension)
    for fname in [opts.wavfile, pm_fname, feature_fname]:
        if not os.path.isfile(fname):
            sys.exit('File does not exist: %s' % (fname))

    ## read data from files (only HTK-format mfcc files carry a header):
    wave, sample_rate = read_wave(opts.wavfile)
    if opts.feature_extension == 'mfcc':
        features = get_speech(feature_fname, opts.feature_dim, remove_htk_header=True)
    else:
        features = get_speech(feature_fname, opts.feature_dim, remove_htk_header=False)
    pms_seconds = read_pm(pm_fname)

    ## Convert seconds -> waveform sample numbers:
    pms = np.asarray(np.round(pms_seconds * sample_rate), dtype=int)
    len_wave = len(wave)

    ## guess the windowing convention from the feature type unless overridden:
    if opts.windowing_convention:
        windowing_convention = opts.windowing_convention
    else:
        if opts.feature_extension == 'mfcc':
            windowing_convention = 'HTK'
        elif opts.feature_extension in ['formfreq', 'formband']:
            windowing_convention = 'snack'
        else:
            windowing_convention = 'world'

    if opts.feature_extension in vuv_stream_names:
        ## then we need to handle voicing decision specially:
        features, vuv = interp_fzero(features)
        ps_features = pitch_synchronous_resample(len_wave, sample_rate, opts.fshift_seconds,
                                                 pms, features,
                                                 windowing_convention=windowing_convention)
        ps_vuv = pitch_synchronous_resample(len_wave, sample_rate, opts.fshift_seconds,
                                            pms, vuv, int_type='nearest',
                                            windowing_convention=windowing_convention)
        assert ps_features.shape == ps_vuv.shape
        ## reimpose voicing decision on resampled F0:
        ps_features[ps_vuv == 0] = 0
    else:
        ps_features = pitch_synchronous_resample(len_wave, sample_rate, opts.fshift_seconds,
                                                 pms, features,
                                                 windowing_convention=windowing_convention)

    put_speech(ps_features, os.path.join(opts.outdir, base + '.' + opts.feature_extension))

    ## NB: while the assert above stands, this branch is unreachable:
    if opts.labdir is not None:
        labfile = os.path.join(opts.labdir, base + '.lab')
        print 'TODO -- labels!'
        ## pitchmarks in HTK 100ns units:
        pms_htkunit = np.asarray(np.round(pms_seconds * 10000000), dtype=int)
        label = read_label(labfile)
        assign_pm_to_labels(pms_htkunit, label)
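## A minimal sketch of what interp_fzero is assumed to do (an assumption
## for illustration, not the project's implementation): linearly
## interpolate F0 through unvoiced (zero) frames and return the original
## voicing decision as a separate mask, so it can be reimposed after the
## pitch-synchronous resampling above.
def interp_fzero_sketch(f0):
    f0 = np.asarray(f0, dtype=float).flatten()
    vuv = (f0 > 0.0).astype(float).reshape((-1, 1))
    voiced = np.where(f0 > 0.0)[0]
    if voiced.size == 0:
        return f0.reshape((-1, 1)), vuv
    ## np.interp fills unvoiced gaps from the surrounding voiced values:
    filled = np.interp(np.arange(len(f0)), voiced, f0[voiced])
    return filled.reshape((-1, 1)), vuv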
encoder = train_network_from_generators(
    train_provider, val_provider, insize, outsize,
    opts.output_dir + '/model.krs',
    architecture=[2048, 2048, opts.outdim, 2048, 2048],
    activation='relu', max_epoch=3, patience=5, classification=False,
    bottleneck=2, truncate_at_bottleneck=True)

def encode_provider_files(provider, encoder, outdir):
    ## pass every file through the encoder and write the bottleneck features:
    provider.reset()
    while provider.file_index < len(provider.filelist):
        X, _ = provider.get_file_data_from_one_file()
        base = provider.get_filename()
        provider.file_index += 1  ## advance before any skip, otherwise empty files loop forever
        if X.size == 0:
            print 'skip %s' % (base)
            continue
        encoded = encoder.predict(X)
        put_speech(encoded, os.path.join(outdir, base + '.aef'))

encode_provider_files(train_provider, encoder, opts.output_dir + '/aef')
encode_provider_files(val_provider, encoder, opts.output_dir + '/aef')
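## A hedged sketch of what truncate_at_bottleneck amounts to in Keras
## (hypothetical helper; train_network_from_generators is assumed to do
## something equivalent internally): build a new model that shares
## weights with the trained autoencoder but ends at the bottleneck layer,
## so predict() returns the opts.outdim-dimensional codes.
from keras.models import Model

def truncate_at_layer(model, layer_index):
    ## outputs of the chosen hidden layer become the encoder's outputs:
    return Model(inputs=model.input, outputs=model.layers[layer_index].output)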
## refuse to overwrite existing output (checking both directories, since
## makedirs would fail on either one if it already existed):
for direc in [mfcc12, energy]:
    if os.path.isdir(direc):
        sys.exit('%s already exists' % (direc))
os.makedirs(mfcc12)
os.makedirs(energy)

for mfcc_fname in sorted(glob.glob(mfcc_dir + '/*.mfcc')):
    _, base = os.path.split(mfcc_fname)
    base = base.replace('.mfcc', '')
    print base
    speech = get_speech(mfcc_fname, 13)

    ## remove outlying values which make later standardisation of the data crazy:
    speech[speech < -100.0] = 0.0
    speech[speech > 100.0] = 0.0

    ## split the first coefficient (energy) from the remaining 12 MFCCs:
    e = speech[:, 0].reshape(-1, 1)
    m = speech[:, 1:]
    put_speech(e, os.path.join(energy, base + '.energy'))
    put_speech(m, os.path.join(mfcc12, base + '.mfcc12'))
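## Quick sanity check that the two streams stay frame-aligned after the
## split (hypothetical basename, for illustration only):
def _check_split(base):
    e = get_speech(os.path.join(energy, base + '.energy'), 1)
    m = get_speech(os.path.join(mfcc12, base + '.mfcc12'), 12)
    assert len(e) == len(m), 'energy and mfcc12 frame counts differ'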
# ## this is the training data as regenerated by LSTM trained on it (for target cost):
# streams_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/blizzard_entries/blizzard2017/hybrid_voice/data/predicted_params/train/'
# topoutdir = '/tmp/testpad'

## --------

## HTS style labels used in Blizzard:-
hts_quinphone_regex = r'([^~]+)~([^-]+)-([^\+]+)\+([^\=]+)\=([^:]+)'

stream_list = ['mgc', 'lf0']
stream_dims = {'mgc': 60, 'lf0': 1}

## NB: labdir, streams_dir and topoutdir are assumed to be set above
## (example values for the latter two appear commented out at the top):
for labfname in glob.glob(labdir + '/*.lab'):
    print labfname
    lab = read_label(labfname, hts_quinphone_regex)
    base = basename(labfname)
    for stream in stream_list:
        stream_file = os.path.join(streams_dir, stream, base + '.' + stream)
        if not os.path.isfile(stream_file):
            print 'skip!'
            continue
        speech = get_speech(stream_file, stream_dims[stream])
        ## pad the predicted parameters back out with the silence trimmed
        ## from the labels:
        speech = reinsert_terminal_silence(speech, lab)
        outdir = topoutdir + '/' + stream
        safe_makedir(outdir)
        put_speech(speech, outdir + '/' + base + '.' + stream)
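## A minimal sketch of what reinsert_terminal_silence is assumed to do
## (illustrative only, not the project's implementation): pad the
## predicted parameters with zero frames so their length matches the
## label's total duration, the leading and trailing silence having been
## trimmed before prediction.
def reinsert_terminal_silence_sketch(speech, n_frames_total, start_frame):
    padded = np.zeros((n_frames_total, speech.shape[1]))
    padded[start_frame:start_frame + len(speech), :] = speech
    return padded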