def make_hdf_magphase(datadir, database_fname, fftlength):
    '''
    Pack the magphase streams found under datadir into a single HDF5 file.

    datadir must contain the subdirectories mag/, real/, imag/ and f0/. For
    each file mag/<base>.mag that also has matching <base>.real, <base>.imag
    and <base>.f0 files, a group named <base> is created in database_fname
    holding the datasets 'mag', 'real' and 'imag' (read with get_speech at a
    width of fftlength // 2 + 1) plus 'f0_interp' and 'vuv' as returned by
    speech_manip.lin_interp_f0. Utterances lacking any stream are skipped.

    Raises AssertionError if a stream subdirectory is missing.
    '''
    streams = ['mag', 'real', 'imag', 'f0']

    ## floor division keeps the width an integer under any division semantics
    HALFFFTLEN = (fftlength // 2) + 1

    for stream in streams:
        assert os.path.isdir(os.path.join(datadir, stream))

    def stream_fname(base, stream):
        return os.path.join(datadir, stream, base + '.' + stream)

    f = h5py.File(database_fname, "w")
    try:
        for magfile in sorted(glob.glob(os.path.join(datadir, 'mag/*.mag'))):
            base = basename(magfile)
            print(base)

            ## only keep utterances for which every stream is present
            if not all(os.path.isfile(stream_fname(base, stream)) for stream in streams):
                print(' ---> skip!')
                continue

            utt_group = f.create_group(base)
            for stream in ['mag', 'real', 'imag']:
                speech = get_speech(stream_fname(base, stream), HALFFFTLEN)
                utt_group.create_dataset(stream, data=speech)

            f0 = get_speech(stream_fname(base, 'f0'), 1)
            f0_interp, vuv = speech_manip.lin_interp_f0(f0)
            utt_group.create_dataset('f0_interp', data=f0_interp)
            utt_group.create_dataset('vuv', data=vuv)
    finally:
        ## close the database even when reading or writing an utterance fails
        f.close()
def preload_all_magphase_utts(self):
    '''
    Load the magphase streams of every distinct training utterance into
    self.waveforms.

    For each unique name in self.train_filenames the mag, real, imag and f0
    files under self.config['full_magphase_dir'] are read, f0 is passed
    through speech_manip.lin_interp_f0, and the tuple
    (mag, real, imag, f0_interp, vuv) is stored under the utterance name.
    The whole operation is timed with self.start_clock / self.stop_clock.
    '''
    start_time = self.start_clock('Preload magphase utts for corpus')
    for base in np.unique(self.train_filenames):
        print(base)

        ## spectral streams all share the FFTHALFLEN width; read in the order mag, real, imag
        spectra = []
        for stream in ('mag', 'real', 'imag'):
            fname = os.path.join(self.config['full_magphase_dir'], stream, base + '.' + stream)
            spectra.append(get_speech(fname, FFTHALFLEN))
        mag_full, real_full, imag_full = spectra

        f0_fname = os.path.join(self.config['full_magphase_dir'], 'f0', base + '.f0')
        f0_interp, vuv = speech_manip.lin_interp_f0(get_speech(f0_fname, 1))

        self.waveforms[base] = (mag_full, real_full, imag_full, f0_interp, vuv)
    self.stop_clock(start_time)
def preload_magphase_utts(self, path):
    '''
    Make sure the utterances referenced by a path are held in self.waveforms.

    path is a sequence of indices into self.train_filenames. Utterances
    already present in self.waveforms are left untouched; the others are
    read from self.config['full_magphase_dir'] and stored as the tuple
    (mag, real, imag, f0_interp, vuv).
    '''
    for index in path:
        base = self.train_filenames[index]
        if base in self.waveforms:
            ## already in memory -- nothing to load
            continue

        magphase_dir = self.config['full_magphase_dir']

        def read_stream(stream, width):
            return get_speech(os.path.join(magphase_dir, stream, base + '.' + stream), width)

        mag_full = read_stream('mag', FFTHALFLEN)
        real_full = read_stream('real', FFTHALFLEN)
        imag_full = read_stream('imag', FFTHALFLEN)
        f0_full = read_stream('f0', 1)
        f0_interp, vuv = speech_manip.lin_interp_f0(f0_full)

        self.waveforms[base] = (mag_full, real_full, imag_full, f0_interp, vuv)
def get_mean(flist, dim, exclude_uv=False):
    '''
    Compute the per-coefficient mean over all frames of the files in flist.

    Files which do not exist are ignored, and files containing any NaN or
    infinite value are excluded. With exclude_uv, only frames whose first
    column is > 0.0 contribute.

    Returns (mean_vec, frame_count), where mean_vec has dim entries and
    frame_count is the number of frames that were summed.
    '''
    total = np.zeros(dim)
    n_frames = 0
    for fname in flist:
        if not os.path.isfile(fname):
            continue
        print('mean: ' + fname)
        speech = get_speech(fname, dim)

        n_bad_values = np.sum(np.isnan(speech)) + np.sum(np.isinf(speech))
        if n_bad_values > 0:
            print('EXCLUDE ' + fname)
            continue

        if exclude_uv:
            ## keep only frames whose first column is positive
            voiced_rows = speech[:, 0] > 0.0
            speech = speech[voiced_rows, :]

        total += speech.sum(axis=0)
        n_rows, _ = np.shape(speech)
        n_frames += n_rows

    return total / float(n_frames), n_frames
def world_synth(cmpfile, wavefile, config, denorm=False):
    '''
    Read a composed parameter file, split it into streams and hand them to
    streams2wav to write wavefile.

    Stream names and widths come from config['stream_names'] and
    config['datadims_list']; a 'vuv' stream of width 1 is added. With
    denorm, the data is first passed through destandardise. If an 'lf0'
    stream is present it is replaced by its exponential, with entries set
    to 0.0 wherever the 'vuv' stream (if present) is <= 0.5.
    '''
    names = config['stream_names']
    datadims = dict(zip(names, config['datadims_list']))
    datadims['vuv'] = 1

    ## NOTE(review): the file is read one column wider than the summed stream widths -- confirm why
    total_width = sum(datadims.values()) + 1
    speech = get_speech(cmpfile, total_width)

    if denorm:
        speech = destandardise(speech, config)

    streams = split_into_streams(speech, names, datadims)

    if 'lf0' in streams:
        fzero = numpy.exp(streams['lf0'])
        vuv_thresh = 0.5
        if 'vuv' in streams:
            unvoiced = streams['vuv'] <= vuv_thresh
            fzero[unvoiced] = 0.0
        ## the 'lf0' entry now holds exponentiated values
        streams['lf0'] = fzero

    streams2wav(streams, wavefile, config)
def test():
    '''
    Ad hoc check: for every MFCC file under a hard-coded directory, read the
    corresponding wave and the MFCCs, then print the frame centres computed
    by get_mfcc_frame_centres, their number, and the MFCC matrix shape.
    '''
    mfcc_pattern = '/afs/inf.ed.ac.uk/user/o/owatts/sim2/oliver/slm_data_work/fls_hybrid/feat_29/world_reaper/mfcc/*.mfcc'
    for mfcc in glob.glob(mfcc_pattern):
        ## the wave lives in the sibling tmp/ directory under the same base name
        wavfile = mfcc.replace('.mfcc', '.wav')
        wavfile = wavfile.replace('/mfcc/', '/tmp/')

        wave, sample_rate = read_wave(wavfile)
        mf = get_speech(mfcc, 13, remove_htk_header=True)
        centres = get_mfcc_frame_centres(len(wave), 48000, 0.002, 0.010)

        print(centres)
        print(len(centres))
        print(mf.shape)
def load_data(feat_dir, lab_dir, categories):
    '''
    Load MFCC features from feat_dir together with labels from lab_dir.

    Every feature file read contributes to feats_only / unlabelled_names.
    Files for which <base>.lab exists additionally contribute frame-aligned
    features and labels, both truncated to the shorter of the two.

    Returns (labelled_feats, all_labels, feats_only, unlabelled_names):
    the stacked labelled features, the concatenated labels, the list of
    per-file feature matrices and the list of their base names.

    Raises AssertionError if lab_dir contains no .lab files.
    '''
    n_label_files = len(glob.glob(lab_dir + '/*.lab'))
    assert n_label_files > 0, 'no labels in %s' % (lab_dir)

    feats = glob.glob(feat_dir + '/*.mfcc')

    feats_only = []
    unlabelled_names = []
    labelled_feats = []
    all_labels = []

    ## NOTE(review): only the first 6 feature files are used -- looks like a debugging limit, confirm
    for feat in feats[:6]:
        base = os.path.split(feat)[1].replace('.mfcc', '')
        print('loading %s' % (base))

        features = get_speech(feat, 13, remove_htk_header=True)
        feats_only.append(features)
        unlabelled_names.append(base)

        lab = os.path.join(lab_dir, base + '.lab')
        if not os.path.isfile(lab):
            continue

        print('getting label for %s' % (base))
        labels = read_labels(lab, categories)

        ## trim features and labels to their common number of frames
        feat_frames, _ = features.shape
        lab_frames, = labels.shape
        frames = min(feat_frames, lab_frames)
        labelled_feats.append(features[:frames, :])
        all_labels.append(labels[:frames])

    labelled_feats = np.vstack(labelled_feats)
    all_labels = np.concatenate(all_labels)

    return labelled_feats, all_labels, feats_only, unlabelled_names
def get_std(flist, dim, mean_vec, exclude_uv=False):
    '''
    Compute one standard deviation value for a whole stream.

    Unlike the mean, a single value is shared by all coefficients of the
    stream so that the relative ranges of the coefficients are preserved.
    The value is the largest per-coefficient standard deviation (about
    mean_vec), repeated to a 1 x dim matrix.

    Files which do not exist are ignored, and files containing any NaN or
    infinite value are excluded. With exclude_uv, only frames whose first
    column is > 0.0 contribute.
    '''
    sq_diff_total = np.zeros(dim)
    n_frames = 0
    for fname in flist:
        if not os.path.isfile(fname):
            continue
        print('std: ' + fname)
        speech = get_speech(fname, dim)

        n_bad_values = np.sum(np.isnan(speech)) + np.sum(np.isinf(speech))
        if n_bad_values > 0:
            print('EXCLUDE ' + fname)
            continue

        if exclude_uv:
            ## keep only frames whose first column is positive
            speech = speech[speech[:, 0] > 0.0, :]

        n_rows, _ = np.shape(speech)
        ## row vector, so that it broadcasts against every frame
        mean_vec = mean_vec.reshape((1, -1))
        sq_diff_total += ((speech - mean_vec) ** 2).sum(axis=0)
        n_frames += n_rows

    max_diff_sum = sq_diff_total.max()
    print(mean_vec.tolist())
    print(max_diff_sum.tolist())

    std_val = (max_diff_sum / float(n_frames)) ** 0.5
    return np.ones((1, dim)) * std_val
def compose_speech(feat_dir_dict, base, stream_list, datadims, ignore_streams=('triphone',)):
    '''
    Read the streams of utterance base and join them column-wise.

    feat_dir_dict maps stream name -> directory, datadims maps stream name
    -> width. Streams named in ignore_streams are left out. Each stream is
    read from <dir>/<base>.<stream>; an 'aef' stream gets one row of zeros
    added before and after. In streams listed in vuv_stream_names, values
    <= 0.0 are replaced by const.special_uv_value (they will be specially
    weighted later). Where streams differ in number of frames, all are
    cut to the shortest.

    Where there is trouble (a stream file does not exist), this is
    signalled by returning a 1 x 1 matrix.
    '''
    ## the default is an immutable tuple, so it cannot be altered between calls
    stream_list = [stream for stream in stream_list if stream not in ignore_streams]

    stream_data_list = []
    for stream in stream_list:
        stream_fname = os.path.join(feat_dir_dict[stream], base + '.' + stream)
        if not os.path.isfile(stream_fname):
            print(stream_fname + ' does not exist')
            return np.zeros((1, 1))

        stream_data = get_speech(stream_fname, datadims[stream])

        if stream == 'aef':
            pad_row = np.zeros((1, datadims[stream]))
            stream_data = np.vstack([pad_row, stream_data, pad_row])

        ### Just set unvoiced frames to the special value (they will be specially weighted later):
        if stream in vuv_stream_names:
            stream_data[stream_data <= 0.0] = const.special_uv_value

        stream_data_list.append(stream_data)

    ## where data has different number of frames per stream, chop off the extra frames:
    nframe = min(np.shape(data)[0] for data in stream_data_list)
    stream_data_list = [data[:nframe, :] for data in stream_data_list]

    return np.hstack(stream_data_list)
def main_work():
    '''
    Command-line entry point: resample the features of one utterance so that
    there is one frame per pitchmark.

    Reads the wave given with -w, the pitchmark file <pm_dir>/<base>.pm and
    the feature file <feature_dir>/<base>.<feature_extension>, resamples
    the features with pitch_synchronous_resample, and writes the result to
    <outdir>/<base>.<feature_extension>. For extensions listed in
    vuv_stream_names, the data is first passed through interp_fzero and the
    voicing decision is reimposed after resampling.

    Exits with a message if one of the three input files does not exist.
    '''
    #################################################
    # ======== Get stuff from command line ==========
    parser = ArgumentParser()
    parser.add_argument('-w', dest='wavfile', required=True)
    parser.add_argument('-f', dest='feature_dir', required=True)
    parser.add_argument('-p', dest='pm_dir', required=True)
    parser.add_argument('-o', dest='outdir', required=True)
    parser.add_argument('-x', dest='feature_extension', required=True)
    parser.add_argument('-d', dest='feature_dim', type=int, required=True)
    parser.add_argument('-s', dest='fshift_seconds', type=float, default=0.005, required=False)
    parser.add_argument('-l', dest='labdir', default=None, help='not currently used')
    parser.add_argument('-win', dest='windowing_convention', default='',
                        help='How to determine locations of windows, by default, guessed based on feature_extension')
    opts = parser.parse_args()
    # ===============================================

    extension = opts.feature_extension

    ## temporary check not to use labels:
    assert opts.labdir is None

    if not os.path.isdir(opts.outdir):
        os.makedirs(opts.outdir)

    base = os.path.split(opts.wavfile)[1].replace('.wav', '')
    pm_fname = os.path.join(opts.pm_dir, base + '.pm')
    feature_fname = os.path.join(opts.feature_dir, base + '.' + extension)

    for fname in [opts.wavfile, pm_fname, feature_fname]:
        if not os.path.isfile(fname):
            sys.exit('File does not exist: %s' % (fname))

    ## read data from files
    wave, sample_rate = read_wave(opts.wavfile)
    ## only mfcc files are read with remove_htk_header=True
    features = get_speech(feature_fname, opts.feature_dim,
                          remove_htk_header=(extension == 'mfcc'))
    pms_seconds = read_pm(pm_fname)

    ## Convert seconds -> waveform sample numbers:-
    pms = np.asarray(np.round(pms_seconds * sample_rate), dtype=int)
    len_wave = len(wave)

    ## an explicit -win value wins; otherwise guess from the extension
    if extension == 'mfcc':
        guessed_convention = 'HTK'
    elif extension in ['formfreq', 'formband']:
        guessed_convention = 'snack'
    else:
        guessed_convention = 'world'
    windowing_convention = opts.windowing_convention or guessed_convention

    def resample(data, **kwargs):
        return pitch_synchronous_resample(len_wave, sample_rate, opts.fshift_seconds, pms, data,
                                          windowing_convention=windowing_convention, **kwargs)

    needs_voicing = extension in vuv_stream_names
    if needs_voicing:
        ## then we need to handle voicing decision specially:
        features, vuv = interp_fzero(features)

    ps_features = resample(features)

    if needs_voicing:
        ps_vuv = resample(vuv, int_type='nearest')
        assert ps_features.shape == ps_vuv.shape
        ## reimpose voicing decision on resampled F0:
        ps_features[ps_vuv == 0] = 0

    put_speech(ps_features, os.path.join(opts.outdir, base + '.' + extension))

    ## only reachable if the assertion on labdir above is not enforced
    if opts.labdir is not None:
        labfile = os.path.join(opts.labdir, base + '.lab')
        print('TODO -- labels!')
        pms_htkunit = np.asarray(np.round(pms_seconds * 10000000), dtype=int)
        label = read_label(labfile)
        assign_pm_to_labels(pms_htkunit, label)
def main_work(config, overwrite_existing_data=False):
    '''
    Build the HDF5 training database for a voice using epoch-based units.

    The utterance list is taken from the files of the first target stream,
    then limited by config['n_train_utts'] (a number, or a substring to
    match), config['test_patterns'] (utterances to leave out) and
    config['train_list'] (file listing utterances to keep), where given.
    Mean and std of the target and join streams are computed with
    get_mean_std and stored, followed by the standardised target features,
    unit names, filenames, within-sentence unit indexes and join contexts
    of every utterance that could be read. join_contexts holds one extra
    initial row of history.

    config                  : dict of settings; 'target_representation' must be 'epoch'
    overwrite_existing_data : when False, exit if the database file already exists;
                              when True, remove it and start again

    Raises AssertionError if the target representation is not 'epoch' or
    no training utterance is selected.
    '''
    assert config['target_representation'] == 'epoch'

    database_fname = get_data_dump_name(config)

    if os.path.isfile(database_fname):
        if not overwrite_existing_data:
            sys.exit('Data already exists at %s -- run with -X to overwrite it' % (database_fname))
        ## os.remove instead of a shell 'rm': copes with spaces and shell metacharacters in the path
        os.remove(database_fname)

    n_train_utts = config.get('n_train_utts', 0)  ## default (0): use all sentences

    target_feat_dirs = config['target_datadirs']
    datadims_target = config['datadims_target']
    stream_list_target = config['stream_list_target']
    ## get dicts mapping e.g. 'mgc': '/path/to/mgc/' : -
    target_stream_dirs = locate_stream_directories(target_feat_dirs, stream_list_target)

    join_feat_dirs = config['join_datadirs']
    datadims_join = config['datadims_join']
    stream_list_join = config['stream_list_join']
    ## get dicts mapping e.g. 'mgc': '/path/to/mgc/' : -
    join_stream_dirs = locate_stream_directories(join_feat_dirs, stream_list_join)

    ## First, work out initial list of training utterances based on files present in first stream subdir:
    first_stream = stream_list_target[0]  ## <-- typically, mgc, but not really important
    utt_list = sorted(glob.glob(target_stream_dirs[first_stream] + '/*.' + first_stream))
    flist = [os.path.split(fname)[-1].replace('.' + first_stream, '') for fname in utt_list]

    ## Next, limit training utterances by number or by pattern:
    if isinstance(n_train_utts, int):
        if (n_train_utts == 0 or n_train_utts > len(flist)):
            n_train_utts = len(flist)
        flist = flist[:n_train_utts]
    elif isinstance(n_train_utts, str):
        match_expression = n_train_utts
        flist = [name for name in flist if match_expression in name]
        print('Selected %s utts with pattern %s' % (len(flist), match_expression))

    ## Also filter for test material, in case they are in same directory:
    if 'test_patterns' in config:
        test_patterns = config['test_patterns']
        flist = [name for name in flist
                 if not any(pattern in name for pattern in test_patterns)]

    ## Finally, only take utterances which occur in train_list, if it is given in config:
    if 'train_list' in config:
        assert os.path.isfile(config['train_list']), 'File %s does not exist' % (config['train_list'])
        train_list = set(readlist(config['train_list']))
        flist = [name for name in flist if name in train_list]

    assert len(flist) > 0

    ## 1A) First pass: get mean and std per stream for each of {target,join}
    (mean_vec_target, std_vec_target) = get_mean_std(target_stream_dirs, stream_list_target,
                                                     datadims_target, flist)
    (mean_vec_join, std_vec_join) = get_mean_std(join_stream_dirs, stream_list_join,
                                                 datadims_join, flist)

    ## 1B) Initialise HDF5; store mean and std in HDF5:
    f = h5py.File(database_fname, "w")
    try:
        mean_target_dset = f.create_dataset("mean_target", np.shape(mean_vec_target), dtype='f', track_times=False)
        std_target_dset = f.create_dataset("std_target", np.shape(std_vec_target), dtype='f', track_times=False)
        mean_join_dset = f.create_dataset("mean_join", np.shape(mean_vec_join), dtype='f', track_times=False)
        std_join_dset = f.create_dataset("std_join", np.shape(std_vec_join), dtype='f', track_times=False)

        mean_target_dset[:] = mean_vec_target[:]
        std_target_dset[:] = std_vec_target[:]
        mean_join_dset[:] = mean_vec_join[:]
        std_join_dset[:] = std_vec_join[:]

        ## Set some values....
        target_dim = mean_vec_target.shape[0]
        join_dim = mean_vec_join.shape[0]

        target_rep_size = target_dim * target_rep_widths[config.get('target_representation', 'epoch')]

        ## NOTE(review): these three values are not used further in this function
        fshift_seconds = (0.001 * config['frameshift_ms'])
        fshift = int(config['sample_rate'] * fshift_seconds)
        samples_per_frame = fshift

        print('Go through data to find number of units:- ')

        n_units = 0
        new_flist = []
        first_stream, first_streamdir = sorted(target_stream_dirs.items())[0]
        for base in flist:
            featfile = os.path.join(first_streamdir, base + '.' + first_stream)
            if not os.path.exists(featfile):
                print('skipping %s' % (featfile))
                continue
            speech = get_speech(featfile, datadims_target[first_stream])
            npoint, _ = speech.shape
            n_units += npoint
            new_flist.append(base)
        flist = new_flist

        print('%s units (%s)' % (n_units, config.get('target_representation', 'epoch')))

        ## 2) Get ready to store data in HDF5:
        total_target_dim = target_rep_size

        ## maxshape makes a dataset resizable
        train_dset = f.create_dataset("train_unit_features", (n_units, total_target_dim),
                                      maxshape=(n_units, total_target_dim), dtype='f', track_times=False)
        phones_dset = f.create_dataset("train_unit_names", (n_units, ), maxshape=(n_units, ),
                                       dtype='|S50', track_times=False)
        filenames_dset = f.create_dataset("filenames", (n_units, ), maxshape=(n_units, ),
                                          dtype='|S50', track_times=False)
        unit_index_within_sentence_dset = f.create_dataset("unit_index_within_sentence_dset", (n_units, ),
                                                           maxshape=(n_units, ), dtype='i', track_times=False)
        join_contexts_dset = f.create_dataset("join_contexts", (n_units + 1, join_dim),
                                              maxshape=(n_units + 1, join_dim), dtype='f', track_times=False)

        ### TODO: use?
        if config.get('store_full_magphase', False):
            mp_mag_dset = f.create_dataset("mp_mag", (n_units, 513), maxshape=(n_units, 513), dtype='f', track_times=False)
            mp_imag_dset = f.create_dataset("mp_imag", (n_units, 513), maxshape=(n_units, 513), dtype='f', track_times=False)
            mp_real_dset = f.create_dataset("mp_real", (n_units, 513), maxshape=(n_units, 513), dtype='f', track_times=False)
            mp_fz_dset = f.create_dataset("mp_fz", (n_units, 1), maxshape=(n_units, 1), dtype='f', track_times=False)

        ## Standardise data (within streams), compose, and write utterance by utterance:
        start = 0

        print('Composing ....')
        print(flist)
        new_flist = []
        for base in flist:

            print(base)

            ### Get speech params for target cost (i.e. probably re-generated speech for consistency):
            t_speech = compose_speech(target_stream_dirs, base, stream_list_target, datadims_target)
            ## .shape is a tuple: comparing it to the list [1, 1] never matched, so
            ## the bad return value was not detected
            if t_speech.shape == (1, 1):  ## bad return value
                continue
            t_speech = standardise(t_speech, mean_vec_target, std_vec_target)

            ### Get speech params for join cost (i.e. probably natural speech).
            ### These are expected to have already been resampled so that they are pitch-synchronous.
            j_speech = compose_speech(join_stream_dirs, base, stream_list_join, datadims_join)
            if j_speech.size == 1:  ## bad return value
                continue
            j_speech = standardise(j_speech, mean_vec_join, std_vec_join)

            j_frames, j_dim = j_speech.shape
            t_frames, t_dim = t_speech.shape
            if j_frames != t_frames:
                print((j_frames, t_frames))
                print('Warning: number of rows in target cost features not same as number in join cost features:')
                print(' Skipping utterance!')
                continue

            ## The extra row of join history goes with the first utterance actually
            ## written: testing base == flist[0] left that row unwritten whenever
            ## the first utterance in the list was skipped.
            first_sentence_in_corpus = not new_flist

            if config.get('REPLICATE_IS2018_EXP', False):
                unit_features = t_speech[1:-1, :]  ## Representations for target cost
                if first_sentence_in_corpus:
                    context_data = j_speech[:-1, :]
                else:
                    context_data = j_speech[1:-1, :]
            else:  ## this should be consistent with how hi-dim frames are selected and remove a bug
                unit_features = t_speech  ## do not trim frames
                if first_sentence_in_corpus:
                    initial_history = j_speech[0, :].reshape((1, -1))  ### assume that first frame is silence
                    context_data = np.vstack([initial_history, j_speech])
                else:
                    context_data = j_speech

            ## TODO: reinstate this?:--
            ADD_PHONETIC_EPOCH = False
            if ADD_PHONETIC_EPOCH:
                ## NOTE(review): pms_samples is not defined in this function -- fix before enabling
                labfile = os.path.join(config['label_datadir'], base + '.' + config['lab_extension'])
                labs = read_label(labfile, config['quinphone_regex'])
                unit_names = resample_labels.pitch_synchronous_resample_label(48000, 0.005, pms_samples, labs)
            else:
                unit_names = ['_'] * (t_speech.shape[0])
            unit_names = np.array(unit_names)

            if config.get('REPLICATE_IS2018_EXP', False):
                unit_names = unit_names[1:-1]

            m, n = unit_features.shape
            filenames = [base] * m
            unit_index_within_sentence = np.arange(m)

            ## TODO: reinstate this as hi-dim writer?:--
            CHECK_MAGPHASE_SIZES = False
            if CHECK_MAGPHASE_SIZES:
                print('CHECK_MAGPHASE_SIZES')
                for extn in ['mag', 'imag', 'real', 'f0']:
                    direc = extn + '_full'
                    if extn == 'f0':
                        sdim = 1
                    else:
                        sdim = 513
                    fname = os.path.join(config['full_magphase_dir'], direc, base + '.' + extn)
                    full_stream = get_speech(fname, sdim)
                    print(direc)
                    print(full_stream.shape)

            ## TODO: reinstate this as hi-dim writer?:--
            ## NOTE(review): streams are trimmed by one frame at each end here, whereas
            ## unit_features is only trimmed under REPLICATE_IS2018_EXP -- confirm
            if config.get('store_full_magphase', False):
                mp_data = []
                for extn in ['mag', 'imag', 'real', 'f0']:
                    direc = extn + '_full'
                    if extn == 'f0':
                        sdim = 1
                    else:
                        sdim = 513
                    fname = os.path.join(config['full_magphase_dir'], direc, base + '.' + extn)
                    full_stream = get_speech(fname, sdim)
                    full_stream = full_stream[1:-1, :]
                    print(direc)
                    print(full_stream.shape)
                    mp_data.append(full_stream)

            ## Add everything to database:
            train_dset[start:start + m, :] = unit_features
            phones_dset[start:start + m] = unit_names
            filenames_dset[start:start + m] = filenames
            unit_index_within_sentence_dset[start:start + m] = unit_index_within_sentence

            ### join_contexts has extra initial frame of history -- deal with it:
            if first_sentence_in_corpus:
                join_contexts_dset[start:start + m + 1, :] = context_data
            else:
                join_contexts_dset[start + 1:start + m + 1, :] = context_data

            ### TODO: use?
            if config.get('store_full_magphase', False):
                (mp_mag, mp_imag, mp_real, mp_fz) = mp_data
                mp_mag_dset[start:start + m, :] = mp_mag
                mp_imag_dset[start:start + m, :] = mp_imag
                mp_real_dset[start:start + m, :] = mp_real
                mp_fz_dset[start:start + m, :] = mp_fz

            start += m
            new_flist.append(base)

        ## Number of units was computed before without considering dropped utterances, actual number
        ## will be smaller. Resize the data:
        actual_nframes = start
        print('\n\n\nNumber of units actually written:')
        print(actual_nframes)
        print('')

        train_dset.resize(actual_nframes, axis=0)
        phones_dset.resize(actual_nframes, axis=0)
        filenames_dset.resize(actual_nframes, axis=0)
        unit_index_within_sentence_dset.resize(actual_nframes, axis=0)
        join_contexts_dset.resize(actual_nframes + 1, axis=0)

        ### TODO
        if config.get('store_full_magphase', False):
            mp_mag_dset.resize(actual_nframes, axis=0)
            mp_imag_dset.resize(actual_nframes, axis=0)
            mp_real_dset.resize(actual_nframes, axis=0)
            mp_fz_dset.resize(actual_nframes, axis=0)

        print('')
        print('Storing hybrid voice data:')
        for thing in f.values():
            print(thing)
    finally:
        ## close the database even if composing or writing an utterance fails
        f.close()

    ## report the number of sentences written, not the requested limit or pattern
    print('Stored training data for %s sentences to %s' % (len(new_flist), database_fname))
## Split each 13-column mfcc file into an energy file (first column) and a
## 12-coefficient file (remaining columns). Refuse to run if either output
## directory already exists.
for outdir in (mfcc12, energy):
    if os.path.isdir(outdir):
        sys.exit('%s already exists'%(outdir))

for outdir in (mfcc12, energy):
    os.makedirs(outdir)

for mfcc_fname in sorted(glob.glob(mfcc_dir + '/*.mfcc')):
    base = os.path.split(mfcc_fname)[1].replace('.mfcc', '')
    print(base)

    speech = get_speech(mfcc_fname, 13)

    ## remove outlying values which make later standardisation of the data crazy:
    speech[speech < -100.0] = 0.0
    speech[speech > 100.0] = 0.0

    energy_column = speech[:, 0].reshape(-1, 1)
    cepstra = speech[:, 1:]

    put_speech(energy_column, os.path.join(energy, base + '.energy'))
    put_speech(cepstra, os.path.join(mfcc12, base + '.mfcc12'))
# ## this is the training data as regenerated by LSTM trained on it (for target cost):
# streams_dir = '/afs/inf.ed.ac.uk/group/cstr/projects/blizzard_entries/blizzard2017/hybrid_voice/data/predicted_params/train/'
# topoutdir = '/tmp/testpad'

## --------

## HTS style labels used in Blizzard:-
## (raw string: '\+' and '\=' are not valid escape sequences in an ordinary literal;
## the pattern text itself is unchanged)
hts_quinphone_regex = r'([^~]+)~([^-]+)-([^\+]+)\+([^\=]+)\=([^:]+)'

stream_list = ['mgc', 'lf0']
stream_dims = {'mgc': 60, 'lf0': 1}

## For every label file, pass each available stream of the same utterance through
## reinsert_terminal_silence and write the result under topoutdir/<stream>/
for labfname in glob.glob(labdir + '/*.lab'):
    print(labfname)

    lab = read_label(labfname, hts_quinphone_regex)
    base = basename(labfname)

    for stream in stream_list:
        stream_file = os.path.join(streams_dir, stream, base + '.' + stream)
        if not os.path.isfile(stream_file):
            ## only this stream is skipped; the other streams of the utterance are still processed
            print('skip!')
            continue

        speech = get_speech(stream_file, stream_dims[stream])
        speech = reinsert_terminal_silence(speech, lab)

        outdir = os.path.join(topoutdir, stream)
        safe_makedir(outdir)
        put_speech(speech, os.path.join(outdir, base + '.' + stream))
def retrieve_magphase_frag(self, index, extra_frames=0):
    '''
    Return the magphase fragment for training unit index as the tuple
    (mag_frag, real_frag, imag_frag, f0_frag, vuv_frag).

    The utterance data comes from self.hdf_magphase_pointer when
    self.use_hdf_magphase is set, and is otherwise read from
    self.config['full_magphase_dir'] (side effect: it is then also stored
    in self.waveforms). The fragment covers self.config['multiepoch']
    frames (default 1) starting at the unit's index within its sentence,
    extended by extra_frames on both sides. Frames that would fall outside
    the utterance are filled in by zero_pad_matrix, and when
    extra_frames > 0 each stream is passed through taper_matrix (weighting
    for cross-fade).
    '''
    base = self.train_filenames[index]

    if self.use_hdf_magphase:
        utt = self.hdf_magphase_pointer[base]
        mag_full = utt['mag'][:]
        real_full = utt['real'][:]
        imag_full = utt['imag'][:]
        f0_interp = utt['f0_interp'][:]
        vuv = utt['vuv'][:]
    else:
        ## side effect -- data persists in self.waveforms. TODO: Protect against mem errors
        ## (the data is always read from disk here; self.waveforms is written but not consulted)
        magphase_dir = self.config['full_magphase_dir']
        mag_full = get_speech(os.path.join(magphase_dir, 'mag', base + '.mag'), FFTHALFLEN)
        real_full = get_speech(os.path.join(magphase_dir, 'real', base + '.real'), FFTHALFLEN)
        imag_full = get_speech(os.path.join(magphase_dir, 'imag', base + '.imag'), FFTHALFLEN)
        f0_full = get_speech(os.path.join(magphase_dir, 'f0', base + '.f0'), 1)
        f0_interp, vuv = speech_manip.lin_interp_f0(f0_full)
        self.waveforms[base] = (mag_full, real_full, imag_full, f0_interp, vuv)

    start_index = self.unit_index_within_sentence[index]
    multiepoch = self.config.get('multiepoch', 1)
    end_index = start_index + multiepoch

    start_pad = 0
    end_pad = 0
    if extra_frames > 0:
        new_start_index = start_index - extra_frames
        new_end_index = end_index + extra_frames

        ## check out of bounds and record to zero pad later if necessary:
        nframes, _ = mag_full.shape
        if new_start_index < 0:
            start_pad = new_start_index * -1
        if new_end_index > nframes:
            end_pad = new_end_index - nframes

        ## clip the slice to the utterance; the clipped frames come back as padding
        start_index = 0 if start_pad > 0 else new_start_index
        end_index = nframes if end_pad > 0 else new_end_index

    full_streams = (mag_full, real_full, imag_full, f0_interp, vuv)
    frags = [full[start_index:end_index, :] for full in full_streams]

    ### add zero padding where the fragment ran off either end of the utterance:
    frags = [zero_pad_matrix(frag, start_pad, end_pad) for frag in frags]

    ## sanity check dimensions
    m, n = frags[0].shape
    assert m == multiepoch + (extra_frames*2)

    ### add taper (weighting for cross-fade):
    if extra_frames > 0:
        frags = [taper_matrix(frag, extra_frames*2) for frag in frags]

    (mag_frag, real_frag, imag_frag, f0_frag, vuv_frag) = frags
    return (mag_frag, real_frag, imag_frag, f0_frag, vuv_frag)