def babble(hp, num_sentences=0):
    '''
    Generate samples with the babbler model and write them out as wav files.

    Builds the Babbler and SSRN graphs in "synthesize" mode, restores the
    latest saved parameters for both, runs synth_babble followed by
    synth_mel2mag, converts each result with spectrogram2wav and writes
    numbered wav files under
    <hp.voicedir>/synth_babble/<babbler_epoch>_<ssrn_epoch>/.

    hp: config object
    num_sentences: number of samples to generate; 0 selects the default of 4.
    '''
    sample_count = 4 if num_sentences == 0 else num_sentences  # default

    babbler_graph = BabblerGraph(hp, mode="synthesize")
    print("Babbler graph loaded")
    ssrn_graph = SSRNGraph(hp, mode="synthesize")
    print("SSRN graph loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        babbler_epoch = restore_latest_model_parameters(sess, hp, 'babbler')
        ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        timer = start_clock('Babbling...')
        Y = synth_babble(hp, babbler_graph, sess, seed=False, nsamples=sample_count)
        stop_clock(timer)

        timer = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, ssrn_graph, sess)
        stop_clock(timer)

        # Replace any NaNs in the SSRN output before vocoding.  ### TODO: keep?
        if np.isnan(Z).any():
            Z = np.nan_to_num(Z)

        # Generate wav files in a directory named after the two restored epochs
        epoch_tag = '%s_%s'%(babbler_epoch, ssrn_epoch)
        outdir = os.path.join(hp.voicedir, 'synth_babble', epoch_tag)
        safe_makedir(outdir)
        for index, magnitudes in enumerate(Z):
            print("Applying Griffin-Lim to sample number %s"%(index))
            wav = spectrogram2wav(hp, magnitudes)
            write(outdir + "/{:03d}.wav".format(index), hp.sr, wav)
def copy_synth_SSRN_GL(hp, outdir):
    '''
    Copy-synthesis: run stored coarse features through the SSRN and
    spectrogram2wav, writing one wav file per utterance into outdir.

    hp: config object; hp.coarse_audio_dir is searched for <base>.npy files,
        hp.r scales the trimming length and hp.sr is the output sample rate.
    outdir: directory to write wav files to (created with safe_makedir).
    '''
    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis")
    fnames = dataset['fpaths']
    bases = [basename(fname) for fname in fnames]
    mels = [np.load(os.path.join(hp.coarse_audio_dir, base + '.npy')) for base in bases]
    # Remember the unpadded lengths so that the padded batch output can be trimmed
    lengths = [a.shape[0] for a in mels]
    mels = list2batch(mels, 0)

    g = SSRNGraph(hp, mode="synthesize")
    print("Graph (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        restore_latest_model_parameters(sess, hp, 'ssrn')

        print('Run SSRN...')
        Z = synth_mel2mag(hp, mels, g, sess)

        for i, mag in enumerate(Z):
            print("Working on %s"%(bases[i]))
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            wav = spectrogram2wav(hp, mag)
            # Name the file after the utterance currently being processed.
            # (Previously this used `base`, the loop variable of the list
            # comprehension above, which is not the current utterance.)
            soundfile.write(outdir + "/%s.wav"%(bases[i]), wav, hp.sr)
def __init__(self, hp, model_type='t2m', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Build the synthesis graphs, open a TF session and restore parameters.

    hp: config object, stored as self.hp
    model_type: 't2m' builds a Text2MelGraph as self.g1, 'unsup' builds a
        Graph_style_unsupervised. Any other value leaves self.g1 unset.
        The same string is passed on to the parameter-restoring helpers.
    t2m_epoch, ssrn_epoch: values > -1 restore that archived epoch; otherwise
        the latest saved model is restored and the value returned by
        restore_latest_model_parameters is stored on self instead.
    '''
    self.hp = hp
    self.t2m_epoch = t2m_epoch
    self.ssrn_epoch = ssrn_epoch

    wants_archived_t2m = t2m_epoch > -1
    wants_archived_ssrn = ssrn_epoch > -1

    # First-stage graph depends on the requested model type
    if model_type == 'unsup':
        self.g1 = Graph_style_unsupervised(hp, mode="synthesize")
        print("Graph 1 (unsup) loaded")
    elif model_type == 't2m':
        self.g1 = Text2MelGraph(hp, mode="synthesize")
        print("Graph 1 (t2m) loaded")

    self.g2 = SSRNGraph(hp, mode="synthesize")
    print("Graph 2 (ssrn) loaded")

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    ### TODO: specify epoch from comm line?
    ### TODO: t2m and ssrn from separate configs?

    if wants_archived_t2m:
        restore_archived_model_parameters(self.sess, hp, model_type, t2m_epoch)
    else:
        self.t2m_epoch = restore_latest_model_parameters(
            self.sess, hp, model_type)

    if wants_archived_ssrn:
        restore_archived_model_parameters(self.sess, hp, 'ssrn', ssrn_epoch)
    else:
        self.ssrn_epoch = restore_latest_model_parameters(
            self.sess, hp, 'ssrn')
def main_work():
    '''
    Command-line entry point: synthesise waveforms for saved validation data.

    1) Restores the latest SSRN checkpoint from <hp.logdir>-ssrn, runs every
       <hp.logdir>-t2m/validation_epoch_*/*.npy file (excluding *.mag.npy)
       through synth_mel2mag and saves each result next to its input as
       *.mag.npy.
    2) Calls synth_wave on all *.mag.npy files from the t2m validation
       directories and all *.npy files from the ssrn validation directories,
       serially when -ncores is 1 and in a process pool otherwise.

    Exits via sys.exit if no SSRN checkpoint is found.
    '''
    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-c', dest='config', required=True, type=str)
    a.add_argument('-ncores', type=int, default=1)
    opts = a.parse_args()

    # ===============================================

    hp = load_config(opts.config)

    ### 1) convert saved coarse mels to mags with latest-trained SSRN
    print('mel2mag: restore last saved SSRN')
    g = SSRNGraph(hp, mode="synthesize")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ## TODO: use restore_latest_model_parameters from synthesize?
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN')
        saver2 = tf.train.Saver(var_list=var_list)
        savepath = hp.logdir + "-ssrn"
        latest_checkpoint = tf.train.latest_checkpoint(savepath)
        if latest_checkpoint is None:
            sys.exit('No SSRN at %s?'%(savepath))
        # Last path component of the checkpoint with the 'model_epoch_' prefix removed
        ssrn_epoch = latest_checkpoint.strip('/ ').split('/')[-1].replace('model_epoch_', '')
        saver2.restore(sess, latest_checkpoint)
        print("SSRN Restored from latest epoch %s"%(ssrn_epoch))

        filelist = glob.glob(hp.logdir + '-t2m/validation_epoch_*/*.npy')
        # Skip outputs of a previous run of this step
        filelist = [fname for fname in filelist if not fname.endswith('.mag.npy')]
        batch, lengths = make_mel_batch(hp, filelist, oracle=False)
        Z = synth_mel2mag(hp, batch, g, sess, batchsize=32)
        print ('synthesised mags, now splitting batch:')
        maglist = split_batch(Z, lengths)
        for (infname, outdata) in tqdm.tqdm(zip(filelist, maglist)):
            # Swap only the trailing extension: str.replace would also rewrite
            # any '.npy' occurring earlier in the path (e.g. in a directory name).
            np.save(infname[:-len('.npy')] + '.mag.npy', outdata)

    ### 2) GL in parallel for both t2m and ssrn validation set
    print('GL for SSRN validation')
    filelist = glob.glob(hp.logdir + '-t2m/validation_epoch_*/*.mag.npy') + \
               glob.glob(hp.logdir + '-ssrn/validation_epoch_*/*.npy')

    if opts.ncores==1:
        for fname in tqdm.tqdm(filelist):
            synth_wave(hp, fname)
    else:
        # The context manager shuts the worker pool down once all jobs are done
        with ProcessPoolExecutor(max_workers=opts.ncores) as executor:
            futures = [executor.submit(synth_wave, hp, fpath) for fpath in filelist]
            # Wait for every job; result() re-raises any exception from a worker
            for future in tqdm.tqdm(futures):
                future.result()
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Synthesise the sentences returned by load_data(hp, mode="synthesis") to
    wav files using the Text2Mel and SSRN graphs.

    hp: config object
    speaker_id: if non-empty, looked up in hp.speaker_list and passed to the
        Text2Mel graph as speaker data; also appended to the output dir name.
    num_sentences: if > 0, only the first num_sentences sentences are used.
    ncores: 1 runs synth_wave serially; any other value uses a process pool
        with that many workers.
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to
        archived models.

    Output goes to <topoutdir>/t2m<t2m_epoch>_ssrn<ssrn_epoch>[_speaker-<id>].
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions

        use_fractional = 'fractional' in hp.history_type
        position_in_phone_data = [duration2position(dur, fractional=use_fractional)
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript.
    # Asking for exactly as many as are available is fine, hence <= rather than <.
    if num_sentences > 0:
        assert num_sentences <= len(fpaths), 'num_sentences exceeds the number of available sentences'
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
                  for fpath in fpaths]
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    # Load graph
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize")
    print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize")
    print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after futher efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                speaker_data=speaker_data, duration_data=duration_data,
                                position_in_phone_data=position_in_phone_data,
                                labels=labels)
        else: ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data,
                                            duration_data=duration_data,
                                            position_in_phone_data=position_in_phone_data,
                                            labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if np.isnan(Z).any():  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s"%(outdir))

        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # The context manager shuts the worker pool down once all jobs are done
            with ProcessPoolExecutor(max_workers=ncores) as executor:
                futures = []
                for i, mag in tqdm(enumerate(Z)):
                    outfile = os.path.join(outdir, bases[i] + '.wav')
                    mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                    futures.append(executor.submit(synth_wave, hp, mag, outfile))
                # Wait for every job; result() re-raises any exception from a worker
                for future in tqdm(futures):
                    future.result()

        # Plot attention alignments
        # NOTE(review): with the default num_sentences=0 this loop is empty, so
        # no alignments are plotted even though all sentences are synthesised
        # -- confirm whether that is intended.
        for i in range(num_sentences):
            plot_alignment(hp, alignments[i], utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir)
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Synthesise the sentences returned by load_data(hp, mode="synthesis") to
    wav files using the Text2Mel and SSRN graphs. For every sentence the
    trimmed attention alignment is also plotted and its CDP / APin values
    (from getCDP / getAP) are printed.

    hp: config object. Note that this function writes to hp: hp.text_lengths
        is set when hp.turn_off_monotonic_for_synthesis is true, and several
        training settings are rewritten when hp.norm is None (see below).
    speaker_id: if non-empty, looked up in hp.speaker_list and passed to the
        Text2Mel graph as speaker data; also appended to the output dir name.
    num_sentences: if > 0, only the first num_sentences sentences are used.
    ncores: 1 runs synth_wave serially; any other value uses a process pool
        with that many workers.
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to
        archived models.

    Output goes to <topoutdir>/t2m<t2m_epoch>_ssrn<ssrn_epoch>[_speaker-<id>].
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]
            return positions

        use_fractional = 'fractional' in hp.history_type
        position_in_phone_data = [duration2position(dur, fractional=use_fractional)
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
            if hp.select_central:
                # Keep only the label columns flagged with 1 by get_labels_indices
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:,central_ind==1]
            labels.append(label)
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1)) * speaker_ix
    else:
        speaker_data = None

    if hp.turn_off_monotonic_for_synthesis: # if FIA mechanism is turn off
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1

    # Load graph
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize")
    print("Graph 1 (t2m) loaded")

    # When the config has no norm set, the SSRN graph is built with
    # hp.norm == 'layer' and a different set of optimiser/batch settings,
    # which are written back after the graph has been constructed.
    # NOTE(review): the values written back are hard-coded rather than the
    # ones hp held beforehand, so a config with different settings is
    # silently overwritten -- confirm this is intended.
    if hp.norm is None:
        t2m_layer_norm = False
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}
    else:
        t2m_layer_norm = True

    g2 = SSRNGraph(hp, mode="synthesize")
    print("Graph 2 (ssrn) loaded")

    if not t2m_layer_norm:
        hp.norm = None
        hp.lr = 0.0002
        hp.beta1 = 0.5
        hp.beta2 = 0.9
        hp.epsilon = 0.000001
        hp.decay_lr = False
        hp.batchsize = {'t2m': 16, 'ssrn': 8}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph
        t = start_clock('Text2Mel generating...')
        ### TODO: after futher efficiency testing, remove this fork
        if 1:  ### efficient route -- only make K&V once  ## 3.86, 3.70, 3.80 seconds (2 sentences)
            text_lengths = get_text_lengths(L)
            K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
            Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                speaker_data=speaker_data, duration_data=duration_data,
                                position_in_phone_data=position_in_phone_data,
                                labels=labels)
        else: ## 5.68, 5.43, 5.38 seconds (2 sentences)
            Y, lengths = synth_text2mel(hp, L, g1, sess, speaker_data=speaker_data,
                                            duration_data=duration_data,
                                            position_in_phone_data=position_in_phone_data,
                                            labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if np.isnan(Z).any():  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)

        # Plot trimmed attention alignment with filename
        print("Plot attention, will save to following dir: %s"%(outdir))
        print("File | CDP | Ain")
        for i in range(len(Z)):
            outfile = os.path.join(outdir, bases[i])
            # Cut the alignment down to this sentence's text and output lengths
            trimmed_alignment = alignments[i,:text_lengths[i],:lengths[i]]
            plot_alignment(hp, trimmed_alignment, utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir, outfile=outfile)
            CDP = getCDP(trimmed_alignment)
            APin, _ = getAP(trimmed_alignment)
            print("%s | %.2f | %.2f"%( bases[i], CDP, APin))

        print("Generating wav files, will save to following dir: %s"%(outdir))

        assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

        if ncores==1:
            for i, mag in tqdm(enumerate(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # The context manager shuts the worker pool down once all jobs are done
            with ProcessPoolExecutor(max_workers=ncores) as executor:
                futures = []
                for i, mag in tqdm(enumerate(Z)):
                    outfile = os.path.join(outdir, bases[i] + '.wav')
                    mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                    futures.append(executor.submit(synth_wave, hp, mag, outfile))
                # Wait for every job; result() re-raises any exception from a worker
                for future in tqdm(futures):
                    future.result()