示例#1
0
def babble(hp, num_sentences=0):
    '''
    Draw samples from the babbler model, convert them to magnitude
    spectrograms with the SSRN, and write one wav per sample.

    hp: configuration object; hp.voicedir and hp.sr are read here.
    num_sentences: number of samples to draw; 0 selects the default of 4.

    Output goes to <hp.voicedir>/synth_babble/<babbler_epoch>_<ssrn_epoch>/
    with files named by zero-padded sample index (000.wav, 001.wav, ...).
    '''
    nsamples = 4 if num_sentences == 0 else num_sentences  # 4 is the default

    babbler_graph = BabblerGraph(hp, mode="synthesize")
    print("Babbler graph loaded")
    ssrn_graph = SSRNGraph(hp, mode="synthesize")
    print("SSRN graph loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Both restores return the epoch tag used to name the output directory.
        babbler_epoch = restore_latest_model_parameters(sess, hp, 'babbler')
        ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        timer = start_clock('Babbling...')
        mels = synth_babble(hp, babbler_graph, sess, seed=False, nsamples=nsamples)
        stop_clock(timer)

        timer = start_clock('Mel2Mag generating...')
        mags = synth_mel2mag(hp, mels, ssrn_graph, sess)
        stop_clock(timer)

        # Only sanitise when NaNs are actually present.  ### TODO: keep?
        contains_nan = np.isnan(mags).any()
        if contains_nan:
            mags = np.nan_to_num(mags)

        # Generate wav files
        epoch_tag = '%s_%s'%(babbler_epoch, ssrn_epoch)
        outdir = os.path.join(hp.voicedir, 'synth_babble', epoch_tag)
        safe_makedir(outdir)
        for index, mag in enumerate(mags):
            print("Applying Griffin-Lim to sample number %s"%(index))
            wav = spectrogram2wav(hp, mag)
            wav_path = outdir + "/{:03d}.wav".format(index)
            write(wav_path, hp.sr, wav)
示例#2
0
def copy_synth_SSRN_GL(hp, outdir):
    '''
    Copy-synthesis: load the stored coarse features for each file in the
    synthesis set, run them through the latest SSRN, and write one wav per
    utterance into outdir.

    hp: configuration object; hp.coarse_audio_dir, hp.r and hp.sr are read here.
    outdir: directory for the generated wavs (created via safe_makedir).
    '''
    safe_makedir(outdir)

    dataset = load_data(hp, mode="synthesis")
    fnames = dataset['fpaths']
    bases = [basename(fname) for fname in fnames]
    mels = [np.load(os.path.join(hp.coarse_audio_dir, base + '.npy')) for base in bases]
    # Remember the per-utterance frame counts before batching so that the
    # SSRN output can be trimmed back to each utterance's own length.
    lengths = [a.shape[0] for a in mels]
    mels = list2batch(mels, 0)

    g = SSRNGraph(hp, mode="synthesize"); print("Graph (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        restore_latest_model_parameters(sess, hp, 'ssrn')

        print('Run SSRN...')
        Z = synth_mel2mag(hp, mels, g, sess)

        for i, mag in enumerate(Z):
            print("Working on %s"%(bases[i]))
            mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
            wav = spectrogram2wav(hp, mag)
            # Name the file after this utterance. The previous code used the
            # comprehension variable `base`, which is undefined here in
            # Python 3 and is always the last utterance in Python 2.
            soundfile.write(outdir + "/%s.wav"%(bases[i]), wav, hp.sr)
示例#3
0
    def __init__(self, hp, model_type='t2m', t2m_epoch=-1, ssrn_epoch=-1):
        '''
        Build the synthesis graphs, open a TF session and restore parameters.

        hp: configuration object passed to the graph constructors and
            restore helpers.
        model_type: 't2m' builds a Text2MelGraph, 'unsup' builds a
            Graph_style_unsupervised; any other value leaves self.g1 unset.
        t2m_epoch, ssrn_epoch: values > -1 restore that archived epoch;
            otherwise the latest model is restored and the value returned
            by the restore helper is stored on the instance.
        '''
        self.hp = hp
        self.t2m_epoch = t2m_epoch
        self.ssrn_epoch = ssrn_epoch

        # First-stage graph depends on the requested model type.
        if model_type == 't2m':
            self.g1 = Text2MelGraph(hp, mode="synthesize")
            print("Graph 1 (t2m) loaded")
        elif model_type == 'unsup':
            self.g1 = Graph_style_unsupervised(hp, mode="synthesize")
            print("Graph 1 (unsup) loaded")

        # Second-stage graph is always the SSRN.
        self.g2 = SSRNGraph(hp, mode="synthesize")
        print("Graph 2 (ssrn) loaded")

        session = tf.Session()
        self.sess = session
        session.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        use_latest_t2m = not (t2m_epoch > -1)
        if use_latest_t2m:
            self.t2m_epoch = restore_latest_model_parameters(
                session, hp, model_type)
        else:
            restore_archived_model_parameters(session, hp, model_type,
                                              t2m_epoch)

        use_latest_ssrn = not (ssrn_epoch > -1)
        if use_latest_ssrn:
            self.ssrn_epoch = restore_latest_model_parameters(
                session, hp, 'ssrn')
        else:
            restore_archived_model_parameters(session, hp, 'ssrn',
                                              ssrn_epoch)
示例#4
0
def main_work():
    '''
    Command-line entry point.

    1) Restores the latest SSRN checkpoint from <hp.logdir>-ssrn and converts
       the .npy files under <hp.logdir>-t2m/validation_epoch_*/ to magnitude
       spectrograms, saved next to the inputs as *.mag.npy.
    2) Runs synth_wave over those *.mag.npy files and over the .npy files
       under <hp.logdir>-ssrn/validation_epoch_*/, serially or in a process
       pool depending on -ncores.

    Exits via sys.exit if no SSRN checkpoint is found.
    '''
    # ======== Get stuff from command line ==========
    parser = ArgumentParser()
    parser.add_argument('-c', dest='config', required=True, type=str)
    parser.add_argument('-ncores', type=int, default=1)
    opts = parser.parse_args()
    # ===============================================

    hp = load_config(opts.config)

    ### 1) convert saved coarse mels to mags with latest-trained SSRN
    print('mel2mag: restore last saved SSRN')
    g = SSRNGraph(hp,  mode="synthesize")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ## TODO: use restore_latest_model_parameters from synthesize?
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN')
        saver2 = tf.train.Saver(var_list=var_list)
        savepath = hp.logdir + "-ssrn"
        latest_checkpoint = tf.train.latest_checkpoint(savepath)
        if latest_checkpoint is None:
            sys.exit('No SSRN at %s?'%(savepath))
        # Epoch tag = last path component with the 'model_epoch_' prefix removed.
        ssrn_epoch = latest_checkpoint.strip('/ ').split('/')[-1].replace('model_epoch_', '')
        saver2.restore(sess, latest_checkpoint)
        print("SSRN Restored from latest epoch %s"%(ssrn_epoch))

        filelist = glob.glob(hp.logdir + '-t2m/validation_epoch_*/*.npy')
        # Skip outputs of a previous run so they are not fed back in as inputs.
        filelist = [fname for fname in filelist if not fname.endswith('.mag.npy')]
        batch, lengths = make_mel_batch(hp, filelist, oracle=False)
        Z = synth_mel2mag(hp, batch, g, sess, batchsize=32)
        print ('synthesised mags, now splitting batch:')
        maglist = split_batch(Z, lengths)
        # zip() has no length, so give tqdm the total explicitly.
        for (infname, outdata) in tqdm.tqdm(zip(filelist, maglist), total=len(filelist)):
            np.save(infname.replace('.npy','.mag.npy'), outdata)

    ### 2) GL in parallel for both t2m and ssrn validation set
    print('GL for SSRN validation')
    filelist = glob.glob(hp.logdir + '-t2m/validation_epoch_*/*.mag.npy') + \
               glob.glob(hp.logdir + '-ssrn/validation_epoch_*/*.npy')

    if opts.ncores==1:
        for fname in tqdm.tqdm(filelist):
            synth_wave(hp, fname)
    else:
        # The context manager shuts the pool down even if a job raises.
        with ProcessPoolExecutor(max_workers=opts.ncores) as executor:
            futures = [executor.submit(synth_wave, hp, fpath) for fpath in filelist]
            for future in tqdm.tqdm(futures):
                future.result()  # re-raises any exception from the worker
示例#5
0
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Synthesise the sentences returned by load_data(hp, mode="synthesis"):
    text -> Text2Mel -> SSRN -> waveform, then plot the attention alignments.

    hp: configuration object.
    speaker_id: if non-empty, looked up in hp.speaker_list and used as the
        speaker code for every sentence; also appended to the output dir name.
    num_sentences: if > 0, only the first num_sentences entries are
        synthesised (must not exceed the number available); 0 means all.
    ncores: 1 generates waveforms serially; any other value uses a process
        pool with that many workers.
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.

    Output is written to <topoutdir>/t2m<epoch>_ssrn<epoch>[_speaker-<id>]/.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        # NOTE(review): this iterates duration_data, which is only set when
        # hp.use_external_durations is true -- confirm the two options are
        # always enabled together.
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]  # keep every hp.r-th row
            return positions

        use_fractional = 'fractional' in hp.history_type
        position_in_phone_data = [duration2position(dur, fractional=use_fractional)
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        # '<=' so that asking for exactly all available sentences is allowed.
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = [np.load("{}/{}".format(hp.merlin_label_dir, base + ".npy"))
                  for base in bases]
        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None

    # Load graph
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")
    g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph.
        # Efficient route -- only make K&V once. The slower synth_text2mel
        # alternative was unreachable and did not produce the alignments
        # needed for plotting below, so it has been dropped.
        t = start_clock('Text2Mel generating...')
        text_lengths = get_text_lengths(L)
        K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
        Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                speaker_data=speaker_data, duration_data=duration_data,
                                position_in_phone_data=position_in_phone_data,
                                labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)
        print("Generating wav files, will save to following dir: %s"%(outdir))

        if ncores==1:
            for i, mag in tqdm(enumerate(Z), total=len(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # The context manager shuts the pool down even if a job raises.
            with ProcessPoolExecutor(max_workers=ncores) as executor:
                futures = []
                for i, mag in enumerate(Z):
                    outfile = os.path.join(outdir, bases[i] + '.wav')
                    mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                    futures.append(executor.submit(synth_wave, hp, mag, outfile))
                for future in tqdm(futures):
                    future.result()  # re-raises any exception from the worker

        # Plot attention alignments for every synthesised utterance. Looping
        # over range(num_sentences) plotted nothing under the default of 0.
        for i in range(len(bases)):
            plot_alignment(hp, alignments[i], utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir)
示例#6
0
def synthesize(hp, speaker_id='', num_sentences=0, ncores=1, topoutdir='', t2m_epoch=-1, ssrn_epoch=-1):
    '''
    Synthesise the sentences returned by load_data(hp, mode="synthesis"):
    text -> Text2Mel -> SSRN -> waveform. For every utterance the trimmed
    attention alignment is plotted and its CDP / Ain scores are printed.

    hp: configuration object. NOTE: it is mutated here -- hp.text_lengths is
        set when hp.turn_off_monotonic_for_synthesis is true, and several
        fields are temporarily overridden while the SSRN graph is built when
        hp.norm is None.
    speaker_id: if non-empty, looked up in hp.speaker_list and used as the
        speaker code for every sentence; also appended to the output dir name.
    num_sentences: if > 0, only the first num_sentences entries are
        synthesised (must not exceed the number available); 0 means all.
    ncores: 1 generates waveforms serially; any other value uses a process
        pool with that many workers.
    topoutdir: store samples under here; defaults to hp.sampledir
    t2m_epoch and ssrn_epoch: default -1 means use latest. Otherwise go to archived models.

    Output is written to <topoutdir>/t2m<epoch>_ssrn<epoch>[_speaker-<id>]/.
    '''
    assert hp.vocoder in ['griffin_lim', 'world'], 'Other vocoders than griffin_lim/world not yet supported'

    dataset = load_data(hp, mode="synthesis") #since mode != 'train' or 'validation', will load test_transcript rather than transcript
    fpaths, L = dataset['fpaths'], dataset['texts']
    position_in_phone_data = duration_data = labels = None # default
    if hp.use_external_durations:
        duration_data = dataset['durations']
        if num_sentences > 0:
            duration_data = duration_data[:num_sentences, :, :]

    if 'position_in_phone' in hp.history_type:
        ## TODO: combine + deduplicate with relevant code in train.py for making validation set
        # NOTE(review): this iterates duration_data, which is only set when
        # hp.use_external_durations is true -- confirm the two options are
        # always enabled together.
        def duration2position(duration, fractional=False):
            ### very roundabout -- need to deflate A matrix back to integers:
            duration = duration.sum(axis=0)
            positions = durations_to_position(duration, fractional=fractional)
            ###positions = end_pad_for_reduction_shape_sync(positions, hp)
            positions = positions[0::hp.r, :]  # keep every hp.r-th row
            return positions

        use_fractional = 'fractional' in hp.history_type
        position_in_phone_data = [duration2position(dur, fractional=use_fractional)
                                  for dur in duration_data]
        position_in_phone_data = list2batch(position_in_phone_data, hp.max_T)

    # Ensure we aren't trying to generate more utterances than are actually in our test_transcript
    if num_sentences > 0:
        assert num_sentences <= len(fpaths)
        L = L[:num_sentences, :]
        fpaths = fpaths[:num_sentences]

    bases = [basename(fpath) for fpath in fpaths]

    if hp.merlin_label_dir:
        labels = []
        for fpath in fpaths:
            label = np.load("{}/{}".format(hp.merlin_label_dir, basename(fpath)+".npy"))
            if hp.select_central:
                # Keep only the label columns flagged with 1 by get_labels_indices.
                central_ind = get_labels_indices(hp.merlin_lab_dim)
                label = label[:,central_ind==1]
            labels.append(label)

        labels = list2batch(labels, hp.max_N)

    if speaker_id:
        speaker2ix = dict(zip(hp.speaker_list, range(len(hp.speaker_list))))
        speaker_ix = speaker2ix[speaker_id]

        ## Speaker codes are held in (batch, 1) matrix -- tiling is done inside the graph:
        speaker_data = np.ones((len(L), 1))  *  speaker_ix
    else:
        speaker_data = None

    if hp.turn_off_monotonic_for_synthesis: # if FIA mechanism is turned off
        # Stored on hp before the Text2Mel graph is built below; presumably
        # the graph reads hp.text_lengths -- TODO confirm.
        text_lengths = get_text_lengths(L)
        hp.text_lengths = text_lengths + 1

    # Load graph
    ## TODO: generalise to combine other types of models into a synthesis pipeline?
    g1 = Text2MelGraph(hp, mode="synthesize"); print("Graph 1 (t2m) loaded")

    # When hp.norm is unset, the SSRN graph is built with layer norm and its
    # own optimiser settings, after which the Text2Mel-side values are put
    # back. NOTE(review): the values put back are hard-coded rather than
    # saved from hp -- confirm they match the configs in use.
    override_for_ssrn = hp.norm is None
    if override_for_ssrn:
        hp.norm = 'layer'
        hp.lr = 0.001
        hp.beta1 = 0.9
        hp.beta2 = 0.999
        hp.epsilon = 0.00000001
        hp.decay_lr = True
        hp.batchsize = {'t2m': 32, 'ssrn': 8}

    try:
        g2 = SSRNGraph(hp, mode="synthesize"); print("Graph 2 (ssrn) loaded")
    finally:
        # Undo the overrides even if building the SSRN graph fails, so that
        # hp is not left in the SSRN-specific state.
        if override_for_ssrn:
            hp.norm = None
            hp.lr = 0.0002
            hp.beta1 = 0.5
            hp.beta2 = 0.9
            hp.epsilon = 0.000001
            hp.decay_lr = False
            hp.batchsize = {'t2m': 16, 'ssrn': 8}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        ### TODO: specify epoch from comm line?
        ### TODO: t2m and ssrn from separate configs?

        if t2m_epoch > -1:
            restore_archived_model_parameters(sess, hp, 't2m', t2m_epoch)
        else:
            t2m_epoch = restore_latest_model_parameters(sess, hp, 't2m')

        if ssrn_epoch > -1:
            restore_archived_model_parameters(sess, hp, 'ssrn', ssrn_epoch)
        else:
            ssrn_epoch = restore_latest_model_parameters(sess, hp, 'ssrn')

        # Pass input L through Text2Mel Graph.
        # Efficient route -- only make K&V once. The slower synth_text2mel
        # alternative was unreachable and did not produce the alignments
        # needed below, so it has been dropped.
        t = start_clock('Text2Mel generating...')
        text_lengths = get_text_lengths(L)
        K, V = encode_text(hp, L, g1, sess, speaker_data=speaker_data, labels=labels)
        Y, lengths, alignments = synth_codedtext2mel(hp, K, V, text_lengths, g1, sess,
                                speaker_data=speaker_data, duration_data=duration_data,
                                position_in_phone_data=position_in_phone_data,
                                labels=labels)
        stop_clock(t)

        # Then pass output Y of Text2Mel Graph through SSRN graph to get high res spectrogram Z.
        t = start_clock('Mel2Mag generating...')
        Z = synth_mel2mag(hp, Y, g2, sess)
        stop_clock(t)

        if (np.isnan(Z).any()):  ### TODO: keep?
            Z = np.nan_to_num(Z)

        # Generate wav files
        if not topoutdir:
            topoutdir = hp.sampledir
        outdir = os.path.join(topoutdir, 't2m%s_ssrn%s'%(t2m_epoch, ssrn_epoch))
        if speaker_id:
            outdir += '_speaker-%s'%(speaker_id)
        safe_makedir(outdir)

        # Plot trimmed attention alignment with filename
        print("Plot attention, will save to following dir: %s"%(outdir))
        print("File |  CDP | Ain")
        for i, _ in enumerate(Z):
            outfile = os.path.join(outdir, bases[i])
            # Cut the alignment down to this utterance's text and output lengths.
            trimmed_alignment = alignments[i,:text_lengths[i],:lengths[i]]
            plot_alignment(hp, trimmed_alignment, utt_idx=i+1, t2m_epoch=t2m_epoch, dir=outdir, outfile=outfile)
            CDP = getCDP(trimmed_alignment)
            APin, _ = getAP(trimmed_alignment)  # second return value is not reported
            print("%s | %.2f | %.2f"%( bases[i], CDP, APin))

        print("Generating wav files, will save to following dir: %s"%(outdir))

        if ncores==1:
            for i, mag in tqdm(enumerate(Z), total=len(Z)):
                outfile = os.path.join(outdir, bases[i] + '.wav')
                mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                synth_wave(hp, mag, outfile)
        else:
            # The context manager shuts the pool down even if a job raises.
            with ProcessPoolExecutor(max_workers=ncores) as executor:
                futures = []
                for i, mag in enumerate(Z):
                    outfile = os.path.join(outdir, bases[i] + '.wav')
                    mag = mag[:lengths[i]*hp.r,:]  ### trim to generated length
                    futures.append(executor.submit(synth_wave, hp, mag, outfile))
                for future in tqdm(futures):
                    future.result()  # re-raises any exception from the worker