def build_model(self):
    with tf.variable_scope("Text2Mel"):
        ## keep scope names consistent with full Text2Mel
        ## to allow parameters to be reused more easily later

        # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        ## Babbler has no TextEnc

        with tf.variable_scope("AudioEnc"):
            self.Q = AudioEnc(self.hp, self.S, training=self.training,
                              reuse=self.reuse)
        with tf.variable_scope("Attention"):
            ## Babbler has no real attention. Dummy (all 0) text encoder outputs are supplied instead.
            # R: concat Q with zero vector (dummy text encoder outputs)
            dummy_R_prime = tf.zeros_like(self.Q)  ## R_prime shares shape of audio encoder output
            self.R = tf.concat((dummy_R_prime, self.Q), -1)
        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.hp, self.R,
                                             training=self.training,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (B, T/r, n_mels)
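# Illustrative sketch (not from the repository): because the Babbler reuses the
# "Text2Mel/AudioEnc" and "Text2Mel/AudioDec" scope names, its trained weights can later be
# restored by name into a full Text2Mel graph. The helper below is hypothetical and assumes
# TF1-style name-based checkpoints; the checkpoint path is an example only.
def restore_babbler_weights_sketch(sess, checkpoint_path):
    # Collect only the variables whose names exist in both the Babbler and the full Text2Mel graph.
    shared_vars = [v for v in tf.global_variables()
                   if v.name.startswith(('Text2Mel/AudioEnc', 'Text2Mel/AudioDec'))]
    saver = tf.train.Saver(var_list=shared_vars)
    saver.restore(sess, checkpoint_path)  # e.g. a Babbler checkpoint trained on untranscribed audio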
def __init__(self, num=1, mode="train"):
    '''
    Args:
      num: Either 1 or 2. 1 for Text2Mel, 2 for SSRN.
      mode: Either "train" or "synthesize".
    '''
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Set flag
    training = True if mode == "train" else False

    # Graph
    # Data Feeding
    ## L: Text. (B, N), int32
    ## mels: Reduced melspectrogram. (B, T/r, n_mels), float32
    ## mags: Magnitude. (B, T, n_fft//2+1), float32
    self.L = tf.placeholder(tf.int32, shape=(None, None))
    self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
    self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,))

    with tf.variable_scope("Text2Mel"):
        # Get S or decoder inputs. (B, T//r, n_mels)
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        # Networks
        with tf.variable_scope("TextEnc"):
            self.K, self.V = TextEnc(self.L, training=training)  # (N, Tx, e)

        with tf.variable_scope("AudioEnc"):
            self.Q = AudioEnc(self.S, training=training)

        with tf.variable_scope("Attention"):
            # R: (B, T/r, 2d)
            # alignments: (B, N, T/r)
            # max_attentions: (B,)
            self.R, self.alignments, self.max_attentions = Attention(
                self.Q, self.K, self.V,
                mononotic_attention=(not training),
                prev_max_attentions=self.prev_max_attentions)

        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.R, training=training)  # (B, T/r, n_mels)

    # During inference, the predicted melspectrogram values are fed.
    with tf.variable_scope("SSRN"):
        self.Z_logits, self.Z = SSRN(self.Y, training=training)

    with tf.variable_scope("gs"):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
def build_model(self):
    self.load_data_in_memory()
    self.add_data(reuse=self.reuse)

    with tf.variable_scope("Text2Mel"):
        ## keep scope names consistent with full Text2Mel
        ## to allow parameters to be reused more easily later

        # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        # Build a latent representation of expressiveness if uee is set in the config file
        # (uee: unsupervised expressiveness embedding).
        if self.hp.uee != 0:
            with tf.variable_scope("Audio2Emo"):
                with tf.variable_scope("AudioEnc"):
                    self.emos = Audio2Emo(self.hp, self.S,
                                          training=self.training,
                                          speaker_codes=self.speakers,
                                          reuse=self.reuse)  # (B, T/r, d=8)
                    # Average over time to get an utterance-level embedding, then keep a
                    # time axis of length 1 so it can be broadcast later.
                    self.emo_mean = tf.reduce_mean(self.emos, 1)
                    print(self.emo_mean.get_shape())
                    self.emo_mean = tf.expand_dims(self.emo_mean, axis=1)
                    print(self.emo_mean.get_shape())
                    #pdb.set_trace()
        else:
            print('No unsupervised expressive embedding')
            self.emo_mean = None
            #pdb.set_trace()

        ## Babbler has no TextEnc

        with tf.variable_scope("AudioEnc"):
            self.Q = AudioEnc(self.hp, self.S, training=self.training,
                              reuse=self.reuse)
        with tf.variable_scope("Attention"):
            ## Babbler has no real attention. Dummy (all 0) text encoder outputs are supplied instead.
            # R: concat Q with zero vector (dummy text encoder outputs)
            dummy_R_prime = tf.zeros_like(self.Q)  ## R_prime shares shape of audio encoder output
            self.R = tf.concat((dummy_R_prime, self.Q), -1)
        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.hp, self.R, emos=self.emo_mean,
                                             training=self.training,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (B, T/r, n_mels)
def __init__(self, num=1):
    # Load vocabulary
    self.char2idx, self.idx2char = self.load_vocab()

    # Set flag
    training = False

    # Graph
    # Data Feeding
    # Synthesize
    self.L = tf.placeholder(tf.int32, shape=(None, None))
    self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
    self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,))

    with tf.variable_scope("Text2Mel"):
        # Get S or decoder inputs. (B, T//r, n_mels)
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        # Networks
        with tf.variable_scope("TextEnc"):
            self.K, self.V = TextEnc(self.L, training=training)  # (N, Tx, e)

        with tf.variable_scope("AudioEnc"):
            self.Q = AudioEnc(self.S, training=training)

        with tf.variable_scope("Attention"):
            # R: (B, T/r, 2d)
            # alignments: (B, N, T/r)
            # max_attentions: (B,)
            self.R, self.alignments, self.max_attentions = Attention(
                self.Q, self.K, self.V,
                mononotic_attention=(not training),
                prev_max_attentions=self.prev_max_attentions)

        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.R, training=training)  # (B, T/r, n_mels)

    # During inference, the predicted melspectrogram values are fed.
    with tf.variable_scope("SSRN"):
        self.Z_logits, self.Z = SSRN(self.Y, training=training)

    with tf.variable_scope("gs"):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
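# Illustrative sketch (not from the repository): with the synthesis graph above, the mel
# spectrogram is generated autoregressively, one reduced frame per step, feeding the running
# output back in as `mels`. Assumes a restored session `sess`, a graph instance `g`, an
# encoded text batch `L` of shape (B, N), and hp.max_T / hp.n_mels from the hyperparameters;
# the helper and variable names are hypothetical.
def synthesize_mels_sketch(sess, g, L):
    import numpy as np  # local import to keep the sketch self-contained
    B = L.shape[0]
    Y = np.zeros((B, hp.max_T, hp.n_mels), np.float32)
    prev_max_attentions = np.zeros((B,), np.int32)
    for j in range(hp.max_T):
        _Y, _max_attentions = sess.run(
            [g.Y, g.max_attentions],
            {g.L: L, g.mels: Y, g.prev_max_attentions: prev_max_attentions})
        Y[:, j, :] = _Y[:, j, :]  # keep only the newly predicted frame
        prev_max_attentions = _max_attentions
    return Y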
def __init__(self, num=1, mode="train"):
    '''
    Args:
      mode: Either "train" or "synthesize".
    '''
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Set flag
    training = True if mode == "train" else False

    # Graph
    # Data Feeding
    ## L: Text. (B, N), int32
    ## worlds: concatenated WORLD vocoder features. (B, 8*T/r, num_lf0+num_mgc+num_bap), float32
    if mode == "train":
        self.L, self.worlds, self.worlds_WSR, self.fnames, self.num_batch = get_batch()
        self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32)
        self.gts = tf.convert_to_tensor(guided_attention())
    else:  # Synthesize
        self.L = tf.placeholder(tf.int32, shape=(None, None))
        self.worlds = tf.placeholder(
            tf.float32,
            shape=(None, None, hp.num_bap + hp.num_lf0 + hp.num_mgc + hp.num_vuv))
        self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,))
        self.gts = tf.convert_to_tensor(guided_attention())

    if num == 1 or (not training):
        with tf.variable_scope("Text2World"):
            # Get S or decoder inputs. (B, 8*T/r, num_lf0+num_mgc+num_bap)
            self.S = tf.concat(
                (tf.zeros_like(self.worlds[:, :1, :]), self.worlds[:, :-1, :]), 1)

            # Networks
            with tf.variable_scope("TextEnc"):
                self.K, self.V = TextEnc(self.L, training=training)  # (N, Tx, e)

            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.S, training=training)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                self.R, self.alignments, self.max_attentions = Attention(
                    self.Q, self.K, self.V,
                    mononotic_attention=(not training),
                    prev_max_attentions=self.prev_max_attentions)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.R, training=training)  # (B, T/r, num_lf0+num_mgc+num_bap)
    else:  # num==2 & training. Note that during training,
           # the ground-truth WORLD features are fed.
        with tf.variable_scope("WSRN"):
            self.Z_logits, self.Z = WSRN(self.worlds, training=training)

    if not training:
        # During inference, the predicted WORLD features are fed.
        with tf.variable_scope("WSRN"):
            self.Z_logits, self.Z = WSRN(self.Y, training=training)

    with tf.variable_scope("gs"):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

    if training:
        if num == 1:  # Text2World
            # WORLD feature reconstruction loss (MSE; the L1 variant is kept commented out)
            self.loss_worlds = tf.losses.mean_squared_error(self.worlds, self.Y)
            #self.loss_worlds = tf.reduce_mean(tf.abs(self.Y - self.worlds))

            # WORLD binary divergence loss
            #self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            #    logits=self.Y_logits, labels=self.worlds))

            # guided attention loss
            self.A = tf.pad(self.alignments,
                            [(0, 0), (0, hp.max_N), (0, hp.max_T)],
                            mode="CONSTANT",
                            constant_values=-1.)[:, :hp.max_N, :hp.max_T]
            self.attention_masks = tf.to_float(tf.not_equal(self.A, -1))
            self.loss_att = tf.reduce_sum(
                tf.abs(self.A * self.gts) * self.attention_masks)
            self.mask_sum = tf.reduce_sum(self.attention_masks)
            self.loss_att /= self.mask_sum

            # total loss
            self.loss = self.loss_worlds + self.loss_att  #+ self.loss_bd1

            tf.summary.scalar('train/loss_worlds', self.loss_worlds)
            #tf.summary.scalar('train/loss_bd1', self.loss_bd1)
            tf.summary.scalar('train/loss_att', self.loss_att)
            tf.summary.image(
                'train/world_gt',
                tf.expand_dims(tf.transpose(self.worlds[:1], [0, 2, 1]), -1))
            tf.summary.image(
                'train/world_hat',
                tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1))
        else:  # WSRN
            # WORLD super-resolution reconstruction loss (MSE)
            self.loss_WSR = tf.losses.mean_squared_error(self.Z, self.worlds_WSR)

            # WORLD binary divergence loss
            #self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            #    logits=self.Z_logits, labels=self.worlds_WSR))

            # total loss
            self.loss = self.loss_WSR  #+ self.loss_bd2

            tf.summary.scalar('train/loss_world_SSRN', self.loss_WSR)
            #tf.summary.scalar('train/loss_bd2', self.loss_bd2)

        # Training Scheme
        self.lr = learning_rate_decay(hp.lr, self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        tf.summary.scalar("lr", self.lr)

        ## gradient clipping
        self.gvs = self.optimizer.compute_gradients(self.loss)
        self.clipped = []
        for grad, var in self.gvs:
            grad = tf.clip_by_value(grad, -1., 1.)
            self.clipped.append((grad, var))
        self.train_op = self.optimizer.apply_gradients(
            self.clipped, global_step=self.global_step)

        # Summary
        self.merged = tf.summary.merge_all()
def build_model(self):
    with tf.variable_scope("Text2Mel"):
        # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        # Networks
        if self.hp.text_encoder_type == 'none':
            assert self.hp.merlin_label_dir
            self.K = self.V = self.merlin_label
        elif self.hp.text_encoder_type == 'minimal_feedforward':
            assert self.hp.merlin_label_dir
            #sys.exit('Not implemented: hp.text_encoder_type=="minimal_feedforward"')
            self.K = self.V = LinearTransformLabels(
                self.hp, self.merlin_label, training=self.training, reuse=self.reuse)
        elif self.hp.text_encoder_type == 'MerlinTextEnc':
            assert self.hp.merlin_label_dir
            with tf.variable_scope("MerlinTextEnc"):
                self.K, self.V = MerlinTextEnc(
                    self.hp, self.L, self.merlin_label,
                    training=self.training,
                    speaker_codes=self.speakers,
                    reuse=self.reuse)  # (N, Tx, e)
        else:  ## default DCTTS text encoder
            with tf.variable_scope("TextEnc"):
                self.K, self.V = TextEnc(self.hp, self.L,
                                         training=self.training,
                                         speaker_codes=self.speakers,
                                         reuse=self.reuse)  # (N, Tx, e)

        with tf.variable_scope("AudioEnc"):
            if self.hp.history_type in ['fractional_position_in_phone',
                                        'absolute_position_in_phone']:
                self.Q = self.position_in_phone
            elif self.hp.history_type == 'minimal_history':
                sys.exit('Not implemented: hp.history_type=="minimal_history"')
            else:
                assert self.hp.history_type == 'DCTTS_standard'
                self.Q = AudioEnc(self.hp, self.S, training=self.training,
                                  speaker_codes=self.speakers, reuse=self.reuse)

        with tf.variable_scope("Attention"):
            # R: (B, T/r, 2d)
            # alignments: (B, N, T/r)
            # max_attentions: (B,)
            if self.hp.use_external_durations:
                self.R, self.alignments, self.max_attentions = FixedAttention(
                    self.hp, self.durations, self.Q, self.V)
            elif self.mode == 'synthesize':  # use ==, not 'is', for string comparison
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=True,
                    prev_max_attentions=self.prev_max_attentions)
            elif self.mode == 'train':
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=False,
                    prev_max_attentions=self.prev_max_attentions)
            elif self.mode == 'generate_attention':
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=False,
                    prev_max_attentions=None)

        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.hp, self.R,
                                             training=self.training,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (B, T/r, n_mels)
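# Illustrative sketch (not from the repository): hypothetical config values showing how the
# branches in build_model() above are selected. The attribute names follow the hp fields the
# code reads; the values are examples only.
#
#   hp.text_encoder_type = 'MerlinTextEnc'          # or 'none' / 'minimal_feedforward'; anything else -> default TextEnc
#   hp.merlin_label_dir = '/path/to/merlin_labels'  # required by the first three options
#   hp.history_type = 'DCTTS_standard'              # or 'fractional_position_in_phone' / 'absolute_position_in_phone'
#   hp.use_external_durations = False               # True -> FixedAttention driven by externally supplied durations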
def __init__(self, num=1, mode="train"):
    '''
    Args:
      num: Either 1 or 2. 1 for Text2Mel, 2 for SSRN.
      mode: Either "train" or "synthesize".
    '''
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Set flag
    training = True if mode == "train" else False

    # Graph
    # Data Feeding
    ## L: Text. (B, N), int32
    ## mels: Reduced melspectrogram. (B, T/r, n_mels), float32
    ## mags: Magnitude. (B, T, n_fft//2+1), float32
    if mode == "train":
        self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch()
        self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32)
        self.gts = tf.convert_to_tensor(guided_attention())
    else:  # Synthesize
        self.L = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
        self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,))

    # Training first neural net or testing
    if num == 1 or (not training):
        with tf.variable_scope("Text2Mel"):
            # Get S or decoder inputs. (B, T//r, n_mels)
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            # Networks
            with tf.variable_scope("TextEnc"):
                self.K, self.V = TextEnc(self.L, training=training)  # (N, Tx, e)

            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.S, training=training)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                self.R, self.alignments, self.max_attentions = Attention(
                    self.Q, self.K, self.V,
                    mononotic_attention=(not training),
                    prev_max_attentions=self.prev_max_attentions)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(self.R, training=training)  # (B, T/r, n_mels)
    # Training second neural net
    else:  # num==2 & training. Note that during training,
           # the ground truth melspectrogram values are fed.
        with tf.variable_scope("SSRN"):
            self.Z_logits, self.Z = SSRN(self.mels, training=training)

    if not training:
        # During inference, the predicted melspectrogram values are fed.
        with tf.variable_scope("SSRN"):
            self.Z_logits, self.Z = SSRN(self.Y, training=training)

    with tf.variable_scope("gs"):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

    if training:
        if num == 1:  # Text2Mel
            # mel L1 loss
            self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels))

            # mel binary divergence loss
            self.loss_bd1 = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.Y_logits, labels=self.mels))

            # guided attention loss
            self.A = tf.pad(self.alignments,
                            [(0, 0), (0, hp.max_N), (0, hp.max_T)],
                            mode="CONSTANT",
                            constant_values=-1.)[:, :hp.max_N, :hp.max_T]
            self.attention_masks = tf.to_float(tf.not_equal(self.A, -1))
            self.loss_att = tf.reduce_sum(
                tf.abs(self.A * self.gts) * self.attention_masks)
            self.mask_sum = tf.reduce_sum(self.attention_masks)
            self.loss_att /= self.mask_sum

            # total loss
            self.loss = self.loss_mels + self.loss_bd1 + self.loss_att

            tf.summary.scalar('train/loss_mels', self.loss_mels)
            tf.summary.scalar('train/loss_bd1', self.loss_bd1)
            tf.summary.scalar('train/loss_att', self.loss_att)
            tf.summary.image(
                'train/mel_gt',
                tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1))
            tf.summary.image(
                'train/mel_hat',
                tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1))
        else:  # SSRN
            # mag L1 loss
            self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags))

            # mag binary divergence loss
            self.loss_bd2 = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.Z_logits, labels=self.mags))

            # total loss
            self.loss = self.loss_mags + self.loss_bd2

            tf.summary.scalar('train/loss_mags', self.loss_mags)
            tf.summary.scalar('train/loss_bd2', self.loss_bd2)
            tf.summary.image(
                'train/mag_gt',
                tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1))
            tf.summary.image(
                'train/mag_hat',
                tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1))

        # Training Scheme
        self.lr = learning_rate_decay(hp.lr, self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        tf.summary.scalar("lr", self.lr)

        ## gradient clipping
        self.gvs = self.optimizer.compute_gradients(self.loss)
        self.clipped = []
        for grad, var in self.gvs:
            grad = tf.clip_by_value(grad, -1., 1.)
            self.clipped.append((grad, var))
        self.train_op = self.optimizer.apply_gradients(
            self.clipped, global_step=self.global_step)

        # Summary
        self.merged = tf.summary.merge_all()
def build_model(self):
    self.load_data_in_memory()
    self.add_data(reuse=self.reuse)

    # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
    self.S = tf.concat(
        (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

    # Build a latent representation of expressiveness if uee is set in the config file
    # (uee: unsupervised expressiveness embedding).
    try:
        if self.hp.uee != 0:
            with tf.variable_scope("Audio2Emo"):
                with tf.variable_scope("AudioEnc"):
                    self.emos = Audio2Emo(self.hp, self.S,
                                          training=self.training,
                                          speaker_codes=self.speakers,
                                          reuse=self.reuse)  # (B, T/r, d=8)
                    # Average over time to get an utterance-level embedding, then keep a
                    # time axis of length 1 so it can be broadcast later.
                    self.emo_mean = tf.reduce_mean(self.emos, 1)
                    print(self.emo_mean.get_shape())
                    self.emo_mean = tf.expand_dims(self.emo_mean, axis=1)
                    print(self.emo_mean.get_shape())
                    #pdb.set_trace()
        else:
            print('No unsupervised expressive embedding')
            self.emo_mean = None
    except AttributeError:  # hp.uee not set in the config file
        print('No unsupervised expressive embedding')
        self.emo_mean = None
        #pdb.set_trace()

    with tf.variable_scope("Text2Mel"):
        # Networks
        if self.hp.text_encoder_type == 'none':
            assert self.hp.merlin_label_dir
            self.K = self.V = self.merlin_label
        elif self.hp.text_encoder_type == 'minimal_feedforward':
            assert self.hp.merlin_label_dir
            #sys.exit('Not implemented: hp.text_encoder_type=="minimal_feedforward"')
            self.K = self.V = LinearTransformLabels(
                self.hp, self.merlin_label, training=self.training, reuse=self.reuse)
        else:  ## default DCTTS text encoder, conditioned on the expressiveness embedding
            with tf.variable_scope("TextEnc_emotional"):
                self.K, self.V = TextEnc(self.hp, self.L,
                                         training=self.training,
                                         emos=self.emo_mean,
                                         speaker_codes=self.speakers,
                                         reuse=self.reuse)  # (N, Tx, e)

        with tf.variable_scope("AudioEnc"):
            if self.hp.history_type in ['fractional_position_in_phone',
                                        'absolute_position_in_phone']:
                self.Q = self.position_in_phone
            elif self.hp.history_type == 'minimal_history':
                sys.exit('Not implemented: hp.history_type=="minimal_history"')
            else:
                assert self.hp.history_type == 'DCTTS_standard'
                self.Q = AudioEnc(self.hp, self.S, training=self.training,
                                  speaker_codes=self.speakers, reuse=self.reuse)

        with tf.variable_scope("Attention"):
            # R: (B, T/r, 2d)
            # alignments: (B, N, T/r)
            # max_attentions: (B,)
            if self.hp.use_external_durations:
                self.R, self.alignments, self.max_attentions = FixedAttention(
                    self.hp, self.durations, self.Q, self.V)
            elif self.mode == 'synthesize':  # use ==, not 'is', for string comparison
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=True,
                    prev_max_attentions=self.prev_max_attentions)
            elif self.mode == 'train':
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=False,
                    prev_max_attentions=self.prev_max_attentions)
            elif self.mode == 'generate_attention':
                self.R, self.alignments, self.max_attentions = Attention(
                    self.hp, self.Q, self.K, self.V,
                    monotonic_attention=False,
                    prev_max_attentions=None)

        with tf.variable_scope("AudioDec"):
            self.Y_logits, self.Y = AudioDec(self.hp, self.R,
                                             training=self.training,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (B, T/r, n_mels)