def compute_per_example_discriminator_losses(features):
    """Compute the discriminator losses for one batch of features.

    The generator is run on ``features['mel']`` with ``training=True``, its
    output is passed through ``pqmf.synthesis``, and the discriminator is
    applied to both the reference audio and the synthesized audio. For every
    discriminator output, the last entry is compared against an all-ones
    target (reference audio) or an all-zeros target (synthesized audio) with
    ``mse_loss``; the results are averaged over the discriminator outputs.

    Args:
        features: Mapping that provides ``'mel'`` (generator input) and
            ``'audio'`` (reference audio).

    Returns:
        A tuple ``(per_example_losses, dict_metrics_losses)`` where
        ``per_example_losses`` is the sum of the real and fake losses and
        ``dict_metrics_losses`` maps ``'real_loss'``, ``'fake_loss'`` and
        ``'dis_loss'`` to the corresponding values.

    Raises:
        ValueError: If the discriminator returns no outputs.
    """
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)
    # A new axis is inserted at position 2 before scoring the reference
    # audio (presumably a channel axis matching `y_hat` -- TODO confirm).
    y = tf.expand_dims(audios, 2)
    p = discriminator(y)
    p_hat = discriminator(y_hat)

    # Average over an explicit count rather than a loop variable that
    # happens to survive the loop.
    num_outputs = len(p)
    if num_outputs == 0:
        raise ValueError('discriminator returned no outputs to score')

    real_loss = 0.0
    fake_loss = 0.0
    for index, real_output in enumerate(p):
        real_score = real_output[-1]
        fake_score = p_hat[index][-1]
        real_loss += calculate_3d_loss(
            tf.ones_like(real_score), real_score, loss_fn=mse_loss
        )
        fake_loss += calculate_3d_loss(
            tf.zeros_like(fake_score), fake_score, loss_fn=mse_loss
        )
    real_loss /= num_outputs
    fake_loss /= num_outputs

    dis_loss = real_loss + fake_loss
    per_example_losses = dis_loss
    dict_metrics_losses = {
        'real_loss': real_loss,
        'fake_loss': fake_loss,
        'dis_loss': dis_loss,
    }
    return per_example_losses, dict_metrics_losses
def compute_per_example_generator_losses(audios, outputs):
    """Compute the generator's adversarial and feature-matching losses.

    The discriminator is applied to the generated audio and to the reference
    audio. The adversarial term compares the last entry of every
    discriminator output for the generated audio against an all-ones target
    using ``mse_loss``. The feature-matching term compares every entry except
    the last between reference and generated outputs using ``mae_loss``.

    Args:
        audios: Reference audio; a new axis is inserted at position 2 before
            it is passed to the discriminator.
        outputs: Generated audio, passed to the discriminator unchanged and
            squeezed on its last axis for the mel-spectrogram metric.

    Returns:
        A tuple ``(per_example_losses, dict_metrics_losses)``. The first item
        is the adversarial loss plus ``10 *`` the feature-matching loss. The
        dictionary reports that same combined value under both
        ``'adversarial_loss'`` and ``'gen_loss'``, plus ``'fm_loss'`` and the
        mean of ``mels_loss`` under ``'mels_spectrogram_loss'``.

    Raises:
        ValueError: If the discriminator returns no outputs, or if its last
            output has no entries besides the final score.
    """
    y_hat = outputs
    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    num_outputs = len(p_hat)
    if num_outputs == 0:
        raise ValueError('discriminator returned no outputs to score')
    # The original normaliser used the inner-loop index left over from the
    # last discriminator output, i.e. that output's feature-layer count.
    # NOTE(review): this equals the number of accumulated terms only when
    # every output has the same number of layers -- confirm.
    num_feature_layers = len(p_hat[-1]) - 1
    if num_feature_layers < 1:
        raise ValueError(
            'discriminator outputs contain no feature maps to match'
        )

    adv_loss = 0.0
    for fake_output in p_hat:
        fake_score = fake_output[-1]
        adv_loss += calculate_3d_loss(
            tf.ones_like(fake_score), fake_score, loss_fn=mse_loss
        )
    adv_loss /= num_outputs

    fm_loss = 0.0
    for index, fake_output in enumerate(p_hat):
        for layer in range(len(fake_output) - 1):
            fm_loss += calculate_3d_loss(
                p[index][layer], fake_output[layer], loss_fn=mae_loss
            )
    fm_loss /= num_outputs * num_feature_layers

    # From here on `adv_loss` holds the combined generator objective.
    adv_loss += 10 * fm_loss
    per_example_losses = adv_loss

    mel_metric = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'fm_loss': fm_loss,
        'gen_loss': adv_loss,
        'mels_spectrogram_loss': tf.reduce_mean(mel_metric),
    }
    return per_example_losses, dict_metrics_losses
def compute_per_example_discriminator_losses(audios, gen_outputs):
    """Compute the discriminator losses from reference and generated audio.

    The discriminator is applied to the reference audio (with a new axis
    inserted at position 2) and to ``gen_outputs``. For every discriminator
    output, the last entry is compared against an all-ones target (reference
    audio) or an all-zeros target (generated audio) with ``mse_loss``; the
    results are averaged over the discriminator outputs.

    Args:
        audios: Reference audio.
        gen_outputs: Generated audio, passed to the discriminator unchanged.

    Returns:
        A tuple ``(per_example_losses, dict_metrics_losses)`` where
        ``per_example_losses`` is the sum of the real and fake losses and
        ``dict_metrics_losses`` maps ``'real_loss'``, ``'fake_loss'`` and
        ``'dis_loss'`` to the corresponding values.

    Raises:
        ValueError: If the discriminator returns no outputs.
    """
    y_hat = gen_outputs
    y = tf.expand_dims(audios, 2)
    p = discriminator(y)
    p_hat = discriminator(y_hat)

    # Average over an explicit count rather than a loop variable that
    # happens to survive the loop.
    num_outputs = len(p)
    if num_outputs == 0:
        raise ValueError('discriminator returned no outputs to score')

    real_loss = 0.0
    fake_loss = 0.0
    for index, real_output in enumerate(p):
        real_score = real_output[-1]
        fake_score = p_hat[index][-1]
        real_loss += calculate_3d_loss(
            tf.ones_like(real_score), real_score, loss_fn=mse_loss
        )
        fake_loss += calculate_3d_loss(
            tf.zeros_like(fake_score), fake_score, loss_fn=mse_loss
        )
    real_loss /= num_outputs
    fake_loss /= num_outputs

    dis_loss = real_loss + fake_loss
    per_example_losses = dis_loss
    dict_metrics_losses = {
        'real_loss': real_loss,
        'fake_loss': fake_loss,
        'dis_loss': dis_loss,
    }
    return per_example_losses, dict_metrics_losses
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` building an ``autovc.Model`` and its losses.

    The model is called on the mel features with ``features['v']`` passed as
    both vector arguments. The total loss is the sum of:

    * the length-masked absolute difference between the input mels and the
      ``mel_before`` output,
    * the same for the ``mel_after`` output, and
    * the absolute difference between ``codes`` and the codes returned by
      ``model.call_second(mel_after, vectors)``.

    Each loss is exposed as a named identity op and as a scalar summary.

    Args:
        features: Mapping providing ``'v'``, ``'mel'`` and ``'mel_length'``
            (only the first column of ``'mel_length'`` is used).
        labels: Unused.
        mode: A ``tf.estimator.ModeKeys`` value; TRAIN and EVAL are
            supported.
        params: Unused.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss, plus the train
        op in TRAIN mode.

    Raises:
        ValueError: If ``mode`` is neither TRAIN nor EVAL. Previously this
            surfaced as an unbound-local error at the ``return`` statement.
    """
    vectors = features['v']
    mels = features['mel']
    mels_len = features['mel_length'][:, 0]

    model = autovc.Model(dim_neck=32, dim_pre=512, freq=32)
    _encoder_outputs, mel_before, mel_after, codes = model(
        mels, vectors, vectors
    )
    codes_ = model.call_second(mel_after, vectors)

    # Weight mask over positions below each example's length, with a
    # trailing axis added so it can be applied to the mel tensors.
    max_length = tf.cast(tf.reduce_max(mels_len), tf.int32)
    mask = tf.sequence_mask(
        lengths=mels_len, maxlen=max_length, dtype=tf.float32
    )
    mask = tf.expand_dims(mask, axis=-1)
    masked_abs_diff = partial(tf.losses.absolute_difference, weights=mask)

    mel_loss_before = calculate_3d_loss(mels, mel_before, masked_abs_diff)
    mel_loss_after = calculate_3d_loss(mels, mel_after, masked_abs_diff)
    g_loss_cd = tf.losses.absolute_difference(codes, codes_)
    loss = mel_loss_before + mel_loss_after + g_loss_cd

    named_losses = [
        ('total_loss', loss),
        ('mel_loss_before', mel_loss_before),
        ('mel_loss_after', mel_loss_after),
        ('g_loss_cd', g_loss_cd),
    ]
    # Named identity ops make the tensors addressable by name in the graph.
    for name, value in named_losses:
        tf.identity(value, name)
    for name, value in named_losses:
        tf.summary.scalar(name, value)

    global_step = tf.train.get_or_create_global_step()

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    raise ValueError(
        'model_fn supports only TRAIN and EVAL modes, got {!r}'.format(mode)
    )
def compute_per_example_generator_losses(features):
    """Compute the generator loss: STFT, adversarial and feature matching.

    The generator is run on ``features['mel']`` with ``training=True``. The
    loss combines:

    * ``0.5 * (sc_loss + mag_loss)`` from ``stft_loss``, where entries of
      either term that are ``>= 15.0`` are replaced by zero;
    * ``4.0 *`` the adversarial term, which itself is the ``mse_loss`` of
      the discriminator's last entries against all-ones targets plus
      ``10.0 *`` the ``mae_loss`` feature-matching term.

    Args:
        features: Mapping that provides ``'mel'`` (generator input) and
            ``'audio'`` (reference audio).

    Returns:
        A tuple ``(per_example_losses, dict_metrics_losses)``. The dictionary
        reports ``'adversarial_loss'`` (including the feature-matching
        contribution), ``'fm_loss'``, the mean generator loss under
        ``'gen_loss'`` and the mean of ``mels_loss`` under
        ``'mels_spectrogram_loss'``.

    Raises:
        ValueError: If the discriminator returns no outputs, or if its last
            output has no entries besides the final score.
    """
    y_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat_squeezed = tf.squeeze(y_hat, -1)

    sc_loss, mag_loss = calculate_2d_loss(audios, y_hat_squeezed, stft_loss)
    # Values at or above 15.0 are zeroed so they do not contribute
    # (presumably to ignore outliers -- TODO confirm the intent).
    sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
    mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)
    generator_loss = 0.5 * (sc_loss + mag_loss)

    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    num_outputs = len(p_hat)
    if num_outputs == 0:
        raise ValueError('discriminator returned no outputs to score')
    # The original normaliser used the inner-loop index left over from the
    # last discriminator output, i.e. that output's feature-layer count.
    # NOTE(review): this equals the number of accumulated terms only when
    # every output has the same number of layers -- confirm.
    num_feature_layers = len(p_hat[-1]) - 1
    if num_feature_layers < 1:
        raise ValueError(
            'discriminator outputs contain no feature maps to match'
        )

    adv_loss = 0.0
    for fake_output in p_hat:
        fake_score = fake_output[-1]
        adv_loss += calculate_3d_loss(
            tf.ones_like(fake_score), fake_score, loss_fn=mse_loss
        )
    adv_loss /= num_outputs

    fm_loss = 0.0
    for index, fake_output in enumerate(p_hat):
        for layer in range(len(fake_output) - 1):
            fm_loss += calculate_3d_loss(
                p[index][layer], fake_output[layer], loss_fn=mae_loss
            )
    fm_loss /= num_outputs * num_feature_layers

    adv_loss += 10.0 * fm_loss
    generator_loss += 4.0 * adv_loss
    per_example_losses = generator_loss

    mel_metric = calculate_2d_loss(
        audios, y_hat_squeezed, loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'fm_loss': fm_loss,
        'gen_loss': tf.reduce_mean(generator_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mel_metric),
    }
    return per_example_losses, dict_metrics_losses
def compute_per_example_generator_losses(features):
    """Compute the multi-band generator loss: sub-band, full-band, adversarial.

    The generator is run on ``features['mel']`` with ``training=True`` and
    its output is merged with ``pqmf.synthesis``. The loss combines:

    * ``0.5 * (sub_sc_loss + sub_mag_loss)`` from ``sub_band_stft_loss`` on
      the ``pqmf.analysis`` of the reference audio versus the raw generator
      output, averaged over groups of ``pqmf.subbands`` values;
    * ``0.5 * (full_sc_loss + full_mag_loss)`` from ``full_band_stft_loss``
      on the reference audio versus the synthesized audio;
    * ``2.5 *`` the adversarial term (``mse_loss`` of the discriminator's
      last entries against all-ones targets, averaged over its outputs).

    Args:
        features: Mapping that provides ``'mel'`` (generator input) and
            ``'audio'`` (reference audio).

    Returns:
        A tuple ``(per_example_losses, dict_metrics_losses)``; the dictionary
        holds the adversarial loss and the means of the generator loss, the
        four spectral terms and the ``mels_loss`` metric.

    Raises:
        ValueError: If the discriminator returns no outputs.
    """
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)

    # Both 3-D sub-band tensors get their last two axes swapped and are then
    # flattened to 2-D, keeping the last axis (presumably one row per
    # (example, sub-band) pair -- TODO confirm the axis layout).
    y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
    y_mb = tf.transpose(y_mb, (0, 2, 1))
    y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))
    y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
    y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))

    sub_sc_loss, sub_mag_loss = calculate_2d_loss(
        y_mb, y_mb_hat, sub_band_stft_loss
    )
    # Regroup the flattened values into rows of `pqmf.subbands` entries and
    # average each row.
    sub_sc_loss = tf.reduce_mean(
        tf.reshape(sub_sc_loss, [-1, pqmf.subbands]), -1
    )
    sub_mag_loss = tf.reduce_mean(
        tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1
    )

    y_hat_squeezed = tf.squeeze(y_hat, -1)
    full_sc_loss, full_mag_loss = calculate_2d_loss(
        audios, y_hat_squeezed, full_band_stft_loss
    )
    generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (
        full_sc_loss + full_mag_loss
    )

    p_hat = discriminator(y_hat)
    # NOTE(review): `p` is not read below; the call is kept because the
    # discriminator invocation may have side effects -- confirm whether it
    # can be dropped.
    p = discriminator(tf.expand_dims(audios, 2))

    # Average over an explicit count rather than a loop variable that
    # happens to survive the loop.
    num_outputs = len(p_hat)
    if num_outputs == 0:
        raise ValueError('discriminator returned no outputs to score')

    adv_loss = 0.0
    for fake_output in p_hat:
        fake_score = fake_output[-1]
        adv_loss += calculate_3d_loss(
            tf.ones_like(fake_score), fake_score, loss_fn=mse_loss
        )
    adv_loss /= num_outputs

    generator_loss += 2.5 * adv_loss
    per_example_losses = generator_loss

    mel_metric = calculate_2d_loss(
        audios, y_hat_squeezed, loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'gen_loss': tf.reduce_mean(generator_loss),
        'subband_spectral_convergence_loss': tf.reduce_mean(sub_sc_loss),
        'subband_log_magnitude_loss': tf.reduce_mean(sub_mag_loss),
        'fullband_spectral_convergence_loss': tf.reduce_mean(full_sc_loss),
        'fullband_log_magnitude_loss': tf.reduce_mean(full_mag_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mel_metric),
    }
    return per_example_losses, dict_metrics_losses
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` building a ``tacotron2.Model`` and its losses.

    The total loss is the sum of:

    * the stop-token loss (binary cross-entropy from logits, where the
      target is 1.0 for frame indices at or beyond each example's mel
      length and 0.0 otherwise),
    * the mean absolute error of the decoder output against the mels,
    * the mean absolute error of the post-net output against the mels, and
    * the guided-attention loss: ``|alignment * guided|`` summed over
      positions where ``guided != -1.0``, divided by the number of such
      positions, then averaged over the batch.

    Each component is exposed as a named identity op and a scalar summary.

    Args:
        features: Mapping providing ``'text_ids'``, ``'len_text_ids'``,
            ``'mel'``, ``'len_mel'`` and ``'g'`` (only the first column of
            the two length entries is used).
        labels: Unused.
        mode: A ``tf.estimator.ModeKeys`` value; TRAIN and EVAL are
            supported.
        params: Unused; hyper-parameters are read from module-level names.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss, plus the train
        op in TRAIN mode.

    Raises:
        ValueError: If ``mode`` is neither TRAIN nor EVAL. Previously this
            surfaced as an unbound-local error at the ``return`` statement.
    """
    # NOTE(review): this writes into the config dict owned by
    # `malaya_speech.config`, so the change is visible to every other user
    # of that dict. Kept as-is to preserve existing behaviour.
    tacotron2_config = malaya_speech.config.tacotron2_config
    tacotron2_config['reduction_factor'] = reduction_factor
    c = tacotron2.Config(
        vocab_size=len(MALAYA_SPEECH_SYMBOLS) + 1, **tacotron2_config
    )
    model = tacotron2.Model(c)

    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    # The same mel tensor is fed to the model and used as the loss target.
    mel_targets = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    guided = features['g']

    # NOTE(review): `training=True` is passed regardless of `mode`, so EVAL
    # also runs the model in training mode -- confirm this is intended.
    (
        decoder_output,
        post_mel_outputs,
        stop_token_predictions,
        alignment_histories,
    ) = model(
        input_ids,
        input_lengths,
        speaker_ids,
        mel_targets,
        mel_lengths,
        training=True,
    )

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    mel_loss_before = calculate_3d_loss(
        mel_targets, decoder_output, loss_fn=mae
    )
    mel_loss_after = calculate_3d_loss(
        mel_targets, post_mel_outputs, loss_fn=mae
    )

    # Stop targets: one row of frame indices per example, set to 1.0 where
    # the index is >= that example's mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    frame_indices = tf.expand_dims(
        tf.range(max_mel_length, dtype=tf.int32), 0
    )
    frame_indices = tf.tile(frame_indices, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(frame_indices, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Entries of `guided` equal to -1.0 are excluded from the attention loss.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    component_losses = [
        ('stop_token_loss', stop_token_loss),
        ('mel_loss_before', mel_loss_before),
        ('mel_loss_after', mel_loss_after),
        ('loss_att', loss_att),
    ]
    # Named identity ops make the tensors addressable by name in the graph.
    tf.identity(loss, 'loss')
    for name, value in component_losses:
        tf.identity(value, name=name)
    for name, value in component_losses:
        tf.summary.scalar(name, value)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.adamw.create_optimizer(
            loss=loss,
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            end_learning_rate=end_learning_rate,
            weight_decay_rate=weight_decay_rate,
        )
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    raise ValueError(
        'model_fn supports only TRAIN and EVAL modes, got {!r}'.format(mode)
    )
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a ``tacotron2.Model`` exposing ``decoder_logits``.

    The model is constructed directly from the text and mel inputs; its
    outputs are read from ``model.decoder_logits``. The total loss is the
    sum of:

    * the stop-token loss (binary cross-entropy from logits, where the
      target is 1.0 for frame indices at or beyond each example's mel
      length and 0.0 otherwise),
    * the mean absolute error of the decoder output against the mels,
    * the mean absolute error of the post-net output against the mels, and
    * the guided-attention loss: ``|alignment * guided|`` summed over
      positions where ``guided != -1.0``, divided by the number of such
      positions, then averaged over the batch.

    Each component is exposed as a named identity op and a scalar summary.

    Args:
        features: Mapping providing ``'text_ids'``, ``'len_text_ids'``,
            ``'mel'``, ``'len_mel'`` and ``'g'`` (only the first column of
            the two length entries is used).
        labels: Unused.
        mode: A ``tf.estimator.ModeKeys`` value; TRAIN and EVAL are
            supported.
        params: Unused; optimizer settings are read from the module-level
            ``parameters`` mapping and ``learning_rate_scheduler``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss, plus the train
        op in TRAIN mode.

    Raises:
        ValueError: If ``mode`` is neither TRAIN nor EVAL. Previously this
            surfaced as an unbound-local error at the ``return`` statement.
    """
    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    guided = features['g']

    model = tacotron2.Model(
        [input_ids, input_lengths],
        [mel_outputs, mel_lengths],
        len(MALAYA_SPEECH_SYMBOLS),
    )
    # Only the first three of the six entries in `outputs` are used here.
    decoder_output, post_mel_outputs, alignment_histories, _, _, _ = (
        model.decoder_logits['outputs']
    )
    # Keep index 0 of the last axis so the predictions become 2-D.
    stop_token_predictions = model.decoder_logits['stop_token_prediction']
    stop_token_predictions = stop_token_predictions[:, :, 0]

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    mel_loss_before = calculate_3d_loss(
        mel_outputs, decoder_output, loss_fn=mae
    )
    mel_loss_after = calculate_3d_loss(
        mel_outputs, post_mel_outputs, loss_fn=mae
    )

    # Stop targets: one row of frame indices per example, set to 1.0 where
    # the index is >= that example's mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    frame_indices = tf.expand_dims(
        tf.range(max_mel_length, dtype=tf.int32), 0
    )
    frame_indices = tf.tile(frame_indices, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(frame_indices, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Entries of `guided` equal to -1.0 are excluded from the attention loss.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    component_losses = [
        ('stop_token_loss', stop_token_loss),
        ('mel_loss_before', mel_loss_before),
        ('mel_loss_after', mel_loss_after),
        ('loss_att', loss_att),
    ]
    # Named identity ops make the tensors addressable by name in the graph.
    tf.identity(loss, 'loss')
    for name, value in component_losses:
        tf.identity(value, name=name)
    for name, value in component_losses:
        tf.summary.scalar(name, value)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.optimize_loss(
            loss,
            tf.train.AdamOptimizer,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries=[
                'learning_rate',
                'variables',
                'gradients',
                'larc_summaries',
                'variable_norm',
                'gradient_norm',
                'global_gradient_norm',
            ],
            larc_params=parameters.get('larc_params', None),
            loss_scaling=parameters.get('loss_scaling', 1.0),
            loss_scaling_params=parameters.get('loss_scaling_params', None),
            clip_gradients=parameters.get('max_grad_norm', None),
        )
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    raise ValueError(
        'model_fn supports only TRAIN and EVAL modes, got {!r}'.format(mode)
    )