Example #1
 def call(self, x, target_durations, training, durations_scalar=1.):
     padding_mask = create_encoder_padding_mask(x)
     x = self.encoder_prenet(x)
     x, encoder_attention = self.encoder(x,
                                         training=training,
                                         padding_mask=padding_mask,
                                         drop_n_heads=self.drop_n_heads)
     # predict per-phoneme durations and zero them out at padded positions
     durations = self.dur_pred(x, training=training) * durations_scalar
     durations = (1. - tf.reshape(padding_mask, tf.shape(durations))) * durations
     # teacher forcing: expand with ground-truth durations when provided
     if target_durations is not None:
         mels = self.expand(x, target_durations)
     else:
         mels = self.expand(x, durations)
     expanded_mask = create_mel_padding_mask(mels)
     mels = self.decoder_prenet(mels)
     mels, decoder_attention = self.decoder(mels,
                                            training=training,
                                            padding_mask=expanded_mask,
                                            drop_n_heads=self.drop_n_heads,
                                            reduction_factor=1)
     mels = self.out(mels)
     mels = self.decoder_postnet(mels, training=training)
     model_out = {
         'mel': mels,
         'duration': durations,
         'expanded_mask': expanded_mask,
         'encoder_attention': encoder_attention,
         'decoder_attention': decoder_attention
     }
     return model_out
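Example #1 upsamples encoder frames by their (predicted or ground-truth) durations via self.expand, a FastSpeech-style length regulator. A minimal sketch of that expansion for a single utterance, assuming integer durations (length_regulate and its shapes are illustrative, not the repository's API):

import tensorflow as tf

def length_regulate(x, durations):
    # x: [time, channels] encoder output for one utterance
    # durations: [time] non-negative integer repeat counts
    # Repeat each encoder frame durations[t] times along the time axis.
    return tf.repeat(x, repeats=durations, axis=0)

frames = tf.reshape(tf.range(6, dtype=tf.float32), [3, 2])
print(length_regulate(frames, tf.constant([2, 0, 3])).shape)  # (5, 2)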
Example #2
 def _call_encoder(self, inputs, training):
     padding_mask = create_encoder_padding_mask(inputs)
     enc_input = inputs
     enc_output, attn_weights = self.encoder(enc_input,
                                             training=training,
                                             mask=padding_mask)
     return enc_output, padding_mask, attn_weights
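Every example builds its masks with create_encoder_padding_mask before running attention. A plausible minimal version, assuming id 0 marks padding and the [batch, 1, 1, time] shape implied by the tf.squeeze(..., axis=(1, 2)) in Example #5 (an assumption, not the repository's code):

import tensorflow as tf

def create_encoder_padding_mask(seq):
    # seq: [batch, time] integer phoneme ids, with 0 assumed to be padding
    # Returns 1.0 at padded positions, 0.0 elsewhere, shaped to broadcast
    # against attention logits: [batch, 1, 1, time].
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]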
Example #3
 def _call_encoder(self, inputs, training):
     padding_mask = create_encoder_padding_mask(inputs)
     enc_input = self.encoder_prenet(inputs)
     enc_output, attn_weights = self.encoder(enc_input,
                                             training=training,
                                             padding_mask=padding_mask,
                                             drop_n_heads=self.drop_n_heads)
     return enc_output, padding_mask, attn_weights
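Examples #1 and #3 pass a drop_n_heads argument into the encoder, which appears to be a regularizer that disables some attention heads at random while training. A hedged sketch of that idea (the function, shapes, and mechanism are guesses, not the repository's implementation):

import tensorflow as tf

def drop_attention_heads(attn, n_drop, training):
    # attn: [batch, heads, time, depth]; zero out n_drop randomly chosen
    # heads during training only.
    if not training or n_drop == 0:
        return attn
    n_heads = tf.shape(attn)[1]
    kept = tf.random.shuffle(tf.range(n_heads))[n_drop:]
    keep_mask = tf.reduce_sum(tf.one_hot(kept, n_heads), axis=0)
    return attn * keep_mask[tf.newaxis, :, tf.newaxis, tf.newaxis]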
Example #4
 def _call_encoder(self, inputs, xvectors, training):
     # add speaker x-vectors to the encoder output
     padding_mask = create_encoder_padding_mask(inputs)
     enc_input = self.encoder_prenet(inputs)
     enc_output, attn_weights = self.encoder(enc_input,
                                             training=training,
                                             padding_mask=padding_mask,
                                             drop_n_heads=self.drop_n_heads)
     x_vec = self.enc_speaker_mod(xvectors)
     # concatenate along the time axis (axis=1), appending speaker info as extra steps
     enc_output = tf.keras.layers.concatenate([enc_output, x_vec], axis=1)
     return enc_output, padding_mask, attn_weights
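Example #4 concatenates along axis=1, the time axis, so the projected x-vector is appended as extra encoder timesteps rather than added to every frame. A small shape check of that choice, assuming enc_speaker_mod yields one [batch, 1, channels] vector:

import tensorflow as tf

enc_output = tf.zeros([2, 50, 256])  # [batch, time, channels]
x_vec = tf.zeros([2, 1, 256])        # assumed one projected x-vector per utterance
out = tf.keras.layers.concatenate([enc_output, x_vec], axis=1)
print(out.shape)  # (2, 51, 256): one extra timestep carries the speaker info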
Example #5
    def call(self,
             x,
             target_durations,
             spk_emb,
             training,
             durations_scalar=1.,
             max_durations_mask=None,
             min_durations_mask=None):
        encoder_padding_mask = create_encoder_padding_mask(x)
        x = self.encoder_prenet(x)
        x, encoder_attention = self.encoder(x,
                                            training=training,
                                            padding_mask=encoder_padding_mask,
                                            drop_n_heads=0)
        # attention mask is [batch, 1, 1, time]; make a [batch, time, 1] mask with 1 = real frame
        padding_mask = 1. - tf.squeeze(encoder_padding_mask, axis=(1, 2))[:, :, None]
        spk_emb = tf.math.softplus(self.speaker_fc(spk_emb))
        spk_emb = tf.expand_dims(spk_emb, 1)
        x = x + spk_emb  # add speaker embedding, broadcast across all timesteps

        durations = self.dur_pred(x, training=training, mask=padding_mask)

        # teacher forcing: use ground-truth durations when provided
        if target_durations is not None:
            use_durations = target_durations
        else:
            use_durations = durations * durations_scalar
        if max_durations_mask is not None:
            use_durations = tf.math.minimum(
                use_durations, tf.expand_dims(max_durations_mask, -1))
        if min_durations_mask is not None:
            use_durations = tf.math.maximum(
                use_durations, tf.expand_dims(min_durations_mask, -1))
        mels = self.expand(x, use_durations)
        expanded_mask = create_mel_padding_mask(mels)
        mels, decoder_attention = self.decoder(mels,
                                               training=training,
                                               padding_mask=expanded_mask,
                                               drop_n_heads=0)
        mels = self.out(mels)
        model_out = {
            'mel': mels,
            'duration': durations,
            'expanded_mask': expanded_mask,
            'encoder_attention': encoder_attention,
            'decoder_attention': decoder_attention
        }
        return model_out
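Example #5 clamps the durations used for expansion between optional per-phoneme bounds with tf.math.minimum and tf.math.maximum. A toy check of that clamping logic (the numbers are made up):

import tensorflow as tf

durations = tf.constant([[0.5, 4.0, 2.0]])[:, :, None]  # [batch, time, 1]
max_mask = tf.constant([[3.0, 3.0, 3.0]])               # [batch, time]
min_mask = tf.constant([[1.0, 1.0, 1.0]])

clamped = tf.math.minimum(durations, tf.expand_dims(max_mask, -1))
clamped = tf.math.maximum(clamped, tf.expand_dims(min_mask, -1))
print(tf.squeeze(clamped).numpy())  # [1. 3. 2.]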
Example #6
import numpy as np


def get_durations_from_alignment(batch_alignments,
                                 mels,
                                 phonemes,
                                 weighted=False,
                                 binary=False,
                                 fill_gaps=False,
                                 fix_jumps=False,
                                 fill_mode='max'):
    """
    
    :param batch_alignments: attention weights from autoregressive model.
    :param mels: mel spectrograms.
    :param phonemes: phoneme sequence.
    :param weighted: if True use weighted average of durations of heads, best head if False.
    :param binary: if True take maximum attention peak, sum if False.
    :param fill_gaps: if True fills zeros durations with ones.
    :param fix_jumps: if True, tries to scan alingments for attention jumps and interpolate.
    :param fill_mode: used only if fill_gaps is True. Is either 'max' or 'next'. Defines where to take the duration
        needed to fill the gap. Next takes it from the next non-zeros duration value, max from the sequence maximum.
    :return:
    """
    assert (binary is True) or (
        fix_jumps is False), 'Cannot fix jumps in non-binary attention.'
    mel_pad_mask = create_mel_padding_mask(mels)
    phon_pad_mask = create_encoder_padding_mask(phonemes)
    durations = []
    # remove start end token or vector
    unpad_mels = []
    unpad_phonemes = []
    final_alignment = []
    for i, al in enumerate(batch_alignments):
        mel_len = int(mel_pad_mask[i].shape[-1] - np.sum(mel_pad_mask[i]))
        phon_len = int(phon_pad_mask[i].shape[-1] - np.sum(phon_pad_mask[i]))
        unpad_alignments = al[:, 1:mel_len - 1, 1:phon_len - 1]  # first dim is heads
        unpad_mels.append(mels[i, 1:mel_len - 1, :])
        unpad_phonemes.append(phonemes[i, 1:phon_len - 1])
        alignments_weights = weight_mask(unpad_alignments[0])
        heads_scores = []
        scored_attention = []
        for attention_weights in unpad_alignments:
            score = np.sum(alignments_weights * attention_weights)
            scored_attention.append(attention_weights / score)
            heads_scores.append(score)

        if weighted:
            ref_attention_weights = np.sum(scored_attention, axis=0)
        else:
            best_head = np.argmin(heads_scores)
            ref_attention_weights = unpad_alignments[best_head]

        if binary:  # pick max attention for each mel time-step
            binary_attn, binary_score = binary_attention(ref_attention_weights)
            if fix_jumps:
                binary_attn = fix_attention_jumps(
                    binary_attn=binary_attn,
                    alignments_weights=alignments_weights,
                    binary_score=binary_score)
            integer_durations = binary_attn.sum(axis=0)

        else:  # takes actual attention values and normalizes to mel_len
            attention_durations = np.sum(ref_attention_weights, axis=0)
            normalized_durations = attention_durations * (
                (mel_len - 2) / np.sum(attention_durations))
            integer_durations = np.round(normalized_durations)
            tot_duration = np.sum(integer_durations)
            duration_diff = tot_duration - (mel_len - 2)
            while duration_diff != 0:
                rounding_diff = integer_durations - normalized_durations
                if duration_diff > 0:  # duration is too long -> reduce highest (positive) rounding difference
                    max_error_idx = np.argmax(rounding_diff)
                    integer_durations[max_error_idx] -= 1
                elif duration_diff < 0:  # duration is too short -> increase lowest (negative) rounding difference
                    min_error_idx = np.argmin(rounding_diff)
                    integer_durations[min_error_idx] += 1
                tot_duration = np.sum(integer_durations)
                duration_diff = tot_duration - (mel_len - 2)

        if fill_gaps:  # fill zeros durations
            integer_durations = fill_zeros(integer_durations,
                                           take_from=fill_mode)

        assert np.sum(integer_durations) == mel_len - 2, \
            f'{np.sum(integer_durations)} vs {mel_len - 2}'
        new_alignment = duration_to_alignment_matrix(
            integer_durations.astype(int))
        best_head = np.argmin(heads_scores)
        best_attention = unpad_alignments[best_head]
        final_alignment.append(best_attention.T + new_alignment)
        durations.append(integer_durations)
    return durations, unpad_mels, unpad_phonemes, final_alignment
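In the non-binary branch, Example #6 rounds the normalized durations and then repairs the rounding drift one frame at a time until the total equals mel_len - 2. A numpy walk-through of that correction with made-up values:

import numpy as np

normalized = np.array([1.4, 2.3, 3.3])  # target total: 7 frames
integer = np.round(normalized)          # [1. 2. 3.], sums to 6
diff = integer.sum() - 7                # -1: one frame short
while diff != 0:
    rounding_diff = integer - normalized
    if diff > 0:
        integer[np.argmax(rounding_diff)] -= 1  # shrink the most over-rounded
    else:
        integer[np.argmin(rounding_diff)] += 1  # grow the most under-rounded
    diff = integer.sum() - 7
print(integer)  # [2. 2. 3.]: the most under-rounded phoneme gains a frame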