import torch
from torch.nn.utils.rnn import pad_sequence
from scipy.io.wavfile import write

# Assumed module layout: text_to_sequence / sequence_to_text come from the repo's
# Tacotron2-style text-processing module; init_waveglow / init_hifigan are the
# vocoder loaders defined elsewhere in this codebase.
from text import text_to_sequence, sequence_to_text


def infer(PROD, user_settings, text, output, fastpitch, vocoder, speaker_i, pace=1.0, pitch_data=None, logger=None, old_sequence=None):
    print(f'Inferring: "{text}" ({len(text)})')

    sigma_infer = 0.9
    stft_hop_length = 256
    sampling_rate = 22050
    denoising_strength = 0.01

    # Clean and tokenize the input text, then batch it (batch size 1)
    sequence = text_to_sequence(text, ['english_cleaners'])
    cleaned_text = sequence_to_text(sequence)
    text = torch.LongTensor(sequence)
    text = pad_sequence([text], batch_first=True).to(fastpitch.device)

    with torch.no_grad():
        # The previous sequence (if any) lets the model re-use pitch/duration
        # predictions for the unchanged parts of an edited line
        if old_sequence is not None:
            old_sequence = text_to_sequence(old_sequence, ['english_cleaners'])
            old_sequence = torch.LongTensor(old_sequence)
            old_sequence = pad_sequence([old_sequence], batch_first=True).to(fastpitch.device)

        mel, mel_lens, dur_pred, pitch_pred = fastpitch.infer_advanced(text, speaker_i=speaker_i, pace=pace, pitch_data=pitch_data, old_sequence=old_sequence)

        if "waveglow" in vocoder:
            init_waveglow(user_settings["use_gpu"], fastpitch, vocoder, logger=logger)

            audios = fastpitch.waveglow.infer(mel, sigma=sigma_infer)
            audios = fastpitch.denoiser(audios.float(), strength=denoising_strength).squeeze(1)

            for i, audio in enumerate(audios):
                # Trim padding, peak-normalize, and write out the float waveform
                audio = audio[:mel_lens[i].item() * stft_hop_length]
                audio = audio / torch.max(torch.abs(audio))
                write(output, sampling_rate, audio.cpu().numpy())
            del audios
        else:
            init_hifigan(PROD, fastpitch, user_settings["use_gpu"], vocoder)
            y_g_hat = fastpitch.hifi_gan(mel)
            audio = y_g_hat.squeeze()
            # Scale [-1, 1] float output to the int16 sample range
            audio = audio * 32768.0
            # audio = audio * 2.3026 # This brings it to the same volume, but makes it clip in places
            audio = audio.cpu().numpy().astype('int16')
            write(output, sampling_rate, audio)

    del mel, mel_lens

    # Return the per-symbol pitch and duration predictions as two CSV lines,
    # followed by the cleaned text, for the caller to parse
    [pitch, durations] = [pitch_pred.cpu().detach().numpy()[0], dur_pred.cpu().detach().numpy()[0]]
    pitch_durations_text = ",".join([str(v) for v in pitch]) + "\n" + ",".join([str(v) for v in durations])

    del pitch_pred, dur_pred, text
    return pitch_durations_text + "\n" + cleaned_text
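
# --- Example usage (illustrative sketch, not part of the original module) ---
# Assumes a FastPitch wrapper ("model" below, hypothetical) has already been
# loaded with the .device attribute and .infer_advanced() method that infer()
# expects; any vocoder name containing "waveglow" selects the WaveGlow path,
# everything else falls through to HiFi-GAN.
#
#   settings = {"use_gpu": True}
#   meta = infer(PROD=True, user_settings=settings, text="Hello there.",
#                output="out/line_0001.wav", fastpitch=model,
#                vocoder="waveglow", speaker_i=0, pace=1.0)
#   pitch_csv, durations_csv, cleaned = meta.split("\n")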

def infer(user_settings, text, output, fastpitch, hifi_gan, speaker_i, pace=1.0, pitch_data=None):
    # Simpler variant: the vocoder is chosen by the hifi_gan flag rather than by a
    # vocoder name string, and there is no old_sequence re-use. NOTE: it shares the
    # name infer, so it would shadow the definition above if kept in the same module.
    print(f'Inferring: "{text}" ({len(text)})')

    sigma_infer = 0.9
    stft_hop_length = 256
    sampling_rate = 22050
    denoising_strength = 0.01

    sequence = text_to_sequence(text, ['english_cleaners'])
    cleaned_text = sequence_to_text(sequence)
    text = torch.LongTensor(sequence)
    text = pad_sequence([text], batch_first=True).to(fastpitch.device)

    with torch.no_grad():
        mel, mel_lens, dur_pred, pitch_pred = fastpitch.infer_advanced(text, speaker_i=speaker_i, pace=pace, pitch_data=pitch_data)

        if hifi_gan:
            y_g_hat = fastpitch.hifi_gan(mel)
            audio = y_g_hat.squeeze()
            audio = audio * 32768.0
            audio = audio.cpu().numpy().astype('int16')
            write(output, sampling_rate, audio)
        else:
            init_waveglow(user_settings["use_gpu"], fastpitch)
            audios = fastpitch.waveglow.infer(mel, sigma=sigma_infer)
            audios = fastpitch.denoiser(audios.float(), strength=denoising_strength).squeeze(1)
            for i, audio in enumerate(audios):
                audio = audio[:mel_lens[i].item() * stft_hop_length]
                audio = audio / torch.max(torch.abs(audio))
                write(output, sampling_rate, audio.cpu().numpy())

    [pitch, durations] = [pitch_pred.cpu().detach().numpy()[0], dur_pred.cpu().detach().numpy()[0]]
    pitch_durations_text = ",".join([str(v) for v in pitch]) + "\n" + ",".join([str(v) for v in durations])
    return pitch_durations_text + "\n" + cleaned_text

def infer_batch(PROD, user_settings, linesBatch, fastpitch, vocoder, speaker_i, logger=None, old_sequence=None):
    # old_sequence is accepted for signature parity with infer(), but is not used for batches
    print(f'Inferring batch of {len(linesBatch)} lines')

    sigma_infer = 0.9
    stft_hop_length = 256
    sampling_rate = 22050
    denoising_strength = 0.01

    # Tokenize every line and pad them into a single batch tensor
    text_sequences = []
    for record in linesBatch:
        text = record[0]
        sequence = text_to_sequence(text, ['english_cleaners'])
        text = torch.LongTensor(sequence)
        text_sequences.append(text)
    text_sequences = pad_sequence(text_sequences, batch_first=True).to(fastpitch.device)

    with torch.no_grad():
        # Per-line pace multipliers, shaped (batch, 1)
        pace = torch.tensor([record[3] for record in linesBatch]).unsqueeze(1).to(fastpitch.device)
        pitch_data = None  # Maybe in the future

        mel, mel_lens, dur_pred, pitch_pred = fastpitch.infer_advanced(logger, text_sequences, speaker_i=speaker_i, pace=pace, pitch_data=pitch_data, old_sequence=None)

        if "waveglow" in vocoder:
            init_waveglow(user_settings["use_gpu"], fastpitch, vocoder, logger=logger)

            audios = fastpitch.waveglow.infer(mel, sigma=sigma_infer)
            audios = fastpitch.denoiser(audios.float(), strength=denoising_strength).squeeze(1)

            for i, audio in enumerate(audios):
                audio = audio[:mel_lens[i].item() * stft_hop_length]
                audio = audio / torch.max(torch.abs(audio))
                output = linesBatch[i][4]
                write(output, sampling_rate, audio.cpu().numpy())
            del audios
        else:
            init_hifigan(PROD, fastpitch, user_settings["use_gpu"], vocoder)
            y_g_hat = fastpitch.hifi_gan(mel)
            # Flatten (batch, 1, samples) to (batch, samples)
            audios = y_g_hat.view((y_g_hat.shape[0], y_g_hat.shape[2]))
            # audio = audio * 2.3026 # This brings it to the same volume, but makes it clip in places

            for i, audio in enumerate(audios):
                audio = audio[:mel_lens[i].item() * stft_hop_length]
                audio = audio.cpu().numpy()
                audio = audio * 32768.0
                audio = audio.astype('int16')
                output = linesBatch[i][4]
                write(output, sampling_rate, audio)

    del mel, mel_lens
    return ""
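
# --- Example batch usage (illustrative sketch, not part of the original module) ---
# infer_batch() reads each record positionally: record[0] is the input text,
# record[3] the per-line pace multiplier, and record[4] the output .wav path
# (the remaining indices are unused here, so None placeholders suffice).
# "model" is an assumed, already-loaded FastPitch wrapper, as above.
#
#   lines_batch = [
#       ["First line of dialogue.",     None, None, 1.0,  "out/0001.wav"],
#       ["Second line, spoken faster.", None, None, 1.15, "out/0002.wav"],
#   ]
#   infer_batch(PROD=True, user_settings={"use_gpu": True}, linesBatch=lines_batch,
#               fastpitch=model, vocoder="hifigan", speaker_i=0)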