def generate_cloned_samples(model, cloning_text_path=None, no_speakers=108, fast=True, p=0):
    """Synthesize the cloning utterances for every speaker and pickle the mels.

    For each of ``no_speakers`` speaker ids, runs ``_tts`` on each cloning text
    and collects the resulting mel spectrograms. The nested list is pickled to
    ``./Cloning_Audio/speakers_cloned_voices_mel.p`` and also returned.

    Args:
        model: synthesis model passed straight through to ``_tts``.
        cloning_text_path: path to a file of cloning texts; currently only used
            to fill in a default — the texts themselves are hard-coded below.
            TODO(review): actually read the file once the text list is finalized.
        no_speakers: number of speaker ids to iterate over.
        fast: forwarded to ``_tts``.
        p: forwarded to ``_tts``.

    Returns:
        list: ``all_speakers[speaker_id][cloned_audio_number]`` -> mel spectrogram.
    """
    cloning_texts = ["this is the first", "this is the second"]
    if cloning_text_path is None:
        cloning_text_path = "./Cloning_Audio/cloning_text.txt"

    all_speakers = []
    for speaker_id in range(no_speakers):
        speaker_cloning_mel = []
        for text in cloning_texts:
            # Only the mel output is kept; waveform/alignment/spectrogram are discarded.
            _wav, _align, _spec, mel = _tts(model, text, p, speaker_id, fast)
            speaker_cloning_mel.append(mel)
        all_speakers.append(speaker_cloning_mel)

    # Persist all cloned-voice mels for later embedding extraction.
    with open("./Cloning_Audio/speakers_cloned_voices_mel.p", "wb") as fp:
        pickle.dump(all_speakers, fp)

    print("Shape of all speakers:", np.array(all_speakers).shape)
    return all_speakers
def tts(model, text, p=0, speaker_id=0, fast=True, figures=True):
    """Synthesize *text* with *model* and play the audio inline (notebook helper).

    Optionally renders the attention alignment and spectrogram via
    ``visualize`` before displaying the waveform with IPython audio widgets.
    """
    from dv3.synthesis import tts as _tts

    wav, align, spec, _mel = _tts(model, text, p, speaker_id, fast)
    if figures:
        visualize(align, spec)
    IPython.display.display(Audio(wav, rate=fs))