def synthesis_post():
    """Render the synthesis page, running a synthesis job on form submission.

    Redirects to ``/synthesis-setup`` when no model is loaded. A GET request
    renders the empty page. Any other method reads ``text`` from the submitted
    form, creates a fresh results folder, calls ``synthesize`` to write the
    graph and audio files there, and renders the page with the outputs.
    """
    global model, waveglow_model

    # Guard: nothing can be synthesised until a model has been loaded.
    if not model:
        return redirect("/synthesis-setup")

    if request.method == "GET":
        return render_template("synthesis.html")

    text = request.form["text"]

    # Each request gets its own results folder named by get_suffix().
    suffix = get_suffix()
    output_dir = os.path.join(paths["results"], suffix)
    os.makedirs(output_dir)

    graph_path = os.path.join(output_dir, GRAPH_FILE)
    audio_path = os.path.join(output_dir, RESULTS_FILE)

    synthesize(model, waveglow_model, text, inflect_engine, graph_path, audio_path)

    # NOTE(review): the keyword is spelt "alertnative_words" in the original;
    # presumably the template reads that exact name - confirm before renaming.
    context = {
        "text": text.strip(),
        "alertnative_words": get_alternative_word_suggestions(audio_path, text),
        # Forward slashes so the paths are usable as URLs on Windows too.
        "graph": graph_path.replace("\\", "/"),
        "audio": audio_path.replace("\\", "/"),
    }
    return render_template("synthesis.html", **context)
def test_synthesize():
    """Check that synthesis with the real checkpoints writes usable output.

    Loads the two checkpoint files from ``files/``, synthesises a fixed
    sentence, and asserts that both the graph and the audio file exist and
    that the transcription of the audio is similar to the input text.
    The generated files are always removed, even when an assertion fails,
    so a stale file cannot satisfy the existence checks of a later run.
    """
    model_path = os.path.join("files", "tacotron2_statedict.pt")
    waveglow_path = os.path.join("files", "waveglow_256channels_universal_v5.pt")
    graph_path = "graph.png"
    audio_path = "synthesized_audio.wav"

    model = load_model(model_path)
    assert model
    waveglow = load_waveglow_model(waveglow_path)
    assert waveglow

    text = "hello everybody my name is david attenborough"
    inflect_engine = inflect.engine()

    try:
        synthesize(model, waveglow, text, inflect_engine, graph=graph_path, audio=audio_path)

        # Verify the files exist first so that a missing file is reported as
        # such rather than as an error from inside transcribe().
        assert os.path.isfile(graph_path)
        assert os.path.isfile(audio_path)
        assert text_similarity(text, transcribe(audio_path)) > 0.5
    finally:
        # Clean up whatever was produced, regardless of the test outcome.
        for path in (graph_path, audio_path):
            if os.path.isfile(path):
                os.remove(path)
def synthesis_post():
    """Render the synthesis page, running a synthesis job on form submission.

    Redirects to ``/synthesis-setup`` unless a model, a vocoder and the symbol
    set are all loaded. A GET request renders the empty page. Any other method
    reads the form fields ``text`` (one or more values), ``text_method``,
    ``silence`` and ``max_decoder_steps``, creates a results folder under a
    ``YYYY-MM`` parent directory, calls ``synthesize`` and renders the page
    with the generated graph/audio paths and the submitted settings.

    All form fields are read and converted before any directory is created,
    so a missing or malformed value does not leave an empty results folder.
    """
    global model, vocoder, symbols

    if not model or not vocoder or not symbols:
        return redirect("/synthesis-setup")

    if request.method == "GET":
        return render_template("synthesis.html")

    # A single submitted value is passed on as a plain string; several values
    # stay a list.
    text = request.form.getlist("text")
    if len(text) == 1:
        text = text[0]
    method = request.form["text_method"]
    split_text = method == "paragraph"

    # Parse the numeric options up front, before touching the filesystem.
    silence = float(request.form["silence"])
    max_decoder_steps = int(request.form["max_decoder_steps"])

    parent_folder = os.path.join(paths["results"], datetime.now().strftime("%Y-%m"))
    os.makedirs(parent_folder, exist_ok=True)

    # Folder name: unique suffix plus a sanitised, truncated copy of the
    # first line of text.
    first_line = text[0] if isinstance(text, list) else text
    label = re.sub(r"[^0-9a-zA-Z _]+", "", first_line.replace(" ", "_"))[:20]
    results_folder = os.path.join(parent_folder, get_suffix() + "-" + label)
    os.makedirs(results_folder)

    graph_path = os.path.join(results_folder, GRAPH_FILE)
    audio_path = os.path.join(results_folder, RESULTS_FILE)
    # Relative, forward-slash paths for use in the rendered page.
    graph_web_path = os.path.relpath(graph_path).replace("\\", "/")
    audio_web_path = os.path.relpath(audio_path).replace("\\", "/")

    synthesize(
        model,
        text,
        symbols,
        graph_path,
        audio_path,
        vocoder,
        silence,
        max_decoder_steps=max_decoder_steps,
        split_text=split_text,
    )

    return render_template(
        "synthesis.html",
        text=text,
        method=method,
        graph=graph_web_path,
        audio=audio_web_path,
        silence=silence,
        max_decoder_steps=max_decoder_steps,
    )
def test_hifigan_synthesis():
    """Check that synthesis through the HiFi-GAN vocoder yields intelligible audio.

    Builds a ``Hifigan`` vocoder from the sample checkpoint and config in
    ``test_samples/``, synthesises a short phrase with a fake model and no
    graph output, and asserts that the audio file exists and that its
    transcription scores above ``MIN_SYNTHESIS_SCORE`` against the input text.
    The audio file is always removed, even when an assertion fails.
    """
    hifigan_model_path = os.path.join("test_samples", "hifigan.pt")
    hifigan_config_path = os.path.join("test_samples", "config.json")
    audio_path = "synthesized_audio.wav"

    transcription_model = Silero()
    hifigan = Hifigan(hifigan_model_path, hifigan_config_path)
    text = "the monkeys live"

    try:
        synthesize(
            model=FakeModelForSynthesis(),
            text=text,
            graph_path=None,
            audio_path=audio_path,
            vocoder=hifigan,
        )

        assert os.path.isfile(audio_path)
        assert similarity(text, transcription_model.transcribe(audio_path)) > MIN_SYNTHESIS_SCORE
    finally:
        # Do not leave the artefact behind when the test fails part-way.
        if os.path.isfile(audio_path):
            os.remove(audio_path)
def test_synthesize():
    """Exercise ``synthesize`` with fake model/vocoder in three input modes.

    Covers a single string, a list of lines with silence padding, and a single
    string with ``split_text=True``. Each scenario asserts that the expected
    output files exist and that the audio has the expected duration.
    Outputs are removed after every scenario, even when an assertion fails,
    so one scenario's files cannot satisfy the checks of the next scenario
    or of a later run.
    """
    graph_path = "graph.png"
    audio_path = "synthesized_audio.wav"
    model = FakeModel()
    vocoder = FakeVocoder()

    def remove_outputs():
        # Delete whichever output files currently exist.
        for path in (graph_path, audio_path):
            if os.path.isfile(path):
                os.remove(path)

    # Single line
    text = "hello everybody my name is david attenborough"
    try:
        synthesize(
            model=model,
            text=text,
            graph_path=graph_path,
            audio_path=audio_path,
            vocoder=vocoder,
            sample_rate=22050,
        )

        assert os.path.isfile(graph_path)
        assert os.path.isfile(audio_path)
        assert librosa.get_duration(filename=audio_path) == 1
    finally:
        remove_outputs()

    # Multi line
    text = [
        "the monkeys live in the jungle with their families.",
        "however, i prefer to live on the beach and enjoy the sun.",
    ]
    try:
        synthesize(
            model=model,
            text=text,
            graph_path=graph_path,
            audio_path=audio_path,
            vocoder=vocoder,
            silence_padding=0.5,
            sample_rate=22050,
        )

        assert os.path.isfile(graph_path)
        assert os.path.isfile(audio_path)
        assert librosa.get_duration(filename=audio_path) == 2.5
    finally:
        remove_outputs()

    # Split text
    text = (
        "the monkeys live in the jungle with their families. however, i prefer to live on the beach and enjoy the sun."
    )
    try:
        synthesize(
            model=model,
            text=text,
            graph_path=None,
            audio_path=audio_path,
            vocoder=vocoder,
            silence_padding=0.5,
            sample_rate=22050,
            split_text=True,
        )

        assert os.path.isfile(audio_path)
        assert librosa.get_duration(filename=audio_path) == 2.5
    finally:
        remove_outputs()