def generate_audio(answer):
    # Synthesize each non-empty sentence and convert the predicted
    # spectrograms back into waveforms.
    sentences = [answer]
    print(sentences)
    spectrograms = [
        synthesize(model_taco, "|" + s + ACCENT) for s in sentences if len(s) > 0
    ]
    return [
        audio.inverse_spectrogram(_s, not hp.predict_linear) for _s in spectrograms
    ]
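# A usage sketch (an assumption, not part of the original file): each waveform
# returned by generate_audio can be written to disk with audio.save, the same
# helper the synthesis script below uses. The output file names are hypothetical.

waveforms = generate_audio("Hello there.")
for i, w in enumerate(waveforms):
    audio.save(w, f"answer_{i}.wav")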
def evaluation(eval_step, losses, mcd, source_len, target_len, source, target,
               prediction_forced, prediction, stop_prediction, stop_target,
               alignment, classifier):
    """Log evaluation results.

    Arguments:
        eval_step -- number of the current evaluation step (i.e. epoch)
        losses (dictionary of {loss name, value}) -- dictionary with values of batch losses
        mcd (float) -- evaluation Mel Cepstral Distortion
        source_len (tensor) -- number of characters of input utterances
        target_len (tensor) -- number of frames of ground-truth spectrograms
        source (tensor) -- input utterances
        target (tensor) -- ground-truth spectrograms
        prediction_forced (tensor) -- ground-truth-aligned spectrograms
        prediction (tensor) -- predicted spectrograms
        stop_prediction (tensor) -- predicted stop token probabilities
        stop_target (tensor) -- true stop token probabilities
        alignment (tensor) -- alignments (attention weights for each frame) of the last evaluation batch
        classifier (float) -- accuracy of the reversal classifier
    """

    # log losses
    total_loss = sum(losses.values())
    Logger._sw.add_scalar('Eval/loss_total', total_loss, eval_step)
    for n, l in losses.items():
        Logger._sw.add_scalar(f'Eval/loss_{n}', l, eval_step)

    # show random sample: spectrogram, stop token probability, alignment and audio
    idx = random.randint(0, alignment.size(0) - 1)
    predicted_spec = prediction[idx, :, :target_len[idx]].data.cpu().numpy()
    f_predicted_spec = prediction_forced[idx, :, :target_len[idx]].data.cpu().numpy()
    target_spec = target[idx, :, :target_len[idx]].data.cpu().numpy()

    # log spectrograms
    if hp.normalize_spectrogram:
        predicted_spec = audio.denormalize_spectrogram(predicted_spec, not hp.predict_linear)
        f_predicted_spec = audio.denormalize_spectrogram(f_predicted_spec, not hp.predict_linear)
        target_spec = audio.denormalize_spectrogram(target_spec, not hp.predict_linear)
    Logger._sw.add_figure("Predicted/generated", Logger._plot_spectrogram(predicted_spec), eval_step)
    Logger._sw.add_figure("Predicted/forced", Logger._plot_spectrogram(f_predicted_spec), eval_step)
    Logger._sw.add_figure("Target/eval", Logger._plot_spectrogram(target_spec), eval_step)

    # log audio
    waveform = audio.inverse_spectrogram(predicted_spec, not hp.predict_linear)
    Logger._sw.add_audio("Audio/generated", waveform, eval_step, sample_rate=hp.sample_rate)
    waveform = audio.inverse_spectrogram(f_predicted_spec, not hp.predict_linear)
    Logger._sw.add_audio("Audio/forced", waveform, eval_step, sample_rate=hp.sample_rate)

    # log alignment
    alignment = alignment[idx, :target_len[idx], :source_len[idx]].data.cpu().numpy().T
    Logger._sw.add_figure("Alignment/eval", Logger._plot_alignment(alignment), eval_step)

    # log source text
    utterance = text.to_text(source[idx].data.cpu().numpy()[:source_len[idx]], hp.use_phonemes)
    Logger._sw.add_text("Text/eval", utterance, eval_step)

    # log stop tokens
    Logger._sw.add_figure(
        "Stop/eval",
        Logger._plot_stop_tokens(stop_target[idx].data.cpu().numpy(),
                                 stop_prediction[idx].data.cpu().numpy()),
        eval_step)

    # log mel cepstral distortion
    Logger._sw.add_scalar('Eval/mcd', mcd, eval_step)

    # log reversal language classifier accuracy
    if hp.reversal_classifier:
        Logger._sw.add_scalar('Eval/classifier', classifier, eval_step)
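# The method above logs through Logger._sw, a shared TensorBoard writer that is
# assumed to be created during logger initialization. A minimal sketch of that
# setup using the standard torch.utils.tensorboard API (the log_dir default is
# hypothetical):

from torch.utils.tensorboard import SummaryWriter

class Logger:
    _sw = None  # shared writer used by evaluation() above

    @staticmethod
    def initialize(log_dir="logs"):
        # Create the SummaryWriter that all Eval/* scalars, figures,
        # audio clips and text entries are written into.
        Logger._sw = SummaryWriter(log_dir=log_dir)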
help="Does not save waveforms if set.") args = parser.parse_args() print("Building model ...") model = build_model(args.checkpoint, args.cpu) model.eval() #total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) #print(f"Builded model with {total_params} parameters") inputs = [l.rstrip() for l in sys.stdin.readlines() if l] spectrograms = [] for i, item in enumerate(inputs): print(f'Synthesizing({i+1}/{len(inputs)}): "{item}"') id = item.split("|")[0] s = synthesize(model, item, args.cpu) if not os.path.exists(args.output): os.makedirs(args.output) if args.save_spec: np.save(os.path.join(args.output, f'{id}.npy'), s) if not args.ignore_wav: w = audio.inverse_spectrogram(s, not hp.predict_linear) audio.save(w, os.path.join(args.output, f'{id}.wav'))