def infer_hifigan(args):
    """Run the third-party HiFi-GAN inference script as a subprocess.

    An executable copy of ``thirdparty/hifi-gan/inference.py`` with a
    ``#!/usr/bin/env python`` shebang prepended is written next to it, then
    launched with ``args.folder_in`` as the input directory. The child's
    stdout is echoed line by line.

    Args:
        args: Namespace providing ``model_name`` and ``folder_in``.
            The model name ``'hifigan'`` is mapped to ``'hifigan_v1'`` for
            the output and checkpoint paths.
    """
    import subprocess

    # The return value is unused; presumably called for its side effects
    # (e.g. making the checkpoint available) -- TODO confirm.
    load_model(args.model_name)
    model_name = args.model_name if args.model_name != 'hifigan' else 'hifigan_v1'

    inference_file = "thirdparty/hifi-gan/inference.py"
    exe_inference_file = "thirdparty/hifi-gan/exe_inference.py"

    # Prepend the shebang in Python rather than via a GNU-specific
    # ``sed '1 i ...'`` shell command, so no shell is involved and a missing
    # source file raises instead of silently producing an empty script.
    with open(inference_file, encoding="utf-8") as src:
        script_body = src.read()
    with open(exe_inference_file, "w", encoding="utf-8") as dst:
        dst.write("#!/usr/bin/env python\n")
        dst.write(script_body)

    # The mode must be an octal literal: decimal 777 equals 0o1411, which
    # does not grant the owner execute permission.
    os.chmod(exe_inference_file, 0o755)

    args_list = [
        "--input_wavs_dir", args.folder_in,
        "--output_dir", f"data/out/{model_name}",
        "--checkpoint_file", f"pretrained/{model_name}/model.pth",
    ]
    cmd = [exe_inference_file] + args_list

    # The context manager closes the pipe and waits for the child to exit.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          universal_newlines=True) as popen:
        for stdout_line in popen.stdout:
            print(stdout_line, end='')
def infer_wavenet(args):
    """Re-synthesize every ``*wav`` file in ``args.folder_in`` with WaveNet.

    Each input file is loaded at 22050 Hz, converted with ``MelSpectrogram``,
    passed to ``wavegen`` and saved under the same file name in
    ``args.folder_out`` at ``hparams.sample_rate``.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')
    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm

    target_sample_rate = 22050
    hparams, model = load_model(args.model_name)
    meller = MelSpectrogram()

    wav_names = [
        name for name in os.listdir(args.folder_in) if name.endswith('wav')
    ]
    for name in wav_names:
        source = os.path.join(args.folder_in, name)
        signal = load_wav(source, target_sample_rate)

        features = meller(signal)[0]
        # Swap the two axes when the second one is not the mel-bin axis.
        if features.shape[1] != hparams.num_mels:
            features = features.transpose(0, 1)

        # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder
        # assumes [0, 1]; the rescaling np.interp(c, (0, 4), (0, 1)) is
        # currently disabled.
        waveform = wavegen(model, c=features, fast=True, tqdm=tqdm)

        destination = os.path.join(args.folder_out, name)
        out_dir = os.path.dirname(destination)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torchaudio.save(destination, waveform, hparams.sample_rate)
def infer_melgan(args):
    """Re-synthesize every ``*wav`` file in ``args.folder_in`` with MelGAN.

    Each file is loaded at 22050 Hz, passed through the model (forward call
    followed by ``model.inverse``) with gradients disabled, and the result is
    saved under the same file name in ``args.folder_out``.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    target_sample_rate = 22050
    model = load_model(args.model_name)

    wav_names = [
        name for name in os.listdir(args.folder_in) if name.endswith('wav')
    ]
    for name in wav_names:
        source = os.path.join(args.folder_in, name)
        signal = load_wav(source, target_sample_rate)

        with torch.no_grad():
            waveform = model.inverse(model(signal))

        destination = os.path.join(args.folder_out, name)
        out_dir = os.path.dirname(destination)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torchaudio.save(destination, waveform.cpu(), target_sample_rate)
def infer_waveglow(args):
    """Re-synthesize every ``*wav`` file in ``args.folder_in`` with WaveGlow.

    Each file is loaded at 22050 Hz, converted with ``MelSpectrogram``, passed
    to ``model.inference`` and saved under the same file name in
    ``args.folder_out``. CUDA is used when available, otherwise the CPU.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    target_sample_rate = 22050
    n_mels = 80
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = load_model(args.model_name, device=device)
    meller = MelSpectrogram().to(device)

    files = [
        item for item in os.listdir(args.folder_in) if item.endswith('wav')
    ]
    for audio in files:
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate).to(device)

        # Pure inference: disable autograd, as infer_melgan does, so no
        # graph is recorded for the mel computation or the vocoder pass.
        with torch.no_grad():
            mel = meller(wav)
            # Move the mel-bin axis to dimension 1 when it is not already there.
            if mel.shape[1] != n_mels:
                mel = mel.permute(0, 2, 1)
            waveform = model.inference(mel)

        path = os.path.join(args.folder_out, audio)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torchaudio.save(path, waveform.cpu(), target_sample_rate)
from speech_distances.models import load_model
import argparse
from scipy.stats import wilcoxon, mannwhitneyu
import numpy as np

# Test the hypothesis that the files in path1 and path2 have the same quality
# against the one-sided alternative that the files in path1 are better than
# the files in path2. The Wilcoxon signed-rank test is a paired test, so the
# i-th score from path1 is compared with the i-th score from path2.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them None would be handed to the
    # MOS predictor instead of argparse reporting a usage error.
    parser.add_argument(
        "--path1",
        type=str,
        required=True,
        help="path to .wav files which are assumed to be better")
    parser.add_argument(
        "--path2",
        type=str,
        required=True,
        help="path to .wav files which are assumed to be worse")
    args = parser.parse_args()

    mos_pred = load_model("wave2vec_mos")
    moses_1 = np.array(mos_pred.calculate(args.path1, False))
    moses_2 = np.array(mos_pred.calculate(args.path2, False))

    # The element-wise comparison and the paired test below both need one
    # score per file on each side; fail early with a readable message.
    if moses_1.shape != moses_2.shape:
        raise ValueError(
            "path1 and path2 must yield the same number of scores for a "
            f"paired test, got shapes {moses_1.shape} and {moses_2.shape}")

    # Fraction of pairs in which the path1 score is strictly higher.
    print("Ratio:", (moses_1 > moses_2).sum() / len(moses_1))
    print("p-value:", wilcoxon(moses_1, moses_2, alternative="greater")[1])
def _mean_std(values):
    """Return ``(np.mean(values), np.std(values))``."""
    return np.mean(values), np.std(values)


def calculate_all_metrics(path, reference_path, n_max_files=None):
    """Compute a dictionary of quality metrics for the wav files in *path*.

    Args:
        path: Directory with the ``*.wav`` files to evaluate.
        reference_path: Directory with the reference ``*.wav`` files.
        n_max_files: Optional cap on the number of file pairs used for the
            per-file metrics (speechmetrics, SNR, LSD). ``None`` uses all
            pairs. It is not passed to the Frechet distance or the wav2vec
            MOS computation.

    Returns:
        dict with keys ``FDSD`` (scalar), ``MOS_wav2vec``, ``MOSdeg_wav2vec``,
        ``MOS_orig``, ``MOSdeg_orig``, ``sisdr``, ``stoi``, ``pesq``, ``sdr``,
        ``snr``, ``lsd`` (each a ``(mean, std)`` tuple) and
        ``MOSdeg_wav2vec_nonzero`` / ``MOSdeg_orig_nonzero`` (fraction of
        files whose score is below the reference score).
    """
    metrics = {}

    FD = FrechetDistance(
        path=path,
        reference_path=reference_path,
        backbone="deepspeech2",
        sr=16000,
        sample_size=10000,
        num_runs=1,
        window_size=None,
        conditional=True,
        use_cached=True,
    )
    metrics["FDSD"] = FD.calculate_metric()[0].data.item()
    # Move the encoder to the CPU before the next model is loaded
    # (presumably to free accelerator memory -- TODO confirm).
    FD.backbone.encoder.cpu()

    mos_pred = load_model("wave2vec_mos")
    moses = np.array(mos_pred.calculate(path, False))
    moses_ref = np.array(mos_pred.calculate(reference_path, False))
    mos_pred.cpu()

    metrics["MOS_wav2vec"] = moses.mean(), moses.std()
    # Degradation is clipped at zero: files scoring above the reference
    # count as "no degradation".
    mos_drop = np.maximum(moses_ref - moses, 0)
    metrics["MOSdeg_wav2vec"] = _mean_std(mos_drop)
    # ``.size`` instead of ``len(x.squeeze())``: the latter raises TypeError
    # when there is a single file, because squeezing yields a 0-d array.
    metrics["MOSdeg_wav2vec_nonzero"] = np.sum(mos_drop > 0) / moses.size

    computer = speechmetrics.load(
        ["bsseval", "mosnet", "pesq", "stoi", "sisdr"], None)

    # glob returns files in arbitrary order; sort both listings so that the
    # positional pairing below is deterministic and matches by file name
    # when both directories contain the same names.
    estimate_files = sorted(glob.glob(os.path.join(path, "*.wav")))
    reference_files = sorted(glob.glob(os.path.join(reference_path, "*.wav")))
    # Materialize the pairs once: all three loops then see the same files and
    # tqdm gets the true total even when n_max_files exceeds the file count.
    pairs = list(
        itertools.islice(zip(estimate_files, reference_files), n_max_files))

    per_file = [
        computer(estimate_file, reference_file)
        for estimate_file, reference_file in tqdm(
            pairs, desc="Calculating metrics from speechmetrics")
    ]
    scores = {k: [dic[k] for dic in per_file] for k in per_file[0]}

    # Scoring each reference against itself gives the baseline values used
    # for the MOSNet degradation below.
    per_file_ref = [
        computer(reference_file, reference_file)
        for _, reference_file in tqdm(
            pairs, desc="Calculating reference values of metrics")
    ]
    scores_ref = {k: [dic[k] for dic in per_file_ref] for k in per_file_ref[0]}

    mosnet = np.stack(scores["mosnet"])
    metrics["MOS_orig"] = _mean_std(mosnet)
    mosdeg = np.maximum(np.stack(scores_ref["mosnet"]) - mosnet, 0)
    metrics["MOSdeg_orig"] = _mean_std(mosdeg)
    metrics["MOSdeg_orig_nonzero"] = np.sum(mosdeg > 0) / mosdeg.size

    for key in ("sisdr", "stoi", "pesq", "sdr"):
        metrics[key] = _mean_std(np.stack(scores[key]))

    LSD = []
    SNR = []
    for estimate_file, reference_file in tqdm(
            pairs, desc="Calculating LSD and SNR metrics"):
        x = librosa.load(estimate_file, sr=16000)[0]
        y = librosa.load(reference_file, sr=16000)[0]
        # Trim both signals to the shorter one before normalizing.
        common_len = min(len(x), len(y))
        x = librosa.util.normalize(x[:common_len])
        y = librosa.util.normalize(y[:common_len])
        SNR.append(snr(x, y))
        LSD.append(lsd(x, y))

    metrics["snr"] = _mean_std(SNR)
    metrics["lsd"] = _mean_std(LSD)
    return metrics