def load_vocoder(self, model_file, model_config, use_cuda):
    """Load a vocoder checkpoint and put the model into inference mode.

    Args:
        model_file: checkpoint path; it is read with ``torch.load`` and its
            ``"model"`` entry is used as the state dict.
        model_config: path of the vocoder config handed to ``load_config``.
        use_cuda: when truthy, the model is moved to the GPU after loading.

    Sets ``self.vocoder_config`` and ``self.vocoder_model``.
    """
    # Parse the config once and reuse it for model construction.
    self.vocoder_config = load_config(model_config)
    self.vocoder_model = setup_generator(self.vocoder_config)

    # Deserialize onto the CPU; the device move (if any) happens below.
    checkpoint = torch.load(model_file, map_location="cpu")
    self.vocoder_model.load_state_dict(checkpoint["model"])

    # Weight norm is removed only after the weights have been loaded.
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
def __init__(self, path, INPUT_SR, TARGET_SR, WINDOW_LENGTH):
    """Index the wav files under ``path`` and build both audio processors.

    Args:
        path: root directory searched recursively for ``*.wav`` files.
        INPUT_SR: sample rate written into the audio config before the
            input ``AudioProcessor`` is built.
        TARGET_SR: sample rate written into the audio config before the
            target ``AudioProcessor`` is built.
        WINDOW_LENGTH: divisor applied to every file duration to obtain the
            per-file integer counts stored in ``self.repertoir``.

    File durations are cached in ``./cache.json``; the cache is used as-is
    whenever that file exists.
    """
    self.INPUT_SR = INPUT_SR
    self.TARGET_SR = TARGET_SR
    self.WINDOW_LENGTH = WINDOW_LENGTH

    # One shared config; only the sample rate differs between processors.
    self.CONFIG = load_config('config_fr.json')
    self.CONFIG['audio']['sample_rate'] = self.INPUT_SR
    self.AP_INPUT = AudioProcessor(**self.CONFIG['audio'])
    self.CONFIG['audio']['sample_rate'] = self.TARGET_SR
    self.AP_TARGET = AudioProcessor(**self.CONFIG['audio'])

    self.files = glob.glob(path + '/**/*.wav', recursive=True)

    # If you change your dataset, delete cache.json
    if not os.path.isfile('./cache.json'):
        print("> Computing wave files length...")
        durations = []
        for wav_path in tqdm(self.files):
            durations.append(librosa.get_duration(filename=wav_path))
        self.pre_repertoir = durations
        with open('./cache.json', mode="w") as cache_handle:
            json.dump(self.pre_repertoir, cache_handle)
    else:
        with open('./cache.json', "r") as cache_handle:
            self.pre_repertoir = json.load(cache_handle)

    # int() truncates, so a trailing partial window is not counted.
    self.repertoir = [
        int(duration / WINDOW_LENGTH) for duration in self.pre_repertoir
    ]
    self.length = self.get_len()
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
    """Build the WaveRNN vocoder, load its weights and switch it to eval mode.

    Args:
        lib_path: directory appended to ``sys.path`` so that the ``WaveRNN``
            package can be imported.
        model_file: checkpoint path; its ``'model'`` entry is the state dict.
        model_config: path of the WaveRNN config handed to ``load_config``.
        use_cuda: when truthy, the model is moved to the GPU. When falsy the
            model stays on the CPU.

    Reads ``self.ap.hop_length`` and ``self.ap.sample_rate``, so ``self.ap``
    must already be set. Sets ``self.wavernn_config`` and ``self.wavernn``.
    """
    # TODO: set a function in wavernn code base for model setup and call it here.
    sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
    #pylint: disable=import-outside-toplevel
    from WaveRNN.models.wavernn import Model

    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(model_config)

    # This is the default architecture we use for our models.
    # You might need to update it
    # The model is built on the CPU; it is moved to the GPU only when
    # use_cuda is set, so CPU-only hosts can load it.
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        mulaw=self.wavernn_config.mulaw,
        pad=self.wavernn_config.pad,
        use_aux_net=self.wavernn_config.use_aux_net,
        use_upsample_net=self.wavernn_config.use_upsample_net,
        upsample_factors=self.wavernn_config.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )

    check = torch.load(model_file, map_location="cpu")
    self.wavernn.load_state_dict(check['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
def test_in_out(self):
    """Create a random checkpoint, then run one sentence through Synthesizer."""
    self._create_random_model()

    server_config_file = os.path.join(get_tests_input_path(),
                                      'server_config.json')
    config = load_config(server_config_file)

    # The checkpoint and its config are looked up under the tests output
    # directory, so both config entries are prefixed with that directory.
    tts_root_path = get_tests_output_path()
    for key in ('tts_checkpoint', 'tts_config'):
        config[key] = os.path.join(tts_root_path, config[key])

    synthesizer = Synthesizer(config)
    synthesizer.tts("Better this test works!!")
def _create_random_model(self):
    """Build a model from the dummy config and save it as a checkpoint.

    When the config has a ``characters`` entry, the module-level ``symbols``
    and ``phonemes`` are replaced by the result of ``make_symbols``.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    config_file = os.path.join(get_tests_output_path(),
                               'dummy_model_config.json')
    config = load_config(config_file)

    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    # The input vocabulary size depends on whether phonemes are used.
    if config.use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, 10, 10, 1, output_path)
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    """Load the TTS model, its config and its audio processor.

    Args:
        tts_checkpoint: checkpoint path; its ``'model'`` entry is the state
            dict and an optional ``'r'`` entry is passed to the decoder's
            ``set_r``.
        tts_config: path of the model config handed to ``load_config``.
        use_cuda: when truthy, the model is moved to the GPU.

    Sets ``self.tts_config``, ``self.use_phonemes``, ``self.ap``,
    ``self.input_size``, ``self.tts_model`` and, when
    ``self.config.tts_speakers`` is not ``None``, ``self.tts_speakers``.
    May also rebind the module-level ``symbols`` and ``phonemes``.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    # A custom character set in the config overrides the module defaults.
    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    self.input_size = len(phonemes) if self.use_phonemes else len(symbols)

    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is None:
        num_speakers = 0
    else:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # load model state (deserialized onto the CPU first)
    checkpoint = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    self.tts_model.load_state_dict(checkpoint['model'])

    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000

    # Use the reduction factor stored in the checkpoint when there is one.
    if 'r' in checkpoint:
        reduction_factor = checkpoint['r']
        self.tts_model.decoder.set_r(reduction_factor)
        print(f" > model reduction factor: {reduction_factor}")
def main():
    """Run preprocessing process.

    Computes per-bin mean and standard deviation of the mel and linear
    spectrograms over the first split returned by ``load_meta_data`` and
    saves them, together with the audio config used, to ``scale_stats.npy``.

    Command-line arguments:
        --config_path: TTS config file (required).
        --out_path: directory for ``scale_stats.npy``; when omitted the file
            is written to the current working directory.
    """
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogtram features.")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="TTS config file path to define audio processin parameters.")
    parser.add_argument("--out_path",
                        default=None,
                        type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # Resolve the output path before the expensive pass over the dataset so
    # that a missing --out_path cannot fail after all the work is done.
    output_file_path = os.path.join(args.out_path or "", "scale_stats.npy")

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    # Running sums; they become arrays after the first accumulation.
    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats (sums are taken over axis 1; N counts that axis)
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    # E[x^2] - E[x]^2 can dip slightly below zero through floating-point
    # cancellation; clamp it so the square root never produces NaN.
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0))
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(
        np.maximum(linear_square_sum / N - linear_mean**2, 0))

    stats = {
        'mel_mean': mel_mean,
        'mel_std': mel_scale,
        'linear_mean': linear_mean,
        'linear_std': linear_scale,
    }

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg lienar spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values; a key that is already absent is not an error
    for redundant_key in ('max_norm', 'min_level_db', 'symmetric_norm',
                          'clip_norm'):
        CONFIG.audio.pop(redundant_key, None)
    stats['audio_config'] = CONFIG.audio
    # The payload is a dict, so it has to be stored as a pickled object.
    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > scale_stats.npy is saved to {output_file_path}')
default="", help='DISTRIBUTED: process group id.') args = parser.parse_args() if args.continue_path != '': args.output_path = args.continue_path args.config_path = os.path.join(args.continue_path, 'config.json') list_of_files = glob.glob( args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") # setup output paths and read configs c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path if args.continue_path == '': OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') c_logger = ConsoleLogger() if args.rank == 0: os.makedirs(AUDIO_PATH, exist_ok=True) new_fields = {}
from tests import get_tests_input_path from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2 from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite, load_tflite_model) from mozilla_voice_tts.utils.io import load_config tf.get_logger().setLevel('INFO') #pylint: disable=unused-variable torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) class TacotronTFTrainTest(unittest.TestCase): @staticmethod def generate_dummy_inputs(): chars_seq = torch.randint(0, 24, (8, 128)).long().to(device) chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device) chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0] mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
import os import unittest import torch as T from tests import get_tests_input_path from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder from mozilla_voice_tts.utils.io import load_config file_path = get_tests_input_path() c = load_config(os.path.join(file_path, "test_config.json")) class SpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): dummy_input = T.rand(4, 20, 80) # B x T x D dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3) # computing d vectors output = model.forward(dummy_input) assert output.shape[0] == 4 assert output.shape[1] == 256 output = model.inference(dummy_input) assert output.shape[0] == 4 assert output.shape[1] == 256 # compute d vectors by passing LSTM hidden
# Command-line interface: paths of the torch checkpoint, its config file and
# the output file for the converted TF model.
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
                    type=str,
                    help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
                    type=str,
                    help='Path to config file of torch model.')
parser.add_argument(
    '--output_path',
    type=str,
    help='path to output file including file name to save TF model.')
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
# NOTE(review): num_speakers is not read anywhere in the visible part of the
# script - confirm whether later code uses it.
num_speakers = 0

# init torch model
model = setup_generator(c)
# The checkpoint is deserialized onto the CPU.
checkpoint = torch.load(args.torch_model_path,
                        map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
# Re-read the state dict so it reflects the model after remove_weight_norm().
state_dict = model.state_dict()

# init tf model
model_tf = setup_tf_generator(c)

# Suffix string; presumably appended to TF checkpoint variable names when
# they are matched further down - confirm against the rest of the script.
common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
help="JSON file for multi-speaker model.", default="") parser.add_argument( '--speaker_fileid', type=str, help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.", default=None) parser.add_argument( '--gst_style', help="Wav path file for GST stylereference.", default=None) args = parser.parse_args() # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default if 'characters' in C.keys(): symbols, phonemes = make_symbols(**C.characters) speaker_embedding = None speaker_embedding_dim = None num_speakers = 0 # load speakers if args.speakers_json != '':