def load(self, checkpoint_path, fast=True):
    """Build the TTS model and load weights from *checkpoint_path*.

    Side effects: sets ``self._frontend``, ``self.model`` and
    ``self.use_cuda``; also patches ``train._frontend`` so the training
    module shares the same text frontend.  When ``fast`` is true the
    model's incremental-inference fast path is enabled.
    """
    # Presets: a named preset in hparams overrides individual hparams.
    if hparams.preset is not None and hparams.preset != "":
        preset = hparams.presets[hparams.preset]
        import json
        hparams.parse_json(json.dumps(preset))
        print("Override hyper parameters with preset \"{}\": {}".format(
            hparams.preset, json.dumps(preset, indent=4)))

    self._frontend = getattr(frontend, hparams.frontend)
    import train
    # The train module reads a module-level _frontend; keep it in sync.
    train._frontend = self._frontend
    from train import build_model

    # Model
    self.model = build_model()

    # Load checkpoints separately
    checkpoint = torch.load(checkpoint_path)
    self.model.load_state_dict(checkpoint["state_dict"])
    #model.seq2seq.decoder.max_decoder_steps = max_decoder_steps
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.model = self.model.cuda()
    self.model.eval()
    if fast:
        self.model.make_generation_fast_()
def main():
    """Parse the FFTNet training CLI, apply an optional preset, and train."""
    ap = argparse.ArgumentParser(description='Train FFTNet')
    add = ap.add_argument
    add('--base_dir', default='')
    add('--hparams',
        default='',
        help=
        'Hyper parameter overrides as a comma-separated list of name=value pairs'
        )
    add('--train_file', default='training_data/train.txt')
    add('--val_file', default='training_data/val.txt')
    add('--name', help='Name of logging directory.')
    add('--model', default='fftnet')
    add('--preset', default=None, type=str,
        help='the preset config json file')
    add('--output_dir', default='output/',
        help='folder to contain synthesized mel spectrograms')
    add('--restore_step', default=None, type=int,
        help='the restore step')
    add('--summary_interval', type=int, default=200,
        help='Steps between running summary ops')
    add('--summary_val_interval', type=int, default=10,
        help='Steps between running summary ops')
    add('--eval_interval', type=int, default=100,
        help='Steps between train eval ops')
    add('--checkpoint_interval', type=int, default=2000,
        help='Steps between writing checkpoints')
    add('--epochs', type=int, default=2000,
        help='total number of tacotron training steps')
    add('--tf_log_level', type=int, default=2,
        help='TensorFlow C++ log level.')
    args = ap.parse_args()

    # load preset config, so u don't need to change anything in the hparams
    if args.preset is not None:
        with open(args.preset) as preset_file:
            hparams.parse_json(preset_file.read())

    log_dir, hp = prepare_run(args)
    train(log_dir, args, hp)
def load_hparams_from_preset(preset):
    """Load a JSON preset file into ``wavenet_hparams``.

    Lines whose stripped form begins with ``//`` are treated as comments
    and skipped before the remainder is parsed as JSON.
    """
    with open(preset) as fh:
        kept = [ln for ln in fh if not ln.strip().startswith("//")]
    wavenet_hparams.parse_json("".join(kept))
def main():
    """Entry point: parse docopt args, configure logging, and run train/eval.

    Reads training/validation key lists, resolves them to source/target
    feature file paths, then delegates to ``train_and_evaluate``.
    """
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_dir = args["--checkpoint-dir"]
    source_data_root = args["--source-data-root"]
    target_data_root = args["--target-data-root"]
    selected_list_dir = args["--selected-list-dir"]
    use_multi_gpu = args["--multi-gpus"]

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            # FIX: the original bound this to the name `json`, shadowing the
            # stdlib json module; also f.read() replaces "".join(readlines()).
            hparams_json = f.read()
        hparams.parse_json(hparams_json)

    hparams.parse(args["--hparams"])

    training_list = list(load_key_list("train.csv", selected_list_dir))
    validation_list = list(load_key_list("validation.csv", selected_list_dir))

    training_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in training_list
    ]
    training_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in training_list
    ]
    validation_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in validation_list
    ]
    validation_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in validation_list
    ]
    print("training source", len(training_source_files))
    print("training target", len(training_target_files))

    # Route tensorflow's logger to a file as well as the console.
    log = logging.getLogger("tensorflow")
    log.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh = logging.FileHandler(hparams.logfile)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info(hparams_debug_string())

    train_and_evaluate(hparams, checkpoint_dir, training_source_files,
                       training_target_files, validation_source_files,
                       validation_target_files, use_multi_gpu)
def main():
    """CLI entry: merge preset + hparam overrides, then run synthesis."""
    args = get_args()

    # An optional JSON preset file is applied first ...
    if args.preset is not None:
        with open(args.preset) as preset_file:
            hparams.parse_json(preset_file.read())

    # ... then comma-separated name=value overrides on top of it.
    modified_hp = hparams.parse(args.hparams)
    print(hparams_debug_string())

    synthesis(args.checkpoint_path, args.local_path, args.global_id,
              args.output_dir, modified_hp)
def prepare_run(args):
    """Resolve hyperparameters and create the logging directory for a run.

    Returns ``(log_dir, modified_hp)``.  Also sets TF_CPP_MIN_LOG_LEVEL
    and initialises infolog for the run.
    """
    if args.hparams_fp is not None:
        # FIX: the original leaked the file handle (open(...).read() with no
        # close); a with-block guarantees it is closed.
        with open(args.hparams_fp, 'r') as f:
            modified_hp = hparams.parse_json(f.read())
    else:
        modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
    run_name = args.name or args.model
    log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name))
    os.makedirs(log_dir, exist_ok=True)
    infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name,
                 args.slack_url)
    return log_dir, modified_hp
def main():
    """Entry point: resolve a test key list to file paths and run prediction."""
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_dir = args["--checkpoint-dir"]
    checkpoint_path = args["--checkpoint"]
    source_data_root = args["--source-data-root"]
    target_data_root = args["--target-data-root"]
    selected_list_dir = args["--selected-list-dir"]
    output_dir = args["--output-dir"]
    selected_list_filename = args["--selected-list-filename"] or "test.csv"
    tf.logging.set_verbosity(tf.logging.INFO)

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            # FIX: the original bound this to the name `json`, shadowing the
            # stdlib json module; also f.read() replaces "".join(readlines()).
            hparams_json = f.read()
        hparams.parse_json(hparams_json)

    hparams.parse(args["--hparams"])
    tf.logging.info(hparams_debug_string())
    tf.logging.info(
        f"A selected list file to use: {os.path.join(selected_list_dir, selected_list_filename)}"
    )

    test_list = list(load_key_list(selected_list_filename, selected_list_dir))
    test_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in test_list
    ]
    test_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in test_list
    ]
    predict(hparams, checkpoint_dir, checkpoint_path, output_dir,
            test_source_files, test_target_files)
def get_model(checkpoint_path):
    """Build the DeepVoice3 (nyanko preset) model and load *checkpoint_path*.

    Returns the model with its decoder capped at 500 decoder steps.
    Side effect: sets the module-level ``_frontend`` and mirrors it into
    the ``train`` module.
    """
    global _frontend
    max_decoder_steps = 500
    assert hparams.name == "deepvoice3"

    # The bundled nyanko/ljspeech preset is always applied.
    preset = join(dirname(__file__), 'presets/nyanko_ljspeech.json')
    with open(preset) as f:
        hparams.parse_json(f.read())

    _frontend = getattr(frontend, hparams.frontend)
    train._frontend = _frontend

    # Model
    model = build_model()
    checkpoint = _load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    # FIX: dropped the unused local `checkpoint_name` the original computed
    # from the checkpoint basename.
    model.seq2seq.decoder.max_decoder_steps = max_decoder_steps
    return model
# NOTE(review): the first five statements appear to be the tail of a
# synthesis function whose `def` is outside this chunk — confirm their
# original indentation against the full file.
out0 = np.clip(out0, -1, 1)
sf.write(join(writing_dir, "out0_{}.wav".format(sigma)), out0,
         hparams.sample_rate)
out1 = inv_linear_quantize(x1[0].detach().cpu().numpy(),
                           hparams.quantize_channels - 1)
out1 = np.clip(out1, -1, 1)
sf.write(join(writing_dir, "out1_{}.wav".format(sigma)), out1,
         hparams.sample_rate)


if __name__ == "__main__":
    args = docopt(__doc__)

    # Load preset if specified; otherwise fall back to the hparams.json
    # saved next to the first checkpoint, if one exists.
    if args["--preset"] is not None:
        with open(args["--preset"]) as f:
            hparams.parse_json(f.read())
    else:
        hparams_json = join(dirname(args["<checkpoint1>"]), "hparams.json")
        if exists(hparams_json):
            print("Loading hparams from {}".format(hparams_json))
            with open(hparams_json) as f:
                hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    main(args)
from pyspark import SparkContext
from docopt import docopt
from hparams import hparams, hparams_debug_string
from preprocess.vctk import VCTK

if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    source_only = args["--source-only"]
    target_only = args["--target-only"]

    # Optional JSON hparam file, then comma-separated overrides on top.
    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            hparams_json = "".join(f.readlines())
            hparams.parse_json(hparams_json)
    hparams.parse(args["--hparams"])
    print(hparams_debug_string())

    # Decide which sides to process.  Equivalent to the if/elif/else chain:
    # --source-only wins when both flags are given; neither flag means both.
    process_source = source_only or not target_only
    process_target = not source_only

    instance = VCTK(in_dir, out_dir, hparams)
# NOTE(review): this chunk reads like the body of a wavenet-vocoder training
# main(); `args`, `checkpoint_dir`, `speaker_id` and `use_cuda` must be bound
# earlier in the enclosing scope — confirm against the full file.
if data_root is None:
    data_root = join(dirname(__file__), "data", "ljspeech")
log_event_path = args["--log-event-path"]
reset_optimizer = args["--reset-optimizer"]

# Override hyper parameters
hparams.parse(args["--hparams"])
print(hparams_debug_string())
assert hparams.name == "wavenet_vocoder"

# Presets
# NOTE(review): the preset is applied AFTER the --hparams overrides above,
# so a preset silently clobbers command-line overrides — confirm intended.
if hparams.preset is not None and hparams.preset != "":
    preset = hparams.presets[hparams.preset]
    import json
    hparams.parse_json(json.dumps(preset))
    print("Override hyper parameters with preset \"{}\": {}".format(
        hparams.preset, json.dumps(preset, indent=4)))

os.makedirs(checkpoint_dir, exist_ok=True)

# Dataloader setup
data_loaders = get_data_loaders(data_root, speaker_id, test_shuffle=True)

# Model
model = build_model()
print(model)
if use_cuda:
    model = model.cuda()
receptive_field = model.receptive_field
from pyspark import SparkContext
from docopt import docopt
from hparams import hparams, hparams_debug_string
from preprocess.vctk import VCTK

if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    source_only = args["--source-only"]
    target_only = args["--target-only"]

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            # FIX: the original bound this to the name `json`, shadowing the
            # stdlib json module; renamed to match the sibling script that
            # already uses `hparams_json`.
            hparams_json = "".join(f.readlines())
            hparams.parse_json(hparams_json)
    hparams.parse(args["--hparams"])
    print(hparams_debug_string())

    # --source-only wins when both flags are given; neither flag means both.
    if source_only:
        process_source = True
        process_target = False
    elif target_only:
        process_source = False
        process_target = True
    else:
        process_source = True
        process_target = True

    instance = VCTK(in_dir, out_dir, hparams)
def synthesis(checkpoint_path, preset, dst_dir, srt_path, face_path):
    """Synthesize one wav (plus alignment plot) per subtitle entry.

    Loads a DeepVoice3 model from *checkpoint_path* (optionally overriding
    hparams from a *preset* JSON), iterates the entries produced by
    ``load_srt(srt_path, face_path)`` and writes ``<idx> speaker_<id>
    <start>-<end>.wav`` files into *dst_dir*.
    """
    global _frontend
    # Separate seq2seq/postnet checkpoints are disabled in this build; the
    # branch below that uses them is therefore dead unless re-enabled here.
    checkpoint_seq2seq_path = None
    checkpoint_postnet_path = None
    max_decoder_steps = 500
    file_name_suffix = ""
    replace_pronunciation_prob = float(0.0)

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse("")
    assert hparams.name == "deepvoice3"

    _frontend = getattr(frontend, hparams.frontend)
    print(_frontend)
    import train
    # keep the train module's frontend in sync with ours
    train._frontend = _frontend
    from train import plot_alignment, build_model

    # Model
    model = build_model()

    # Load checkpoints separately
    if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None:
        checkpoint = _load(checkpoint_seq2seq_path)
        model.seq2seq.load_state_dict(checkpoint["state_dict"])
        checkpoint = _load(checkpoint_postnet_path)
        model.postnet.load_state_dict(checkpoint["state_dict"])
        checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0]
    else:
        checkpoint = _load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        checkpoint_name = splitext(basename(checkpoint_path))[0]
    model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)
    task = load_srt(srt_path, face_path)
    idx = 0
    for i in task:
        # Each entry looks like (?, start, end, speaker_id, text) — the
        # indices used below; confirm against load_srt.
        speaker_id = i[3]
        text = i[4]
        words = nltk.word_tokenize(text)
        file_name = "{} speaker_{} {}-{}".format(idx, speaker_id, i[1], i[2])
        print(text)
        waveform, alignment, _, _ = tts(model,
                                        text,
                                        p=replace_pronunciation_prob,
                                        speaker_id=speaker_id,
                                        fast=True)
        dst_wav_path = join(dst_dir, "{}.wav".format(file_name))
        dst_alignment_path = join(dst_dir,
                                  "{}_alignment.png".format(file_name))
        plot_alignment(alignment.T,
                       dst_alignment_path,
                       info="{}, {}".format(hparams.builder,
                                            basename(checkpoint_path)))
        audio.save_wav(waveform, dst_wav_path)
        print(
            idx,
            ": {}\n ({} chars, {} words)".format(text, len(text), len(words)))
        idx += 1
    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
# NOTE(review): this chunk is the argument-handling tail of a wavenet
# preprocessing CLI; `args` is bound earlier, and the trailing
# `if name == "ljspeech":` continues past this chunk.
name = args["<name>"]
in_dir = args["<in_dir>"]
out_dir = args["<out_dir>"]
num_workers = args["--num_workers"]
# Default to half the CPUs when --num_workers was not given.
num_workers = cpu_count() // 2 if num_workers is None else int(num_workers)
preset = args["--preset"]

# Load preset if specified; lines starting with "//" are JSON comments.
if preset is not None:
    hparams_json_string = ""
    with open(preset) as f:
        for line in f:
            if line.strip().startswith("//"):
                continue
            hparams_json_string += line
    hparams.parse_json(hparams_json_string)

# Override hyper parameters
hparams.parse(args["--hparams"])
assert hparams.name == "wavenet_vocoder"

print("Using name: '%s'" % name)
print("Sampling frequency: {}".format(hparams.sample_rate))

# These legacy dataset names are rejected in favour of the generic loader.
if name in ["cmu_arctic", "jsut", "librivox"]:
    print("""warn!: {} is no longer explicitly supported!
Please use a generic dataest 'wavallin' instead.
All you need to do is to put all wav files in a single directory.""".format(
        name))
    sys.exit(1)

if name == "ljspeech":
def main():
    """Tacotron2 training/synthesis entry point.

    Parses docopt args, resolves hparams (preset JSON then --hparams
    overrides), builds model/optimizer, optionally resumes from a
    checkpoint; in "synthesis" phase generates audio for a text list and
    returns, otherwise runs the train/validate epoch loop.
    """
    args = docopt(__doc__)
    print("Command line args:\n", args)
    run_name = args["--run-name"]  # dataset root
    device = args["--device"]
    phase = args["--phase"]  # train or synthesis
    data_root = args["--data-root"]  # dataset root
    checkpoint_name = args["--checkpoint-name"]
    speaker_id = args["--speaker-id"]
    log_event_path = args["--log-event-path"]
    reset_optimizer = args["--reset-optimizer"]
    text_list_file_path = args["--text-list-file"]
    preset = args["--preset"]
    speaker_id = int(speaker_id) if speaker_id is not None else None

    if run_name is None:
        run_name = "Tacotron2" + time_string()
    log_dir = prepare_run(run_name)
    if data_root is None:
        data_root = os.path.join(dirname(__file__), "data", "mandarin")

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.builder == "Tacotron2"
    if device is not None:
        hparams.device = device
    print(hparams_debug_string())

    train_path = os.path.join(log_dir, "train")
    val_path = os.path.join(log_dir, "val")
    checkpoint_path = os.path.join(log_dir, "pretrained")
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)
    os.makedirs(checkpoint_path, exist_ok=True)

    # NOTE(review): best_loss starts at 0 and is updated with min(), so it
    # can never rise above 0 — `is_best` below is almost always False and
    # is unused anyway; likely should start at float('inf'). Confirm.
    best_loss = 0
    global global_epoch
    global_epoch = 0
    global global_step
    global_step = 0

    if hparams.seed is not None:
        random.seed(hparams.seed)
        torch.manual_seed(hparams.seed)
        cudnn.deterministic = hparams.cudnn_deterministic
        cudnn.benchmark = hparams.cudnn_benchmark
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
        log("The system set the random number to:{}".format(hparams.seed))

    if hparams.device > -1:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    distributed = hparams.world_size > 1
    if distributed:
        dist.init_process_group(backend=hparams.dist_backend,
                                init_method=hparams.dist_url,
                                world_size=hparams.world_size)

    model = build_model()
    print(model)
    # Placement: single chosen GPU > distributed > plain DataParallel.
    if hparams.device > -1:
        model = model.cuda(hparams.device)
    elif distributed:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    mels_criterion = MaskedMSELoss()
    stop_criterion = MaskedBCELoss()
    optimizer = torch.optim.Adam(model.get_trainable_parameters(),
                                 lr=hparams.init_learning_rate,
                                 betas=(hparams.adam_beta1,
                                        hparams.adam_beta2),
                                 eps=hparams.adam_epsilon,
                                 weight_decay=hparams.weight_decay)
    scheduler = ExpLRDecay(init_learning_rate=hparams.init_learning_rate,
                           decay_rate=hparams.decay_rate,
                           start_step=hparams.start_decay,
                           decay_steps=hparams.decay_step)

    # optionally resume from a checkpoint: try the name as a path first,
    # then relative to the run's checkpoint directory.
    if checkpoint_name is not None:
        if os.path.isfile(checkpoint_name):
            load_checkpoint(checkpoint_name, model, hparams.device, optimizer,
                            reset_optimizer)
        else:
            file_full_path = os.path.join(checkpoint_path, checkpoint_name)
            if os.path.isfile(file_full_path):
                load_checkpoint(file_full_path, model, hparams.device,
                                optimizer, reset_optimizer)
            else:
                log("=> no checkpoint found at '{}'".format(checkpoint_name))

    # synthesis
    if phase == "synthesis":
        if text_list_file_path is None:
            # Built-in pinyin demo lines (note: the 2nd and 3rd literals are
            # adjacent strings and concatenate into one entry).
            test_lines = [
                "yun2cong2ke1ji4cheng2li4yu2er4ling2yi1wu3nian2si4yue4",
                "shi4yi1jia1fu1hua4yu2zhong1guo2ke1xue2yuan4chong2qing4yan2jiu1yuan4de0gao1ke1ji4qi3ye4"
                "zhuan1zhu4yu2ji4suan4ji1shi4jue2yu3ren2gong1zhi4neng2",
                "yi2ge4hao3zheng4quan2zhi1de2yi3bao3chi2da4bu4fen4zai4yu2bu4tong2de0zheng4jian4",
                "he2li3de0fa1hui1qi2gong1yong4"
            ]
        else:
            test_lines = []
            with open(text_list_file_path, "rb") as f:
                lines = f.readlines()
                for line in lines:
                    # strip the trailing newline byte
                    text = line.decode("utf-8")[:-1]
                    test_lines.append(text)
        synthesis(test_lines, model, device, log_dir)
        return

    # Setup summary writer for tensorboard
    if log_event_path is None:
        log_event_path = os.path.join(log_dir, "log_event_path")
    print("Los event path: {}".format(log_event_path))
    writer = SummaryWriter(log_dir=log_event_path)

    # Prepare dataset
    dataset_dir = os.path.join(dirname(__file__), data_root)
    texts_list, mels_list, mels_length_list, speaker_ids_list = get_item_list(
        dataset_dir, "train.txt")
    #indices = np.arange(256*16)
    # Trim so the total divides evenly into batches.
    indices = np.arange(len(texts_list) - len(texts_list) % hparams.batch_size)
    test_size = hparams.test_batches * hparams.batch_size
    train_indices, val_indices = train_test_split(indices,
                                                  test_size=test_size,
                                                  random_state=hparams.seed)
    collate_fn = AudioCollate(padding_mels=hparams.padding_mels)

    # prepare train dataset
    train_dataset_text_ids = [texts_list[i] for i in train_indices]
    train_dataset_mels_ids = [mels_list[i] for i in train_indices]
    train_dataset_mels_length_ids = [
        mels_length_list[i] for i in train_indices
    ]
    if speaker_ids_list is not None:
        train_dataset_speaker_ids = [
            speaker_ids_list[i] for i in train_indices
        ]
    else:
        train_dataset_speaker_ids = None
    train_dataset = AudiobookDataset(train_dataset_text_ids,
                                     train_dataset_mels_ids,
                                     train_dataset_speaker_ids, dataset_dir)

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_loader = DataLoader(train_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=hparams.batch_size,
                                  num_workers=2,
                                  shuffle=True,
                                  pin_memory=hparams.pin_memory)
    else:
        if hparams.dynamical_batch_size:
            train_sampler = DynamicalSimilarTimeLengthSampler(
                train_dataset_mels_length_ids,
                batch_size_min=hparams.batch_size,
                batch_expand_level=hparams.batch_size_level,
                batch_group=hparams.batch_group,
                permutate=hparams.permutate)
            train_batch_sampler = DynamicalBatchSampler(train_sampler)
            train_loader = DataLoader(train_dataset,
                                      collate_fn=collate_fn,
                                      batch_size=hparams.batch_size,
                                      batch_sampler=train_batch_sampler,
                                      num_workers=2,
                                      shuffle=False,
                                      pin_memory=True)
        else:
            train_sampler = SimilarTimeLengthSampler(
                train_dataset_mels_length_ids,
                descending=True,
                batch_size=hparams.batch_size,
                batch_group_size=hparams.batch_group_size,
                permutate=hparams.permutate)
            # NOTE(review): the sampler built above is immediately discarded
            # and `shuffle` is computed but never passed to DataLoader —
            # this looks like leftover debugging; confirm intent.
            train_sampler = None
            shuffle = (train_sampler == None)
            train_loader = DataLoader(train_dataset,
                                      collate_fn=collate_fn,
                                      batch_size=hparams.batch_size,
                                      sampler=train_sampler,
                                      num_workers=2,
                                      shuffle=False,
                                      pin_memory=True)

    # prepare val dataset
    val_dataset_text_ids = [texts_list[i] for i in val_indices]
    val_dataset_mels_ids = [mels_list[i] for i in val_indices]
    val_dataset_mels_length_ids = [mels_length_list[i] for i in val_indices]
    if speaker_ids_list is not None:
        val_dataset_speaker_ids = [speaker_ids_list[i] for i in val_indices]
    else:
        val_dataset_speaker_ids = None
    val_dataset = AudiobookDataset(val_dataset_text_ids, val_dataset_mels_ids,
                                   val_dataset_speaker_ids, dataset_dir)
    val_loader = DataLoader(val_dataset,
                            collate_fn=collate_fn,
                            batch_size=hparams.batch_size,
                            num_workers=2,
                            shuffle=True,
                            pin_memory=True)

    for epoch in range(global_epoch, hparams.nepochs):
        # train for one epoch
        train(train_loader, model, hparams.device, mels_criterion,
              stop_criterion, optimizer, scheduler, writer, train_path)
        # evaluate on validation set
        loss = validate(val_loader, model, hparams.device, mels_criterion,
                        stop_criterion, writer, val_path)
        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(model, optimizer, checkpoint_path)
# NOTE(review): this chunk is the argument/feature-loading tail of a wavenet
# synthesis CLI; `args` is bound earlier in the enclosing scope.
conditional_path = args["--conditional"]
# From https://github.com/Rayhane-mamah/Tacotron-2
symmetric_mels = args["--symmetric-mels"]
max_abs_value = float(args["--max-abs-value"])
file_name_suffix = args["--file-name-suffix"]
output_html = args["--output-html"]
speaker_id = args["--speaker-id"]
speaker_id = None if speaker_id is None else int(speaker_id)
preset = args["--preset"]

# Load preset if specified
if preset is not None:
    with open(preset) as f:
        hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args["--hparams"])
assert hparams.name == "wavenet_vocoder"

# Load conditional features
if conditional_path is not None:
    c = np.load(conditional_path)
    if c.shape[1] != hparams.num_mels:
        # FIX: np.swapaxes returns a new view and does not modify its input;
        # the original discarded the result, so (T, C) inputs were never
        # transposed (the sibling script correctly assigns it).
        c = np.swapaxes(c, 0, 1)
    if max_abs_value > 0:
        min_, max_ = 0, max_abs_value
        if symmetric_mels:
            min_ = -max_
        print("Normalize features to desired range [0, 1] from [{}, {}]".
              format(min_, max_))
def wavsynthesis():
    """Generate audio from a conditional-feature file with a WaveNet vocoder.

    Reads docopt args, loads an optional preset plus --hparams overrides,
    restores the model checkpoint, then runs batch generation over the
    data loader built from the conditional feature file and writes wavs.
    """
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_path = args["<checkpoint>"]
    dst_dir = args["<dst_dir>"]
    length = int(args["--length"])
    initial_value = args["--initial-value"]
    initial_value = None if initial_value is None else float(initial_value)
    conditional_path = args["--conditional"]
    file_name_suffix = args["--file-name-suffix"]
    output_html = args["--output-html"]
    speaker_id = args["--speaker-id"]
    speaker_id = None if speaker_id is None else int(speaker_id)
    preset = args["--preset"]

    # Force CPU synthesis mode if required
    # NOTE(review): `use_cuda` and `device` are only assigned inside this
    # branch; without --force-cpu they must come from module globals or
    # this raises NameError below — confirm against the full file.
    if args["--force-cpu"]:
        use_cuda = False
        device = torch.device("cpu")

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    # Load conditional features; transpose if stored as (T, C).
    if conditional_path is not None:
        c = np.load(conditional_path)
        if c.shape[1] != hparams.num_mels:
            c = np.swapaxes(c, 0, 1)
    else:
        c = None

    from train import build_model

    # Model
    model = build_model().to(device)

    # Load checkpoint
    print("Load checkpoint from {}".format(checkpoint_path))
    if use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    os.makedirs(dst_dir, exist_ok=True)
    if not file_name_suffix:
        file_name_suffix = splitext(basename(conditional_path))[0]
    dst_wav_path = join(dst_dir, "{}.wav".format(file_name_suffix))

    # Prepare mel spectrogram condition
    C = FileSourceDataset(SingleFileDataSource(conditional_path))
    data_loader = data_utils.DataLoader(C,
                                        batch_size=hparams.batch_size,
                                        drop_last=False,
                                        num_workers=hparams.num_workers,
                                        sampler=None,
                                        shuffle=False,
                                        collate_fn=dummy_collate,
                                        pin_memory=hparams.pin_memory)

    cin_pad = hparams.cin_pad
    for idx, (x, y, c, g, input_lengths) in enumerate(data_loader):
        if cin_pad > 0:
            c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")
        # B x 1 x T
        if x[0] is not None:
            B, _, T = x.shape
        else:
            B, _, Tn = c.shape
            T = Tn * audio.get_hop_size()
        # DO generate
        y_hats = batch_wavegen(model, c=c, g=g, fast=True, tqdm=tqdm)
        for i, (ref, gen, length) in enumerate(zip(x, y_hats, input_lengths)):
            gen = gen[:length]
            gen = np.clip(gen, -1.0, 1.0)
            # save
            # NOTE(review): every iteration writes to the same dst_wav_path,
            # so only the last generated sample survives — confirm intended.
            wavfile.write(dst_wav_path, hparams.sample_rate, to_int16(gen))

    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)
def load_model(name: str, device="cpu"):
    """Load a pretrained speech model/vocoder by name.

    Dispatches on the lower-cased *name* to NeMo pretrained models, local
    encoder wrappers, or vocoders whose weights are downloaded on first
    use.  Return type varies by branch (model object, ``(hparams, model)``
    tuple for wavenet); raises NotImplementedError for unknown names.
    """
    if name.lower() == 'uniglow':
        from nemo.collections.tts.models import UniGlowModel
        return UniGlowModel.from_pretrained(model_name="tts_uniglow",
                                            map_location=device)
    elif name.lower() == 'tacotron':
        import nemo.collections.tts as nemo_tts
        return nemo_tts.models.Tacotron2Model.from_pretrained(
            model_name="Tacotron2-22050Hz", map_location=device)
    elif name.lower() == 'quartznet':
        import nemo.collections.asr as nemo_asr
        return nemo_asr.models.EncDecCTCModel.from_pretrained(
            model_name="QuartzNet15x5Base-En", map_location=device)
    elif name.lower() == 'speakerverification_speakernet':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name="speakerverification_speakernet", map_location=device)
        return stt
    elif name.lower() == 'speakerrecognition_speakernet':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name="speakerrecognition_speakernet", map_location=device)
        return stt
    elif name.lower() == 'jasper':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.ASRModel.from_pretrained(
            model_name="stt_en_jasper10x5dr")
        return stt
    elif name.lower() == 'quartznet_de':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.ASRModel.from_pretrained(
            model_name="stt_de_quartznet15x5")
        return stt
    elif name.lower() == 'deepspeech2':
        from .deep_speech import DeepSpeechEncoderWrapper
        # Download the checkpoint on first use.
        if os.path.isfile('weights/an4_pretrained_v2.pth'):
            return DeepSpeechEncoderWrapper("weights/an4_pretrained_v2.pth",
                                            device=device)
        else:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            wget.download(
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/an4_pretrained_v2.pth",
                out="weights")
            return DeepSpeechEncoderWrapper("weights/an4_pretrained_v2.pth",
                                            device=device)
    elif name.lower() == 'wav2vec2':
        from .wav2vec2 import Wav2Vec2FullEncoder
        return Wav2Vec2FullEncoder(device)
    elif name.lower() == 'wav2vec2_conv':
        from .wav2vec2 import Wav2Vec2ConvEncoder
        return Wav2Vec2ConvEncoder(device)
    elif name.lower() == 'melgan':
        import torch
        import gdown
        os.makedirs("weights", exist_ok=True)
        # wget.download('https://github.com/descriptinc/melgan-neurips/archive/master.zip', out="weights")
        url = 'https://drive.google.com/uc?id=' + '1vNp5ZsfEBZQBXqsUOJZUYTkTedk6HZQS'
        gdown.download(url, 'weights/melgan-neurips-master.zip', quiet=True)
        os.system('unzip weights/melgan-neurips-master.zip -d weights/')
        vocoder = torch.hub.load('weights/melgan-neurips-master',
                                 'load_melgan',
                                 source='local')
        return vocoder
    elif name.lower() == 'waveglow':
        from .waveglow import Vocoder
        vocoder = Vocoder().to(device)
        return vocoder
    elif name.lower() == 'wavenet':
        wn_preset = "weights/20180510_mixture_lj_checkpoint_step000320000_ema.json"
        wn_checkpoint_path = "weights/20180510_mixture_lj_checkpoint_step000320000_ema.pth"
        # Fetch preset/checkpoint via curl if missing.
        if not os.path.exists(wn_preset):
            os.makedirs("weights", exist_ok=True)
            # wget.download(
            #     'https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json',
            #     out="weights"
            # )
            os.system(
                'curl -L "https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json" -o weights/20180510_mixture_lj_checkpoint_step000320000_ema.json'
            )
        if not os.path.exists(wn_checkpoint_path):
            os.makedirs("weights", exist_ok=True)
            # wget.download(
            #     'https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth',
            #     out="weights"
            # )
            os.system(
                'curl -L "https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth" -o weights/20180510_mixture_lj_checkpoint_step000320000_ema.pth'
            )
        from hparams import hparams
        with open(wn_preset) as f:
            hparams.parse_json(f.read())
        import sys
        sys.path.append('thirdparty/wavenet_vocoder')
        from train import build_model
        from synthesis import wavegen
        import torch
        model = build_model().to(device)
        print("Load checkpoint from {}".format(wn_checkpoint_path))
        checkpoint = torch.load(wn_checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["state_dict"])
        return (hparams, model)
    elif name.lower() in ['hifigan', 'hifigan_v1', 'hifigan_v2', 'hifigan_v3']:
        import gdown
        name = name.lower()
        header = "https://drive.google.com/uc?id="
        if name in ['hifigan', 'hifigan_v1']:
            name = 'hifigan_v1'
            model_url = header + "1QEBKespXTmsMzsSRBXWdpIT0Ve7nnaRZ"
            config_url = header + "1l5EUVBKM0SK7ec4HWf_wZvEITAsdOLFC"
        elif name == 'hifigan_v2':
            model_url = header + "1I415g2Cdx5FWy6ECma0zEc9GhX_TnbFv"
            config_url = header + "11LnhSum3EAeo5zag-tpU8HKk0MdbrQxF"
        else:
            model_url = header + "1fnkOteyRdPq4Gh2cfso3gqqrC6inLWsF"
            config_url = header + "1mke75axgO2sdJ41GL2HTrcb4KyAl0i45"
        if not os.path.exists(f'pretrained/{name}'):
            os.makedirs(f'pretrained/{name}', exist_ok=True)
            model_output = f'pretrained/{name}/model.pth'
            config_output = f'pretrained/{name}/config.json'
            gdown.download(model_url, model_output, quiet=True)
            gdown.download(config_url, config_output, quiet=True)
        # NOTE(review): this branch downloads the weights but falls through
        # and returns None — the caller presumably loads from
        # pretrained/<name>/ itself; confirm.
    elif name.lower() == "wave2vec_mos":
        from .wav2vec2 import Wav2Vec2MOS
        import gdown
        if not os.path.isfile('weights/wave2vec2mos.pth'):
            if not os.path.exists("weights"):
                os.makedirs("weights")
            gdown.download(
                "https://drive.google.com/uc?id=18kMTxj2VbRDrs_CBcCmZT-kGTvFvbVmm",
                output="weights/wave2vec2mos.pth")
        return Wav2Vec2MOS('weights/wave2vec2mos.pth')
    else:
        raise NotImplementedError