# NOTE(review): this chunk starts mid-call — the opening
# `parser = argparse.ArgumentParser(` line lies above the visible region.
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# CLI: which dataset to train on, plus an optional pre-trained checkpoint
# to warm-start (transfer-learn) from.
parser.add_argument("--dataset", required=True, help='dataset name')
parser.add_argument("--warmstart", help='Warmstart (transfer learn) from a pre-trained model')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    # Let cuDNN benchmark and cache the fastest kernels per input shape.
    torch.backends.cudnn.benchmark = True

# Dataset dispatch: any name other than the two built-ins goes through the
# generic dataset, which additionally takes the dataset name as an argument.
if args.dataset not in ['ljspeech', 'mbspeech']:
    from datasets.generic import vocab, Generic as SpeechDataset
    train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates'], args.dataset), batch_size=64, mode='train')
    valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates'], args.dataset), batch_size=64, mode='valid')
else:
    # Built-in datasets ship their own vocab and dataset class.
    if args.dataset == 'ljspeech':
        from datasets.lj_speech import vocab, LJSpeech as SpeechDataset
    elif args.dataset == 'mbspeech':
        from datasets.mb_speech import vocab, MBSpeech as SpeechDataset
    train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
        ['texts', 'mels', 'mel_gates']), batch_size=64, mode='train')
    # NOTE(review): chunk is truncated here, mid-construction of the
    # validation loader.
    valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
def valid_dataloader(self, dataset, batch_size=32,
                     num_workers=0 if sys.platform.startswith('win') else 8):
    """Wrap *dataset* in a Text2MelDataLoader configured for validation.

    The default worker count is fixed once, at import time: 0 on Windows,
    where DataLoader worker processes are costly to spawn, and 8 elsewhere.
    """
    loader_kwargs = {
        'batch_size': batch_size,
        'mode': 'valid',
        'num_workers': num_workers,
    }
    return Text2MelDataLoader(dataset, **loader_kwargs)
# NOTE(review): this chunk starts inside a dataset-dispatch if/elif chain —
# the opening `if args.dataset == ...:` branch lies above the visible region.
    from datasets.swara import vocab, SWARA as SpeechDataset
elif args.dataset == 'swara_test':
    from datasets.swara_test import vocab, SWARA as SpeechDataset
else:
    # Unknown dataset name: fail fast rather than train on the wrong data.
    print('No such dataset')
    sys.exit(1)

# os.environ["CUDA_VISIBLE_DEVICES"]="3"
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    # Let cuDNN benchmark and cache the fastest kernels per input shape.
    torch.backends.cudnn.benchmark = True

# Multi-speaker batches: speaker ids and filenames travel with each example.
train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
    ['texts', 'mels', 'mel_gates', 'speakers', 'filenames']), batch_size=16, mode='train')
valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(
    ['texts', 'mels', 'mel_gates', 'speakers', 'filenames']), batch_size=16, mode='valid')

# Model, optimiser and bookkeeping for the training loop that follows.
text2mel = Text2Mel(vocab).cuda()
optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)
start_timestamp = int(time.time() * 1000)  # wall-clock base for logging, in ms
start_epoch = 0
global_step = 0
logger = Logger(args.dataset, 'text2mel')
# NOTE(review): this chunk starts mid-call — the opening
# `parser = argparse.ArgumentParser(` line lies above the visible region.
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# CLI: which voice to train and which CSV script lists its utterances.
parser.add_argument("--voice", default='Keira', help='voice name')
parser.add_argument("--script", default='Keira_all.csv', help='script filename')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    # Let cuDNN benchmark and cache the fastest kernels per input shape.
    torch.backends.cudnn.benchmark = True

# Load the dataset once, then share it between the train and valid loaders.
Speech.load(['texts', 'mels', 'mel_gates'], args.voice, args.script)
train_data_loader = Text2MelDataLoader(Speech, batch_size=hp.text2mel_batch_size, mode='train')
valid_data_loader = Text2MelDataLoader(Speech, batch_size=hp.text2mel_batch_size, mode='valid')

text2mel = Text2Mel(vocab).cuda()
optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)
start_timestamp = int(time.time() * 1000)  # wall-clock base for logging, in ms
start_epoch = 0
global_step = 0
# Run name encodes the voice, script and key hyper-parameters.
# NOTE(review): chunk is truncated here, inside the Logger(...) call.
logger = Logger(
    f'{args.voice}-{args.script}-{hp.d}-{hp.text2mel_lr}-{hp.text2mel_batch_size}',
# NOTE(review): this chunk starts in the else-branch of a device-selection
# `if` whose condition lies above the visible region.
else:
    index = 'cpu'
device = select_device(index)

# Config: use --config when given, otherwise fall back to config/default.yaml
# under the current working directory.
hparams = HParam(args.config) \
    if args.config else HParam(osp.join(osp.abspath(os.getcwd()), 'config', 'default.yaml'))

# Restore the duration-extractor model from the requested checkpoint (or the
# most recent one in the run's log dir) and put it in eval mode.
checkpoint = args.checkpoint or get_last_chkpt_path(
    osp.join(hparams.trainer.logdir, f"{hparams.data.dataset}-{args.name}"))
extractor = DurationTrainer(hparams, device=device).load_checkpoint(checkpoint).model
extractor.train(False)

dataset_root = osp.join(hparams.data.datasets_path, hparams.data.dataset_dir)
dataset = SpeechDataset(['mels', 'mlens', 'texts', 'tlens', 'files'], dataset_root, hparams.text)
# mode='whole' — presumably iterates the full dataset without a split; verify
# against Text2MelDataLoader.
dataloader = Text2MelDataLoader(dataset, args.batch_size, mode='whole')
normalizer = MinMaxNorm(hparams.audio.spec_min, hparams.audio.spec_max)

pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size, \
    disable=hparams.trainer.disable_progress_bar)
with open(osp.join(dataset.path, 'duration.txt'), 'w', encoding='utf-8') as fw:
    for it, batch in enumerate(pbar, start=1):
        # Length tensors appear to arrive as (batch, 1) and are squeezed to
        # 1-D — TODO confirm against the dataset's collate function.
        mels, mlens, texts, tlens = \
            batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1)
        mels, mlens, texts, tlens = \
            mels.to(device), mlens.to(device), texts.to(device), tlens.to(device)
        # Rescale mels into the range the extractor was trained on.
        mels = normalizer(mels)
        with torch.no_grad():
            # NOTE(review): chunk is truncated shortly after this point;
            # the loop body continues beyond the visible region.
            melspecs, attns = extractor((texts, tlens, mels, True))