def run_inference_on_tier(source, tier, text, timestep):
    # Returns a tuple (next_tier, inference):
    # inference is the conditional inference on the current tier;
    # next_tier interleaves the inference with the input to generate the next tier.
    args = parse_inference_args(['-c', 'config/blizzard_compressed_experiments.yaml',
                                 '-p', 'config/inference.yaml',
                                 '-t', str(timestep),
                                 '-n', 'hw_blizzard_compressed',
                                 '-i', text])
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    inference_hp = HParam(args.infer_config)
    assert timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

    model = MelNet(hp, args, inference_hp).cuda()
    model.load_tiers()
    model.eval()

    audio_lengths = torch.LongTensor([0]).cuda()
    if tier > 1:
        for t in tqdm(range(model.args.timestep // model.t_div)):
            audio_lengths += 1

    x = torch.unsqueeze(torch.from_numpy(source), 0)
    mu, std, pi = model.tiers[tier](x, audio_lengths)
    temp = sample_gmm(mu, std, pi)
    next_tier = model.tierutil.interleave(x, temp, tier + 1)
    return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
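# A minimal sketch of GMM sampling as used by sample_gmm above, assuming mu,
# std, pi share a trailing mixture dimension, pi holds unnormalized logits,
# and std is already positive. The repo's actual helper may differ.
def sample_gmm_sketch(mu, std, pi):
    # Pick one mixture component per element from the categorical over pi.
    idx = torch.distributions.Categorical(logits=pi).sample().unsqueeze(-1)
    mu_sel = torch.gather(mu, -1, idx).squeeze(-1)
    std_sel = torch.gather(std, -1, idx).squeeze(-1)
    # Draw from the selected Gaussian.
    return mu_sel + std_sel * torch.randn_like(mu_sel)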
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
            mel = torch.load(melpath)
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            # pad input mel with zeros to cut artifact
            # see https://github.com/seungwonpark/melgan/issues/8
            zero = torch.full((1, hp.audio.n_mel_channels, 10), -11.5129).cuda()
            mel = torch.cat((mel, zero), dim=2)

            audio = model(mel)
            audio = audio.squeeze()  # collapse all dimensions except the time axis
            audio = audio[:-(hp.audio.hop_length * 10)]  # trim the padded tail
            audio = MAX_WAV_VALUE * audio
            audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE)
            audio = audio.short()
            audio = audio.cpu().detach().numpy()

            out_path = melpath.replace('.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
            write(out_path, hp.audio.sampling_rate, audio)
def get_checkpoint_loss(chkpt_path):
    config_path = 'config/blizzard.yaml'
    # Recover the tier index from the checkpoint filename. Yes, it's hacky, sorry.
    tier = int(chkpt_path.split('tier')[1].split('_')[0])
    print('tier: %d' % tier)

    checkpoint = torch.load(chkpt_path)
    print("Checkpoint loaded")
    hp = HParam(config_path)
    with open(config_path, 'r') as f:
        model_hp = checkpoint['hp_str']
        hp_str = ''.join(f.readlines())
        if model_hp != hp_str:
            print(model_hp)
            print('')
            print(hp_str)
            print("ERROR: ISSUE WITH DIFFERENT HPs")

    model = get_model(tier, hp)
    print("Got model")
    model.load_state_dict(checkpoint['model'])
    print("Model loaded")
    optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam.lr)
    print("Got optimizer")
    args = get_args(tier)
    print("Got args")
    testloader = get_testloader(hp, args)
    print("Got testloader")
    loss = compute_loss(args, model, testloader, criterion)
    print("Got loss")
    return loss
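# Hedged sketch of a compute_loss helper like the one called above: average the
# criterion over the test loader under no_grad. The batch layout and criterion
# signature here are assumptions for illustration, not the repo's actual API.
def compute_loss_sketch(args, model, testloader, criterion):
    model.eval()
    total, n = 0.0, 0
    with torch.no_grad():
        for x, audio_lengths, target in testloader:  # hypothetical batch layout
            mu, std, pi = model(x.cuda(), audio_lengths.cuda())
            total += criterion(target.cuda(), mu, std, pi).item()
            n += 1
    return total / max(n, 1)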
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                              ratios=hp.model.generator_ratio, mult=hp.model.mult,
                              out_band=hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model.inference(mel)
        audio = audio.squeeze(0)  # collapse all dimensions except the time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.01)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace('.npy', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
            mel = torch.load(melpath)
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()

            out_path = melpath.replace('.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
            write(out_path, hp.audio.sampling_rate, audio)
def main(cmd_args):
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    args = parser.parse_args(cmd_args)

    hp = HParam(args.config)
    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = fs2.FeedForwardTransformer(idim, odim, hp)

    my_script_module = torch.jit.script(model)
    print("Scripting")
    my_script_module.save("{}/{}.pt".format(args.outdir, args.name))
    print("Script done")

    if args.trace:
        print("Tracing")
        model.eval()
        with torch.no_grad():
            my_trace_module = torch.jit.trace(model, torch.ones(50).to(dtype=torch.int64))
        my_trace_module.save("{}/trace_{}.pt".format(args.outdir, args.name))
        print("Trace Done")
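# Hedged usage sketch: load the scripted module back and run it on a dummy
# phoneme-id sequence (shape and dtype mirror the trace input above). The
# path is illustrative.
loaded = torch.jit.load("outdir/model_name.pt")
with torch.no_grad():
    out = loaded(torch.ones(50, dtype=torch.int64))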
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model(mel)  # For multi-band inference
        print(audio.shape)
        audio = audio.squeeze(0)  # collapse all dimensions except the time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.1)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace('.npy', '_hifi_GAN_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
def load_testset():
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    dataset = []

    # prompts.gui stores records as three-line groups: filename, sentence, extra.
    with open(os.path.join(hp.data.path, 'prompts.gui'), 'r') as f:
        lines = f.read().splitlines()
        filenames = lines[::3]
        sentences = lines[1::3]
        raw_data = list(zip(filenames, sentences))

    # Hold out the last 5% after a seeded shuffle.
    random.seed(123)
    random.shuffle(raw_data)
    raw_data = raw_data[int(0.95 * len(raw_data)):]

    for filename, sentence in tqdm(raw_data, total=len(raw_data)):
        wav_path = os.path.join(hp.data.path, 'wavn', filename + '.wav')
        length = get_length(wav_path, hp.audio.sr)
        if length < hp.audio.duration:
            dataset.append((wav_path, sentence))

    for i in range(len(dataset)):
        text = dataset[i][1]
        wav = read_wav_np(dataset[i][0], sample_rate=hp.audio.sr)
        filename = os.path.basename(dataset[i][0])
        yield filename, text, wav
def reconstruct_audio(filename, tier_to_breakdown):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None

    # Verify that tier 2 is conditionally generated from just tier 1
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]).all(), \
        "Tier 2 not created from Tier 1"

    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor, target_tensor, tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]

        # Verify that interleaving the source and target of the current tier
        # conditionally generates the source of the next tier.
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(), \
                "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel

    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
def main(cmd_args):
    """Run evaluation."""
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    args = parser.parse_args(cmd_args)

    if os.path.exists(args.checkpoint_path):
        checkpoint = torch.load(args.checkpoint_path)
    else:
        print("Checkpoint does not exist")
        return None

    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint["hp_str"])

    validloader = loader.get_tts_dataset(hp.data.data_dir, 1, hp, True)
    print("Checkpoint : ", args.checkpoint_path)

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(idim, odim, hp)
    model.load_state_dict(checkpoint["model"])

    evaluate(hp, validloader, model)
def get_timestep(wav):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    hop_length = hp.audio.hop_length
    frames = len(wav)
    timestep_goal = float(frames) / float(hop_length)
    # Round down to the largest multiple of 4 strictly below the frame count.
    final_timestep = 4
    while final_timestep < timestep_goal:
        final_timestep += 4
    return final_timestep - 4
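# Worked example of the rounding above (numbers illustrative): a wav of 70_000
# samples with hop_length 256 gives timestep_goal ~= 273.4; the loop stops at
# 276, so get_timestep returns 276 - 4 = 272.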
def inference(text, timestep=64):
    args = parse_inference_args(['-c', 'config/blizzard_compressed_experiments.yaml',
                                 '-p', 'config/inference.yaml',
                                 '-t', str(timestep),
                                 '-n', 'hw_blizzard_compressed',
                                 '-i', text])
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    inference_hp = HParam(args.infer_config)
    assert timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

    model = MelNet(hp, args, inference_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        breakdown, generated = sample_model_with_breakdown(model, args.input)
    melspec = generated[0].cpu().detach().numpy()
    return breakdown, melspec
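# Hedged usage sketch: run conditional inference and keep the per-tier
# breakdown for inspection. The sentence is illustrative; melspec is the
# final mel-spectrogram as a numpy array (roughly num_mels x timestep).
breakdown, melspec = inference("and read all the books on the shelf", timestep=64)
print(melspec.shape)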
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help="yaml file for config.")
    parser.add_argument('-p', '--checkpoint_path', type=str, default=None,
                        help="path of checkpoint pt file for resuming")
    parser.add_argument('-n', '--name', type=str, required=True,
                        help="Name of the model. Used for both logging and saving chkpt.")
    args = parser.parse_args()

    hp = HParam(args.config)
    hp_str = yaml.dump(hp)
    args_str = yaml.dump(vars(args))

    pt_dir = os.path.join(hp.log.chkpt_dir, args.name)
    log_dir = os.path.join(hp.log.log_dir, args.name)
    os.makedirs(pt_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(os.path.join(log_dir, '%s-%d.log' % (args.name, time.time()))),
            logging.StreamHandler()
        ])
    logger = logging.getLogger()
    logger.info('Config by yaml file')
    logger.info(hp_str)
    logger.info('Command Line Config')
    logger.info(args_str)

    if hp.data.train == '' or hp.data.test == '':
        logger.error("train or test data directory cannot be empty.")
        raise Exception("Please specify directories of data in %s" % args.config)

    writer = Writer(hp, log_dir)
    train_loader = create_dataloader(hp, args, DataloaderMode.train)
    test_loader = create_dataloader(hp, args, DataloaderMode.test)

    train(args, pt_dir, train_loader, test_loader, writer, logger, hp, hp_str)
def get_audio():
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    file_list = glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)

    # Hold out the last 5% after a seeded shuffle.
    random.seed(123)
    random.shuffle(file_list)
    file_list = file_list[int(0.95 * len(file_list)):]

    for idx in range(len(file_list)):
        filename = os.path.basename(file_list[idx])
        wav = read_wav_np(file_list[idx], sample_rate=hp.audio.sr)
        yield filename, wav
def deconstruct_audio(wav):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)

    mel = melgen.get_normalized_mel(wav)
    tier_to_breakdown = {}
    for tier in range(1, 7):
        source, target = tierutil.cut_divide_tiers(mel, tier)
        print("Tier %d has source dims: %s, target dims %s" % (tier, source.shape, target.shape))
        tier_to_breakdown[tier] = (source, target)
    # Tier 7 is the full spectrogram itself.
    tier_to_breakdown[7] = (mel, mel)
    return tier_to_breakdown
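# Hedged smoke test tying the helpers above together: deconstruct one wav into
# per-tier (source, target) pairs, then check the interleaving roundtrip and
# write out the reconstructed audio. Uses get_audio, deconstruct_audio and
# reconstruct_audio as defined above.
for filename, wav in get_audio():
    tier_to_breakdown = deconstruct_audio(wav)
    reconstruct_audio(filename, tier_to_breakdown)
    break  # one file is enough for a smoke test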
def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args(['-c', 'config/blizzard_alldata_v5.yaml',
                                 '-p', 'config/inference.yaml',
                                 '-t', str(timestep),
                                 '-n', 'test_tiers',
                                 '-i', SENTENCE])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    audio_lengths = torch.LongTensor([0]).cuda()
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1

    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0)
    for tier in tqdm(range(model.hp.model.tier + 1 - TESTING_TIERS, model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)

        # Save original source and inference source
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        save_image(x.detach().numpy()[0], 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)

        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(), 'tier_%d_inference_target' % tier)

        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x.detach().numpy()[0], 'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)

    reconstructed_mel_tensor = x.detach().numpy()
    return reconstructed_mel_tensor[0]
def test_fastspeech():
    idim = len(valid_symbols)
    hp = HParam("configs/default.yaml")
    hp.train.ngpu = 0
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(idim, odim, hp)

    x = torch.ones(2, 100).to(dtype=torch.int64)
    input_length = torch.tensor([100, 100])
    y = torch.ones(2, 100, 80)
    out_length = torch.tensor([100, 100])
    dur = torch.ones(2, 100)
    e = torch.ones(2, 100)
    p = torch.ones(2, 100)

    loss, report_dict = model(x, input_length, y, out_length, dur, e, p)
def init(config, checkpoint_path, device="cuda"):
    checkpoint = torch.load(checkpoint_path)
    if config is not None:
        hp = HParam(config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                      ratios=hp.model.generator_ratio, mult=hp.model.mult,
                      out_band=hp.model.out_channels).to(device)
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)
    return hp, model
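# Hedged usage sketch for init(): load the vocoder once, then reuse it across
# calls. Paths are illustrative, and the inference call mirrors the VocGAN-style
# main() above; the real entry point may differ.
hp, model = init('config/default.yaml', 'chkpt/vocgan_0300.pt')
with torch.no_grad():
    mel = torch.from_numpy(np.load('sample_mel.npy')).unsqueeze(0).cuda()  # hypothetical input
    audio = model.inference(mel)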
def main(args):
    # NOTE: the incoming CLI args are overridden by this hard-coded dict.
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }
    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()
        mag = mag.unsqueeze(0)
        mask = model(mag, dvec)
        est_mag = mag * mask

        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)
        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
def main(cmd_args):
    """Run training."""
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    args = parser.parse_args(cmd_args)

    hp = HParam(args.config)
    with open(args.config, "r") as f:
        hp_str = "".join(f.readlines())

    # logging info
    os.makedirs(hp.train.log_dir, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(
                os.path.join(hp.train.log_dir, "%s-%d.log" % (args.name, time.time()))),
            logging.StreamHandler(),
        ],
    )
    logger = logging.getLogger()

    # If --ngpu is not given,
    # 1. if CUDA_VISIBLE_DEVICES is set, use all visible devices
    # 2. if nvidia-smi exists, use all devices
    # 3. else ngpu=0
    ngpu = hp.train.ngpu
    logger.info(f"ngpu: {ngpu}")

    # set random seed
    logger.info("random seed = %d" % hp.train.seed)
    random.seed(hp.train.seed)
    np.random.seed(hp.train.seed)

    # load the vocoder for validation
    vocoder = torch.hub.load("seungwonpark/melgan", "melgan")

    if hp.train.GTA:
        create_gta(args, hp, hp_str, logger)
    else:
        train(args, hp, hp_str, logger, vocoder)
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                              ratios=hp.model.generator_ratio, mult=hp.model.mult,
                              out_band=hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        zero = torch.full((1, 80, 10), -11.5129).to(mel.device)
        mel = torch.cat((mel, zero), dim=2)

        vocgan_trace = torch.jit.trace(model, mel)
        vocgan_trace.save("{}/vocgan_ex_female_en_{}_{}.pt".format(
            args.out, checkpoint['githash'], checkpoint['epoch']))
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval()
    # model.remove_weight_norm()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        # zero = torch.full((1, 80, 10), -11.5129).to(mel.device)
        # mel = torch.cat((mel, zero), dim=2)

        hifigan_trace = torch.jit.trace(model, mel)
        hifigan_trace.save("{}/hifigan_{}.pt".format(args.out, args.name))
def main(args):
    """Run decoding."""
    para_mel = []
    parser = get_parser()
    args = parser.parse_args(args)

    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
    print("Text : ", args.text)
    print("Checkpoint : ", args.checkpoint_path)

    if os.path.exists(args.checkpoint_path):
        checkpoint = torch.load(args.checkpoint_path)
    else:
        logging.info("Checkpoint does not exist")
        return None

    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint["hp_str"])

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(idim, odim, hp)  # torch.jit.load("./etc/fastspeech_scrip_new.pt")
    os.makedirs(args.out, exist_ok=True)

    if args.old_model:
        logging.info("\nSynthesis Session...\n")
        model.load_state_dict(checkpoint, strict=False)
    else:
        model.load_state_dict(checkpoint["model"])

    text = process_paragraph(args.text)
    for i in range(0, len(text)):
        txt = preprocess(text[i])
        audio = synth(txt, model, hp)
        m = audio.T
        para_mel.append(m)

    m = torch.cat(para_mel, dim=1)
    np.save("mel.npy", m.cpu().numpy())
    plot_mel(m)

    if hp.train.melgan_vocoder:
        m = m.unsqueeze(0)
        print("Mel shape: ", m.shape)
        vocoder = torch.hub.load("seungwonpark/melgan", "melgan")
        vocoder.eval()
        if torch.cuda.is_available():
            vocoder = vocoder.cuda()
            mel = m.cuda()
        else:
            mel = m
        with torch.no_grad():
            wav = vocoder.inference(mel)  # mel ---> batch, num_mels, frames [1, 80, 234]
        wav = wav.cpu().float().numpy()
    else:
        stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
        print(m.size())
        m = m.unsqueeze(0)
        wav = griffin_lim(m, stft, 30)
        wav = wav.cpu().numpy()

    save_path = "{}/test_tts.wav".format(args.out)
    write(save_path, hp.audio.sample_rate, wav.astype("int16"))
import os

import numpy as np
from tqdm import tqdm

from utils.util import get_files, remove_outlier
from utils.hparams import HParam

if __name__ == "__main__":
    hp = HParam("./configs/default.yaml")

    min_e, max_e = [], []
    min_p, max_p = [], []
    nz_min_p, nz_min_e = [], []

    energy_path = os.path.join(hp.data.data_dir, "energy")
    pitch_path = os.path.join(hp.data.data_dir, "pitch")
    mel_path = os.path.join(hp.data.data_dir, "mels")
    energy_files = get_files(energy_path, extension=".npy")
    pitch_files = get_files(pitch_path, extension=".npy")
    mel_files = get_files(mel_path, extension=".npy")
    assert len(energy_files) == len(pitch_files) == len(mel_files)

    energy_vecs = []
    for f in tqdm(energy_files):
        e = np.load(f)
        e = remove_outlier(e)
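        # (The snippet ends here. A plausible continuation, suggested by the
        # accumulator lists initialized above, is sketched below; it is an
        # assumption, not the original code.)
        energy_vecs.append(e)
        min_e.append(e.min())
        max_e.append(e.max())
        if (e > 0).any():
            nz_min_e.append(e[e > 0].min())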
    # Fragment: body of a loop over test utterances (hence the break below);
    # filename, text, timestep and breakdown come from the enclosing scope.
    # inference_breakdown[i][0]
    # save_image('tier%d_inferred_breakdown_%s.png' % (i, filename), inference_breakdown[i][0])
    # save_image('final_inferred_%s.png' % filename, inferred)

    tier = 5
    source = breakdown[tier][0]
    print("Source tier 5 shape: %s" % str(source.shape))
    save_image('source_tier_%d_%s.png' % (tier, filename), breakdown[tier][0])
    inferred_source_6, inferred_5 = run_inference_on_tier(source, tier, text, timestep)
    print("inferred tier 5 target shape: %s" % str(inferred_5.shape))
    print("inferred tier 6 source shape: %s" % str(inferred_source_6.shape))

    tier = 6
    inferred_final, inferred_6 = run_inference_on_tier(inferred_source_6, tier, text, timestep)
    print("inferred tier 6 target shape: %s" % str(inferred_6.shape))
    print("inferred final shape: %s" % str(inferred_final.shape))
    print("original final shape: %s" % str(breakdown[tier + 1][0].shape))
    save_image('target_tier_%d_%s.png' % (tier, filename), breakdown[tier][1])
    save_image('next_tier_%d_%s.png' % (tier, filename), breakdown[tier + 1][0])
    save_image('inferred_tier_%d_%s.png' % (tier, filename), inferred_6)
    save_image('inferred_next_tier_%d_%s.png' % (tier, filename), inferred_final)

    # Save the actual audio
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    source_wav = melgen.reconstruct_audio(breakdown[tier + 1][0])
    inference_wav = melgen.reconstruct_audio(inferred_final)
    melgen.save_audio('source_' + filename, source_wav)
    melgen.save_audio('inference_' + filename, inference_wav)
    break
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help="yaml file for configuration")
    parser.add_argument('-p', '--infer_config', type=str, required=True,
                        help="yaml file for inference configuration")
    parser.add_argument('-t', '--timestep', type=int, default=240,
                        help="timestep of mel-spectrogram to generate")
    parser.add_argument('-n', '--name', type=str, default="result", required=False,
                        help="Name for sample")
    parser.add_argument('-i', '--input', type=str, default=None, required=False,
                        help="Input for conditional generation, leave empty for unconditional")
    args = parser.parse_args()

    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        generated = model.sample(args.input)

    os.makedirs('temp', exist_ok=True)
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
from helpers.processor import Processor
from datasets.dataset import SpeechDataset

parser = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--config', default=None, type=str, help='Config file path')
parser.add_argument('--compute', action='store_true', help='Pre-compute dataset statistics')
args = parser.parse_args()

hparams = HParam(args.config) \
    if args.config else HParam(osp.join(osp.abspath(os.getcwd()), 'config', 'default.yaml'))

datasets_path = hparams.data.datasets_path
dataset_file_url = \
    f'https://open-speech-data.oss-cn-hangzhou.aliyuncs.com/{hparams.data.dataset_dir}.tar.bz2'
dataset_file_name = osp.basename(dataset_file_url)
dataset_dir = dataset_file_name[:-8]  # strip the '.tar.bz2' suffix
dataset_path = osp.join(datasets_path, dataset_dir)
wavfile_path = osp.join(dataset_path, "wavs")
melspec_path = osp.join(dataset_path, "mels")

# NOTE: the 'and False' disables the early exit, forcing reprocessing.
if osp.isdir(melspec_path) and False:
    print("%s dataset folder already exists" % dataset_dir)
    sys.exit(0)
else:
help="yaml file for configuration") parser.add_argument('-p', '--checkpoint_path', type=str, default=None, help="path of checkpoint pt file to resume training") parser.add_argument( '-n', '--name', type=str, required=True, help="name of the model for logging, saving checkpoint") #argv = ['-c', './config/mb_melgan.yaml', '-n', 'melgan-male', '-p', './checkpoints/mb_melgan_901be72_0600.pt'] args = parser.parse_args() hp = HParam(args.config) with open(args.config, 'r') as f: hp_str = ''.join(f.readlines()) pt_dir = os.path.join(hp.log.chkpt_dir, args.name) log_dir = os.path.join(hp.log.log_dir, args.name) os.makedirs(pt_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True) logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler( os.path.join( log_dir, '%s-%d.log' % (args.name, time.time()))),
help="Directory of VoxCeleb2 dataset, ends with 'aac'") parser.add_argument('-cu', '--current_corpus_dir', type=str, default=None, help="Directory of currentCorpus dataset") parser.add_argument('-o', '--out_dir', type=str, required=True, help="Directory of output training triplet") parser.add_argument('-p', '--process_num', type=int, default=None, help='number of processes to run. default: cpu_count') parser.add_argument('--vad', type=int, default=0, help='apply vad to wav file. yes(1) or no(0, default)') args = parser.parse_args() os.makedirs(args.out_dir, exist_ok=True) # Creates output directory os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True) # Creates train output directory os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True) # Creates test output dorectory hp = HParam(args.config) # hp contains the informations of config.yaml cpu_num = cpu_count() if args.process_num is None else args.process_num if args.libri_dir is None and args.voxceleb_dir is None and args.current_corpus_dir is None: raise Exception("Please provide directory of data") if args.libri_dir is not None: # train_folders = all subfolders of train-clean-100 train_folders = [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-100', '*')) if os.path.isdir(x)] + \ [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-360', '*')) if os.path.isdir(x)] # + \ #[x for x in glob.glob(os.path.join(args.libri_dir, 'train-other-500', '*')) # if os.path.isdir(x)]
                    type=str, help="Append to logdir name")
parser.add_argument("--config", default=None, type=str, help="Config file path")
args = parser.parse_args()

if torch.cuda.is_available():
    index = args.device if args.device else str(0 if gm is None else gm.auto_choice())
else:
    index = 'cpu'
device = select_device(index)

hparams = HParam(args.config) \
    if args.config else HParam(osp.join(osp.abspath(os.getcwd()), "config", "default.yaml"))

logdir = osp.join(hparams.trainer.logdir, "%s-%s" % (hparams.data.dataset, args.name))
checkpoint = args.checkpoint or get_last_chkpt_path(logdir)

normalizer = StandardNorm(hparams.audio.spec_mean, hparams.audio.spec_std)
processor = TextProcessor(hparams.text)
text2mel = ParallelText2Mel(hparams.parallel)
text2mel.eval()

synthesizer = Synthesizer(
    model=text2mel,
    checkpoint=checkpoint,
    processor=processor,
    normalizer=normalizer,