def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
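# NOTE: preemphasis() is called in convert() above (and in the other functions
# below) but is not defined in this excerpt. The sketch below is a typical
# first-order pre-emphasis filter, y[t] = x[t] - coef * x[t-1]; the coefficient
# default of 0.97 is an assumption and may differ from the repo's own helper.
import numpy as np
from scipy.signal import lfilter


def preemphasis(wav, coef=0.97):
    # Boost high frequencies before computing the mel spectrogram.
    return lfilter([1.0, -coef], [1.0], wav)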
def encode_dataset(cfg):
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(root_path / "test.json") as file:
        metadata = json.load(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])

    encoder.eval()

    if cfg.save_auxiliary:
        auxiliary = []

        def hook(module, input, output):
            auxiliary.append(output.clone())

        encoder.encoder[-1].register_forward_hook(hook)

    for _, _, _, path in tqdm(metadata):
        path = root_path.parent / path
        mel = torch.from_numpy(np.load(path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
        with torch.no_grad():
            z, c, indices = encoder.encode(mel)

        z = z.squeeze().cpu().numpy()

        out_path = out_dir / path.stem
        with open(out_path.with_suffix(".txt"), "w") as file:
            np.savetxt(file, z, fmt="%.16f")

        if cfg.save_auxiliary:
            aux_path = out_dir.parent / "auxiliary_embedding1"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            c = c.squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, c, fmt="%.16f")

            aux_path = out_dir.parent / "auxiliary_embedding2"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            aux = auxiliary.pop().squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, aux, fmt="%.16f")
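# Example (assumed usage): the encodings written by encode_dataset() are plain
# text matrices saved with np.savetxt, so they can be read back with np.loadtxt.
# "out_dir/utterance_id.txt" is a hypothetical path used only for illustration.
import numpy as np

z = np.loadtxt("out_dir/utterance_id.txt")
print(z.shape)  # typically (n_frames, embedding_dim)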
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path(
        "datasets")) / cfg.dataset.path  # zerospeech/datasets/2019/english
    with open(dataset_path / "speakers.json") as file:  # file listing the speaker names
        speakers = sorted(json.load(file))  # stored in the `speakers` object

    synthesis_list_path = Path(utils.to_absolute_path(
        cfg.synthesis_list))  # marked ??? in the config, so it must be set when running from Python
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(
            file)  # see synthesis.json in datasets/2019/english

    in_dir = Path(utils.to_absolute_path(
        cfg.in_dir))  # ??? in the config; pointing it at the zerospeech folder (./) should work
    out_dir = Path(utils.to_absolute_path(
        cfg.out_dir))  # ??? in the config; directory where the converted voices are written
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU if no GPU

    encoder = Encoder(
        **cfg.model.encoder)  # encoder settings from ZeroSpeech/config/model/default
    decoder = Decoder(
        **cfg.model.decoder)  # decoder settings from ZeroSpeech/config/model/default
    encoder.to(device)  # cpu or gpu
    decoder.to(device)  # cpu or gpu

    # ??? in the config; point this at a pretrained model, or at your own checkpoint
    # if you have trained one.
    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)  # load the weights stored in the checkpoint
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    # Loudness meter at the config sample rate (sr=16000); pyloudnorm measures and
    # normalizes loudness, see https://www.christiansteinmetz.com/projects-blog/pyloudnorm
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    # Each entry looks like ("english/test/S002_0379088085", "V002", "V002_0379088085")
    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path  # e.g. ./english/test/S002_0379088085
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)  # measure the loudness of the input
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        # unsqueeze() inserts a new dimension at the given position (here the batch axis);
        # see https://subinium.github.io/pytorch-Tensor-Variable/#%EB%8D%94%EB%AF%B8-%EC%B0%A8%EC%9B%90-%EC%B6%94%EA%B0%80%EC%99%80-%EC%82%AD%EC%A0%9C--squeeze--unsqueeze
        # and https://datascienceschool.net/view-notebook/4f3606fd839f4320a4120a56eec1e228/
        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)

        # Also a tensor, but with an integer dtype. Tensors carry a dtype: 32-bit floats
        # use torch.FloatTensor, 64-bit signed integers use torch.LongTensor, and there
        # are GPU counterparts such as torch.cuda.FloatTensor. mel holds real values,
        # while the speaker index is an integer.
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)

        # no_grad() disables autograd tracking for these ops;
        # see https://bob3rdnewbie.tistory.com/315
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        output_loudness = meter.integrated_loudness(output)  # measure the loudness of the output
        output = pyloudnorm.normalize.loudness(output, output_loudness,
                                               ref_loudness)  # match the output loudness to the input wav

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
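# About the normalization above: amplitude_to_db clips values to at most top_db
# below the peak, and because the waveform is peak-normalized the peak sits near
# 0 dB, so `logmel / top_db + 1` lands roughly in [0, 1]. A minimal sketch of the
# inverse mapping (back to dB, then to linear magnitude); denormalize_logmel and
# the top_db default of 80 are assumptions for illustration, not repo code.
import librosa


def denormalize_logmel(logmel, top_db=80):
    db = (logmel - 1.0) * top_db        # [0, 1] -> [-top_db, 0] dB
    return librosa.db_to_amplitude(db)  # dB -> linear magnitude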
def DDF(cfg):
    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            # librosa.load returns the audio time series and its sampling rate
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            path = out_dir / out_filename

            # Return the raw recording as a mel spectrogram, without any filtering
            if cfg.output_type == "Embedding":
                mel = librosa.feature.melspectrogram(
                    preemphasis(wav, cfg.preprocessing.preemph),
                    sr=cfg.preprocessing.sr,
                    n_fft=cfg.preprocessing.n_fft,
                    n_mels=cfg.preprocessing.n_mels,
                    hop_length=cfg.preprocessing.hop_length,
                    win_length=cfg.preprocessing.win_length,
                    fmin=cfg.preprocessing.fmin,
                    power=1)
                logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
                logmel = logmel / cfg.preprocessing.top_db + 1
                # Keep the array on the CPU: .numpy() cannot be called on a CUDA tensor
                mel = torch.FloatTensor(logmel).squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".mel.txt"), mel)

            # Return the raw recording as a waveform, without any filtering
            if cfg.output_type == "Recording":
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         wav.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Moderate":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1

            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)

            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)

                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    speaker = decoder.speaker(speaker)

                # Move both tensors back to the CPU before converting to numpy
                vq = vq.squeeze().cpu().numpy()
                speaker = speaker.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
                np.savetxt(path.with_suffix(".speaker.txt"), speaker)

    # ---------------------------------------
    if cfg.privacy_preference == "High":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1

            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)

            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)

                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)

                vq = vq.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
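# The filter list (cfg.filter_list) is unpacked above as
# (wav_path, speaker_id, out_filename) triples. A minimal sketch of such a file,
# reusing the example entry quoted in the comments of convert() above;
# "example_filter_list.json" is a hypothetical path for illustration only.
import json

filter_list = [
    ["english/test/S002_0379088085", "V002", "V002_0379088085"],
]
with open("example_filter_list.json", "w") as file:
    json.dump(filter_list, file, indent=2)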
def train_model(cfg):
    tensorboard_path = Path(utils.to_absolute_path("tensorboard")) / cfg.checkpoint_dir
    checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir))
    writer = SummaryWriter(tensorboard_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    vocoder = Vocoder(**cfg.model.vocoder)
    encoder.to(device)
    vocoder.to(device)

    optimizer = optim.Adam(vocoder.parameters(), lr=cfg.training.optimizer.lr)
    vocoder, optimizer = amp.initialize(vocoder, optimizer, opt_level="O1")
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.training.scheduler.milestones,
        gamma=cfg.training.scheduler.gamma)

    if cfg.resume:
        print("Resume checkpoint from: {}:".format(cfg.resume))
        resume_path = utils.to_absolute_path(cfg.resume)
        checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage)
        vocoder.load_state_dict(checkpoint["vocoder"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        amp.load_state_dict(checkpoint["amp"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    print("Resume cpc encoder from: {}:".format(cfg.cpc_checkpoint))
    encoder_path = utils.to_absolute_path(cfg.cpc_checkpoint)
    checkpoint = torch.load(encoder_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder.eval()

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    dataset = WavDataset(
        root=root_path,
        hop_length=cfg.preprocessing.hop_length,
        sr=cfg.preprocessing.sr,
        sample_frames=cfg.training.sample_frames)

    dataloader = DataLoader(
        dataset,
        batch_size=cfg.training.batch_size,
        shuffle=True,
        num_workers=cfg.training.n_workers,
        pin_memory=True,
        drop_last=True)

    n_epochs = cfg.training.n_steps // len(dataloader) + 1
    start_epoch = global_step // len(dataloader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0

        for i, (audio, mels, speakers) in enumerate(tqdm(dataloader), 1):
            audio, mels, speakers = audio.to(device), mels.to(device), speakers.to(device)

            optimizer.zero_grad()

            with torch.no_grad():
                _, _, indices = encoder.encode(mels)

            output = vocoder(audio[:, :-1], indices, speakers)
            loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1)
            optimizer.step()
            scheduler.step()

            average_loss += (loss.item() - average_loss) / i

            global_step += 1

            if global_step % cfg.training.checkpoint_interval == 0:
                save_checkpoint(vocoder, optimizer, amp, scheduler, global_step, checkpoint_dir)

        writer.add_scalar("loss/train", average_loss, global_step)

        print("epoch:{}, loss:{:.3E}".format(epoch, average_loss))
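# NOTE: save_checkpoint() is called in the training loop above but is not defined
# in this excerpt. Below is a minimal sketch consistent with the keys read back
# in the resume branch ("vocoder", "optimizer", "amp", "scheduler", "step");
# the real helper may name its files differently.
import torch


def save_checkpoint(vocoder, optimizer, amp, scheduler, step, checkpoint_dir):
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / "model.ckpt-{}.pt".format(step)  # hypothetical naming
    torch.save({
        "vocoder": vocoder.state_dict(),
        "optimizer": optimizer.state_dict(),
        "amp": amp.state_dict(),
        "scheduler": scheduler.state_dict(),
        "step": step,
    }, checkpoint_path)
    print("Saved checkpoint: {}".format(checkpoint_path))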
def convert():
    '''
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))
    '''
    dataset_path = Path('./cfg').absolute()
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    with open(Path("./cfg/cfg.json").absolute()) as file:
        para = json.load(file)

    synthesis_list_path = Path('./dataset/english/synthesis.txt').absolute()
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path('./dataset/english').absolute()
    out_dir = Path('./output').absolute()
    out_dir.mkdir(exist_ok=True, parents=True)
    print(synthesis_list)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=para['encoder']['in_channels'],
                      channels=para['encoder']['channels'],
                      n_embeddings=para['encoder']['n_embeddings'],
                      embedding_dim=para['encoder']['embedding_dim'],
                      jitter=para['encoder']['jitter'])
    decoder = Decoder(
        in_channels=para['decoder']['in_channels'],
        conditioning_channels=para['decoder']['conditioning_channels'],
        n_speakers=para['decoder']['n_speakers'],
        speaker_embedding_dim=para['decoder']['speaker_embedding_dim'],
        mu_embedding_dim=para['decoder']['mu_embedding_dim'],
        rnn_channels=para['decoder']['rnn_channels'],
        fc_channels=para['decoder']['fc_channels'],
        bits=para['decoder']['bits'],
        hop_length=para['decoder']['hop_length'])
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format('./checkpoint/model.pt'))
    checkpoint_path = Path('./checkpoint/model.pt').absolute()
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    # meter = pyloudnorm.Meter(160000)
    print('load finish')

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=para['preprocess']['sr'])
        # ref_loudness = meter.integrated_loudness(wav)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, para['preprocess']['preemph']),
            sr=para['preprocess']['sr'],
            n_fft=para['preprocess']['n_fft'],
            n_mels=para['preprocess']['n_mels'],
            hop_length=para['preprocess']['hop_length'],
            win_length=para['preprocess']['win_length'],
            fmin=para['preprocess']['fmin'],
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=para['preprocess']['top_db'])
        logmel = logmel / para['preprocess']['top_db'] + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        # output_loudness = meter.integrated_loudness(output)
        # output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=para['preprocess']['sr'])
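# ./cfg/cfg.json must provide every key read in convert() above. A minimal
# sketch that validates the required keys; check_cfg is a hypothetical helper
# added for illustration, and the actual values are repo-specific and not shown.
import json
from pathlib import Path

REQUIRED_KEYS = {
    "encoder": ["in_channels", "channels", "n_embeddings", "embedding_dim", "jitter"],
    "decoder": ["in_channels", "conditioning_channels", "n_speakers",
                "speaker_embedding_dim", "mu_embedding_dim", "rnn_channels",
                "fc_channels", "bits", "hop_length"],
    "preprocess": ["sr", "preemph", "n_fft", "n_mels", "hop_length",
                   "win_length", "fmin", "top_db"],
}


def check_cfg(path="./cfg/cfg.json"):
    with open(Path(path).absolute()) as file:
        para = json.load(file)
    for section, keys in REQUIRED_KEYS.items():
        missing = [k for k in keys if k not in para.get(section, {})]
        if missing:
            raise KeyError("cfg.json [{}] missing keys: {}".format(section, missing))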