# Assumed module-level imports for these methods: os, random, numpy as np, torch,
# torch.nn as nn, plus the repo's cc(...) (move-to-device) and wav2wav(...) helpers.
def infer_random(self, gender=False):
    # using the first sample from in_test
    if gender:
        # cross-gender pair: male source, female target
        source_sp = random.sample(self.speakers_man, 1)[0]
        target_sp = random.sample(self.speakers_woman, 1)[0]
    else:
        source_sp, target_sp = random.sample(self.speakers, 2)
    source_utt = random.sample(self.indexes[source_sp], 1)[0]
    target_utt = random.sample(self.indexes[target_sp], 1)[0]
    # fixed example pair (overrides the random sample above)
    source_sp = '3436'
    target_sp = '6064'
    source_utt = '172162_000012_000003'
    target_utt = '300880_000029_000002'
    # source_sp = '226'
    # target_sp = '238'
    # source_utt = '001'
    # target_utt = '001'
    display = False
    output_image_path = os.path.join(
        self.args.output_image_path, self.args.val_set, self.args.load_iteration,
        's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))
    output_audio_path = os.path.join(
        self.args.output_audio_path, self.args.val_set, self.args.load_iteration,
        's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))
    os.makedirs(output_image_path, exist_ok=True)
    os.makedirs(output_audio_path, exist_ok=True)
    # load the ground-truth mel spectrograms
    source_spec_ori = self.dataset[f'{source_sp}/{source_utt}/mel'][:]
    target_spec_ori = self.dataset[f'{target_sp}/{target_utt}/mel'][:]
    print('source time {} | target time {}'.format(source_spec_ori.shape, target_spec_ori.shape))
    # pad the time axis to a multiple of b frames
    b = 8
    source_spec, source_len_pad = self.pad_seq(source_spec_ori, base=b)
    target_spec, target_len_pad = self.pad_seq(target_spec_ori, base=b)
    source_spec = cc(torch.from_numpy(source_spec))
    target_spec = cc(torch.from_numpy(target_spec))
    source_dspec = source_spec
    target_dspec = target_spec
    # copy the original waveforms for reference
    wav2wav(self.speaker2filenames[source_sp][source_utt], os.path.join(output_audio_path, 'src_ori.wav'))
    wav2wav(self.speaker2filenames[target_sp][target_utt], os.path.join(output_audio_path, 'tar_ori.wav'))
    # sync: resynthesize the ground-truth mels (vocoder-only reference)
    self.plot_spectrograms(source_spec_ori, os.path.join(output_image_path, 'src_sync.png'))
    self.wave_generate(source_spec_ori, os.path.join(output_audio_path, 'src_sync'))
    self.plot_spectrograms(target_spec_ori, os.path.join(output_image_path, 'tar_sync.png'))
    self.wave_generate(target_spec_ori, os.path.join(output_audio_path, 'tar_sync'))
    # reconstruction (source -> source, target -> target)
    source_spec_rec = self.inference_one_utterance(
        source_spec, source_dspec, source_len_pad,
        pic_path=os.path.join(output_image_path, 's_indices.txt'))
    self.plot_spectrograms(source_spec_rec, os.path.join(output_image_path, 'src_rec.png'))
    self.wave_generate(source_spec_rec, os.path.join(output_audio_path, 'src_rec'))
    target_spec_rec = self.inference_one_utterance(
        target_spec, target_dspec, target_len_pad,
        pic_path=os.path.join(output_image_path, 't_indices.txt'))
    self.plot_spectrograms(target_spec_rec, os.path.join(output_image_path, 'tar_rec.png'))
    self.wave_generate(target_spec_rec, os.path.join(output_audio_path, 'tar_rec'))
    # L1 reconstruction error against the ground-truth mels
    criterion = nn.L1Loss()
    loss_src_rec = criterion(torch.from_numpy(source_spec_rec), torch.from_numpy(source_spec_ori))
    loss_trg_rec = criterion(torch.from_numpy(target_spec_rec), torch.from_numpy(target_spec_ori))
    print('Source Rec Loss: {} | Target Rec Loss: {}'.format(loss_src_rec, loss_trg_rec))
    # conversion (swap speaker)
    s2t_spec = self.inference_one_utterance(source_spec, target_dspec, len_pad=source_len_pad)
    self.plot_spectrograms(s2t_spec, os.path.join(output_image_path, 's2t.png'))
    self.wave_generate(s2t_spec, os.path.join(output_audio_path, 's2t'))
    t2s_spec = self.inference_one_utterance(target_spec, source_dspec, len_pad=target_len_pad)
    self.plot_spectrograms(t2s_spec, os.path.join(output_image_path, 't2s.png'))
    self.wave_generate(t2s_spec, os.path.join(output_audio_path, 't2s'))
    print('Complete...')
    return
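# The self.pad_seq(spec, base=b) calls above assume a helper that pads the time
# axis up to the next multiple of `base` and also returns the pad length so the
# model output can be trimmed afterwards. A minimal sketch of that behaviour
# (illustrative only; the repo's own pad_seq may differ in pad value or API):
def pad_seq_sketch(x, base=8):
    """Pad a (T, n_mels) numpy array along time to a multiple of `base`."""
    len_out = int(base * np.ceil(x.shape[0] / base))
    len_pad = len_out - x.shape[0]                      # number of zero frames appended
    return np.pad(x, ((0, len_pad), (0, 0)), mode='constant'), len_pad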
# Variant of infer_random (presumably from another evaluator/dataset configuration);
# it additionally handles the 'out_test' split and the AutoVC / AdaVAEd model types.
def infer_random(self, gender=False):
    # using the first sample from in_test
    if gender:
        source_sp = random.sample(self.speakers_man, 1)[0]
        target_sp = random.sample(self.speakers_woman, 1)[0]
    else:
        source_sp, target_sp = random.sample(self.speakers, 2)
    source_utt = random.sample(self.indexes[source_sp], 1)[0]
    target_utt = random.sample(self.indexes[target_sp], 1)[0]
    if self.args.val_set == 'out_test':
        # fixed example pair for the out-of-set test split
        source_sp = '237'
        target_sp = '225'
        source_utt = '007'
        target_utt = '008'
        # source_sp = '227'
        # target_sp = '312'
        # source_utt = '099'
        # target_utt = '285'
    display = False
    output_image_path = os.path.join(
        self.args.output_image_path, self.args.val_set, self.args.load_iteration,
        's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))
    output_audio_path = os.path.join(
        self.args.output_audio_path, self.args.val_set, self.args.load_iteration,
        's{}_{}---t{}_{}'.format(source_sp, source_utt, target_sp, target_utt))
    os.makedirs(output_image_path, exist_ok=True)
    os.makedirs(output_audio_path, exist_ok=True)
    source_spec_ori = self.dataset[f'{source_sp}/{source_utt}/mel'][:]
    target_spec_ori = self.dataset[f'{target_sp}/{target_utt}/mel'][:]
    print('source time {} | target time {}'.format(source_spec_ori.shape, target_spec_ori.shape))
    # AutoVC pads to a multiple of 32 frames; the other model types use 8 here
    b = 32 if self.args.model_type == 'AutoVC' else 8
    source_spec, source_len_pad = self.pad_seq(source_spec_ori, base=b)
    target_spec, target_len_pad = self.pad_seq(target_spec_ori, base=b)
    source_spec = cc(torch.from_numpy(source_spec))
    target_spec = cc(torch.from_numpy(target_spec))
    if self.args.model_type == 'AdaVAEd':
        # stack the pre-segmented dmel slices used as the speaker-reference input
        target_slices = self.dataset[f'{target_sp}/{target_utt}/dmels'][:]
        source_slices = self.dataset[f'{source_sp}/{source_utt}/dmels'][:]
        target_dspec = self.dataset[f'{target_sp}/{target_utt}/dmel'][:]
        source_dspec = self.dataset[f'{source_sp}/{source_utt}/dmel'][:]
        target_dspec = cc(torch.from_numpy(np.array([target_dspec[t0:t1] for t0, t1 in target_slices])))
        source_dspec = cc(torch.from_numpy(np.array([source_dspec[t0:t1] for t0, t1 in source_slices])))
    else:
        source_dspec = source_spec
        target_dspec = target_spec
    # copy the original waveforms for reference
    wav2wav(self.speaker2filenames[source_sp][source_utt], os.path.join(output_audio_path, 'src_ori.wav'))
    wav2wav(self.speaker2filenames[target_sp][target_utt], os.path.join(output_audio_path, 'tar_ori.wav'))
    # sync: resynthesize the ground-truth mels (vocoder-only reference)
    self.plot_spectrograms(source_spec_ori, os.path.join(output_image_path, 'src_sync.png'))
    self.wave_generate(source_spec_ori, os.path.join(output_audio_path, 'src_sync'))
    self.plot_spectrograms(target_spec_ori, os.path.join(output_image_path, 'tar_sync.png'))
    self.wave_generate(target_spec_ori, os.path.join(output_audio_path, 'tar_sync'))
    # reconstruction (source -> source, target -> target)
    source_spec_rec = self.inference_one_utterance(source_spec, source_dspec, source_len_pad)
    self.plot_spectrograms(source_spec_rec, os.path.join(output_image_path, 'src_rec.png'))
    self.wave_generate(source_spec_rec, os.path.join(output_audio_path, 'src_rec'))
    target_spec_rec = self.inference_one_utterance(target_spec, target_dspec, target_len_pad)
    self.plot_spectrograms(target_spec_rec, os.path.join(output_image_path, 'tar_rec.png'))
    self.wave_generate(target_spec_rec, os.path.join(output_audio_path, 'tar_rec'))
    # L1 reconstruction error against the ground-truth mels
    criterion = nn.L1Loss()
    loss_src_rec = criterion(torch.from_numpy(source_spec_rec), torch.from_numpy(source_spec_ori))
    loss_trg_rec = criterion(torch.from_numpy(target_spec_rec), torch.from_numpy(target_spec_ori))
    print('Source Rec Loss: {} | Target Rec Loss: {}'.format(loss_src_rec, loss_trg_rec))
    # conversion (swap speaker)
    s2t_spec = self.inference_one_utterance(source_spec, target_dspec, len_pad=source_len_pad)
    self.plot_spectrograms(s2t_spec, os.path.join(output_image_path, 's2t.png'))
    self.wave_generate(s2t_spec, os.path.join(output_audio_path, 's2t'))
    t2s_spec = self.inference_one_utterance(target_spec, source_dspec, len_pad=target_len_pad)
    self.plot_spectrograms(t2s_spec, os.path.join(output_image_path, 't2s.png'))
    self.wave_generate(t2s_spec, os.path.join(output_audio_path, 't2s'))
    print('Complete...')
    return
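# inference_one_utterance is assumed to return the decoder output with the padding
# frames removed, as a numpy array that plot_spectrograms / wave_generate and the
# L1 comparison above can consume. A hypothetical sketch of just the trimming step
# implied by the len_pad bookkeeping (not the repo's actual method):
def trim_padded_output_sketch(dec_out, len_pad):
    """dec_out: (1, T_padded, n_mels) torch tensor -> (T, n_mels) numpy array."""
    out = dec_out.squeeze(0).detach().cpu().numpy()
    return out if len_pad == 0 else out[:-len_pad]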