def main(args, hp):
    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args.checkpoint_path)['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args.embedder_path)
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        ref_wav, _ = librosa.load(args.reference_file, sr=16000)
        ref_mel = audio.get_mel(ref_wav)
        ref_mel = torch.from_numpy(ref_mel).float().cuda()
        dvec = embedder(ref_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
        mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
        mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
        mixed_mag = mixed_mag.unsqueeze(0)

        shadow_mag = model(mixed_mag, dvec)

        # combine while both magnitudes are still tensors, then move to CPU for iSTFT;
        # the mixture phase is reused to reconstruct the waveform
        recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
        recorded_mag = recorded_mag[0].cpu().detach().numpy()
        recorded_wav = audio.spec2wav(recorded_mag, mixed_phase)

        os.makedirs(args.out_dir, exist_ok=True)
        out_path = os.path.join(args.out_dir, 'result.wav')
        librosa.output.write_wav(out_path, recorded_wav, sr=16000)
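# `tensor_normalize` is not defined in this excerpt. Below is a minimal sketch of what
# such a helper could look like -- a simple peak normalization of the combined magnitude
# spectrogram. This is an illustrative assumption, not the project's actual implementation
# (clamping to a fixed range would be another plausible choice).
import torch

def tensor_normalize(mag: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Scale a magnitude spectrogram so its peak value is 1 (hypothetical helper)."""
    peak = mag.abs().max()
    return mag / (peak + eps)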
def main(args, hp):
    model = VoiceFilter(hp).cuda()
    chkpt_model = torch.load(args.checkpoint_path)['model']
    model.load_state_dict(chkpt_model)
    model.eval()

    embedder = SpeechEmbedder(hp).cuda()
    chkpt_embed = torch.load(args.embedder_path)
    embedder.load_state_dict(chkpt_embed)
    embedder.eval()

    audio = Audio(hp)
    dvec_wav, _ = librosa.load(args.reference_file, sr=16000)
    dvec_mel = audio.get_mel(dvec_wav)
    dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
    dvec = embedder(dvec_mel)
    dvec = dvec.unsqueeze(0)

    mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
    mag, phase = audio.wav2spec(mixed_wav)
    mag = torch.from_numpy(mag).float().cuda()
    mag = mag.unsqueeze(0)

    mask = model(mag, dvec)
    est_mag = mag * mask
    est_mag = est_mag[0].cpu().detach().numpy()
    est_wav = audio.spec2wav(est_mag, phase)

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, 'result.wav')
    librosa.output.write_wav(out_path, est_wav, sr=16000)
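# A hedged sketch of a command-line entry point for the inference routine above.
# The flag names and the HParam import location are assumptions inferred from how
# `args` and `hp` are used in main(); adjust them to match the project's utilities.
if __name__ == '__main__':
    import argparse
    from utils.hparams import HParam  # assumed location of the hyper-parameter loader

    parser = argparse.ArgumentParser(description='VoiceFilter inference (sketch)')
    parser.add_argument('-c', '--config', required=True, help='path to config.yaml')
    parser.add_argument('-e', '--embedder_path', required=True, help='speaker embedder checkpoint')
    parser.add_argument('--checkpoint_path', required=True, help='VoiceFilter model checkpoint')
    parser.add_argument('-m', '--mixed_file', required=True, help='mixture wav to separate')
    parser.add_argument('-r', '--reference_file', required=True, help='reference wav of the target speaker')
    parser.add_argument('-o', '--out_dir', required=True, help='directory for result.wav')
    args = parser.parse_args()

    hp = HParam(args.config)
    main(args, hp)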
class VFDataset(Dataset):
    def __init__(self, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.train = train
        self.data_dir = (config.data['base_dir'] + config.data['train_dir']) if train \
            else (config.data['base_dir'] + config.data['test_dir'])

        self.dvec_list = find_all(config.form['dvec'])
        self.target_wav_list = find_all(config.form['target']['wav'])
        self.mixed_wav_list = find_all(config.form['mixed']['wav'])
        self.target_mag_list = find_all(config.form['target']['mag'])
        self.mixed_mag_list = find_all(config.form['mixed']['mag'])

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, "no training file found"

        self.audio = Audio()

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(config.data['base_dir'] + dvec_path,
                                   sr=config.audio['sample_rate'])
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         sr=config.audio['sample_rate'])
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        sr=config.audio['sample_rate'])
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=config.audio['sample_rate'])
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
class VFDataset(Dataset):
    def __init__(self, hp, args, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.hp = hp
        self.args = args
        self.train = train
        self.data_dir = hp.data.train_dir if train else hp.data.test_dir

        self.dvec_list = find_all(hp.form.dvec)
        self.target_wav_list = find_all(hp.form.target.wav)
        self.mixed_wav_list = find_all(hp.form.mixed.wav)
        self.target_mag_list = find_all(hp.form.target.mag)
        self.mixed_mag_list = find_all(hp.form.mixed.mag)

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, "no training file found"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(dvec_path, sr=self.hp.audio.sample_rate)
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx], sr=self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx], sr=self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
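# A hedged sketch of how VFDataset might be wrapped in a DataLoader for training.
# The collate function is an assumption: it keeps the variable-length d-vector mels
# as a plain list and stacks the magnitude spectrograms, which presumes the training
# mags were cropped to a fixed length before being dumped to disk. The hp.train.*
# keys and the hp/args objects are assumed to come from the surrounding training code.
from torch.utils.data import DataLoader
import torch

def train_collate_fn(batch):
    dvec_list = [dvec_mel for dvec_mel, _, _ in batch]          # variable length, keep as list
    target_mag = torch.stack([t for _, t, _ in batch], dim=0)   # (B, T, F)
    mixed_mag = torch.stack([m for _, _, m in batch], dim=0)    # (B, T, F)
    return dvec_list, target_mag, mixed_mag

train_loader = DataLoader(VFDataset(hp, args, train=True),
                          batch_size=hp.train.batch_size,
                          shuffle=True,
                          num_workers=hp.train.num_workers,
                          collate_fn=train_collate_fn,
                          drop_last=True)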
def main(args):
    # hard-coded arguments override whatever was passed in
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }
    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()
        mag = mag.unsqueeze(0)

        mask = model(mag, dvec)
        est_mag = mag * mask
        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)
        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
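# The variant above returns the enhanced waveform instead of writing it to disk.
# A minimal caller sketch, assuming the soundfile package is available; the output
# file name is illustrative, and 16 kHz matches the librosa.load calls in main().
import soundfile as sf

enhanced = main(None)                      # args is overridden inside main()
sf.write('enhanced.wav', enhanced, 16000)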
class VFWSDataset(Dataset):
    def __init__(self, hp, train):
        def find_all(data_dir, file_format):
            return sorted(glob.glob(os.path.join(data_dir, file_format)))

        self.hp = hp
        self.train = train
        self.mixed_dir = hp.data.vfws_dir + 'mixed/train/' if train else hp.data.vfws_dir + 'mixed/test/'
        self.clean_dir = hp.data.vfws_dir + 'clean/train/' if train else hp.data.vfws_dir + 'clean/test/'

        self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
        self.mixed_wav_list = find_all(self.mixed_dir, hp.form.mixed.wav)
        self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
        self.mixed_mag_list = find_all(self.mixed_dir, hp.form.mixed.mag)

        assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.target_mag_list)

    def __getitem__(self, idx):
        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx], sr=self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx], sr=self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
def main(args, hp):
    with open('out1.txt') as f:
        for line in f:
            # each line is expected to hold tab-separated fields:
            # mixed wav path, reference wav path, output file name
            res = line.strip().split('\t')

            with torch.no_grad():
                # note: the model and embedder are reloaded on every line, which works
                # but is slow; loading them once before the loop would be faster
                model = VoiceFilter(hp)
                chkpt_model = torch.load(args.checkpoint_path, map_location='cpu')['model']
                model.load_state_dict(chkpt_model)
                model.eval()

                embedder = SpeechEmbedder(hp)
                chkpt_embed = torch.load(args.embedder_path, map_location='cpu')
                embedder.load_state_dict(chkpt_embed)
                embedder.eval()

                audio = Audio(hp)
                dvec_wav, _ = librosa.load(res[1], sr=16000)
                dvec_mel = audio.get_mel(dvec_wav)
                dvec_mel = torch.from_numpy(dvec_mel).float()
                dvec = embedder(dvec_mel)
                dvec = dvec.unsqueeze(0)

                mixed_wav, _ = librosa.load(res[0], sr=16000)
                mag, phase = audio.wav2spec(mixed_wav)
                mag = torch.from_numpy(mag).float()
                mag = mag.unsqueeze(0)

                mask = model(mag, dvec)
                est_mag = mag * mask
                est_mag = est_mag[0].cpu().detach().numpy()
                est_wav = audio.spec2wav(est_mag, phase)

                os.makedirs('/root/voicefilter/res', exist_ok=True)
                out_path = os.path.join('/root/voicefilter/res', res[2])
                librosa.output.write_wav(out_path, est_wav, sr=16000)
model.eval()

embedder = SpeechEmbedder(hp).cuda()
chkpt_embed = torch.load(args.embedder_path)
embedder.load_state_dict(chkpt_embed)
embedder.eval()

audio = Audio(hp)
dvec_wav, _ = librosa.load(dvec_path, sr=16000)
ref_mel = audio.get_mel(dvec_wav)
ref_mel = torch.from_numpy(ref_mel).float().cuda()
dvec = embedder(ref_mel)
dvec = dvec.unsqueeze(0)  # (1, 256)

mixed_wav, _ = librosa.load(mixed_wav_path, sr=16000)
mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
mixed_mag = mixed_mag.unsqueeze(0)

shadow_mag = model(mixed_mag, dvec)  # shadow_mag.size() = [1, 301, 601]

recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
recorded_mag = recorded_mag[0].cpu().detach().numpy()
mixed_mag = mixed_mag[0].cpu().detach().numpy()
shadow_mag = shadow_mag[0].cpu().detach().numpy()

shadow_wav = audio.spec2wav(shadow_mag, mixed_phase)
# scale: carried from the frequency-domain pass back to the time domain,
# used to normalize the wav signal
speaker_ids = os.listdir(base_path)
print(speaker_ids)

wavs = {}
for speaker_id in speaker_ids:
    wavs[speaker_id] = glob.glob(
        os.path.join(base_path, speaker_id, '*-norm.wav'))[:10]

speakers = len(speaker_ids)
LTAF = np.empty([speakers, 10, fre_bins])

sk_count = 0
for k, v in wavs.items():
    for idx, wav in enumerate(v):
        loaded_wav, _ = librosa.load(wav, sr=16000)
        # loaded_wav = loaded_wav[16000: 20000]
        spect, _ = myaudio.wav2spec(loaded_wav)  # (time, freq)
        spect = spect[:, :fre_bins]
        averaged = np.sum(spect, axis=0) / spect.shape[0]
        averaged = averaged.reshape(1, -1)
        LTAF[sk_count][idx] = averaged
    sk_count = sk_count + 1

print(LTAF.shape)  # (speakers, sentences per speaker, chosen freq bins) = (18, 10, 375)
LTAF = LTAF.reshape(-1, fre_bins)
print(LTAF.shape)

# output `re` shape: (180, 180)
re = cosine_similarity(LTAF)

re = np.empty([speakers * 10, speakers * 10])
for idx1, sen1 in enumerate(LTAF):
    for idx2, sen2 in enumerate(LTAF):
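# The double loop above is cut off in this excerpt. A hedged sketch of how the pairwise
# cosine-similarity matrix could be filled in manually and visualized; it should match
# the result of cosine_similarity(LTAF) computed earlier. The helper name and the plot
# file name are illustrative, not taken from the project.
import numpy as np
import matplotlib.pyplot as plt

def manual_cosine_similarity(vectors):
    """Pairwise cosine similarity between row vectors (illustrative helper)."""
    n = vectors.shape[0]
    sim = np.empty([n, n])
    for i, a in enumerate(vectors):
        for j, b in enumerate(vectors):
            sim[i, j] = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)
    return sim

sim = manual_cosine_similarity(LTAF)  # (180, 180) for 18 speakers x 10 sentences
plt.imshow(sim, cmap='viridis')       # blocks along the diagonal = same-speaker sentences
plt.colorbar()
plt.savefig('ltaf_cosine_similarity.png')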
# plot_spectrogram(ssk1_1, 'sk1_1')
#
# ssk1_2, _ = myaudio.wav2spec(sk1_2)
# ssk1_2 = ssk1_2[:102, :125]
# plot_spectrogram(ssk1_2, 'sk1_2')
#
# ssk2_1, _ = myaudio.wav2spec(sk2_1)
# ssk2_1 = ssk2_1[:102, :125]
# plot_spectrogram(ssk2_1, 'sk2_1')
#
# ssk2_2, _ = myaudio.wav2spec(sk2_2)
# ssk2_2 = ssk2_2[:102, :125]
# plot_spectrogram(ssk2_2, 'sk2_2')

##############################################

ssk1_1, _ = myaudio.wav2spec(sk1_1)
ssk1_1 = ssk1_1[:300, :125]
print(ssk1_1.shape)
plot_spectrogram(ssk1_1, 'sk1_1')

ssk1_2, _ = myaudio.wav2spec(sk1_2)
ssk1_2 = ssk1_2[50:350, :125]
plot_spectrogram(ssk1_2, 'sk1_2')

ssk2_1, _ = myaudio.wav2spec(sk2_1)
ssk2_1 = ssk2_1[:300, :125]
plot_spectrogram(ssk2_1, 'sk2_1')

ssk2_2, _ = myaudio.wav2spec(sk2_2)
ssk2_2 = ssk2_2[:300, :125]
plot_spectrogram(ssk2_2, 'sk2_2')
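# `plot_spectrogram` is not shown in this excerpt. A minimal sketch of what it could look
# like, assuming the (time, freq) magnitude layout produced by wav2spec above; the output
# file naming and colormap are illustrative choices, not the project's actual helper.
import matplotlib.pyplot as plt

def plot_spectrogram(spec, title):
    """Render a (time, freq) magnitude spectrogram and save it as <title>.png."""
    plt.figure(figsize=(6, 4))
    plt.imshow(spec.T, origin='lower', aspect='auto', cmap='magma')  # frequency on the y-axis
    plt.title(title)
    plt.xlabel('time frames')
    plt.ylabel('frequency bins')
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f'{title}.png')
    plt.close()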