Example #1
def main(args, hp):
    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args.checkpoint_path)['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args.embedder_path)
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        ref_wav, _ = librosa.load(args.reference_file, sr=16000)
        ref_mel = audio.get_mel(ref_wav)
        ref_mel = torch.from_numpy(ref_mel).float().cuda()
        dvec = embedder(ref_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
        mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
        mixed_mag = torch.from_numpy(mixed_mag).float().cuda()

        mixed_mag = mixed_mag.unsqueeze(0)
        shadow_mag = model(mixed_mag, dvec)

        # add the predicted "shadow" magnitude to the mixture, then rebuild the
        # waveform using the mixture phase
        recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
        recorded_mag = recorded_mag[0].cpu().detach().numpy()
        recorded_wav = audio.spec2wav(recorded_mag, mixed_phase)

        os.makedirs(args.out_dir, exist_ok=True)
        out_path = os.path.join(args.out_dir, 'result.wav')
        librosa.output.write_wav(out_path, recorded_wav, sr=16000)
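Every example here relies on the project's Audio helper (wav2spec, spec2wav, get_mel), which is not shown. Below is a minimal sketch of the magnitude/phase round-trip, assuming a plain STFT with the window sizes suggested by the [1, 301, 601] shapes seen later (n_fft=1200, hop=160, win=400); the real Audio class may additionally apply dB scaling and normalization.

# Hypothetical sketch of the wav2spec / spec2wav round-trip; the real Audio
# class may also apply dB conversion and normalization on top of this.
import librosa
import numpy as np

def wav2spec(wav, n_fft=1200, hop_length=160, win_length=400):
    # complex STFT -> (time, freq) magnitude and phase
    stft = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    return np.abs(stft).T, np.angle(stft).T

def spec2wav(mag, phase, hop_length=160, win_length=400):
    # recombine magnitude with the (mixture) phase and invert the STFT
    stft = mag.T * np.exp(1j * phase.T)
    return librosa.istft(stft, hop_length=hop_length, win_length=win_length)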
Example #2
def main(args, hp):
    model = VoiceFilter(hp).cuda()
    chkpt_model = torch.load(args.checkpoint_path)['model']
    model.load_state_dict(chkpt_model)
    model.eval()

    embedder = SpeechEmbedder(hp).cuda()
    chkpt_embed = torch.load(args.embedder_path)
    embedder.load_state_dict(chkpt_embed)
    embedder.eval()

    audio = Audio(hp)
    dvec_wav, _ = librosa.load(args.reference_file, sr=16000)
    dvec_mel = audio.get_mel(dvec_wav)
    dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
    dvec = embedder(dvec_mel)
    dvec = dvec.unsqueeze(0)

    mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
    mag, phase = audio.wav2spec(mixed_wav)
    mag = torch.from_numpy(mag).float().cuda()

    mag = mag.unsqueeze(0)
    mask = model(mag, dvec)  # soft mask conditioned on the speaker d-vector
    est_mag = mag * mask     # estimated target-speaker magnitude

    est_mag = est_mag[0].cpu().detach().numpy()
    est_wav = audio.spec2wav(est_mag, phase)

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, 'result.wav')
    librosa.output.write_wav(out_path, est_wav, sr=16000)
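Examples 1 and 2 expect an argparse-style args namespace plus an HParam config object. Below is a minimal sketch of a CLI wrapper; the flag names and the HParam import path are assumptions and may differ from the repository's own inference script.

# Hypothetical CLI wrapper for main(args, hp); flag names and the HParam
# import path are assumptions, not the repository's exact script.
import argparse
from utils.hparams import HParam  # assumed module path

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True, help='path to config yaml')
    parser.add_argument('-e', '--embedder_path', required=True, help='speech embedder checkpoint')
    parser.add_argument('--checkpoint_path', required=True, help='VoiceFilter model checkpoint')
    parser.add_argument('-m', '--mixed_file', required=True, help='mixture wav to separate')
    parser.add_argument('-r', '--reference_file', required=True, help='reference wav of the target speaker')
    parser.add_argument('-o', '--out_dir', required=True, help='output directory')
    args = parser.parse_args()

    hp = HParam(args.config)
    main(args, hp)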
Example #3
class VFDataset(Dataset):
    def __init__(self, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.train = train
        self.data_dir = (config.data['base_dir'] + config.data['train_dir']
                         if train else
                         config.data['base_dir'] + config.data['test_dir'])

        self.dvec_list = find_all(config.form['dvec'])
        self.target_wav_list = find_all(config.form['target']['wav'])
        self.mixed_wav_list = find_all(config.form['mixed']['wav'])
        self.target_mag_list = find_all(config.form['target']['mag'])
        self.mixed_mag_list = find_all(config.form['mixed']['mag'])

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio()

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(config.data['base_dir'] + dvec_path,
                                   sr=config.audio['sample_rate'])
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         sr=config.audio['sample_rate'])
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        sr=config.audio['sample_rate'])
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(
                self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=config.audio['sample_rate'])
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
Example #4
class VFDataset(Dataset):
    def __init__(self, hp, args, train):
        def find_all(file_format):
            return sorted(glob.glob(os.path.join(self.data_dir, file_format)))

        self.hp = hp
        self.args = args
        self.train = train
        self.data_dir = hp.data.train_dir if train else hp.data.test_dir

        self.dvec_list = find_all(hp.form.dvec)
        self.target_wav_list = find_all(hp.form.target.wav)
        self.mixed_wav_list = find_all(hp.form.mixed.wav)
        self.target_mag_list = find_all(hp.form.target.mag)
        self.mixed_mag_list = find_all(hp.form.mixed.mag)

        assert len(self.dvec_list) == len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"
        assert len(self.dvec_list) != 0, \
            "no training file found"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.dvec_list)

    def __getitem__(self, idx):
        with open(self.dvec_list[idx], 'r') as f:
            dvec_path = f.readline().strip()

        dvec_wav, _ = librosa.load(dvec_path, sr=self.hp.audio.sample_rate)
        dvec_mel = self.audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float()

        if self.train:  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return dvec_mel, target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx],
                                         sr=self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx],
                                        sr=self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(
                self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return dvec_mel, target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
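During training a dataset like this is normally wrapped in a torch DataLoader. Below is a minimal sketch with a hypothetical collate function: the reference mel spectrograms have variable length, so they are kept as a list, while the fixed-size magnitude spectrograms are stacked into a batch (hp.train.batch_size and hp.train.num_workers are assumed hyper-parameter names).

# Hypothetical training-side usage of VFDataset; the collate function and the
# hp.train.* field names are assumptions, not the repository's exact code.
import torch
from torch.utils.data import DataLoader

def train_collate_fn(batch):
    dvec_mels, target_mags, mixed_mags = [], [], []
    for dvec_mel, target_mag, mixed_mag in batch:
        dvec_mels.append(dvec_mel)          # variable length -> keep as a list
        target_mags.append(target_mag)
        mixed_mags.append(mixed_mag)
    return dvec_mels, torch.stack(target_mags), torch.stack(mixed_mags)

train_loader = DataLoader(VFDataset(hp, args, train=True),
                          batch_size=hp.train.batch_size,
                          shuffle=True,
                          num_workers=hp.train.num_workers,
                          collate_fn=train_collate_fn)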
Example #5
def main(args):
    # arguments are hard-coded here; whatever is passed in is ignored
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }

    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()

        mag = mag.unsqueeze(0)
        mask = model(mag, dvec)
        est_mag = mag * mask

        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)

        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
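Example 5 returns the enhanced waveform instead of writing it to disk. Below is a minimal sketch of a caller that saves the returned array with the soundfile package (librosa.output.write_wav, used by the other examples, was removed in librosa 0.8); the output path is just a placeholder.

# Hypothetical caller for the main() above.
import soundfile as sf

est_wav = main(None)  # the arguments are hard-coded inside main()
sf.write('output/result.wav', est_wav, 16000)  # (path, data, samplerate)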
Example #6
class VFWSDataset(Dataset):
    def __init__(self, hp, train):
        def find_all(data_dir,file_format):
            return sorted(glob.glob(os.path.join(data_dir, file_format)))
        self.hp = hp
        self.train = train

        self.mixed_dir = hp.data.vfws_dir + ('mixed/train/' if train else 'mixed/test/')
        self.clean_dir = hp.data.vfws_dir + ('clean/train/' if train else 'clean/test/')

        self.target_wav_list = find_all(self.clean_dir, hp.form.target.wav)
        self.mixed_wav_list  = find_all(self.mixed_dir, hp.form.mixed.wav)
        self.target_mag_list = find_all(self.clean_dir, hp.form.target.mag)
        self.mixed_mag_list  = find_all(self.mixed_dir, hp.form.mixed.mag)

        assert len(self.target_wav_list) == len(self.mixed_wav_list) == \
            len(self.target_mag_list) == len(self.mixed_mag_list), "number of training files must match"

        self.audio = Audio(hp)

    def __len__(self):
        return len(self.target_mag_list)

    def __getitem__(self, idx):
        if self.train :  # need to be fast
            target_mag = torch.load(self.target_mag_list[idx])
            mixed_mag = torch.load(self.mixed_mag_list[idx])
            return target_mag, mixed_mag
        else:
            target_wav, _ = librosa.load(self.target_wav_list[idx], sr=self.hp.audio.sample_rate)
            mixed_wav, _ = librosa.load(self.mixed_wav_list[idx], sr=self.hp.audio.sample_rate)
            target_mag, _ = self.wav2magphase(self.target_wav_list[idx])
            mixed_mag, mixed_phase = self.wav2magphase(self.mixed_wav_list[idx])
            target_mag = torch.from_numpy(target_mag)
            mixed_mag = torch.from_numpy(mixed_mag)
            # mixed_phase = torch.from_numpy(mixed_phase)
            return target_wav, mixed_wav, target_mag, mixed_mag, mixed_phase

    def wav2magphase(self, path):
        wav, _ = librosa.load(path, sr=self.hp.audio.sample_rate)
        mag, phase = self.audio.wav2spec(wav)
        return mag, phase
Example #7
def main(args, hp):
    with open('out1.txt') as f:
        for line in f:
            res = line.strip().split('\t')  # [mixed_path, reference_path, output_name]
            with torch.no_grad():
                model = VoiceFilter(hp)
                chkpt_model = torch.load(args.checkpoint_path, map_location='cpu')['model']
                model.load_state_dict(chkpt_model)
                model.eval()

                embedder = SpeechEmbedder(hp)
                chkpt_embed = torch.load(args.embedder_path, map_location='cpu')
                embedder.load_state_dict(chkpt_embed)
                embedder.eval()

                audio = Audio(hp)
                dvec_wav, _ = librosa.load(res[1], sr=16000)
                dvec_mel = audio.get_mel(dvec_wav)
                dvec_mel = torch.from_numpy(dvec_mel).float()
                dvec = embedder(dvec_mel)
                dvec = dvec.unsqueeze(0)

                mixed_wav, _ = librosa.load(res[0], sr=16000)
                mag, phase = audio.wav2spec(mixed_wav)
                mag = torch.from_numpy(mag).float()

                mag = mag.unsqueeze(0)
                mask = model(mag, dvec)
                est_mag = mag * mask

                est_mag = est_mag[0].cpu().detach().numpy()
                est_wav = audio.spec2wav(est_mag, phase)

                os.makedirs('/root/voicefilter/res', exist_ok=True)
                out_path = os.path.join('/root/voicefilter/res', res[2])
                librosa.output.write_wav(out_path, est_wav, sr=16000)
Example #8
            model.eval()

            embedder = SpeechEmbedder(hp).cuda()
            chkpt_embed = torch.load(args.embedder_path)
            embedder.load_state_dict(chkpt_embed)
            embedder.eval()

            audio = Audio(hp)
            dvec_wav, _ = librosa.load(dvec_path, sr=16000)
            ref_mel = audio.get_mel(dvec_wav)
            ref_mel = torch.from_numpy(ref_mel).float().cuda()
            dvec = embedder(ref_mel)
            dvec = dvec.unsqueeze(0)  # (1, 256)

            mixed_wav, _ = librosa.load(mixed_wav_path, sr=16000)
            mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
            mixed_mag = torch.from_numpy(mixed_mag).float().cuda()

            mixed_mag = mixed_mag.unsqueeze(0)

            shadow_mag = model(mixed_mag, dvec)
            # shadow_mag.size() = [1, 301, 601]

            recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
            recorded_mag = recorded_mag[0].cpu().detach().numpy()
            mixed_mag = mixed_mag[0].cpu().detach().numpy()

            shadow_mag = shadow_mag[0].cpu().detach().numpy()
            shadow_wav = audio.spec2wav(shadow_mag, mixed_phase)

            # scale is carried from the frequency domain to the time domain and used for wav signal normalization
Example #9
speaker_ids = os.listdir(base_path)
print(speaker_ids)
wavs = {}

for speaker_id in speaker_ids:
    wavs[speaker_id] = glob.glob(
        os.path.join(base_path, speaker_id, '*-norm.wav'))[:10]

speakers = len(speaker_ids)
LTAF = np.empty([speakers, 10, fre_bins])
sk_count = 0
for k, v in wavs.items():
    for idx, wav in enumerate(v):
        loaded_wav, _ = librosa.load(wav, sr=16000)
        # loaded_wav = loaded_wav[16000: 20000]
        spect, _ = myaudio.wav2spec(loaded_wav)  # (time, freq)
        spect = spect[:, :fre_bins]
        averaged = np.sum(spect, axis=0) / spect.shape[0]
        averaged = averaged.reshape(1, -1)
        LTAF[sk_count][idx] = averaged
    sk_count = sk_count + 1
print(LTAF.shape)  # (speakers, sentencePspk, chosen_freq_bins) (18, 10, 375)

LTAF = LTAF.reshape(-1, fre_bins)
print(LTAF.shape)

# output: re has shape (speakers * 10, speakers * 10), e.g. (180, 180)
# one-call pairwise similarity via scikit-learn ...
re = cosine_similarity(LTAF)
# ... or, equivalently, filled pair by pair
re = np.empty([speakers * 10, speakers * 10])
for idx1, sen1 in enumerate(LTAF):
    for idx2, sen2 in enumerate(LTAF):
        re[idx1][idx2] = np.dot(sen1, sen2) / (np.linalg.norm(sen1) * np.linalg.norm(sen2))
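A quick way to inspect the resulting similarity matrix is a heatmap; a minimal sketch with matplotlib follows (the output file name is just a placeholder).

# Hypothetical visualization of the pairwise LTAF similarity matrix `re`.
import matplotlib.pyplot as plt

plt.imshow(re, cmap='viridis', vmin=0.0, vmax=1.0)
plt.colorbar()
plt.xlabel('utterance index')
plt.ylabel('utterance index')
plt.title('LTAF cosine similarity')
plt.savefig('ltaf_similarity.png')
plt.close()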
Example #10
# plot_spectrogram(ssk1_1, 'sk1_1')
#
# ssk1_2, _ = myaudio.wav2spec(sk1_2)
# ssk1_2 = ssk1_2[:102, :125]
# plot_spectrogram(ssk1_2, 'sk1_2')
#
# ssk2_1, _ = myaudio.wav2spec(sk2_1)
# ssk2_1 = ssk2_1[:102, :125]
# plot_spectrogram(ssk2_1, 'sk2_1')
#
# ssk2_2, _ = myaudio.wav2spec(sk2_2)
# ssk2_2 = ssk2_2[:102, :125]
# plot_spectrogram(ssk2_2, 'sk2_2')

##############################################
ssk1_1, _ = myaudio.wav2spec(sk1_1)
ssk1_1 = ssk1_1[:300, :125]
print(ssk1_1.shape)
plot_spectrogram(ssk1_1, 'sk1_1')

ssk1_2, _ = myaudio.wav2spec(sk1_2)
ssk1_2 = ssk1_2[50:350, :125]
plot_spectrogram(ssk1_2, 'sk1_2')

ssk2_1, _ = myaudio.wav2spec(sk2_1)
ssk2_1 = ssk2_1[:300, :125]
plot_spectrogram(ssk2_1, 'sk2_1')

ssk2_2, _ = myaudio.wav2spec(sk2_2)
ssk2_2 = ssk2_2[:300, :125]
plot_spectrogram(ssk2_2, 'sk2_2')
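plot_spectrogram itself is not defined in this excerpt. Below is a minimal sketch of one possible implementation with matplotlib; the dB conversion and figure layout are assumptions, and the conversion can be dropped if wav2spec already returns dB-scaled magnitudes.

# Hypothetical plot_spectrogram helper; layout and dB conversion are assumptions.
import librosa
import matplotlib.pyplot as plt
import numpy as np

def plot_spectrogram(spec, title):
    # spec is (time, freq); transpose so frequency runs along the y-axis
    spec_db = librosa.amplitude_to_db(spec.T, ref=np.max)
    plt.figure(figsize=(6, 4))
    plt.imshow(spec_db, origin='lower', aspect='auto', cmap='magma')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.xlabel('frame')
    plt.ylabel('frequency bin')
    plt.tight_layout()
    plt.savefig(f'{title}.png')
    plt.close()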