def _convolve_mird(titles, reverb=0.160, degrees=(0,), mic_intervals=(8, 8, 8, 8, 8, 8, 8), mic_indices=(0,), samples=None):
    """Simulate multi-channel mixtures by convolving dry sources with MIRD RIRs.

    Args:
        titles: iterable of source file stems under data/single-channel/.
        reverb: reverberation time [s] encoded in the MIRD file names.
        degrees: per-title source angle [deg], aligned with `titles`.
        mic_intervals: microphone spacing values joined into the RIR path.
        mic_indices: microphone channels (columns of the RIR matrix) to simulate.
        samples: if given, truncate each RIR to this many leading taps.

    Returns:
        np.ndarray of shape (len(mic_indices), T) holding the convolved mixtures.

    Note:
        Defaults are tuples (not lists) to avoid the shared-mutable-default
        pitfall; all file reads are hoisted out of the microphone loop so each
        wav/.mat is loaded exactly once instead of once per microphone.
    """
    intervals = '-'.join([str(interval) for interval in mic_intervals])

    # Load every dry source once and find the shortest length for alignment.
    sources = {}
    T_min = None
    for title in titles:
        source, _ = read_wav("data/single-channel/{}.wav".format(title))
        sources[title] = source
        if T_min is None or len(source) < T_min:
            T_min = len(source)

    # Load each title's room impulse response once (independent of mic index).
    rirs = {}
    for title_idx in range(len(titles)):
        degree = degrees[title_idx]
        title = titles[title_idx]
        rir_path = "data/MIRD/Reverb{:.3f}_{}/Impulse_response_Acoustic_Lab_Bar-Ilan_University_(Reverberation_{:.3f}s)_{}_1m_{:03d}.mat".format(reverb, intervals, reverb, intervals, degree)
        rir = loadmat(rir_path)['impulse_response']
        if samples is not None:
            rir = rir[:samples]  # rows are time taps; keep the first `samples`
        rirs[title] = rir

    mixed_signals = []
    for mic_idx in mic_indices:
        _mixture = 0
        for title in titles:
            _mixture = _mixture + np.convolve(sources[title][:T_min], rirs[title][:, mic_idx])
        mixed_signals.append(_mixture)

    return np.array(mixed_signals)
def __getitem__(self, idx):
    """
    Returns:
        mixture (1, T) <torch.Tensor>
        sources (n_sources, T) <torch.Tensor>
        segment_IDs (n_sources,) <list<str>>
    """
    entry = self.json_data[idx]

    # Read each source segment and stack them as (n_sources, T).
    stacked = []
    for source_info in entry['sources'].values():
        seg_start, seg_end = source_info['start'], source_info['end']
        wav_path = os.path.join(self.wav_root, source_info['path'])
        audio, _ = read_wav(wav_path)
        segment = np.array(audio)[seg_start:seg_end]
        stacked.append(segment[None])
    sources = np.concatenate(stacked, axis=0)

    # Read the matching mixture segment as (1, T).
    mixture_info = entry['mixture']
    seg_start, seg_end = mixture_info['start'], mixture_info['end']
    wav_path = os.path.join(self.wav_root, mixture_info['path'])
    audio, _ = read_wav(wav_path)
    mixture = np.array(audio)[seg_start:seg_end][None]

    segment_ID = entry['ID'] + '_{}-{}'.format(seg_start, seg_end)

    mixture = torch.Tensor(mixture).float()
    sources = torch.Tensor(sources).float()

    return mixture, sources, segment_ID
def __init__(self, wav_root, list_path, max_samples=None, max_n_sources=3):
    """Index one full-length item per utterance ID listed in `list_path`.

    Each item spans samples [0, end), where end is the mixture length
    optionally capped at `max_samples`. The number of sources per item is
    discovered by probing s1/, s2/, ... up to `max_n_sources`.
    """
    super().__init__(wav_root, list_path, max_n_sources=max_n_sources)

    self.json_data = []

    with open(list_path) as f:
        for line in f:
            ID = line.strip()
            mix_path = os.path.join(wav_root, 'mix', '{}.wav'.format(ID))
            y, sr = read_wav(mix_path)
            T_total = len(y)

            # Cap the segment length when max_samples is given.
            samples = T_total if max_samples is None else min(T_total, max_samples)

            # Count consecutive sN/ directories that contain this utterance.
            n_sources = 0
            for source_idx in range(max_n_sources):
                source_path = os.path.join(wav_root, 's{}'.format(source_idx + 1), '{}.wav'.format(ID))
                if not os.path.exists(source_path):
                    break
                n_sources += 1

            data = {
                'sources': {},
                'mixture': {}
            }
            for source_idx in range(n_sources):
                tag = 's{}'.format(source_idx + 1)
                data['sources'][tag] = {
                    'path': os.path.join(tag, '{}.wav'.format(ID)),
                    'start': 0,
                    'end': samples
                }
            data['mixture'] = {
                'path': os.path.join('mix', '{}.wav'.format(ID)),
                'start': 0,
                'end': samples
            }
            data['ID'] = ID

            self.json_data.append(data)
def __init__(self, wav_root, list_path, samples=32000, overlap=None, n_sources=2):
    """Index fixed-length overlapping windows over each utterance.

    Windows of `samples` frames advance by `samples - overlap`; a trailing
    remainder shorter than one full window is discarded.
    """
    super().__init__(wav_root, list_path, n_sources=n_sources)

    if overlap is None:
        overlap = samples // 2  # default to 50% overlap

    self.json_data = []

    with open(list_path) as f:
        for line in f:
            ID = line.strip()
            mix_path = os.path.join(wav_root, 'mix', '{}.wav'.format(ID))
            y, sr = read_wav(mix_path)
            T_total = len(y)

            hop = samples - overlap
            for start_idx in range(0, T_total, hop):
                end_idx = start_idx + samples
                if end_idx > T_total:
                    break  # drop the short remainder at the end

                data = {'sources': {}, 'mixture': {}}
                for source_idx in range(n_sources):
                    tag = 's{}'.format(source_idx + 1)
                    data['sources'][tag] = {
                        'path': os.path.join(tag, '{}.wav'.format(ID)),
                        'start': start_idx,
                        'end': end_idx
                    }
                data['mixture'] = {
                    'path': os.path.join('mix', '{}.wav'.format(ID)),
                    'start': start_idx,
                    'end': end_idx
                }
                data['ID'] = ID

                self.json_data.append(data)
def __init__(self, wav_root, list_path, fft_size, hop_size=None, window_fn='hann', normalize=False, mask_type='ibm', threshold=40, max_samples=None, n_sources=2, eps=EPS):
    """Index one full-length item per utterance for the spectrogram/mask dataset.

    Each item spans samples [0, end), where end is the mixture length
    optionally capped at `max_samples`. STFT/mask settings are forwarded to
    the base class unchanged.
    """
    super().__init__(wav_root, list_path, fft_size, hop_size=hop_size, window_fn=window_fn, normalize=normalize, mask_type=mask_type, threshold=threshold, n_sources=n_sources, eps=eps)

    self.json_data = []

    with open(list_path) as f:
        for line in f:
            ID = line.strip()
            mix_path = os.path.join(wav_root, 'mix', '{}.wav'.format(ID))
            y, sr = read_wav(mix_path)
            T_total = len(y)

            # Cap the segment length when max_samples is given.
            samples = T_total if max_samples is None else min(T_total, max_samples)

            data = {
                'sources': {},
                'mixture': {}
            }
            for source_idx in range(n_sources):
                tag = 's{}'.format(source_idx + 1)
                data['sources'][tag] = {
                    'path': os.path.join(tag, '{}.wav'.format(ID)),
                    'start': 0,
                    'end': samples
                }
            data['mixture'] = {
                'path': os.path.join('mix', '{}.wav'.format(ID)),
                'start': 0,
                'end': samples
            }
            data['ID'] = ID

            self.json_data.append(data)
def _test(metric='EUC'):
    """Fit torch NMF to a music spectrogram; save reconstructions and plots."""
    torch.manual_seed(111)

    fft_size, hop_size = 1024, 256
    n_bases = 6
    iteration = 100

    signal, sr = read_wav("data/music-8000.wav")
    T = len(signal)

    signal = torch.Tensor(signal).unsqueeze(dim=0)

    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)

    spectrogram = stft(signal).squeeze(dim=0)
    real, imag = spectrogram[..., 0], spectrogram[..., 1]
    amplitude = torch.sqrt(real**2 + imag**2)
    power = amplitude**2

    # Save the mixture's log-power spectrogram.
    log_spectrogram = 10 * torch.log10(power + EPS)
    plt.figure()
    plt.pcolormesh(log_spectrogram, cmap='jet')
    plt.colorbar()
    plt.savefig('data/NMF/spectrogram.png', bbox_inches='tight')
    plt.close()

    nmf = NMF(n_bases, metric=metric)
    nmf.update(power, iteration=iteration)

    def _resynthesize(power_estimate):
        # Keep the mixture phase; rescale magnitude by the NMF estimate,
        # then invert and peak-normalize.
        amp_estimate = torch.sqrt(power_estimate)
        gain = amp_estimate / (amplitude + EPS)
        re, im = gain * real, gain * imag
        spec = torch.cat([re.unsqueeze(dim=2), im.unsqueeze(dim=2)], dim=2).unsqueeze(dim=0)
        wav = istft(spec, T=T).squeeze(dim=0).numpy()
        return wav / np.abs(wav).max()

    # Full-rank reconstruction from all bases.
    estimated_power = torch.matmul(nmf.base, nmf.activation)
    estimated_signal = _resynthesize(estimated_power)
    write_wav("data/NMF/{}/music-8000-estimated-iter{}.wav".format(metric, iteration), signal=estimated_signal, sr=8000)

    # Rank-1 reconstruction per basis, plus its log-power spectrogram.
    for idx in range(n_bases):
        estimated_power = torch.matmul(nmf.base[:, idx: idx+1], nmf.activation[idx: idx+1, :])
        estimated_signal = _resynthesize(estimated_power)
        write_wav("data/NMF/{}/music-8000-estimated-iter{}-base{}.wav".format(metric, iteration, idx), signal=estimated_signal, sr=8000)

        log_spectrogram = 10 * torch.log10(estimated_power + EPS).numpy()
        plt.figure()
        plt.pcolormesh(log_spectrogram, cmap='jet')
        plt.colorbar()
        plt.savefig('data/NMF/{}/estimated-spectrogram-iter{}-base{}.png'.format(metric, iteration, idx), bbox_inches='tight')
        plt.close()

    # Training-loss curve.
    plt.figure()
    plt.plot(nmf.loss)
    plt.savefig('data/NMF/{}/loss.png'.format(metric), bbox_inches='tight')
    plt.close()
if __name__ == '__main__':
    import os
    import numpy as np
    from scipy.signal import resample_poly
    from utils.utils_audio import read_wav, write_wav

    os.makedirs("data/GriffinLim", exist_ok=True)

    torch.manual_seed(111)

    fft_size, hop_size = 1024, 256
    n_basis = 4

    # Resample the 44.1 kHz recording to 16 kHz and save it.
    signal, sr = read_wav("data/man-44100.wav")
    signal = resample_poly(signal, up=16000, down=sr)
    write_wav("data/man-16000.wav", signal=signal, sr=16000)

    T = len(signal)
    signal = torch.Tensor(signal).unsqueeze(dim=0)

    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)

    # STFT -> iSTFT round trip serves as the phase-exact "oracle" reference.
    spectrogram = stft(signal)
    oracle_signal = istft(spectrogram, T=T)
    oracle_signal = oracle_signal.squeeze(dim=0).numpy()
    write_wav("data/man-oracle.wav", signal=oracle_signal, sr=16000)

    griffin_lim = GriffinLim(fft_size, hop_size=hop_size)
    # NOTE(review): this block appears truncated in the visible source —
    # griffin_lim is constructed but never used here, and n_basis is unused.
    # NOTE(review): tail of a function whose definition precedes this view.
    return mask


if __name__ == '__main__':
    import numpy as np
    from scipy.signal import resample_poly
    from utils.utils_audio import read_wav, write_wav
    from stft import BatchSTFT, BatchInvSTFT

    torch.manual_seed(111)

    fft_size, hop_size = 1024, 256
    n_basis = 4

    # Resample both speakers from 44.1 kHz to 16 kHz and save them.
    source1, sr = read_wav("data/man-44100.wav")
    source1 = resample_poly(source1, up=16000, down=sr)
    write_wav("data/man-16000.wav", signal=source1, sr=16000)

    T = len(source1)

    source2, sr = read_wav("data/woman-44100.wav")
    source2 = resample_poly(source2, up=16000, down=sr)
    write_wav("data/woman-16000.wav", signal=source2, sr=16000)

    # Two-speaker instantaneous mixture.
    mixture = source1 + source2
    write_wav("data/mixture-16000.wav", signal=mixture, sr=16000)

    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)

    mixture = torch.Tensor(mixture).unsqueeze(dim=0)
    # NOTE(review): this block appears truncated in the visible source —
    # the mixture tensor is prepared but not consumed here.
def _test(metric='EUC'):
    """Fit numpy NMF to a music spectrogram; save reconstructions and plots."""
    np.random.seed(111)

    fft_size, hop_size = 1024, 256
    n_bases = 6
    iteration = 100

    signal, sr = read_wav("data/single-channel/music-8000.wav")
    T = len(signal)

    spectrogram = stft(signal, fft_size=fft_size, hop_size=hop_size)
    amplitude = np.abs(spectrogram)
    power = amplitude**2

    # Dispatch on the divergence metric; same classes and error as before.
    nmf_classes = {'EUC': EUCNMF, 'IS': ISNMF, 'KL': KLNMF}
    if metric not in nmf_classes:
        raise NotImplementedError("Not support {}-NMF".format(metric))
    nmf = nmf_classes[metric](n_bases)

    nmf.update(power, iteration=iteration)

    amplitude[amplitude < EPS] = EPS  # floor to avoid division by ~0

    def _resynthesize(power_estimate):
        # Mixture-phase reconstruction with peak normalization.
        gain = np.sqrt(power_estimate) / amplitude
        wav = istft(gain * spectrogram, fft_size=fft_size, hop_size=hop_size, length=T)
        return wav / np.abs(wav).max()

    # Full-rank reconstruction from all bases.
    full_power = nmf.base @ nmf.activation
    estimated_signal = _resynthesize(full_power)
    write_wav("data/NMF/{}/music-8000-estimated-iter{}.wav".format(
        metric, iteration), signal=estimated_signal, sr=8000)

    # Save the mixture's log-power spectrogram.
    power[power < EPS] = EPS
    log_spectrogram = 10 * np.log10(power)
    plt.figure()
    plt.pcolormesh(log_spectrogram, cmap='jet')
    plt.colorbar()
    plt.savefig('data/NMF/spectrogram.png', bbox_inches='tight')
    plt.close()

    # Rank-1 reconstruction per basis, plus its log-power spectrogram.
    for idx in range(n_bases):
        rank1_power = nmf.base[:, idx:idx + 1] @ nmf.activation[idx:idx + 1, :]
        estimated_signal = _resynthesize(rank1_power)
        write_wav("data/NMF/{}/music-8000-estimated-iter{}-base{}.wav".format(
            metric, iteration, idx), signal=estimated_signal, sr=8000)

        rank1_power[rank1_power < EPS] = EPS
        log_spectrogram = 10 * np.log10(rank1_power)
        plt.figure()
        plt.pcolormesh(log_spectrogram, cmap='jet')
        plt.colorbar()
        plt.savefig(
            'data/NMF/{}/estimated-spectrogram-iter{}-base{}.png'.format(
                metric, iteration, idx), bbox_inches='tight')
        plt.close()

    # Training-loss curve.
    plt.figure()
    plt.plot(nmf.loss, color='black')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.savefig('data/NMF/{}/loss.png'.format(metric), bbox_inches='tight')
    plt.close()