Example #1
    def __getitem__(self,index):
        while True:
            notnoise = 1
            # Clean files
            if len(self.clean_files) != 0:
                # Randomly sample a clean file
                f = random.choice(self.clean_files)
                fs,audio = read('{}{}/{}_{}.wav'.format(self.clean_root_dir,'clean',f,'clean'))
                audio = audio.astype('float32')
                # Randomly sample a clean clip
                r = random.random()
                if r < self.pure_noise and self.flag == 'train':
                    normalized_clean = torch.zeros(LEN*self.sr).float()
                    notnoise = 0
                else: 
                    start = random.randint(START*fs,len(audio)-LEN*fs)
                    clip = resample(audio[start:start+LEN*fs],fs,self.sr)/1e5

                    if r >= self.pure_noise and np.sum(clip**2) < self.threshold and self.flag == 'train':
                        continue
                    mu, sigma = np.mean(clip), np.std(clip)
                    normalized_clean = torch.from_numpy((clip-mu)/sigma)
                
            # Noise files
            if len(self.noise_files) != 0:
                nf = random.choice(self.noise_files)
                audio_noise, fs = sf.read(nf)
                if len(audio_noise.shape) > 1:
                    audio_noise = np.mean(audio_noise,axis=1)
                audio_noise = audio_noise.astype('float32')
                # Randomly sample a clip of noise
                if len(audio_noise) < LEN*fs: continue
                start = random.randint(0,len(audio_noise)-LEN*fs)
                clip_noise = resample(audio_noise[start:start+LEN*fs],fs,self.sr)
                mu_noise, sigma_noise = np.mean(clip_noise), np.std(clip_noise)
                normalized_noise = torch.from_numpy((clip_noise-mu_noise)/(sigma_noise+EPS))
                
                # Mix the noise with the clean audio clip at given SNR level
                interference = 10**(-self.snr/20)*normalized_noise
                if r < self.pure_noise and self.flag == 'train':
                    mixture = interference
                else:
                    mixture = normalized_clean + interference
                mu_mixture, sigma_mixture = torch.mean(mixture), torch.std(mixture)
                mixture = (mixture-mu_mixture) / sigma_mixture 

            if len(self.noise_files) != 0:
                if self.flag == 'train':
                    return mixture, normalized_clean, notnoise 
                if self.flag == 'test':
                    return mixture, normalized_clean
            return normalized_clean
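
A minimal sketch (not part of the snippet above) checking the SNR convention used in the mixing step: with zero-mean, unit-variance clean and noise clips, scaling the noise by 10**(-snr/20) yields the requested SNR in dB.
import numpy as np

snr_db = 5.0
clean = np.random.randn(16000).astype('float32')   # stand-in for normalized_clean
noise = np.random.randn(16000).astype('float32')   # stand-in for normalized_noise
interference = 10 ** (-snr_db / 20) * noise
measured_snr = 10 * np.log10(np.sum(clean ** 2) / np.sum(interference ** 2))
print(round(measured_snr, 1))  # ~5.0 dB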
Example #2
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    """extract features and save"""
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE,
                        hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(stft(y_vocal, n_fft=C.FFT_SIZE,
                          hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE,
                         hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm

    # np.savez(os.path.join(C.PATH_FFT, fname+".npz"), mix=S_mix, vocal=S_vocal, inst=S_inst)

    # Generate sequence (1,512,128) and save
    cnt = 1
    i = 0
    while i + C.PATCH_LENGTH < S_mix.shape[1]:
        mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        #vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        mix_spec[0, :, :] = S_mix[1:, i:i + C.PATCH_LENGTH]
        #vocal_spec[0, :, :] = S_vocal[1:, i:i + C.PATCH_LENGTH]
        inst_spec[0, :, :] = S_inst[1:, i:i + C.PATCH_LENGTH]

        np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"),
                 data=mix_spec,
                 label=inst_spec)

        i += C.PATCH_LENGTH
        cnt += 1

    if S_mix.shape[1] >= C.PATCH_LENGTH:
        mix_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        #vocal_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        inst_spec = np.zeros((1, 512, C.PATCH_LENGTH), dtype=np.float32)
        mix_spec[0, :, :] = S_mix[1:, S_mix.shape[1] -
                                  C.PATCH_LENGTH:S_mix.shape[1]]
        #vocal_spec[0, :, :] = S_vocal[1:, S_mix.shape[1] - C.PATCH_LENGTH:S_mix.shape[1]]
        inst_spec[0, :, :] = S_inst[1:, S_mix.shape[1] -
                                    C.PATCH_LENGTH:S_mix.shape[1]]

        np.savez(os.path.join(C.VAL_PATH_FFT, fname + str(cnt) + ".npz"),
                 data=mix_spec,
                 label=inst_spec)
        cnt += 1
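
A hedged usage sketch for reading back one of the patch files written above; "some_track1.npz" is a hypothetical name, and the real files live under C.VAL_PATH_FFT.
import numpy as np

patch = np.load("some_track1.npz")          # hypothetical file produced by SaveSpectrogram
mix_patch, inst_patch = patch["data"], patch["label"]
print(mix_patch.shape, inst_patch.shape)    # both (1, 512, C.PATCH_LENGTH), e.g. (1, 512, 128)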
Example #3
 def apply(self, sample, clock=0.0):
     # late binding librosa and its dependencies
     # pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
     import sklearn  # pylint: disable=import-outside-toplevel
     from librosa.core import resample  # pylint: disable=import-outside-toplevel
     sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
     rate = pick_value_from_range(self.rate, clock=clock)
     audio = sample.audio
     orig_len = len(audio)
     audio = np.swapaxes(audio, 0, 1)
     audio = resample(audio, sample.audio_format.rate, rate)
     audio = resample(audio, rate, sample.audio_format.rate)
     audio = np.swapaxes(audio, 0, 1)[0:orig_len]
     sample.audio = audio
Example #4
def validate(hp, args, generator, discriminator, valloader, writer, step):
    generator.eval()
    discriminator.eval()
    torch.backends.cudnn.benchmark = False

    loader = tqdm.tqdm(valloader, desc='Validation loop')
    loss_g_sum = 0.0
    loss_d_sum = 0.0
    for mel, audio in loader:
        mel = mel.cuda()
        audio = audio.cuda()

        # generator
        fake_audio = generator(mel)
        disc_fake = discriminator(fake_audio[:, :, :audio.size(2)])
        disc_real = discriminator(audio)
        loss_g = 0.0
        loss_d = 0.0
        for (feats_fake,
             score_fake), (feats_real,
                           score_real) in zip(disc_fake, disc_real):
            loss_g += torch.mean(
                torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
            for feat_f, feat_r in zip(feats_fake, feats_real):
                loss_g += hp.model.feat_match * torch.mean(
                    torch.abs(feat_f - feat_r))
            loss_d += torch.mean(
                torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2]))
            loss_d += torch.mean(
                torch.sum(torch.pow(score_fake, 2), dim=[1, 2]))

        loss_g_sum += loss_g.item()
        loss_d_sum += loss_d.item()

    loss_g_avg = loss_g_sum / len(valloader.dataset)
    loss_d_avg = loss_d_sum / len(valloader.dataset)

    audio = audio[0][0].cpu().detach().numpy()
    fake_audio = fake_audio[0][0].cpu().detach().numpy()

    audio_16k = resample(audio, hp.audio.sampling_rate, 16000)
    fake_audio_16k = resample(fake_audio, hp.audio.sampling_rate, 16000)
    pesq_score = pesq(16000, audio_16k, fake_audio_16k, 'wb')

    writer.log_validation(loss_g_avg, loss_d_avg, pesq_score, generator,
                          discriminator, audio, fake_audio, step)

    torch.backends.cudnn.benchmark = True
Example #5
 def preprocess_audio(self, x, audio_fps):
     x = ap.to_mono(x.numpy())
     if audio_fps != self.audio_fps:
         x = ap.resample(x, audio_fps, self.audio_fps)
     if x.shape[0] < self.audio_fps:
         x = np.pad(x, (0, self.audio_fps - x.shape[0]))
     return x.reshape((1, -1))
Example #6
def generate_cqt(file_path, st_status):
    st_status.text('Opening {}'.format(file_path))
    data, sample_rate = auto_load(file_path, sr=None)
    print('Sample Rate:', sample_rate, 'shape:', data.shape)

    if len(data.shape) == 2:
        print('Converting to mono channel...')
        data = to_mono(data)

    st_status.text('Resampling to {} Hz...'.format(TARGET_SAMPLE_RATE))
    downsampled_data = resample(data,
                                orig_sr=sample_rate,
                                target_sr=TARGET_SAMPLE_RATE)
    # downsampled_data = data
    st_status.text('Downsampled to {} Hz, shape is now {}'.format(
        TARGET_SAMPLE_RATE, downsampled_data.shape))

    st_status.text('Generating CQT...')
    cqt_result = np.abs(
        cqt(downsampled_data,
            sr=TARGET_SAMPLE_RATE,
            hop_length=HOP_LENGTH,
            n_bins=TOTAL_BINS,
            bins_per_octave=BINS_PER_OCTAVE))

    return cqt_result
Example #7
def generate_cqt(i, file_path, offset=0, duration=None):
    print('[{}] Opening'.format(i), file_path)
    data, sample_rate = load(file_path,
                             sr=None,
                             offset=offset,
                             duration=duration)
    print('[{}] Sample Rate:'.format(i), sample_rate, 'shape:', data.shape)

    if len(data.shape) == 2:
        with Timer('[{}] Converted to mono'.format(i)):
            print('[{}] Converting to mono channel...'.format(i))
            data = to_mono(data)

    with Timer('[{}] Resampling'.format(i)):
        print('[{}] Resampling to'.format(i), TARGET_SAMPLE_RATE, 'Hz...')
        downsampled_data = resample(data,
                                    orig_sr=sample_rate,
                                    target_sr=TARGET_SAMPLE_RATE)
        # downsampled_data = data
        print('[{}] Downsampled to'.format(i), TARGET_SAMPLE_RATE,
              'Hz shape is now', downsampled_data.shape)

    with Timer('[{}] CQT'.format(i)):
        print('[{}] Generating CQT...'.format(i))
        cqt_result = np.abs(
            cqt(downsampled_data,
                sr=TARGET_SAMPLE_RATE,
                hop_length=HOP_LENGTH,
                n_bins=TOTAL_BINS,
                bins_per_octave=BINS_PER_OCTAVE))

    return cqt_result
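
A hedged usage sketch, assuming HOP_LENGTH, TOTAL_BINS and BINS_PER_OCTAVE are defined at module level as the function expects; the returned magnitude CQT has shape (TOTAL_BINS, n_frames).
cqt_mag = generate_cqt(0, "track.wav", offset=30.0, duration=10.0)  # "track.wav" is a placeholder path
print(cqt_mag.shape)  # (TOTAL_BINS, number of hops in the 10 s excerpt)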
Example #8
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE = 1024, H = 512):
    
    if os.path.isdir( PATH_INPUT):
        # If the input is a directory, build the file list
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # If the input is a single file
        filelist_mixdown=[PATH_INPUT]
    print ('number of mixdown file', len(filelist_mixdown))
    
    # Create the output directory if it does not exist.
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print ('path_output_ext',path_output_ext)
    if len(path_output_ext)==0  and  not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)
    
    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz( MODEL,unet)
    config.train = False
    config.enable_backprop = False
    
    # Read each mixture and try to separate the vocal (speech)
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy.
        try:
            y_mixdown, _ = load(fmixdown,  sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)
        
        # Compute the short-time spectrum of the input and normalize it.
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j*np.angle(spec))
        print ('mag.shape', mag.shape)  
        start = 0
        end = 128 * (mag.shape[1] // 128)  # must stay within the number of input frames; pick an appropriate value for the network definition.
        # Estimate the mask used to separate speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))
        # Apply the mask to the input short-time spectrum and synthesize the waveform with the inverse STFT.
        mag2=mag[:, start:end]*mask 
        phase2=phase[:, start:end]
        y = istft(mag2*phase2, hop_length=H, win_length=FFT_SIZE)
        
        # Save the separated speech (vocal) to an output file.
        if len(path_output_ext)==0:
            # Output to the directory
            foutname, _ = os.path.splitext( os.path.basename(fmixdown) )
            fname= os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the specified file
            fname= PATH_OUTPUT
        print ('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
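
A hedged call of the function above, assuming a trained Chainer U-Net checkpoint is available; the paths are placeholders.
separate("mixdown.wav", "separated/", "unet.model", SR=16000, FFT_SIZE=1024, H=512)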
Example #9
def LoadAudio(fname):
    y, sr = load(fname, sr=None)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase
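
A hedged sketch of how the returned pair is typically consumed, assuming istft is imported from librosa alongside stft: the magnitude can be masked and then recombined with the phase for resynthesis.
mag, phase = LoadAudio("mix.wav")                                   # "mix.wav" is a placeholder path
y_rec = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE)   # reconstruction without masking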
Example #10
def transpose(audio_data, rate=1.0, mul=1.0, **kwargs):
    """
    see https://librosa.github.io/librosa/generated/librosa.core.resample.html
    """
    return resample(
        y=audio_data,
        orig_sr=44100.0,  # dummy value for resampler
        target_sr=44100.0 / rate,
        **kwargs) * mul
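
A hedged usage sketch: only the orig_sr/target_sr ratio matters to the resampler, so rate=2.0 halves the number of samples and the clip plays back one octave higher at the original 44.1 kHz rate.
import numpy as np

tone = np.sin(2 * np.pi * 440 * np.arange(44100) / 44100).astype(np.float32)  # 1 s of A4
octave_up = transpose(tone, rate=2.0)
print(len(tone), len(octave_up))  # 44100 vs. roughly 22050 samples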
Example #11
 def apply(self, sample, clock=0.0):
     # late binding librosa and its dependencies
     # pre-importing sklearn fixes https://github.com/scikit-learn/scikit-learn/issues/14485
     from librosa.core import resample  # pylint: disable=import-outside-toplevel
     sample.change_audio_type(new_audio_type=AUDIO_TYPE_NP)
     rate = pick_value_from_range(self.rate, clock=clock)
     audio = sample.audio
     orig_len = len(audio)
     audio = np.swapaxes(audio, 0, 1)
     if audio.shape[0] < 2:
         # since v0.8 librosa enforces a shape of (samples,) instead of (channels, samples) for mono samples
         resampled = resample(audio[0], sample.audio_format.rate, rate)
         audio[0] = resample(resampled, rate, sample.audio_format.rate)[:orig_len]
     else:
         audio = resample(audio, sample.audio_format.rate, rate)
         audio = resample(audio, rate, sample.audio_format.rate)
     audio = np.swapaxes(audio, 0, 1)[0:orig_len]
     sample.audio = audio
Example #12
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(
        stft(y_vocal, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(
        stft(y_inst, n_fft=C.FFT_SIZE, hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm

    np.savez(os.path.join(C.PATH_FFT, fname+".npz"),
             mix=S_mix, vocal=S_vocal, inst=S_inst)
Example #13
    def __getitem__(self,index):
        while True:
            notnoise = 1
            # Randomly sample a file
            f = random.choice(self.files)
            fs, audio = read('{}{}/{}_{}.wav'.format(self.root_dir,self.version[0],f,self.version[0]))
            audio = audio.astype('float32')
            # Randomly sample a clip
            r = random.random()
            is_silence = False
            if r < self.pure_noise and self.flag == 'train':
                start = random.randint(0, START*fs-LEN*fs)
                is_silence = True
                notnoise = 0
            else: 
                start = random.randint(START*fs,len(audio)-LEN*fs)
            # Resample the clip
            clip = resample(audio[start:start+LEN*fs],fs,self.sr) / 1e5
            # Thresholding: discard clip if the clip contains too much silence
            if not is_silence and np.sum(clip**2) < self.threshold:
                continue
            # Normalize the clip
            mu, sigma = np.mean(clip), np.std(clip)
            normalized = torch.from_numpy((clip-mu)/sigma)

            if len(self.version) > 1:
                fs, audio_clean = read('{}{}/{}_{}.wav'.format(self.root_dir,self.version[1],f,self.version[1]))
                audio_clean = audio_clean.astype('float32')
                # Extract the corresponding clean clip
                if is_silence:
                    normalized_clean = torch.zeros(LEN*self.sr).float()
                else:
                    clip_clean = resample(audio_clean[start:start+LEN*fs],fs,self.sr)
                    mu_clean, sigma_clean = np.mean(clip_clean), np.std(clip_clean)
                    normalized_clean = torch.from_numpy((clip_clean-mu_clean)/sigma_clean)
                
                if self.flag == 'train':
                    return normalized, normalized_clean, notnoise
                else:
                    return normalized, normalized_clean
            
            return normalized
Example #14
def LoadAudio_Arg(fname, shift_steps, stretch_rate):
    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    # arguments renamed so they no longer shadow librosa's pitch_shift / time_stretch
    y = pitch_shift(y, C.SR, shift_steps)
    y = time_stretch(y, stretch_rate)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase
Example #15
def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    wav = resample(wav.astype(np.float32), rate, sr)
    wav = wav.astype(np.int16)
    # check for stereo and convert to mono if necessary
    try:
        tmp = wav.shape[1]
        wav = (wav[:, 0] + wav[:, 1]) / 2
    except:
        pass
    return sr, wav
Example #16
def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    wav = wav.astype(np.float32, order='F')
    try:
        tmp = wav.shape[1]
        wav = to_mono(wav.T)
    except:
        pass
    wav = resample(wav, rate, sr)
    wav = wav.astype(np.int16)
    return sr, wav
Example #17
def SaveSpectrogram(y_mix, y_vocal, y_inst, fname, original_sr=44100):
    y_mix = resample(y_mix, original_sr, C.SR)
    y_vocal = resample(y_vocal, original_sr, C.SR)
    y_inst = resample(y_inst, original_sr, C.SR)

    S_mix = np.abs(stft(y_mix, n_fft=C.FFT_SIZE,
                        hop_length=C.H)).astype(np.float32)
    S_vocal = np.abs(stft(y_vocal, n_fft=C.FFT_SIZE,
                          hop_length=C.H)).astype(np.float32)
    S_inst = np.abs(stft(y_inst, n_fft=C.FFT_SIZE,
                         hop_length=C.H)).astype(np.float32)

    norm = S_mix.max()
    S_mix /= norm
    S_vocal /= norm
    S_inst /= norm

    np.savez(os.path.join(C.PATH_FFT, fname + ".npz"),
             mix=S_mix,
             vocal=S_vocal,
             inst=S_inst)
Example #18
def from_flac_to_tfrecords(train_r=0.8, valid_test_r=0.2):
	# Extract the information about this subset (speakers, chapters)
	# Dictionary with the following shape: 
	# {speaker_key: {chapters: [...], sex:'M/F', ... } }
	folder = config.data_root+'/'+config.data_subset
	speakers_info = data_tools.read_metadata(config.data_subset)
	keys_to_index = {}
	for i, key in enumerate(speakers_info.keys()):
		keys_to_index[key] = i

	sex = ['M' for i in range(len(speakers_info))]
	for k, v in speakers_info.items():
		i = keys_to_index[k]
		sex[i] = v['sex']

	np.save('genders_index.arr', sex)
	# exit()

	allfiles = np.array([os.path.join(r,f) for r,dirs,files in os.walk(folder) for f in files if f.endswith(".flac")])
	L = len(allfiles)
	np.random.shuffle(allfiles)
	train = allfiles[:int(L*train_r)]
	valid = allfiles[int(L*train_r):int(L*(train_r+valid_test_r/2))]
	test = allfiles[int(L*(train_r+valid_test_r/2)):]
	
	print(len(train), len(valid), len(test))

	for group_name, data_split in [("train", train),("test", test), ("valid", valid)]:

		for s in ['M', 'F']:

			writer = tf.python_io.TFRecordWriter(group_name + '_' + s +'.tfrecords')

			for file in data_split:

				splits = file.split('/')
				key = splits[-3]
				sex = speakers_info[key]['sex']

				if sex == s:

					raw_audio, sr = load(file, sr=16000)
					raw_audio = resample(raw_audio, sr, config.fs)
					raw_audio = raw_audio.astype(np.float32).tostring()

					feature = tf.train.Example(features=tf.train.Features(
						feature = { 'audio' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_audio])),
									'key' : tf.train.Feature(int64_list=tf.train.Int64List(value=[keys_to_index[key]]))
					}))
					print(group_name, s, key, keys_to_index[key])
					writer.write(feature.SerializeToString())

			writer.close()
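
A hedged sketch of reading one of the files written above back with the same TF 1.x API the writer uses; filenames follow the group/sex pattern ('train_M.tfrecords', etc.).
import numpy as np
import tensorflow as tf

for rec in tf.python_io.tf_record_iterator('train_M.tfrecords'):
    ex = tf.train.Example()
    ex.ParseFromString(rec)
    audio = np.frombuffer(ex.features.feature['audio'].bytes_list.value[0], dtype=np.float32)
    speaker = ex.features.feature['key'].int64_list.value[0]
    print(speaker, audio.shape)
    break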
Example #19
def save_mp3(path, y, sr):
    if sr != 44100:
        y = core.resample(y, sr, 44100)

    # scale to just under full-scale int16 and pass raw PCM bytes to pydub
    yint = (y / np.max(np.abs(y)) * 0.49 * (2**16)).astype(np.int16)
    audio = AudioSegment(data=yint.tobytes(),
                         frame_rate=44100,
                         sample_width=2,
                         channels=1)
    audio.export(path + '.mp3', format='mp3')

    return
Example #20
def downsample_mono(path, sr):
    obj = wavio.read(path)
    wav = obj.data.astype(np.float32, order='F')
    rate = obj.rate
    try:
        # tmp = wav.shape[1]
        wav = to_mono(wav.T)
    except:
        pass
    wav = resample(wav, rate, sr)
    wav = wav.astype(np.int16)
    return sr, wav
Example #21
def preprocess_noise(noise_audio, fs_noise, fs):
    """Resample a noise signal to the target sampling rate.

    Args:
        noise_audio: Noise waveform.
        fs_noise: Sampling rate of the noise waveform.
        fs: Target sampling rate.

    Returns:
        The noise waveform resampled to fs.
    """
    # Resample (e.g. downsample to 16 kHz) only when the rates differ
    if fs != fs_noise:
        noise_audio_resamp = resample(noise_audio, fs_noise, fs)
    else:
        noise_audio_resamp = noise_audio

    return noise_audio_resamp
Example #22
def set_samplerate(datadir, words, samplerate):

    # only check samplerate for files in word list
    wav_files = [f for word in words for f in glob.glob(os.path.join(datadir, word, '*.wav'))]

    print(f'Verifying data is at {samplerate} Hz')
    for wav_file in tqdm(wav_files):
        samples, sr = sf.read(wav_file)
        
        if sr != samplerate:
            samples = resample(samples, sr, samplerate)
            sf.write(wav_file, samples, samplerate)    
        
    print(f'All Data sampled at {samplerate} Hz')
Example #23
def downsample_mono(path, sr):
    rate, wav = wavfile.read(path)
    # print(type(wav))
    # np.asfortranarray(wav[0,0:Length-1,:].copy())
    wav = resample(wav.astype(np.float32), rate, sr)
    # print(type(wav))
    wav = wav.astype(np.int16)
    # check for stereo and convert to mono if necessary
    try:
        tmp = wav.shape[1]
        wav = (wav[:, 0] + wav[:, 1]) / 2
    except:
        pass
    return sr, wav
Example #24
def SaveSpectrogramA(y_mix,
                     y_target,
                     fname,
                     original_sr=44100,
                     generate_high_data=False):
    if original_sr != C.SR:
        y_mix = resample(y_mix, original_sr, C.SR)
        y_target = resample(y_target, original_sr, C.SR)

    S_mix = np.abs(
        stft(y_mix, n_fft=C.FFT_SIZE, hop_length=C.H,
             window=C.WINDOW)).astype(np.float32)
    S_target = np.abs(
        stft(y_target, n_fft=C.FFT_SIZE, hop_length=C.H,
             window=C.WINDOW)).astype(np.float32)

    S_mix_low = S_mix[:C.SP]
    S_target_low = S_target[:C.SP]
    norm = S_mix_low.max()
    S_mix_low /= norm
    S_target_low /= norm

    np.savez(C.PATH_TRAINDATA / (fname + "-low.npz"),
             mix=S_mix_low,
             target=S_target_low)
    del S_mix_low, S_target_low

    if generate_high_data:
        S_mix_high = S_mix[C.SP:C.SP + C.SP]
        S_target_high = S_target[C.SP:C.SP + C.SP]
        norm = S_mix_high.max()
        S_mix_high /= norm
        S_target_high /= norm

        np.savez(C.PATH_TRAINDATA / (fname + "-high.npz"),
                 mix=S_mix_high,
                 target=S_target_high)
Example #25
    def save_as_torch_file(self):
        """Download the yesno data if it doesn't exist in processed_folder already."""
        import tarfile

        if self._check_exists():
            return

        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        # dset_abs_path = os.path.join(
        # self.root, self.raw_folder)

        dset_abs_path = YOUTUBE_PIANOS_RAW
        # process and save as torch files
        print('Processing...')
        # shutil.copyfile(
        #     os.path.join(dset_abs_path, "README"),
        #     os.path.join(processed_abs_dir, "YESNO_README")
        # )
        audios = [x for x in os.listdir(dset_abs_path) if ".wav" in x]
        print("Found {} audio files".format(len(audios)))
        tensors = []
        labels = []
        lengths = []
        for i, f in enumerate(audios):
            if i >= self.dataset_size:
                break
            print("Reading: {0}".format(f))
            full_path = os.path.join(dset_abs_path, f)

            sig, sr = read_audio(full_path, 44100)

            sig = resample(sig.numpy(), sr, self.sample_rate)
            sig = sig.reshape((1, -1))

            sig = torch.FloatTensor(sig)

            tensors.append(sig)
            lengths.append(sig.size(1))

            labels.append(os.path.basename(f).split(".", 1)[0].split("_"))
        # sort sigs/labels: longest -> shortest
        tensors, labels = zip(*[(b, c) for (a, b, c) in sorted(
            zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)])
        self.max_len = tensors[0].size(1)

        torch.save((tensors, labels),
                   os.path.join(self.processed_folder, self.processed_file))
        print('Done!')
Example #26
    def create_raw_audio_dataset(output_fn,
                                 subset=config.data_subset,
                                 data_root=config.data_root):
        """
		Create a H5 file from the LibriSpeech dataset and the subset given:

		Inputs:
			output_fn: filename for the created file
			subset: LibriSpeech subset : 'dev-clean' , ...
			data_root: LibriSpeech folder path

		"""
        from librosa.core import resample, load

        # Extract the information about this subset (speakers, chapters)
        # Dictionary with the following shape:
        # {speaker_key: {chapters: [...], sex:'M/F', ... } }
        speakers_info = data_tools.read_metadata(subset)
        with h5py.File(output_fn, 'w') as data_file:

            for key, elements in tqdm(speakers_info.items(),
                                      total=len(speakers_info),
                                      desc='Speakers'):
                if key not in data_file:
                    # Create an H5 Group for each key/speaker
                    data_file.create_group(key)

                # Current speaker folder path
                folder = data_root + '/' + subset + '/' + key
                # For all the chapters read by this speaker
                for i, chapter in enumerate(
                        tqdm(elements['chapters'], desc='Chapters')):
                    # Find all .flac audio
                    for root, dirs, files in os.walk(folder + '/' + chapter):
                        for file in tqdm(files, desc='Files'):
                            if file.endswith(".flac"):
                                path = os.path.join(root, file)
                                raw_audio, sr = load(path, sr=16000)
                                raw_audio = resample(raw_audio, sr, config.fs)
                                data_file[key].create_dataset(
                                    file,
                                    shape=raw_audio.shape,
                                    data=raw_audio,
                                    chunks=raw_audio.shape,
                                    maxshape=raw_audio.shape,
                                    compression="gzip",
                                    compression_opts=9)

        print('Dataset for the subset: ' + subset + ' has been built')
Example #27
def downsample(path, down_sample):
    sample_rate, wave = wavfile.read(path)
    wave = wave.astype(np.float32, order='F')

    ##    wave, sample_rate = librosa.load(path, sr = args.down_sample, mono=True)

    try:
        tmp = wave.shape[1]
        wave = to_mono(wave.T)
    except:
        pass
    wave = resample(wave, sample_rate, down_sample)
    wave = wave.astype(np.int16)

    return wave, down_sample
Example #28
def read_wav(wav_path, fs):
    """Read a single-channel wav file from given path. Perform resampling and amp normalization

    :param wav_path: Path where the single-channel wav file is located
    :param fs: Desired sampling rate
    :return: Amp normalized wav at specified sampling rate
    """

    fs_wav, wav = wavfile.read(wav_path)
    wav = wav / np.max(np.abs(wav))

    if fs_wav != fs:
        warnings.warn("Sampling rate of wav file is {} Hz, not {} Hz; resampling.".format(fs_wav, fs))
        wav = resample(wav, fs_wav, fs)
        fs_wav = fs

    return fs_wav, wav / np.max(np.abs(wav))
Example #29
def read_test_data():
    data = np.empty((4512, 862, 40))
    for i in np.arange(4512):
        audio = np.load('audio/' + str(i) + '.npy')
        # Resampling to 44100
        audio = core.resample(audio, orig_sr=48000, target_sr=44100)
        audio = audio * 1 / np.max(np.abs(audio))
        spec = melspectrogram(y=audio,
                              sr=44100,
                              n_fft=1024,
                              hop_length=512,
                              n_mels=40)
        spec = spec.T
        data[i, :, :] = spec

    return data
Example #30
def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    # # check data
    # if data.dtype.kind == 'i':
    #     if data.max() > 2**15 - 1 or data.min() < -2**15:
    #         raise ValueError(
    #             'When data.type is int, data must be -32768 < data < 32767.')
    #     data = data.astype('f') / 2.0**15

    # elif data.dtype.kind == 'f':
    #     if np.abs(data).max() > 1:
    #         raise ValueError(
    #             'When data.type is float, data must be -1.0 <= data <= 1.0.')
    #     data = data.astype('f')

    # else:
    #     raise ValueError('data.dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
        if np.abs(resampled).max() > 1.0:
            resampled *= (0.99 / np.abs(resampled).max())
            # warn('Resampling causes data clipping. data was rescaled.')
    else:
        resampled = data

    resampled = (resampled * 2.0**15).astype('int16')

    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]
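
A hedged usage sketch for the helper above: webrtcvad only accepts 8/16/32/48 kHz internally, so fs_vad must be one of those; hop_length is in milliseconds, and the returned array flags each input sample as speech (1) or non-speech (0).
import soundfile as sf

data, fs = sf.read("speech.wav")                      # placeholder mono recording
va = vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=2)
print(round(va.mean(), 2))                            # fraction of samples flagged as speech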
Example #31
def load_audio_file(file_path):
    data_l = os.listdir(file_path)
    data_ap = []          # list of loaded waveforms (was used below but never initialized)
    input_length = 16000
    x = 1
    for i in data_l:
        rate, data_in = wavfile.read(os.path.join(file_path, i))
        data_in = data_in.astype(np.float32, order='F')
        try:
            tmp = data_in.shape[1]
            data_in = to_mono(data_in.T)
        except:
            pass
        data_in = resample(data_in, rate, 16000)
        data_in = data_in.astype(np.float32)
        data_ap.append(data_in)
        x += 1
    return data_ap