def synthesize_pure_tone_segment(duration_s: float, fs: float, ft: float, dBFS=-6.0, sample_width=2) -> audiosegment.AudioSegment:
    """
    Synthesize a pure tone of `ft` Hz, sampled at `fs` samples per second,
    of duration `duration_s` seconds. Return an AudioSegment.
    """
    def dtype(arr):
        if sample_width == 1:
            return np.int8(arr)
        elif sample_width == 2:
            return np.int16(arr)
        elif sample_width == 4:
            return np.int32(arr)
        else:
            raise ValueError("Sample width of {} is not allowed.".format(sample_width))

    pure_tone = 100 * synthesize_pure_tone_array(duration_s, fs, ft)
    pure_seg = audiosegment.from_numpy_array(dtype(pure_tone), fs)
    curdb = pure_seg.dBFS
    pure_seg += (dBFS - curdb)
    return pure_seg
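# A minimal usage sketch for the helper above. It assumes the helper and
# `synthesize_pure_tone_array` are importable from the same `common` module
# (names taken from the snippets here, not verified): synthesize a 440 Hz tone
# and round-trip it through a numpy array.
import numpy as np
import audiosegment

seg = synthesize_pure_tone_segment(duration_s=1.0, fs=16000, ft=440.0, dBFS=-6.0)
arr = seg.to_numpy_array()                      # int16 samples, shape (nsamples,)
roundtrip = audiosegment.from_numpy_array(arr, seg.frame_rate)
assert roundtrip.raw_data == seg.raw_data       # expected to be a lossless round trip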
def test_stereo_to_and_from_numpy_array(self):
    """
    Tests that we can convert a stereo file to a numpy array and then back
    again without any changes.
    """
    before = audiosegment.from_file("stereo_furelise.wav")
    arr = before.to_numpy_array()
    after = audiosegment.from_numpy_array(arr, before.frame_rate)
    self.assertEqual(before.sample_width, after.sample_width)
    self.assertEqual(before.duration_seconds, after.duration_seconds)
    self.assertEqual(before.channels, after.channels)
    self.assertSequenceEqual(before.raw_data, after.raw_data)
    self.assertTrue(common.is_playable(after))
def test_stereo_from_numpy_array(self):
    """
    Test that we can create and play a stereo numpy array.
    """
    duration_s = 2.0
    fs = 16000
    tone_one = 100 * common.synthesize_pure_tone_array(duration_s, fs, ft=3200)
    tone_two = 100 * common.synthesize_pure_tone_array(duration_s, fs, ft=2800)
    # Interleave the two tones channel-wise into shape (nsamples, 2);
    # reshaping the (2, nsamples) array directly would scramble the channels.
    stereo_arr = np.stack([tone_one, tone_two], axis=1).astype(np.int16)
    stereo_seg = audiosegment.from_numpy_array(stereo_arr, fs)
    self.assertTrue(common.is_playable(stereo_seg))
def _test_create_file_from_n_segments(self, mono: audiosegment.AudioSegment, nchannels: int):
    """
    Duplicate a mono segment into `nchannels` identical channels and check the
    resulting multi-channel segment against the original.
    """
    arr = mono.to_numpy_array()
    arr_multi = np.tile(arr, (nchannels, 1)).T
    multi = audiosegment.from_numpy_array(arr_multi, mono.frame_rate)
    self.assertEqual(multi.channels, nchannels)
    self.assertEqual(multi.duration_seconds, mono.duration_seconds)
    self.assertEqual(multi.frame_rate, mono.frame_rate)
    return multi
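# A standalone sketch of the multi-channel layout the tests above pass to
# audiosegment.from_numpy_array: samples along axis 0, one column per channel.
# It mirrors the np.tile(...).T pattern in _test_create_file_from_n_segments;
# that layout is an assumption drawn from these tests, not from library docs.
import numpy as np
import audiosegment

fs = 16000
t = np.arange(int(fs * 1.0)) / fs
left = (1000 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)
right = (1000 * np.sin(2 * np.pi * 880 * t)).astype(np.int16)
stereo = np.stack([left, right], axis=1)          # shape (nsamples, 2)
seg = audiosegment.from_numpy_array(stereo, fs)   # two-channel segment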
def test_mono_to_and_from(self):
    """
    Test that a mono file converts to a numpy array and back again without any change.
    """
    seg = audiosegment.from_file("furelise.wav")
    for width in (1, 2, 4):
        with self.subTest(width):
            seg = seg.resample(sample_width=width)
            arr = seg.to_numpy_array()
            seg = audiosegment.from_numpy_array(arr, seg.frame_rate)
            nsamples = int(round(seg.frame_rate * seg.duration_seconds))
            self.assertEqual(seg.sample_width, self._look_up_sample_width(arr.dtype))
            self.assertEqual(arr.shape, (nsamples,))
            self._check_underlying_data(seg, arr)
            self.assertTrue(common.is_playable(seg))
def helper(data, name, hp, store_path):
    if not os.path.exists(store_path):
        os.makedirs(store_path, exist_ok=True)
    # Save the input spectrogram as an image.
    spectrogram = plot_spectrogram_to_numpy(data[0].cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, name + '.png'), spectrogram.transpose((1, 2, 0)))
    # Invert the spectrogram to a waveform (gradient-based reconstruction).
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(data[0], iters=2000)
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, 'Final ' + name + '.png'), wavespec.transpose((1, 2, 0)))
    # Scale the waveform to the int16 range and export it as a wav file.
    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32768 / waveform.max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(store_path, name + '.wav'), format='wav')
def test_mono_from_numpy_array(self):
    """
    Test that creating a mono audio segment from a numpy array produces the segment we expect.
    """
    duration_s = 3.5
    fs = 32000
    ftone = 4000
    arr = np.int16(100 * common.synthesize_pure_tone_array(duration_s, fs, ftone))
    seg = audiosegment.from_numpy_array(arr, fs)
    sample_width = self._look_up_sample_width(arr.dtype)
    nsamples = int(round(seg.frame_rate * seg.duration_seconds))
    self.assertEqual(seg.sample_width, sample_width)
    self.assertEqual(nsamples, len(arr))
    self.assertEqual(arr.shape, (nsamples,))
    self._check_underlying_data(seg, arr)
    self.assertTrue(common.is_playable(seg))
def store(generated, path, hp, idx, class_label):
    if not os.path.exists(path):
        os.makedirs(path)
    # Save the raw generated tensor and its spectrogram image.
    torch.save(generated, os.path.join(path, '{}_{}.pt'.format(class_label, idx)))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join(path, '{}_{}.png'.format(class_label, idx)),
               spectrogram.transpose((1, 2, 0)))
    # Invert the spectrogram to a waveform (gradient-based reconstruction).
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(path, 'Final {}_{}.png'.format(class_label, idx)),
               wavespec.transpose((1, 2, 0)))
    # Scale the waveform to the int16 range and export it as a wav file.
    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32768 / waveform.max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(path, '{}_{}.wav'.format(class_label, idx)), format='wav')
infer_hp = HParam(args.infer_config)

assert args.timestep % t_div[hp.model.tier] == 0, \
    "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

model = MelNet(hp, args, infer_hp).cuda()
model.load_tiers()
model.eval()

# Sample a spectrogram from the model and save it as a tensor and an image.
with torch.no_grad():
    generated = model.sample(args.input)

os.makedirs('temp', exist_ok=True)
torch.save(generated, os.path.join('temp', args.name + '.pt'))
spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
plt.imsave(os.path.join('temp', args.name + '.png'), spectrogram.transpose((1, 2, 0)))

# Invert the spectrogram to a waveform and export it as a wav file.
waveform, wavespec = Reconstruct(hp).inverse(generated[0])
wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'), wavespec.transpose((1, 2, 0)))
waveform = waveform.unsqueeze(-1)
waveform = waveform.cpu().detach().numpy()
waveform *= 32768 / waveform.max()
waveform = waveform.astype(np.int16)
audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
audio.export(os.path.join('temp', args.name + '.wav'), format='wav')
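# The three snippets above repeat the same float-waveform-to-wav export steps.
# A small helper capturing that pattern is sketched below; the function name and
# the 32767 peak target are choices made here, not part of the original code
# (which scales to 32768 before casting, wrapping the single peak sample).
import numpy as np
import audiosegment

def export_waveform_as_wav(waveform: np.ndarray, sample_rate: int, out_path: str) -> None:
    """Peak-normalize a float waveform to the int16 range and write it as a wav file."""
    waveform = np.asarray(waveform, dtype=np.float64).reshape(-1, 1)
    peak = np.abs(waveform).max()
    if peak > 0:
        waveform = waveform * (32767.0 / peak)
    audio = audiosegment.from_numpy_array(waveform.astype(np.int16), sample_rate)
    audio.export(out_path, format='wav')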
# 1875, 1878, 1880, 1883, 1884, 1886, 1888, 1890, 1892, 1893, 1930, 1931, 1932, 1969,
# 1970, 1971, 1975, 1976, 1977, 1979, 1980, 1981, 1984, 1985, 1986, 1987, 1988, 1989,
# 1990, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2002, 2003, 2076, 2106, 2110,
# 2177, 2178, 2179, 2180, 2206, 2241, 2242, 2243, 2245, 2246, 2253, 2254, 2262, 2263,
# 2357, 2358, 2359, 2362, 2368, 2373, 2374, 2418, 2523, 2525, 2526, 2534, 2539, 2542,
# 2549, 2552, 2553, 2554, 2555, 2556, 2561, 2562, 2563, 2564, 2578, 2670, 2671, 2672,
# 2692, 2694, 2695, 2728, 2733, 2889, 2890, 3034, 3304, 3511, 3524, 3525, 3528, 3655,
# 3802, 3864, 3930, 4038, 4049, 4051, 4061, 4193, 4241, 4301, 4302, 4307, 4569, 4570), 0)

# Doing silence removal; only a threshold under 0.01 works.
voiced_feat = []
duration = 0.1    # 0.1, 0.06, 0.01
threshold = 0.1   # 0.1, 0.07, 0.01
for i in range(len(speech)):
    x_head = data2[i]['signal']
    #x_head = speech[i]
    seg = audiosegment.from_numpy_array(speech[i], framerate)
    seg = seg.filter_silence(duration_s=duration, threshold_percentage=threshold)
    st_features = calculate_features(seg.to_numpy_array(), framerate, None)
    #st_features = calculate_features(x_head, framerate, None)
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)
    voiced_feat.append(st_features.T)
    if i % 100 == 0:
        print(i)
voiced_feat = np.array(voiced_feat)
voiced_feat.shape
np.save('featAS/voiced_feat_file_01_01.npy', voiced_feat)
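# A self-contained sketch of the silence-removal step used in the loop above,
# run on a synthetic signal instead of the `speech`/`data2` arrays (which are
# not defined in this snippet). The duration/threshold values mirror the ones
# above, and the same audiosegment setup (including its sox dependency for
# filter_silence) is assumed.
import numpy as np
import audiosegment

framerate = 16000
t = np.arange(framerate * 2) / framerate
signal = np.zeros_like(t)
# Half a second of 440 Hz tone surrounded by silence.
signal[framerate // 2: framerate] = np.sin(2 * np.pi * 440 * t[framerate // 2: framerate])
seg = audiosegment.from_numpy_array((1000 * signal).astype(np.int16), framerate)
voiced = seg.filter_silence(duration_s=0.1, threshold_percentage=0.1)
print(seg.duration_seconds, voiced.duration_seconds)  # the filtered segment should be shorter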