def test_sph_files(self, file, fails):
    """Check how ``load_audio`` copes with a SPHERE file.

    Some SPHERE files can be read with soundfile, but not all.

    Args:
        file: Name handed to ``get_file_path`` to locate the test file.
        fails: If truthy, loading is expected to raise ``RuntimeError``;
            otherwise loading only has to complete without an exception.
    """
    sph_path = get_file_path(file)

    if not fails:
        # Readable file: the call merely has to succeed.
        load_audio(sph_path)
        return

    with pytest.raises(RuntimeError):
        load_audio(sph_path)
def test_dtype(
        self,
        array_dtype,
        dump_type,
        dumped_type,
        load_type,
        loaded_dtype,
):
    """Check the dtype handling of the audio dump/load round trip.

    The same samples are written once to a file (``dump_audio``) and once
    to an in-memory buffer (``dumps_audio``) and read back with
    ``load_audio`` in both cases.

    Args:
        array_dtype: dtype of the array that is written.
        dump_type: ``dtype`` argument passed to ``dump_audio`` and
            ``dumps_audio``.
        dumped_type: Value ``get_audio_type`` is expected to report for the
            written file.
        load_type: ``dtype`` argument passed to ``load_audio``.
        loaded_dtype: dtype the loaded array is expected to have.
    """
    a = np.array([1, 2, -4, 4], dtype=array_dtype)

    # On-disk round trip. ``path`` is defined outside of this method.
    dump_audio(a, path, dtype=dump_type, normalize=False)
    assert get_audio_type(path) == dumped_type
    b = load_audio(path, dtype=load_type)
    assert b.dtype == loaded_dtype

    # In-memory round trip with identical dump/load arguments.
    content = io.BytesIO(dumps_audio(a, dtype=dump_type, normalize=False))
    c = load_audio(content, dtype=load_type)
    # Previously ``c`` was loaded but never checked, so this half of the test
    # could not fail. It has to match the on-disk result in dtype and values.
    assert c.dtype == loaded_dtype
    np.testing.assert_array_equal(c, b)
def test_default_wo_normalize(self):
    """Check the default dump/load behaviour when normalization is off.

    int16 samples written with ``normalize=False`` must be stored as
    ``"PCM_16"`` and come back as float64 values equal to the original
    samples divided by 2**15.
    """
    samples = np.array([1, 2, -4, 4], dtype=np.int16)
    expected = samples / 2**15

    # ``path`` is defined outside of this method.
    dump_audio(samples, path, normalize=False)
    assert get_audio_type(path) == "PCM_16"

    loaded = load_audio(path)
    assert loaded.dtype == np.float64
    np.testing.assert_allclose(loaded, expected)
def pre_batch_transform(inputs):
    """Load the audio of one example and assemble the dict used for batching.

    Args:
        inputs: Mapping with the keys ``'audio_path'``, ``'num_samples'``
            and ``'example_id'``. ``inputs['audio_path']`` must provide
            ``'speech_source'`` (an iterable of paths) and
            ``'observation'`` (a single path).

    Returns:
        Dict with the loaded speech sources under ``'s'`` and the loaded
        observation under ``'y'`` (both contiguous float32 arrays) plus the
        unchanged ``'num_samples'``, ``'example_id'`` and ``'audio_path'``
        entries of ``inputs``.
    """
    audio_paths = inputs['audio_path']

    # One entry per speech source path, converted as a whole to float32.
    speech_sources = np.ascontiguousarray(
        [load_audio(source_path)
         for source_path in audio_paths['speech_source']],
        np.float32,
    )
    observation = np.ascontiguousarray(
        load_audio(audio_paths['observation']), np.float32
    )

    return {
        's': speech_sources,
        'y': observation,
        'num_samples': inputs['num_samples'],
        'example_id': inputs['example_id'],
        'audio_path': audio_paths,
    }
def setUp(self):
    """Prepare the test signal and the STFT module shared by the tests.

    Loads ``sample.wav``, keeps it as NumPy array (``self.time_signal``) and
    as torch tensor (``self.torch_signal``), and builds an ``STFT`` from the
    ``size``, ``shift``, ``window_length``, ``fading`` and ``window``
    attributes of the test case. ``self.fbins`` is set to
    ``self.stft.size // 2 + 1``.
    """
    # A random signal can be used instead of the sample file, e.g.
    # np.random.randn(5, 3, 5324).
    self.time_signal = load_audio(get_file_path("sample.wav"))
    self.torch_signal = torch.from_numpy(self.time_signal)

    stft_kwargs = dict(
        size=self.size,
        shift=self.shift,
        window_length=self.window_length,
        fading=self.fading,
        complex_representation='concat',
        window=self.window,
    )
    self.stft = STFT(**stft_kwargs)
    self.fbins = self.stft.size // 2 + 1
def eval_estimator(db_json, scenario, ref_node_id,
                   vad_threshold, activity_threshold):
    """Evaluate the sampling time offset (STO) estimation on one scenario.

    For every example of the chosen scenario, each of the three
    non-reference nodes is synchronized coarsely to the reference node, its
    sampling rate offset (SRO) is estimated and compensated, and the STO is
    estimated from the time shifts and the source-to-node distances. The
    error per node is printed, followed by the RMSE over all examples and
    nodes.

    Args:
        db_json: Database description passed to ``AsyncWASN``.
        scenario: One of ``'Scenario-1'`` ... ``'Scenario-4'``.
        ref_node_id: Index (0 to 3) of the node used as reference.
        vad_threshold: Threshold passed to ``VoiceActivityDetector``.
        activity_threshold: A frame (length 16384, shift 2048) counts as
            active if its summed voice activity exceeds this value.

    Raises:
        AssertionError: If ``scenario`` is not one of the four known names.
    """
    valid_scenarios = ('Scenario-1', 'Scenario-2', 'Scenario-3', 'Scenario-4')
    assert scenario in valid_scenarios, (
        'scenario must be "Scenario-1", "Scenario-2", '
        '"Scenario-3" or "Scenario-4"'
    )
    # The loader methods are numbered like the scenarios.
    scenario_number = valid_scenarios.index(scenario) + 1
    db = getattr(
        AsyncWASN(db_json), f'get_data_set_scenario_{scenario_number}'
    )()

    sro_estimator = DynamicWACD()
    voice_activity_detector = VoiceActivityDetector(vad_threshold)

    frame_size = 16384
    frame_shift = 2048
    first_frame_id = 8192

    # Three non-reference nodes are evaluated per example.
    errors = np.zeros(3 * len(db))
    for ex_id, example in enumerate(db):
        example_number = example["example_id"].split("_")[-1]
        print(f'Process example {example_number}')
        all_dists = get_distances(example)
        ref_sig = load_audio(example['audio_path'][f'node_{ref_node_id}'])
        other_nodes = [node for node in range(4) if node != ref_node_id]
        for cnt, node_id in enumerate(other_nodes):
            error_idx = 3 * ex_id + cnt
            sig = load_audio(example['audio_path'][f'node_{node_id}'])

            # Coarse alignment of both signals.
            sig_sync, ref_sig_sync, offset = coarse_sync(
                sig, ref_sig, len_sync=320000
            )

            # SRO estimation based on the voice activity of both signals.
            activity_sig = voice_activity_detector(sig_sync)
            activity_ref_sig = voice_activity_detector(ref_sig_sync)
            sro_est = sro_estimator(
                sig_sync, ref_sig_sync, activity_sig, activity_ref_sig
            )

            # SRO compensation; the reference is cut to the new length.
            sig_sync = compensate_sro(sig_sync, sro_est)
            ref_sig_sync = ref_sig_sync[:len(sig_sync)]

            # Time shifts and the distances belonging to them. Depending on
            # the sign of the coarse offset, either the node's or the
            # reference's distances skip their first samples.
            sig_shifts = est_time_shift(
                sig_sync, ref_sig_sync, frame_size, frame_shift
            )
            if offset > 0:
                start, start_ref = int(np.round(offset)), 0
            else:
                start, start_ref = 0, int(np.round(-offset))
            dists = all_dists[start:, node_id]
            dists_ref = all_dists[start_ref:, ref_node_id]
            frame_ids = first_frame_id + np.asarray(
                [frame * frame_shift for frame in range(len(sig_shifts))]
            )
            dists = dists[frame_ids]
            dists_ref = dists_ref[frame_ids]

            # Keep only frames in which both signals show source activity.
            ref_frames_active = (
                segment_axis(
                    voice_activity_detector(ref_sig_sync),
                    frame_size, frame_shift
                ).sum(-1) > activity_threshold
            )
            sig_frames_active = (
                segment_axis(
                    voice_activity_detector(sig_sync),
                    frame_size, frame_shift
                ).sum(-1) > activity_threshold
            )
            activity_mask = np.logical_and(
                sig_frames_active, ref_frames_active
            )
            sig_shifts = sig_shifts[activity_mask]
            dists = dists[activity_mask]
            dists_ref = dists_ref[activity_mask]

            # Sampling time offset (STO) estimate.
            sto_est = est_sto(sig_shifts, dists, dists_ref) - offset

            # Estimation error w.r.t. the ground truth STO difference.
            sto = (example['sto'][f'node_{node_id}']
                   - example['sto'][f'node_{ref_node_id}'])
            errors[error_idx] = sto - sto_est
            print(f'node {node_id}: error = '
                  f'{np.round(errors[error_idx], 2)} samples')
    print(f'\nRMSE = {np.round(np.sqrt(np.mean(errors**2)), 2)} samples')
def generate_audio(example, node_id, std_sensor_noise, sig_len=None,
                   single_channel=False, max_sro=400):
    """
    Generates the audio signal recorded by a sensor node using the given
    simulation description. This function is typically used as map function
    in combination with the lazy_dataset package.

    Args:
        example:
            Example dictionary specifying how to generate the audio signal.
            The keys 'sto', 'sro' and 'src_diary' are read. Each entry of
            'src_diary' provides 'onset', 'offset', 'audio_path' and 'rirs'.
        node_id:
            Key identifying the sensor node for which the recorded signal
            should be simulated. It is used to index example['sto'],
            example['sro'] and the 'rirs' of every source (e.g. 'node_0').
        std_sensor_noise:
            Standard deviation of the simulated sensor noise.
        sig_len:
            Length (in samples) of the signal to be created. If None, the
            length follows from the source diary.
        single_channel:
            If True, only the first microphone channel is simulated.
            Otherwise, all microphone channels are simulated.
        max_sro:
            Expected maximum value for the sampling rate offset (SRO) in
            parts per million (it is scaled by 1e-6).

    Returns:
        The given example dictionary (modified in place) with the simulated
        signal stored in example['audio_data'][node_id].
    """
    # Shift all sampling time offsets (STOs) so that none is negative.
    min_sto = np.minimum(np.min(list(example['sto'].values())), 0)
    stos = {node: sto - min_sto for node, sto in example['sto'].items()}

    src_diary = example['src_diary']
    if single_channel:
        num_channels = 1
    else:
        # Take the channel count from the RIRs of the simulated node. It was
        # read from the hard-coded 'node_0' before, although the signal is
        # built from the RIRs of ``node_id``.
        num_channels = len(load_audio(src_diary[0]['rirs'][node_id]))

    if sig_len is not None:
        # Generate extra samples on top of the requested length: the largest
        # STO and the worst-case SRO-induced delay. Presumably this keeps
        # ``sig_len`` samples available after the STO and SRO are applied.
        max_sro_delay = int(np.ceil(max_sro * 1e-6 * sig_len))
        min_sig_len = (
            sig_len
            + max_sro_delay
            + np.max([np.abs(sto) for sto in stos.values()])
        )
        if min_sig_len > src_diary[-1]['offset']:
            min_sig_len = src_diary[-1]['offset']
            # The message parts previously lacked the separating spaces.
            warnings.warn(
                'Specified signal length is larger than maximum signal length '
                'defined by the source diary. The signal length is set to '
                'maximum signal length defined by the source diary')
    else:
        min_sig_len = src_diary[-1]['offset']

    audio_data = np.zeros((num_channels, min_sig_len))
    for source in src_diary:
        onset = source['onset']
        clean_audio = load_audio(source['audio_path'])
        rirs = load_audio(source['rirs'][node_id])
        if single_channel:
            rirs = rirs[0, None]
        reverberant_audio = reverb_signal(clean_audio, rirs)

        end = onset + reverberant_audio.shape[-1]
        missing_len = end - audio_data.shape[-1]
        if missing_len > 0:
            # The source exceeds the buffer: enlarge the buffer so that the
            # whole source fits.
            audio_data = np.pad(
                audio_data, ((0, 0), (0, missing_len)), mode='constant'
            )
        audio_data[:, onset:end] += reverberant_audio
        if missing_len > 0:
            # Sources after the first one exceeding the buffer are skipped.
            break

    # Apply the STO by dropping the leading samples.
    audio_data = audio_data[:, stos[node_id]:]

    # Apply the SRO channel by channel; a string is treated as the path of a
    # stored SRO and loaded first.
    sro = example['sro'][node_id]
    if isinstance(sro, str):
        sro = load_binary(sro)
    audio_data = np.asarray([sim_sro(ch, sro) for ch in audio_data])

    audio_data += np.random.normal(0, std_sensor_noise, size=audio_data.shape)
    if sig_len is not None:
        audio_data = audio_data[:, :sig_len]

    # Keep signals that were already generated for other nodes.
    example.setdefault('audio_data', {})[node_id] = audio_data
    return example