import numpy as np
from numpy.testing import assert_array_almost_equal

# Assumed import path for the transforms under test; SAMPLING_RATE and the
# `audio` fixture are assumed to be defined elsewhere in the test module.
from lhotse.augmentation import AudioTransform, Speed, Volume


def test_serialize_deserialize_transform_speed(audio):
    speed_orig = Speed(factor=1.1)
    data_speed = speed_orig.to_dict()
    speed = AudioTransform.from_dict(data_speed)
    perturbed_speed = speed(audio, SAMPLING_RATE)
    assert perturbed_speed.shape == (1, 14546)
def test_serialize_deserialize_transform_volume(audio):
    volume_orig = Volume(factor=0.5)
    data_volume = volume_orig.to_dict()
    volume = AudioTransform.from_dict(data_volume)
    perturbed_volume = volume(audio, SAMPLING_RATE)
    assert perturbed_volume.shape == audio.shape
    assert_array_almost_equal(perturbed_volume, audio * 0.5)
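# The Volume tests rely on the transform being a pure gain: the output shape
# matches the input and every sample is scaled by `factor` (hence the
# `audio * 0.5` check). A minimal sketch of that contract (an illustration,
# not necessarily Lhotse's actual implementation):
class VolumeSketch:
    def __init__(self, factor: float):
        self.factor = factor

    def __call__(self, samples: np.ndarray, sampling_rate: int) -> np.ndarray:
        # A pure gain change leaves the shape and sampling rate untouched.
        return samples * self.factor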
def test_deserialize_transform_speed(audio):
    speed = AudioTransform.from_dict({
        "name": "Speed",
        "kwargs": {"factor": 1.1},
    })
    perturbed_speed = speed(audio, SAMPLING_RATE)
    assert perturbed_speed.shape == (1, 14546)
def test_deserialize_transform_volume(audio):
    volume = AudioTransform.from_dict({
        "name": "Volume",
        "kwargs": {"factor": 0.5},
    })
    perturbed_volume = volume(audio, SAMPLING_RATE)
    assert perturbed_volume.shape == audio.shape
    assert_array_almost_equal(perturbed_volume, audio * 0.5)
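# All four tests above assume that `to_dict()` emits a payload of the form
# {"name": <class name>, "kwargs": {...}} and that `AudioTransform.from_dict()`
# reconstructs the matching subclass from it. A minimal sketch of such a
# name-based registry (hypothetical; the real Lhotse mechanism may differ):
class AudioTransformSketch:
    _registry = {}

    def __init_subclass__(cls, **kwargs):
        # Register every subclass under its class name so `from_dict` can find it.
        super().__init_subclass__(**kwargs)
        AudioTransformSketch._registry[cls.__name__] = cls

    def to_dict(self) -> dict:
        return {"name": type(self).__name__, "kwargs": vars(self)}

    @staticmethod
    def from_dict(data: dict) -> "AudioTransformSketch":
        return AudioTransformSketch._registry[data["name"]](**data["kwargs"])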
def load_audio(
    self,
    channels: Optional[Channels] = None,
    offset_seconds: float = 0.0,
    duration_seconds: Optional[float] = None,
) -> np.ndarray:
    if channels is None:
        channels = SetContainingAnything()
    elif isinstance(channels, int):
        channels = frozenset([channels])
    else:
        channels = frozenset(channels)

    samples_per_source = []
    for source in self.sources:
        # Case: source not requested
        if not channels.intersection(source.channels):
            continue
        offset, duration = self._determine_offset_and_duration(
            offset_seconds, duration_seconds)
        samples = source.load_audio(
            offset_seconds=offset,
            duration_seconds=duration,
        )
        # Case: two-channel audio file but only one channel requested;
        # it might not be optimal to load all channels, but it is unclear
        # whether we can avoid it here.
        channels_to_remove = [
            idx for idx, cid in enumerate(source.channels)
            if cid not in channels
        ]
        if channels_to_remove:
            samples = np.delete(samples, channels_to_remove, axis=0)
        samples_per_source.append(samples)

    # shape: (n_channels, n_samples)
    audio = np.vstack(samples_per_source)

    # We'll apply the transforms now (if any).
    for params in self.transforms or []:
        transform = AudioTransform.from_dict(params)
        audio = transform(audio, self.sampling_rate)

    return audio
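# `SetContainingAnything` above is a sentinel meaning "all channels requested":
# it must contain every channel ID and report a non-empty intersection with any
# channel set. A minimal sketch of such a sentinel (hypothetical; Lhotse ships
# its own version in its utilities):
class SetContainingAnythingSketch:
    def __contains__(self, item) -> bool:
        return True

    def intersection(self, iterable) -> bool:
        # Truthy for any input, so the "source not requested" check never skips a source.
        return True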
def load_audio(
    self,
    channels: Optional[Channels] = None,
    offset: Seconds = 0.0,
    duration: Optional[Seconds] = None,
) -> np.ndarray:
    """
    Read the audio samples from the underlying audio source (path, URL, unix pipe/command).

    :param channels: int or iterable of ints, a subset of channel IDs to read
        (reads all by default).
    :param offset: seconds, where to start reading the audio (at offset 0 by default).
        Note that it is only efficient for local filesystem files, i.e. URLs and
        commands will read all the samples first and discard the unneeded ones
        afterwards.
    :param duration: seconds, indicates the total audio time to read (starting
        from ``offset``).
    :return: a numpy array of audio samples with shape ``(num_channels, num_samples)``.
    """
    if channels is None:
        channels = SetContainingAnything()
    else:
        channels = frozenset(
            [channels] if isinstance(channels, int) else channels)
        recording_channels = frozenset(self.channel_ids)
        assert channels.issubset(recording_channels), (
            "Requested to load audio from a channel "
            "that does not exist in the recording: "
            f"(recording channels: {recording_channels} -- "
            f"requested channels: {channels})"
        )

    transforms = [
        AudioTransform.from_dict(params)
        for params in self.transforms or []
    ]

    # Do a "backward pass" over the data augmentation transforms to get the
    # offset and duration for loading a piece of the original audio.
    offset_aug, duration_aug = offset, duration
    for tfn in reversed(transforms):
        offset_aug, duration_aug = tfn.reverse_timestamps(
            offset=offset_aug,
            duration=duration_aug,
            sampling_rate=self.sampling_rate,
        )

    samples_per_source = []
    for source in self.sources:
        # Case: source not requested
        if not channels.intersection(source.channels):
            continue
        samples = source.load_audio(
            offset=offset_aug,
            duration=duration_aug,
        )
        # Case: two-channel audio file but only one channel requested;
        # it might not be optimal to load all channels, but it is unclear
        # whether we can avoid it here.
        channels_to_remove = [
            idx for idx, cid in enumerate(source.channels)
            if cid not in channels
        ]
        if channels_to_remove:
            samples = np.delete(samples, channels_to_remove, axis=0)
        samples_per_source.append(samples)

    # shape: (n_channels, n_samples)
    audio = np.vstack(samples_per_source)

    # We'll apply the transforms now (if any).
    for tfn in transforms:
        audio = tfn(audio, self.sampling_rate)

    # Transformation chains can introduce small mismatches in the number of samples:
    # we fix them here, or raise an error if they exceed a tolerance threshold.
    audio = assert_and_maybe_fix_num_samples(
        audio, offset=offset, duration=duration, recording=self)

    return audio
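# The "backward pass" above asks each transform to map the requested offset and
# duration from the augmented timeline back to the original recording, so that
# only the needed slice of the source file is read. For a speed perturbation
# with `factor`, the perturbed timeline is shorter by 1/factor, so mapping
# backwards multiplies by `factor`. A minimal sketch of that contract (an
# illustration, not necessarily Lhotse's exact implementation, which likely
# rounds via sample counts to stay consistent with sox):
from typing import Optional


def reverse_timestamps_sketch(
    offset: float,
    duration: Optional[float],
    sampling_rate: int,
    factor: float,
):
    # E.g. with factor=1.1, second 10.0 of the perturbed audio corresponds to
    # second 11.0 of the original recording.
    return (
        offset * factor,
        None if duration is None else duration * factor,
    )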
def load_audio(
    self,
    channels: Optional[Channels] = None,
    offset: Seconds = 0.0,
    duration: Optional[Seconds] = None,
) -> np.ndarray:
    if channels is None:
        channels = SetContainingAnything()
    else:
        channels = frozenset(
            [channels] if isinstance(channels, int) else channels)
        recording_channels = frozenset(self.channel_ids)
        assert channels.issubset(recording_channels), (
            "Requested to load audio from a channel "
            "that does not exist in the recording: "
            f"(recording channels: {recording_channels} -- "
            f"requested channels: {channels})"
        )

    offset_sp, duration_sp = self._adjust_for_speed_perturbation(offset, duration)

    samples_per_source = []
    for source in self.sources:
        # Case: source not requested
        if not channels.intersection(source.channels):
            continue
        samples = source.load_audio(
            offset=offset_sp,
            duration=duration_sp,
        )
        # Case: two-channel audio file but only one channel requested;
        # it might not be optimal to load all channels, but it is unclear
        # whether we can avoid it here.
        channels_to_remove = [
            idx for idx, cid in enumerate(source.channels)
            if cid not in channels
        ]
        if channels_to_remove:
            samples = np.delete(samples, channels_to_remove, axis=0)
        samples_per_source.append(samples)

    # shape: (n_channels, n_samples)
    audio = np.vstack(samples_per_source)

    # We'll apply the transforms now (if any).
    for params in self.transforms or []:
        transform = AudioTransform.from_dict(params)
        audio = transform(audio, self.sampling_rate)

    # When resampling between high sampling rates (e.g. 48k -> 44.1k),
    # it is difficult to predict how sox will perform rounding;
    # we just add/remove one sample to stay consistent with
    # the estimate we made.
    expected_num_samples = self._expected_num_samples(offset, duration)
    diff = expected_num_samples - audio.shape[1]
    if diff == 0:
        pass  # this is the normal condition
    elif diff == 1:
        # note the extra colon in -1:, which preserves the shape
        audio = np.append(audio, audio[:, -1:], axis=1)
    elif diff == -1:
        audio = audio[:, :-1]
    else:
        raise ValueError(
            "The number of declared samples in the recording diverged from the one obtained "
            "when loading audio. This could be an internal Lhotse error or a faulty "
            "transform implementation. Please report this issue in Lhotse and include "
            f"the following: diff={diff}, audio.shape={audio.shape}, recording={self}")

    return audio
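# The variant above hard-codes an off-by-one fix after resampling; the variant
# that calls `assert_and_maybe_fix_num_samples` factors the same idea out and
# presumably tolerates mismatches up to a small threshold. A sketch under that
# assumption (hypothetical name, signature, and default tolerance):
def fix_num_samples_sketch(audio: np.ndarray, expected: int, tolerance: int = 1) -> np.ndarray:
    diff = expected - audio.shape[1]
    if diff == 0:
        return audio
    if 0 < diff <= tolerance:
        # Pad by repeating the trailing samples; slicing keeps the channel axis.
        return np.append(audio, audio[:, -diff:], axis=1)
    if -tolerance <= diff < 0:
        return audio[:, :diff]
    raise ValueError(
        f"Sample count mismatch beyond tolerance: diff={diff}, shape={audio.shape}")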