예제 #1
0
def test_serialize_deserialize_transform_speed(audio):
    """Round-trip a Speed transform through its dict form and apply it."""
    original = Speed(factor=1.1)
    serialized = original.to_dict()
    restored = AudioTransform.from_dict(serialized)

    perturbed = restored(audio, SAMPLING_RATE)
    # Speeding up by 1.1x shortens the 16k-sample input to 14546 samples.
    assert perturbed.shape == (1, 14546)
예제 #2
0
def test_serialize_deserialize_transform_volume(audio):
    """Round-trip a Volume transform through its dict form and apply it."""
    original = Volume(factor=0.5)
    serialized = original.to_dict()
    restored = AudioTransform.from_dict(serialized)

    perturbed = restored(audio, SAMPLING_RATE)
    # Volume scaling keeps the shape and multiplies samples by the factor.
    assert perturbed.shape == audio.shape
    assert_array_almost_equal(perturbed, audio * 0.5)
예제 #3
0
def test_deserialize_transform_speed(audio):
    """Build a Speed transform directly from a plain dict and apply it."""
    spec = {"name": "Speed", "kwargs": {"factor": 1.1}}
    transform = AudioTransform.from_dict(spec)

    perturbed = transform(audio, SAMPLING_RATE)
    # Speeding up by 1.1x shortens the input to 14546 samples.
    assert perturbed.shape == (1, 14546)
예제 #4
0
def test_deserialize_transform_volume(audio):
    """Build a Volume transform directly from a plain dict and apply it."""
    spec = {"name": "Volume", "kwargs": {"factor": 0.5}}
    transform = AudioTransform.from_dict(spec)

    perturbed = transform(audio, SAMPLING_RATE)
    # Volume scaling keeps the shape and multiplies samples by the factor.
    assert perturbed.shape == audio.shape
    assert_array_almost_equal(perturbed, audio * 0.5)
예제 #5
0
    def load_audio(
        self,
        channels: Optional[Channels] = None,
        offset_seconds: float = 0.0,
        duration_seconds: Optional[float] = None,
    ) -> np.ndarray:
        """
        Load samples from the requested channels of this recording's sources,
        apply any serialized transforms, and return an array of shape
        ``(num_channels, num_samples)``.

        :param channels: int or iterable of ints — channel IDs to read (all by default).
        :param offset_seconds: seconds, where to start reading the audio.
        :param duration_seconds: seconds, total audio time to read (``None`` = to the end).
        :return: a numpy array of audio samples.
        """
        # Normalize the channel selector to a set-like object that supports
        # intersection tests; None means "every channel".
        if channels is None:
            channels = SetContainingAnything()
        elif isinstance(channels, int):
            channels = frozenset([channels])
        else:
            channels = frozenset(channels)

        per_source_samples = []
        for source in self.sources:
            # Skip sources that contribute none of the requested channels.
            if not channels.intersection(source.channels):
                continue
            offset, duration = self._determine_offset_and_duration(
                offset_seconds, duration_seconds)
            samples = source.load_audio(
                offset_seconds=offset,
                duration_seconds=duration,
            )

            # Multi-channel files are decoded whole; drop the channels the
            # caller did not ask for (loading only a subset isn't possible).
            unwanted = [
                pos for pos, channel_id in enumerate(source.channels)
                if channel_id not in channels
            ]
            if unwanted:
                samples = np.delete(samples, unwanted, axis=0)
            per_source_samples.append(samples)

        # Stack into shape (n_channels, n_samples).
        audio = np.vstack(per_source_samples)

        # Apply the serialized transforms (if any) to the loaded samples.
        for params in self.transforms or []:
            audio = AudioTransform.from_dict(params)(audio, self.sampling_rate)

        return audio
예제 #6
0
    def load_audio(
        self,
        channels: Optional[Channels] = None,
        offset: Seconds = 0.0,
        duration: Optional[Seconds] = None,
    ) -> np.ndarray:
        """
        Read the audio samples from the underlying audio source (path, URL, unix pipe/command).

        :param channels: int or iterable of ints, a subset of channel IDs to read (reads all by default).
        :param offset: seconds, where to start reading the audio (at offset 0 by default).
            Note that it is only efficient for local filesystem files, i.e. URLs and commands will read
            all the samples first and discard the unneeded ones afterwards.
        :param duration: seconds, indicates the total audio time to read (starting from ``offset``).
        :return: a numpy array of audio samples with shape ``(num_channels, num_samples)``.
        """
        # Normalize ``channels`` to a set-like object; ``None`` means "all channels",
        # and any explicit selection is validated against the recording's channels.
        if channels is None:
            channels = SetContainingAnything()
        else:
            channels = frozenset(
                [channels] if isinstance(channels, int) else channels)
            recording_channels = frozenset(self.channel_ids)
            assert channels.issubset(recording_channels), "Requested to load audio from a channel " \
                                                          "that does not exist in the recording: " \
                                                          f"(recording channels: {recording_channels} -- " \
                                                          f"requested channels: {channels})"

        # Instantiate transform objects from their serialized (dict) form.
        transforms = [
            AudioTransform.from_dict(params)
            for params in self.transforms or []
        ]

        # Do a "backward pass" over data augmentation transforms to get the
        # offset and duration for loading a piece of the original audio.
        offset_aug, duration_aug = offset, duration
        for tfn in reversed(transforms):
            offset_aug, duration_aug = tfn.reverse_timestamps(
                offset=offset_aug,
                duration=duration_aug,
                sampling_rate=self.sampling_rate)

        samples_per_source = []
        for source in self.sources:
            # Case: source not requested
            if not channels.intersection(source.channels):
                continue
            # Load using the back-translated offset/duration so the correct
            # span of the raw (untransformed) audio is read.
            samples = source.load_audio(
                offset=offset_aug,
                duration=duration_aug,
            )

            # Case: two-channel audio file but only one channel requested
            #       it might not be optimal to load all channels, but IDK if there's anything we can do about it
            channels_to_remove = [
                idx for idx, cid in enumerate(source.channels)
                if cid not in channels
            ]
            if channels_to_remove:
                samples = np.delete(samples, channels_to_remove, axis=0)
            samples_per_source.append(samples)

        # shape: (n_channels, n_samples)
        audio = np.vstack(samples_per_source)

        # We'll apply the transforms now (if any), in their declared order.
        for tfn in transforms:
            audio = tfn(audio, self.sampling_rate)

        # Transformation chains can introduce small mismatches in the number of samples:
        # we'll fix them here, or raise an error if they exceeded a tolerance threshold.
        audio = assert_and_maybe_fix_num_samples(audio,
                                                 offset=offset,
                                                 duration=duration,
                                                 recording=self)

        return audio
예제 #7
0
    def load_audio(
            self,
            channels: Optional[Channels] = None,
            offset: Seconds = 0.0,
            duration: Optional[Seconds] = None,
    ) -> np.ndarray:
        """
        Load audio samples for the requested channels, apply the attached
        transforms, and compensate for +/-1 sample rounding differences.

        :param channels: int or iterable of ints, a subset of channel IDs to read (reads all by default).
        :param offset: seconds, where to start reading the audio.
        :param duration: seconds, total audio time to read (``None`` reads until the end).
        :return: a numpy array of audio samples with shape ``(num_channels, num_samples)``.
        """
        # Normalize ``channels``; ``None`` means "all channels", explicit
        # selections are validated against the recording's channel IDs.
        if channels is None:
            channels = SetContainingAnything()
        else:
            channels = frozenset([channels] if isinstance(channels, int) else channels)
            recording_channels = frozenset(self.channel_ids)
            assert channels.issubset(recording_channels), "Requested to load audio from a channel " \
                                                          "that does not exist in the recording: " \
                                                          f"(recording channels: {recording_channels} -- " \
                                                          f"requested channels: {channels})"

        # NOTE(review): presumably maps the requested window back to the
        # pre-speed-perturbation timeline — confirm against
        # ``_adjust_for_speed_perturbation``'s implementation.
        offset_sp, duration_sp = self._adjust_for_speed_perturbation(offset, duration)

        samples_per_source = []
        for source in self.sources:
            # Case: source not requested
            if not channels.intersection(source.channels):
                continue
            samples = source.load_audio(
                offset=offset_sp,
                duration=duration_sp,
            )

            # Case: two-channel audio file but only one channel requested
            #       it might not be optimal to load all channels, but IDK if there's anything we can do about it
            channels_to_remove = [
                idx for idx, cid in enumerate(source.channels)
                if cid not in channels
            ]
            if channels_to_remove:
                samples = np.delete(samples, channels_to_remove, axis=0)
            samples_per_source.append(samples)

        # shape: (n_channels, n_samples)
        audio = np.vstack(samples_per_source)

        # We'll apply the transforms now (if any).
        for params in self.transforms or []:
            transform = AudioTransform.from_dict(params)
            audio = transform(audio, self.sampling_rate)

        # When resampling in high sampling rates (48k -> 44.1k)
        # it is difficult to estimate how sox will perform rounding;
        # we will just add/remove one sample to be consistent with
        # what we have estimated.
        expected_num_samples = self._expected_num_samples(offset, duration)
        diff = expected_num_samples - audio.shape[1]
        if diff == 0:
            pass  # this is normal condition
        elif diff == 1:
            # One sample short: duplicate the final sample.
            # note the extra colon in -1:, which preserves the shape
            audio = np.append(audio, audio[:, -1:], axis=1)
        elif diff == -1:
            # One sample too many: trim the final sample.
            audio = audio[:, :-1]
        else:
            # Anything beyond +/-1 sample indicates a real inconsistency.
            raise ValueError("The number of declared samples in the recording diverged from the one obtained "
                             "when loading audio. This could be internal Lhotse's error or a faulty "
                             "transform implementation. Please report this issue in Lhotse and show the "
                             f"following: diff={diff}, audio.shape={audio.shape}, recording={self}")

        return audio
예제 #8
0
    def load_audio(
        self,
        channels: Optional[Channels] = None,
        offset: Seconds = 0.0,
        duration: Optional[Seconds] = None,
    ) -> np.ndarray:
        """
        Read audio samples from the underlying sources of this recording.

        The requested (offset, duration) window is first mapped backwards
        through the transform chain so the right piece of the raw audio is
        loaded; the transforms are then applied in forward order.

        :param channels: int or iterable of ints, a subset of channel IDs to read (reads all by default).
        :param offset: seconds, where to start reading the audio.
        :param duration: seconds, total audio time to read (``None`` reads until the end).
        :return: a numpy array of audio samples with shape ``(num_channels, num_samples)``.
        """
        # "All channels" is modelled by a set that intersects with anything;
        # explicit selections are validated against the recording's channels.
        if channels is None:
            channels = SetContainingAnything()
        else:
            channels = frozenset(
                [channels] if isinstance(channels, int) else channels)
            recording_channels = frozenset(self.channel_ids)
            assert channels.issubset(recording_channels), "Requested to load audio from a channel " \
                                                          "that does not exist in the recording: " \
                                                          f"(recording channels: {recording_channels} -- " \
                                                          f"requested channels: {channels})"

        transforms = [AudioTransform.from_dict(params) for params in self.transforms or []]

        # Backward pass: translate the requested window into the coordinates
        # of the original (untransformed) audio.
        offset_aug, duration_aug = offset, duration
        for transform in reversed(transforms):
            offset_aug, duration_aug = transform.reverse_timestamps(
                offset=offset_aug,
                duration=duration_aug,
                sampling_rate=self.sampling_rate,
            )

        collected = []
        for source in self.sources:
            # Ignore sources that hold none of the requested channels.
            if not channels.intersection(source.channels):
                continue
            samples = source.load_audio(offset=offset_aug, duration=duration_aug)

            # Multi-channel files are decoded whole; discard the channels
            # the caller did not ask for.
            drop = [
                pos for pos, channel_id in enumerate(source.channels)
                if channel_id not in channels
            ]
            if drop:
                samples = np.delete(samples, drop, axis=0)
            collected.append(samples)

        # Stack into shape (n_channels, n_samples).
        audio = np.vstack(collected)

        # Forward pass: apply the transforms to the loaded samples.
        for transform in transforms:
            audio = transform(audio, self.sampling_rate)

        # Transform chains may end up a few samples off the declared length;
        # fix small mismatches or raise beyond the tolerance threshold.
        audio = assert_and_maybe_fix_num_samples(
            audio, offset=offset, duration=duration, recording=self)

        return audio