Example no. 1
    def extract_from_recording_and_store(
            self,
            recording: Recording,
            output_dir: Pathlike,
            offset: Seconds = 0,
            duration: Optional[Seconds] = None,
            channels: Union[int, List[int]] = None,
            augmenter: Optional[WavAugmenter] = None,
            compress: bool = True,
            lilcom_tick_power: int = -5,
            root_dir: Optional[Pathlike] = None):
        """
        Extract the features from a ``Recording`` in a full pipeline:

        * load audio from disk;
        * optionally, perform audio augmentation;
        * extract the features;
        * save them to disk in a specified directory;
        * return a ``Features`` object with a description of the extracted features and the source data used.

        :param recording: a ``Recording`` that specifies the input audio.
        :param output_dir: a path to the directory where the features will be stored.
        :param offset: an optional offset in seconds for where to start reading the recording.
        :param duration: an optional duration specifying how much audio to load from the recording.
        :param channels: an optional int or list of ints, specifying the channels;
            by default, all channels will be used.
        :param augmenter: an optional ``WavAugmenter`` instance to modify the waveform before feature extraction.
        :param compress: a bool, whether the saved features should be compressed with ``lilcom``.
        :param lilcom_tick_power: precision of ``lilcom`` compression - greater negative values (e.g. -8)
            might be appropriate for non-log space features.
        :param root_dir: an optional path prefix for the audio source file in Recording.
        :return: a ``Features`` manifest item for the extracted feature matrix.
        """
        samples = recording.load_audio(offset_seconds=offset,
                                       duration_seconds=duration,
                                       channels=channels,
                                       root_dir=root_dir)
        if augmenter is not None:
            samples = augmenter.apply(samples)
        feats = self.extract(samples=samples,
                             sampling_rate=recording.sampling_rate)
        output_features_path = store_feature_array(feats, output_dir, compress,
                                                   lilcom_tick_power)
        return Features(
            recording_id=recording.id,
            channels=channels
            if channels is not None else recording.channel_ids,
            # The start is relative to the beginning of the recording.
            start=offset,
            # We simplify the relationship between num_frames and duration - we guarantee that
            #  the duration is always num_frames * frame_shift
            duration=feats.shape[0] * self.frame_shift,
            type=self.name,
            num_frames=feats.shape[0],
            num_features=feats.shape[1],
            sampling_rate=recording.sampling_rate,
            storage_type='lilcom' if compress else 'numpy',
            storage_path=str(output_features_path))
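
A minimal usage sketch for this variant; the ``recordings.json`` manifest path and the ``Fbank`` extractor are illustrative assumptions, not part of the snippet above:

from pathlib import Path
from lhotse import Fbank, RecordingSet

recordings = RecordingSet.from_json('recordings.json')  # hypothetical manifest
extractor = Fbank()

output_dir = Path('feats')
output_dir.mkdir(parents=True, exist_ok=True)

# Each call loads the audio, extracts the features, writes them under output_dir
# (lilcom-compressed by default) and returns a Features manifest entry.
feature_manifests = [
    extractor.extract_from_recording_and_store(
        recording=recording,
        output_dir=output_dir,
    )
    for recording in recordings
]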
Example no. 2
    def extract_from_recording_and_store(
        self,
        recording: Recording,
        storage: FeaturesWriter,
        offset: Seconds = 0,
        duration: Optional[Seconds] = None,
        channels: Union[int, List[int]] = None,
        augment_fn: Optional[AugmentFn] = None,
    ) -> "Features":
        """
        Extract the features from a ``Recording`` in a full pipeline:

        * load audio from disk;
        * optionally, perform audio augmentation;
        * extract the features;
        * store them via the provided ``FeaturesWriter``;
        * return a ``Features`` object with a description of the extracted features and the source data used.

        :param recording: a ``Recording`` that specifies the input audio.
        :param storage: a ``FeaturesWriter`` object that will handle storing the feature matrices.
        :param offset: an optional offset in seconds for where to start reading the recording.
        :param duration: an optional duration specifying how much audio to load from the recording.
        :param channels: an optional int or list of ints, specifying the channels;
            by default, all channels will be used.
        :param augment_fn: an optional callable (``AugmentFn``) to modify the waveform before feature extraction.
        :return: a ``Features`` manifest item for the extracted feature matrix.
        """
        from lhotse.qa import validate_features

        samples = recording.load_audio(
            offset=offset,
            duration=duration,
            channels=channels,
        )
        if augment_fn is not None:
            samples = augment_fn(samples, recording.sampling_rate)
        feats = self.extract(samples=samples,
                             sampling_rate=recording.sampling_rate)
        storage_key = store_feature_array(feats, storage=storage)
        manifest = Features(
            recording_id=recording.id,
            channels=channels
            if channels is not None else recording.channel_ids,
            # The start is relative to the beginning of the recording.
            start=offset,
            duration=duration if duration is not None else recording.duration,
            type=self.name,
            num_frames=feats.shape[0],
            num_features=feats.shape[1],
            frame_shift=self.frame_shift,
            sampling_rate=recording.sampling_rate,
            storage_type=storage.name,
            storage_path=str(storage.storage_path),
            storage_key=storage_key,
        )
        validate_features(manifest, feats_data=feats)
        return manifest
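
The same pipeline with the newer storage abstraction; a sketch assuming the ``LilcomFilesWriter`` backend from ``lhotse.features.io`` and a hypothetical ``recordings.json`` manifest:

from lhotse import Fbank, RecordingSet
from lhotse.features.io import LilcomFilesWriter

recordings = RecordingSet.from_json('recordings.json')  # hypothetical manifest
extractor = Fbank()

# The writer decides where and how each matrix is persisted; the returned
# Features manifests reference it through storage_type, storage_path and storage_key.
with LilcomFilesWriter('feats') as storage:
    feature_manifests = [
        extractor.extract_from_recording_and_store(
            recording=recording,
            storage=storage,
        )
        for recording in recordings
    ]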
Example no. 3
def validate_recording(r: Recording, read_data: bool = False) -> None:
    assert r.duration > 0, f'Recording {r.id}: duration has to be greater than 0 (is {r.duration})'
    expected_duration = r.num_samples / r.sampling_rate
    assert r.num_channels > 0, f'Recording {r.id}: no channels available'
    assert isclose(expected_duration, r.duration), \
        f'Recording {r.id}: mismatched declared duration ({r.duration}) with ' \
        f'num_samples / sampling_rate ({expected_duration}).'
    if read_data:
        samples = r.load_audio()
        n_ch, n_s = samples.shape
        assert r.num_channels == n_ch, f'Recording {r.id}: expected {r.num_channels} channels, got {n_ch}'
        assert r.num_samples == n_s, f'Recording {r.id}: expected {r.num_samples} samples, got {n_s}'
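
A short driver for the validator above; the manifest path is illustrative, and ``validate_recording`` is assumed to live in ``lhotse.qa`` (the module Example no. 2 imports ``validate_features`` from):

from lhotse import RecordingSet
from lhotse.qa import validate_recording

recordings = RecordingSet.from_json('recordings.json')  # hypothetical manifest
for recording in recordings:
    # With read_data=True the audio is actually loaded, so the declared
    # channel and sample counts are checked against the real array shape.
    validate_recording(recording, read_data=True)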
Example no. 4
    def _process_and_store_recording(
        self,
        recording: Recording,
        segmentation: Optional[SupervisionSegment] = None,
        compressed: bool = True,
        lilcom_tick_power: int = -8,
    ) -> List[Features]:
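        """
        Extract features for every channel of ``recording``, store each matrix
        on disk (lilcom-compressed or as a plain ``.npy`` file), and return one
        ``Features`` manifest entry per channel.
        """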
        results = []
        for channel in recording.channel_ids:
            output_features_path = (
                self.output_dir / 'storage' /
                str(uuid4())).with_suffix('.llc' if compressed else '.npy')

            samples = torch.from_numpy(
                recording.load_audio(channels=channel, root_dir=self.root_dir))

            # TODO: use augmentation manifest here
            feats = self.feature_extractor.extract(
                samples=samples,
                sampling_rate=recording.sampling_rate).numpy()

            if compressed:
                # TODO: use segmentation manifest here
                serialized_feats = lilcom.compress(
                    feats, tick_power=lilcom_tick_power)
                with open(output_features_path, 'wb') as f:
                    f.write(serialized_feats)
            else:
                np.save(output_features_path, feats, allow_pickle=False)

            results.append(
                Features(
                    recording_id=recording.id,
                    channel_id=channel,
                    # TODO: revise start and duration with segmentation manifest info
                    start=0.0,
                    # We simplify the relationship between num_frames and duration - we guarantee that
                    #  the duration is always num_frames * frame_shift
                    duration=feats.shape[0] *
                    self.feature_extractor.spectrogram_config.frame_shift,
                    type=self.feature_extractor.type,
                    num_frames=feats.shape[0],
                    num_features=feats.shape[1],
                    storage_type='lilcom' if compressed else 'numpy',
                    storage_path=str(output_features_path)))
        return results
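
To read one of these matrices back, a minimal sketch (the helper name is mine; it assumes the byte-level ``lilcom.compress``/``lilcom.decompress`` interface used in the snippet above):

import lilcom
import numpy as np

def load_feature_matrix(features) -> np.ndarray:
    # Branch on how _process_and_store_recording stored the array.
    if features.storage_type == 'lilcom':
        with open(features.storage_path, 'rb') as f:
            return lilcom.decompress(f.read())
    return np.load(features.storage_path, allow_pickle=False)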