Example #1
0
 def supervision_masks(
         self,
         cuts: CutSet,
         use_alignment_if_exists: Optional[str] = None) -> torch.Tensor:
     """Return a padded batch of per-sample supervision masks.

     :param cuts: the cuts for which the masks are computed.
     :param use_alignment_if_exists: optional str; key of the alignment type
         used to build the mask. When that alignment is absent, the mask is
         derived from the supervision time spans instead.
     """
     per_cut_masks = (
         cut.supervisions_audio_mask(
             use_alignment_if_exists=use_alignment_if_exists)
         for cut in cuts
     )
     return collate_vectors(list(per_cut_masks))
Example #2
0
 def supervision_masks(self, cuts: CutSet) -> torch.Tensor:
     """Return a padded batch of per-frame supervision masks, one row per cut."""
     masks = [cut.supervisions_feature_mask() for cut in cuts]
     return collate_vectors(masks)
Example #3
0
    def __call__(
        self, cuts: CutSet
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
    ]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
            Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
            CutSet ``cuts`` is returned when ``fault_tolerant=True``.
        """
        # When fault_tolerant is set, cuts whose audio failed to load are
        # dropped here, so ``cuts`` may shrink relative to the input.
        audios, cuts = read_audio_from_cuts(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
            suppress_errors=self.fault_tolerant,
        )

        # Apply each waveform transform to every audio, in place.
        for tfnm in self.wave_transforms:
            for idx in range(len(audios)):
                audios[idx] = tfnm(audios[idx])

        if self.use_batch_extract:
            # Batch extraction is possibly faster depending on the implementation
            # of the feature extractor, but requires a single sampling rate.
            assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
            features_single = self.extractor.extract_batch(
                audios, sampling_rate=cuts[0].sampling_rate
            )
        else:
            # Sequential extraction allows the sampling rates to be different.
            features_single = []
            for cut, audio in zip(cuts, audios):
                samples = audio.numpy()
                try:
                    features = self.extractor.extract(samples, cut.sampling_rate)
                except Exception:
                    # Narrowed from a bare ``except:`` so KeyboardInterrupt and
                    # SystemExit are not intercepted; log context and re-raise.
                    logging.error(
                        f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                    )
                    raise
                features_single.append(torch.from_numpy(features))

        # Pad per-cut feature matrices into a single (B, T, F) batch.
        features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

        # Unpadded frame counts, computed from each cut's duration.
        feature_lens = torch.tensor(
            [
                compute_num_frames(
                    cut.duration, self.extractor.frame_shift, cut.sampling_rate
                )
                for cut in cuts
            ],
            dtype=torch.int64,
        )

        out = (features_batch, feature_lens)

        if self.return_audio:
            audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
            audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
            audios = collate_vectors(audios, padding_value=0)

            out = out + (audios, audio_lens)

        if self.fault_tolerant:
            # Return the (possibly filtered) cuts so the caller can see
            # which items survived audio loading.
            out = out + (cuts,)

        return out
Example #4
0
 def supervision_masks(self, cuts: CutSet) -> torch.Tensor:
     """Return a padded batch of per-frame supervision masks, one row per cut."""
     frame_shift = self.extractor.frame_shift
     masks = [
         compute_supervisions_frame_mask(cut, frame_shift)
         for cut in cuts
     ]
     return collate_vectors(masks)