예제 #1
0
    def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        """
        Returns a dict that specifies the start and end bounds for each supervision,
        as a 1-D int tensor, in terms of frames:

        .. code-block:

            {
                "sequence_idx": tensor(shape=(S,)),
                "start_frame": tensor(shape=(S,)),
                "num_frames": tensor(shape=(S,))
            }

        Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
        Note that ``S`` might be different than the number of cuts (``B``).
        ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.
        If ``cuts`` contains no supervisions at all, all three tensors are empty.
        """
        # Single pass over cuts/supervisions. The previous zip(*generator) form
        # raised ValueError ("not enough values to unpack") when there were no
        # supervisions, and iterated the CutSet twice (once more for sequence_idx).
        sequence_idx = []
        start_frames = []
        nums_frames = []
        for idx, cut in enumerate(cuts):
            for sup in cut.supervisions:
                start, num = supervision_to_frames(
                    sup, self.extractor.frame_shift, cut.sampling_rate)
                sequence_idx.append(idx)
                start_frames.append(start)
                nums_frames.append(num)
        return {
            "sequence_idx": torch.tensor(sequence_idx, dtype=torch.int32),
            "start_frame": torch.tensor(start_frames, dtype=torch.int32),
            "num_frames": torch.tensor(nums_frames, dtype=torch.int32),
        }
예제 #2
0
 def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
     """
     Returns a dict with the frame-level position of each supervision, as 1-D
     int32 tensors:

     .. code-block:

         {
             "start_frame": tensor(shape=(S,)),
             "num_frames": tensor(shape=(S,))
         }

     Where ``S`` is the total number of supervisions in the :class:`CutSet`.
     If ``cuts`` contains no supervisions at all, both tensors are empty.
     """
     # Single pass over cuts/supervisions. The previous zip(*generator) form
     # raised ValueError ("not enough values to unpack") when there were no
     # supervisions to iterate.
     start_frames = []
     nums_frames = []
     for cut in cuts:
         for sup in cut.supervisions:
             start, num = supervision_to_frames(
                 sup, self.extractor.frame_shift, cut.sampling_rate)
             start_frames.append(start)
             nums_frames.append(num)
     return {
         'start_frame': torch.tensor(start_frames, dtype=torch.int32),
         'num_frames': torch.tensor(nums_frames, dtype=torch.int32)
     }
예제 #3
0
    def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Return a new batch, with the batch size automatically determined using the constraints
        of max_frames and max_cuts.
        """
        # Gather cuts up to the max_cuts / max_frames limits; the returned
        # CutSet can be further modified (padding, mixing, etc.).
        cuts: CutSet = self._collect_batch()

        # Longest cut first, so it fixes the time dimension of the batch.
        cuts = cuts.sort_by_duration(ascending=False)

        if self.aug_cuts is not None:
            # Augmentation doubles as padding: mix signal from the aug CutSet
            # up to the longest cut's duration.
            cuts = cuts.mix(self.aug_cuts,
                            duration=cuts[0].duration,
                            snr=self.aug_snr,
                            mix_prob=self.aug_prob)
        else:
            # No augmentation: pad with low-energy values to the longest
            # cut's duration in the batch.
            cuts = cuts.pad()

        # Batched feature matrices, shape (B, T, F).
        features = collate_features(cuts)

        # Build one record per supervision, then collate them all together.
        supervision_records = []
        for sequence_idx, cut in enumerate(cuts):
            frame_bounds = (supervision_to_frames(s,
                                                  cut.frame_shift,
                                                  cut.sampling_rate,
                                                  max_frames=cut.num_frames)
                            for s in cut.supervisions)
            for supervision, (start_frame, num_frames) in zip(
                    cut.supervisions, frame_bounds):
                supervision_records.append({
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': start_frame,
                    'num_frames': num_frames
                })

        batch = {
            'features': features,
            'supervisions': default_collate(supervision_records)
        }
        if self.return_cuts:
            # One cut reference per supervision, matching the collated order.
            batch['supervisions']['cut'] = [
                cut for cut in cuts for sup in cut.supervisions
            ]

        return batch
예제 #4
0
    def __getitem__(
            self,
            cut_ids: List[str]) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Return a new batch, with the batch size automatically determined using the constraints
        of max_frames and max_cuts.
        """
        # Select the cuts that form this batch; the resulting CutSet can be
        # further modified (transforms, padding, etc.).
        cuts = self.cuts.subset(cut_ids=cut_ids)

        # Longest cut first, so it fixes the time dimension of the batch.
        cuts = cuts.sort_by_duration(ascending=False)

        # Apply any configured cut-level transforms in order.
        for transform in self.cut_transforms:
            cuts = transform(cuts)

        # Batched feature matrices, shape (B, T, F).
        # Collation performs auto-padding, if necessary.
        features = collate_features(cuts)

        # Build one record per supervision, then collate them all together.
        supervision_records = []
        for sequence_idx, cut in enumerate(cuts):
            frame_bounds = (supervision_to_frames(s,
                                                  cut.frame_shift,
                                                  cut.sampling_rate,
                                                  max_frames=cut.num_frames)
                            for s in cut.supervisions)
            for supervision, (start_frame, num_frames) in zip(
                    cut.supervisions, frame_bounds):
                supervision_records.append({
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': start_frame,
                    'num_frames': num_frames
                })

        batch = {
            'features': features,
            'supervisions': default_collate(supervision_records)
        }
        if self.return_cuts:
            # One cut reference per supervision, matching the collated order.
            batch['supervisions']['cut'] = [
                cut for cut in cuts for sup in cut.supervisions
            ]

        return batch