Exemplo n.º 1
0
    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Collate the pre-computed features of the given cuts into one batch.

        All the work is delegated to ``collate_features``; its result is
        returned unchanged. The feature tensor is expected to have the shape
        ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :param cuts: the cuts whose features should be batched together.
        :return: a tensor with collated features, and a tensor holding the
            ``num_frames`` of each cut before padding.
        """
        collated = collate_features(cuts)
        return collated
Exemplo n.º 2
0
 def __getitem__(self, cuts: CutSet) -> Dict[str, Any]:
     """
     Validate ``cuts`` and return them together with their collated features.

     :param cuts: the cuts that make up the batch; they are checked with
         ``self._validate`` before any features are collated.
     :return: a dict with the keys ``"cuts"`` (the input, unchanged),
         ``"features"`` and ``"features_lens"`` (both as produced by
         ``collate_features``).
     """
     self._validate(cuts)
     batch_feats, batch_lens = collate_features(cuts)
     return dict(cuts=cuts, features=batch_feats, features_lens=batch_lens)
Exemplo n.º 3
0
def test_collate_feature_padding():
    """Collated features must be padded up to the longest cut in the set."""
    cuts = CutSet.from_json("test/fixtures/ljspeech/cuts.json")
    frame_counts = [cut.num_frames for cut in cuts]

    # Sanity check on the fixture: the cuts need differing lengths,
    # otherwise collation would not have to pad anything.
    assert len(set(frame_counts)) > 1

    longest = max(frame_counts)
    features, features_lens = collate_features(cuts)

    # Dimension 1 of the batch has to match the longest cut, and the reported
    # lengths must peak at that same value.
    assert features.shape[1] == longest
    assert max(features_lens).item() == longest
Exemplo n.º 4
0
def test_specaugment_batch(num_feature_masks, num_frame_masks):
    """
    SpecAugment applied with probability 1.0 must alter a collated batch.

    :param num_feature_masks: forwarded to ``SpecAugment`` unchanged.
    :param num_frame_masks: forwarded to ``SpecAugment`` unchanged.
    """
    cuts = CutSet.from_json('test/fixtures/ljspeech/cuts.json')
    feats, _ = collate_features(cuts)

    augment_config = dict(
        p=1.0,
        time_warp_factor=10,
        features_mask_size=5,
        frames_mask_size=20,
        num_feature_masks=num_feature_masks,
        num_frame_masks=num_frame_masks,
    )
    transform = SpecAugment(**augment_config)

    augmented = transform(feats)
    # At least one element has to differ from the un-augmented input.
    assert (feats != augmented).any()
Exemplo n.º 5
0
    def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Assemble and return the next batch.

        The cuts come from ``self._collect_batch()``, which is responsible for
        honouring the ``max_frames`` / ``max_cuts`` constraints. They are
        sorted by descending duration, then either mixed with ``self.aug_cuts``
        (when it is set) or padded, and finally collated.

        :return: a dict with ``'features'`` (the result of ``collate_features``)
            and ``'supervisions'`` (one collated entry per supervision, with
            the keys ``'sequence_idx'``, ``'text'``, ``'start_frame'`` and
            ``'num_frames'``). When ``self.return_cuts`` is truthy,
            ``'supervisions'`` also gets a ``'cut'`` list holding the parent
            cut of every supervision.
        """
        cuts: CutSet = self._collect_batch()

        # Longest cut first, so that cuts[0] defines the time dimension of the batch.
        cuts = cuts.sort_by_duration(ascending=False)

        if self.aug_cuts is None:
            # No augmentation data: just pad the cuts so they can be batched together.
            cuts = cuts.pad()
        else:
            # Mixing with the augmentation cuts doubles as padding, since every
            # cut is mixed up to the duration of the longest one.
            cuts = cuts.mix(
                self.aug_cuts,
                duration=cuts[0].duration,
                snr=self.aug_snr,
                mix_prob=self.aug_prob,
            )

        # Batched feature matrices, shape (B, T, F).
        features = collate_features(cuts)

        # One entry per supervision; sequence_idx points back at the cut's
        # position within the batch.
        supervision_items = []
        for sequence_idx, cut in enumerate(cuts):
            for supervision in cut.supervisions:
                start_frame, num_frames = supervision_to_frames(
                    supervision,
                    cut.frame_shift,
                    cut.sampling_rate,
                    max_frames=cut.num_frames,
                )
                supervision_items.append({
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': start_frame,
                    'num_frames': num_frames,
                })

        batch = {
            'features': features,
            'supervisions': default_collate(supervision_items),
        }

        if self.return_cuts:
            # Repeat each cut once per supervision so the list stays aligned
            # with the other supervision fields.
            batch['supervisions']['cut'] = [
                cut for cut in cuts for _ in cut.supervisions
            ]

        return batch
Exemplo n.º 6
0
    def __getitem__(
            self,
            cut_ids: List[str]) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Build a batch out of the cuts identified by ``cut_ids``.

        The selected cuts are sorted by descending duration, passed through
        every transform in ``self.cut_transforms`` (in order), and collated.

        :param cut_ids: IDs of the cuts to select from ``self.cuts``.
        :return: a dict with ``'features'`` (the result of ``collate_features``)
            and ``'supervisions'`` (one collated entry per supervision, with
            the keys ``'sequence_idx'``, ``'text'``, ``'start_frame'`` and
            ``'num_frames'``). When ``self.return_cuts`` is truthy,
            ``'supervisions'`` also gets a ``'cut'`` list holding the parent
            cut of every supervision.
        """
        selected = self.cuts.subset(cut_ids=cut_ids)

        # Longest cut first, so that it determines the time dimension of the batch.
        selected = selected.sort_by_duration(ascending=False)

        # Optional transforms; each one receives the output of the previous one.
        for transform in self.cut_transforms:
            selected = transform(selected)

        # Batched feature matrices, shape (B, T, F).
        # No explicit padding happens here: collation is relied upon to pad if needed.
        features = collate_features(selected)

        # One entry per supervision; sequence_idx points back at the cut's
        # position within the batch.
        supervision_items = []
        for sequence_idx, cut in enumerate(selected):
            for supervision in cut.supervisions:
                start_frame, num_frames = supervision_to_frames(
                    supervision,
                    cut.frame_shift,
                    cut.sampling_rate,
                    max_frames=cut.num_frames,
                )
                supervision_items.append({
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': start_frame,
                    'num_frames': num_frames,
                })

        batch = {
            'features': features,
            'supervisions': default_collate(supervision_items),
        }

        if self.return_cuts:
            # Repeat each cut once per supervision so the list stays aligned
            # with the other supervision fields.
            batch['supervisions']['cut'] = [
                cut for cut in selected for _ in cut.supervisions
            ]

        return batch
Exemplo n.º 7
0
 def __getitem__(self, cut_ids: Iterable[str]) -> Dict[str, torch.Tensor]:
     """
     Build a batch for the cuts identified by ``cut_ids``.

     :param cut_ids: IDs of the cuts to select from ``self.cuts``.
     :return: a dict with ``'features'`` and ``'features_lens'`` (both from
         ``collate_features``) and ``'speaker_activity'`` (the per-cut
         speaker feature masks collated with ``collate_matrices``).
     """
     cuts = self.cuts.subset(cut_ids=cut_ids)
     features, features_lens = collate_features(cuts)

     # Lazily produce one speaker mask per cut.
     speaker_masks = (
         cut.speakers_feature_mask(
             min_speaker_dim=self.min_speaker_dim,
             speaker_to_idx_map=self.speakers,
         )
         for cut in cuts
     )
     # In case padding is needed, pad with the special value that tells
     # the cross entropy loss to ignore the frame during scoring.
     ignored_label = CrossEntropyLoss().ignore_index
     speaker_activity = collate_matrices(speaker_masks, padding_value=ignored_label)

     return {
         'features': features,
         'features_lens': features_lens,
         'speaker_activity': speaker_activity,
     }
Exemplo n.º 8
0
 def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
     """
     Build a batch from ``cuts``.

     :param cuts: the cuts that make up the batch.
     :return: a dict with ``"features"`` and ``"features_lens"`` (both from
         ``collate_features``) and ``"speaker_activity"`` (the per-cut
         speaker feature masks collated with ``collate_matrices``).
     """
     features, features_lens = collate_features(cuts)

     # Lazily produce one speaker mask per cut.
     per_cut_masks = (
         cut.speakers_feature_mask(
             min_speaker_dim=self.min_speaker_dim,
             speaker_to_idx_map=self.speakers,
         )
         for cut in cuts
     )
     # In case padding is needed, pad with the special value that tells
     # the cross entropy loss to ignore the frame during scoring.
     pad_label = CrossEntropyLoss().ignore_index

     batch = {"features": features, "features_lens": features_lens}
     batch["speaker_activity"] = collate_matrices(
         per_cut_masks,
         padding_value=pad_label,
     )
     return batch
Exemplo n.º 9
0
    def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Assemble and return the next batch.

        The cuts come from ``self._collect_batch()``, which is responsible for
        honouring the ``max_frames`` / ``max_cuts`` constraints. They are
        sorted by duration and padded before being collated.

        :return: a dict with ``'features'`` (the result of ``collate_features``)
            and ``'supervisions'`` (one collated entry per supervision, with
            the keys ``'sequence_idx'``, ``'text'``, ``'start_frame'`` and
            ``'num_frames'``). When ``self.return_cuts`` is truthy,
            ``'supervisions'`` also gets a ``'cut'`` list holding the parent
            cut of every supervision.
        """
        cuts: CutSet = self._collect_batch()

        # For now the cuts are simply sorted and padded so they can be batched;
        # padding/mixing with noise could be plugged in here later.
        cuts = cuts.sort_by_duration()
        cuts = cuts.pad()

        # Batched feature matrices, shape (B, T, F).
        features = collate_features(cuts)

        supervision_items = []
        for sequence_idx, cut in enumerate(cuts):
            for supervision in cut.supervisions:
                item = {
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                }
                # Note: "floor" rounding can sometimes include one extra frame
                # of left context, but it guarantees that
                # start_frame + num_frames never goes out of bounds.
                item['start_frame'] = compute_num_frames(
                    supervision.start,
                    frame_shift=cut.frame_shift,
                    rounding=ROUND_FLOOR,
                )
                item['num_frames'] = compute_num_frames(
                    supervision.duration,
                    frame_shift=cut.frame_shift,
                )
                supervision_items.append(item)

        batch = {
            'features': features,
            'supervisions': default_collate(supervision_items),
        }

        if self.return_cuts:
            # Repeat each cut once per supervision so the list stays aligned
            # with the other supervision fields.
            batch['supervisions']['cut'] = [
                cut for cut in cuts for _ in cut.supervisions
            ]

        return batch