def supervision_masks(
    self, cuts: CutSet, use_alignment_if_exists: Optional[str] = None
) -> torch.Tensor:
    """Compute a padded batch of per-sample supervision masks, one row per cut.

    :param use_alignment_if_exists: optional str, key for alignment type to use
        for generating the mask. If the alignment is not present, fall back on
        the supervision time spans.
    :return: a ``(B, T)`` tensor of 0/1 masks collated with padding.
    """
    per_cut_masks = [
        cut.supervisions_audio_mask(use_alignment_if_exists=use_alignment_if_exists)
        for cut in cuts
    ]
    return collate_vectors(per_cut_masks)
def supervision_masks(self, cuts: CutSet) -> torch.Tensor:
    """Return a padded ``(B, T)`` batch of per-frame supervision masks.

    Each row marks which feature frames of the corresponding cut are covered
    by a supervision segment.
    """
    frame_masks = [c.supervisions_feature_mask() for c in cuts]
    return collate_vectors(frame_masks)
def __call__(
    self, cuts: CutSet
) -> Union[
    Tuple[torch.Tensor, torch.Tensor],
    Tuple[torch.Tensor, torch.Tensor, CutSet],
]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
        Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
        CutSet ``cuts`` is returned when ``fault_tolerant=True``.
    """
    # When fault_tolerant is enabled, cuts whose audio failed to load are
    # dropped, so ``cuts`` may shrink here.
    audios, cuts = read_audio_from_cuts(
        cuts,
        executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        suppress_errors=self.fault_tolerant,
    )

    # Apply waveform-level transforms (e.g. augmentation) before extraction.
    for tfnm in self.wave_transforms:
        for idx in range(len(audios)):
            audios[idx] = tfnm(audios[idx])

    if self.use_batch_extract:
        # Batch extraction is possibly faster depending on the implementation
        # of the feature extractor. It requires a uniform sampling rate.
        assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
        features_single = self.extractor.extract_batch(
            audios, sampling_rate=cuts[0].sampling_rate
        )
    else:
        # Sequential extraction allows the sampling rates to be different.
        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audios[idx].numpy()
            try:
                # ``cut`` is already bound by enumerate — no need for cuts[idx].
                features = self.extractor.extract(samples, cut.sampling_rate)
            except Exception:
                # Narrowed from a bare ``except:`` so that KeyboardInterrupt /
                # SystemExit are not logged as feature-extraction failures.
                # The error is logged with context and then re-raised.
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))

    # Pad variable-length feature matrices into a single (B, T, F) tensor.
    features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

    feature_lens = torch.tensor(
        [
            compute_num_frames(
                cut.duration, self.extractor.frame_shift, cut.sampling_rate
            )
            for cut in cuts
        ],
        dtype=torch.int64,
    )

    out = (features_batch, feature_lens)

    if self.return_audio:
        audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
        audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
        audios = collate_vectors(audios, padding_value=0)
        out = out + (audios, audio_lens)

    if self.fault_tolerant:
        out = out + (cuts,)
    return out
def supervision_masks(self, cuts: CutSet) -> torch.Tensor:
    """Return a padded ``(B, T)`` batch of per-frame supervision masks.

    The frame granularity is taken from this strategy's feature extractor
    (``self.extractor.frame_shift``).
    """
    frame_shift = self.extractor.frame_shift
    return collate_vectors(
        [compute_supervisions_frame_mask(c, frame_shift) for c in cuts]
    )