def load(
    self,
    start: Optional[Seconds] = None,
    duration: Optional[Seconds] = None,
) -> np.ndarray:
    """
    Read the features from the storage, optionally restricted to a sub-span.

    :param start: requested start time in seconds; when ``None``, ``self.start`` is used.
    :param duration: requested duration in seconds; when ``None``, no right trim is applied.
    :return: whatever ``storage.read`` returns for ``self.storage_key`` and the
        computed frame offsets (annotated as ``np.ndarray``).
    :raises ValueError: when ``start`` precedes ``self.start`` by more than ``1e-5`` seconds.
    """
    # noinspection PyArgumentList
    storage = get_reader(self.storage_type)(self.storage_path)

    if start is None:
        start = self.start

    # Reject requests that begin before the available span (with a small tolerance).
    if start < self.start - 1e-5:
        raise ValueError(f"Cannot load features for recording {self.recording_id} starting from {start}s. "
                         f"The available range is ({self.start}, {self.end}) seconds.")

    # Left trim: skip the frames between the beginning of the features and ``start``.
    left_offset_frames = 0
    if not isclose(start, self.start):
        left_offset_frames = compute_num_frames(
            start - self.start,
            frame_shift=self.frame_shift,
            sampling_rate=self.sampling_rate,
        )

    # Right trim: only needed when a duration was requested and the requested
    # end does not coincide with the end of the stored features.
    right_offset_frames = None
    if duration is not None:
        end = start + duration
        if not isclose(end, self.end):
            right_offset_frames = left_offset_frames + compute_num_frames(
                duration,
                frame_shift=self.frame_shift,
                sampling_rate=self.sampling_rate,
            )

    # Load and return the features (subset) from the storage.
    return storage.read(
        self.storage_key,
        left_offset_frames=left_offset_frames,
        right_offset_frames=right_offset_frames,
    )
def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :param cuts: the cuts whose audio is collated and passed through the extractor.
    :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut before padding.
    """
    audio, _ = collate_audio(cuts)

    # Waveform-level transforms are applied to the whole collated batch.
    for tfnm in self.wave_transforms:
        audio = tfnm(audio)

    features_single = []
    for idx, cut in enumerate(cuts):
        samples = audio[idx].numpy()
        try:
            # ``cut`` is already the idx-th element; no need to index the CutSet again.
            features = self.extractor.extract(samples, cut.sampling_rate)
        except Exception:
            # Log which cut failed, then let the original error propagate.
            # ``Exception`` (rather than a bare ``except``) keeps KeyboardInterrupt /
            # SystemExit from being reported as feature extraction errors.
            logging.error(
                f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
            )
            raise
        features_single.append(torch.from_numpy(features))
    features_batch = torch.stack(features_single)

    feature_lens = torch.tensor(
        [
            compute_num_frames(cut.duration, self.extractor.frame_shift, cut.sampling_rate)
            for cut in cuts
        ],
        dtype=torch.int32,
    )

    return features_batch, feature_lens
def validate_features(f: Features, read_data: bool = False, feats_data: Optional[np.ndarray] = None) -> None:
    """
    Check that a ``Features`` manifest is internally consistent.

    :param f: the manifest to validate.
    :param read_data: when ``True``, the data is loaded with ``f.load()`` and its shape is
        compared with the manifest (any ``feats_data`` passed in is replaced by the loaded data).
    :param feats_data: optional, already loaded data to compare with the manifest;
        it is unpacked as a 2-D ``(num_frames, num_features)`` array.
    :raises AssertionError: when any of the checks fails.
    """
    assert f.start >= 0, \
        f'Features: start has to be greater than or equal to 0 (is {f.start})'
    assert f.duration > 0, \
        f'Features: duration has to be greater than 0 (is {f.duration})'
    assert f.num_frames > 0, \
        f'Features: num_frames has to be greater than 0 (is {f.num_frames})'
    assert f.num_features > 0, \
        f'Features: num_features has to be greater than 0 (is {f.num_features})'
    assert f.sampling_rate > 0, \
        f'Features: sampling_rate has to be greater than 0 (is {f.sampling_rate})'
    assert f.frame_shift > 0, \
        f'Features: frame_shift has to be greater than 0 (is {f.frame_shift})'

    # The hop expressed in samples must be a whole number; rounding to 12 digits
    # removes floating point noise from the multiplication before the check.
    window_hop = round(f.frame_shift * f.sampling_rate, ndigits=12)
    assert float(int(window_hop)) == window_hop, \
        f'Features: frame_shift of {f.frame_shift} is incorrect because it is physically impossible; ' \
        f'multiplying it by a sampling rate of {f.sampling_rate} results in a fractional window hop ' \
        f'of {window_hop} samples.'

    # The declared number of frames has to agree with what the duration implies.
    expected_num_frames = compute_num_frames(
        duration=f.duration,
        frame_shift=f.frame_shift,
        sampling_rate=f.sampling_rate,
    )
    assert expected_num_frames == f.num_frames, \
        f'Features: manifest is inconsistent: declared num_frames is {f.num_frames}, ' \
        f'but duration ({f.duration}s) / frame_shift ({f.frame_shift}s) results in {expected_num_frames} frames. ' \
        f'If you\'re using a custom feature extractor, you might need to ensure that it preserves ' \
        f'this relationship between duration, frame_shift and num_frames (use rounding up if needed - ' \
        f'see lhotse.utils.compute_num_frames).'

    # Optionally verify the actual data shape against the manifest.
    if read_data or feats_data is not None:
        if read_data:
            feats_data = f.load()
        n_fr, n_ft = feats_data.shape
        assert f.num_frames == n_fr, f'Features: expected num_frames: {f.num_frames}, actual: {n_fr}'
        assert f.num_features == n_ft, f'Features: expected num_features: {f.num_features}, actual: {n_ft}'
def test_cut_with_temporal_array_move_to_memory_large_offset():
    """
    A cut with a large start offset must yield the same custom array from disk
    and from memory, both in full and after truncation.
    """
    cut = CutSet.from_file("test/fixtures/libri/cuts.json")[0]
    cut.start = 10.0
    cut.duration = 1.5

    with NamedTemporaryFile(suffix=".h5") as tmp_file, NumpyHdf5Writer(tmp_file.name) as writer:
        num_frames = compute_num_frames(cut.duration, frame_shift=0.01, sampling_rate=16000)
        on_disk = np.array(np.arange(num_frames))
        cut.custom_array = writer.store_array(
            key="dummy-key",
            value=on_disk,
            frame_shift=0.01,
            temporal_dim=0,
            start=cut.start,
        )

        # Full array: in-memory copy vs. the array that was stored.
        memory_cut = cut.move_to_memory()
        in_memory = memory_cut.load_custom_array()
        assert on_disk.dtype == in_memory.dtype
        np.testing.assert_equal(on_disk, in_memory)

        # Truncated array: disk-backed cut vs. in-memory cut.
        on_disk_short = cut.truncate(duration=0.5).load_custom_array()
        in_memory_short = memory_cut.truncate(duration=0.5).load_custom_array()
        assert on_disk_short.dtype == in_memory_short.dtype
        np.testing.assert_equal(on_disk_short, in_memory_short)
def __call__(
    self, cuts: CutSet
) -> Union[
    Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :param cuts: the cuts to read the audio for.
    :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut
        before padding. When ``self.fault_tolerant`` is set, the ``cuts`` returned by
        ``read_audio_from_cuts`` are appended as a third element.
    """
    audios, cuts = read_audio_from_cuts(
        cuts,
        executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        suppress_errors=self.fault_tolerant,
    )

    # Waveform-level transforms are applied to each recording separately.
    for tfnm in self.wave_transforms:
        for idx, audio in enumerate(audios):
            audios[idx] = tfnm(audio)

    if self.use_batch_extract:
        # Batch extraction is possibly faster depending on the implementation
        # of the feature extractor.
        sampling_rate = cuts[0].sampling_rate
        assert all(c.sampling_rate == sampling_rate for c in cuts)
        features_single = self.extractor.extract_batch(
            audios, sampling_rate=sampling_rate
        )
    else:
        # Sequential extraction allows the sampling rates to be different.
        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audios[idx].numpy()
            try:
                # ``cut`` is already the idx-th element; no need to index the CutSet again.
                features = self.extractor.extract(samples, cut.sampling_rate)
            except Exception:
                # Log which cut failed, then let the original error propagate.
                # ``Exception`` (rather than a bare ``except``) keeps KeyboardInterrupt /
                # SystemExit from being reported as feature extraction errors.
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))

    features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

    feature_lens = torch.tensor(
        [
            compute_num_frames(
                cut.duration, self.extractor.frame_shift, cut.sampling_rate
            )
            for cut in cuts
        ],
        dtype=torch.int32,
    )

    if self.fault_tolerant:
        return features_batch, feature_lens, cuts
    else:
        return features_batch, feature_lens
def asserted_num_frames(start: Seconds, duration: Seconds, frame_shift: Seconds) -> int:
    """
    This closure will compute the num_frames, correct off-by-one errors in edge cases,
    and assert that the supervision does not exceed the feature matrix temporal dimension.

    It reads ``features`` from the enclosing scope and uses ``features.shape[1]``
    as the temporal dimension.
    """
    offset = compute_num_frames(start, frame_shift=frame_shift)
    num_frames = compute_num_frames(duration, frame_shift=frame_shift)
    # Note: we tolerate off-by-ones because some mixed cuts could have one frame more
    # than their duration suggests (we will try to change this eventually).
    exceeds_by_one = features.shape[1] - (offset + num_frames) == -1
    if exceeds_by_one:
        num_frames = num_frames - 1
    assert offset + num_frames <= features.shape[1], (
        f"Unexpected num_frames ({offset + num_frames}) exceeding features time dimension for a supervision "
        f"({features.shape[1]}) when constructing a batch; please report this in Lhotse's GitHub issues, "
        "ideally providing the Cut data that triggered this."
    )
    return num_frames
def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the constraints
    of max_frames and max_cuts.

    The batch is a dict with the collated ``'features'`` and the collated
    per-supervision ``'supervisions'`` entries.
    """
    from torch.utils.data._utils.collate import default_collate

    # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
    # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
    cuts: CutSet = self._collect_batch()

    # For now, we'll just pad it with low energy values to match the longest Cut's
    # duration in the batch. We might want to do something more interesting here
    # later on - padding/mixing with noises, etc.
    cuts = cuts.sort_by_duration().pad()

    # Get a tensor with batched feature matrices, shape (B, T, F)
    features = _collate_features(cuts)

    def asserted_num_frames(start: Seconds, duration: Seconds, frame_shift: Seconds) -> int:
        """
        This closure will compute the num_frames, correct off-by-one errors in edge cases,
        and assert that the supervision does not exceed the feature matrix temporal dimension.
        """
        offset = compute_num_frames(start, frame_shift=frame_shift)
        num_frames = compute_num_frames(duration, frame_shift=frame_shift)
        # Note: we tolerate off-by-ones because some mixed cuts could have one frame more
        # than their duration suggests (we will try to change this eventually).
        exceeds_by_one = features.shape[1] - (offset + num_frames) == -1
        if exceeds_by_one:
            num_frames = num_frames - 1
        assert offset + num_frames <= features.shape[1], (
            f"Unexpected num_frames ({offset + num_frames}) exceeding features time dimension for a supervision "
            f"({features.shape[1]}) when constructing a batch; please report this in Lhotse's GitHub issues, "
            "ideally providing the Cut data that triggered this."
        )
        return num_frames

    # One record per supervision, tagged with the index of its cut in the batch.
    supervision_records = []
    for sequence_idx, cut in enumerate(cuts):
        for supervision in cut.supervisions:
            supervision_records.append({
                'cut_id': cut.id,
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': compute_num_frames(supervision.start, cut.frame_shift),
                'num_frames': asserted_num_frames(supervision.start, supervision.duration, cut.frame_shift),
            })

    return {
        'features': features,
        'supervisions': default_collate(supervision_records),
    }
def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the constraints
    of max_frames and max_cuts.

    The batch is a dict with the collated ``'features'`` and the collated
    per-supervision ``'supervisions'`` entries; when ``self.return_cuts`` is set,
    ``batch['supervisions']['cut']`` holds the cut of every supervision.
    """
    # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
    # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
    cuts: CutSet = self._collect_batch()

    # For now, we'll just pad it with low energy values to match the longest Cut's
    # duration in the batch. We might want to do something more interesting here
    # later on - padding/mixing with noises, etc.
    cuts = cuts.sort_by_duration().pad()

    # Get a tensor with batched feature matrices, shape (B, T, F)
    features = collate_features(cuts)

    # One record per supervision, tagged with the index of its cut in the batch.
    supervision_records = []
    for sequence_idx, cut in enumerate(cuts):
        for supervision in cut.supervisions:
            # Note: Rounding "floor" can sometimes result in one extra frame being included
            # in the left context; but it guarantees that we will never go out-of-bounds when
            # summing start_frame + num_frames.
            start_frame = compute_num_frames(
                supervision.start,
                frame_shift=cut.frame_shift,
                rounding=ROUND_FLOOR,
            )
            num_frames = compute_num_frames(
                supervision.duration,
                frame_shift=cut.frame_shift,
            )
            supervision_records.append({
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames,
            })

    batch = {
        'features': features,
        'supervisions': default_collate(supervision_records),
    }

    if self.return_cuts:
        # Repeat each cut once per supervision so the list lines up with the other entries.
        cut_per_supervision = []
        for cut in cuts:
            for _ in cut.supervisions:
                cut_per_supervision.append(cut)
        batch['supervisions']['cut'] = cut_per_supervision

    return batch
def _pad_frames(self, samples: np.ndarray, feats: np.ndarray, sampling_rate: int) -> np.ndarray:
    """
    Pad or truncate ``feats`` along the first axis so that its number of frames
    fits ``lhotse.utils.compute_num_frames``.

    :param samples: the audio the features were computed from; ``samples.shape[1]``
        is used as the number of samples.
    :param feats: the feature matrix with frames along the first axis.
    :param sampling_rate: the sampling rate of ``samples``.
    :return: ``feats`` with the last ``diff`` frames repeated at the end when it is
        too short, with the surplus trailing frames removed when it is too long,
        or unchanged otherwise.
    """
    duration = np.shape(samples)[1] / sampling_rate
    expected_num_frames = compute_num_frames(duration, self.frame_shift, sampling_rate)
    diff = expected_num_frames - np.shape(feats)[0]
    if abs(diff) >= 6:
        warnings.warn(f"Unusual difference in number of frames: {diff}")
    if diff > 0:
        # Too few frames: repeat the last ``diff`` frames at the end.
        feats = np.append(feats, feats[-diff:, :], axis=0)
    elif diff < 0:
        # Too many frames: keep only the expected leading frames.
        # (``feats[:-diff]`` would be wrong here: ``diff`` is negative, so ``-diff``
        # is positive and the slice would keep just the first ``|diff|`` frames.)
        feats = feats[:expected_num_frames, :]
    return feats
def test_num_frames(
    feature_set,
    feature_level,
):
    """
    The number of frames produced by the extractor must match ``compute_num_frames``
    for a random signal of 12.059 s at 8 kHz.
    """
    sampling_rate = 8000
    duration = 12.059

    extractor = OpenSmileExtractor(
        config=OpenSmileConfig(
            feature_set=feature_set,
            feature_level=feature_level,
            sampling_rate=sampling_rate,
            resample=True,
        )
    )

    expected_num_frames = compute_num_frames(duration, extractor.frame_shift, sampling_rate)

    # Random single-channel signal of the requested duration.
    signal = np.random.rand(1, compute_num_samples(duration, sampling_rate))
    feats = extractor.extract(signal, sampling_rate)

    assert np.shape(feats)[0] == expected_num_frames
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Build a batch dict for the given ``cuts``.

    The batch has the keys ``"inputs"`` and ``"supervisions"``; the latter holds the
    collated texts, the supervision intervals from the input strategy, optionally
    the cuts (``self.return_cuts``) and - when every supervision has a word
    alignment - the ``"word"``, ``"word_start"`` and ``"word_end"`` lists.

    :raises ValueError: when word alignments are present but the frame shift is
        available neither in the first cut nor in the input strategy's extractor.
    """
    validate_for_asr(cuts)

    self.hdf5_fix.update()

    # Sort the cuts by duration so that the first one determines the batch time dimensions.
    cuts = cuts.sort_by_duration(ascending=False)

    # Optional CutSet transforms - e.g. padding, or speed perturbation that adjusts
    # the supervision boundaries.
    for cut_transform in self.cut_transforms:
        cuts = cut_transform(cuts)

    # Get a tensor with batched feature matrices, shape (B, T, F)
    # Collation performs auto-padding, if necessary.
    collated = self.input_strategy(cuts)
    if len(collated) == 3:
        # An input strategy with fault tolerant audio reading mode.
        # "cuts" may be a subset of the original "cuts" variable,
        # that only has cuts for which we successfully read the audio.
        inputs, _, cuts = collated
    else:
        inputs, _ = collated

    # Get a dict of tensors that encode the positional information about supervisions
    # in the batch of feature matrices. The tensors are named "sequence_idx",
    # "start_frame/sample" and "num_frames/samples".
    supervision_intervals = self.input_strategy.supervision_intervals(cuts)

    # Apply all available transforms on the inputs, i.e. either audio or features.
    # This could be feature extraction, global MVN, SpecAugment, etc.
    segments = torch.stack(list(supervision_intervals.values()), dim=1)
    for input_transform in self.input_transforms:
        inputs = input_transform(inputs, supervision_segments=segments)

    text_records = [
        {"text": supervision.text}
        for cut in cuts
        for supervision in cut.supervisions
    ]
    batch = {
        "inputs": inputs,
        "supervisions": default_collate(text_records),
    }
    # Update the 'supervisions' field with sequence_idx and start/num frames/samples
    batch["supervisions"].update(supervision_intervals)
    if self.return_cuts:
        batch["supervisions"]["cut"] = [
            cut for cut in cuts for _ in cut.supervisions
        ]

    # Word-level information is only added when every supervision has a word alignment.
    lacks_word_alignment = any(
        s.alignment is None or "word" not in s.alignment
        for c in cuts
        for s in c.supervisions
    )
    if lacks_word_alignment:
        return batch

    # TODO: might need to refactor BatchIO API to move the following conditional logic
    #       into these objects (e.g. use like: self.input_strategy.convert_timestamp(),
    #       that returns either num_frames or num_samples depending on the strategy).
    frame_shift = cuts[0].frame_shift
    sampling_rate = cuts[0].sampling_rate
    if frame_shift is None:
        try:
            frame_shift = self.input_strategy.extractor.frame_shift
        except AttributeError:
            raise ValueError(
                "Can't determine the frame_shift -- it is not present either in cuts or the input_strategy. "
            )

    def to_frames(timestamp):
        # Convert an alignment timestamp using the batch-level frame shift and sampling rate.
        return compute_num_frames(
            timestamp,
            frame_shift=frame_shift,
            sampling_rate=sampling_rate,
        )

    words, starts, ends = [], [], []
    for cut in cuts:
        for supervision in cut.supervisions:
            word_items = supervision.alignment["word"]
            words.append([item.symbol for item in word_items])
            starts.append([to_frames(item.start) for item in word_items])
            ends.append([to_frames(item.end) for item in word_items])

    batch["supervisions"]["word"] = words
    batch["supervisions"]["word_start"] = starts
    batch["supervisions"]["word_end"] = ends

    return batch
def add_to_mix(
    self,
    feats: np.ndarray,
    sampling_rate: int,
    snr: Optional[Decibels] = None,
    offset: Seconds = 0.0,
):
    """
    Add feature matrix of a new track into the mix.

    :param feats: A 2D feature matrix to be mixed in.
    :param sampling_rate: The sampling rate of ``feats``
    :param snr: Signal-to-noise ratio, assuming ``feats`` represents noise (positive SNR - lower ``feats`` energy,
        negative SNR - higher ``feats`` energy)
    :param offset: How many seconds to shift ``feats`` in time. For mixing, the signal will be padded before
        the start with low energy values.
    :raises NonPositiveEnergyError: when ``snr`` is given and the energy of ``feats`` is not positive.
    """
    assert offset >= 0.0, "Negative offset in mixing is not supported."

    def padding_frames(count: int) -> np.ndarray:
        # A block of ``count`` frames filled with the padding value.
        return self.padding_value * np.ones(
            (count, self.num_features),
            dtype=self.dtype,
        )

    num_frames_offset = compute_num_frames(
        duration=offset,
        frame_shift=self.frame_shift,
        sampling_rate=sampling_rate,
    )
    current_num_frames = self.tracks[0].shape[0]
    incoming_num_frames = feats.shape[0] + num_frames_offset
    mix_num_frames = max(current_num_frames, incoming_num_frames)

    # When the existing frames are less than what we anticipate after the mix,
    # we need to pad after the end of the existing features mixed so far.
    if current_num_frames < mix_num_frames:
        missing_frames = mix_num_frames - current_num_frames
        for idx, track in enumerate(self.tracks):
            self.tracks[idx] = np.vstack([track, padding_frames(missing_frames)])

    feats_to_add = feats

    # When there is an offset, we need to pad before the start of the features we're adding.
    if offset > 0:
        feats_to_add = np.vstack([padding_frames(num_frames_offset), feats_to_add])

    # When the features we're mixing in are shorter that the anticipated mix length,
    # we need to pad after their end.
    # Note: we're doing that inefficiently, as we potentially re-allocate numpy arrays twice,
    # during this padding and the offset padding before. If that's a bottleneck, we'll optimize.
    if incoming_num_frames < mix_num_frames:
        feats_to_add = np.vstack(
            [feats_to_add, padding_frames(mix_num_frames - incoming_num_frames)]
        )

    # When SNR is requested, find what gain is needed to satisfy the SNR
    if snr is None:
        gain = 1.0
    else:
        # Compute the added signal energy before it was padded
        added_feats_energy = self.feature_extractor.compute_energy(feats)
        if added_feats_energy <= 0.0:
            raise NonPositiveEnergyError(
                f"To perform mix, energy must be non-zero and non-negative (got {added_feats_energy}). "
            )
        target_energy = self.reference_energy * (10.0**(-snr / 10))
        gain = target_energy / added_feats_energy

    self.tracks.append(feats_to_add)
    self.gains.append(gain)
def __call__(
    self, cuts: CutSet
) -> Union[
    Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :param cuts: the cuts to read the audio for.
    :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
        Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
        CutSet ``cuts`` is returned when ``fault_tolerant=True``.
    """
    audios, cuts = read_audio_from_cuts(
        cuts,
        executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        suppress_errors=self.fault_tolerant,
    )

    # Waveform-level transforms are applied to each recording separately.
    for tfnm in self.wave_transforms:
        for idx, audio in enumerate(audios):
            audios[idx] = tfnm(audio)

    if self.use_batch_extract:
        # Batch extraction is possibly faster depending on the implementation
        # of the feature extractor.
        sampling_rate = cuts[0].sampling_rate
        assert all(c.sampling_rate == sampling_rate for c in cuts)
        features_single = self.extractor.extract_batch(
            audios, sampling_rate=sampling_rate
        )
    else:
        # Sequential extraction allows the sampling rates to be different.
        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audios[idx].numpy()
            try:
                # ``cut`` is already the idx-th element; no need to index the CutSet again.
                features = self.extractor.extract(samples, cut.sampling_rate)
            except Exception:
                # Log which cut failed, then let the original error propagate.
                # ``Exception`` (rather than a bare ``except``) keeps KeyboardInterrupt /
                # SystemExit from being reported as feature extraction errors.
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))

    features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

    feature_lens = torch.tensor(
        [
            compute_num_frames(
                cut.duration, self.extractor.frame_shift, cut.sampling_rate
            )
            for cut in cuts
        ],
        dtype=torch.int64,
    )

    out = (features_batch, feature_lens)

    if self.return_audio:
        audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
        audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
        audios = collate_vectors(audios, padding_value=0)

        out = out + (audios, audio_lens)

    if self.fault_tolerant:
        out = out + (cuts,)

    return out
def logmelfilterbank(
    audio: np.ndarray,
    sampling_rate: int,
    fft_size: int = 1024,
    hop_size: int = 256,
    win_length: int = None,
    window: str = "hann",
    num_mel_bins: int = 80,
    fmin: int = 80,
    fmax: int = 7600,
    eps: float = EPSILON,
):
    """Compute log-Mel filterbank feature.

    Args:
        audio (ndarray): Audio signal (T,); a single-channel (1, T) array is also accepted.
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type.
        num_mel_bins (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation. None is treated as 0.
        fmax (int): Maximum frequency in mel basis calculation.
            None is treated as sampling_rate / 2.
        eps (float): Epsilon value to avoid inf in log calculation.

    Returns:
        ndarray: Log Mel filterbank feature (#source_feats, num_mel_bins).

    Raises:
        ImportError: If librosa is not installed.
        AssertionError: If ``audio`` is not a single-channel signal.
    """
    if is_module_available("librosa"):
        import librosa
    else:
        raise ImportError(
            "Librosa is not installed. Please install librosa before using LibrosaFbank extractor."
        )

    if len(audio.shape) == 2:
        assert (
            audio.shape[0] == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"
        audio = audio[0]
    else:
        assert (
            len(audio.shape) == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"

    x_stft = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode="reflect",
    )
    # Magnitude spectrogram, transposed so that frames come first.
    spc = np.abs(x_stft).T

    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    # Keyword arguments are required here: recent librosa releases made the
    # parameters of ``filters.mel`` keyword-only, and older releases accept
    # the same keywords.
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=fft_size,
        n_mels=num_mel_bins,
        fmin=fmin,
        fmax=fmax,
    )

    # ``eps`` bounds the mel energies from below before taking the logarithm.
    feats = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    # Make the number of frames agree with ``compute_num_frames``.
    expected_num_frames = compute_num_frames(
        duration=len(audio) / sampling_rate,
        frame_shift=hop_size / sampling_rate,
        sampling_rate=sampling_rate,
    )
    feats = pad_or_truncate_features(feats, expected_num_frames)
    return feats