Example #1
    def _next_batch(self) -> CutSet:
        # Keep iterating the underlying CutSet until we hit or exceed the constraints
        # provided by the user (the max number of frames or the max number of cuts).
        # Note: no actual data is loaded into memory yet because the manifests contain all the metadata
        # required to do this operation.
        self.time_constraint.reset()
        cuts = []
        while True:

            # Check that we have not reached the end of the dataset.
            try:
                # If this doesn't raise (typical case), it's not the end: keep processing.
                next_cut = next(self.data_source)
            except StopIteration:
                # No more cuts to sample from: if we have a partial batch,
                # we may output it, unless the user requested to drop it.
                # We also check if the batch is "almost there" to override drop_last.
                if cuts and (not self.drop_last
                             or self.time_constraint.close_to_exceeding()):
                    # We have a partial batch and we can return it.
                    self.diagnostics.keep(cuts)
                    return CutSet.from_cuts(cuts)
                else:
                    # There is nothing more to return or it's discarded:
                    # signal the iteration code to stop.
                    self.diagnostics.discard(cuts)
                    raise StopIteration()

            # Check whether the cut we're about to sample satisfies optional user-requested predicate.
            if not self._filter_fn(next_cut):
                # No - try another one.
                self.diagnostics.discard_single(next_cut)
                continue

            # Track the duration/frames/etc. constraints.
            self.time_constraint.add(next_cut)

            # Did we exceed the max_frames and max_cuts constraints?
            if not self.time_constraint.exceeded():
                # No - add the next cut to the batch, and keep trying.
                cuts.append(next_cut)
            else:
                # Yes. Do we have at least one cut in the batch?
                if cuts:
                    # Yes. Return the batch, but keep the currently drawn cut for later.
                    self.data_source.take_back(next_cut)
                    break
                else:
                    # No. We'll warn the user that the constraints might be too tight,
                    # and return the cut anyway.
                    warnings.warn(
                        "The first cut drawn in batch collection violates "
                        "the max_frames, max_cuts, or max_duration constraints - "
                        "we'll return it anyway. "
                        "Consider increasing max_frames/max_cuts/max_duration."
                    )
                    cuts.append(next_cut)

        self.diagnostics.keep(cuts)
        return CutSet.from_cuts(cuts)
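A minimal sketch of how a batch loop like the one above is typically driven from user code; the manifest path and the max_duration/max_cuts values are illustrative assumptions.

from lhotse import CutSet
from lhotse.dataset import SimpleCutSampler

cuts = CutSet.from_file("data/cuts_train.jsonl.gz")  # hypothetical manifest path
# Each iteration step calls _next_batch() under the hood, packing cuts until
# max_duration/max_cuts would be exceeded.
sampler = SimpleCutSampler(cuts, max_duration=200.0, max_cuts=64, shuffle=True)
for batch in sampler:
    # A batch is a CutSet; its total duration stays within the constraint,
    # except when a single cut alone already exceeds it (see the warning above).
    assert sum(c.duration for c in batch) <= 200.0 or len(batch) == 1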
Example #2
 def _next_batch(self) -> Union[CutSet, Tuple[CutSet]]:
     self.allow_iter_to_reset_state()
     if self.merge_batches:
         # Take a batch from each sampler and merge it.
         # Useful when the Dataset class doesn't treat
         # different sources of cuts in any different way.
         #
         # Note: merging batches is tricky because the samplers can be either
         # SimpleCutSampler or CutPairsSampler, and we need to handle them differently.
         cuts: List[Union[CutSet, Tuple[CutSet]]] = []
         for sampler in self.samplers:
             batch = next(sampler)
             cuts.append(batch)
         if not cuts:
             return CutSet()
         if isinstance(batch, CutSet):
             # Each returned batch is a CutSet -- flatten them.
             return CutSet.from_cuts(c for batch in cuts for c in batch)
         else:
             # Each returned batch is a tuple of CutSets -- determine the tuple size N
             # and merge each N-th CutSet together; return a tuple of merged CutSets.
             tuple_len = len(batch)
             cut_sets = []
             for i in range(tuple_len):
                 cut_sets.append(
                     CutSet.from_cuts(c for batch in cuts for c in batch[i])
                 )
             return tuple(cut_sets)
     else:
         # Take a batch from each sampler and return tuple of batches.
         # Useful when the Dataset treats each source differently.
         cuts: List[CutSet] = []
         for sampler in self.samplers:
             cuts.append(next(sampler))
         return tuple(cuts)
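A short usage sketch for the merging logic above, assuming this _next_batch belongs to Lhotse's ZipSampler (which exposes the merge_batches flag used above); the manifest paths are hypothetical.

from lhotse import CutSet
from lhotse.dataset import SimpleCutSampler, ZipSampler

clean = CutSet.from_file("data/clean_cuts.jsonl.gz")  # hypothetical path
noisy = CutSet.from_file("data/noisy_cuts.jsonl.gz")  # hypothetical path
sampler = ZipSampler(
    SimpleCutSampler(clean, max_duration=100.0),
    SimpleCutSampler(noisy, max_duration=100.0),
    merge_batches=True,  # False would yield a tuple of per-sampler CutSets instead
)
batch = next(iter(sampler))  # one CutSet combining a sub-batch from each child sampler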
Example #3
def test_token_collater(add_bos, add_eos):
    test_sentences = [
        "Testing the first sentence.",
        "Let's add some more punctuation, shall we?",
        "How about number 42!",
    ]

    cuts = CutSet.from_cuts(
        dummy_cut(idx,
                  idx,
                  supervisions=[dummy_supervision(idx, idx, text=sentence)])
        for idx, sentence in enumerate(test_sentences))

    token_collater = TokenCollater(cuts, add_bos=add_bos, add_eos=add_eos)
    tokens_batch, tokens_lens = token_collater(cuts)

    assert isinstance(tokens_batch, torch.LongTensor)
    assert isinstance(tokens_lens, torch.IntTensor)

    extend = int(add_bos) + int(add_eos)
    expected_len = len(max(test_sentences, key=len)) + extend
    assert tokens_batch.shape == (len(test_sentences), expected_len)
    assert torch.all(tokens_lens == torch.IntTensor(
        [len(sentence) + extend for sentence in test_sentences]))

    reconstructed = token_collater.inverse(tokens_batch, tokens_lens)
    assert reconstructed == test_sentences
Example #4
def test_cut_set_perturb_speed(cut_with_supervision, cut_with_supervision_start01):
    cut_set = CutSet.from_cuts([cut_with_supervision, cut_with_supervision_start01])
    cs_sp = cut_set.perturb_speed(1.1)
    for cut_sp, cut in zip(cs_sp, cut_set):
        samples = cut_sp.load_audio()
        assert samples.shape[1] == cut_sp.num_samples
        assert samples.shape[1] < cut.num_samples
Example #5
def read_audio_from_cuts(
    cuts: Iterable[Cut],
    executor: Optional[Executor] = None,
    suppress_errors: bool = False,
) -> Tuple[List[torch.Tensor], CutSet]:
    """
    Loads audio data from an iterable of cuts.

    :param cuts: a CutSet or iterable of cuts.
    :param executor: optional Executor (e.g., ThreadPoolExecutor or ProcessPoolExecutor)
        to perform the audio reads in parallel.
    :param suppress_errors: when set to ``True``, will enable fault-tolerant data reads;
        we will skip the cuts and audio data for the instances that failed (and emit a warning).
        When ``False`` (default), the errors will not be suppressed.
    :return: a tuple of two items: a list of audio tensors (with different shapes),
        and a CutSet with the cuts for which we read the data successfully.
    """
    map_fn = map if executor is None else executor.map
    audios = []
    ok_cuts = []
    for idx, (cut, maybe_audio) in enumerate(
            zip(
                cuts,
                map_fn(partial(_read_audio, suppress_errors=suppress_errors),
                       cuts))):
        if maybe_audio is None:
            continue
        else:
            audios.append(maybe_audio)
            ok_cuts.append(cut)
    return audios, CutSet.from_cuts(ok_cuts)
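A brief sketch of calling the helper above with a thread pool; the manifest path and pool size are illustrative.

from concurrent.futures import ThreadPoolExecutor
from lhotse import CutSet

cuts = CutSet.from_file("data/cuts.jsonl.gz")  # hypothetical path
with ThreadPoolExecutor(max_workers=4) as pool:
    # Audio for the cuts is loaded in parallel; with suppress_errors=True,
    # cuts whose audio could not be read are dropped instead of raising.
    audios, ok_cuts = read_audio_from_cuts(cuts, executor=pool, suppress_errors=True)
assert len(audios) == len(ok_cuts)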
Example #6
def _create_buckets_equal_duration_single(cuts: CutSet,
                                          num_buckets: int) -> List[CutSet]:
    """
    Helper method to partition a single CutSet into buckets that have the same
    cumulative duration.

    See also: :meth:`.create_buckets_from_duration_percentiles`.
    """
    total_duration = sum(c.duration for c in cuts)
    bucket_duration = total_duration / num_buckets
    iter_cuts = iter(cuts)
    buckets = []
    for bucket_idx in range(num_buckets):
        bucket = []
        current_duration = 0
        try:
            while current_duration < bucket_duration:
                bucket.append(next(iter_cuts))
                current_duration += bucket[-1].duration
            # For every odd-indexed bucket, take the cut that exceeded the bucket's
            # duration and put it back at the front of the iterable, so that it goes
            # to the next bucket instead. This ensures that the last bucket is not too
            # thin (otherwise all the previous buckets would be a little too large).
            if bucket_idx % 2:
                last_cut = bucket.pop()
                iter_cuts = chain([last_cut], iter_cuts)
        except StopIteration:
            assert bucket_idx == num_buckets - 1
        buckets.append(CutSet.from_cuts(bucket))
    return buckets
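A quick sketch exercising the helper above on dummy cuts of increasing duration; dummy_cut is the test helper from lhotse.testing.dummies used by other examples in this listing.

from lhotse import CutSet
from lhotse.testing.dummies import dummy_cut

cuts = CutSet.from_cuts(dummy_cut(i, duration=float(i + 1)) for i in range(20))
buckets = _create_buckets_equal_duration_single(cuts, num_buckets=4)
assert len(buckets) == 4
# Inspect how evenly the cumulative duration was spread across the buckets:
print([sum(c.duration for c in bucket) for bucket in buckets])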
Example #7
def test_cutset_from_webdataset_sharded_pipe():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"pipe:gzip -c > {dir_path}/shard-%06d.tar.gz"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=2)

        # disabling shard shuffling for testing purposes here
        cuts_ds = CutSet.from_webdataset(
            "pipe:gunzip -c " + dir_path + "/shard-{000000..000004}.tar.gz",
            shuffle_shards=False,
        )

        assert list(cuts.ids) == list(cuts_ds.ids)

        for c, cds in zip(cuts, cuts_ds):
            np.testing.assert_equal(c.load_audio(), cds.load_audio())
            np.testing.assert_almost_equal(
                c.load_features(), cds.load_features(), decimal=2
            )
Example #8
def test_webdataset_sampler_epoch_increment():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json").repeat(10)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"{dir_path}/shard-%06d.tar"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=1)

        cuts_ds = CutSet.from_webdataset(
            [str(p) for p in Path(dir_path).glob("*.tar")], shuffle_shards=True
        )
        sampler = DynamicCutSampler(cuts_ds, max_cuts=1)
        dloader = DataLoader(
            IterableDatasetWrapper(DummyDataset(), sampler, auto_increment_epoch=True),
            batch_size=None,
            num_workers=1,
            persistent_workers=True,
        )

        epoch_batches = {}
        for epoch in [0, 1]:
            batches = []
            for batch in dloader:
                for cut in batch:
                    batches.append(cut)
            epoch_batches[epoch] = CutSet.from_cuts(batches)

        # Both epochs have the same cut IDs.
        assert sorted(epoch_batches[0].ids) == sorted(epoch_batches[1].ids)
        # Both epochs have different cut order (shards were re-shuffled).
        assert list(epoch_batches[0].ids) != list(epoch_batches[1].ids)
Example #9
def libri_cut_set():
    cuts = CutSet.from_json("test/fixtures/libri/cuts.json")
    return CutSet.from_cuts([
        cuts[0],
        cuts[0].with_id("copy-1"),
        cuts[0].with_id("copy-2"),
        cuts[0].append(cuts[0]),
    ])
Example #10
 def detuplify(
     cuts: List[Union[Cut,
                      Tuple[Cut]]]) -> Union[CutSet, Tuple[CutSet]]:
     """Helper to do the right thing whether we sampled single cuts or cut tuples."""
     if isinstance(cuts[0], tuple):
         if len(cuts[0]) == 1:
             cuts = CutSet.from_cuts(cs[0] for cs in cuts)
             self.diagnostics.keep(cuts)
             return cuts
         else:
             tuple_of_cut_lists = list(zip(*cuts))
             self.diagnostics.keep(cuts[0])
             return tuple(
                 [CutSet.from_cuts(cs) for cs in tuple_of_cut_lists])
     else:
         self.diagnostics.keep(cuts)
         return CutSet.from_cuts(cuts)
Example #11
 def __call__(self, cuts: CutSet) -> CutSet:
     if self.random is None:
         self.random = random
     return CutSet.from_cuts(
         cut.perturb_speed(factor=self.random.choice(self.factors),
                           affix_id=not self.preserve_id
                           ) if self.random.random() <= self.p else cut
         for cut in cuts)
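For context, a transform like the one above is typically plugged into a dataset through its cut_transforms argument, as the K2SpeechRecognitionDataset example later in this listing shows; a sketch assuming this is Lhotse's PerturbSpeed transform, whose constructor takes the factors and p attributes referenced above.

from lhotse.dataset import K2SpeechRecognitionDataset, PerturbSpeed

# Assumed constructor arguments: the list of speed factors to sample from,
# and the probability p of perturbing each cut.
dataset = K2SpeechRecognitionDataset(
    cut_transforms=[PerturbSpeed(factors=[0.9, 1.1], p=0.5)],
)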
Example #12
def libri_cut_set():
    cs = CutSet.from_json('test/fixtures/libri/cuts.json')
    return CutSet.from_cuts([
        cs[0],
        cs[0].with_id('copy-1'),
        cs[0].with_id('copy-2'),
        cs[0].append(cs[0])
    ])
Example #13
 def compute_features(self, cuts: Union[Cut, CutSet]) -> torch.Tensor:
     if isinstance(cuts, Cut):
         cuts = CutSet.from_cuts([cuts])
     assert cuts[0].sampling_rate == self.sampling_rate, f'{cuts[0].sampling_rate} != {self.sampling_rate}'
     otf = OnTheFlyFeatures(self.extractor)
     # feats: (batch, seq_len, n_feats)
     feats, _ = otf(cuts)
     return feats
Example #14
 def __call__(self, cuts: CutSet) -> CutSet:
     if self.random is None:
         self.random = random
     return CutSet.from_cuts(
         cut.perturb_volume(
             factor=self.random.uniform(self.scale_low, self.scale_high),
             affix_id=not self.preserve_id,
         ) if self.random.random() <= self.p else cut for cut in cuts)
Example #15
 def __call__(self, cuts: CutSet) -> CutSet:
     if self.random is None:
         self.random = random
     return CutSet.from_cuts(
         cut.perturb_speed(factor=self.random.choice(self.factors))
         if self.random.random() <= self.p
         else cut
         for cut in cuts
     )
Example #16
def test_cut_set_resample_doesnt_duplicate_transforms(cut_with_supervision):
    cuts = CutSet.from_cuts(
        [cut_with_supervision,
         cut_with_supervision.with_id('other-id')])
    cuts_sp = cuts.resample(44100)
    for cut in cuts_sp:
        # This guards against a regression of a bug where multiple cuts referencing
        # the same recording would attach duplicate transforms to the shared manifest.
        assert len(cut.recording.transforms) == 1
Example #17
 def __call__(self, cuts: CutSet) -> CutSet:
     if self.random is None:
         self.random = random
     return CutSet.from_cuts(
         cut.reverb_rir(
             rir_recording=self.random.choice(self.rir_recordings),
             normalize_output=self.normalize_output,
             affix_id=not self.preserve_id,
         ) if self.random.random() <= self.p else cut for cut in cuts)
Example #18
def test_cut_set_reverb_rir_doesnt_duplicate_transforms(cut_with_supervision, rir):
    rirs = RecordingSet.from_recordings([rir])
    cuts = CutSet.from_cuts(
        [cut_with_supervision, cut_with_supervision.with_id("other-id")]
    )
    cuts_vp = cuts.reverb_rir(rir_recordings=rirs)
    for cut in cuts_vp:
        # This guards against a regression of a bug where multiple cuts referencing
        # the same recording would attach duplicate transforms to the shared manifest.
        assert len(cut.recording.transforms) == 1
Example #19
 def test_no_off_by_one_errors_in_dataset_batch_collation(
         self, sampling_rate: int, data):
     ### Test data preparation ###
     # Generate 10 - 20 cut durations in numbers of samples
     nums_samples = data.draw(
         st.lists(
             st.integers(round(sampling_rate * 0.1),
                         round(sampling_rate * 5.0)),
             min_size=10,
             max_size=20,
         ),
         label="Cuts numbers of samples",
     )
     # Generate random cuts
     cuts = [
         self.with_cut(sampling_rate=sampling_rate,
                       num_samples=num_samples,
                       supervision=True) for num_samples in nums_samples
     ]
     # Mix them with random offsets
     mixed_cuts = CutSet.from_cuts(
         lhs.mix(
             rhs,
             # Sample the offset in terms of number of samples, and then divide by the sampling rate
             # to obtain "realistic" offsets
             offset_other_by=data.draw(
                 st.integers(
                     min_value=int(0.1 * sampling_rate),
                     max_value=int(lhs.duration * sampling_rate),
                 ),
                 label=f"Offset for pair {idx + 1}",
             ) / sampling_rate,
         ) for idx, (lhs, rhs) in enumerate(zip(cuts, cuts[1:])))
     # Create an ASR dataset
     dataset = K2SpeechRecognitionDataset(
         return_cuts=True,
         cut_transforms=[CutConcatenate(duration_factor=3.0)],
     )
     sampler = SimpleCutSampler(
         mixed_cuts,
         shuffle=False,
     )
     dloader = DataLoader(dataset, batch_size=None, sampler=sampler)
     ### End of test data preparation ###
     # Test the invariants
     for batch in dloader:
         sups = batch["supervisions"]
         cuts = sups["cut"]
         for idx, cut in enumerate(cuts):
             assert (sups["start_frame"][idx] + sups["num_frames"][idx] <=
                     cut.num_frames), f"Error at index {idx}"
             # assert sups['start_sample'][idx] + sups['num_samples'][
             #     idx] <= cut.num_samples, f"Error at index {idx}"
     # Need to call cleanup manually to free the file handles, otherwise the test may crash
     self.cleanup()
Example #20
def test_cut_set_extend_by():
    cut1 = dummy_cut(int(uuid4()), start=0.0, duration=0.5)
    cut2 = dummy_cut(int(uuid4()), start=0.2, duration=0.4)
    cut_set = CutSet.from_cuts([cut1, cut2])
    extended_cut_set = cut_set.extend_by(duration=0.3,
                                         direction="both",
                                         preserve_id=True)
    assert isclose(extended_cut_set[cut1.id].start, 0.0)
    assert isclose(extended_cut_set[cut1.id].end, 0.8)
    assert isclose(extended_cut_set[cut2.id].start, 0.0)
    assert isclose(extended_cut_set[cut2.id].end, 0.9)
Example #21
 def test_wav_augment_with_executor(self, exec_type):
     cut = self.with_cut(sampling_rate=16000, num_samples=16000)
     with TemporaryDirectory() as d, \
             exec_type(max_workers=4) as ex:
         cut_set = CutSet.from_cuts(
             cut.with_id(str(i)) for i in range(100)).perturb_speed(
                 1.1
             )  # perturb_speed uses torchaudio SoX effect that could hang
         # Just test that it runs and does not hang.
         cut_set_feats = cut_set.compute_and_store_features(
             extractor=Fbank(), storage_path=d, executor=ex)
Example #22
def random_cut_set(n_cuts=100) -> CutSet:
    return CutSet.from_cuts(
        MonoCut(id=uuid4(),
                start=round(random.uniform(0, 5), ndigits=8),
                duration=round(random.uniform(3, 10), ndigits=8),
                channel=0,
                recording=Recording(id=uuid4(),
                                    sources=[],
                                    sampling_rate=16000,
                                    num_samples=1600000,
                                    duration=100.0)) for _ in range(n_cuts))
Example #23
def test_single_cut_sampler_len():
    # total duration is 55 seconds
    # each second has 100 frames
    cuts = CutSet.from_cuts(dummy_cut(idx, duration=float(idx)) for idx in range(1, 11))
    sampler = SimpleCutSampler(cuts, shuffle=True, max_frames=10 * 100, max_cuts=6)

    for epoch in range(5):
        sampler.set_epoch(epoch)
        sampler_len = len(sampler)
        num_batches = len([batch for batch in sampler])
        assert sampler_len == num_batches
Example #24
 def test_wav_augment_with_executor(self, exec_type):
     cut = self.with_cut(sampling_rate=16000, num_samples=16000)
     with TemporaryDirectory() as d, \
             LilcomFilesWriter(storage_path=d) as storage, \
             exec_type(max_workers=4) as ex:
         cut_set = CutSet.from_cuts(cut.with_id(str(i)) for i in range(100))
         # Just test that it runs and does not hang.
         cut_set_feats = cut_set.compute_and_store_features(
             extractor=Fbank(),
             storage=storage,
             augment_fn=SoxEffectTransform(speed(16000)),
             executor=ex)
Example #25
def concat_cuts(
        cuts: Sequence[Cut],
        gap: Seconds = 1.0,
        max_duration: Optional[Seconds] = None
) -> CutSet:
    """
    We're going to concatenate the cuts to minimize the amount of total padding frames used.
    This means that some samples in the batch will be merged together into one sample,
    separated by an interval of silence.
    This is actually solving a knapsack problem.
    In this initial implementation we're using a greedy approach:
    going from the back (i.e. the shortest cuts) we'll try to concat them to the longest cut
    that still has some "space" at the end.

    :param cuts: a list of cuts to pack.
    :param gap: the duration of silence inserted between concatenated cuts.
    :param max_duration: the maximum duration for the concatenated cuts
        (by default set to the duration of the first cut).
    :return: a CutSet of packed cuts.
    """
    if len(cuts) <= 1:
        # Nothing to do.
        return CutSet.from_cuts(cuts)
    cuts = sorted(cuts, key=lambda c: c.duration, reverse=True)
    max_duration = cuts[0].duration if max_duration is None else max_duration
    current_idx = 0
    while True:
        can_fit = False
        shortest = cuts[-1]
        for idx in range(current_idx, len(cuts) - 1):
            cut = cuts[current_idx]
            can_fit = cut.duration + gap + shortest.duration <= max_duration
            if can_fit:
                cuts[current_idx] = cut.pad(cut.duration + gap).append(shortest)
                cuts = cuts[:-1]
                break
            current_idx += 1
        if not can_fit:
            break
    return CutSet.from_cuts(cuts)
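A small worked example of the greedy packing above, built with the dummy_cut test helper used elsewhere in this listing; the durations are illustrative.

from lhotse.testing.dummies import dummy_cut

cuts = [
    dummy_cut(0, duration=8.0),
    dummy_cut(1, duration=3.0),
    dummy_cut(2, duration=2.0),
]
packed = concat_cuts(cuts, gap=1.0)  # max_duration defaults to 8.0 (the longest cut)
# The 2 s cut fits after the 3 s cut (3 + 1 + 2 = 6 <= 8) but not after the
# 8 s cut, so we end up with two cuts of about 8 s and 6 s total duration.
assert len(packed) == 2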
Example #26
 def __call__(self, cuts: CutSet) -> CutSet:
     if self.extra_frames is not None:
         return CutSet.from_cuts(
             c.pad(num_frames=c.num_frames + maybe_sample_int(
                 value=self.extra_frames, sample=self.randomized),
                   pad_feat_value=self.pad_feat_value,
                   direction='both') for c in cuts)
     if self.extra_samples is not None:
         return CutSet.from_cuts(
             c.pad(num_samples=c.num_samples + maybe_sample_int(
                 value=self.extra_samples, sample=self.randomized),
                   direction='both') for c in cuts)
     if self.extra_seconds is not None:
         return CutSet.from_cuts(
             c.pad(duration=c.duration + maybe_sample_float(
                 value=self.extra_seconds,
                 sample=self.randomized,
             ),
                   pad_feat_value=self.pad_feat_value,
                   direction='both') for c in cuts)
     raise ValueError(
         "Implementation error in ExtraPadding (please report this issue).")
Example #27
def test_cut_pairs_sampler_2():
    cut_set = CutSet.from_cuts([
        dummy_cut(0, duration=10),
        dummy_cut(1, duration=20),
    ])
    sampler = CutPairsSampler(
        source_cuts=cut_set,
        target_cuts=cut_set,
        max_source_duration=50,
        max_target_duration=50,
    )
    batch = next(iter(sampler))
    assert len(batch) == 2
Example #28
 def __call__(self, cuts: CutSet) -> CutSet:
     if len(self.rir_recordings) == 0:
         return cuts
     if self.random is None:
         self.random = random
     return CutSet.from_cuts(
         cut.reverb_rir(
             rir_recording=self.random.choice(self.rir_recordings),
             normalize_output=self.normalize_output,
             early_only=self.early_only,
             affix_id=not self.preserve_id,
             rir_channels=self.rir_channels,
         ) if self.random.random() <= self.p else cut for cut in cuts)
Example #29
def cutset():
    return CutSet.from_cuts([
        # MonoCut
        dummy_cut(0, supervisions=[dummy_supervision(0)]),
        # PaddingCut
        PaddingCut('pad', duration=1.0, sampling_rate=16000, feat_value=-100,
                   num_frames=100, frame_shift=0.01, num_features=80, num_samples=16000),
        # MixedCut
        dummy_cut(0, supervisions=[dummy_supervision(0)]).mix(
            dummy_cut(1, supervisions=[dummy_supervision(1)]),
            offset_other_by=0.5,
            snr=10
        )
    ])
Example #30
def test_cut_set_mixed_cut_copy_feats(cuts):
    # Make a CutSet with a MixedCut (a MonoCut padded with a PaddingCut)
    cuts = CutSet.from_cuts([
        # MixedCut
        cuts[0].pad(duration=30)
    ])
    with TemporaryDirectory() as d, NumpyFilesWriter(d) as w:
        cpy = cuts.copy_feats(writer=w)
        assert len(cpy) == len(cuts)
        for cut, orig in zip(cpy, cuts):
            data = cut.load_features()
            assert isinstance(data, np.ndarray)
            ref_data = orig.load_features()
            np.testing.assert_almost_equal(data, ref_data)