Example 1
def __iter__(self):
    # Wrap the underlying iterator in a bounded-buffer streaming shuffle.
    return iter(
        streaming_shuffle(
            iter(self.iterator),
            bufsize=self.buffer_size,
            rng=self.rng,
        )
    )
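
For context, `streaming_shuffle` shuffles an iterator while holding only a bounded buffer of items in memory (the docstring in Example 4 describes it as a reservoir-sampling-based algorithm). Below is a minimal self-contained sketch of that general technique; `toy_streaming_shuffle` is a hypothetical name for illustration, not lhotse's actual implementation:

import random
from typing import Iterator, TypeVar

T = TypeVar("T")

def toy_streaming_shuffle(data: Iterator[T], bufsize: int, rng: random.Random) -> Iterator[T]:
    # Hold at most `bufsize` items; for each incoming item past that point,
    # swap it with a randomly chosen buffered item and emit the evicted one.
    buf = []
    for item in data:
        if len(buf) < bufsize:
            buf.append(item)
            continue
        idx = rng.randrange(len(buf))
        buf[idx], item = item, buf[idx]
        yield item
    # Drain the remaining buffer in random order.
    rng.shuffle(buf)
    yield from buf

The output is a permutation of the input (same items, no duplicates), with randomness quality controlled by `bufsize`, which is exactly what the test in Example 3 asserts.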
Example 2
def __iter__(self) -> "DynamicCutSampler":
    if self._just_restored_state:
        return self
    self.rng = random.Random(self.seed + self.epoch)
    # Initialize iteration
    self.cuts_iter = [iter(cs) for cs in self.cuts]
    # Optionally shuffle
    if self.shuffle:
        self.cuts_iter = [
            # Important -- every shuffler gets its own copy of an RNG seeded
            # the same way, so that the shuffles are reproducible.
            streaming_shuffle(
                cs,
                rng=random.Random(self.seed + self.epoch),
                bufsize=self.shuffle_buffer_size,
            )
            for cs in self.cuts_iter
        ]
    # Apply filter predicate
    self.cuts_iter = Filter(
        iterator=zip(*self.cuts_iter),
        predicate=lambda tpl: all(self._filter_fn(c) for c in tpl),
        diagnostics=self.diagnostics,
    )
    # Convert Iterable[Cut] -> Iterable[CutSet]
    self.cuts_iter = DurationBatcher(
        self.cuts_iter,
        max_duration=self.max_duration,
        max_cuts=self.max_cuts,
        drop_last=self.drop_last,
        diagnostics=self.diagnostics,
        strict=self.strict,
    )
    self.cuts_iter = iter(self.cuts_iter)
    return self
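
A hedged usage sketch for the sampler above; it assumes `cuts` is a lazily-opened lhotse CutSet and that `set_epoch` updates `self.epoch`, which (per the `self.seed + self.epoch` line) re-seeds the shuffle each epoch:

# Hypothetical usage, not taken verbatim from lhotse's docs.
sampler = DynamicCutSampler(cuts, max_duration=200.0, shuffle=True, seed=0)
for epoch in range(3):
    sampler.set_epoch(epoch)  # changes self.epoch, hence the shuffle order
    for batch in sampler:     # each `batch` is a CutSet assembled by DurationBatcher
        ...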
Example 3
def test_streaming_shuffle(datasize, bufsize):
    data = list(range(int(datasize)))
    shuffled = list(
        streaming_shuffle(
            iter(data),
            bufsize=int(bufsize),
            rng=random.Random(42),
        )
    )
    # The result must be a permutation of the input: same length,
    # no duplicates, and (with overwhelming probability) a new order.
    assert len(data) == len(shuffled)
    assert len(shuffled) == len(set(shuffled))
    assert data != shuffled
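
Since the test takes `datasize` and `bufsize` as arguments, it is presumably driven by a parametrization decorator in the original test suite; the grid below is an assumption for illustration only:

import pytest

@pytest.mark.parametrize("datasize", [100, 1000, 10000])  # hypothetical values
@pytest.mark.parametrize("bufsize", [100, 1000])          # hypothetical values
def test_streaming_shuffle(datasize, bufsize):
    ...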
Example 4
    def __init__(
        self,
        *cuts: CutSet,
        max_duration: float,
        max_cuts: Optional[int] = None,
        num_buckets: int = 10,
        shuffle: bool = False,
        drop_last: bool = False,
        consistent_ids: bool = True,
        num_cuts_for_bins_estimate: int = 10000,
        buffer_size: int = 10000,
        shuffle_buffer_size: int = 20000,
        strict: bool = True,
        world_size: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
    ) -> None:
        """
        :param cuts: one or more CutSets (when more than one, will yield tuples of CutSets as mini-batches)
        :param max_duration: The maximum total recording duration from ``cuts``.
            Note: with multiple CutSets, ``max_duration`` constraint applies only to the first CutSet.
        :param max_cuts: The maximum total number of ``cuts`` per batch.
            When only ``max_duration`` is specified, this sampler yields static batch sizes.
        :param num_buckets: how many buckets to create.
        :param shuffle: When ``True``, the cuts will be shuffled dynamically with
            a reservoir-sampling-based algorithm.
            Convenient when mini-batch loop is inside an outer epoch-level loop, e.g.:
            `for epoch in range(10): for batch in dataset: ...` as every epoch will see a
            different cuts order.
        :param drop_last: When ``True``, we will drop all incomplete batches.
            A batch is considered incomplete if it depleted a bucket before
            hitting the constraint such as max_duration, max_cuts, etc.
        :param consistent_ids: Only affects processing of multiple CutSets.
            When ``True``, at each sampling step we check cuts from all CutSets have the same ID
            (i.e., the first cut from every CutSet should have the same ID, same for the second, third, etc.).
        :param num_cuts_for_bins_estimate: We will draw this many cuts to estimate the duration bins
            for creating similar-duration buckets.
            Larger number means a better estimate to the data distribution, possibly at a longer init cost.
        :param buffer_size: How many cuts (or cut pairs, triplets) we hold at any time across all
            of the buckets.
            Increasing ``max_duration`` (batch_size) or ``num_buckets`` might require increasing this number.
            It will result in larger memory usage.
        :param shuffle_buffer_size: How many cuts (or cut pairs, triplets) are being held in memory
            a buffer used for streaming shuffling. Larger number means better randomness at the cost
            of higher memory usage.
        :param strict: When ``True``, for the purposes of determining dynamic batch size,
            we take the longest cut sampled so far and multiply its duration/num_frames/num_samples
            by the number of cuts currently in mini-batch to check if it exceeded max_duration/etc.
            This can help make the GPU memory usage more predictable when there is a large variance
            in cuts duration.
        :param world_size: Total number of distributed nodes. We will try to infer it by default.
        :param rank: Index of distributed node. We will try to infer it by default.
        :param seed: Random seed used to consistently shuffle the dataset across different processes.
        """
        super().__init__(world_size=world_size, rank=rank, seed=seed)
        if not all(cs.is_lazy for cs in cuts if isinstance(cs, CutSet)):
            warnings.warn(
                "You are using DynamicBucketingSampler with an eagerly read CutSet. "
                "You won't see any memory/speed benefits with that setup. "
                "Either use 'CutSet.from_jsonl_lazy' to read the CutSet lazily, or use a BucketingSampler instead."
            )
        self.cuts = cuts
        self.max_duration = max_duration
        self.max_cuts = max_cuts
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.consistent_ids = consistent_ids
        self.num_cuts_for_bins_estimate = num_cuts_for_bins_estimate
        self.buffer_size = buffer_size
        self.shuffle_buffer_size = shuffle_buffer_size
        self.strict = strict
        self.rng = None

        # Estimate duration bins for bucketing from an initial sample of cuts,
        # shuffled the same way as during iteration when shuffling is enabled.
        if self.shuffle:
            cuts_for_bins_estimate = streaming_shuffle(
                iter(self.cuts[0]),
                rng=random.Random(self.seed),
                bufsize=self.shuffle_buffer_size,
            )
        else:
            cuts_for_bins_estimate = self.cuts[0]
        self.duration_bins = estimate_duration_buckets(
            islice(cuts_for_bins_estimate, num_cuts_for_bins_estimate),
            num_buckets=num_buckets,
        )
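
To tie the pieces together, here is a hedged end-to-end sketch of constructing and consuming this sampler. The file name and DataLoader wiring are assumptions based on the typical lhotse pattern; `batch_size=None` because the sampler already yields whole batches:

from torch.utils.data import DataLoader
from lhotse import CutSet

# Lazy CutSet, as the warning in __init__ above recommends ("cuts.jsonl.gz" is a placeholder path).
cuts = CutSet.from_jsonl_lazy("cuts.jsonl.gz")
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=300.0,  # seconds of audio per mini-batch
    num_buckets=10,
    shuffle=True,
    seed=0,
)
# `dataset` would be a map-style lhotse dataset (e.g., K2SpeechRecognitionDataset);
# the sampler emits CutSets, so the DataLoader's own batching is disabled.
# loader = DataLoader(dataset, sampler=sampler, batch_size=None)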