Example #1
    def __init__(
        self,
        dataset,
        batch_sampler,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
    ):

        assert isinstance(dataset, OrderedDict)
        assert len(dataset)
        assert isinstance(dataset[next(iter(dataset))], FairseqDataset)

        self.iterators = []

        self.epoch = epoch
        for key, dt in dataset.items():
            epoch_iter = iterators.EpochBatchIterator(
                dataset=dt,
                collate_fn=dt.collater,
                batch_sampler=batch_sampler[key],
                seed=seed,
                num_shards=num_shards,
                shard_id=shard_id,
                num_workers=num_workers,
                epoch=epoch,
            )
            self.iterators.append(epoch_iter)

    def get_batch_iterator(
            self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
            ignore_invalid_inputs=False, required_batch_size_multiple=1,
            seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0,
    ):
        assert isinstance(dataset, FairseqDataset)
        max_positions = (10240, 1024)  # hard-coded override of the max_positions argument
        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        indices = data_utils.filter_by_size(
            indices, dataset.size, max_positions, raise_exception=(not ignore_invalid_inputs),
        )

        # create mini-batches with given size constraints
        batch_sampler = data_utils.batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

        # return a reusable, sharded iterator
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
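A minimal, self-contained sketch of the wrapping step these examples share, using fairseq's real ListDataset and iterators modules; the toy data and the hand-built batch_sampler are our own illustration:

import torch
from fairseq.data import ListDataset, iterators

# ten one-token "sentences"
data = [torch.tensor([i]) for i in range(10)]
dataset = ListDataset(data, sizes=[1] * len(data))

# pre-computed index batches: three full batches plus a remainder
batch_sampler = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

epoch_itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=dataset.collater,
    batch_sampler=batch_sampler,
    seed=1,
)
for batch in epoch_itr.next_epoch_itr(shuffle=False):
    print(batch)  # each batch is a stacked tensor from the default collater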
Example #3
def get_epoch_iterator(task,
                       dataset,
                       max_tokens=None,
                       max_sentences=None,
                       max_positions=None,
                       ignore_invalid_inputs=False,
                       required_batch_size_multiple=1,
                       num_workers=0,
                       seed=215,
                       num_shards=1,
                       shard_id=0,
                       epoch=0):
    """ Get an iterator that yields batches of data from the given dataset. """
    if dataset in task.dataset_to_epoch_iter:
        return task.dataset_to_epoch_iter[dataset]

    # initialize the dataset with the correct starting epoch
    dataset.set_epoch(epoch)

    assert isinstance(dataset, FairseqDataset)

    # get indices ordered by example size
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs))

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple)

    epoch_iter = iterators.EpochBatchIterator(dataset=dataset,
                                              collate_fn=collate,  # collate fn defined elsewhere in the source module
                                              batch_sampler=batch_sampler,
                                              seed=seed,
                                              num_shards=num_shards,
                                              shard_id=shard_id,
                                              num_workers=num_workers,
                                              epoch=epoch)
    task.dataset_to_epoch_iter[dataset] = epoch_iter
    return epoch_iter
Example #4
def _get_epoch_batch_itr(ref, bsz, skip_remainder_batch):
    dsz = len(ref)
    indices = range(dsz)
    starts = indices[::bsz]
    batch_sampler = [indices[s:s + bsz] for s in starts]
    dataset = ListDataset(ref)
    itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler,
        skip_remainder_batch=skip_remainder_batch,
    )
    return itr.next_epoch_itr()
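A quick usage note for the helper above (the numbers are our own illustration): with skip_remainder_batch=True the trailing partial batch is dropped, so seven items at bsz=3 yield two batches instead of three.

itr = _get_epoch_batch_itr(list(range(7)), bsz=3, skip_remainder_batch=True)
print(len(list(itr)))  # 2: the final one-item batch is skipped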
Example #5
    def get_batch_iterator(
        self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
    ):
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=self._collate_fn,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
            # we don't use the batching functionality from EpochBatchIterator;
            # instead every item in *dataset* is a whole batch
            batch_sampler=[[i] for i in range(len(dataset))],
            disable_shuffling=True,
        )
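The same pre-batched pattern in isolation, as a hedged sketch (Example #5's _collate_fn is not shown in the snippet, so the unwrap-style collate and the toy batches below are assumptions):

from fairseq.data import ListDataset, iterators

# every dataset item is already a complete batch dict
prebuilt_batches = [{"id": i, "net_input": [i] * 4} for i in range(3)]
dataset = ListDataset(prebuilt_batches)

itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=lambda samples: samples[0],  # unwrap the singleton list
    batch_sampler=[[i] for i in range(len(dataset))],
).next_epoch_itr(shuffle=False)

for batch in itr:
    print(batch["id"], batch["net_input"])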
Example #6
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 1).
            data_buffer_size (int, optional): number of batches to
                preload (default: 0).
            disable_iterator_cache (bool, optional): don't cache the
                EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`)
                (default: False).
        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        can_reuse_epoch_itr = not disable_iterator_cache and self.can_reuse_epoch_itr(
            dataset
        )
        if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter:
            logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch))
            return self.dataset_to_epoch_iter[dataset]

        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        if max_positions is not None:
            indices = self.filter_indices_by_size(
                indices, dataset, max_positions, ignore_invalid_inputs
            )

        # create mini-batches with given size constraints
        batch_sampler = dataset.batch_by_size(
            indices,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
        )

        if can_reuse_epoch_itr:
            self.dataset_to_epoch_iter[dataset] = epoch_iter

        return epoch_iter
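Example #6 is essentially the stock FairseqTask.get_batch_iterator. A self-contained walk through the same order/batch/wrap pipeline on toy data (the sentence lengths and the max_tokens budget are arbitrary illustrations):

import numpy as np
from fairseq.data import ListDataset, data_utils, iterators

sizes = [5, 2, 9, 4, 7, 1]  # toy sentence lengths
dataset = ListDataset(list(range(len(sizes))), sizes=sizes)

# order by example size, reproducibly (stands in for dataset.ordered_indices())
with data_utils.numpy_seed(1):
    indices = np.argsort(sizes).astype(np.int64)

# pack indices into batches of at most max_tokens total "tokens"
batch_sampler = data_utils.batch_by_size(
    indices, num_tokens_fn=lambda i: sizes[i], max_tokens=10
)

epoch_itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=dataset.collater,
    batch_sampler=batch_sampler,
)
print(list(epoch_itr.next_epoch_itr(shuffle=False)))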
Example #7
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 1).
            data_buffer_size (int, optional): number of batches to
                preload (default: 0).
            disable_iterator_cache (bool, optional): don't cache the
                EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`)
                (default: False).
        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        # initialize the dataset with the correct starting epoch
        assert isinstance(dataset, FairseqDataset)
        if dataset in self.dataset_to_epoch_iter:
            return self.dataset_to_epoch_iter[dataset]
        if self.args.sampling_method == "RoundRobin":
            batch_iter = super().get_batch_iterator(
                dataset,
                max_tokens=max_tokens,
                max_sentences=max_sentences,
                max_positions=max_positions,
                ignore_invalid_inputs=ignore_invalid_inputs,
                required_batch_size_multiple=required_batch_size_multiple,
                seed=seed,
                num_shards=num_shards,
                shard_id=shard_id,
                num_workers=num_workers,
                epoch=epoch,
                data_buffer_size=data_buffer_size,
                disable_iterator_cache=disable_iterator_cache,
            )
            self.dataset_to_epoch_iter[dataset] = batch_iter
            return batch_iter

        construct_batch_sampler = self.create_batch_sampler_func(
            max_positions,
            ignore_invalid_inputs,
            max_tokens,
            max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
            seed=seed,
        )

        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=construct_batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
        return epoch_iter
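A detail worth noting in Example #7: construct_batch_sampler is passed as batch_sampler directly, because EpochBatchIterator also accepts a callable (dataset, epoch) -> batches that is re-invoked each epoch, enabling per-epoch dynamic batching. A toy sketch of that variant (the rotation scheme is our own illustration):

from fairseq.data import ListDataset, iterators

dataset = ListDataset(list(range(6)))

def construct_batch_sampler(dataset, epoch):
    # rotate the index order with the epoch, just to show it is re-computed
    idx = list(range(len(dataset)))
    k = epoch % len(idx)
    idx = idx[k:] + idx[:k]
    return [idx[i:i + 2] for i in range(0, len(idx), 2)]

epoch_itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=dataset.collater,
    batch_sampler=construct_batch_sampler,
)
for batch in epoch_itr.next_epoch_itr(shuffle=False):
    print(batch)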
Example #8
def filter_by_data_actor(indices,
                         dataset,
                         data_actor,
                         data_filter_percentage=-1,
                         trainer=None):
    """
    Filter indices using a data actor's scores, or randomly.

    Args:
        indices (List[int]): ordered list of dataset indices
        dataset (FairseqDataset): fairseq dataset instance
        data_actor: scoring model used to rank examples for filtering
        data_filter_percentage (float, optional): fraction of the data to
            drop (default: -1, i.e. disabled).
        trainer (optional): trainer used to prepare samples for scoring.
    """
    bins = 50
    random_filter = False
    if trainer.args.random_data_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)
        np.random.shuffle(indices)

        #interval = int(len(indices)/bins)
        #start_idx, end_idx, numfiltered = 0, 0, 0
        #while end_idx < len(indices):
        #    end_idx = min(len(indices), start_idx + interval)
        #    current_indices = indices[start_idx:end_idx]
        #    numfiltered += int(len(current_indices)*data_filter_percentage)
        #    start_idx = end_idx

        #indices = indices[numfiltered:]
        indices = indices[int(len(indices) * data_filter_percentage):]
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        indices.sort()
        return indices
    # elif trainer.args.random_data_filter_by_len:
    elif random_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)
        selected = []
        interval = int(len(indices) / bins)
        if interval == 0:
            # too little data for the number of bins we defined
            interval += 1
        start_idx, end_idx = 0, 0
        while end_idx < len(indices):
            end_idx = min(len(indices), start_idx + interval)
            current_indices = indices[start_idx:end_idx]
            np.random.shuffle(current_indices)
            selected.extend(current_indices[
                int(len(current_indices) * data_filter_percentage):].tolist())
            start_idx = end_idx
        indices = np.array(selected)
        indices.sort()
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        return indices
    else:
        # calculate data actor score
        # create mini-batches with given size constraints
        print("Use RL agent to filter data")
        max_tokens = 600
        max_sentences = 100
        batch_sampler = batch_by_size(
            indices,
            dataset.num_tokens,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
        )
        # return a reusable, sharded iterator
        itr = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler).next_epoch_itr(shuffle=False)
        idx_start, idx_end = 0, 0
        scores = np.zeros(len(indices))
        ids = np.zeros(len(indices), dtype=np.int64)
        for i, sample in enumerate(itr):
            sample = trainer._prepare_sample(sample)
            sample = list(sample.values())[0]

            # sample is a batch, use src & trg in this batch to predict how likely we choose this data
            # score should be size [batch_size], use the score to order the datapoints in this batch

            # the data actor cannot take 2 arguments and predict on token with Base data actor
            # score = data_actor(sample['net_input']['src_tokens'], sample['target']).data.cpu().numpy()
            score = data_actor(sample).data.cpu().numpy()
            idx_start = idx_end
            idx_end = idx_start + score.shape[0]
            scores[idx_start:idx_end] = score.ravel()
            ids[idx_start:idx_end] = sample['id'].data.cpu().numpy().ravel()
            # print(ids[idx_start:idx_end])
            # print(scores[idx_start:idx_end])
            # print("")
        # argsort is ascending order
        print(scores)
        preserved_indices = np.argsort(
            scores)[int(len(indices) * data_filter_percentage):]
        worst_id = np.array(ids)[np.argsort(scores)[:10]]
        best_id = np.array(ids)[np.argsort(scores)[-10:]]
        print("worst sentence id: ", worst_id)
        print("best sentence id: ", best_id)

        filter_indices = np.array(ids)[preserved_indices]
        filter_indices.sort()
        filter_indices = np.array(
            [item for item in filter_indices if item in indices])
        print("Orignial data size={}; filtered data size={}".format(
            len(ids), len(filter_indices)))
        return batch_by_size(filter_indices,
                             dataset.num_tokens,
                             max_tokens=max_tokens,
                             max_sentences=max_sentences)
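The core of the actor-scored branch is the argsort filter: scores are sorted ascending, and slicing from int(n * p) keeps the best (1 - p) fraction. The same logic in isolation, with toy scores and ids of our own:

import numpy as np

scores = np.array([0.9, 0.1, 0.5, 0.7, 0.3])  # toy data-actor scores
ids = np.array([10, 11, 12, 13, 14])          # sentence ids per score
p = 0.4                                       # drop the worst 40%

preserved = np.argsort(scores)[int(len(scores) * p):]
kept_ids = np.sort(ids[preserved])
print(kept_ids)  # [10 12 13]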
Example #9
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=0,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None). (no longer used; this must be ensured everywhere)
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None). (no longer used; this must be ensured everywhere)
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 0).

        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        assert isinstance(dataset, FairseqDataset)

        # get indices ordered by example size
        # (this should already be fixed in the dataset)
        with data_utils.numpy_seed(seed):
            # will get our ordered indices, filtered by our size
            indices = dataset.ordered_indices()

        # filter_by_size was removed as we believe we do not need it (Christine)(18-12-2019)

        # create mini-batches with the given size constraints
        # (uses the batch_by_size implemented above; whether it suits the
        # framework should be tested later) (Christine) (18-12-2019)
        batch_sampler = batch_by_size(indices, max_sentences=max_sentences)

        # batches should be returned correctly here; mini-batches should be ???
        # return a reusable, sharded iterator
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
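The batch_by_size called here honours only max_sentences. A rough sketch of what such a helper could look like (the real one is "implemented above" in that codebase and not shown in this snippet):

def batch_by_size(indices, max_sentences=None):
    """Chunk the ordered indices into batches of at most max_sentences each."""
    size = max_sentences if max_sentences is not None else len(indices)
    return [indices[i:i + size] for i in range(0, len(indices), size)]

# batch_by_size(list(range(7)), max_sentences=3) -> [[0, 1, 2], [3, 4, 5], [6]]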
Example #10
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
    ):
        can_reuse_epoch_itr = not disable_iterator_cache and self.can_reuse_epoch_itr(
            dataset)
        if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter:
            logger.debug(
                "reusing EpochBatchIterator for epoch {}".format(epoch))
            return self.dataset_to_epoch_iter[dataset]

        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        if max_positions is not None:
            indices = self.filter_indices_by_size(indices, dataset,
                                                  max_positions,
                                                  ignore_invalid_inputs)
        logger.info("the max number of tokens are {}".format(max_tokens))
        print("the indices are: {}".format(indices))
        print("the max number of tokens are {}".format(max_tokens))
        print("the number of indices are: {}".format(len(dataset.src_sizes)))
        print(dataset.sizes[indices])
        # create mini-batches with given size constraints
        batch_sampler = dataset.batch_by_size(
            indices,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
        )

        if can_reuse_epoch_itr:
            self.dataset_to_epoch_iter[dataset] = epoch_iter

        return epoch_iter
Example #11
def filter_by_data_actor(indices, dataset, data_actor, data_filter_percentage=-1, trainer=None):
    """
    Filter indices using a data actor's scores, or randomly.

    Args:
        indices (List[int]): ordered list of dataset indices
        dataset (FairseqDataset): fairseq dataset instance
        data_actor: scoring model used to rank examples for filtering
        data_filter_percentage (float, optional): fraction of the data to
            drop (default: -1, i.e. disabled).
        trainer (optional): trainer used to prepare samples for scoring.
    """
    bins = 50
    if trainer.args.random_data_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)
        np.random.shuffle(indices)

        #interval = int(len(indices)/bins)
        #start_idx, end_idx, numfiltered = 0, 0, 0
        #while end_idx < len(indices):
        #    end_idx = min(len(indices), start_idx + interval)
        #    current_indices = indices[start_idx:end_idx]
        #    numfiltered += int(len(current_indices)*data_filter_percentage)
        #    start_idx = end_idx

        #indices = indices[numfiltered:]
        indices = indices[int(len(indices)*data_filter_percentage):]
        print("Orignial data size={}; filtered data size={}".format(orig_data_size, len(indices)))
        indices.sort()
        return indices
    elif trainer.args.random_data_filter_by_len:
        orig_data_size = len(indices)
        indices = np.array(indices)
        selected = []
        interval = int(len(indices)/bins)
        if interval == 0:
            # too little data for the number of bins; avoid an infinite loop
            interval = 1
        start_idx, end_idx = 0, 0
        while end_idx < len(indices):
            end_idx = min(len(indices), start_idx + interval)
            current_indices = indices[start_idx:end_idx]
            np.random.shuffle(current_indices)
            selected.extend(current_indices[int(len(current_indices)*data_filter_percentage):].tolist())
            start_idx = end_idx
        indices = np.array(selected)
        indices.sort()
        print("Orignial data size={}; filtered data size={}".format(orig_data_size, len(indices)))
        return indices
    else:
        # calculate data actor score
        # create mini-batches with given size constraints
        max_tokens = 4800
        max_sentences = 100
        batch_sampler = batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
        )
        # return a reusable, sharded iterator
        itr = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler
        ).next_epoch_itr(shuffle=False)
        idx_start, idx_end = 0, 0
        scores = np.zeros(len(indices))
        ids = np.zeros(len(indices), dtype=np.int64)
        for i, sample in enumerate(itr):
            sample = trainer._prepare_sample(sample)
            sample = list(sample.values())[0]
            #print(sample)
            score = data_actor(sample['net_input']['src_tokens'], sample['target']).data.cpu().numpy()
            idx_start = idx_end
            idx_end = idx_start + score.shape[0]
            scores[idx_start:idx_end] = score.ravel()
            ids[idx_start:idx_end] = sample['id'].data.cpu().numpy().ravel()
        # argsort is ascending order
        preserved_indices = np.argsort(scores)[int(len(indices)*data_filter_percentage):]
        indices = np.array(ids)[preserved_indices]

        #score_indices = np.argsort(scores)
        #selected = []
        #interval = int(len(scores)/bins)
        #start_idx, end_idx = 0, 0
        #while end_idx < len(score_indices):
        #    end_idx = min(len(scores), start_idx + interval)
        #    current_indices = score_indices[start_idx:end_idx]
        #    np.random.shuffle(current_indices)
        #    selected.extend(current_indices[int(len(current_indices)*data_filter_percentage):].tolist())
        #    start_idx = end_idx
        #indices = np.array(selected)
        indices.sort()
        print("Orignial data size={}; filtered data size={}".format(len(ids), len(indices)))
        return indices
Example #12
    def get_batch_iterator(
        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
        ignore_invalid_inputs=False, required_batch_size_multiple=1,
        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 0).
        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        global_start_time = time.time()
        dataset.set_epoch(epoch)
        set_epoch_time = time.time() - global_start_time

        # get indices ordered by example size
        start_time = time.time()
        with data_utils.numpy_seed(seed, epoch):
            indices = dataset.ordered_indices()
        sort_time = time.time() - start_time

        # create mini-batches with given size constraints
        start_time = time.time()
        batch_sampler = data_utils.batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        batch_by_size_time = time.time() - start_time
        logger.info(
            'get batch iterator [seed=%d, epoch=%d, num_shards=%d] is done in %.3f seconds '
            '(set epoch=%.3f, sorting=%.3f, batch by size=%.3f)' % (
                seed,
                epoch,
                num_shards,
                time.time() - global_start_time,
                set_epoch_time,
                sort_time,
                batch_by_size_time,
        ))

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
        return epoch_iter
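Example #12 seeds with both seed and epoch, so every epoch gets a distinct but reproducible ordering. A sketch of what data_utils.numpy_seed does (the real helper lives in fairseq.data.data_utils; this reimplementation is for illustration only):

import contextlib
import numpy as np

@contextlib.contextmanager
def numpy_seed(seed, *addl_seeds):
    """Temporarily seed numpy's RNG, restoring the previous state on exit."""
    if seed is None:
        yield
        return
    if addl_seeds:
        seed = int(hash((seed, *addl_seeds)) % 1e6)
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)

with numpy_seed(1, 5):  # e.g. seed=1, epoch=5
    print(np.random.permutation(4))  # identical output on every run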
Example #13
    def get_batch_iterator(
        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
        ignore_invalid_inputs=False, required_batch_size_multiple=1,
        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 0).
        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        # For default fairseq task, return same iterator across epochs
        # as datasets are not dynamic; can be overridden in task-specific
        # settings. If the data is not refreshed every epoch, the previously
        # built epoch_itr can simply be reused. The trade-off: the same data
        # is then not re-shuffled between epochs, reducing global randomness.
        if dataset in self.dataset_to_epoch_iter:
            return self.dataset_to_epoch_iter[dataset]

        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        # get indices ordered by example size (batches are generated by
        # reading data in this order)
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        if max_positions is not None:  # drop indices whose lengths don't fit, based on dataset.size
            indices = data_utils.filter_by_size(
                indices, dataset, max_positions, raise_exception=(not ignore_invalid_inputs),
            )

        print("indices length and type", len(indices), type(indices))
        # create mini-batches with given size constraints; batch_sampler is a
        # big list whose entries are lists holding the sentence IDs of each batch
        batch_sampler = data_utils.batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        print("batch_sampler length and type", len(batch_sampler), type(batch_sampler))

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
        self.dataset_to_epoch_iter[dataset] = epoch_iter
        print(self.dataset_to_epoch_iter)
        return epoch_iter
Example #14
    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=0,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 0).
        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        # For default fairseq task, return same iterator across epochs
        # as datasets are not dynamic, can be overridden in task specific
        # setting.
        print("| At task.get_batch_iterator ...", flush=True)
        if dataset in self.dataset_to_epoch_iter:
            return self.dataset_to_epoch_iter[dataset]

        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()
        print("| At task.get_batch_iterator, indices ordered ... ", flush=True)

        # filter examples that are too large
        if max_positions is not None:
            indices = data_utils.filter_by_size(
                indices,
                dataset,
                max_positions,
                raise_exception=(not ignore_invalid_inputs),
            )
        print("| At task.get_batch_iterator, examples filtered ... ",
              flush=True)

        # create mini-batches with given size constraints
        batch_sampler = data_utils.batch_by_size(
            indices,
            dataset.num_tokens,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        print("| At task.get_batch_iterator, batch_sampler created ... ",
              flush=True)

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )
        self.dataset_to_epoch_iter[dataset] = epoch_iter
        print("| At task.get_batch_iterator, iterator created ... ",
              flush=True)
        return epoch_iter
Example #15
    def get_batch_iterator(self,
                           dataset,
                           assistant=None,
                           max_tokens=None,
                           max_sentences=None,
                           max_positions=None,
                           ignore_invalid_inputs=False,
                           required_batch_size_multiple=1,
                           seed=1,
                           num_shards=1,
                           shard_id=0,
                           batch_method='sentences'):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch.
                Default: ``None``
            max_sentences (int, optional): max number of sentences in each
                batch. Default: ``None``
            max_positions (optional): max sentence length supported by the
                model. Default: ``None``
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long. Default: ``False``
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N. Default: ``1``
            seed (int, optional): seed for random number generator for
                reproducibility. Default: ``1``
            num_shards (int, optional): shard the data iterator into N
                shards. Default: ``1``
            shard_id (int, optional): which shard of the data iterator to
                return. Default: ``0``

        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        assert isinstance(dataset, FairseqDataset)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        indices = data_utils.filter_by_size(
            indices,
            dataset.size,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )

        # create mini-batches with given size constraints
        if assistant is not None:
            assistant.associate_data(dataset, indices)
            # return a reusable, sharded iterator
            return iterators.AssistantEpochBatchIterator(
                dataset=dataset,
                collate_fn=dataset.collater,
                assistant=assistant,
                max_tokens=max_tokens,
                max_sentences=max_sentences,
                required_batch_size_multiple=required_batch_size_multiple,
                shard_num=num_shards,
                shard_id=shard_id,
                batch_method=batch_method,
                seed=seed,
            )
        else:
            batch_sampler = data_utils.batch_by_size(
                indices,
                dataset.num_tokens,
                max_tokens=max_tokens,
                max_sentences=max_sentences,
                required_batch_size_multiple=required_batch_size_multiple,
            )
            # return a reusable, sharded iterator
            return iterators.EpochBatchIterator(
                dataset=dataset,
                collate_fn=dataset.collater,
                batch_sampler=batch_sampler,
                seed=seed,
                num_shards=num_shards,
                shard_id=shard_id,
            )
Example #16
    def get_batch_iterator(
            self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
            ignore_invalid_inputs=False, required_batch_size_multiple=1,
            seed=1, num_shards=1, shard_id=0, num_workers=0,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).

        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        assert isinstance(dataset, FairseqDataset)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        indices = data_utils.filter_by_size(
            indices, dataset.size, max_positions, raise_exception=(not ignore_invalid_inputs),
        )

        # create mini-batches with given size constraints
        batch_sampler = data_utils.batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

        # return a reusable, sharded iterator
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
        )
Example #17
    def get_batch_iterator(
        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
        ignore_invalid_inputs=False, required_batch_size_multiple=1,
        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0, noskip=False, source_lang=None, target_lang=None,
        data_actor=None, trainer=None, data_filter_percentage=-1, filtered_maxpos_indices=None, dev_grad_dotprod=None,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 0).
            data_actor: if not None, it will be used to filter out data

        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        assert isinstance(dataset, FairseqDataset)

        # get indices ordered by example size
        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()

        # filter examples that are too large
        if max_positions is not None:
            if filtered_maxpos_indices is None:
                indices = data_utils.filter_by_size(
                    indices, dataset, max_positions, raise_exception=(not ignore_invalid_inputs), noskip=noskip,
                )
                filtered_maxpos_indices = indices
            else:
                indices = filtered_maxpos_indices

        # data selection: filter a subset of data
        if data_filter_percentage > 0:
            indices = data_utils.filter_by_data_actor(indices, dataset, data_actor, data_filter_percentage, trainer=trainer)

        # create mini-batches with given size constraints
        batch_sampler = data_utils.batch_by_size(
            indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

        # return a reusable, sharded iterator
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        ), filtered_maxpos_indices