def construct_batch_sampler(dataset, epoch):
    """Build the batch sampler for ``dataset`` and log how long each stage takes.

    This function reads ``self``, ``max_positions``, ``ignore_invalid_inputs``,
    ``max_tokens`` and ``max_sentences`` from an enclosing scope that is not
    part of this block.

    Args:
        dataset: dataset to batch. It must provide ``ordered_indices()`` and
            ``num_tokens``, plus ``set_epoch()`` when ``epoch`` is given.
        epoch: value handed to ``dataset.set_epoch``; the call is skipped
            when this is ``None``.

    Returns:
        Whatever ``data_utils.batch_by_size`` returns for the filtered indices.
    """
    # The split name is only used to tag the debug messages below.
    matching_splits = [
        name for name, registered in self.datasets.items() if registered == dataset
    ]
    split = matching_splits[0] if matching_splits else None

    if epoch is not None:
        dataset.set_epoch(epoch)

    start_time = time.time()

    # Stage 1: ask the dataset for its preferred example ordering.
    indices = dataset.ordered_indices()
    logger.debug(f'[{split}] @batch_sampler order indices time: {get_time_gap(start_time, time.time())}')

    # Stage 2 (optional): drop examples that exceed the size limits.
    if max_positions is not None:
        my_time = time.time()
        must_raise = not ignore_invalid_inputs
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=must_raise,
        )
        logger.debug(f'[{split}] @batch_sampler filter_by_size time: {get_time_gap(my_time, time.time())}')

    # Stage 3: group the remaining indices into mini-batches.
    my_time = time.time()
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
    )
    logger.debug(f'[{split}] @batch_sampler batch_by_size time: {get_time_gap(my_time, time.time())}')

    logger.debug(f'[{split}] per epoch batch_sampler set-up time: {get_time_gap(start_time, time.time())}')
    return batch_sampler
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """Return an ``EpochBatchIterator`` over size-filtered batches of ``dataset``.

    Args:
        dataset: dataset to batch; must be a ``FairseqDataset``.
        max_tokens: forwarded to ``data_utils.batch_by_size``.
        max_sentences: forwarded to ``data_utils.batch_by_size``.
        max_positions: accepted for interface compatibility, but the value
            is overwritten with ``(10240, 1024)`` inside this function.
        ignore_invalid_inputs: when true, ``filter_by_size`` is called with
            ``raise_exception=False``.
        required_batch_size_multiple: forwarded to ``data_utils.batch_by_size``.
        seed: seeds the index ordering and is forwarded to the iterator.
        num_shards: forwarded to the iterator.
        shard_id: forwarded to the iterator.
        num_workers: forwarded to the iterator.
        epoch: forwarded to the iterator.

    Returns:
        ``iterators.EpochBatchIterator`` built from the resulting batches.
    """
    assert isinstance(dataset, FairseqDataset)

    # NOTE(review): this discards whatever the caller passed as
    # ``max_positions``. It looks like a deliberate local override -- confirm
    # before relying on the parameter.
    max_positions = (10240, 1024)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop (or raise on) examples that exceed the size limits.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=must_raise,
    )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap the batches in a reusable, sharded iterator.
    epoch_itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return epoch_itr
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
):
    """
    Build a batched, sharded iterator over ``dataset``.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch.
            Default: ``None``
        max_sentences (int, optional): upper bound on sentences per batch.
            Default: ``None``
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size``. Default: ``None``
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise (``raise_exception=False`` is passed to the
            filter). Default: ``False``
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N. Default: ``1``
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator. Default: ``1``
        num_shards (int, optional): number of shards for the iterator.
            Default: ``1``
        shard_id (int, optional): shard of the iterator to return.
            Default: ``0``

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop (or raise on) examples that exceed ``max_positions``.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=must_raise,
    )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap the batches in a reusable, sharded iterator.
    epoch_itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
    )
    return epoch_itr
def get_epoch_iterator(task, dataset, max_tokens=None, max_sentences=None,
                       max_positions=None, ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, num_workers=0,
                       seed=215, num_shards=1, shard_id=0, epoch=0):
    """
    Get an iterator that yields batches of data from the given dataset.

    The iterator is cached in ``task.dataset_to_epoch_iter``: if ``dataset``
    already has an entry, that entry is returned unchanged and every other
    argument is ignored.

    Args:
        task: object exposing a ``dataset_to_epoch_iter`` mapping used as
            the iterator cache.
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): forwarded to ``data_utils.batch_by_size``.
        max_sentences (int, optional): forwarded to
            ``data_utils.batch_by_size``.
        max_positions (optional): size limit for ``data_utils.filter_by_size``;
            filtering is skipped entirely when ``None``.
        ignore_invalid_inputs (bool, optional): when true, the size filter is
            called with ``raise_exception=False``.
        required_batch_size_multiple (int, optional): forwarded to
            ``data_utils.batch_by_size``.
        num_workers (int, optional): forwarded to the iterator.
        seed (int, optional): seeds the index ordering and is forwarded to
            the iterator.
        num_shards (int, optional): forwarded to the iterator.
        shard_id (int, optional): forwarded to the iterator.
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator.

    Returns:
        The cached or newly built ``iterators.EpochBatchIterator``.
    """
    if dataset in task.dataset_to_epoch_iter:
        return task.dataset_to_epoch_iter[dataset]

    # Validate the type before the dataset is touched, so a wrong argument
    # fails on this assertion rather than on a missing ``set_epoch``.
    assert isinstance(dataset, FairseqDataset)

    # initialize the dataset with the correct starting epoch
    dataset.set_epoch(epoch)

    # get indices ordered by example size (computed once; the original ran
    # this identical seeded call twice and discarded the first result)
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # NOTE(review): ``collate`` is not defined in this function; it is
    # resolved from the surrounding module scope -- confirm it exists there.
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=collate,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )

    # Remember the iterator so later calls for this dataset reuse it.
    task.dataset_to_epoch_iter[dataset] = epoch_iter
    return epoch_iter
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
):
    """
    Build a batched, sharded iterator over ``dataset``.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size`` (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise; ``raise_exception=False`` is passed to the filter
            (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator (default: 1).
        num_shards (int, optional): number of shards for the iterator
            (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses,
            passed on to the iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop (or raise on) examples that exceed ``max_positions``.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=must_raise,
    )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap the batches in a reusable, sharded iterator.
    epoch_itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
    )
    return epoch_itr
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Build (or reuse) a batched, sharded iterator over ``dataset``.

    Progress markers are printed to stdout after each stage. If ``dataset``
    already has an entry in ``self.dataset_to_epoch_iter``, that cached
    iterator is returned and all other arguments are ignored.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator (default: 1).
        num_shards (int, optional): number of shards for the iterator
            (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses,
            passed on to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    print("| At task.get_batch_iterator ...", flush=True)

    # The same iterator is handed back for a dataset on every call; a
    # task-specific subclass can override this method to change that.
    iter_cache = self.dataset_to_epoch_iter
    if dataset in iter_cache:
        return iter_cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Tell the dataset which epoch we are starting from.
    dataset.set_epoch(epoch)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()
    print("| At task.get_batch_iterator, indices ordered ... ", flush=True)

    # Drop (or raise on) examples that exceed ``max_positions``.
    if max_positions is not None:
        must_raise = not ignore_invalid_inputs
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=must_raise,
        )
    print("| At task.get_batch_iterator, examples filtered ... ", flush=True)

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("| At task.get_batch_iterator, batch_sampler created ... ", flush=True)

    # Wrap the batches in a reusable, sharded iterator and cache it.
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    iter_cache[dataset] = epoch_iter
    print("| At task.get_batch_iterator, iterator created ... ", flush=True)
    return epoch_iter
def get_batch_iterator(self, dataset, assistant=None, max_tokens=None,
                       max_sentences=None, max_positions=None,
                       ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, seed=1,
                       num_shards=1, shard_id=0, batch_method='sentences'):
    """
    Build a batched, sharded iterator over ``dataset``.

    Two paths exist. With an ``assistant``, the filtered indices are
    registered on it and an ``AssistantEpochBatchIterator`` is returned;
    batching is left to that iterator. Without one, batches are built here
    and wrapped in a plain ``EpochBatchIterator``.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        assistant (optional): object providing
            ``associate_data(dataset, indices)``; selects the assistant
            path when not ``None``. Default: ``None``
        max_tokens (int, optional): upper bound on tokens per batch.
            Default: ``None``
        max_sentences (int, optional): upper bound on sentences per batch.
            Default: ``None``
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size``. Default: ``None``
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise. Default: ``False``
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N. Default: ``1``
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator. Default: ``1``
        num_shards (int, optional): number of shards for the iterator.
            Default: ``1``
        shard_id (int, optional): shard of the iterator to return.
            Default: ``0``
        batch_method (str, optional): forwarded only to
            ``AssistantEpochBatchIterator``; unused on the plain path.
            Default: ``'sentences'``

    Returns:
        ``iterators.AssistantEpochBatchIterator`` when ``assistant`` is
        given, otherwise ``iterators.EpochBatchIterator``.
    """
    assert isinstance(dataset, FairseqDataset)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop (or raise on) examples that exceed ``max_positions``.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=must_raise,
    )

    if assistant is not None:
        # Assistant path: hand the indices over and let the assistant-aware
        # iterator do the batching from the raw size constraints.
        assistant.associate_data(dataset, kept)
        return iterators.AssistantEpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            assistant=assistant,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
            shard_num=num_shards,
            shard_id=shard_id,
            batch_method=batch_method,
            seed=seed,
        )

    # Plain path: build the mini-batches here.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap the batches in a reusable, sharded iterator.
    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Build (or reuse) a batched, sharded iterator over ``dataset``.

    If ``dataset`` already has an entry in ``self.dataset_to_epoch_iter``,
    that cached iterator is returned and all other arguments are ignored.
    Diagnostic information is printed to stdout along the way.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator (default: 1).
        num_shards (int, optional): number of shards for the iterator
            (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses,
            passed on to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    # The same iterator is handed back for a dataset on every call; a
    # task-specific subclass can override this method to change that.
    # Original author's note (translated): when the data is not swapped
    # every epoch, the previously built epoch iterator can be reused as is.
    # The stated downside is that the same data is then not re-randomized
    # between epochs, which reduces global randomness.
    iter_cache = self.dataset_to_epoch_iter
    if dataset in iter_cache:
        return iter_cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Tell the dataset which epoch we are starting from.
    dataset.set_epoch(epoch)

    # Order the examples under a fixed numpy seed; batches are built by
    # reading the data in this order.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    if max_positions is not None:
        # Remove indices whose length violates the limit (per the original
        # author's note, judged via the dataset's size function).
        must_raise = not ignore_invalid_inputs
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=must_raise,
        )
    print("indices length and type", len(indices), type(indices))

    # Group the surviving indices into mini-batches. Per the original
    # author's note, batch_sampler is expected to be a list of lists, each
    # inner list holding the sentence ids of one batch.
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("batch_sampler length and type", len(batch_sampler), type(batch_sampler))

    # Wrap the batches in a reusable, sharded iterator and cache it.
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    iter_cache[dataset] = epoch_iter
    print(self.dataset_to_epoch_iter)
    return epoch_iter
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
    noskip=False,
    source_lang=None,
    target_lang=None,
    data_actor=None,
    trainer=None,
    data_filter_percentage=-1,
    filtered_maxpos_indices=None,
    dev_grad_dotprod=None,
):
    """
    Build a batched, sharded iterator over ``dataset``, optionally selecting
    a subset of the data first.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): size limit handed to
            ``data_utils.filter_by_size``; the size-filter step is skipped
            when None (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long examples
            do not raise (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed used while ordering the indices and
            passed on to the iterator (default: 1).
        num_shards (int, optional): number of shards for the iterator
            (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses,
            passed on to the iterator (default: 0).
        epoch (int, optional): passed on to the iterator (default: 0).
        noskip (bool, optional): forwarded to ``data_utils.filter_by_size``
            (default: False).
        source_lang: not used by this function.
        target_lang: not used by this function.
        data_actor: forwarded to ``data_utils.filter_by_data_actor`` when
            ``data_filter_percentage > 0``.
        trainer: forwarded to ``data_utils.filter_by_data_actor`` when
            ``data_filter_percentage > 0``.
        data_filter_percentage: data selection runs only when this is
            greater than 0 (default: -1).
        filtered_maxpos_indices: previously size-filtered indices. When
            given (and ``max_positions`` is not None) they replace the
            freshly ordered indices and the size filter is not rerun.
        dev_grad_dotprod: not used by this function.

    Returns:
        tuple: ``(epoch_iterator, filtered_maxpos_indices)`` -- the
        ``iterators.EpochBatchIterator`` and the size-filtered indices
        (the value passed in, or the one computed here), so the caller can
        hand them back on a later call.
    """
    assert isinstance(dataset, FairseqDataset)

    # Order the examples under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # Size filtering: compute once, otherwise reuse the caller's result.
    if max_positions is not None:
        if filtered_maxpos_indices is None:
            must_raise = not ignore_invalid_inputs
            filtered_maxpos_indices = data_utils.filter_by_size(
                indices,
                dataset,
                max_positions,
                raise_exception=must_raise,
                noskip=noskip,
            )
        indices = filtered_maxpos_indices

    # Data selection: keep only a subset chosen by the data actor.
    if data_filter_percentage > 0:
        indices = data_utils.filter_by_data_actor(
            indices,
            dataset,
            data_actor,
            data_filter_percentage,
            trainer=trainer,
        )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap the batches in a reusable, sharded iterator.
    epoch_itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return epoch_itr, filtered_maxpos_indices