Example #1
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
    ):
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`'
            )

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else 'none'

        # For "eos" break-mode, block_size is not required parameters.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size,
                                                document_sep_len)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # starting offset within starting index
                    np.zeros(len(sizes), dtype=np.int64),
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)
Example #2
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad=0,
        eos=2,
        include_targets=False,
        break_mode=None,
        document_sep_len=1,
        two_inputs=False,
    ):
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`'
            )

        super().__init__()
        self.dataset = dataset
        self.two_inputs = two_inputs

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else 'none'

        slice_indices = _get_slice_indices_fast(
            sizes, break_mode, block_size, document_sep_len)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices

        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets
Example #3
    def set_epoch(self, epoch):
        logger.debug('ResamplingDataset.set_epoch: {}'.format(epoch))
        super().set_epoch(epoch)

        if epoch == self._cur_epoch:
            return

        self._cur_epoch = epoch

        # Generate a weighted sample of indices as a function of the
        # random seed and the current epoch.

        rng = np.random.RandomState(
            [
                42,  # magic number
                self.seed % (2 ** 32),  # global seed
                self._cur_epoch,  # epoch index
            ]
        )
        self._cur_indices = plasma_utils.PlasmaArray(
            rng.choice(
                len(self.dataset),
                self.actual_size,
                replace=self.replace,
                p=(None if self.weights is None else self.weights.array),
            )
        )
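
Note: seeding np.random.RandomState with a list, as above, mixes every entry into the generator's initial state, so the draw is fully determined by the (magic, seed, epoch) triple. A minimal sketch of that property (the values 7 and 3 are arbitrary; only NumPy is assumed):

    import numpy as np

    seed, epoch = 7, 3
    rng = np.random.RandomState([42, seed % (2 ** 32), epoch])
    first = rng.choice(10, 5, replace=False)

    # rebuilding the RNG from the same seed list reproduces the draw exactly,
    # which is what makes per-epoch resampling deterministic
    rng2 = np.random.RandomState([42, seed % (2 ** 32), epoch])
    assert (first == rng2.choice(10, 5, replace=False)).all()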
Example #4
    def set_epoch(self, epoch):
        logger.info("SubsampleLanguagePairDataset.set_epoch: {}".format(epoch))
        super().set_epoch(epoch)

        if epoch == self._cur_epoch:
            return

        self._cur_epoch = epoch

        # Generate a weighted sample of indices as a function of the
        # random seed and the current epoch.

        rng = np.random.RandomState([
            42,  # magic number
            self.seed % (2**32),  # global seed
            self._cur_epoch,  # epoch index
        ])
        self._cur_indices = plasma_utils.PlasmaArray(
            rng.choice(
                len(self.dataset),
                self.actual_size,
                replace=self.replace,
                p=(None if self.weights is None else self.weights.array),
            ))

        logger.info(
            "Dataset is sub-sampled: {} -> {}, first 3 ids are: {}".format(
                len(self.dataset), self.actual_size,
                ",".join([str(_i) for _i in self._cur_indices.array[:3]])))
Example #5
 def __init__(self,
              dataset,
              size_ratio,
              weights=None,
              replace=False,
              seed=0,
              epoch=1):
     super().__init__(dataset)
     assert size_ratio <= 1
     self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
     logger.info("subsampled dataset from {} to {} (ratio={})".format(
         len(self.dataset), self.actual_size, size_ratio))
     self.src_dict = self.dataset.src_dict
     self.tgt_dict = self.dataset.tgt_dict
     self.left_pad_source = self.dataset.left_pad_source
     self.left_pad_target = self.dataset.left_pad_target
     self.seed = seed
     self._cur_epoch = None
     self._cur_indices = None
     self.replace = replace
     if weights is None:
         self.weights = None
     else:
         assert len(weights) == len(dataset)
         weights_arr = np.array(weights, dtype=np.float64)
         weights_arr /= weights_arr.sum()
         self.weights = plasma_utils.PlasmaArray(weights_arr)
     self.set_epoch(epoch)
Example #6
 def _next_global_indices(self, epoch):
     rng = np.random.RandomState([
         int(
             hashlib.sha1(str(
                 self.__class__.__name__).encode('utf-8')).hexdigest(), 16)
         % (2**32),
         self.seed % (2**32),  # global seed
          epoch,  # epoch index
     ])
      del self._random_global_indices
      self._random_global_indices = plasma_utils.PlasmaArray(
         rng.choice(self.virtual_size, self.virtual_size, replace=False))
     if self.load_next_shard is None:
         self.load_next_shard = False
     else:
         # increase shard epoch for next loading
         self.shard_epoch += 1
         self.load_next_shard = True
         # a hack to avoid possible out of sync of shard epoch number
          # TODO: confirm whether this is needed; without it, a CUDA event error is occasionally observed
         synced_shard_epoch = self._sync_shard_epoch(self.shard_epoch)
         logger.info(
             'to load next epoch/shard in next load_dataset: '
             f'epoch={epoch}/shard_epoch={self.shard_epoch}[synced={synced_shard_epoch}]'
         )
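
Note: mixing a hash of the class name into the seed, as above, keeps two different wrappers that share the same seed and epoch from producing identical shuffles. A minimal reproduction of just the seeding scheme (class_seed is a hypothetical helper name; only hashlib and NumPy are assumed):

    import hashlib
    import numpy as np

    def class_seed(name, seed, epoch):
        # hypothetical helper mirroring the seed list built above
        h = int(hashlib.sha1(name.encode('utf-8')).hexdigest(), 16) % (2 ** 32)
        return [h, seed % (2 ** 32), epoch]

    rng_a = np.random.RandomState(class_seed('DatasetA', seed=0, epoch=1))
    rng_b = np.random.RandomState(class_seed('DatasetB', seed=0, epoch=1))
    print(rng_a.permutation(5), rng_b.permutation(5))  # different permutations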
Example #7
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
        use_plasma_view=False,
        split_path=None,
        plasma_path=None,
    ):

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) > 0

        assert len(dataset) == len(sizes)
        _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices(
            sizes, break_mode, document_sep_len, block_size
        )
        if use_plasma_view:
            plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset))
            self._slice_indices = plasma_utils.PlasmaView(
                slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path
            )
            self._sizes = plasma_utils.PlasmaView(
                _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path
            )
            self._block_to_dataset_index = plasma_utils.PlasmaView(
                block_to_dataset_index,
                split_path,
                (plasma_id, 2),
                plasma_path=plasma_path,
            )
        else:
            self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
            self._sizes = plasma_utils.PlasmaArray(_sizes)
            self._block_to_dataset_index = plasma_utils.PlasmaArray(
                block_to_dataset_index
            )
Example #8
    def _establish_virtual_datasets(self):
        if self.sample_ratios is None and self._cur_indices is not None:
            # not a sampling dataset; no need to resample if indices are already established
            return
        self._reset_cached_properties()

        start_time = time.time()
        # Generate a weighted sample of indices as a function of the
        # random seed and the current epoch.
        rng = np.random.RandomState([
            int(
                hashlib.sha1(str(
                    self.__class__.__name__).encode('utf-8')).hexdigest(), 16)
            % (2**32),
            self.seed % (2**32),  # global seed
            self._cur_epoch,  # epoch index
        ])
        indices, cumulated_sizes, virtual_size_per_dataset = self.get_virtual_indices(
            rng, self.datasets, self.sample_ratios, self.virtual_size)

        self._clean_if_not_none(
            [self.cumulated_sizes, self.virtual_size_per_dataset])
        self._cur_indices = plasma_utils.PlasmaArray(indices)
        self.cumulated_sizes = plasma_utils.PlasmaArray(cumulated_sizes)
        self.virtual_size_per_dataset = plasma_utils.PlasmaArray(
            virtual_size_per_dataset)

        raw_sizes = [len(d) for d in self.datasets]
        sampled_sizes = self.virtual_size_per_dataset.array
        logger.info(
            f'[{self.split}] Raw sizes: {str(dict(zip(self.keys, raw_sizes)))}; '
            f'raw total size: {sum(raw_sizes)}')
        logger.info(
            f'[{self.split}] Resampled sizes: {str(dict(zip(self.keys, sampled_sizes)))}; '
            f'resampled total size: {sum(sampled_sizes)}')
        if self.sample_ratios is not None:
            logger.info(
                f'[{self.split}] Upsampling ratios: {str(dict(zip(self.keys, self.sample_ratios.array)))}'
            )
        else:
            logger.info(f'[{self.split}] A concat dataset')
        logger.debug(
            f'[{self.split}] virtual dataset established time: {get_time_gap(start_time, time.time())}'
        )
Example #9
 def setup_sampling(self, sample_ratios, virtual_size):
     sizes = [len(d) for d in self.datasets]
     if sample_ratios is None:
          # default back to concatenating datasets
         self.sample_ratios = None
         self.virtual_size = sum(sizes)
     else:
         if not isinstance(sample_ratios, np.ndarray):
             sample_ratios = np.array(sample_ratios)
         self.sample_ratios = plasma_utils.PlasmaArray(sample_ratios)
         virtual_size = default_virtual_size_func if virtual_size is None else virtual_size
         self.virtual_size = (virtual_size(self.datasets,
                                           self.sample_ratios.array)
                              if callable(virtual_size) else virtual_size)
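
Note: virtual_size above may be an integer or a callable over (datasets, sample_ratios); when None it falls back to default_virtual_size_func. A sketch of what such a callable could look like (this body is an assumption for illustration, not fairseq's actual default):

    import numpy as np

    def my_virtual_size(datasets, ratios):
        # hypothetical: scale each dataset by its sampling ratio and round up
        sizes = np.array([len(d) for d in datasets], dtype=np.float64)
        return int(np.ceil((sizes * np.asarray(ratios)).sum()))

    # e.g. my_virtual_size([range(10), range(20)], [1.0, 0.5]) -> 20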
Example #10
    def ordered_indices(self):
        if self._epoch_ordered_indices is not None:
            return self._epoch_ordered_indices.array

        if self.batch_by_size:
            # No need to do shuffle as the data items are already randomized
            indices = np.arange(len(self))
            sizes = self.sizes
            tgt_sizes = sizes[:, 1] if sizes.ndim > 1 and sizes.shape[1] > 1 else None
            src_sizes = sizes[:, 0] if sizes.ndim > 1 else sizes

            # sort by target length, then source length
            if tgt_sizes is not None:
                indices = indices[
                    np.argsort(tgt_sizes[indices], kind='mergesort')
                ]
            sort_indices = indices[np.argsort(src_sizes[indices], kind='mergesort')]
        else:
            sort_indices = np.arange(len(self))
        self._epoch_ordered_indices = plasma_utils.PlasmaArray(sort_indices)
        return self._epoch_ordered_indices.array
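
Note: the two chained stable argsorts above implement a standard two-key sort: sort by the secondary key (target length) first, then by the primary key (source length) with mergesort, which preserves ties. A toy check (only NumPy is assumed):

    import numpy as np

    src = np.array([5, 3, 5, 3])
    tgt = np.array([2, 9, 1, 4])

    idx = np.arange(4)
    idx = idx[np.argsort(tgt[idx], kind='mergesort')]  # secondary key first
    idx = idx[np.argsort(src[idx], kind='mergesort')]  # stable primary key last
    print(idx)  # [3 1 2 0]: src ascending, ties broken by tgt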
Example #11
    def sizes(self):
        if self._epoch_sizes is not None:
            return self._epoch_sizes.array
        start_time = time.time()

        size_cache = self._size_cache
        ret = []
        for i in range(len(self)):
            index = self._map_epoch_index_to_global(i)
            ds_idx, ds_sample_idx = self._get_dataset_and_index(index)

            if (ds_idx, ds_sample_idx) in size_cache:
                ret.append(size_cache[(ds_idx, ds_sample_idx)])
            else:
                s = self.datasets[ds_idx].size(ds_sample_idx)
                s = (s, s) if not isinstance(s, tuple) else s
                size_cache[(ds_idx, ds_sample_idx)] = s
                ret.append(s)
        self._epoch_sizes = plasma_utils.PlasmaArray(np.array(ret, np.int64))
        logger.info(f'sizes() calling time: {get_time_gap(start_time, time.time())}')
        return self._epoch_sizes.array
Example #12
    def __init__(
        self,
        dataset,
        weights=None,
        replace=True,
        size_ratio=1.0,
        batch_by_size=True,
        seed=0,
        epoch=1,
    ):
        super().__init__(dataset)

        if weights is None:
            self.weights = None

        else:
            assert len(weights) == len(dataset)
            weights_arr = np.array(weights, dtype=np.float64)
            weights_arr /= weights_arr.sum()
            self.weights = plasma_utils.PlasmaArray(weights_arr)

        self.replace = replace

        assert size_ratio > 0.0
        if not self.replace:
            assert size_ratio < 1.0
        self.size_ratio = float(size_ratio)
        self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)

        self.batch_by_size = batch_by_size
        self.seed = seed

        self._cur_epoch = None
        self._cur_indices = None

        self.set_epoch(epoch)
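
Note: the size_ratio assertions above match how np.random.choice behaves: without replacement you cannot draw more items than the dataset holds, so up-sampling (size_ratio >= 1.0) requires replace=True. A quick demonstration (only NumPy is assumed):

    import numpy as np

    rng = np.random.RandomState(0)
    try:
        rng.choice(10, 12, replace=False)  # more draws than population
    except ValueError as e:
        print(e)  # NumPy refuses to sample beyond the population size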
Example #13
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
    ):
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                "Please build Cython components with: `pip install --editable .` "
                "or `python setup.py build_ext --inplace`")

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else "none"

        # For "eos" break-mode, block_size is not required parameters.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, str(break_mode),
                                                block_size, document_sep_len)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # starting offset within starting index
                    np.zeros(len(sizes), dtype=np.int64),
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        size_dtype = np.uint16 if block_size < 65535 else np.uint32
        slice_indices_dtype = best_fitting_int_dtype(slice_indices[-1].max())

        self._slice_indices = plasma_utils.PlasmaArray(
            slice_indices.astype(slice_indices_dtype))
        self._sizes = plasma_utils.PlasmaArray(self._sizes.astype(size_dtype))
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index.astype(slice_indices_dtype))
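
Note: the final astype calls above downcast the index arrays so that offsets fitting in uint16/uint32 are not stored as int64. A sketch of the idea behind a helper like best_fitting_int_dtype (this stand-in body is an assumption; the real helper is imported elsewhere in fairseq):

    import numpy as np

    def smallest_uint_dtype(max_value):
        # hypothetical stand-in: smallest unsigned dtype that holds max_value
        for dtype in (np.uint16, np.uint32, np.uint64):
            if max_value <= np.iinfo(dtype).max:
                return dtype
        return np.uint64

    arr = np.arange(70000, dtype=np.int64)
    small = arr.astype(smallest_uint_dtype(arr.max()))
    print(small.dtype, small.nbytes, arr.nbytes)  # uint32: half the memory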
Example #14
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
    ):
        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets
        slice_indices = []

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0
        sizes = np.array(sizes, dtype=int)
        if break_mode is None or break_mode == 'none':
            total_size = sum(sizes)
            length = math.ceil(total_size / block_size)

            def block_at(i):
                start = i * block_size
                end = min(start + block_size, total_size)
                return (start, end)

            slice_indices = [block_at(i) for i in range(length)]
        elif break_mode == 'complete':
            tok_idx = 0
            sz_idx = 0
            curr_size = 0
            while sz_idx < len(sizes):
                if curr_size + sizes[sz_idx] <= block_size or curr_size == 0:
                    curr_size += sizes[sz_idx]
                    sz_idx += 1
                else:
                    slice_indices.append((tok_idx, tok_idx + curr_size))
                    tok_idx += curr_size
                    curr_size = 0
            if curr_size > 0:
                slice_indices.append((tok_idx, tok_idx + curr_size))
        elif break_mode == 'complete_doc':
            tok_idx = 0
            sz_idx = 0
            curr_size = 0
            while sz_idx < len(sizes):
                if ((curr_size + sizes[sz_idx] <= block_size or curr_size == 0)
                        # an empty sentence indicates end-of-document:
                        and sizes[sz_idx] != document_sep_len):
                    curr_size += sizes[sz_idx]
                    sz_idx += 1
                else:
                    slice_indices.append((tok_idx, tok_idx + curr_size))
                    tok_idx += curr_size
                    curr_size = 0
                    if sizes[sz_idx] == document_sep_len:
                        tok_idx += sizes[sz_idx]
                        sz_idx += 1
            if curr_size > 0:
                slice_indices.append((tok_idx, tok_idx + curr_size))
        elif break_mode == 'eos':
            slice_indices = np.empty((len(sizes), 2), dtype=int)
            if not torch.is_tensor(sizes):
                sizes = torch.tensor(sizes)
            cumsum = torch.cumsum(sizes, dim=0)
            slice_indices[0] = [0, sizes[0]]
            if len(cumsum) > 1:
                slice_indices[1:] = cumsum.unfold(0, 2, 1)
        else:
            raise ValueError('Invalid break_mode: ' + break_mode)

        slice_indices = np.array(slice_indices, dtype=int)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == 'eos':
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # starting offset within starting index
                    np.zeros(len(sizes), dtype=np.int64),
                    np.arange(len(sizes))  # ending index in dataset
                ],
                1,
            )
        else:
            ds = DatasetSearcher(sizes)
            block_to_dataset_index = np.empty((len(slice_indices), 3),
                                              dtype=int)
            for i, (s, e) in enumerate(slice_indices):
                ds.seek(s)
                start_ds_idx = ds.current_index
                start_offset = ds.current_offset
                if e <= s:
                    # empty block: map it to its start position instead of
                    # skipping, which would leave this np.empty row undefined
                    end_ds_idx = start_ds_idx
                else:
                    ds.seek(e - 1)
                    end_ds_idx = ds.current_index
                block_to_dataset_index[i] = (
                    start_ds_idx,  # starting index in dataset
                    start_offset,  # starting offset within starting index
                    end_ds_idx,  # ending index in dataset
                )

        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)
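
Note: the 'complete' branch above greedily packs whole sentences until adding the next one would overflow block_size (an over-long sentence still gets a block of its own). A pure-Python trace of the same loop on toy sizes:

    def complete_blocks(sizes, block_size):
        # mirrors the 'complete' branch: pack whole sentences greedily
        slices, tok_idx, curr = [], 0, 0
        for sz in sizes:
            if curr + sz <= block_size or curr == 0:
                curr += sz
            else:
                slices.append((tok_idx, tok_idx + curr))
                tok_idx += curr
                curr = sz
        if curr > 0:
            slices.append((tok_idx, tok_idx + curr))
        return slices

    print(complete_blocks([3, 4, 5, 2], block_size=7))  # [(0, 7), (7, 14)]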
Example #15
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode='complete_doc',
        include_targets=False,
        document_sep_len=1,
        context_mode='doc',
        window_size=3,
    ):
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`')

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else 'none'

        # For "eos" break-mode, block_size is not required parameters.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size,
                                                document_sep_len)

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # starting offset within starting index
                    np.zeros(len(sizes), dtype=np.int64),
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)

        self.context_mode = context_mode
        self.window_size = window_size

        context_index, _sizes = self.rebuild_index()
        self._context_index = plasma_utils.PlasmaArray(np.array(context_index))
        self._sizes = plasma_utils.PlasmaArray(np.array(_sizes))
Example #16
    def __init__(
        self,
        dataset,
        sizes,
        block_sizes,
        pad,
        eos,
        document_sep_len=1,
    ):
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`')

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        assert min(block_sizes) > 0
        block_sizes = [0] + block_sizes
        slice_indices_list = []
        sizes_list = []
        block_to_dataset_index_list = []
        number_of_inst_in_block = []
        for block_size in block_sizes:
            break_mode = "eos" if block_size == 0 else "complete"
            slice_indices = _get_slice_indices_fast(sizes, break_mode,
                                                    block_size,
                                                    document_sep_len)
            slice_indices_list.append(slice_indices)
            sizes_list.append(slice_indices[:, 1] - slice_indices[:, 0])
            number_of_inst_in_block.append(len(slice_indices))

            # build index mapping block indices to the underlying dataset indices
            if break_mode == "eos":
                # much faster version for eos break mode
                block_to_dataset_index = np.stack(
                    [
                        np.arange(len(sizes)),  # starting index in dataset
                        # starting offset within starting index
                        np.zeros(len(sizes), dtype=np.int64),
                        np.arange(len(sizes)),  # ending index in dataset
                    ],
                    1,
                )
            else:
                block_to_dataset_index = _get_block_to_dataset_index_fast(
                    sizes, slice_indices)
            block_to_dataset_index_list.append(block_to_dataset_index)

        self._sizes = np.concatenate(sizes_list)
        self._slice_indices = np.concatenate(slice_indices_list, axis=0)
        self._block_to_dataset_index = np.concatenate(
            block_to_dataset_index_list, axis=0)
        self._number_of_inst_in_block = np.array(number_of_inst_in_block,
                                                 dtype=np.int64)

        self._slice_indices = plasma_utils.PlasmaArray(self._slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            self._block_to_dataset_index)
        self._number_of_inst_in_block = plasma_utils.PlasmaArray(
            self._number_of_inst_in_block)
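
Note: because example #16 concatenates index arrays built from several block sizes, a flat block index must eventually be mapped back to the bucket it came from; one plausible way (an assumption, not shown in the source) is a cumulative-sum lookup over _number_of_inst_in_block:

    import numpy as np

    number_of_inst_in_block = np.array([4, 10, 6], dtype=np.int64)  # toy counts
    boundaries = np.cumsum(number_of_inst_in_block)  # [4, 14, 20]

    def bucket_of(block_index):
        # first bucket whose cumulative count exceeds the flat index
        return int(np.searchsorted(boundaries, block_index, side='right'))

    print(bucket_of(0), bucket_of(5), bucket_of(15))  # 0 1 2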