Example #1
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        See the specific subclasses for the details.
        """
        hparams = DatasetBase.default_hparams()
        hparams.update({
            "bucket_boundaries": [],
            "bucket_batch_sizes": None,
            "bucket_length_fn": None})
        return hparams
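
The method above only registers the bucketing hyperparameters; concrete subclasses interpret them. A minimal sketch of overriding them, assuming a hypothetical `MyBucketedData` subclass of `DatasetBase` that honors these keys:

# Hedged sketch: MyBucketedData is a hypothetical DatasetBase subclass.
hparams = MyBucketedData.default_hparams()
hparams.update({
    # Examples with length <= 10, <= 20, <= 30 fall into successive buckets;
    # longer ones go into a final overflow bucket.
    "bucket_boundaries": [10, 20, 30],
    # One batch size per bucket: len(bucket_boundaries) + 1 entries.
    "bucket_batch_sizes": [64, 32, 16, 8],
    # How to measure the length of a single example.
    "bucket_length_fn": lambda ex: len(ex),
})
data = MyBucketedData(hparams=hparams)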
Example #2
    def _test_data(self, data: DatasetBase,
                   returns_data: bool = False,
                   always_returns_data: bool = False):
        sampler = BufferShuffleSampler(data, self.buffer_size)
        for epoch in range(2):
            indices = list(iter(sampler))
            if always_returns_data or (returns_data and epoch == 0):
                # While the dataset is not yet fully cached, the sampler
                # yields (index, example) pairs; split them apart here.
                examples = [ex[1] for ex in indices]
                indices = [ex[0] for ex in indices]
                # The test fixture presumably stores each example as its own
                # index value, so the two lists must coincide.
                np.testing.assert_array_equal(indices, examples)
            # Each epoch must visit every index 0 .. size-1 exactly once.
            self.assertEqual(len(set(indices)), self.size)
            self.assertEqual(min(indices), 0)
            self.assertEqual(max(indices), self.size - 1)
            # Mark the data as fully cached so the next epoch yields plain
            # indices instead of (index, example) pairs.
            data._fully_cached = True
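
This test drives `BufferShuffleSampler` directly, but in typical use buffer-based shuffling is requested through the general hyperparameters shown in the other examples rather than by constructing a sampler by hand. A hedged sketch, with `MyDataset` standing in for any concrete `DatasetBase` subclass:

# Hedged sketch: MyDataset is a hypothetical DatasetBase subclass.
hparams = MyDataset.default_hparams()
hparams.update({
    "shuffle": True,
    # Shuffle within a sliding buffer of 1000 examples instead of
    # materializing and shuffling the whole dataset first.
    "shuffle_buffer_size": 1000,
})
data = MyDataset(hparams=hparams)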
Example #3
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to scalar dataset
                "dataset": {
                    "files": [],
                    "compression_type": None,
                    "data_type": "int",
                    "other_transformations": [],
                    "data_name": "data",
                },
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "scalar_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

            `"files"`: str or list
                A (list of) file path(s).

                Each line contains a single scalar number.

            `"compression_type"`: str, optional
                One of "" (no compression), "ZLIB", or "GZIP".

            `"data_type"`: str
                The scalar type. Types defined in
                :meth:`~texar.torch.utils.dtypes.get_supported_scalar_types` are
                supported.

            `"other_transformations"`: list
                A list of transformation functions or function names/paths to
                further transform each single data instance.

                (More documentation to be added.)

            `"data_name"`: str
                Name of the dataset.

        2. For the **general** hyperparameters, see
           :meth:`texar.torch.data.DatasetBase.default_hparams` for details.

        """
        hparams = DatasetBase.default_hparams()
        hparams["name"] = "scalar_data"
        hparams.update({"dataset": _default_scalar_dataset_hparams()})
        return hparams
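
A minimal usage sketch for this scalar dataset (presumably `texar.torch.data.ScalarData`), assuming a hypothetical file `./scores.txt` containing one integer per line:

import texar.torch as tx

hparams = {
    "dataset": {
        "files": "./scores.txt",  # hypothetical file, one scalar per line
        "data_type": "int",
        "data_name": "score",
    },
    "batch_size": 2,
}
data = tx.data.ScalarData(hparams=hparams)
for batch in tx.data.DataIterator(data):
    print(batch["score"])  # a tensor of shape [batch_size]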
Example #4
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparameters specific to the record data
                "dataset": {
                    "files": [],
                    "feature_types": {},
                    "feature_convert_types": {},
                    "image_options": {},
                    "num_shards": None,
                    "shard_id": None,
                    "other_transformations": [],
                    "data_name": None,
                },
                # (2) General hyperparameters
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "tfrecord_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

           `"files"`: str or list
               A (list of) pickled file path(s).

           `"feature_types"`: dict
               The feature names (`str`) with their descriptions in the form of
               ``feature_name: [dtype, feature_collate_method, shape]``:

               - ``dtype`` is a Python type (``int``, ``str``), dtype instance
                 from PyTorch (``torch.float``), NumPy (``np.int64``),
                 or TensorFlow (``tf.string``), or their stringified names such
                 as ``"torch.float"`` and ``"np.int64"``. The feature will be
                 read from the files and parsed into this dtype.

               - ``feature_collate_method`` is of type ``str``, and describes
                 how features are collated in the batch. Available values are:

                 - ``"stacked_tensor"``: Features are assumed to be tensors of a
                   fixed shape (or scalars). When collating, features are
                   stacked, with the batch dimension being the first dimension.
                   This is the default value if ``feature_collate_method`` is
                   not specified. For example:

                   - 5 scalar features -> a tensor of shape [5].
                   - 4 tensor features, each of shape [6, 5] -> a tensor of
                     shape [4, 6, 5].

                 - ``"padded_tensor"``: Features are assumed to be tensors, with
                   all dimensions except the first having the same size. When
                   collating, features are padded with zero values along the
                   end of the first dimension so that every tensor has the same
                   size, and then stacked, with the batch dimension being the
                   first dimension. For example:

                   - 3 tensor features, with shapes [4, 7, 8], [5, 7, 8], and
                     [4, 7, 8] -> a tensor of shape [3, 5, 7, 8].

                 - ``"list"``: Features can be any objects. When collating, the
                   features are stored in a Python list.

               - ``shape`` is optional, and can be of type ``int``, ``tuple``, or
                 ``torch.Size``. If specified, shapes of tensor features will be
                 checked, depending on the ``feature_collate_method``:

                 - ``"stacked_tensor"``: The shape of every feature tensor must
                   be ``shape``.
                 - ``"padded_tensor"``: The shape (excluding first dimension)
                   of every feature tensor must be ``shape``.
                 - ``"list"``: ``shape`` is ignored.

                 .. note::
                    Shape check is performed before any transformations are
                    applied.

               Example:

               .. code-block:: python

                   feature_types = {
                       "input_ids": ["int64", "stacked_tensor", 128],
                       "label_ids": ["int64", "stacked_tensor"],
                       "name_lists": ["string", "list"],
                   }

               .. note::
                   This field is named `"feature_original_types"` in Texar-TF.
                   This name is still supported, but is deprecated in favor of
                   `"feature_types"`.

                   Texar-TF also uses different names for feature types:

                   - ``"FixedLenFeature"`` corresponds to ``"stacked_tensor"``.
                   - ``"FixedLenSequenceFeature"`` corresponds to
                     ``"padded_tensor"``.
                   - ``"VarLenFeature"`` corresponds to ``"list"``.

                   These names are also accepted in Texar-PyTorch, but are
                   deprecated in favor of the new names.

           `"feature_convert_types"`: dict, optional
               Specifies dtype converting after reading the data files. This
               `dict` maps feature names to desired data dtypes. For example,
               you can first read a feature into dtype ``torch.int32`` by
               specifying in :attr:`"feature_types"` above, and convert
               the feature to dtype ``"torch.long"`` by specifying here.
               Features not specified here are not converted.

               - ``dtype`` is a Python type (`int`, `str`), dtype instance from
                 PyTorch (``torch.float``), NumPy (``np.int64``),
                 or TensorFlow (``tf.string``), or their stringified names such
                 as ``"torch.float"`` and ``"np.int64"``.

               Note that this conversion is performed after all the data
               are restored.

               Example:

               .. code-block:: python

                   feature_convert_types = {
                       "input_ids": "int32",
                       "label_ids": "int32",
                   }

           `"image_options"`: dict, optional
               Specifies the image feature name and image resizing options.
               It includes three fields:

               - `"image_feature_name"`: str
                   The name of the feature which contains the image data. If
                   set, the image data will be restored in a `numpy.ndarray`.
               - `"resize_height"`: int
                   The height of the image after resizing.
               - `"resize_width"`: int
                   The width of the image after resizing.

               If either :attr:`"resize_height"` or :attr:`"resize_width"` is
               not set, image data will be restored with its original shape.

           `"num_shards"`: int, optional
               The number of data shards in distributed mode. Usually set to
               the number of processes in distributed computing.
               Used in combination with :attr:`"shard_id"`.

               .. warning::
                   Sharding is not yet supported. This option (and
                   related ones below) will be ignored.

           `"shard_id"`: int, optional
               Sets the unique ID that identifies a shard. The module will
               process only the corresponding shard of the whole data.
               Used in combination with :attr:`"num_shards"`.

               For example, in a case of distributed computing on 2 GPUs, the
               hyperparameters of the data module for the two processes can be
               configured as below, respectively.

               For GPU 0:

               .. code-block:: python

                   dataset: {
                       ...
                       "num_shards": 2,
                       "shard_id": 0
                   }

               For GPU 1:

               .. code-block:: python

                   dataset: {
                       ...
                       "num_shards": 2,
                       "shard_id": 1
                   }

               Also refer to `examples/bert` for a use case.

           `"other_transformations"`: list
               A list of transformation functions or function names/paths to
               further transform each single data instance.

           `"data_name"`: str
               Name of the dataset.

        2. For the **general** hyperparameters, see
           :meth:`texar.torch.data.DatasetBase.default_hparams` for details.
        """
        hparams = DatasetBase.default_hparams()
        hparams["name"] = "record_data"
        hparams.update({"dataset": _default_record_dataset_hparams()})
        return hparams
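
A hedged usage sketch for reading such a record dataset (presumably `texar.torch.data.RecordData`), assuming a hypothetical pickled file `data.pkl` whose entries match the `feature_types` example in the docstring above:

import texar.torch as tx

hparams = {
    "dataset": {
        "files": "data.pkl",  # hypothetical pickled record file
        "feature_types": {
            "input_ids": ["int64", "stacked_tensor", 128],
            "label_ids": ["int64", "stacked_tensor"],
        },
    },
    "batch_size": 32,
}
data = tx.data.RecordData(hparams=hparams)
batch = next(iter(tx.data.DataIterator(data)))
# batch["input_ids"]: tensor of shape [32, 128]
# batch["label_ids"]: tensor of shape [32]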
Example #5
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparameters specific to the record data
                "dataset": {
                    "files": [],
                    "feature_original_types": {},
                    "feature_convert_types": {},
                    "image_options": {},
                    "num_shards": None,
                    "shard_id": None,
                    "other_transformations": [],
                    "data_name": None,
                },
                # (2) General hyperparameters
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "tfrecord_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

           `"files"`: str or list
               A (list of) pickled file path(s).

           `"feature_original_types"`: dict
               The feature names (`str`) with their data types and length
               types, given as key-value pairs of the form
               ``feature_name: [dtype, feature_len_type, len]``:

               - ``dtype`` is a Python type (``int``, ``str``), dtype instance
                 from PyTorch (``torch.float``), NumPy (``np.int64``),
                 or TensorFlow (``tf.string``), or their stringified names such
                 as ``"torch.float"`` and ``"np.int64"``. The feature will be
                 read from the files and parsed into this dtype.

               - ``feature_len_type`` is of type ``str``, and can be either
                 ``"FixedLenFeature"`` or ``"VarLenFeature"`` for fixed length
                 features and non-fixed length features, respectively.

               - ``len`` is an ``int`` and is optional. It is the length for
                 ``"FixedLenFeature"``. Ignored if ``"VarLenFeature"`` is used.

               Example:

               .. code-block:: python

                   feature_original_types = {
                       "input_ids": ["int64", "FixedLenFeature", 128],
                       "label_ids": ["int64", "FixedLenFeature"],
                       "name_lists": ["string", "VarLenFeature"],
                   }

           `"feature_convert_types"`: dict, optional
               Specifies dtype converting after reading the data files. This
               `dict` maps feature names to desired data dtypes. For example,
               you can first read a feature into dtype ``torch.int32`` by
               specifying in :attr:`"feature_original_types"` above, and convert
               the feature to dtype ``"torch.long"`` by specifying here.
               Features not specified here are not converted.

               - ``dtype`` is a Python type (`int`, `str`), dtype instance from
                 PyTorch (``torch.float``), NumPy (``np.int64``),
                 or TensorFlow (``tf.string``), or their stringified names such
                 as ``"torch.float"`` and ``"np.int64"``.

               Note that this conversion is performed after all the data
               are restored.

               Example:

               .. code-block:: python

                   feature_convert_types = {
                       "input_ids": "int32",
                       "label_ids": "int32",
                   }

           `"image_options"`: dict, optional
               Specifies the image feature name and image resizing options.
               It includes three fields:

               - `"image_feature_name"`: str
                   The name of the feature which contains the image data. If
                   set, the image data will be restored in a `numpy.ndarray`.
               - `"resize_height"`: int
                   The height of the image after resizing.
               - `"resize_width"`: int
                   The width of the image after resizing.

               If either :attr:`"resize_height"` or :attr:`"resize_width"` is
               not set, image data will be restored with its original shape.

           `"num_shards"`: int, optional
               The number of data shards in distributed mode. Usually set to
               the number of processes in distributed computing.
               Used in combination with :attr:`"shard_id"`.

               .. warning::
                   Sharding is not yet supported. This option (and
                   related ones below) will be ignored.

           `"shard_id"`: int, optional
               Sets the unique ID that identifies a shard. The module will
               process only the corresponding shard of the whole data.
               Used in combination with :attr:`"num_shards"`.

               For example, in a case of distributed computing on 2 GPUs, the
               hyperparameters of the data module for the two processes can be
               configured as below, respectively.

               For GPU 0:

               .. code-block:: python

                   dataset: {
                       ...
                       "num_shards": 2,
                       "shard_id": 0
                   }

               For GPU 1:

               .. code-block:: python

                   dataset: {
                       ...
                       "num_shards": 2,
                       "shard_id": 1
                   }

               Also refer to `examples/bert` for a use case.

           `"other_transformations"`: list
               A list of transformation functions or function names/paths to
               further transform each single data instance.

           `"data_name"`: str
               Name of the dataset.

        2. For the **general** hyperparameters, see
           :meth:`texar.torch.data.DatasetBase.default_hparams` for details.
        """
        hparams = DatasetBase.default_hparams()
        hparams["name"] = "record_data"
        hparams.update({"dataset": _default_record_dataset_hparams()})
        return hparams
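
This example documents the older, Texar-TF-style names (`feature_original_types`, `"FixedLenFeature"`, `"VarLenFeature"`) that Example #4 deprecates in favor of `feature_types` and the collate-method names. A hedged sketch of producing a compatible pickled file, assuming `RecordData.writer` acts as a context manager that serializes one example per `write()` call:

import numpy as np
import texar.torch as tx

feature_original_types = {
    "input_ids": ["int64", "FixedLenFeature", 128],
    "label_ids": ["int64", "FixedLenFeature"],
}
# Hedged sketch: each write() stores one example (a dict mapping feature
# names to values) into the hypothetical file "output.pkl".
with tx.data.RecordData.writer("output.pkl", feature_original_types) as writer:
    writer.write({
        "input_ids": np.zeros(128, dtype=np.int64),
        "label_ids": 1,
    })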