Example #1
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to scalar dataset
                "dataset": {
                    "files": [],
                    "compression_type": None,
                    "data_type": "int",
                    "other_transformations": [],
                    "data_name": "data",
                },
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "scalar_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

            `"files"`: str or list
                A (list of) file path(s).

                Each line contains a single scalar number.

            `"compression_type"`: str, optional
                One of "" (no compression), "ZLIB", or "GZIP".

            `"data_type"`: str
                The scalar type. Currently supports "int" and "float".

            `"other_transformations"`: list
                A list of transformation functions or function names/paths to
                further transform each single data instance.

                (More documentations to be added.)

            `"data_name"`: str
                Name of the dataset.

        2. For the **general** hyperparameters, see
        :meth:`texar.data.DataBase.default_hparams` for details.

        """
        hparams = DataBase.default_hparams()
        hparams["name"] = "scalar_data"
        hparams.update({"dataset": _default_scalar_dataset_hparams()})
        return hparams
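The snippet above only defines the defaults. As a hedged usage sketch (the file path "scores.txt" is illustrative, and `tx.data.ScalarData` is assumed to be the texar class that consumes these hyperparameters), a caller would override only the fields it needs:

import texar as tx

# Hedged sketch: "scores.txt" is an illustrative path; ScalarData is
# assumed to be the class whose defaults are shown above.
hparams = {
    "dataset": {
        "files": "scores.txt",  # one scalar per line
        "data_type": "float",   # parse each line as a float
    },
    "batch_size": 32,
    "shuffle": False,
}
data = tx.data.ScalarData(hparams)  # omitted fields fall back to the defaults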
Example #2
 def default_hparams():
     """Returns a dicitionary of default hyperparameters.
     """
     hparams = DataBase.default_hparams()
     hparams["name"] = "scalar_data"
     hparams.update({
         "dataset": _default_scalar_dataset_hparams()
     })
     return hparams
Example #3
 def default_hparams():
     """Returns a dictionary of default hyperparameters.
     """
     hparams = DataBase.default_hparams()
     hparams.update({
         "bucket_boundaries": [],
         "bucket_batch_sizes": None,
         "bucket_length_fn": None
     })
     return hparams
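The three keys added here follow the usual bucket-by-sequence-length convention: N `bucket_boundaries` define N + 1 length buckets, and `bucket_batch_sizes`, if given, supplies one batch size per bucket. A minimal sketch with illustrative values layered over these defaults:

# Hedged sketch: boundary and batch-size values are illustrative only.
hparams = default_hparams()
hparams.update({
    "bucket_boundaries": [10, 20, 30],      # 3 boundaries -> 4 buckets
    "bucket_batch_sizes": [64, 32, 16, 8],  # one batch size per bucket
})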
Example #4
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        See the specific subclasses for the details.
        """
        hparams = DataBase.default_hparams()
        hparams.update({
            "bucket_boundaries": [],
            "bucket_batch_sizes": None,
            "bucket_length_fn": None
        })
        return hparams
Example #5
 def _test_data(self, data: DataBase,
                returns_data: bool = False,
                always_returns_data: bool = False):
     sampler = BufferShuffleSampler(data, self.buffer_size)
     for epoch in range(2):
         indices = list(iter(sampler))
         if always_returns_data or (returns_data and epoch == 0):
             # While the data is not fully cached, the sampler yields
             # (index, example) pairs; split them into parallel lists.
             examples = [ex[1] for ex in indices]
             indices = [ex[0] for ex in indices]
             np.testing.assert_array_equal(indices, examples)
         # Each epoch must cover every index 0 .. size - 1 exactly once.
         self.assertEqual(len(set(indices)), self.size)
         self.assertEqual(min(indices), 0)
         self.assertEqual(max(indices), self.size - 1)
         # Mark the data fully cached so later epochs yield bare indices.
         data._fully_cached = True
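Read backwards, the test suggests the sampler's iteration contract: until the dataset is fully cached, iteration yields `(index, example)` pairs; afterwards it yields bare indices, and every epoch is a permutation of `0 .. size - 1`. A minimal sketch under that assumption (`data` and the buffer size are placeholders):

# Hedged sketch of the contract suggested by the test above.
sampler = BufferShuffleSampler(data, buffer_size=5)
for item in sampler:
    if isinstance(item, tuple):  # not fully cached: (index, example) pairs
        index, example = item
    else:                        # fully cached: bare indices
        index = item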
Example #6
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparameters specific to TFRecord dataset
                'dataset': {
                    'files': [],
                    'feature_original_types': {},
                    'feature_convert_types': {},
                    'image_options': {},
                    "num_shards": None,
                    "shard_id": None,
                    "other_transformations": [],
                    "data_name": None,
                },
                # (2) General hyperparameters
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "tfrecord_data",
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

            `"files"`: str or list
                A (list of) TFRecord file path(s).

            `"feature_original_types"`: dict
                A dict mapping each feature name (str) to its data type and
                length type, with entries of the form
                `feature_name: [dtype, feature_len_type, len]`, where:

                - `dtype` is a Python type (`int`, `str`), dtype instance from
                  PyTorch (``torch.float``), NumPy (``np.int64``),
                  or TensorFlow (``tf.string``), or their stringified names such
                  as ``"torch.float"`` and ``"np.int64"``. The feature will be
                  read from the files and parsed into this dtype.

                - `feature_len_type` is of type `str`, and can be either
                  'FixedLenFeature' or 'VarLenFeature' for fixed length
                  features and non-fixed length features, respectively.

                - `len` is an `int` and is optional. It is the length for
                  'FixedLenFeature'. Ignored if 'VarLenFeature' is used.

                Example:

                .. code-block:: python

                    feature_original_types = {
                        "input_ids": ["tf.int64", "FixedLenFeature", 128],
                        "label_ids": ["tf.int64", "FixedLenFeature"],
                        "name_lists": ["tf.string", "VarLenFeature"],
                    }

            `"feature_convert_types"`: dict, optional
                Specifies dtype converting after reading the data files. This
                `dict` maps feature names to desired data dtypes. For example,
                you can first read a feature into dtype ``torch.int32`` by
                specifying in "feature_original_types" above, and convert
                the feature to dtype ``"torch.long"`` by specifying here.
                Features not specified here will not do dtype-convert.

                - `dtype` is a Python type (`int`, `str`), dtype instance from
                  PyTorch (``torch.float``), NumPy (``np.int64``),
                  or TensorFlow (``tf.string``), or their stringified names such
                  as ``"torch.float"`` and ``"np.int64"``.

                Note that this conversion takes place after all the data are
                restored, so `feature_original_types` must be set first.

                Example:

                .. code-block:: python

                    feature_convert_types = {
                        "input_ids": "tf.int32",
                        "label_ids": "tf.int32",
                    }

            `"image_options"`: dict, optional
                Specifies the image feature name and image resizing options,
                in three fields:

                - "image_feature_name":
                    A `str`, the name of the feature that contains
                    the image data. If set, the image data
                    will be restored as a `numpy.ndarray`.
                - "resize_height":
                    An `int`, the height of the image after resizing.
                - "resize_width":
                    An `int`, the width of the image after resizing.

                If either `resize_height` or `resize_width` is not set,
                the image data is restored with its original shape.

            .. warning::
                  Sharding is not yet supported. This option (and
                  related ones below) will be ignored.

            "num_shards": int, optional
                The number of data shards in distributed mode. Usually set to
                the number of processes in distributed computing.
                Used in combination with :attr:`"shard_id"`.

            `"shard_id"`: int, optional
                Sets the unique ID that identifies a shard. The module will
                process only the corresponding shard of the whole data.
                Used in combination with :attr:`"num_shards"`.

                E.g., for distributed computing on 2 GPUs, the hparams
                of the data module for the two processes can be set as below,
                respectively.

                For GPU 0:

                .. code-block:: python

                    dataset: {
                        ...
                        "num_shards": 2,
                        "shard_id": 0
                    }

                For GPU 1:

                .. code-block:: python

                    dataset: {
                        ...
                        "num_shards": 2,
                        "shard_id": 1
                    }

                Also refer to `examples/bert` for a use case.

            `"other_transformations"`: list
                A list of transformation functions or function names/paths
                used to further transform each data instance.

            `"data_name"`: str
                Name of the dataset.

        2. For the **general** hyperparameters, see
        :meth:`texar.data.DataBase.default_hparams` for details.
        """
        hparams = DataBase.default_hparams()
        hparams["name"] = "record_data"
        hparams.update({"dataset": _default_record_dataset_hparams()})
        return hparams
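As with the scalar dataset, a hedged usage sketch may help; it assumes a `RecordData` class owning this `default_hparams` (note the default name "record_data" above), and the file and feature names are illustrative:

# Hedged sketch: path and feature names are illustrative only.
hparams = {
    "dataset": {
        "files": "train.tfrecord",
        "feature_original_types": {
            "input_ids": ["tf.int64", "FixedLenFeature", 128],
            "label_ids": ["tf.int64", "FixedLenFeature"],
        },
        "feature_convert_types": {"input_ids": "tf.int32"},
    },
    "batch_size": 64,
}
data = RecordData(hparams)  # omitted fields fall back to the defaults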
Example #7
 def __init__(self, hparams):
     DataBase.__init__(self, hparams)
     with tf.name_scope(self.name, self.default_hparams()["name"]):
         self._make_data()
Example #8
 def __init__(self, hparams):
     DataBase.__init__(self, hparams)