def default_hparams(): r"""Returns a dictionary of default hyperparameters. .. code-block:: python { # (1) Hyperparams specific to scalar dataset "dataset": { "files": [], "compression_type": None, "data_type": "int", "other_transformations": [], "data_name": "data", } # (2) General hyperparams "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "scalar_data", } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: `"files"`: str or list A (list of) file path(s). Each line contains a single scalar number. `"compression_type"`: str, optional One of "" (no compression), "ZLIB", or "GZIP". `"data_type"`: str The scalar type. Currently supports "int" and "float". `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. (More documentations to be added.) `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see :meth:`texar.data.DataBase.default_hparams` for details. """ hparams = DataBase.default_hparams() hparams["name"] = "scalar_data" hparams.update({"dataset": _default_scalar_dataset_hparams()}) return hparams
def default_hparams(): """Returns a dicitionary of default hyperparameters. """ hparams = DataBase.default_hparams() hparams["name"] = "scalar_data" hparams.update({ "dataset": _default_scalar_dataset_hparams() }) return hparams
def default_hparams(): """Returns a dictionary of default hyperparameters. """ hparams = DataBase.default_hparams() hparams.update({ "bucket_boundaries": [], "bucket_batch_sizes": None, "bucket_length_fn": None }) return hparams
def default_hparams(): r"""Returns a dictionary of default hyperparameters. See the specific subclasses for the details. """ hparams = DataBase.default_hparams() hparams.update({ "bucket_boundaries": [], "bucket_batch_sizes": None, "bucket_length_fn": None }) return hparams
def _test_data(self, data: DataBase, returns_data: bool = False,
               always_returns_data: bool = False):
    sampler = BufferShuffleSampler(data, self.buffer_size)
    for epoch in range(2):
        indices = list(iter(sampler))
        if always_returns_data or (returns_data and epoch == 0):
            examples = [ex[1] for ex in indices]
            indices = [ex[0] for ex in indices]
            np.testing.assert_array_equal(indices, examples)
        self.assertEqual(len(set(indices)), self.size)
        self.assertEqual(min(indices), 0)
        self.assertEqual(max(indices), self.size - 1)
        data._fully_cached = True
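# The sampler under test performs buffer-based shuffling. Conceptually, it
# keeps a fixed-size buffer of indices and emits a random element as new
# indices stream in. The following is an illustrative sketch of that idea,
# not the library's implementation:
#
#     import random
#     from typing import Iterator, Optional
#
#     def buffer_shuffle(num_examples: int, buffer_size: int,
#                        seed: Optional[int] = None) -> Iterator[int]:
#         # Keep at most `buffer_size` indices in memory; each time a new
#         # index arrives beyond that, emit one uniformly at random, then
#         # drain the remainder at the end.
#         rng = random.Random(seed)
#         buffer = []
#         for index in range(num_examples):
#             buffer.append(index)
#             if len(buffer) > buffer_size:
#                 yield buffer.pop(rng.randrange(len(buffer)))
#         rng.shuffle(buffer)
#         yield from buffer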
def default_hparams(): r"""Returns a dictionary of default hyperparameters. .. code-block:: python { # (1) Hyperparameters specific to TFRecord dataset 'dataset': { 'files': [], 'feature_original_types': {}, 'feature_convert_types': {}, 'image_options': {}, "num_shards": None, "shard_id": None, "other_transformations": [], "data_name": None, } # (2) General hyperparameters "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "tfrecord_data", } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: `"files"`: str or list A (list of) TFRecord file path(s). `"feature_original_types"`: dict The feature names (str) with their data types and length types, key and value in pair `feature_name: [dtype, feature_len_type, len]`, - `dtype` is a Python type (`int`, `str`), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. The feature will be read from the files and parsed into this dtype. - `feature_len_type` is of type `str`, and can be either 'FixedLenFeature' or 'VarLenFeature' for fixed length features and non-fixed length features, respectively. - `len` is an `int` and is optional. It is the length for 'FixedLenFeature'. Ignored if 'VarLenFeature' is used. Example: .. code-block:: python feature_original_types = { "input_ids": ["tf.int64", "FixedLenFeature", 128], "label_ids": ["tf.int64", "FixedLenFeature"], "name_lists": ["tf.string", "VarLenFeature"], } `"feature_convert_types"`: dict, optional Specifies dtype converting after reading the data files. This `dict` maps feature names to desired data dtypes. For example, you can first read a feature into dtype ``torch.int32`` by specifying in "feature_original_types" above, and convert the feature to dtype ``"torch.long"`` by specifying here. Features not specified here will not do dtype-convert. - `dtype` is a Python type (`int`, `str`), dtype instance from PyTorch (``torch.float``), NumPy (``np.int64``), or TensorFlow (``tf.string``), or their stringified names such as ``"torch.float"`` and ``"np.int64"``. Be noticed that this converting process is after all the data are restored, `feature_original_types` has to be set firstly. Example: .. code-block:: python feature_convert_types = { "input_ids": "tf.int32", "label_ids": "tf.int32", } `"image_options"`: dict, optional Specifies the image feature name and performs image resizing, includes three fields: - "image_feature_name": A `str`, the name of the feature which contains the image data. If set, the image data will be restored in format `numpy.ndarray`. - "resize_height": A `int`, the height of the image after resizing. - "resize_width": A `int`, the width of the image after resizing If either `resize_height` or `resize_width` is not set, image data will be restored with original shape. .. warning:: Sharding is not yet supported. This option (and related ones below) will be ignored. "num_shards": int, optional The number of data shards in distributed mode. Usually set to the number of processes in distributed computing. Used in combination with :attr:`"shard_id"`. `"shard_id"`: int, optional Sets the unique id to identify a shard. The module will processes only the corresponding shard of the whole data. Used in combination with :attr:`"num_shards"`. 
E.g., in a case of distributed computing on 2 GPUs, the hparams of the data module for the two processes can be as below, respectively. For gpu 0: .. code-block:: python dataset: { ... "num_shards": 2, "shard_id": 0 } For gpu 1: .. code-block:: python dataset: { ... "num_shards": 2, "shard_id": 1 } Also refer to `examples/bert` for a use case. `"other_transformations"`: list A list of transformation functions or function names/paths to further transform each single data instance. `"data_name"`: str Name of the dataset. 2. For the **general** hyperparameters, see :meth:`texar.data.DataBase.default_hparams` for details. """ hparams = DataBase.default_hparams() hparams["name"] = "record_data" hparams.update({"dataset": _default_record_dataset_hparams()}) return hparams
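# To tie the pieces above together, the sketch below assembles an hparams
# dict for a hypothetical sentence-classification TFRecord file. The file
# path and feature names are assumptions for illustration; only the structure
# follows the docstring (see `examples/bert` for a real use case).
#
#     hparams = {
#         "dataset": {
#             "files": "train.tfrecord",            # hypothetical path
#             "feature_original_types": {
#                 # read token ids as a fixed-length int64 vector of length 128
#                 "input_ids": ["tf.int64", "FixedLenFeature", 128],
#                 # read the label as a single int64 value
#                 "label_ids": ["tf.int64", "FixedLenFeature"],
#             },
#             "feature_convert_types": {
#                 # downcast token ids to int32 after reading
#                 "input_ids": "tf.int32",
#             },
#             "data_name": "data",
#         },
#         "batch_size": 32,
#         "shuffle": True,
#     }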
def __init__(self, hparams):
    DataBase.__init__(self, hparams)
    with tf.name_scope(self.name, self.default_hparams()["name"]):
        self._make_data()
def __init__(self, hparams):
    DataBase.__init__(self, hparams)