def __init__(self, hparams):
    TextDataBase.__init__(self, hparams)
    # Defaultizes hparams of each dataset
    datasets_hparams = self._hparams.datasets
    defaultized_datasets_hparams = []
    for ds_hpms in datasets_hparams:
        data_type = ds_hpms.get("data_type", None)
        defaultized_ds_hpms = HParams(ds_hpms,
                                      _default_dataset_hparams(data_type))
        defaultized_datasets_hparams.append(defaultized_ds_hpms)
    self._hparams.datasets = defaultized_datasets_hparams

    with tf.name_scope(self.name, self.default_hparams()["name"]):
        self._make_data()
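# A minimal sketch (not from the library) illustrating the per-dataset
# defaultization above. It assumes `HParams` and `_default_dataset_hparams`
# are in scope as in this module; the file name is a hypothetical placeholder.
partial = {"files": ["labels.txt"], "data_type": "int", "data_name": "label"}
full = HParams(partial, _default_dataset_hparams(partial["data_type"]))
# `full` now carries the user-set fields plus the default values of every
# field the user omitted for the scalar ("int") dataset type.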
def default_hparams(): """Returns a dicitionary of default hyperparameters: .. code-block:: python { # (1) Hyperparams specific to text dataset "dataset": { "files": [], "compression_type": None, "vocab_file": "", "embedding_init": {}, "delimiter": " ", "max_seq_length": None, "length_filter_mode": "truncate", "pad_to_max_seq_length": False, "bos_token": "<BOS>" "eos_token": "<EOS>" "other_transformations": [], "variable_utterance": False, "utterance_delimiter": "|||", "max_utterance_cnt": 5, "data_name": None, } # (2) General hyperparams "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "mono_text_data", # (3) Bucketing "bucket_boundaries": [], "bucket_batch_sizes": None, "bucket_length_fn": None, } Here: 1. For the hyperparameters in the :attr:`"dataset"` field: "files": str or list A (list of) text file path(s). Each line contains a single text sequence. "compression_type": str, optional One of "" (no compression), "ZLIB", or "GZIP". "vocab_file": str Path to vocabulary file. Each line of the file should contain one vocabulary token. Used to create an instance of :class:`~texar.tf.data.Vocab`. "embedding_init": dict The hyperparameters for pre-trained embedding loading and initialization. The structure and default values are defined in :meth:`texar.tf.data.Embedding.default_hparams`. "delimiter": str The delimiter to split each line of the text files into tokens. "max_seq_length": int, optional Maximum length of output sequences. Data samples exceeding the length will be truncated or discarded according to :attr:`"length_filter_mode"`. The length does not include any added :attr:`"bos_token"` or :attr:`"eos_token"`. If `None` (default), no filtering is performed. "length_filter_mode": str Either "truncate" or "discard". If "truncate" (default), tokens exceeding the :attr:`"max_seq_length"` will be truncated. If "discard", data samples longer than the :attr:`"max_seq_length"` will be discarded. "pad_to_max_seq_length": bool If `True`, pad all data instances to length :attr:`"max_seq_length"`. Raises error if :attr:`"max_seq_length"` is not provided. "bos_token": str The Begin-Of-Sequence token prepended to each sequence. Set to an empty string to avoid prepending. "eos_token": str The End-Of-Sequence token appended to each sequence. Set to an empty string to avoid appending. "other_transformations": list A list of transformation functions or function names/paths to further transform each single data instance. (More documentations to be added.) "variable_utterance": bool If `True`, each line of the text file is considered to contain multiple sequences (utterances) separated by :attr:`"utterance_delimiter"`. For example, in dialog data, each line can contain a series of dialog history utterances. See the example in `examples/hierarchical_dialog` for a use case. "utterance_delimiter": str The delimiter to split over utterance level. Should not be the same with :attr:`"delimiter"`. Used only when :attr:`"variable_utterance"``==True`. "max_utterance_cnt": int Maximally allowed number of utterances in a data instance. Extra utterances are truncated out. "data_name": str Name of the dataset. 2. For the **general** hyperparameters, see :meth:`texar.tf.data.DataBase.default_hparams` for details. 3. **Bucketing** is to group elements of the dataset together by length and then pad and batch. 
(See more at :tf_main:`bucket_by_sequence_length <contrib/data/bucket_by_sequence_length>`). For bucketing hyperparameters: "bucket_boundaries": list An int list containing the upper length boundaries of the buckets. Set to an empty list (default) to disable bucketing. "bucket_batch_sizes": list An int list containing batch size per bucket. Length should be `len(bucket_boundaries) + 1`. If `None`, every bucket whill have the same batch size specified in :attr:`batch_size`. "bucket_length_fn": str or callable Function maps dataset element to `tf.int32` scalar, determines the length of the element. This can be a function, or the name or full module path to the function. If function name is given, the function must be in the :mod:`texar.tf.custom` module. If `None` (default), length is determined by the number of tokens (including BOS and EOS if added) of the element. """ hparams = TextDataBase.default_hparams() hparams["name"] = "mono_text_data" hparams.update({"dataset": _default_mono_text_dataset_hparams()}) return hparams
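# A minimal usage sketch, assuming Texar-TF is importable as `texar.tf` and
# that "data.txt" and "vocab.txt" are hypothetical files supplied by the user.
import texar.tf as tx

hparams = {
    "dataset": {
        "files": ["data.txt"],      # hypothetical file, one sequence per line
        "vocab_file": "vocab.txt",  # hypothetical file, one token per line
        "max_seq_length": 20,
    },
    "batch_size": 2,
}
data = tx.data.MonoTextData(hparams)
iterator = tx.data.DataIterator(data)
batch = iterator.get_next()
# `batch` is a dict of tensors including, e.g., "text", "text_ids", "length".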
def __init__(self, hparams):
    TextDataBase.__init__(self, hparams)
    with tf.name_scope(self.name, self.default_hparams()["name"]):
        self._make_data()
def default_hparams(): """Returns a dicitionary of default hyperparameters. .. code-block:: python { # (1) Hyperparams specific to text dataset "source_dataset": { "files": [], "compression_type": None, "vocab_file": "", "embedding_init": {}, "delimiter": " ", "max_seq_length": None, "length_filter_mode": "truncate", "pad_to_max_seq_length": False, "bos_token": None, "eos_token": "<EOS>", "other_transformations": [], "variable_utterance": False, "utterance_delimiter": "|||", "max_utterance_cnt": 5, "data_name": "source", }, "target_dataset": { # ... # Same fields are allowed as in "source_dataset" with the # same default values, except the # following new fields/values: "bos_token": "<BOS>" "vocab_share": False, "embedding_init_share": False, "processing_share": False, "data_name": "target" } # (2) General hyperparams "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "paired_text_data", # (3) Bucketing "bucket_boundaries": [], "bucket_batch_sizes": None, "bucket_length_fn": None, } Here: 1. Hyperparameters in the :attr:`"source_dataset"` and attr:`"target_dataset"` fields have the same definition as those in :meth:`texar.tf.data.MonoTextData.default_hparams`, for source and target text, respectively. For the new hyperparameters in "target_dataset": "vocab_share": bool Whether to share the vocabulary of source. If `True`, the vocab file of target is ignored. "embedding_init_share": bool Whether to share the embedding initial value of source. If `True`, :attr:`"embedding_init"` of target is ignored. :attr:`"vocab_share"` must be true to share the embedding initial value. "processing_share": bool Whether to share the processing configurations of source, including "delimiter", "bos_token", "eos_token", and "other_transformations". 2. For the **general** hyperparameters, see :meth:`texar.tf.data.DataBase.default_hparams` for details. 3. For **bucketing** hyperparameters, see :meth:`texar.tf.data.MonoTextData.default_hparams` for details, except that the default bucket_length_fn is the maximum sequence length of source and target sequences. """ hparams = TextDataBase.default_hparams() hparams["name"] = "paired_text_data" hparams.update(_default_paired_text_dataset_hparams()) return hparams
def default_hparams(): """Returns a dicitionary of default hyperparameters. .. code-block:: python { # (1) Hyperparams specific to text dataset "datasets": [] # (2) General hyperparams "num_epochs": 1, "batch_size": 64, "allow_smaller_final_batch": True, "shuffle": True, "shuffle_buffer_size": None, "shard_and_shuffle": False, "num_parallel_calls": 1, "prefetch_buffer_size": 0, "max_dataset_size": -1, "seed": None, "name": "multi_aligned_data", } Here: 1. "datasets" is a list of `dict` each of which specifies a dataset which can be text, scalar or TFRecord. The :attr:`"data_name"` field of each dataset is used as the name prefix of the data fields from the respective dataset. The :attr:`"data_name"` field of each dataset should not be the same. - For scalar dataset, the allowed hyperparameters and default \ values are the same as the "dataset" field of \ :meth:`texar.tf.data.ScalarData.default_hparams`. Note that \ :attr:`"data_type"` must be explicily specified \ (either "int" or "float"). \ - For TFRecord dataset, the allowed hyperparameters and default \ values are the same as the "dataset" field of \ :meth:`texar.tf.data.TFRecordData.default_hparams`. Note that \ :attr:`"data_type"` must be explicily specified \ (tf_record"). \ - For text dataset, the allowed hyperparameters and default values\ are the same as the "dataset" filed of \ :meth:`texar.tf.data.MonoTextData.default_hparams`, with several \ extra hyperparameters: "data_type": str The type of the dataset, one of {"text", "int", "float", "tf_record"}. If set to "int" or "float", the dataset is considered to be a scalar dataset. If set to "tf_record", the dataset is considered to be a TFRecord dataset. If not specified or set to "text", the dataset is considered to be a text dataset. "vocab_share_with": int, optional Share the vocabulary of a preceding text dataset with the specified index in the list (starting from 0). The specified dataset must be a text dataset, and must have an index smaller than the current dataset. If specified, the vocab file of current dataset is ignored. Default is `None` which disables the vocab sharing. "embedding_init_share_with": int, optional Share the embedding initial value of a preceding text dataset with the specified index in the list (starting from 0). The specified dataset must be a text dataset, and must have an index smaller than the current dataset. If specified, the :attr:`"embedding_init"` field of the current dataset is ignored. Default is `None` which disables the initial value sharing. "processing_share_with": int, optional Share the processing configurations of a preceding text dataset with the specified index in the list (starting from 0). The specified dataset must be a text dataset, and must have an index smaller than the current dataset. If specified, relevant field of the current dataset are ignored, including "delimiter", "bos_token", "eos_token", and "other_transformations". Default is `None` which disables the processing sharing. 2. For the **general** hyperparameters, see :meth:`texar.tf.data.DataBase.default_hparams` for details. """ hparams = TextDataBase.default_hparams() hparams["name"] = "multi_aligned_data" hparams["datasets"] = [] return hparams