예제 #1
0
    def __init__(self, hparams):
        """Initializes the base class, defaultizes each dataset's
        hyperparameters, and builds the data.

        Every entry of ``self._hparams.datasets`` is wrapped in an
        :class:`HParams` whose defaults come from
        ``_default_dataset_hparams`` for that entry's ``"data_type"``.

        Args:
            hparams: Hyperparameters forwarded to ``TextDataBase.__init__``.
        """
        TextDataBase.__init__(self, hparams)

        # Defaultize the hparams of each dataset. An entry without a
        # "data_type" key is looked up with `None`.
        self._hparams.datasets = [
            HParams(entry,
                    _default_dataset_hparams(entry.get("data_type", None)))
            for entry in self._hparams.datasets
        ]

        scope_name = self.name
        default_scope_name = self.default_hparams()["name"]
        with tf.name_scope(scope_name, default_scope_name):
            self._make_data()
예제 #2
0
 def default_hparams():
     """Returns a dictionary of default hyperparameters.

     The defaults of ``TextDataBase`` are used as the base; ``"name"`` is
     set to ``"paired_text_data"`` and the entries returned by
     ``_default_paired_text_dataset_hparams`` are merged on top.
     """
     defaults = TextDataBase.default_hparams()
     defaults["name"] = "paired_text_data"
     paired_defaults = _default_paired_text_dataset_hparams()
     defaults.update(paired_defaults)
     return defaults
예제 #3
0
 def default_hparams():
     """Returns a dictionary of default hyperparameters.

     Starts from the defaults of ``TextDataBase``, sets ``"name"`` to
     ``"multi_aligned_data"``, and sets ``"datasets"`` to a one-element
     list holding the result of ``_default_dataset_hparams()``.
     """
     defaults = TextDataBase.default_hparams()
     defaults["name"] = "multi_aligned_data"
     single_dataset_defaults = _default_dataset_hparams()
     defaults["datasets"] = [single_dataset_defaults]
     return defaults
예제 #4
0
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters:

        .. code-block:: python

            {
                # (1) Hyperparams specific to text dataset
                "datasets": [],
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "multi_aligned_data",
            }

        Here:

        1. "datasets" is a list of `dict`, each of which specifies a
           dataset that can be text, scalar, or Record. The
           :attr:`"data_name"` field of each dataset is used as the name
           prefix of the data fields from the respective dataset. The
           :attr:`"data_name"` fields of different datasets should not be
           the same.

           i) For scalar dataset, the allowed hyperparameters and default
              values are the same as the "dataset" field of
              :meth:`texar.data.ScalarData.default_hparams`. Note that
              :attr:`"data_type"` must be explicitly specified
              (either "int" or "float").

           ii) For Record dataset, the allowed hyperparameters and default
               values are the same as the "dataset" field of
               :meth:`texar.data.RecordData.default_hparams`. Note that
               :attr:`"data_type"` must be explicitly specified ("record").

           iii) For text dataset, the allowed hyperparameters and default
                values are the same as the "dataset" field of
                :meth:`texar.data.MonoTextData.default_hparams`, with several
                extra hyperparameters:

                `"data_type"`: str
                    The type of the dataset, one of {"text", "int", "float",
                    "record"}. If set to "int" or "float", the dataset is
                    considered to be a scalar dataset. If set to
                    "record", the dataset is considered to be a Record
                    dataset.

                    If not specified or set to "text", the dataset is
                    considered to be a text dataset.

                `"vocab_share_with"`: int, optional
                    Share the vocabulary of a preceding text dataset with
                    the specified index in the list (starting from 0). The
                    specified dataset must be a text dataset, and must have
                    an index smaller than the current dataset.

                    If specified, the vocab file of the current dataset is
                    ignored. Default is `None`, which disables the vocab
                    sharing.

                `"embedding_init_share_with"`: int, optional
                    Share the embedding initial value of a preceding text
                    dataset with the specified index in the list (starting
                    from 0). The specified dataset must be a text dataset,
                    and must have an index smaller than the current dataset.

                    If specified, the :attr:`"embedding_init"` field of the
                    current dataset is ignored. Default is `None`, which
                    disables the initial value sharing.

                `"processing_share_with"`: int, optional
                    Share the processing configurations of a preceding text
                    dataset with the specified index in the list (starting
                    from 0). The specified dataset must be a text dataset,
                    and must have an index smaller than the current dataset.

                    If specified, relevant fields of the current dataset are
                    ignored, including `delimiter`, `bos_token`,
                    `eos_token`, and "other_transformations". Default is
                    `None`, which disables the processing sharing.

        2. For the **general** hyperparameters, see
           :meth:`texar.data.DataBase.default_hparams` for details.

        """
        defaults = TextDataBase.default_hparams()
        defaults["name"] = "multi_aligned_data"
        # No datasets are configured by default.
        defaults["datasets"] = []
        return defaults
예제 #5
0
    def default_hparams():
        """Returns a dictionary of default hyperparameters.

        The returned dictionary is built from the defaults of
        ``TextDataBase``, with "name" set to "mono_text_data" and a
        "dataset" entry taken from ``_default_mono_text_dataset_hparams()``.
        The fields of the "dataset" entry and their defaults are:

        .. code-block:: python

            {
                "files": [],
                "compression_type": None,
                "vocab_file": "",
                "embedding_init": {},
                "delimiter": " ",
                "max_seq_length": None,
                "length_filter_mode": "truncate",
                "pad_to_max_seq_length": False,
                "bos_token": SpecialTokens.BOS,
                "eos_token": SpecialTokens.EOS,
                "other_transformations": [],
                "variable_utterance": False,
                "utterance_delimiter": "|||",
                "max_utterance_cnt": 5,
                "data_name": None,
            }

        Here:

        "files" : str or list
            A (list of) text file path(s).

            Each line contains a single text sequence.

        "compression_type" : str, optional
            One of "" (no compression), "ZLIB", or "GZIP".

        "vocab_file": str
            Path to vocabulary file. Each line of the file should contain
            one vocabulary token.

            Used to create an instance of :class:`~texar.data.Vocab`.

        "embedding_init" : dict
            The hyperparameters for pre-trained embedding loading and
            initialization.

            The structure and default values are defined in
            :meth:`texar.data.Embedding.default_hparams`.

        "delimiter" : str
            The delimiter to split each line of the text files into tokens.

        "max_seq_length" : int, optional
            Maximum length of output sequences. Data samples exceeding the
            length will be truncated or discarded according to
            :attr:`"length_filter_mode"`. The length does not include any added
            :attr:`"bos_token"` or :attr:`"eos_token"`. If `None` (default),
            no filtering is performed.

        "length_filter_mode" : str
            Either "truncate" or "discard". If "truncate" (default),
            tokens exceeding the :attr:`"max_seq_length"` will be truncated.
            If "discard", data samples longer than the :attr:`"max_seq_length"`
            will be discarded.

        "pad_to_max_seq_length" : bool
            If `True`, pad all data instances to length
            :attr:`"max_seq_length"`.
            Raises error if :attr:`"max_seq_length"` is not provided.

        "bos_token" : str
            The Begin-Of-Sequence token prepended to each sequence.

            Set to an empty string to avoid prepending.

        "eos_token" : str
            The End-Of-Sequence token appended to each sequence.

            Set to an empty string to avoid appending.

        "other_transformations" : list
            A list of transformation functions or function names/paths to
            further transform the data instances.

            (More documentation to be added.)

        "variable_utterance" : bool
            If `True`, each line of the text file is considered to contain
            multiple sequences (utterances) separated by
            :attr:`"utterance_delimiter"`.

            For example, in dialog data, each line can contain a series of
            dialog history utterances. See the example in
            `examples/hierarchical_dialog` for a use case.

        "utterance_delimiter" : str
            The delimiter to split over utterance level. Should not be the
            same as :attr:`"delimiter"`. Used only when
            :attr:`"variable_utterance"` is `True`.

        "max_utterance_cnt" : int
            Maximum number of utterances allowed in a data instance.
            Extra utterances are truncated.

        "data_name" : str
            Name of the data.
        """
        defaults = TextDataBase.default_hparams()
        defaults["name"] = "mono_text_data"
        dataset_defaults = _default_mono_text_dataset_hparams()
        defaults.update({"dataset": dataset_defaults})
        return defaults
예제 #6
0
 def __init__(self, hparams):
     """Initializes the base class and builds the data.

     Args:
         hparams: Hyperparameters forwarded to ``TextDataBase.__init__``.
     """
     TextDataBase.__init__(self, hparams)
     scope_name = self.name
     default_scope_name = self.default_hparams()["name"]
     with tf.name_scope(scope_name, default_scope_name):
         self._make_data()
예제 #7
0
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters:

        .. code-block:: python

            {
                # (1) Hyperparameters specific to text dataset
                "dataset": {
                    "files": [],
                    "compression_type": None,
                    "vocab_file": "",
                    "embedding_init": {},
                    "delimiter": " ",
                    "max_seq_length": None,
                    "length_filter_mode": "truncate",
                    "pad_to_max_seq_length": False,
                    "bos_token": "<BOS>",
                    "eos_token": "<EOS>",
                    "other_transformations": [],
                    "variable_utterance": False,
                    "utterance_delimiter": "|||",
                    "max_utterance_cnt": 5,
                    "data_name": None,
                },
                # (2) General hyperparameters
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "mono_text_data",
                # (3) Bucketing
                "bucket_boundaries": [],
                "bucket_batch_sizes": None,
                "bucket_length_fn": None,
            }

        Here:

        1. For the hyperparameters in the :attr:`"dataset"` field:

          "files" : str or list
              A (list of) text file path(s).

              Each line contains a single text sequence.

          "compression_type" : str, optional
              One of ``None`` (no compression), ``"ZLIB"``, or ``"GZIP"``.

          "vocab_file": str
              Path to vocabulary file. Each line of the file should contain
              one vocabulary token.

              Used to create an instance of :class:`~texar.data.Vocab`.

          "embedding_init" : dict
              The hyperparameters for pre-trained embedding loading and
              initialization.

              The structure and default values are defined in
              :meth:`texar.data.Embedding.default_hparams`.

          "delimiter" : str
              The delimiter to split each line of the text files into tokens.

          "max_seq_length" : int, optional
              Maximum length of output sequences. Data samples exceeding the
              length will be truncated or discarded according to
              :attr:`"length_filter_mode"`. The length does not include
              any added :attr:`"bos_token"` or :attr:`"eos_token"`.
              If `None` (default), no filtering is performed.

          "length_filter_mode" : str
              Either ``"truncate"`` or ``"discard"``. If ``"truncate"``
              (default), tokens exceeding :attr:`"max_seq_length"` will be
              truncated.
              If ``"discard"``, data samples longer than
              :attr:`"max_seq_length"` will be discarded.

          "pad_to_max_seq_length" : bool
              If `True`, pad all data instances to length
              :attr:`"max_seq_length"`.
              Raises error if :attr:`"max_seq_length"` is not provided.

          "bos_token" : str
              The Begin-Of-Sequence token prepended to each sequence.

              Set to an empty string to avoid prepending.

          "eos_token" : str
              The End-Of-Sequence token appended to each sequence.

              Set to an empty string to avoid appending.

          "other_transformations" : list
              A list of transformation functions or function names/paths to
              further transform each single data instance.

              (More documentation to be added.)

          "variable_utterance" : bool
              If `True`, each line of the text file is considered to contain
              multiple sequences (utterances) separated by
              :attr:`"utterance_delimiter"`.

              For example, in dialog data, each line can contain a series of
              dialog history utterances. See the example in
              `examples/hierarchical_dialog` for a use case.

              .. warning::
                  Variable utterances are not yet supported. This option
                  (and related ones below) will be ignored.

          "utterance_delimiter" : str
              The delimiter to split over utterance level. Should not be the
              same as :attr:`"delimiter"`. Used only when
              :attr:`"variable_utterance"` is ``True``.

          "max_utterance_cnt" : int
              Maximum number of utterances allowed in a data instance.
              Extra utterances are truncated.

          "data_name" : str
              Name of the dataset.

        2. For the **general** hyperparameters, see
           :meth:`texar.data.DataBase.default_hparams` for details.

        3. **Bucketing** is to group elements of the dataset
           together by length and then pad and batch. For bucketing
           hyperparameters:

          "bucket_boundaries" : list
              An int list containing the upper length boundaries of the
              buckets.

              Set to an empty list (default) to disable bucketing.

          "bucket_batch_sizes" : list
              An int list containing batch size per bucket. Length should be
              `len(bucket_boundaries) + 1`.

              If `None`, every bucket will have the same batch size specified
              in :attr:`batch_size`.

          "bucket_length_fn" : str or callable
              Function that maps a dataset element to an ``int`` giving
              the length of the element.

              This can be a function, or the name or full module path to the
              function. If function name is given, the function must be in the
              :mod:`texar.custom` module.

              If `None` (default), length is determined by the number of
              tokens (including BOS and EOS if added) of the element.

          .. warning::
              Bucketing is not yet supported. These options will be ignored.

        """
        defaults = TextDataBase.default_hparams()
        defaults["name"] = "mono_text_data"
        dataset_defaults = _default_mono_text_dataset_hparams()
        defaults.update({"dataset": dataset_defaults})
        return defaults
예제 #8
0
    def default_hparams():
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                # (1) Hyperparams specific to text dataset
                "source_dataset": {
                    "files": [],
                    "compression_type": None,
                    "vocab_file": "",
                    "embedding_init": {},
                    "delimiter": " ",
                    "max_seq_length": None,
                    "length_filter_mode": "truncate",
                    "pad_to_max_seq_length": False,
                    "bos_token": None,
                    "eos_token": "<EOS>",
                    "other_transformations": [],
                    "variable_utterance": False,
                    "utterance_delimiter": "|||",
                    "max_utterance_cnt": 5,
                    "data_name": "source",
                },
                "target_dataset": {
                    # ...
                    # Same fields are allowed as in "source_dataset" with the
                    # same default values, except the
                    # following new fields/values:
                    "bos_token": "<BOS>",
                    "vocab_share": False,
                    "embedding_init_share": False,
                    "processing_share": False,
                    "data_name": "target",
                },
                # (2) General hyperparams
                "num_epochs": 1,
                "batch_size": 64,
                "allow_smaller_final_batch": True,
                "shuffle": True,
                "shuffle_buffer_size": None,
                "shard_and_shuffle": False,
                "num_parallel_calls": 1,
                "prefetch_buffer_size": 0,
                "max_dataset_size": -1,
                "seed": None,
                "name": "paired_text_data",
                # (3) Bucketing
                "bucket_boundaries": [],
                "bucket_batch_sizes": None,
                "bucket_length_fn": None,
            }

        Here:

        1. Hyperparameters in the :attr:`"source_dataset"` and
           :attr:`"target_dataset"` fields have the same definition as those
           in :meth:`texar.data.MonoTextData.default_hparams`, for source and
           target text, respectively.

           For the new hyperparameters in "target_dataset":

           "vocab_share" : bool
               Whether to share the vocabulary of source.
               If `True`, the vocab file of target is ignored.

           "embedding_init_share" : bool
               Whether to share the embedding initial value of source. If
               `True`, :attr:`"embedding_init"` of target is ignored.

               :attr:`"vocab_share"` must be true to share the embedding
               initial value.

           "processing_share" : bool
               Whether to share the processing configurations of source,
               including "delimiter", "bos_token", "eos_token", and
               "other_transformations".

        2. For the **general** hyperparameters, see
           :meth:`texar.data.DataBase.default_hparams` for details.

        3. For **bucketing** hyperparameters, see
           :meth:`texar.data.MonoTextData.default_hparams` for details, except
           that the default bucket_length_fn is the maximum sequence length
           of source and target sequences.

           .. warning::
               Bucketing is not yet supported. These options will be ignored.

        """
        defaults = TextDataBase.default_hparams()
        defaults["name"] = "paired_text_data"
        paired_defaults = _default_paired_text_dataset_hparams()
        defaults.update(paired_defaults)
        return defaults