Пример #1
0
    def _process_dataset(self, dataset, hparams, data_spec):
        """Transform, length-filter and truncate a paired dataset.

        Args:
            dataset: The dataset to process; must support `map`, `filter`
                and `take`.
            hparams: Hyperparameters indexed by "source_dataset",
                "target_dataset", "num_parallel_calls" and
                "max_dataset_size".
            data_spec: The data specification handed to
                `_make_processor`.

        Returns:
            A tuple `(dataset, data_spec)` with the processed dataset and
            the data specification returned by `_make_processor`.
        """
        src_hparams = hparams["source_dataset"]
        tgt_hparams = hparams["target_dataset"]

        # Build the per-element transformation and the updated data spec.
        prefix = PairedTextData._get_name_prefix(src_hparams, tgt_hparams)
        transform, data_spec = self._make_processor(
            src_hparams, tgt_hparams, data_spec, name_prefix=prefix)

        def _apply(*args):
            # `map` passes the element components as positional arguments.
            return transform(dsutils.maybe_tuple(args))

        dataset = dataset.map(
            _apply, num_parallel_calls=hparams["num_parallel_calls"])

        # Filter by length; `_make_length_filter` may yield a falsy value,
        # in which case no filtering is applied.
        src_decoder = data_spec.decoder[0]
        tgt_decoder = data_spec.decoder[1]
        src_length_name = dsutils._connect_name(
            data_spec.name_prefix[0], src_decoder.length_tensor_name)
        tgt_length_name = dsutils._connect_name(
            data_spec.name_prefix[1], tgt_decoder.length_tensor_name)
        length_filter = self._make_length_filter(
            src_hparams, tgt_hparams,
            src_length_name, tgt_length_name,
            src_decoder, tgt_decoder)
        if length_filter:
            dataset = dataset.filter(length_filter)

        # Truncate the number of elements.
        return dataset.take(hparams["max_dataset_size"]), data_spec
Пример #2
0
 def text_id_name(self):
     """Return the name of the text index tensor.

     The name is "text_ids" by default.
     """
     spec = self._data_spec
     return dsutils._connect_name(
         spec.name_prefix, spec.decoder.text_id_tensor_name)
Пример #3
0
 def length_name(self):
     """Return the name of the length tensor.

     The name is "length" by default.
     """
     spec = self._data_spec
     return dsutils._connect_name(
         spec.name_prefix, spec.decoder.length_tensor_name)
Пример #4
0
 def _get_length_name(i):
     """Return the length tensor name of the `i`-th dataset.

     Returns `None` when the `i`-th dataset is not text data. `hparams`
     and `data_spec` are taken from the surrounding scope.
     """
     if _is_text_data(hparams["datasets"][i]["data_type"]):
         return dsutils._connect_name(
             data_spec.name_prefix[i],
             data_spec.decoder[i].length_tensor_name)
     return None
Пример #5
0
 def target_text_id_name(self):
     """Return the name of the target text index tensor.

     The name is "target_text_ids" by default.
     """
     target_prefix = self._data_spec.name_prefix[1]
     return dsutils._connect_name(
         target_prefix, self._tgt_decoder.text_id_tensor_name)
Пример #6
0
 def source_text_id_name(self):
     """Return the name of the source text index tensor.

     The name is "source_text_ids" by default.
     """
     source_prefix = self._data_spec.name_prefix[0]
     return dsutils._connect_name(
         source_prefix, self._src_decoder.text_id_tensor_name)
Пример #7
0
 def utterance_cnt_name(self):
     """Return the name of the utterance count tensor.

     The name is "utterance_cnt" by default.

     Raises:
         ValueError: If the dataset hyperparameter `variable_utterance`
             is falsy.
     """
     if self._hparams.dataset.variable_utterance:
         spec = self._data_spec
         return dsutils._connect_name(
             spec.name_prefix, spec.decoder.utterance_cnt_tensor_name)
     raise ValueError("`utterance_cnt_name` is not defined.")
Пример #8
0
 def target_utterance_cnt_name(self):
     """Return the name of the target text utterance count tensor.

     Raises:
         ValueError: If the target dataset hyperparameter
             `variable_utterance` is falsy.
     """
     if self._hparams.target_dataset.variable_utterance:
         target_prefix = self._data_spec.name_prefix[1]
         return dsutils._connect_name(
             target_prefix, self._tgt_decoder.utterance_cnt_tensor_name)
     raise ValueError("`utterance_cnt_name` of target data is undefined.")
Пример #9
0
 def source_utterance_cnt_name(self):
     """Return the name of the source text utterance count tensor.

     Raises:
         ValueError: If the source dataset hyperparameter
             `variable_utterance` is falsy.
     """
     if self._hparams.source_dataset.variable_utterance:
         source_prefix = self._data_spec.name_prefix[0]
         return dsutils._connect_name(
             source_prefix, self._src_decoder.utterance_cnt_tensor_name)
     raise ValueError("`utterance_cnt_name` of source data is undefined.")
Пример #10
0
 def data_name(self, name_or_id):
     """Return the data tensor name of a scalar dataset.

     The dataset is selected by its name or id. Returns `None` if the
     selected dataset is not scalar data.
     """
     idx = self._maybe_name_to_id(name_or_id)
     if _is_scalar_data(self._hparams.datasets[idx]["data_type"]):
         return dsutils._connect_name(
             self._data_spec.name_prefix[idx],
             self._data_spec.decoder[idx].data_tensor_name)
     return None
Пример #11
0
 def text_id_name(self, name_or_id):
     """Return the text index tensor name of a text dataset.

     The dataset is selected by its name or id. Returns `None` if the
     selected dataset is not of text type.
     """
     idx = self._maybe_name_to_id(name_or_id)
     if _is_text_data(self._hparams.datasets[idx]["data_type"]):
         return dsutils._connect_name(
             self._data_spec.name_prefix[idx],
             self._data_spec.decoder[idx].text_id_tensor_name)
     return None
Пример #12
0
 def utterance_cnt_name(self, name_or_id):
     """Return the utterance count tensor name of a text dataset.

     The dataset is selected by its name or id. Returns `None` unless the
     selected dataset is text data with `variable_utterance` enabled.
     """
     idx = self._maybe_name_to_id(name_or_id)
     ds_hparams = self._hparams.datasets[idx]
     # Short-circuits exactly like the two separate negated checks would:
     # `variable_utterance` is only read for text datasets.
     if not (_is_text_data(ds_hparams["data_type"])
             and ds_hparams["variable_utterance"]):
         return None
     return dsutils._connect_name(
         self._data_spec.name_prefix[idx],
         self._data_spec.decoder[idx].utterance_cnt_tensor_name)
Пример #13
0
    def _process_dataset(self, dataset, hparams, data_spec):
        """Transform, length-filter and truncate a dataset.

        Args:
            dataset: The dataset to process; must support `map`, `filter`
                and `take`.
            hparams: Hyperparameters indexed by "dataset",
                "num_parallel_calls" and "max_dataset_size".
            data_spec: The data specification handed to
                `_make_processor`.

        Returns:
            A tuple `(dataset, data_spec)` with the processed dataset and
            the data specification returned by `_make_processor`.
        """
        dataset_hparams = hparams["dataset"]

        # Build the per-element transformation and the updated data spec.
        transform, data_spec = self._make_processor(
            dataset_hparams,
            data_spec,
            name_prefix=dataset_hparams["data_name"])

        def _apply(*args):
            # `map` passes the element components as positional arguments.
            return transform(dsutils.maybe_tuple(args))

        dataset = dataset.map(
            _apply, num_parallel_calls=hparams["num_parallel_calls"])

        # Filter by length; `_make_length_filter` may yield a falsy value,
        # in which case no filtering is applied.
        decoder = data_spec.decoder
        length_name = dsutils._connect_name(
            data_spec.name_prefix, decoder.length_tensor_name)
        length_filter = self._make_length_filter(
            dataset_hparams, length_name, decoder)
        if length_filter:
            dataset = dataset.filter(length_filter)

        # Truncate the number of elements.
        return dataset.take(hparams["max_dataset_size"]), data_spec
Пример #14
0
 def target_length_name(self):
     """Return the name of the target length tensor."""
     target_prefix = self._data_spec.name_prefix[1]
     return dsutils._connect_name(
         target_prefix, self._tgt_decoder.length_tensor_name)
Пример #15
0
 def source_length_name(self):
     """Return the name of the source length tensor."""
     source_prefix = self._data_spec.name_prefix[0]
     return dsutils._connect_name(
         source_prefix, self._src_decoder.length_tensor_name)