Example #1
 def _pack_or_pad(self,
                  ds: tf.data.Dataset,
                  packed_lengths: Mapping[str, int]) -> tf.data.Dataset:
   """Trim/pad to packed_lengths and optionally pack the input dataset."""
   if self.pack:
     ds = utils.trim_and_pack_dataset(ds, packed_lengths,
                                      self._use_custom_packing_ops)
   else:
     ds = utils.trim_and_pad_dataset(ds, packed_lengths)
   return ds
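
A minimal usage sketch of the two branches (the `from seqio import utils` import and the concrete lengths are illustrative assumptions, not taken from the example above): the pad branch trims/pads each example independently to the given lengths, while the pack branch additionally concatenates short examples into shared rows and emits auxiliary features marking example boundaries.

import tensorflow as tf
from seqio import utils  # assumption: `utils` above refers to seqio.utils

# Two variable-length examples, built the same way as in the tests below.
examples = [{"inputs": [7, 8, 1], "targets": [3, 1]},
            {"inputs": [8, 4, 9, 1], "targets": [4, 1]}]
ds = tf.data.Dataset.from_generator(
    lambda: examples,
    output_types={"inputs": tf.int32, "targets": tf.int32},
    output_shapes={"inputs": [None], "targets": [None]})

# pack=False branch: each example is trimmed/padded independently.
padded = utils.trim_and_pad_dataset(ds, {"inputs": 4, "targets": 3})
# -> {"inputs": [7, 8, 1, 0], "targets": [3, 1, 0]}
#    {"inputs": [8, 4, 9, 1], "targets": [4, 1, 0]}

# pack=True branch: several short examples can share one packed row, with
# extra segment-id/position features added to mark example boundaries.
packed = utils.trim_and_pack_dataset(ds, {"inputs": 8, "targets": 4})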
Example #2
 def test_trim_and_pad_dataset(self):
   x = [{"inputs": [7, 8, 5, 6, 1], "targets": [3, 9, 1], "idx": [0]},
        {"inputs": [8, 4, 9, 3, 5, 7, 9, 1], "targets": [4, 1], "idx": [1, 2]}]
   ds = create_default_dataset(x, feature_names=("inputs", "targets", "idx"))
   padded_ds = utils.trim_and_pad_dataset(
       ds,
       feature_lengths={"inputs": 7, "targets": 3})
   expected = [
       {
           "inputs": [7, 8, 5, 6, 1, 0, 0],
           "targets": [3, 9, 1],
           "idx": [0],
       },
       {
           # EOS is trimmed
           "inputs": [8, 4, 9, 3, 5, 7, 9],
           "targets": [4, 1, 0],
           "idx": [1, 2],
       }
   ]
   assert_dataset(
       padded_ds, expected, {"inputs": tf.int32, "targets": tf.int32})
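
Only the features named in `feature_lengths` are touched here: "inputs" is padded to 7 in the first example and trimmed to 7 in the second (dropping the trailing EOS id 1), while "idx", absent from `feature_lengths`, passes through unchanged and keeps its variable length. Continuing from the test's `padded_ds`, a quick way to see this (a hypothetical check, not part of the original test):

for ex in padded_ds.as_numpy_iterator():
    print(ex["inputs"].shape, ex["targets"].shape, ex["idx"])
# (7,) (3,) [0]
# (7,) (3,) [1 2]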
  def test_caching(self):
    task_name = "caching"
    x = [{
        "inputs": [7, 8],
        "targets": [3, 9],
        "targets_pretokenized": "ex 1"
    }, {
        "inputs": [8, 4],
        "targets": [4],
        "targets_pretokenized": "ex 2"
    }]
    dtypes = {
        "inputs": tf.int32,
        "targets": tf.int32,
        "targets_pretokenized": tf.string
    }
    shapes = {
        "inputs": [None],
        "targets": [None],
        "targets_pretokenized": []
    }
    ds = tf.data.Dataset.from_generator(lambda: x,
                                        output_types=dtypes,
                                        output_shapes=shapes)
    dataset_fn = lambda split, shuffle_files: ds
    register_dummy_task(task_name,
                        dataset_fn=dataset_fn,
                        metrics_fn=[_sequence_accuracy_metric])

    # Feature converter that just pads "inputs" and "targets".
    feature_converter = mock.Mock(get_model_feature_lengths=lambda x: {
        "inputs": 4,
        "targets": 4
    })
    feature_converter.side_effect = (
        lambda ds, length: utils.trim_and_pad_dataset(
            ds, {
                "inputs": 4,
                "targets": 4
            }))
    evaluator = Evaluator(mixture_or_task_name=task_name,
                          feature_converter=feature_converter,
                          eval_split="validation")
    expected_task_examples = [{
        "inputs": [7, 8, 1],
        "targets": [3, 9, 1],
        "targets_pretokenized": b"ex 1"
    }, {
        "inputs": [8, 4, 1],
        "targets": [4, 1],
        "targets_pretokenized": b"ex 2"
    }]
    expected_examples = [{
        "inputs": [7, 8, 1, 0],
        "targets": [3, 9, 1, 0],
        "targets_pretokenized": b"ex 1"
    }, {
        "inputs": [8, 4, 1, 0],
        "targets": [4, 1, 0, 0],
        "targets_pretokenized": b"ex 2"
    }]

    test_utils.assert_dataset(evaluator._cached_task_datasets[task_name],
                              expected_task_examples)

    # _cached_model_datasets are enumerated. Remove the index for assertion.
    eval_ds = evaluator._cached_model_datasets[task_name].map(
        lambda i, ds: ds)
    test_utils.assert_dataset(eval_ds, expected_examples)
    self.assertEqual(evaluator.cached_targets[task_name], ["ex 1", "ex 2"])
    self.assertDictEqual(evaluator.model_feature_lengths, {
        "inputs": 4,
        "targets": 4
    })
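
The mock wiring above relies on two `unittest.mock` behaviors: keyword arguments passed to `Mock(...)` become plain attributes (here `get_model_feature_lengths`), and `side_effect` makes the mock itself callable, returning the side effect's return value. A self-contained sketch of the same pattern (the dataset and lengths are illustrative values, not taken from the test):

import tensorflow as tf
from unittest import mock
from seqio import utils  # same assumption as above: seqio.utils

lengths = {"inputs": 4, "targets": 4}
ds = tf.data.Dataset.from_tensor_slices(
    {"inputs": [[7, 8, 1]], "targets": [[3, 9, 1]]})

# Keyword args to Mock() attach attributes to the mock...
fc = mock.Mock(get_model_feature_lengths=lambda task_lengths: lengths)
# ...and side_effect makes calling the mock run this function instead.
fc.side_effect = lambda ds, task_lengths: utils.trim_and_pad_dataset(ds, lengths)

model_ds = fc(ds, lengths)  # the padded dataset from trim_and_pad_dataset
assert fc.get_model_feature_lengths(lengths) == lengths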