def _pack_or_pad(self,
                 ds: tf.data.Dataset,
                 packed_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Trim/pad to `packed_lengths` and optionally pack the input dataset."""
  if self.pack:
    ds = utils.trim_and_pack_dataset(ds, packed_lengths,
                                     self._use_custom_packing_ops)
  else:
    ds = utils.trim_and_pad_dataset(ds, packed_lengths)
  return ds
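# A minimal sketch contrasting the two branches above (hypothetical helper,
# not part of the library; assumes the module's existing `tf` and `utils`
# imports) for feature_lengths={"targets": 6}. Padding keeps one example per
# row; packing concatenates short examples into a single row and adds
# *_segment_ids / *_positions features that record example boundaries.
def _demo_pack_vs_pad():
  ds = tf.data.Dataset.from_generator(
      lambda: [{"targets": [3, 9, 1]}, {"targets": [4, 1]}],
      output_types={"targets": tf.int32},
      output_shapes={"targets": [None]})
  padded = utils.trim_and_pad_dataset(ds, {"targets": 6})
  # -> [3, 9, 1, 0, 0, 0] and [4, 1, 0, 0, 0, 0], one row per example.
  packed = utils.trim_and_pack_dataset(ds, {"targets": 6})
  # -> a single row:
  #    targets             [3, 9, 1, 4, 1, 0]
  #    targets_segment_ids [1, 1, 1, 2, 2, 0]
  #    targets_positions   [0, 1, 2, 0, 1, 0]
  return padded, packed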
def test_trim_and_pad_dataset(self):
  x = [{"inputs": [7, 8, 5, 6, 1], "targets": [3, 9, 1], "idx": [0]},
       {"inputs": [8, 4, 9, 3, 5, 7, 9, 1], "targets": [4, 1], "idx": [1, 2]}]
  ds = create_default_dataset(x, feature_names=("inputs", "targets", "idx"))
  padded_ds = utils.trim_and_pad_dataset(
      ds, feature_lengths={"inputs": 7, "targets": 3})
  expected = [
      {
          "inputs": [7, 8, 5, 6, 1, 0, 0],
          "targets": [3, 9, 1],
          "idx": [0],
      },
      {
          # EOS is trimmed.
          "inputs": [8, 4, 9, 3, 5, 7, 9],
          "targets": [4, 1, 0],
          "idx": [1, 2],
      },
  ]
  assert_dataset(
      padded_ds, expected, {"inputs": tf.int32, "targets": tf.int32})
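# Note on the test above: "idx" is absent from `feature_lengths`, so it passes
# through unchanged. A standalone sketch of that pass-through behavior
# (hypothetical helper; assumes the module's existing `tf` and `utils` imports
# and eager execution):
def _demo_trim_and_pad_passthrough():
  ds = tf.data.Dataset.from_tensors(
      {"targets": tf.constant([3, 9, 1]), "idx": tf.constant([0])})
  out = next(iter(
      utils.trim_and_pad_dataset(ds, feature_lengths={"targets": 5})))
  assert out["targets"].numpy().tolist() == [3, 9, 1, 0, 0]  # Padded to 5.
  assert out["idx"].numpy().tolist() == [0]  # Unchanged: not in the mapping.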
def test_caching(self):
  task_name = "caching"
  x = [{
      "inputs": [7, 8],
      "targets": [3, 9],
      "targets_pretokenized": "ex 1"
  }, {
      "inputs": [8, 4],
      "targets": [4],
      "targets_pretokenized": "ex 2"
  }]
  dtypes = {
      "inputs": tf.int32,
      "targets": tf.int32,
      "targets_pretokenized": tf.string
  }
  shapes = {"inputs": [None], "targets": [None], "targets_pretokenized": []}
  ds = tf.data.Dataset.from_generator(
      lambda: x, output_types=dtypes, output_shapes=shapes)
  dataset_fn = lambda split, shuffle_files: ds
  register_dummy_task(
      task_name,
      dataset_fn=dataset_fn,
      metrics_fn=[_sequence_accuracy_metric])

  # Feature converter that just pads "inputs" and "targets".
  feature_converter = mock.Mock(
      get_model_feature_lengths=lambda x: {"inputs": 4, "targets": 4})
  feature_converter.side_effect = (
      lambda ds, length: utils.trim_and_pad_dataset(
          ds, {"inputs": 4, "targets": 4}))
  evaluator = Evaluator(
      mixture_or_task_name=task_name,
      feature_converter=feature_converter,
      eval_split="validation")

  expected_task_examples = [{
      "inputs": [7, 8, 1],
      "targets": [3, 9, 1],
      "targets_pretokenized": b"ex 1"
  }, {
      "inputs": [8, 4, 1],
      "targets": [4, 1],
      "targets_pretokenized": b"ex 2"
  }]
  expected_examples = [{
      "inputs": [7, 8, 1, 0],
      "targets": [3, 9, 1, 0],
      "targets_pretokenized": b"ex 1"
  }, {
      "inputs": [8, 4, 1, 0],
      "targets": [4, 1, 0, 0],
      "targets_pretokenized": b"ex 2"
  }]

  test_utils.assert_dataset(
      evaluator._cached_task_datasets[task_name], expected_task_examples)
  # _cached_model_datasets are enumerated. Remove the index for assertion.
  eval_ds = evaluator._cached_model_datasets[task_name].map(lambda i, ds: ds)
  test_utils.assert_dataset(eval_ds, expected_examples)
  self.assertEqual(evaluator.cached_targets[task_name], ["ex 1", "ex 2"])
  self.assertDictEqual(
      evaluator.model_feature_lengths, {"inputs": 4, "targets": 4})
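# Notes on the test above: the dummy task's preprocessing appends EOS (id 1),
# which is why [7, 8] becomes [7, 8, 1] in expected_task_examples, and the
# mocked converter then pads to the model feature lengths. Outside of tests,
# the mock would be a real converter such as seqio's encoder-decoder one
# (a sketch, assuming seqio's public API; a real converter also renames
# features to model-side names like "encoder_input_tokens"):
#
#   evaluator = Evaluator(
#       mixture_or_task_name=task_name,
#       feature_converter=seqio.EncDecFeatureConverter(pack=False),
#       eval_split="validation")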