def test_c4_bare_preprocess_fn_denoising_objective(self):
  _t5_gin_config()

  dataset = _c4_dataset()
  dataset = tf_inputs.c4_bare_preprocess_fn(dataset, spm_path=_spm_path())

  example = list(tfds.as_numpy(dataset.take(1)))[0]

  # Assertions now.
  self.assertIn('targets', example)
  targets = example['targets']
  self.assertIsInstance(targets, np.ndarray)
  self.assertEqual(targets.dtype, np.int64)
  self.assertGreater(len(targets), 0)

  self.assertIn('inputs', example)
  _inputs = example['inputs']  # pylint: disable=invalid-name
  self.assertIsInstance(_inputs, np.ndarray)
  self.assertEqual(_inputs.dtype, np.int64)
  self.assertGreater(len(_inputs), 0)

  # With high probability (WHP), the inputs will have the bulk of the text.
  self.assertGreater(len(_inputs), len(targets))

  # WHP there will be exactly one sentinel token in the inputs and targets.
  inputs_counter = collections.Counter(_inputs.tolist())
  targets_counter = collections.Counter(targets.tolist())
  self.assertEqual(1, inputs_counter[31999])
  self.assertEqual(1, targets_counter[31999])
def test_c4_bare_preprocess_fn(self):
  dataset = _c4_dataset()
  example = list(tfds.as_numpy(dataset.take(1)))[0]

  # Targets are NOT in the example yet.
  self.assertNotIn('targets', example)
  self.assertIn('text', example)
  text = example['text']

  # This should convert the dataset to tokenized inputs/targets.
  dataset = tf_inputs.c4_bare_preprocess_fn(dataset, spm_path=_spm_path())
  example = list(tfds.as_numpy(dataset.take(1)))[0]

  # The earlier text is now stored in targets_pretokenized.
  self.assertIn('targets_pretokenized', example)
  self.assertEqual(example['targets_pretokenized'], text)

  # Targets are now tokenized.
  self.assertIn('targets', example)
  self.assertIsInstance(example['targets'], np.ndarray)
  self.assertEqual(example['targets'].dtype, np.int64)
  self.assertGreater(len(example['targets']), 0)
  self.assertEqual(example['targets'][-1], 1)  # We add EOS at the end.

  # Inputs exist but are empty, because the t5 preprocessors' `unsupervised`
  # function wasn't gin-configured with any preprocessors.
  self.assertIn('inputs', example)
  self.assertEqual(len(example['inputs']), 0)
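
# --- Assumed module context (a minimal sketch, not the file's actual
# preamble). The tests above rely on imports and helpers defined elsewhere
# in the module: `collections`, `np`, `tfds`, `tf_inputs`, plus the
# `_c4_dataset`, `_spm_path`, and `_t5_gin_config` helpers. The bodies
# below are illustrative assumptions: the testdata paths and the tiny-C4
# loading details are stand-ins, chosen to be consistent with the 32k-vocab
# sentinel id 31999 checked above.
import collections
import os

import numpy as np
import tensorflow_datasets as tfds

from trax.data import tf_inputs

_TESTDATA = os.path.join(os.path.dirname(__file__), 'testdata')


def _spm_path():
  # A 32000-entry sentencepiece model; its last sentinel id is then 31999.
  return os.path.join(_TESTDATA, 'sentencepiece.model')


def _c4_dataset(split='train'):
  # Loads a small C4 shard checked in under testdata (assumed layout).
  return tfds.load(name='c4', split=split, data_dir=_TESTDATA,
                   shuffle_files=False)


def _t5_gin_config():
  # The real module binds T5's span-corruption preprocessors
  # (select_random_chunk, reduce_concat_tokens, split_tokens, denoise)
  # through gin so that `unsupervised` yields the denoising objective;
  # the exact bindings are elided in this sketch.
  ...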