def test_lm_generate_distilgpt2(self):
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32)  # The president

    # The president of the United States, and the president of the United Kingdom, have been in the White
    # fmt: off
    expected_output_ids = [
        464, 1893, 286, 262, 1578, 1829, 11, 290, 262, 1893, 286, 262, 1578, 7526, 11, 423, 587, 287, 262, 2635,
    ]
    # fmt: on

    output_ids = model.generate(input_ids, do_sample=False)
    self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
def test_lm_generate_gpt2_sample_xla(self):
    # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same
    # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible
    # and that we can seed both versions.

    # forces the generation to happen on CPU, to avoid GPU-related quirks
    with tf.device("/CPU:0"):
        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        sentences = ["The dog", "The flying machine"]
        expected_output_string = [
            "The dog owner asked why did our vet decide there needed to be extra ventilation inside because most"
            " puppies",
            "The flying machine was made by an artist who found it difficult to control it as it did not use",
        ]
        expected_output_string_xla = [
            "The dog has been named in connection with the murder of a 20-year-old man in",
            "The flying machine is a new and improved system to operate and operate a new system and system "
            "system system",
        ]
        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

        output_ids = model.generate(**input_ids, do_sample=True, seed=[7, 0])
        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        self.assertListEqual(output_strings, expected_output_string)

        xla_generate = tf.function(model.generate, jit_compile=True)
        output_ids = xla_generate(**input_ids, do_sample=True, seed=[7, 0])
        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        self.assertListEqual(output_strings, expected_output_string_xla)
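# Background sketch for the `seed` argument used above: TF generation samples with
# stateless random ops, which take a 2-element seed instead of relying on global RNG
# state, so a fixed seed pins down the sampling sequence. This helper is illustrative
# only (its name is made up for this sketch) and is not used by the tests.
def _stateless_seed_demo(self):
    logits = tf.math.log([[0.5, 0.5]])
    # identical 2-element seeds -> identical draws, with no global RNG state involved
    draw_a = tf.random.stateless_categorical(logits, num_samples=5, seed=[7, 0])
    draw_b = tf.random.stateless_categorical(logits, num_samples=5, seed=[7, 0])
    assert draw_a.numpy().tolist() == draw_b.numpy().tolist()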
def test_lm_generate_gpt2(self):
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog

    # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
    # fmt: off
    expected_output_ids = [
        464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,
    ]
    # fmt: on

    output_ids = model.generate(input_ids, do_sample=False)
    self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
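# Sanity-check sketch for the hard-coded ids above: decoding them with the matching
# tokenizer should reproduce the commented sentence. This helper is illustrative only
# (its name is made up for this sketch) and is not used by the tests.
def _decode_expected_ids_demo(self):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # fmt: off
    ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290]
    # fmt: on
    assert tokenizer.decode(ids).startswith("The dog was found in a field")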
def test_lm_generate_distilgpt2_left_padding(self):
    """Tests that the generated text is the same, regardless of left padding"""
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    generation_kwargs = {
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "do_sample": False,
        "repetition_penalty": 1.3,
    }
    expected_output_string = (
        "Today is a beautiful day and I am so happy to be able take part in this amazing event."
    )

    sentences = ["Today is a beautiful day and"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
    # using default length
    output_ids = model.generate(**input_ids, **generation_kwargs)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertEqual(output_strings[0], expected_output_string)

    sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
    # longer max length to capture the full length (remember: it is left padded)
    output_ids = model.generate(**input_ids, **generation_kwargs, max_length=27)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertEqual(output_strings[0], expected_output_string)
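# Illustration of why the padding side matters for decoder-only models like GPT-2:
# with padding_side="left" the pad tokens land before the prompt, so the prompt's
# final token stays rightmost, which is the position generation continues from.
# This helper is illustrative only (its name is made up for this sketch) and is not
# used by the tests.
def _left_padding_demo(self):
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    enc = tokenizer(["Today is a beautiful day and", "Yesterday was"], return_tensors="tf", padding=True)
    # the shorter sentence is padded on the left, so its attention mask starts with 0
    assert enc["attention_mask"][1][0].numpy() == 0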
def test_lm_generate_sample_distilgpt2_batch_special(self):
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["Today is a beautiful day and", "Yesterday was"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

    generation_kwargs = {
        "do_sample": True,
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "repetition_penalty": 1.3,
        "temperature": 1.5,
        "top_k": 500,
        "top_p": 0.9,
        "seed": [42, 0],  # seed set -> deterministic sampling sequence -> deterministic generation
    }

    # forces the generation to happen on CPU, to avoid GPU-related quirks
    with tf.device("/CPU:0"):
        output_ids = model.generate(**input_ids, **generation_kwargs)

    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    expected_output_string = [
        "Today is a beautiful day and we will make you feel very hot/terrific in all your",
        "Yesterday was known by national television networks as Le Big Show or Wild Dog Jeopard",
    ]
    self.assertListEqual(output_strings, expected_output_string)
def test_lm_generate_greedy_distilgpt2_beam_search_special(self):
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["Today is a beautiful day and", "Yesterday was"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

    generation_kwargs = {
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "do_sample": False,
        "num_beams": 2,
    }

    output_ids = model.generate(**input_ids, **generation_kwargs)

    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    expected_output_string = [
        "Today is a beautiful day and a great day for all of us.\n\nI’m",
        "Yesterday was the first time that a person has been arrested in the United States for",
    ]
    self.assertListEqual(output_strings, expected_output_string)
def test_lm_generate_gpt2_greedy_xla_single_input(self):
    # Single-input variant of `test_lm_generate_gpt2_greedy_xla` below (renamed here so the
    # two methods do not shadow each other).
    # TODO (Joao): convert this to an example with a batch size>1 with different input lengths that works (and fix
    # the underlying problem)
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["The dog"]
    expected_output_strings = [
        "The dog was found in a field near the intersection of West and West Streets.\n\nThe dog",
    ]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

    output_ids = model.generate(input_ids, do_sample=False)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)

    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(input_ids, do_sample=False)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)
def test_lm_generate_greedy_distilgpt2_batch_special(self):
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["Today is a beautiful day and", "Yesterday was"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

    generation_kwargs = {
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "do_sample": False,
        "repetition_penalty": 1.3,
    }

    output_ids = model.generate(**input_ids, **generation_kwargs)

    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    expected_output_string = [
        "Today is a beautiful day and I am so happy to be able take part in this amazing event.",
        "Yesterday was a very interesting time for the world to see how much of this is",
    ]
    self.assertListEqual(output_strings, expected_output_string)
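# The `bad_words_ids` entries used throughout these tests are token-id sequences
# obtained by running the tokenizer on the banned phrase; generate() then avoids
# emitting those exact sequences. This helper is illustrative only (its name is
# made up for this sketch) and is not used by the tests.
def _bad_words_ids_demo(self):
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    banned = [tokenizer("is").input_ids, tokenizer("angry about").input_ids]
    # each entry is a (possibly multi-token) id sequence for one banned phrase
    assert all(isinstance(ids, list) and len(ids) >= 1 for ids in banned)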
def test_lm_generate_greedy_distilgpt2_beam_search_special_repetition_penalty(self):
    # Variant of `test_lm_generate_greedy_distilgpt2_beam_search_special` above that also
    # applies a repetition penalty, which changes the expected beams (renamed here so the
    # two methods do not shadow each other).
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["Today is a beautiful day and", "Yesterday was"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

    generation_kwargs = {
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "do_sample": False,
        "repetition_penalty": 1.3,
        "num_beams": 2,
    }

    output_ids = model.generate(input_ids, **generation_kwargs)

    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    expected_output_string = [
        "Today is a beautiful day and I hope you enjoy it.\nI am very happy to announce that",
        "Yesterday was the first time I've ever seen a game where you can play with",
    ]
    self.assertListEqual(output_strings, expected_output_string)
def test_lm_generate_gpt2_xla_sample(self):
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog

    # fmt: off
    expected_output_ids = [
        464, 3290, 550, 284, 307, 4376, 287, 281, 4044, 1363, 329, 734, 812, 878, 852, 4376, 757, 329, 2267, 0
    ]
    # fmt: on

    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(input_ids, do_sample=True, seed=[42, 0])
    self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
def test_lm_generate_gpt2_xla_greedy(self):
    """This test gives the exact same results as the non-XLA test `test_lm_generate_gpt2` above"""
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32)  # The dog

    # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
    # fmt: off
    expected_output_ids = [
        464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290
    ]
    # fmt: on

    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(input_ids, do_sample=False)
    self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
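# Note on the tf.function pattern used in the XLA tests: jit_compile=True compiles
# the generation loop on the first call for a given input signature, and later calls
# with the same signature reuse the cached executable. This helper is illustrative
# only (its name is made up for this sketch) and is not used by the tests.
def _xla_compiles_once_demo(self, model, input_ids):
    xla_generate = tf.function(model.generate, jit_compile=True)
    _ = xla_generate(input_ids, do_sample=False)  # first call: trace + XLA compilation
    _ = xla_generate(input_ids, do_sample=False)  # same signature: reuses the compiled program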
def test_lm_generate_sample_distilgpt2_batch_special_global_seed(self):
    # Variant of `test_lm_generate_sample_distilgpt2_batch_special` above that seeds
    # sampling through the global `tf.random.set_seed` instead of the `seed` argument
    # (renamed here so the two methods do not shadow each other).
    model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["Today is a beautiful day and", "Yesterday was"]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids

    generation_kwargs = {
        "do_sample": True,
        "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
        "no_repeat_ngram_size": 2,
        "repetition_penalty": 1.3,
        "temperature": 1.5,
        "top_k": 500,
        "top_p": 0.9,
    }

    # forces the generation to happen on CPU, to avoid GPU-related quirks
    with tf.device("/CPU:0"):
        tf.random.set_seed(42)  # deterministic sampling sequence -> deterministic generation
        output_ids = model.generate(input_ids, **generation_kwargs)

    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    expected_output_string = [
        "Today is a beautiful day and this makes finding holiday travel easier for you to do other project\nOh",
        "Yesterday was an enjoyable but especially great note though it certainly upset many Democrats who say",
    ]
    self.assertListEqual(output_strings, expected_output_string)
def test_lm_generate_gpt2_greedy_xla(self):
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["The dog", "The flying machine"]
    expected_output_strings = [
        "The dog was found in a field near the intersection of West and West Streets.\n\nThe",
        "The flying machine is a small, lightweight, and lightweight aircraft that can be used for any type of",
    ]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

    output_ids = model.generate(**input_ids, do_sample=False)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)

    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(**input_ids, do_sample=False)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)
def test_lm_generate_gpt2_beam_search_xla(self):
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    sentences = ["The dog", "The flying machine"]
    expected_output_strings = [
        "The dog was found in the backyard of a home in the 6500 block of South Main Street",
        "The flying machine is a very powerful machine, but it's not a very powerful machine. It's",
    ]
    input_ids = tokenizer(sentences, return_tensors="tf", padding=True)

    output_ids = model.generate(**input_ids, do_sample=False, num_beams=2)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)

    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(**input_ids, do_sample=False, num_beams=2)
    output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    self.assertListEqual(output_strings, expected_output_strings)
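# Possible refactor (sketch only, not applied above): the eager-vs-XLA comparison in
# the greedy and beam search tests repeats the same steps and could be factored into
# a helper along these lines. The name and signature are hypothetical.
def _check_xla_matches_eager(self, model, tokenizer, inputs, expected_strings, **generate_kwargs):
    output_ids = model.generate(**inputs, **generate_kwargs)
    self.assertListEqual(tokenizer.batch_decode(output_ids, skip_special_tokens=True), expected_strings)
    xla_generate = tf.function(model.generate, jit_compile=True)
    output_ids = xla_generate(**inputs, **generate_kwargs)
    self.assertListEqual(tokenizer.batch_decode(output_ids, skip_special_tokens=True), expected_strings)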