def test_generate_fp16(self): config, input_ids, batch_size = self._get_config_and_data() attention_mask = input_ids.ne(1).to(torch_device) model = FSMTForConditionalGeneration(config).eval().to(torch_device) if torch_device == "cuda": model.half() model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration mname = "facebook/wmt19-en-de" tokenizer = FSMTTokenizer.from_pretrained(mname) # get the correct vocab sizes, etc. from the master model config = FSMTConfig.from_pretrained(mname) config.update(dict( d_model=4, encoder_layers=1, decoder_layers=1, encoder_ffn_dim=4, decoder_ffn_dim=4, encoder_attention_heads=1, decoder_attention_heads=1)) tiny_model = FSMTForConditionalGeneration(config) print(f"num of params {tiny_model.num_parameters()}") # Test batch = tokenizer(["Making tiny model"], return_tensors="pt") outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) # Save mname_tiny = "tiny-wmt19-en-de" tiny_model.half() # makes it smaller tiny_model.save_pretrained(mname_tiny) tokenizer.save_pretrained(mname_tiny) print(f"Generated {mname_tiny}") # Upload # transformers-cli upload tiny-wmt19-en-de