def pre_init(self, hparams):
    teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
    n_layer = hparams.student_decoder_layers
    assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this
    d_layers_to_copy = get_layers_to_copy(n_layer, len(teacher.decoder.block))
    e_layers_to_copy: List = get_layers_to_copy(n_layer, len(teacher.encoder.block))
    student_updates = {"num_layers": n_layer}
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = T5Config(**kw)
    student = T5ForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    Path(hparams.output_dir).mkdir(exist_ok=True)
    task_specific_params = student.config.task_specific_params
    if task_specific_params is not None:
        student.config.update(task_specific_params.get("summarization", {}))
    return d_layers_to_copy, student, student_cfg, teacher

def pre_init(self, hparams):
    raise NotImplementedError("T5 Distillation does not work yet")
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)
    teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
    n_layer = hparams.student_decoder_layers
    assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this constraint so that we can do 12-6.
    d_layers_to_copy = get_layers_to_copy(n_layer, len(teacher.decoder.block))
    e_layers_to_copy: List = get_layers_to_copy(n_layer, len(teacher.encoder.block))
    student_updates = {"num_layers": n_layer}
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)
    # Copy weights
    student_cfg = T5Config(**kw)
    student = T5ForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    Path(hparams.output_dir).mkdir(exist_ok=True)
    task_specific_params = student.config.task_specific_params
    if task_specific_params is not None:
        student.config.update(task_specific_params.get("summarization", {}))  # TODO: don't hardcode
    save_dir = self.output_dir.joinpath("student")
    save_dir.mkdir(exist_ok=True)
    student.save_pretrained(save_dir)
    hparams.model_name_or_path = str(save_dir)
    return student, student_cfg, teacher

def convert_model(base_model, path, new_path):
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    print("loading weights...")
    load_tf_weights_in_t5(model, None, path)
    model.eval()
    print("saving HF weights...")
    model.save_pretrained(new_path)

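# A minimal usage sketch for convert_model above (not part of the original
# snippet); the base model name and the two paths are hypothetical placeholders.
if __name__ == "__main__":
    convert_model(
        base_model="t5-base",               # architecture/config to instantiate
        path="/path/to/tf_checkpoint",      # directory holding the TF checkpoint
        new_path="/path/to/hf_model",       # where save_pretrained() writes the HF weights
    )
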
def __init__(self, config):
    super().__init__(config)
    self.save_mem = config.save_mem
    # self.t5 = T5ForConditionalGeneration(config)
    if self.save_mem:
        self.dummy_tensor = torch.ones(1, dtype=torch.float32, requires_grad=True)
        self.t5_wrapped = ModuleWrapperIgnores2ndArg(T5ForConditionalGeneration(config))
    else:
        self.t5 = self.t5_wrapped = T5ForConditionalGeneration(config)
    # choose to use hidden states or softmaxed values for classification? currently softmaxed_values
    # self.classifier = Linear(int((config.max_seq_len/2)-1), 1)
    self.init_weights()

def create_t5_and_check_t5_generate_with_past_key_value_states(
    self,
    config,
    input_ids,
    decoder_input_ids,
    attention_mask,
    decoder_attention_mask,
    lm_labels,
):
    model = T5ForConditionalGeneration(config=config)
    model.to(torch_device)
    model.eval()
    torch.manual_seed(0)
    output_without_past_cache = model.generate(
        input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
    )
    torch.manual_seed(0)
    output_with_past_cache = model.generate(
        input_ids[:1], num_beams=2, max_length=5, do_sample=True
    )
    self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))

def __init__(self, model: str = None):
    log.info(model)
    torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    log.info(torch_device)
    if model is None:
        model = "t5"
    self.modelName = model
    # path to all the files that will be used for inference
    self.path = f"./app/api/{model}/"
    self.model_path = self.path + "pytorch_model.bin"
    self.config_path = self.path + "config.json"
    # Selecting the correct model based on the passed model input. Default: t5
    if model == "t5":
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        self.model.eval()
        self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
    elif model == "google/pegasus-newsroom":
        self.config = PegasusConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = PegasusTokenizer.from_pretrained(model)
    elif model == "facebook/bart-large-cnn":
        self.config = BartConfig.from_json_file(self.config_path)
        # self.model = PegasusForConditionalGeneration(self.config)
        # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
        self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
        self.tokenizer = BartTokenizer.from_pretrained(model)
    else:
        raise Exception("This model is not supported")
    self.text = str()

def build_model_and_check_forward_pass(self, **kwargs):
    tester = T5ModelTester(self, **kwargs)
    config, *inputs = tester.prepare_config_and_inputs()
    (
        input_ids,
        decoder_input_ids,
        attention_mask,
        decoder_attention_mask,
        lm_labels,
    ) = inputs
    model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    outputs = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )
    # outputs = model(*inputs)
    assert len(outputs) == 4
    assert outputs["logits"].size() == (tester.batch_size, tester.decoder_seq_length, tester.vocab_size)
    assert outputs["loss"].size() == ()
    return model

def create_and_check_t5_with_lm_head(
    self,
    config,
    input_ids,
    decoder_input_ids,
    attention_mask,
    decoder_attention_mask,
    lm_labels,
):
    model = T5ForConditionalGeneration(config=config)
    model.to(torch_device)
    model.eval()
    outputs = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )
    loss, prediction_scores, _, _ = outputs
    self.parent.assertEqual(len(outputs), 4)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.decoder_seq_length, self.vocab_size],
    )
    self.check_loss_output(result)

def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = T5Config.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = T5ForConditionalGeneration(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_t5(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)

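# A possible direct invocation of the converter above (illustrative only; the
# three paths are placeholders, and the original script may wrap this in argparse).
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path="/path/to/t5/tf_checkpoint",
        config_file="/path/to/t5/config.json",
        pytorch_dump_path="/path/to/t5/pytorch_dump",
    )
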
def create_and_check_with_lm_head(
    self,
    config,
    input_ids,
    decoder_input_ids,
    attention_mask,
    decoder_attention_mask,
    lm_labels,
):
    model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    outputs = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )
    self.parent.assertEqual(len(outputs), 4)
    self.parent.assertEqual(
        outputs["logits"].size(),
        (self.batch_size, self.decoder_seq_length, self.vocab_size),
    )
    self.parent.assertEqual(outputs["loss"].size(), ())

def main(
    model_path: str,
    corpus: Corpus = "kaggle",
    split_name: str = "valid",
    max_len: int = 128,
    batch_size: int = 32,
):
    if "mt5" in Path(model_path).stem:
        tokenizer = MT5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = MT5ForConditionalGeneration(
            MT5Config.from_pretrained(model_path)
        ).eval()
    else:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = T5ForConditionalGeneration(
            T5Config.from_pretrained(model_path)
        ).eval()
    shrink_vocab(model_path, model)
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    model.load_state_dict(torch.load(Path(model_path) / "pytorch_model.bin"))
    model = model.cuda()
    # model.load_state_dict(torch.load(model_path))
    context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
    context_tokens_2 = tokenizer.encode("premise:")[:-1]
    collate_fn = partial(
        collate_batch,
        pad=model.config.decoder_start_token_id,
        decode_start_token=model.config.pad_token_id,
        max_len=max_len,
        is_classifier=True,
    )
    dataset = XNLIDataset(corpus, split_name + ".jbl", context_tokens_1, context_tokens_2)
    data_loader = DataLoader(
        dataset,
        num_workers=1,
        shuffle=False,
        drop_last=False,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    preds, labels = [], []
    for input_batch, label_batch in tqdm(data_loader, ncols=100):
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = torch.argmax(outputs["logits"][:, 0, :].cpu(), dim=-1)
        preds.append(preds_local.numpy())
        labels.append(np.asarray([x[0] for x in label_batch["ids"].cpu().numpy()]))
    full_labels = np.concatenate(labels)
    full_preds = np.concatenate(preds)
    # print("Label mapping:")
    # for key in np.unique(full_labels):
    #     print(f"{key}: {tokenizer.decode([key])}")
    print("Labels:")
    print(pd.Series(full_labels).value_counts())
    print("Predictions:")
    print(pd.Series(full_preds).value_counts())
    print("Acc: %.2f%%" % (np.mean(full_labels == full_preds) * 100))

def get_model(tokenizer_len=None):
    if args.mode == 'train' or args.mode == 'test_without_train':
        model = T5ForConditionalGeneration.from_pretrained(
            args.t5_model, cache_dir=args.cache_dir)
        if tokenizer_len is not None:
            model.resize_token_embeddings(tokenizer_len)
    elif args.mode == 'test' or args.mode == 'continue_train':
        model = T5ForConditionalGeneration(
            T5Config.from_json_file(output_config_file))
        model.load_state_dict(torch.load(output_model_file))
    else:
        raise NotImplementedError(
            f'No such mode called {args.mode}, error raised from get_model.')
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    return model.to(device)

def load_pretained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a pretrained T5 model fine-tuned on UnifiedQA.

    base_model: base model name for T5
    model_dict_path: trained model checkpoint for UnifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)
    return tokenizer, model

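# A hypothetical call to the loader above (paths and device string are
# illustrative placeholders, not values from the original project).
tokenizer, model = load_pretained_model_and_tokenizer(
    base_model="t5-large",
    model_dict_path="/path/to/unifiedqa_checkpoint",
    gpu_device="cuda:0",
    eval=False,  # False: load TF weights into the fresh T5 model; True: torch.load a saved model object
)
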
def __init__(self, model: str = None, service: str = "summ"):
    if model is None:
        model = "t5"
    # path to all the files that will be used for inference
    self.path = f"./{service}/{model}/"
    self.model_path = self.path + "model.bin"
    self.config_path = self.path + "config.json"
    # Selecting the correct model based on the passed model input. Default: t5
    if model == "t5":
        self.config = T5Config.from_json_file(self.config_path)
        self.model = T5ForConditionalGeneration(self.config)
        self.tokenizer = T5Tokenizer.from_pretrained(self.path)
    else:
        raise Exception("This model is not supported")
    self.model.eval()
    self.model.load_state_dict(torch.load(self.model_path, map_location=device))
    self.text = str()

def __init__(self, hparams, batch_fn=None):
    """
    :param batch_fn: function to process batch
    """
    super(RaceModule, self).__init__(hparams, batch_fn)

    if self.hparams.pretrained_model in ["t5-base", "t5-small"]:
        # Model:
        config = T5Config(decoder_start_token_id=self.hparams.padding_token)
        self.model = T5ForConditionalGeneration(config).from_pretrained(self.hparams.pretrained_model)
        # Tokenizer:
        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.pretrained_model)
        self.tokenizer.add_special_tokens(
            {"additional_special_tokens": ["[CON]", "[QUE]", "[ANS]", "[DIS]"]})
        # Metrics:
        self.metrics = Metrics()
        try:
            self.model.resize_token_embeddings(self.hparams.tokenizer_len)
        except:
            self.model.resize_token_embeddings(32104)
    else:
        raise NotImplementedError

def __init__(self, tokenizer):
    super(T5Model, self).__init__()
    self.tokenizer = tokenizer
    config = T5Config.from_pretrained('t5-small')
    self.model = T5ForConditionalGeneration(config=config)

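# A minimal usage sketch (not part of the original module), assuming the class
# above is named T5Model as its super() call suggests and a reasonably recent
# transformers version. Note that T5Config.from_pretrained only loads the
# configuration, so self.model starts from randomly initialised weights.
tok = T5Tokenizer.from_pretrained("t5-small")
module = T5Model(tok)
enc = tok("translate English to German: Hello", return_tensors="pt")
tgt = tok("Hallo", return_tensors="pt")
out = module.model(
    input_ids=enc.input_ids,
    attention_mask=enc.attention_mask,
    labels=tgt.input_ids,  # teacher-forced labels; the loss is the first output
)
print(out[0])  # index 0 works for both tuple and ModelOutput return types
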
def execute_inference(
    self,
    metadata: NetworkMetadata,
    network_fpaths: NetworkModels,
    inference_input: str,
    timing_profile: TimingProfile,
) -> NetworkResult:
    # Execute some tests
    tokenizer = T5Tokenizer.from_pretrained(metadata.variant)
    input_ids = tokenizer(inference_input, return_tensors="pt").input_ids

    # By default, huggingface model structure is one giant file.
    t5_torch_fpath = network_fpaths.torch[0].fpath
    config = T5Config(
        use_cache=metadata.other.kv_cache,
        num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
    )
    t5_model = T5ForConditionalGeneration(config).from_pretrained(t5_torch_fpath)

    t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
    t5_torch_decoder = T5DecoderTorchFile.TorchModule(
        t5_model.decoder, t5_model.lm_head, t5_model.config)

    encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
        t5_torch_encoder, input_ids, timing_profile)
    _, decoder_e2e_median_time = decoder_inference(
        t5_torch_decoder, input_ids, encoder_last_hidden_state, timing_profile)
    decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
        t5_torch_encoder,
        t5_torch_decoder,
        input_ids,
        tokenizer,
        timing_profile,
        max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[metadata.variant],
    )

    # Remove the padding and end tokens.
    semantic_outputs = tokenizer.convert_ids_to_tokens(
        decoder_output_greedy.tolist()[0])[1:-1]
    remove_underscore = "".join(
        [s.replace("\u2581", " ") for s in semantic_outputs])

    return NetworkResult(
        input=inference_input,
        output_tensor=encoder_last_hidden_state,
        semantic_output=remove_underscore.strip(),
        median_runtime=[
            NetworkRuntime(
                name=T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME,
                runtime=decoder_e2e_median_time,
            ),
            NetworkRuntime(
                name=T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME,
                runtime=encoder_e2e_median_time,
            ),
            NetworkRuntime(
                name=T5ModelTRTConfig.NETWORK_FULL_NAME,
                runtime=full_e2e_median_runtime,
            ),
        ],
        models=network_fpaths,
    )

def generate_and_download_framework(
        self, metadata: NetworkMetadata, workspace: NNFolderWorkspace) -> NetworkModels:
    cache_variant = False
    if metadata.other.kv_cache:
        cache_variant = True

    trt_t5_config = self.config
    metadata_serialized = trt_t5_config.get_metadata_string(metadata)
    workspace_dir = workspace.get_path()

    pytorch_model_dir = os.path.join(workspace_dir, metadata_serialized)
    # We keep track of the generated torch location for cleanup later
    self.torch_t5_dir = pytorch_model_dir

    model = None
    tfm_config = T5Config(
        use_cache=cache_variant,
        num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
    )
    if not os.path.exists(pytorch_model_dir):
        # Generate the pre-trained weights
        model = T5ForConditionalGeneration(tfm_config).from_pretrained(metadata.variant)
        model.save_pretrained(pytorch_model_dir)
        print("Pytorch Model saved to {}".format(pytorch_model_dir))
    else:
        print("Frameworks file already exists, skipping generation and loading from file instead.")
        model = T5ForConditionalGeneration(tfm_config).from_pretrained(pytorch_model_dir)

    # These ONNX models can be converted using special encoder and decoder classes.
    root_onnx_model_name = "{}.onnx".format(metadata_serialized)
    root_onnx_model_fpath = os.path.join(os.getcwd(), workspace_dir, root_onnx_model_name)
    encoder_onnx_model_fpath = root_onnx_model_fpath + "-encoder.onnx"
    decoder_onnx_model_fpath = root_onnx_model_fpath + "-decoder-with-lm-head.onnx"

    t5_encoder = T5EncoderTorchFile(model, metadata)
    t5_decoder = T5DecoderTorchFile(model, metadata)
    self.onnx_t5_encoder = t5_encoder.as_onnx_model(
        encoder_onnx_model_fpath, force_overwrite=False)
    self.onnx_t5_decoder = t5_decoder.as_onnx_model(
        decoder_onnx_model_fpath, force_overwrite=False)

    onnx_models = [
        NetworkModel(
            name=T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME,
            fpath=self.onnx_t5_decoder.fpath,
        ),
        NetworkModel(
            name=T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME,
            fpath=self.onnx_t5_encoder.fpath,
        ),
    ]
    torch_models = [
        NetworkModel(name=T5ModelTRTConfig.NETWORK_FULL_NAME, fpath=pytorch_model_dir)
    ]

    return NetworkModels(torch=torch_models, onnx=onnx_models, trt=None)

def __init__(self,
             hparams: argparse.Namespace,
             num_labels=None,
             mode="base",
             config=None,
             tokenizer=None,
             model=None,
             **config_kwargs):
    """Initialize a model, tokenizer and config."""
    super().__init__()
    # TODO: move to self.save_hyperparameters()
    # self.save_hyperparameters()
    # can also expand arguments into trainer signature for easier reading
    self.save_hyperparameters(hparams)
    self.step_count = 0
    self.output_dir = Path(self.hparams.output_dir)
    cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    if config is None:
        self.config = AutoConfig.from_pretrained(
            self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
            **({"num_labels": num_labels} if num_labels is not None else {}),
            cache_dir=cache_dir,
            **config_kwargs,
        )
    else:
        self.config: PretrainedConfig = config

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
        if getattr(self.hparams, p, None):
            assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
            setattr(self.config, p, getattr(self.hparams, p))

    if tokenizer is None:
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
            # self.hparams.model_name_or_path,
            cache_dir=cache_dir,
        )
    else:
        self.tokenizer: PreTrainedTokenizer = tokenizer
    self.model_type = MODEL_MODES[mode]
    if model is None:
        try:
            self.model = self.model_type.from_pretrained(
                self.hparams.model_name_or_path,
                from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
                config=self.config,
                cache_dir=cache_dir,
            )
        except:
            self.model = T5ForConditionalGeneration(config=self.config)
    else:
        self.model = model

def __init__(
    self,
    model_name,
    args=None,
    tokenizer=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a T5Model model.

    Args:
        model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa: ignore flake8"

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, T5Args):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = sweep_config_to_sweep_values(sweep_config)
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                "Make sure CUDA is available or set `use_cuda=False`.")
    else:
        self.device = "cpu"

    self.results = {}

    if model_name is None:
        self.config = self.args.config
        self.model = T5ForConditionalGeneration(config=self.config)
    else:
        self.config = T5Config.from_pretrained(model_name, **self.args.config)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, config=self.config)

    if isinstance(tokenizer, T5Tokenizer):
        self.tokenizer = tokenizer
    else:
        self.tokenizer = T5Tokenizer.from_pretrained(model_name, truncate=True)
        self.model.resize_token_embeddings(len(self.tokenizer))

    if self.args.dynamic_quantize:
        self.model = torch.quantization.quantize_dynamic(
            self.model, {torch.nn.Linear}, dtype=torch.qint8)

    if not use_cuda:
        self.args.fp16 = False

    self.args.model_type = "T5"
    if model_name is None:
        self.args.model_name = "T5_from_scratch"
    else:
        self.args.model_name = model_name

    if self.args.wandb_project and not wandb_available:
        warnings.warn(
            "wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None

from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_t5 import load_tf_weights_in_t5
from flask import Flask, request, jsonify

app = Flask(__name__)

base_model = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
load_tf_weights_in_t5(model, None, "/data/")
model.eval()

ret_dict = {
    'low air quality': 'LowAirQuality',
    'low humidity': 'LowHumidity',
    'low brightness': 'LowBrightness',
    'low noise level': 'LowNoise',
    'low security': 'LowSecurity',
    'low temperature': 'LowTemperature',
    'high air quality': 'HighAirQuality',
    'high humidity': 'HighHumidity',
    'high brightness': 'HighBrightness',
    'high noise level': 'HighNoise',
    'high security': 'HighSecurity',
    'high temperature': 'HighTemperature'
}


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")

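# The snippet above is cut off inside run_model. Below is a minimal sketch of
# how such a helper is commonly finished with the standard generate/decode API;
# it is an assumption for illustration, not the original implementation.
def run_model_sketch(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    output_ids = model.generate(input_ids, **generator_args)
    # Decode each generated sequence back to text, dropping special tokens.
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
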
("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) config = T5Config( vocab_size=52_000, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=6, type_vocab_size=1, ) tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512) model = T5ForConditionalGeneration(config=config) model.num_parameters() train_dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path=f"{data_dir}/train_texts.txt", block_size=128, ) test_dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path=f"{data_dir}/valid_texts.txt", block_size=128, ) data_collator = DataCollatorForLanguageModeling(
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            "This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path != "new":
        model = T5ForConditionalGeneration.from_pretrained(model_args.model_name_or_path)
    else:
        config = AutoConfig.from_pretrained("t5-small")
        model = T5ForConditionalGeneration(config=config)
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    data_collator = DoNothingDataCollator()

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        # trainer.train(model_path=model_path)
        trainer.train()
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        model.eval()
        data_collator = DoNothingDataCollatorForGeneration()
        sampler = SequentialSampler(eval_dataset)
        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=training_args.eval_batch_size,
            collate_fn=data_collator.collate_batch,
        )
        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        writer = open(output_eval_file, "w")
        for inputs in tqdm(data_loader, "Prediction"):
            for k, v in inputs.items():
                inputs[k] = v.cuda()
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=12)
            dec = [tokenizer.decode(ids) for ids in outputs]
            for i in range(0, len(dec)):
                writer.write(dec[i] + "\n")

    return results