def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if training_args.should_log: transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = XLNetTokenizerFast.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = XLNetForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names else: column_names = datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. 
tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 
# Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: # Select samples from Dataset, This will help to decrease processing time train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Create Training Features train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Select samples from dataset again since Feature Creation might increase number of features train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. 
tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = datasets["validation"] if data_args.max_eval_samples is not None: # Selecting Eval Samples from Dataset eval_examples = eval_examples.select( range(data_args.max_eval_samples)) # Create Features from Eval Dataset eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase samples size eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(data_args.max_predict_samples)) # Test Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=training_args.output_dir, is_world_process_zero=trainer.is_world_process_zero(), prefix=stage, ) # Format the result to the format the metric expects. 
if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric( "squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if training_args.push_to_hub: kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "question-answering" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name trainer.push_to_hub(**kwargs)
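For a quick sanity check outside the Trainer script above, a minimal sketch of calling the beam-search QA head directly; `xlnet-base-cased` is only a stand-in checkpoint here, and its QA head stays randomly initialized until fine-tuned, so the outputs are mainly useful for checking shapes.

import torch
from transformers import XLNetForQuestionAnswering, XLNetTokenizerFast

tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
model.eval()

question = "Who proposed XLNet?"
context = "XLNet was proposed by researchers at CMU and Google Brain."
inputs = tokenizer(question, context, return_token_type_ids=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Without start/end labels the head returns beam-search style candidates:
# the top start positions, the top end positions for each of them, and an answerability logit.
print(outputs.start_top_log_probs.shape)  # (1, config.start_n_top)
print(outputs.end_top_log_probs.shape)    # (1, config.start_n_top * config.end_n_top)
print(outputs.cls_logits.shape)           # (1,)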
def create_and_check_xlnet_qa(
    self,
    config,
    input_ids_1,
    input_ids_2,
    input_ids_q,
    perm_mask,
    input_mask,
    target_mapping,
    segment_ids,
    lm_labels,
    sequence_labels,
    is_impossible_labels,
    token_labels,
):
    model = XLNetForQuestionAnswering(config)
    model.to(torch_device)
    model.eval()

    result = model(input_ids_1)

    result_with_labels = model(
        input_ids_1,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
        cls_index=sequence_labels,
        is_impossible=is_impossible_labels,
        p_mask=input_mask,
    )

    result_with_labels = model(
        input_ids_1,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
        cls_index=sequence_labels,
        is_impossible=is_impossible_labels,
    )

    total_loss, mems = result_with_labels.to_tuple()

    result_with_labels = model(
        input_ids_1,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
    )

    total_loss, mems = result_with_labels.to_tuple()

    self.parent.assertEqual(result_with_labels.loss.shape, ())
    self.parent.assertEqual(result.start_top_log_probs.shape,
                            (self.batch_size, model.config.start_n_top))
    self.parent.assertEqual(result.start_top_index.shape,
                            (self.batch_size, model.config.start_n_top))
    self.parent.assertEqual(
        result.end_top_log_probs.shape,
        (self.batch_size, model.config.start_n_top * model.config.end_n_top))
    self.parent.assertEqual(
        result.end_top_index.shape,
        (self.batch_size, model.config.start_n_top * model.config.end_n_top))
    self.parent.assertEqual(result.cls_logits.shape, (self.batch_size, ))
    self.parent.assertListEqual(
        [mem.shape for mem in result.mems],
        [(self.seq_length, self.batch_size, self.hidden_size)] *
        self.num_hidden_layers,
    )
def train():
    # Load the pretrained model
    config = XLNetConfig.from_pretrained('xlnet_config.json')
    model = XLNetForQuestionAnswering.from_pretrained('xlnet_model.ckpt.index',
                                                      from_tf=True,
                                                      config=config)
    device = args.device
    model.to(device)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = adabound.AdaBound(optimizer_grouped_parameters, lr=1e-3, final_lr=0.1)

    # Prepare the data
    data = Dureader()
    train_dataloader, dev_dataloader = data.train_iter, data.dev_iter

    best_loss = 100000.0
    model.train()
    for i in range(args.num_train_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")):
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                batch.input_ids, batch.input_mask, batch.segment_ids, batch.start_position, batch.end_position
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                input_ids.to(device), input_mask.to(device), segment_ids.to(device), start_positions.to(device), end_positions.to(device)

            # Compute the loss
            outputs = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs[0]
            loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Update the gradients
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Validate
            if step % args.log_step == 4:
                eval_loss = evaluate.evaluate(model, dev_dataloader)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(), './model_dir/' + "best_model")
                model.train()
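The `evaluate.evaluate` helper called in the validation branch above is project-specific and not shown; a plausible minimal sketch, assuming the dev batches carry the same fields as the training batches, would simply report the mean dev loss:

import torch

def evaluate(model, dev_dataloader):
    # Mean dev-set QA loss; field names assumed to match the training loop above.
    device = next(model.parameters()).device
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in dev_dataloader:
            outputs = model(batch.input_ids.to(device),
                            token_type_ids=batch.segment_ids.to(device),
                            attention_mask=batch.input_mask.to(device),
                            start_positions=batch.start_position.to(device),
                            end_positions=batch.end_position.to(device))
            total_loss += outputs[0].item()
            num_batches += 1
    return total_loss / max(num_batches, 1)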
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file if args.test_file is not None: data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained(args.model_name_or_path) tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path) model = XLNetForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. column_names = raw_datasets["train"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select(range(args.max_train_samples)) # Create train feature from dataset train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select(range(args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). 
sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(args.max_eval_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(args.max_eval_samples)) if args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(args.max_predict_samples)) # Predict Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(args.max_predict_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataset_for_model = eval_dataset.remove_columns( ["example_id", "offset_mapping"]) eval_dataloader = DataLoader(eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) if args.do_predict: predict_dataset_for_model = predict_dataset.remove_columns( ["example_id", "offset_mapping"]) predict_dataloader = DataLoader( predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=args.version_2_with_negative, n_best_size=args.n_best_size, max_answer_length=args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. if args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric( "squad_v2" if args.version_2_with_negative else "squad") def create_and_fill_np_array(start_or_end_logits, dataset, max_len): """ Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor Args: start_or_end_logits(:obj:`tensor`): This is the output predictions of the model. We can only enter either start or end logits. eval_dataset: Evaluation dataset max_len(:obj:`int`): The maximum length of the output tensor. ( See the model.eval() part for more details ) """ step = 0 # create a numpy array and fill it with -100. logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float32) # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather for i, output_logit in enumerate( start_or_end_logits): # populate columns # We have to fill it such that we have to take the whole tensor and replace it on the newly created array # And after every iteration we have to change the step batch_size = output_logit.shape[0] cols = output_logit.shape[1] if step + batch_size < len(dataset): logits_concat[step:step + batch_size, :cols] = output_logit else: logits_concat[step:, :cols] = output_logit[:len(dataset) - step] step += batch_size return logits_concat # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. 
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break # intialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # 
concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, eval_dataset, max_len) start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, eval_dataset, max_len) end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Evaluation metrics: {eval_metric}") if args.do_predict: # intialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(predict_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes( cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append( accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, predict_dataset, max_len) start_top_index_concat = create_and_fill_np_array( all_start_top_index, predict_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, predict_dataset, max_len) end_top_index_concat = create_and_fill_np_array( all_end_top_index, predict_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") if 
args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
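Once `save_pretrained` has run, the fine-tuned weights in `--output_dir` can be reloaded outside the accelerate launcher; a small sketch (the paths are placeholders, and the tokenizer is reloaded from the original checkpoint because this script only saves the model):

from transformers import XLNetForQuestionAnswering, XLNetTokenizerFast

output_dir = "path/to/output_dir"      # whatever --output_dir pointed to
base_checkpoint = "xlnet-base-cased"   # placeholder for --model_name_or_path

model = XLNetForQuestionAnswering.from_pretrained(output_dir)
tokenizer = XLNetTokenizerFast.from_pretrained(base_checkpoint)
model.eval()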
exit(1)

# Config
lr = 2e-5
batch_size = 4
accumulate_batch_size = 32
assert accumulate_batch_size % batch_size == 0
update_stepsize = accumulate_batch_size // batch_size
dataset = sys.argv[4:]
model_path = sys.argv[2]

tokenizer = XLNetTokenizer.from_pretrained(model_path)
model = XLNetForQuestionAnswering.from_pretrained(model_path)
device = torch.device(sys.argv[1])
model.to(device)
model.start_n_top = 5
model.end_n_top = 5

optimizer = AdamW(model.parameters(), lr=lr)
optimizer.zero_grad()

step = 0
patience, best_val = 0, 0
best_state_dict = model.state_dict()
dataloader = get_dataloader('xlnet', 'train', tokenizer,
                            batch_size=batch_size, num_workers=16)

print('Start training...')
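The training loop that follows this setup is not included in the snippet; a hedged sketch using the variables defined above, mirroring the gradient-accumulation schedule implied by `update_stepsize` (the batch field layout is an assumption):

model.train()
for batch in dataloader:
    # Assumes dict-style batches containing input_ids, attention_mask, token_type_ids,
    # start_positions and end_positions so the model returns a loss.
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss / update_stepsize
    loss.backward()
    step += 1
    if step % update_stepsize == 0:
        optimizer.step()
        optimizer.zero_grad()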
def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2,
                              input_ids_q, perm_mask, input_mask,
                              target_mapping, segment_ids, lm_labels,
                              sequence_labels, is_impossible_labels,
                              token_labels):
    model = XLNetForQuestionAnswering(config)
    model.to(torch_device)
    model.eval()

    outputs = model(input_ids_1)
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs

    outputs = model(input_ids_1,
                    start_positions=sequence_labels,
                    end_positions=sequence_labels,
                    cls_index=sequence_labels,
                    is_impossible=is_impossible_labels,
                    p_mask=input_mask)

    outputs = model(input_ids_1,
                    start_positions=sequence_labels,
                    end_positions=sequence_labels,
                    cls_index=sequence_labels,
                    is_impossible=is_impossible_labels)

    total_loss, mems = outputs

    outputs = model(input_ids_1,
                    start_positions=sequence_labels,
                    end_positions=sequence_labels)

    total_loss, mems = outputs

    result = {
        "loss": total_loss,
        "start_top_log_probs": start_top_log_probs,
        "start_top_index": start_top_index,
        "end_top_log_probs": end_top_log_probs,
        "end_top_index": end_top_index,
        "cls_logits": cls_logits,
        "mems": mems,
    }

    self.parent.assertListEqual(list(result["loss"].size()), [])
    self.parent.assertListEqual(
        list(result["start_top_log_probs"].size()),
        [self.batch_size, model.config.start_n_top])
    self.parent.assertListEqual(
        list(result["start_top_index"].size()),
        [self.batch_size, model.config.start_n_top])
    self.parent.assertListEqual(
        list(result["end_top_log_probs"].size()),
        [self.batch_size, model.config.start_n_top * model.config.end_n_top])
    self.parent.assertListEqual(
        list(result["end_top_index"].size()),
        [self.batch_size, model.config.start_n_top * model.config.end_n_top])
    self.parent.assertListEqual(list(result["cls_logits"].size()),
                                [self.batch_size])
    self.parent.assertListEqual(
        list(list(mem.size()) for mem in result["mems"]),
        [[self.seq_length, self.batch_size, self.hidden_size]] *
        self.num_hidden_layers)
def _load_model(self, model_path, device):
    self.model = XLNetForQuestionAnswering.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()
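A companion `predict` method for the same class could greedily decode the top-scored candidate from the beam-search head; a sketch only (the class name and `self.tokenizer` attribute are assumptions, and the full scripts above instead use `postprocess_qa_predictions_with_beam_search` for proper n-best decoding):

import torch

class XLNetQAPredictor:  # hypothetical wrapper that also defines _load_model above
    def predict(self, question, context, device):
        inputs = self.tokenizer(question, context,
                                return_token_type_ids=True,
                                return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Greedily take the best-scored start and its best-scored end.
        start = outputs.start_top_index[0, 0].item()
        end = outputs.end_top_index[0, 0].item()
        if end < start:  # guard against an inverted span
            start, end = end, start
        answer_ids = inputs["input_ids"][0, start:end + 1]
        return self.tokenizer.decode(answer_ids, skip_special_tokens=True)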
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_TYPES), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.", ) # Other parameters parser.add_argument( "--data_dir", default=None, type=str, help="The input data dir. Should contain the .json files for the task." + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--train_file", default=None, type=str, help="The input training file. If a data dir is specified, will look for the file there" + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--predict_file", default=None, type=str, help="The input evaluation file. If a data dir is specified, will look for the file there" + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--version_2_with_negative", action="store_true", help="If true, the SQuAD examples contain some that do not have an answer.", ) parser.add_argument( "--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.", ) parser.add_argument( "--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.", ) parser.add_argument( "--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.", ) parser.add_argument( "--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." ) parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument( "--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", ) parser.add_argument( "--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.", ) parser.add_argument( "--verbose_logging", action="store_true", help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.", ) parser.add_argument( "--lang_id", default=0, type=int, help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)", ) parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features") args = parser.parse_args() if args.doc_stride >= args.max_seq_length - args.max_query_length: logger.warning( "WARNING - You've set a doc stride which may be superior to the document length in some " "examples. 
This could result in errors when building features from the examples. Please reduce the doc " "stride or increase the maximum length to ensure the features are correctly built." ) if ( os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir ): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( args.output_dir ) ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Set seed set_seed(args) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) # model = AutoModelForQuestionAnswering.from_pretrained( model = XLNetForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set. # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will # remove the need for this code, but it is still valid. 
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`.
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        # model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir)  # , force_download=True)
        model = XLNetForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            logger.info("Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c)
                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
                )
        else:
            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
            checkpoints = [args.model_name_or_path]

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    return results
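
# A minimal sketch of the usual command-line entry point for this script; this guard is an
# assumption (it is not shown above), and the script name and file paths in the example
# invocation below are placeholders. All flags referenced are defined by the parser in main().
#
#   python run_squad.py \
#       --model_type xlnet \
#       --model_name_or_path xlnet-base-cased \
#       --do_train --do_eval \
#       --train_file train-v2.0.json \
#       --predict_file dev-v2.0.json \
#       --version_2_with_negative \
#       --output_dir ./xlnet-squadv2-output
if __name__ == "__main__":
    main()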
""" SMALL / MEDIUM / XLNET """ from transformers import BertForQuestionAnswering, BertTokenizer, XLNetForQuestionAnswering, XLNetTokenizer bert_small_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-small-finetuned-squadv2') bert_small_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-small-finetuned-squadv2') print("Bert Small loaded...") bert_med_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-medium-finetuned-squadv2') bert_med_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-medium-finetuned-squadv2') print("Bert Medium loaded...") XLNET_model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') XLNET_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') print("XLNET loaded...") """ SMALL / MEDIUM / DISTIL BASE """ from transformers import BertForQuestionAnswering, BertTokenizer, DistilBertForQuestionAnswering, DistilBertTokenizer bert_small_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-small-finetuned-squadv2') bert_small_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-small-finetuned-squadv2') print("Bert Small loaded...") bert_med_model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-medium-finetuned-squadv2') bert_med_tokenizer = BertTokenizer.from_pretrained('mrm8488/bert-medium-finetuned-squadv2')