def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = XLNetTokenizerFast.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = XLNetForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others get 1.0.
            # The CLS token also gets 0.0 (it is used for predictions of empty answers).
tokenized_examples["p_mask"].append( [ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ] ) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append(token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append(token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if training_args.do_train: train_dataset = datasets["train"].map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). 
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id, and we will store the offset mappings.
        tokenized_examples["example_id"] = []
        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others 1.0.
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping entries that are not part of the context, so it's easy to determine if a
            # token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer)

    # Post-processing:
    def post_processing_function(examples, features, predictions):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    # TODO: Once the fix lands in a Datasets release, remove the _local here and the squad_v2_local folder.
    current_dir = os.path.dirname(__file__)
    metric = load_metric(
        os.path.join(current_dir, "squad_v2_local") if data_args.version_2_with_negative else "squad"
    )

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
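# A minimal sketch of the usual entry point for scripts in this family (assumed here,
# not shown in the excerpt); the `_mp_fn` wrapper is only needed for TPU launchers.
def _mp_fn(index):
    # For xla_spawn (TPUs); `index` is the process index supplied by the launcher.
    main()


if __name__ == "__main__":
    main()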
def _build_vocab(self, max_vocab_cnt):
    # Build the vocabulary.
    if self.tokenizer_type.startswith('word'):
        self._build_vocab_manual(max_vocab_cnt)

    elif self.tokenizer_type.startswith('bert-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        # self.vocab_count = 30522  # fixed for pretrained BERT vocab (old version)
        config_pretrained = BertConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        # Store the id -> piece and piece -> id maps.
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    elif self.tokenizer_type.startswith('xlnet-'):
        # self.vocab = self.tokenizer.vocab
        # self.rev_vocab = self.tokenizer.ids_to_tokens
        # self.pad_id = self.vocab["[PAD]"]
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        # self.vocab_count = 32000  # fixed for pretrained vocab
        config_pretrained = XLNetConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    elif self.tokenizer_type.startswith('t5-'):  # T5 tokenizers (e.g. 't5-base')
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        # self.vocab_count = 32000
        config_pretrained = T5Config.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    elif self.tokenizer_type.startswith('bart-'):
        self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
        # self.vocab_count = 32000
        config_pretrained = BartConfig.from_pretrained(self.tokenizer_type)
        self.vocab_count = config_pretrained.vocab_size
        map_vocab = {}
        for ind in range(self.vocab_count):
            map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)
        inv_map = {v: k for k, v in map_vocab.items()}
        self.vocab = map_vocab
        self.rev_vocab = inv_map

    return
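# A minimal sketch of the id <-> piece round trip the branches above rely on, assuming
# a SentencePiece-backed tokenizer such as XLNetTokenizer (not part of the original):
from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
pad_id = tokenizer.sp_model.piece_to_id("<pad>")        # special-token string -> id
piece = tokenizer.sp_model.id_to_piece(pad_id)          # id -> piece string
assert tokenizer.sp_model.piece_to_id(piece) == pad_id  # round trip is stable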
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            FASTA_DATASET = True
            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO: get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False,
        )
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
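# The FASTA branch above depends on `load_dataset_fasta` and `FastaDataset`, which are
# defined elsewhere in this project. A minimal sketch of what they plausibly look like
# (names kept from the original, bodies assumed), matching how they are used above:
import torch


class FastaDataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings so the Trainer can index them like a dataset."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


def load_dataset_fasta(data_files, max_seq_length):
    """Reads each FASTA file into {'sequences': [...]}, truncating to max_seq_length."""
    datasets = {}
    for split, path in data_files.items():
        sequences, current = [], []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line.startswith(">"):  # a header line starts a new record
                    if current:
                        sequences.append("".join(current)[:max_seq_length])
                        current = []
                elif line:
                    current.append(line)
        if current:
            sequences.append("".join(current)[:max_seq_length])
        datasets[split] = {"sequences": sequences}
    return datasets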
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them into smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead of this drop if the model supported it, and
            # you can customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length // max_seq_length) * max_seq_length
            # Split into chunks of max_seq_length.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing.
        # See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        with training_args.main_process_first(desc="grouping texts together"):
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
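# A quick sanity check (assumed, not part of the original script) of what the
# permutation-LM collator emits for a toy batch; exact keys may vary slightly
# across transformers versions:
from transformers import XLNetTokenizer, DataCollatorForPermutationLanguageModeling

tok = XLNetTokenizer.from_pretrained("xlnet-base-cased")
collator = DataCollatorForPermutationLanguageModeling(tokenizer=tok, plm_probability=1 / 6, max_span_length=5)
# The collator requires even sequence lengths, so pad to an even max_length.
enc = tok("Hello world, this is a test.", padding="max_length", max_length=16)
batch = collator([{"input_ids": enc["input_ids"]}])
print(batch.keys())  # expected: input_ids, perm_mask, target_mapping, labels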
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging: we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(args.model_name_or_path)
    tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path)
    model = XLNetForQuestionAnswering.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    column_names = raw_datasets["train"].column_names

    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )

    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others get 1.0.
            # The CLS token also gets 0.0 (it is used for predictions of empty answers).
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if args.max_train_samples is not None:
        # Select a subset of the dataset if the argument is specified.
        train_dataset = train_dataset.select(range(args.max_train_samples))
    # Create train features from the dataset.
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )
    if args.max_train_samples is not None:
        # The number of samples can grow during feature creation, so select only the specified max samples again.
        train_dataset = train_dataset.select(range(args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id, and we will store the offset mappings.
        tokenized_examples["example_id"] = []
        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(args.max_eval_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(args.max_eval_samples)) if args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(args.max_predict_samples)) # Predict Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(args.max_predict_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )

    eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
    eval_dataloader = DataLoader(
        eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
    )

    if args.do_predict:
        predict_dataset_for_model = predict_dataset.remove_columns(["example_id", "offset_mapping"])
        predict_dataloader = DataLoader(
            predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
        )

    # Post-processing:
    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=args.version_2_with_negative,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")

    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
        """
        Create and fill a numpy array of size len_of_validation_data * max_length_of_output_tensor.

        Args:
            start_or_end_logits (:obj:`tensor`):
                The output predictions of the model. We can only enter either start or end logits.
            dataset: Evaluation dataset.
            max_len (:obj:`int`):
                The maximum length of the output tensor (see the model.eval() part for more details).
        """
        step = 0
        # Create a numpy array and fill it with -100.
        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float32)
        # Now that we have created the array, we populate it with the outputs gathered using accelerator.gather.
        for i, output_logit in enumerate(start_or_end_logits):  # populate columns
            # We fill it such that we take the whole tensor and place it in the newly created array,
            # and after every iteration we advance the step by the batch size.
            batch_size = output_logit.shape[0]
            cols = output_logit.shape[1]
            if step + batch_size < len(dataset):
                logits_concat[step : step + batch_size, :cols] = output_logit
            else:
                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
            step += batch_size
        return logits_concat

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note: the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in a multi-process setup).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

    # Evaluation: initialize all lists to collect the batches.
    all_start_top_log_probs = []
    all_start_top_index = []
    all_end_top_log_probs = []
    all_end_top_index = []
    all_cls_logits = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            start_top_log_probs = outputs.start_top_log_probs
            start_top_index = outputs.start_top_index
            end_top_log_probs = outputs.end_top_log_probs
            end_top_index = outputs.end_top_index
            cls_logits = outputs.cls_logits

            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
                start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
                end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
                end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
                cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)

            all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
            all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
            all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
            all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
            all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())

    max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
    # Concatenate all numpy arrays collected above.
    start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, eval_dataset, max_len)
    start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len)
    end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, eval_dataset, max_len)
    end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len)
    cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

    # Delete the lists of numpy arrays.
    del start_top_log_probs
    del start_top_index
    del end_top_log_probs
    del end_top_index
    del cls_logits

    outputs_numpy = (
        start_top_log_probs_concat,
        start_top_index_concat,
        end_top_log_probs_concat,
        end_top_index_concat,
        cls_logits_concat,
    )
    prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
    eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
    logger.info(f"Evaluation metrics: {eval_metric}")

    if args.do_predict:
        # Initialize all lists to collect the batches.
        all_start_top_log_probs = []
        all_start_top_index = []
        all_end_top_log_probs = []
        all_end_top_index = []
        all_cls_logits = []
        for step, batch in enumerate(predict_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
                start_top_log_probs = outputs.start_top_log_probs
                start_top_index = outputs.start_top_index
                end_top_log_probs = outputs.end_top_log_probs
                end_top_index = outputs.end_top_index
                cls_logits = outputs.cls_logits

                if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                    start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
                    start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
                    end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
                    end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
                    cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)

                all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
                all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
                all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
                all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
                all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())

        max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
        # Concatenate all numpy arrays collected above.
        start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, predict_dataset, max_len)
        start_top_index_concat = create_and_fill_np_array(all_start_top_index, predict_dataset, max_len)
        end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, predict_dataset, max_len)
        end_top_index_concat = create_and_fill_np_array(all_end_top_index, predict_dataset, max_len)
        cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

        # Delete the lists of numpy arrays.
        del start_top_log_probs
        del start_top_index
        del end_top_log_probs
        del end_top_index
        del cls_logits

        outputs_numpy = (
            start_top_log_probs_concat,
            start_top_index_concat,
            end_top_log_probs_concat,
            end_top_index_concat,
            cls_logits_concat,
        )
        prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy)
        predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
        logger.info(f"Predict metrics: {predict_metric}")
args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
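# The snippet above calls create_and_fill_np_array() without defining it. A minimal
# sketch of what such a helper could look like (an assumption, not the author's code):
# pre-allocate an array of shape (num_examples, max_len) filled with the -100 pad
# index, then copy each gathered batch in, truncating the final batch to the dataset
# length (gather() can duplicate samples on the last batch).
import numpy as np

def create_and_fill_np_array(batches, dataset, max_len):
    out = np.full((len(dataset), max_len), -100, dtype=np.float64)
    step = 0
    for batch in batches:
        cols = batch.shape[1]
        rows = min(batch.shape[0], len(dataset) - step)
        out[step:step + rows, :cols] = batch[:rows]
        step += rows
    return out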
def main():
    # Section: Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        # torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int, help="Indicate batch size")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--val_logging_step", default=100000, type=int, help="Number of steps in between logs of performance on validation set")
    parser.add_argument("--train_logging_step", default=1000, type=int, help="Number of steps in between logs of performance on training set")
    parser.add_argument("--save_step", default=100000, type=int, help="Number of steps to save model parameters")
    parser.add_argument("--model_id", type=str, help="Model and optimizer will be saved at '/gpfs/data/razavianlab/capstone19/models/model_id'. ")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--feature_save_dir", type=str, help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/feature_save_dir'. ")
    parser.add_argument("--model_type", default="base", type=str, help="Whether to use the xlnet base model or the xlnet large model")
    parser.add_argument("--learning_rate", default=4e-5, type=float, help="Learning rate for optimizer")
    args = parser.parse_args()

    # Set random seed
    set_seeds(seed=args.seed, n_gpu=n_gpu)

    # Load data
    feature_save_path = os.path.join('/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir)
    logger.info("Loading train dataset")
    train_dataloader = load_featurized_examples(args.batch_size, set_type="train", feature_save_path=feature_save_path)
    logger.info("Loading validation dataset")
    val_dataloader = load_featurized_examples(args.batch_size, set_type="val", feature_save_path=feature_save_path)

    # Load pretrained model
    num_train_optimization_steps = args.num_train_epochs * len(train_dataloader)
    if args.model_type == "large":
        config = XLNetConfig.from_pretrained('xlnet-large-cased', num_labels=2292)
        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', config=config)
    else:
        config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=2292)  # TODO: check if we need this
        model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config)
    model.to(device)

    optimizer, scheduler, model = initialize_optimizer(model, train_dataloader, args)

    logger.info("***** Running training *****")
    logger.info("  Num batches = %d", len(train_dataloader))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size = %d", args.batch_size)
    logger.info("  Total optimization steps = %d", len(train_dataloader) * args.num_train_epochs)

    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    train(train_dataloader=train_dataloader, val_dataloader=val_dataloader, model=model,
          optimizer=optimizer, scheduler=scheduler, num_train_epochs=args.num_train_epochs,
          n_gpu=n_gpu, device=device, model_id=args.model_id, save_step=args.save_step,
          train_logging_step=args.train_logging_step, val_logging_step=args.val_logging_step)
def main(): # Set device for PyTorch if torch.cuda.is_available(): # might need to update when using more than 1 GPU rank = 0 torch.cuda.set_device(rank) device = torch.device("cuda", rank) #torch.distributed.init_process_group(backend='nccl') n_gpu = torch.cuda.device_count() else: device = torch.device("cpu") n_gpu = 0 print("N GPU: ", n_gpu) # Parse arguments parser = argparse.ArgumentParser() parser.add_argument( "--model_id", type=str, help= "Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. " ) parser.add_argument( "--checkpoint", type=str, help= "Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. " ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( "--feature_save_dir", type=str, help= "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. " ) parser.add_argument("--set_type", type=str, help="Specify train/test file.") args = parser.parse_args() # Load training data feature_save_path = os.path.join( '/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir) logger.info("Loading test dataset") test_dataloader = load_featurized_examples( batch_size=32, set_type=args.set_type, feature_save_path=feature_save_path) # Load saved model model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/', args.model_id, 'model_checkpoint_' + args.checkpoint) logger.info("Loading saved model from {}".format(model_path)) config = XLNetConfig.from_pretrained( os.path.join(model_path, 'config.json'), num_labels=2292) # TODO: check if we need this model = XLNetForSequenceClassification.from_pretrained(model_path, config=config) model.to(device) model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu))) summaries = torch.empty(0, config.d_model).to(device) all_doc_ids = torch.empty(0).to(device) all_label_ids = torch.empty(0, 2292).to(device) for i, batch in enumerate(test_dataloader): model.eval() with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, doc_ids = batch input_ids = input_ids.to(device).long() input_mask = input_mask.to(device).long() segment_ids = segment_ids.to(device).long() doc_ids = doc_ids.to(device).float() label_ids = label_ids.to(device).float() transformer_outputs = model.module.transformer( input_ids=input_ids, token_type_ids=segment_ids, input_mask=input_mask) output = transformer_outputs[0] # extracting the CLS token summary = output[:, 0] summary = summary.to(device) summaries = torch.cat([summaries, summary], dim=0) all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0) all_label_ids = torch.cat([all_label_ids, label_ids], dim=0) # Average the representation of the CLS token for all examples from the same document mask = torch.zeros(int(all_doc_ids.max().item()) + 1, len(summaries)) mask[all_doc_ids.long(), torch.arange(len(summaries))] = 1 averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1).to(device) mean_summaries = torch.mm(averaging_matrix, summaries) print("mean summaries.shape", mean_summaries.size()) # Create an object storing one copy of the labels per document last_doc_id = -1 label_ids = torch.empty(0, all_label_ids.size()[1]).to(device) for (i, doc_id) in enumerate(all_doc_ids): if doc_id.item() != last_doc_id: label_ids = torch.cat([label_ids, all_label_ids[i].unsqueeze(0)]) last_doc_id = 
doc_id.item() print('label_ids shape', label_ids.size()) # Save the embedded representations of the document, along with the labels torch.save( mean_summaries, os.path.join(feature_save_path, args.set_type + '_summaries.pt')) torch.save( label_ids, os.path.join(feature_save_path, args.set_type + '_doc_label_ids.pt') ) # label_ids.pt has one record per window (and thus multiple records per document) return
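# A small, self-contained illustration of the document-averaging trick used above
# (toy tensors, not the author's data): a 0/1 matrix maps windows to documents, and
# L1-normalizing its rows turns a single matmul into a per-document mean of the
# window-level CLS summaries.
import torch

summaries = torch.tensor([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]])  # 3 windows, dim 2
doc_ids = torch.tensor([0, 0, 1])                               # windows 0,1 -> doc 0
mask = torch.zeros(int(doc_ids.max().item()) + 1, len(summaries))
mask[doc_ids.long(), torch.arange(len(summaries))] = 1
averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1)
mean_summaries = torch.mm(averaging_matrix, summaries)
print(mean_summaries)  # doc 0 -> [2., 2.], doc 1 -> [5., 5.]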
configuration = XLNetConfig().from_dict({ "_name_or_path": "xlnet-predict-middle-notes", "architectures": ["XLNetLMHeadModel"], "attn_type": "bi", "bi_data": False, "bos_token_id": 10000, "clamp_len": -1, # "d_head": 64, "d_inner": 3072, "d_model": 768, "dropout": 0.1, "end_n_top": 5, "eos_token_id": 2, "ff_activation": "gelu", "initializer_range": 0.02, "layer_norm_eps": 1e-12, "mem_len": None, # null "model_type": "xlnet", "n_head": 8, # 12 originally "n_layer": 12, "pad_token_id": 10000, "reuse_len": None, # null, "same_length": False, "start_n_top": 5, "summary_activation": "tanh", "summary_last_dropout": 0.1, "summary_type": "last", "summary_use_proj": True, "untie_r": True, "use_mems_eval": True, "use_mems_train": True, # "vocab_size": 32000 })
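# A hedged usage sketch for the hand-written configuration above (not part of the
# original): a fresh XLNetLMHeadModel can be instantiated directly from it, with
# randomly initialized weights sized by d_model / n_layer / n_head.
from transformers import XLNetLMHeadModel

model = XLNetLMHeadModel(configuration)
print(model.config.d_model, model.config.n_layer)  # 768 12
print(model.num_parameters())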
def init_model(self): basic_encoder = None if self.config['use_bert']: bert_config = BertConfig.from_pretrained( self.config['bert_model_name'], cache_dir=self.config['bert_dir']) if self.config['num_bert_layer'] is not None: bert_config.num_hidden_layers = self.config['num_bert_layer'] bert = BertModel.from_pretrained(self.config['bert_model_name'], cache_dir=self.config['bert_dir'], config=bert_config) basic_encoder = bert elif self.config['use_xlnet']: xlnet_config = XLNetConfig.from_pretrained( 'hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir']) xlnet_config.n_layer = self.config['num_xlnet_layer'] xlnet_config.mem_len = self.config['xlnet_mem_len'] xlnet = XLNetModel.from_pretrained( 'hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir'], config=xlnet_config) basic_encoder = xlnet else: raise Exception('Not support other basic encoder') self.model = DocEE(self.config, basic_encoder, self.tokenizer) if self.config['cuda']: self.model.cuda() self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['learning_rate']) if self.config['resume_model']: OUTPUT_DIR = self.config['output_dir'] MODEL_SAVE_DIR = os.path.join(OUTPUT_DIR, self.config['model_save_dir']) if os.path.exists(MODEL_SAVE_DIR): cpt_file_names = os.listdir(MODEL_SAVE_DIR) if len(cpt_file_names) > 0: epoch_record = [] for cpt_file_name in cpt_file_names: epoch_record.append( int(cpt_file_name.split('-')[-1].split('.')[0])) epoch_record.sort() latest_epoch = epoch_record[-1] self.latest_epoch = latest_epoch + 1 latest_model_file_name = os.path.join( MODEL_SAVE_DIR, self.config['model_file'] % (self.config['ee_method'], latest_epoch)) if self.config['cuda']: store_dict = torch.load( latest_model_file_name, map_location=torch.device('cuda')) else: store_dict = torch.load(latest_model_file_name, map_location='cpu') self.model.load_state_dict(store_dict['model_state']) self.optimizer.load_state_dict( store_dict['optimizer_state']) print('resume train from %s' % latest_model_file_name) print('model init finish')
else:
    my_collect = collate_fn
train_loader = DataLoader(train_dataset, num_workers=2, batch_size=args.batch_size,
                          shuffle=True, collate_fn=my_collect)
test_loader = DataLoader(test_dataset, num_workers=2, batch_size=args.eval_batch_size,
                         shuffle=False, collate_fn=my_collect)

# ##make model
device = torch.device(args.gpu_ids)
config = XLNetConfig.from_pretrained("xlnet-base-cased")
config.num_labels = 5
if args.dataset == "ag_news":
    config.num_labels = 4
pretrained_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", config=config)
model = scl_model_Xlnet(config, device, pretrained_model,
                        with_semi=args.with_mix, with_sum=args.with_summary)

##make optimizer
optimizer = OpenAIAdam(model.parameters(), lr=args.lr, schedule='warmup_linear',
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, BertConfig, BertModel, BertTokenizer, XLNetConfig, XLNetModel, XLNetTokenizer, XLNetForSequenceClassification import torch ''' config = RobertaConfig.from_pretrained("./roberta-base/roberta-base-config.json") tokenizer = RobertaTokenizer.from_pretrained("./roberta-base/roberta-base-vocab.json") model = RobertaModel.from_pretrained("./roberta-base/roberta-base-pytorch_model.bin", config=config) ''' config = XLNetConfig.from_pretrained( "./xlnet-base-cased/xlnet-base-cased-config.json") tokenizer = XLNetTokenizer.from_pretrained( "./xlnet-base-cased/xlnet-base-cased-spiece.model") model = XLNetModel.from_pretrained( "./xlnet-base-cased/xlnet-base-cased-pytorch_model.bin", config=config) input_ids = torch.tensor(tokenizer.encode("toxicity")).unsqueeze( 0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] print(last_hidden_states)
def __init__(self): super(Model, self).__init__() self.config = XLNetConfig.from_pretrained('./xlnet_pretrain/config.json') self.xlnet = XLNetModel.from_pretrained('./xlnet_pretrain/pytorch_model.bin', config=self.config) self.fc = nn.Linear(self.config.d_model, 2)
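# The Model class above defines no forward(); a minimal sketch of one, assuming the
# classifier should consume XLNet's summary of the sequence (an assumption, not the
# author's code). XLNet places its special tokens at the end of the sequence, so the
# hidden state of the final position is a common choice of sequence summary.
def forward(self, input_ids, attention_mask=None):
    outputs = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
    summary = outputs[0][:, -1]  # (batch, d_model): last position of last hidden state
    return self.fc(summary)      # (batch, 2)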
def main(config, model_filename): if not os.path.exists(config.output_dir): os.makedirs(config.output_dir) if not os.path.exists(config.cache_dir): os.makedirs(config.cache_dir) if not os.path.exists(config.log_dir): os.makedirs(config.log_dir) model_file = os.path.join(config.output_dir, model_filename) # Prepare the device gpu_ids = [2] device, n_gpu = get_device(gpu_ids[0]) if n_gpu > 1: n_gpu = len(gpu_ids) # Set Random Seeds random.seed(config.seed) torch.manual_seed(config.seed) np.random.seed(config.seed) if n_gpu > 0: torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.deterministic = True tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') xlnet_config = XLNetConfig.from_pretrained(config.bert_config_path) cache_train_dataset = "cached_dataset_train_linear_512" cache_dev_dataset = "cached_dataset_dev_linear_512" if os.path.exists(config.cache_dir + '/' + cache_train_dataset): logger.info("Loading features from cached file %s", config.cache_dir + '/' + cache_train_dataset) train_dataset = torch.load(config.cache_dir + '/' + cache_train_dataset) dev_dataset = torch.load(config.cache_dir + '/' + cache_dev_dataset) else: train_dataset, dev_dataset, test_dataset = load_data( config.data_path, device, tokenizer, config.cache_dir, 64, 960) logger.info("save cached file in %s", config.cache_dir) torch.save(train_dataset, config.cache_dir + '/' + cache_train_dataset) torch.save(dev_dataset, config.cache_dir + '/' + cache_dev_dataset) # train_sampler = RandomSampler(train_dataset) # dev_sampler =RandomSampler(dev_dataset) train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config.train_batch_size, num_workers=8, pin_memory=False, drop_last=False) dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=config.dev_batch_size, num_workers=8, pin_memory=False, drop_last=False) # train_iterator = trange(int(config.epoch_num)) if config.model_name == "GAReader": from XLNet_Linear.GAReader.GAReader import GAReader model = GAReader(config.bert_word_dim, config.output_dim, config.hidden_size, config.rnn_num_layers, config.ga_layers, config.bidirectional, config.dropout, xlnet_config) # optimizer_grouped_parameter = [ # {'params':[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'embedding' not in n and 'bert' not in n]}, # {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and 'embedding' not in n and 'bert' not in n]} # ] param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] # optimizer_parameter =[ # {'params':model.word_embedding.bert.parameters()}, # {'params':model.word_embedding.aggregation.parameters(),'lr':1e-4}, # # {'params':model.rnn.parameters(),'lr':1e-3}, # # {'params':model.ga_rnn.parameters(),'lr':1e-3}, # # {'params':model.mlp_att.parameters(),'lr':1e-2}, # # {'params':model.dot_layer.parameters(),'lr':1e-2}, # {'params':model.final_liear.parameters(),'lr':1e-4}, # ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and 'xlnet' not in n ], 'name': [ n for n, p in param_optimizer if not any(nd in n for nd in no_decay) and 'xlnet' not in n ], 'weight_decay': 0.01, 'lr': 3e-4 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) and 'xlnet' not in n ], 'name': [ n for n, p in param_optimizer if not any(nd in n for nd in no_decay) and 'xlnet' not in n ], 
    'weight_decay': 0.0,
    'lr': 3e-4
}, {
    'params': [p for n, p in param_optimizer
               if not any(nd in n for nd in no_decay) and 'xlnet' in n],
    'name': [n for n, p in param_optimizer
             if not any(nd in n for nd in no_decay) and 'xlnet' in n],
    'weight_decay': 0.01,
    'lr': config.lr
}, {
    'params': [p for n, p in param_optimizer
               if any(nd in n for nd in no_decay) and 'xlnet' in n],
    'name': [n for n, p in param_optimizer
             if any(nd in n for nd in no_decay) and 'xlnet' in n],
    'weight_decay': 0.0,
    'lr': config.lr
}]
# print(optimizer_grouped_parameter)
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.lr, eps=1e-6)
# optimizer = optim.SGD(model.parameters(), lr=config.lr)
# optimizer = optim.SGD(optimizer_parameter, lr=config.lr)
# model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
scheduler = get_linear_schedule_with_warmup(optimizer, 16000, 200000)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

if config.do_train:
    train(config.epoch_num, model, train_dataloader, dev_dataloader, optimizer,
          criterion, ['0', '1', '2', '3', '4'], model_file, config.log_dir,
          config.print_step, config.clip, device, scheduler)

# trained_file = './ga/output/2020-10-20-22_41_37best_model_linear'
# tt = torch.load(trained_file)
# model.load_state_dict(torch.load(trained_file, map_location={'cuda:2': 'cuda:1'}))
model.load_state_dict(torch.load(model_file))
test_loss, test_acc, test_report = evaluate(model, train_dataloader, criterion,
                                            ['0', '1', '2', '3', '4'], device,
                                            config.log_dir)
print("-------------- Test -------------")
print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".format(
    test_loss, test_acc, test_report['macro avg']['f1-score'],
    test_report['weighted avg']['f1-score']))
def main():
    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        # torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0
    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, help="Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. ")
    parser.add_argument("--checkpoint", type=str, help="Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. ")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--feature_save_dir", type=str, help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. ")
    parser.add_argument("--set_type", type=str, help="Specify train/val/test")
    parser.add_argument("--model_type", type=str, default='xlnet', help="Specify xlnet or classifier")
    parser.add_argument('--num_hidden_layers', type=int, default=5, help="Number of hidden layers for MLP classifier (not needed to evaluate a model with XLNet architecture)")
    parser.add_argument('--hidden_size', type=int, default=1024, help="Hidden size for MLP classifier (not needed to evaluate a model with XLNet architecture)")
    parser.add_argument("--drop_rate", default=0.3, type=float, help="Droprate in between hidden layers for MLP classifier (not needed to evaluate a model with XLNet architecture)")
    parser.add_argument("--activation_function", default='sigmoid', type=str, help="Activation function for MLP classifier (not needed to evaluate a model with XLNet architecture)")
    args = parser.parse_args()

    # Load training data
    feature_save_path = os.path.join('/gpfs/data/razavianlab/capstone19/preprocessed_data/', args.feature_save_dir)
    logger.info("Loading {} dataset".format(args.set_type))
    test_dataloader = load_featurized_examples(batch_size=32, set_type=args.set_type,
                                               sliding_window=(args.model_type == "classifier"),
                                               feature_save_path=feature_save_path)

    # Load saved model
    model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/', args.model_id,
                              'model_checkpoint_' + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    if args.model_type == "xlnet":
        config = XLNetConfig.from_pretrained(os.path.join(model_path, 'config.json'), num_labels=2292)  # TODO: check if we need this
        model = XLNetForSequenceClassification.from_pretrained(model_path, config=config)
    else:
        saved_model = torch.load(os.path.join(model_path, 'model.pt'))
        model = SlidingClassifier(num_layers=args.num_hidden_layers, hidden_size=args.hidden_size,
                                  p=args.drop_rate, activation_function=args.activation_function)
        model.load_state_dict(saved_model['model'])
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    eval_folder = '/gpfs/data/razavianlab/capstone19/evals'
    val_file_name = os.path.join(eval_folder, args.model_id + "_{}_{}_metrics.p".format(args.checkpoint, args.set_type))

    # Create empty data frame to store evaluation results in (to be written to val_file_name)
    val_results = pd.DataFrame(columns=[
        'loss', 'micro_AUC', 'macro_AUC', 'top1_precision', 'top3_precision',
        'top5_precision', 'micro_f1', 'macro_f1', 'macro_AUC_list'
    ])

    # Run evaluation
    results = evaluate(dataloader=test_dataloader, model=model, model_id=args.model_id,
                       n_gpu=n_gpu, device=device,
                       sliding_window=(args.model_type == "classifier"))

    # Save results
    val_results = val_results.append(pd.DataFrame(results, index=[0]))
    pickle.dump(val_results, open(val_file_name, "wb"))
    os.system("chgrp razavianlab {}".format(val_file_name))
    return
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = XLNetConfig() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = XLNetLMHeadModel.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = XLNetLMHeadModel.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if data_args.line_by_line: # When using line_by_line, we just tokenize each nonempty line. padding = "max_length" if data_args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples["text"] = [ line for line in examples["text"] if len(line) > 0 and not line.isspace() ] return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. 
result = { k: [ t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length) ] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) results["perplexity"] = perplexity output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
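# A toy illustration of what group_texts does (not part of the script): all token
# lists are concatenated, then re-split into max_seq_length-sized chunks, and the
# remainder that does not fill a full chunk is dropped.
examples = {"input_ids": [[1, 2, 3, 4, 5], [6, 7, 8]]}
max_seq_length = 3
concatenated = sum(examples["input_ids"], [])                    # [1, 2, ..., 8]
total = (len(concatenated) // max_seq_length) * max_seq_length   # 6
chunks = [concatenated[i:i + max_seq_length] for i in range(0, total, max_seq_length)]
print(chunks)  # [[1, 2, 3], [4, 5, 6]] -- tokens 7 and 8 are discarded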
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--encoder_model_type", default=None, type=str, required=True, help="Model type selected", ) parser.add_argument( "--encoder_model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name", ) parser.add_argument( "--decoder_model_type", default=None, type=str, required=True, help="Model type selected", ) parser.add_argument( "--decoder_model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--do_predict", action="store_true", help="Whether to run predictions on the test set.", ) parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.", ) parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.", ) parser.add_argument( "--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.", ) parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--optimizer", default="lamb", type=str, help="Optimizer (AdamW or lamb)", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( 
"--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument( "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--local_rank", type=int, default=-1, help="For distributed training: local_rank", ) parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    print('DEVICE : ' + str(args.device))

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.encoder_model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    # ensure there's a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<PAD>"
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    # pad_token_label_id = CrossEntropyLoss().ignore_index
    pad_token_label_id = tokenizer.pad_token_id

    if args.encoder_model_type == 'bert':
        config_encoder = BertConfig()
    elif args.encoder_model_type == 'gpt2':
        config_encoder = GPT2Config()
    elif args.encoder_model_type == 'xlnet':
        config_encoder = XLNetConfig()

    if args.decoder_model_type == 'bert':
        config_decoder = BertConfig()
    elif args.decoder_model_type == 'gpt2':
        config_decoder = GPT2Config()
    elif args.decoder_model_type == 'xlnet':
        config_decoder = XLNetConfig()

    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    logger.info('Defining model...')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        args.encoder_model_name_or_path,
        args.decoder_model_name_or_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you
use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: # since the config is prefaced with `tokenizer_`, Autotokenizer doesn't instatiate this correctly #config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json")) #config = {"do_lower_case": False, "model_max_length": 512} #tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" #model = EncoderDecoderModel.from_pretrained( # os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"), #) model.to(args.device) result, _ = evaluate( args, model, tokenizer, pad_token_label_id, mode="dev", prefix=global_step, ) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w", encoding="utf-8") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: # since the config is prefaced with `tokenizer_`, Autotokenizer doesn't instatiate this correctly #config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json")) #config = {"do_lower_case": False, "model_max_length": 512} #tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args) #model = EncoderDecoderModel.from_pretrained( # os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"), #) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w", encoding="utf-8") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w", encoding="utf-8") as writer: for example in predictions: output_line = ("output: " + tokenizer.decode( example, skip_special_tokens=True, clean_up_tokenization_spaces=True, ) + "\n") 
writer.write(output_line) return results
'batch_size': 64, 'tenacity': 5, 'epoch_size': 4 } # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--model', default='xlnet-base-cased', help='model name or path') args = parser.parse_args() config = XLNetConfig.from_pretrained(args.model) model = XLNetModel.from_pretrained(args.model, config=config) tokenizer = XLNetTokenizer.from_pretrained(args.model) params_senteval['model'] = model.cuda().eval() params_senteval['tokenizer'] = tokenizer se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = [ 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI' ] results = se.eval(transfer_tasks)
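# The SentEval snippet above passes prepare() and batcher() that are never shown. A
# minimal sketch under the usual SentEval contract (an assumption, not the author's
# implementation): encode each sentence with the XLNet tokenizer and mean-pool the
# last hidden states into one fixed-size vector per sentence.
import numpy as np
import torch

def prepare(params, samples):
    return  # nothing to precompute in this sketch

def batcher(params, batch):
    model, tokenizer = params['model'], params['tokenizer']
    sentences = [' '.join(tokens) if tokens else '.' for tokens in batch]
    embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            ids = torch.tensor([tokenizer.encode(sentence)]).cuda()
            hidden = model(ids)[0]  # (1, seq_len, d_model)
            embeddings.append(hidden.mean(dim=1).squeeze(0).cpu().numpy())
    return np.vstack(embeddings)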
# if args.language == 'english':
nlp = spacy.load('en_core_web_sm')
# nlp = spacy.load('en', parser=False, entity=False)
# elif args.language == 'french':
#     nlp = spacy.load('fr_core_news_sm')
# elif args.language == 'german':
#     nlp = spacy.load('de_core_news_sm')

# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
spacy_tokenizer = nlp.Defaults.create_tokenizer(nlp)

vocab = dict()
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
config = XLNetConfig.from_pretrained('xlnet-large-cased', num_labels=3)
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', config=config)
device = 'cuda'
model.to(device)
model.train()

df_vocab = pd.DataFrame(columns=['token', 'frequency'])
# df = pd.read_csv(filename_train)
df = pd.read_csv(filename_train, header=None, usecols=[0, 1])
df.columns = ['text', 'label']
print('columns', df.columns)
test_train_perc = 0.80
def main(): from transformers import XLNetConfig config = XLNetConfig( vocab_size=21_128, d_model=768, n_head=12, n_layer=6, ) from transformers import XLNetTokenizer tokenizer = XLNetTokenizer.from_pretrained("./model/spbpe", max_len=512) from transformers import XLNetLMHeadModel model = XLNetLMHeadModel(config=config) model.resize_token_embeddings(len(tokenizer)) print(model.num_parameters()) from transformers import LineByLineTextDataset dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path="./data/data_train.csv", block_size=128, ) max_seq_length = 512 from transformers import DataCollatorForPermutationLanguageModeling data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=1.0 / 6, max_span_length=5) from transformers import Trainer, TrainingArguments training_args = TrainingArguments( output_dir="./model/xlnet_v1", overwrite_output_dir=True, num_train_epochs=5, per_gpu_train_batch_size=32, save_steps=10_000, save_total_limit=2, tpu_num_cores=8, ) trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=dataset, prediction_loss_only=True, ) trainer.train() if trainer.is_world_master(): trainer.save_model("./model/spbpe") print('FIN')
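# A hedged peek at what DataCollatorForPermutationLanguageModeling produces (not in
# the original script): for a batch of token sequences it returns input_ids,
# perm_mask, target_mapping and labels; note the collator requires an even sequence
# length. The toy tensors below are illustrative assumptions.
import torch

examples = [torch.arange(8), torch.arange(8)]
batch = data_collator(examples)
print({k: v.shape for k, v in batch.items()})
# e.g. input_ids/labels: (2, 8); perm_mask and target_mapping: (2, 8, 8)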
test = pd.read_csv('../tcdata/testB.csv', header=None)
model_path = '../model_weight/xlnet/'
output_model = '../tmp/xlnet.pth'
batch_size = 32

# Merge the train and test sets and build features
for i in range(1, 3):
    train[i] = train[i].apply(lambda x: x.replace('|', '').strip())
for i in range(1, 2):
    test[i] = test[i].apply(lambda x: x.replace('|', '').strip())
train.columns = ['idx', 'sentence', 'label1', 'label2']
test.columns = ['idx', 'sentence']
# test.columns = ['idx', 'sentence', 'label1', 'label2']

tokenizer = BertTokenizerFast.from_pretrained(model_path)
config = XLNetConfig.from_pretrained(model_path, num_labels=17, hidden_dropout_prob=0.2)
# config.output_attentions = True
config.hidden_dropout_prob = 0.2

def train_model(train_df, val_df, test_oof):
    ###--------------------
    early_stop = 0
    print("Reading training data...")
    train_set = CustomDataset(train_df, maxlen=128, tokenizer=tokenizer)
    train_loader = Data.DataLoader(train_set, batch_size=batch_size, num_workers=5, shuffle=True)
    print("Reading validation data...")
def train():
    # Load the pretrained model
    config = XLNetConfig.from_pretrained('xlnet_config.json')
    model = XLNetForQuestionAnswering.from_pretrained('xlnet_model.ckpt.index', from_tf=True, config=config)
    device = args.device
    model.to(device)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = adabound.AdaBound(optimizer_grouped_parameters, lr=1e-3, final_lr=0.1)

    # Prepare the data
    data = Dureader()
    train_dataloader, dev_dataloader = data.train_iter, data.dev_iter

    best_loss = 100000.0
    model.train()
    for i in range(args.num_train_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")):
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                batch.input_ids, batch.input_mask, batch.segment_ids, batch.start_position, batch.end_position
            input_ids, input_mask, segment_ids, start_positions, end_positions = \
                input_ids.to(device), input_mask.to(device), segment_ids.to(device), \
                start_positions.to(device), end_positions.to(device)

            # Compute the loss
            outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]
            loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Update the gradients
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Validate
            if step % args.log_step == 4:
                eval_loss = evaluate.evaluate(model, dev_dataloader)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(), './model_dir/' + "best_model")
                model.train()
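# evaluate.evaluate(model, dev_dataloader) is imported from elsewhere and not shown.
# A plausible minimal version (an assumption, not the author's code) would return the
# mean question-answering loss over the dev set:
def evaluate_dev(model, dev_dataloader, device):
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in dev_dataloader:
            outputs = model(batch.input_ids.to(device),
                            token_type_ids=batch.segment_ids.to(device),
                            attention_mask=batch.input_mask.to(device),
                            start_positions=batch.start_position.to(device),
                            end_positions=batch.end_position.to(device))
            total_loss += outputs[0].item()
            num_batches += 1
    return total_loss / max(num_batches, 1)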
def __init__(self): super(XlnetModelTest, self).__init__() config = XLNetConfig.from_pretrained('Saier/models/config.json') self.xlnet = XLNetForSequenceClassification(config) # /bert_pretrain/ self.device = torch.device("cuda")
def prepare_config_and_inputs(self): input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) perm_mask = torch.zeros( self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device, ) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = torch.zeros( self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device, ) target_mapping[:, 0, -1] = 1.0 # predict last token sequence_labels = None lm_labels = None is_impossible_labels = None token_labels = None if self.use_labels: lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) is_impossible_labels = ids_tensor([self.batch_size], 2).float() token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) config = XLNetConfig( vocab_size=self.vocab_size, d_model=self.hidden_size, n_head=self.num_attention_heads, d_inner=self.d_inner, n_layer=self.num_hidden_layers, untie_r=self.untie_r, mem_len=self.mem_len, clamp_len=self.clamp_len, same_length=self.same_length, reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, num_labels=self.type_sequence_label_size, bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, eos_token_id=self.eos_token_id, return_dict=True, ) return ( config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels, )
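# A hedged usage sketch tying the tensors above together (not part of the test
# itself, and assuming the fixtures returned by prepare_config_and_inputs):
# perm_mask hides the last position from every token, target_mapping selects that
# position as the only prediction target, so XLNetLMHeadModel returns logits for a
# single target position.
from transformers import XLNetLMHeadModel

model = XLNetLMHeadModel(config).to(torch_device).eval()
with torch.no_grad():
    logits = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)[0]
print(logits.shape)  # (batch_size, 1, vocab_size)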
def prepare_config_and_inputs(self): input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) perm_mask = tf.zeros( (self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32) perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32) perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1) # perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32) target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32) target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1) # target_mapping[:, 0, -1] = 1.0 # predict last token sequence_labels = None lm_labels = None is_impossible_labels = None if self.use_labels: lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) config = XLNetConfig( vocab_size=self.vocab_size, d_model=self.hidden_size, n_head=self.num_attention_heads, d_inner=self.d_inner, n_layer=self.num_hidden_layers, untie_r=self.untie_r, mem_len=self.mem_len, clamp_len=self.clamp_len, same_length=self.same_length, reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, num_labels=self.type_sequence_label_size, ) return ( config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, )
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = XLNetTokenizerFast.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = XLNetForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: column_names = raw_datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. 
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens and context get 0.0, the others get 1.0.
            # The cls token gets 0.0 too (so empty answers can be predicted at the CLS position).
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
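                    # Worked toy illustration (hypothetical offsets): with context token offsets
                    # [(0, 3), (4, 9), (10, 14)] and an answer covering chars (4, 9), the loops
                    # below yield start_position = 1 and end_position = 1 (the middle token).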
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            # Select samples from the dataset; this helps decrease processing time.
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create training features.
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                prepare_train_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        if data_args.max_train_samples is not None:
            # Select samples again since feature creation might have increased the number of features.
            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens and context get 0.0, the others 1.0.
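            # Toy illustration (hypothetical): for tokens [q1, q2, <sep>, c1, c2, <sep>, <cls>],
            # sequence_ids become [0, 0, 3, 1, 1, 3, 3] with cls_index=6, and (with pad_on_right=True,
            # context_idx=1) the comprehension below yields p_mask [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]:
            # only the context tokens and the CLS position stay unmasked.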
tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if data_args.max_eval_samples is not None: # Selecting Eval Samples from Dataset eval_examples = eval_examples.select( range(data_args.max_eval_samples)) # Create Features from Eval Dataset with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase samples size eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(data_args.max_predict_samples)) # Test Feature Creation with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=training_args.output_dir, log_level=log_level, prefix=stage, ) # Format the result to the format the metric expects. 
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        results = trainer.predict(predict_dataset, predict_examples)
        metrics = results.metrics

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
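
# Illustrative invocation (the file name and hyperparameter values are assumptions,
# not taken from this script; all flags shown do exist in the argument parsing above):
#   python run_qa_beam_search.py \
#     --model_name_or_path xlnet-large-cased \
#     --dataset_name squad_v2 \
#     --version_2_with_negative \
#     --do_train --do_eval \
#     --max_seq_length 384 --doc_stride 128 \
#     --output_dir /tmp/squad_xlnet
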
def main():
    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        # torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_id",
        type=str,
        help="Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Specify batch size for loading featurized examples.",
    )
    parser.add_argument("--set_type", type=str, help="Specify train/val/test file.")
    parser.add_argument("--save_batch", type=int, help="Save files every save_batch batches.")
    args = parser.parse_args()

    # Load data
    feature_save_path = os.path.join("/gpfs/data/razavianlab/capstone19/preprocessed_data/", args.feature_save_dir)
    logger.info("Loading dataset")
    dataloader = load_featurized_examples(
        batch_size=args.batch_size, set_type=args.set_type, feature_save_path=feature_save_path
    )

    # Load saved model
    model_path = os.path.join("/gpfs/data/razavianlab/capstone19/models/", args.model_id, "model_checkpoint_" + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    config = XLNetConfig.from_pretrained(os.path.join(model_path, "config.json"), num_labels=2292)  # TODO: check if we need this
    model = XLNetForSequenceClassification.from_pretrained(model_path, config=config)
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    last_batch_doc_id = -1  # Used to determine if the last document of the last batch was split up or not
    # Stores logits until we finish a batch where the last document was not split up
    stored_logits = torch.empty(0, 2292).to(device)
    # Stores the list of doc ids corresponding to the rows of stored_logits
    all_doc_ids = torch.empty(0).to(device)
    # For all documents, stores the elementwise max of all logits for that document
    all_combined_logits = torch.empty(0, 2292).to(device)
    all_label_ids = torch.empty(0, 2292).to(device)
    stored_label_ids = torch.empty(0, 2292).to(device)

    for i, batch in enumerate(dataloader):
        if i % 1000 == 0 and i > 0:
            logger.info("Entering batch {}".format(i))
        model.eval()
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids, doc_ids = batch
            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            doc_ids = doc_ids.to(device).float()
            label_ids = label_ids.to(device).float()

            # Get logits for this batch
            logits = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids)[0]

            # Check whether any part of the last stored document appears in this batch,
            # which would indicate that a document got split across stored_logits and this batch.
            if all(doc_ids != last_batch_doc_id) and last_batch_doc_id != -1:
                # This means that the last batch of stored_logits did not get split up.
                # If nothing was split, then we can combine the logits in stored_logits by document
                # and store the results in all_combined_logits.

                # Combine logits by doc_id
                last_doc_id = all_doc_ids[0].item()
                to_combine = torch.empty(0, 2292).to(device)
                for j, doc_id in enumerate(all_doc_ids):
                    if doc_id.item() != last_doc_id:
                        # Get the pointwise max over all logits for the last document
                        combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
                        all_combined_logits = torch.cat([all_combined_logits, combined_logits], dim=0)
                        # Create to_combine for the new document and update last_doc_id
                        to_combine = stored_logits[j, :].reshape(1, -1)
                        last_doc_id = doc_id.item()
                    else:
                        # Add these logits to to_combine with the other logits for this document
                        to_combine = torch.cat([to_combine, stored_logits[j, :].reshape(1, -1)], dim=0)
                # Pointwise max of all logits for the final document in stored_logits
                combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
                all_combined_logits = torch.cat([all_combined_logits, combined_logits], dim=0)

                # Create an object storing one copy of the labels per document
                last_doc_id = -1
                for j, doc_id in enumerate(all_doc_ids):
                    if (doc_id.item() != last_doc_id) and last_doc_id != -1:
                        all_label_ids = torch.cat([all_label_ids, stored_label_ids[j - 1].unsqueeze(0)])
                    last_doc_id = doc_id.item()
                all_label_ids = torch.cat([all_label_ids, stored_label_ids[j].unsqueeze(0)])

                all_doc_ids = torch.empty(0).to(device)
                stored_logits = torch.empty(0, 2292).to(device)
                stored_label_ids = torch.empty(0, 2292).to(device)

                stored_logits = torch.cat([stored_logits, logits], dim=0)
                all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
                stored_label_ids = torch.cat([stored_label_ids, label_ids], dim=0)
                last_batch_doc_id = doc_ids[-1]
            # If a doc was split, then save these logits until we find a batch where no doc was split
            else:
                stored_logits = torch.cat([stored_logits, logits], dim=0)
                all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
                stored_label_ids = torch.cat([stored_label_ids, label_ids], dim=0)
                last_batch_doc_id = doc_ids[-1]

            # Save every save_batch batches and clear out the tensors to save memory
            if i % args.save_batch == 0 and i > 0:
                torch.save(
                    all_label_ids,
                    os.path.join(feature_save_path, "{}_label_ids_{}.pt".format(args.set_type, int(i / args.save_batch))),
                )
                torch.save(
                    all_combined_logits,
                    os.path.join(feature_save_path, "{}_logits_{}.pt".format(args.set_type, int(i / args.save_batch))),
                )
                all_combined_logits = torch.empty(0, 2292).to(device)
                all_label_ids = torch.empty(0, 2292).to(device)
                logger.info("Saved batch {}".format(int(i / args.save_batch)))

    # Store logits and labels for the final batch(es)
    last_doc_id = all_doc_ids[0].item()
    to_combine = torch.empty(0, 2292).to(device)
    for j, doc_id in enumerate(all_doc_ids):
        if doc_id.item() != last_doc_id:
            # Get the pointwise max over all logits for the last document
            combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
            all_combined_logits = torch.cat([all_combined_logits, combined_logits], dim=0)
            # Create to_combine for the new document and update last_doc_id
            to_combine = stored_logits[j, :].reshape(1, -1)
            last_doc_id = doc_id.item()
        else:
            # Add these logits to to_combine with the other logits for this document
            to_combine = torch.cat([to_combine, stored_logits[j, :].reshape(1, -1)], dim=0)
    # Pointwise max of all logits for the last document
    combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
    all_combined_logits = torch.cat([all_combined_logits, combined_logits], dim=0)

    # Create an object storing one copy of the labels per document
    last_doc_id = -1
    for j, doc_id in enumerate(all_doc_ids):
        if (doc_id.item() != last_doc_id) and last_doc_id != -1:
            all_label_ids = torch.cat([all_label_ids, stored_label_ids[j - 1].unsqueeze(0)])
        last_doc_id = doc_id.item()
    all_label_ids = torch.cat([all_label_ids, stored_label_ids[j].unsqueeze(0)])

    torch.save(
        all_label_ids,
        os.path.join(feature_save_path, "{}_label_ids_{}.pt".format(args.set_type, int(math.ceil(i / args.save_batch)))),
    )
    torch.save(
        all_combined_logits,
        os.path.join(feature_save_path, "{}_logits_{}.pt".format(args.set_type, int(math.ceil(i / args.save_batch)))),
    )
    logger.info("Saved batch {}".format(int(math.ceil(i / args.save_batch))))

    return
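
# Minimal sketch (illustrative only, not part of the original pipeline) of the document-level
# aggregation performed above: rows of logits that share a doc_id are reduced with an
# elementwise max. The function name and the toy tensors below are hypothetical.
def combine_logits_by_doc(logits, doc_ids):
    """Max-pool rows of `logits` that share the same value in `doc_ids` (assumed contiguous)."""
    combined = []
    unique_ids = []
    for j in range(len(doc_ids)):
        if j == 0 or doc_ids[j] != doc_ids[j - 1]:
            # New document: start a fresh running max.
            combined.append(logits[j].clone())
            unique_ids.append(doc_ids[j].item())
        else:
            # Same document as the previous row: fold into the running elementwise max.
            combined[-1] = torch.max(combined[-1], logits[j])
    return torch.stack(combined), unique_ids

# Example: three windows, the first two from document 0.
#   toy_logits = torch.tensor([[0.1, 0.9], [0.4, 0.2], [0.3, 0.3]])
#   combine_logits_by_doc(toy_logits, torch.tensor([0., 0., 1.]))
#   -> (tensor([[0.4, 0.9], [0.3, 0.3]]), [0.0, 1.0])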