def load_model_config_tokenizer(args):
    """Build (model, config, tokenizer) for multiple choice from parsed CLI args.

    Resolution order follows the standard HF example scripts:
      * config: --config_name, else --model_name_or_path, else a fresh
        ``CONFIG_MAPPING[args.model_type]()`` (with a warning);
      * tokenizer: --tokenizer_name, else --model_name_or_path; training a
        tokenizer from scratch is not supported and raises;
      * model: weights from --model_name_or_path (TF checkpoints converted when
        the path contains ".ckpt"), else randomly initialized from ``config``.

    Args:
        args: argparse-style namespace with ``config_name``,
            ``model_name_or_path``, ``model_type``, ``tokenizer_name`` and
            ``use_slow_tokenizer`` attributes.

    Returns:
        Tuple ``(model, config, tokenizer)``.

    Raises:
        ValueError: if neither --tokenizer_name nor --model_name_or_path is given.
    """
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        # Fix: the original implicit string concatenation was missing a space,
        # producing "...this script.You can do it...".
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            # ".ckpt" in the path signals a TensorFlow checkpoint to convert.
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)
    return model, config, tokenizer
def load_reranker(model_name_or_path):
    """Load a multiple-choice reranker plus its tokenizer, switched to eval mode.

    Args:
        model_name_or_path: HF hub id or local path of the reranker checkpoint.

    Returns:
        Tuple ``(model, tokenizer)`` with the model in eval mode.
    """
    logger.info(f'Loading model from: {model_name_or_path}')
    reranker_config = AutoConfig.from_pretrained(model_name_or_path)
    reranker_tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=True)
    # A path containing ".ckpt" indicates a TensorFlow checkpoint to convert.
    load_from_tf = ".ckpt" in model_name_or_path
    reranker = AutoModelForMultipleChoice.from_pretrained(
        model_name_or_path,
        from_tf=bool(load_from_tf),
        config=reranker_config,
    ).eval()  # .eval() returns the same module, so this chains safely
    return reranker, reranker_tokenizer
def load(self, fname = None):
    """Build the model/optimizer/scheduler and optionally restore a checkpoint.

    Model construction order:
      1. ``self.pretrained_bert`` set -> load pretrained weights with an
         ``AutoConfig`` sized to ``self.n_classes``;
      2. else a readable ``self.bert_config_file`` -> build config from JSON,
         apply dropout overrides, and initialize the model from scratch;
      3. else raise ``ConfigError``.
    Afterwards the optimizer (and optional LR scheduler) are created by name
    from ``torch.optim`` / ``torch.optim.lr_scheduler``, and if
    ``self.load_path`` points at an existing ``<load_path>.pth.tar`` file the
    model/optimizer state dicts and ``epochs_done`` are restored from it.

    Args:
        fname: optional checkpoint path; when given it overrides
            ``self.load_path`` before loading.

    Raises:
        ConfigError: when no model source is configured, or when a given
            ``load_path``'s parent directory does not exist.
    """
    if fname is not None:
        self.load_path = fname
    if self.pretrained_bert:
        log.info(f"From pretrained {self.pretrained_bert}.")
        config = AutoConfig.from_pretrained(self.pretrained_bert,
                                            num_labels=self.n_classes,
                                            output_attentions=False,
                                            output_hidden_states=False)
        self.model = AutoModelForMultipleChoice.from_pretrained(self.pretrained_bert, config=config)
    elif self.bert_config_file and Path(self.bert_config_file).is_file():
        self.bert_config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))
        # keep-probabilities from the config are converted to dropout rates
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
        # from_config() initializes weights randomly (no pretrained download)
        self.model = AutoModelForMultipleChoice.from_config(config=self.bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")
    self.model.to(self.device)
    # Optimizer/scheduler are looked up by name, e.g. "AdamW", "StepLR".
    self.optimizer = getattr(torch.optim, self.optimizer_name)(
        self.model.parameters(), **self.optimizer_parameters)
    if self.lr_scheduler_name is not None:
        self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
            self.optimizer, **self.lr_scheduler_parameters)
    if self.load_path:
        log.info(f"Load path {self.load_path} is given.")
        if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
            raise ConfigError("Provided load path is incorrect!")
        # NOTE(review): assumes self.load_path has .resolve(), i.e. is a Path
        # (a plain str would fail here) — confirm against callers.
        weights_path = Path(self.load_path.resolve())
        weights_path = weights_path.with_suffix(f".pth.tar")
        if weights_path.exists():
            log.info(f"Load path {weights_path} exists.")
            log.info(f"Initializing `{self.__class__.__name__}` from saved.")
            # now load the weights, optimizer from saved
            log.info(f"Loading weights from {weights_path}.")
            checkpoint = torch.load(weights_path, map_location=self.device)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            # older checkpoints may lack the epoch counter; default to 0
            self.epochs_done = checkpoint.get("epochs_done", 0)
        else:
            log.info(f"Init from scratch. Load path {weights_path} does not exist.")
def download(model_name, cache_dir):
    """Force-download a multiple-choice model, its config, and its tokenizer.

    Args:
        model_name: HF hub identifier of the checkpoint.
        cache_dir: directory the downloaded artifacts are cached into.

    Returns:
        Tuple ``(model, config, tokenizer)``.
    """
    # force_download=True re-fetches the artifacts even when already cached.
    shared_kwargs = dict(cache_dir=cache_dir, force_download=True)
    model = AutoModelForMultipleChoice.from_pretrained(model_name, **shared_kwargs)
    config = AutoConfig.from_pretrained(model_name, **shared_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name, **shared_kwargs)
    return model, config, tokenizer
def get_this_model(task, model_config):
    """Instantiate the transformers Auto* model class matching ``task``.

    Dispatch: sequence classification/regression, token classification,
    seq2seq (NLG) and multiple choice. Returns ``None`` implicitly for any
    task not covered by a branch.

    NOTE(review): ``checkpoint_path`` is a free variable here — this function
    presumably is (or was) nested inside a scope that defines it; confirm the
    enclosing scope before using this definition standalone.
    """
    # Imports are local so transformers is only touched when the function runs.
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoModelForSeq2SeqLM
    from transformers import AutoModelForMultipleChoice
    from transformers import AutoModelForTokenClassification
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        return AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path, config=model_config)
    elif task == TOKENCLASSIFICATION:
        return AutoModelForTokenClassification.from_pretrained(
            checkpoint_path, config=model_config)
    elif task in NLG_TASKS:
        return AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path,
                                                     config=model_config)
    elif task == MULTICHOICECLASSIFICATION:
        return AutoModelForMultipleChoice.from_pretrained(
            checkpoint_path, config=model_config)
def main():
    """Train/evaluate a two-stage evidence-selection MRC pipeline.

    Four components are built from CLI flags: an extensive evidence selector
    (sequence classification), an intensive evidence selector (multiple
    choice), an evidence reader (multiple choice), and an answer verifier
    (classification or multiple-choice style). Which components are trained
    and/or evaluated is controlled by ``training_args.train_*`` / ``eval_*``
    flags; intermediate evidence logits flow from the extensive selector into
    the intensive selector and verifier feature pipelines.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, AllTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )
    checkpoint_dir = hyperparam_path_for_two_stage_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    # Log file suffix distinguishes training runs from evaluation-only runs.
    postfix = ""
    if training_args.train_extensive_evidence_selector or training_args.train_intensive_evidence_selector:
        postfix += "_train"
    else:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False,
                      postfix=postfix)
    training_args.output_dir = checkpoint_dir
    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Model parameters %s", model_args)
    # Set seed before initializing model.
    set_seed(training_args.seed)
    # Only the RACE and DREAM datasets are supported by this script.
    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    # In distributed training, load_dataset guarantees only one local process
    # downloads the dataset.
    data_files = {
        'train': data_args.train_file if data_args.train_file is not None else None,
        'validation': data_args.validation_file if data_args.validation_file is not None else None,
        'test': data_args.test_file if data_args.test_file is not None else None
    }
    datasets = load_dataset(
        data_args.dataload_script,
        data_args.dataload_split,
        data_files=data_files if data_files['train'] is not None else None,
        data_dir=data_args.data_dir)
    # Load pretrained models and tokenizer. The .from_pretrained methods
    # guarantee only one local process downloads model & vocab.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    # Each component falls back to the shared base model when no dedicated
    # checkpoint path was provided.
    extensive_evidence_selector_path = model_args.extensive_evidence_selector_path \
        if model_args.extensive_evidence_selector_path else model_args.model_name_or_path
    intensive_evidence_selector_path = model_args.intensive_evidence_selector_path \
        if model_args.intensive_evidence_selector_path else model_args.model_name_or_path
    evidence_reader_path = model_args.evidence_reader_path \
        if model_args.evidence_reader_path else model_args.model_name_or_path
    answer_verifier_path = model_args.answer_verifier_path \
        if model_args.answer_verifier_path else model_args.model_name_or_path
    extensive_selector_config = AutoConfig.from_pretrained(
        extensive_evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    intensive_selector_config = AutoConfig.from_pretrained(
        intensive_evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader_config = AutoConfig.from_pretrained(
        evidence_reader_path,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier_config = AutoConfig.from_pretrained(
        answer_verifier_path,
        cache_dir=model_args.cache_dir,
    )
    extensive_evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        extensive_evidence_selector_path,
        config=extensive_selector_config,
        cache_dir=model_args.cache_dir,
    )
    intensive_evidence_selector = AutoModelForMultipleChoice.from_pretrained(
        intensive_evidence_selector_path,
        config=intensive_selector_config,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        evidence_reader_path,
        config=evidence_reader_config,
        cache_dir=model_args.cache_dir,
    )
    # The verifier's head type depends on --verifier_type; note that
    # answer_verifier is unbound for any other value.
    if model_args.verifier_type == "classification":
        answer_verifier = AutoModelForSequenceClassification.from_pretrained(
            answer_verifier_path,
            config=answer_verifier_config,
            cache_dir=model_args.cache_dir,
        )
    elif model_args.verifier_type == "multi_choice":
        answer_verifier = AutoModelForMultipleChoice.from_pretrained(
            answer_verifier_path,
            config=answer_verifier_config,
            cache_dir=model_args.cache_dir,
        )
    if training_args.train_extensive_evidence_selector:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    # Pre-bind tokenizer/data_args into the feature functions ("pprepare" ==
    # partially-applied prepare).
    pprepare_features_for_initializing_evidence_selector = partial(
        prepare_features_for_initializing_extensive_evidence_selector,
        evidence_sampling_num=data_args.evidence_sampling_num,
        tokenizer=tokenizer,
        data_args=data_args,
        pseudo_label_path=data_args.pseudo_label_path)
    pprepare_features_for_generating_optionwise_evidence = partial(
        prepare_features_for_generating_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_reading_optionwise_evidence = partial(
        prepare_features_for_reading_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_intensive_evidence_selector = partial(
        prepare_features_for_intensive_evidence_selector,
        evidence_len=data_args.intensive_evidence_len,
        train_intensive_selector_with_option=data_args.
        train_intensive_selector_with_option,
        train_intensive_selector_with_non_overlapping_evidence=data_args.
        train_intensive_selector_with_non_overlapping_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_multiple_choice = partial(prepare_features,
                                                    tokenizer=tokenizer,
                                                    data_args=data_args)
    if model_args.verifier_type == "classification":
        pprepare_features_for_training_answer_verifier = partial(
            prepare_features_for_training_answer_verifier,
            evidence_len=data_args.verifier_evidence_len,
            train_answer_verifier_with_option=data_args.
            train_answer_verifier_with_option,
            downsampling=data_args.train_verifier_with_downsampling,
            tokenizer=tokenizer,
            data_args=data_args)
    elif model_args.verifier_type == "multi_choice":
        pprepare_features_for_training_answer_verifier = partial(
            prepare_features_for_training_mc_style_answer_verifier,
            evidence_len=data_args.verifier_evidence_len,
            tokenizer=tokenizer,
            data_args=data_args)
    # One Trainer per component; datasets are attached later per phase.
    extensive_trainer = Trainer(
        model=extensive_evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    intensive_trainer = Trainer(
        model=intensive_evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    mc_trainer = Trainer(
        model=evidence_reader,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    verifier_trainer = Trainer(
        model=answer_verifier,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer)
        if model_args.verifier_type == "classification" else
        DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_classification_metrics
        if model_args.verifier_type == "classification" else
        compute_mc_metrics,
    )
    # Plain multiple-choice features are needed by the reader/verifier phases.
    if training_args.train_answer_verifier or training_args.eval_intensive_evidence_selector or training_args.eval_answer_verifier:
        multiple_choice_datasets = {
            k: datasets[k].map(
                pprepare_features_for_multiple_choice,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
        }
    if training_args.train_extensive_evidence_selector or training_args.eval_extensive_evidence_selector:
        # The train split is only featurized when it will actually be trained on.
        train_extensive_evidence_selector_datasets = {
            k: datasets[k].map(
                pprepare_features_for_initializing_evidence_selector,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_extensive_evidence_selector
        }
    if training_args.train_extensive_evidence_selector:
        extensive_trainer.train_dataset = train_extensive_evidence_selector_datasets[
            "train"]
        extensive_trainer.eval_dataset = train_extensive_evidence_selector_datasets[
            "validation"]
        train_result = extensive_trainer.train()
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Extensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
    # generate extensive evidence logits (the 'train' split is skipped in
    # eval-only mode)
    if training_args.train_intensive_evidence_selector or training_args.train_answer_verifier:
        extensive_evidence_logits = {
            k: extensive_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items()
        }
    elif training_args.eval_intensive_evidence_selector or training_args.eval_answer_verifier:
        extensive_evidence_logits = {
            k: extensive_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items() if k != "train"
        }
    # prepare features for intensive evidence selector
    if training_args.train_intensive_evidence_selector or training_args.eval_intensive_evidence_selector \
            or training_args.eval_answer_verifier:
        train_intensive_evidence_selector_datasets = {}
        extensive_evidence_sentences = {}
        for split in datasets.keys():
            if not training_args.train_intensive_evidence_selector and split == 'train':
                continue
            intensive_dataset = datasets[split].map(
                partial(pprepare_features_for_intensive_evidence_selector,
                        evidence_logits=extensive_evidence_logits[split]),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            # Pair each evidence sentence with its logit, per option, keyed by
            # example id — kept only for later inspection/export.
            evidence_sentences = {
                eid: [[(logit, sent)
                       for sent, logit in zip(option_sents, option_logits)]
                      for option_sents, option_logits in zip(
                          evidence_sent, evidence_logit)]
                for eid, evidence_sent, evidence_logit in zip(
                    intensive_dataset['example_ids'],
                    intensive_dataset['evidence_sentence'],
                    intensive_dataset['evidence_logit'])
            }
            train_intensive_evidence_selector_datasets[
                split] = intensive_dataset.remove_columns(
                    ["evidence_sentence", "evidence_logit"])
            extensive_evidence_sentences[split] = evidence_sentences
    # prepare features for answer verifier
    if training_args.train_answer_verifier or training_args.eval_answer_verifier:
        mc_label_dict = {
            split: {
                example['example_ids']: example['label']
                for example in multiple_choice_datasets[split]
            }
            for split in datasets.keys()
            if split != "train" or training_args.train_answer_verifier
        }
        # Reader predictions become the verifier's "answer logits" input.
        reader_output = {
            split: mc_trainer.evaluate(multiple_choice_datasets[split])
            for split in datasets.keys()
            if split != "train" or training_args.train_answer_verifier
        }
        # NOTE(review): reader_output[split][:-1] appears to unpack
        # (predictions, label_ids, example_ids) from an extended
        # EvalPrediction-like tuple — confirm against the custom Trainer.
        answer_logits = {
            split: {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(
                    *reader_output[split][:-1])
            }
            for split in datasets.keys() if split != "train"
        }
        if data_args.answer_logits_path:
            # Precomputed train-set logits can be injected from disk.
            logger.info(
                f"loading answer logits from {data_args.answer_logits_path}")
            with open(data_args.answer_logits_path) as f:
                trainset_answer_logits = json.load(f)
            answer_logits['train'] = trainset_answer_logits
        train_answer_verifier_datasets = {
            k: datasets[k].map(
                partial(pprepare_features_for_training_answer_verifier,
                        answer_logits=answer_logits[k],
                        evidence_logits=extensive_evidence_logits[k],
                        is_training=(k == "train")),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_answer_verifier
        }
        if training_args.train_answer_verifier:
            logger.info(
                f"total {sum(train_answer_verifier_datasets['train']['label'])} positive example for training verifier"
            )
    if training_args.train_intensive_evidence_selector:
        intensive_trainer.train_dataset = train_intensive_evidence_selector_datasets[
            "train"]
        intensive_trainer.eval_dataset = train_intensive_evidence_selector_datasets[
            "validation"]
        train_result = intensive_trainer.train()
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "a+") as writer:
            logger.info("***** Intensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
    if training_args.train_answer_verifier:
        verifier_trainer.train_dataset = train_answer_verifier_datasets[
            "train"]
        verifier_trainer.eval_dataset = train_answer_verifier_datasets[
            "validation"]
        train_result = verifier_trainer.train()
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "a+") as writer:
            # NOTE(review): header says "Intensive" but these are the answer
            # verifier's results — looks like a copy-paste label.
            logger.info("***** Intensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
    # Evaluation
    # To use the best checkpoint model at end, use the aruguments
    # load_best_model_at_end, metric_for_best_model, evaluation_strategy steps
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    if training_args.eval_extensive_evidence_selector:
        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = extensive_trainer.evaluate(
                train_extensive_evidence_selector_datasets[split]).metrics
            fulleval_results, all_evidence_sentences = extensive_trainer.evaluate_extensive_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                eval_dataset=datasets[split],
                feature_func_for_evidence_reading=
                pprepare_features_for_reading_optionwise_evidence,
                feature_func_for_evidence_generating=
                pprepare_features_for_generating_optionwise_evidence)
            metrics = {**results, **fulleval_results}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")
            output_evidence_file = os.path.join(training_args.output_dir,
                                                f"{split}_evidence.json")
            with open(output_evidence_file, "w") as f:
                json.dump(all_evidence_sentences, f)
    if training_args.eval_intensive_evidence_selector:
        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            metrics, _ = intensive_trainer.evaluate_intensive_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                multiple_choice_dataset=multiple_choice_datasets[split],
                intensive_selector_dataset=
                train_intensive_evidence_selector_datasets[split])
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                # NOTE(review): header says "Extensive" in the intensive
                # branch — looks like a copy-paste label.
                logger.info("***** Extensive Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")
    if training_args.eval_answer_verifier:
        selector_output = {
            k: intensive_trainer.evaluate(
                train_intensive_evidence_selector_datasets[k])
            for k in datasets.keys() if k != "train"
        }
        selector_logits = {
            k: {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(
                    *selector_output[k][:-1])
            }
            for k in datasets.keys() if k != "train"
        }
        # Thresholds tuned on validation are reused for the test split, so
        # validation must be evaluated first (list order below guarantees it).
        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = verifier_trainer.evaluate(
                train_answer_verifier_datasets[split])
            verifier_logits = {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(*results[:-1])
            }
            metrics = results.metrics
            if model_args.verifier_type == "classification":
                if split == 'validation':
                    fulleval_metrics = evaluate_verifier_with_reader_and_iselector(
                        reader_logits=answer_logits[split],
                        selector_logits=selector_logits[split],
                        verifier_logits=verifier_logits,
                        label_dict=mc_label_dict[split])
                    val_verify_thresholds = {
                        k: v
                        for k, v in fulleval_metrics.items() if "thresh" in k
                    }
                else:
                    fulleval_metrics = evaluate_verifier_with_reader_and_iselector(
                        reader_logits=answer_logits[split],
                        selector_logits=selector_logits[split],
                        verifier_logits=verifier_logits,
                        label_dict=mc_label_dict[split],
                        threshold=val_verify_thresholds)
            else:
                fulleval_metrics = evaluate_mc_style_verifier_with_reader_and_iselector(
                    reader_logits=answer_logits[split],
                    selector_logits=selector_logits[split],
                    verifier_logits=verifier_logits,
                    label_dict=mc_label_dict[split])
            metrics = {**metrics, **fulleval_metrics}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Verifier Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")
            verifier_prediction = {
                'verifier_logits': verifier_logits,
                'reader_logits': answer_logits[split],
                'selector_logits': selector_logits[split]
            }
            output_prediction_file = os.path.join(
                training_args.output_dir, f"{split}_verifier_prediction.json")
            with open(output_prediction_file, "w") as f:
                json.dump(verifier_prediction, f)
def main():
    """Meta-train (FOMAML) a multiple-choice model on a source task, then
    fine-tune it on the target (CommonsenseQA-style) training set.

    Flow: parse args -> load model/tokenizer -> build a meta-task dataset
    (SWAG) and a target dataset (cqa) -> per meta-batch, run the MetaLearner
    step then fine-tune on the whole target loader -> finally run one plain
    (MTL) fine-tuning pass with a separate Adam optimizer.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               TrainingArguments, MetaTrainingArguments))
    model_args, data_args, training_args, metatraining_args = parser.parse_args_into_dataclasses()
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    # Set seed
    set_seed(training_args.seed)
    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))
    # Load pretrained model and tokenizer.
    # Distributed training: the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # BertForMultipleChoice
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    # Get datasets: meta-learning episodes built from SWAG (num_task episodes
    # of k_support support + k_query query examples).
    s1_train_dataset = (
        MetaMultipleChoiceDataset(
            data_dir=os.path.join(data_args.data_dir, 'swag'),
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
            num_task=20,
            k_support=5,
            k_query=1,
        )
        if training_args.do_train
        else None
    )
    # (Disabled alternatives kept out of the way: s2/s3 ComVE_A/ComVE_B meta
    # datasets, a plain SWAG train set, and an eval_dataset — see VCS history.)
    # Target task: CommonsenseQA-style classification ('cqa_clf').
    target_train_dataset = (
        MultipleChoiceDataset(
            data_dir=os.path.join(data_args.data_dir, 'cqa'),
            tokenizer=tokenizer,
            task='cqa_clf',
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    # [TODO]: a target_test_dataset was sketched here but is disabled.

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Simple accuracy over argmax predictions.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Create meta batches of tasks for the outer loop.
    s1_db = create_batch_of_tasks(s1_train_dataset, is_shuffle = True,
                                  batch_size = metatraining_args.outer_batch_size)

    # Define Data Loader (random sampling unless the dataset is iterable).
    def _get_train_sampler(train_dataset) -> Optional[torch.utils.data.sampler.Sampler]:
        if isinstance(train_dataset, torch.utils.data.IterableDataset):
            return None
        else:
            return (
                RandomSampler(train_dataset)
            )

    target_train_sampler = _get_train_sampler(target_train_dataset)
    target_train_dataloader = DataLoader(
        target_train_dataset,
        batch_size=training_args.train_batch_size,
        sampler=target_train_sampler,
        collate_fn=default_data_collator,  # DataCollatorWithPadding(tokenizer),
        drop_last=training_args.dataloader_drop_last)
    metalearner = MetaLearner(metatraining_args, tokenizer)
    mtl_optimizer = Adam(metalearner.model.parameters(),
                         lr=metatraining_args.mtl_update_lr)
    for source_idx, db in enumerate([s1_db]):  # , s2_db, s3_db]):
        for step, task_batch in enumerate(db):
            # Meta-Training(FOMAML)
            # NOTE(review): this handle is re-opened every step and never
            # closed — a resource leak; should be a `with` block.
            f = open('log.txt', 'a')
            acc, loss = metalearner(task_batch)
            print('Step:', step, '\tTraining Loss | Acc:', loss, " | ", acc)
            f.write(str(acc) + '\n')
            # Fine-tuning on Target Set (runs after every meta step)
            target_train_loss = []
            target_train_acc = []
            metalearner.model.cuda()
            metalearner.model.train()
            print(metalearner.model.parameters())
            for target_batch in tqdm.tqdm(target_train_dataloader):
                target_batch = metalearner.prepare_inputs(target_batch)
                outputs = metalearner.model(**target_batch)
                loss = outputs[0]
                loss.backward()
                metalearner.outer_optimizer.step()
                metalearner.outer_optimizer.zero_grad()
                target_train_loss.append(loss.item())
                # Compute Acc for target
                logits = F.softmax(outputs[1], dim=1)
                target_label_id = target_batch.get('labels')
                pre_label_id = torch.argmax(logits, dim=1)
                pre_label_id = pre_label_id.detach().cpu().numpy().tolist()
                target_label_id = target_label_id.detach().cpu().numpy().tolist()
                acc = accuracy_score(pre_label_id, target_label_id)
                target_train_acc.append(acc)
            print("Target Loss: ", np.mean(target_train_loss))
            print("Target Acc: ", np.mean(target_train_acc))
            # end fine tuning
    # end MML
    # MTL : Normal fine tuning with the separate Adam optimizer.
    # NOTE(review): original indentation was lost; this pass is placed after
    # the meta loops per the "end MML" marker — confirm against upstream.
    target_finetune_loss = []
    for target_batch in target_train_dataloader:
        metalearner.model.train()
        target_batch = metalearner.prepare_inputs(target_batch)
        outputs = metalearner.model(**target_batch)
        loss = outputs[0]
        loss.backward()
        mtl_optimizer.step()
        mtl_optimizer.zero_grad()
        target_finetune_loss.append(loss.item())
    print("Target Loss: ", np.mean(target_finetune_loss))
def main():
    """Fine-tune and evaluate an AutoModelForMultipleChoice with the HF Trainer.

    Parses (ModelArguments, DataTrainingArguments, TrainingArguments) from the
    command line, loads config/tokenizer/model, builds train/dev datasets for
    the selected task via `MultipleChoiceDataset`, then optionally trains and
    evaluates.

    Returns:
        dict: evaluation metrics (empty when --do_eval is not set).

    Raises:
        ValueError: if the output directory is non-empty without
            --overwrite_output_dir, or the task name is unknown.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        # Only rank -1 (no DDP) or rank 0 logs at INFO; other ranks stay quieter.
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Resolve the task-specific processor; its label list defines num_labels.
    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        # Allow loading from a TensorFlow checkpoint (path contains ".ckpt").
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets (each is None when the corresponding phase is disabled).
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Argmax over the per-choice logits gives the predicted choice index.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Data collator
    # Under fp16, pad to a multiple of 8 so NVIDIA tensor cores can be used.
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            # Resume only when a local checkpoint directory was given.
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        # Only the master process writes the results file.
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                results.update(result)

    return results
def main():
    """Train and evaluate a multiple-choice model (SWAG-style) with 🤗 Accelerate.

    Parses CLI args, loads and preprocesses the dataset, builds the
    model/optimizer/scheduler, runs the training loop with gradient
    accumulation, evaluates after every epoch, and optionally pushes
    checkpoints to the Hugging Face Hub.

    Fix vs. previous revision: when --config_name is given, the config is now
    actually loaded from it instead of silently falling back to
    --model_name_or_path.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation (main process only; others wait below).
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # BUG FIX: this branch previously loaded the config from args.model_name_or_path,
    # silently ignoring an explicitly provided --config_name.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Repeat each context once per candidate ending, pair with
        # "<question header> <ending_i>", tokenize flat, then regroup by 4.
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten
        tokenized_inputs = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
        )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # NOTE(review): this condition also fires at step 0, as in the upstream
            # script; use (step + 1) % ... if strict accumulation windows are needed.
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # Gather predictions/labels across processes before scoring.
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

        # Push an intermediate checkpoint after every epoch except the last.
        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

    # Final save (and optional final Hub push).
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) config.attention_type = attention_type config.k_value = k_value tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = ( MultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None
def main():
    """Fine-tune a multiple-choice model with optional pooler/top-layer re-initialization.

    After loading the pretrained model, optionally re-initializes the pooler
    (--reinit_pooler) and/or the top N transformer layers (--reinit_layers),
    then trains with either the standard Trainer or FreeLBTrainer, evaluates,
    and predicts on the test set (overall plus 'high' and 'middle' groups).

    Returns:
        dict: evaluation metrics (empty when --do_eval is not set).

    Raises:
        ValueError: bad output dir, unknown task, or model type without a pooler.
        NotImplementedError: re-init requested for an unsupported model type.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Resolve the task-specific processor; its label list defines num_labels.
    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Optionally re-initialize the pooler head with fresh normal-distributed
    # weights (only bert/roberta expose a pooler here).
    if data_args.reinit_pooler:
        if model_args.model_type in ["bert", "roberta"]:
            encoder_temp = getattr(model, model_args.model_type)
            encoder_temp.pooler.dense.weight.data.normal_(
                mean=0.0, std=encoder_temp.config.initializer_range)
            encoder_temp.pooler.dense.bias.data.zero_()
            for p in encoder_temp.pooler.parameters():
                p.requires_grad = True
        elif model_args.model_type in ["xlnet", "bart", "electra"]:
            raise ValueError(
                f"{model_args.model_type} does not have a pooler at the end")
        else:
            raise NotImplementedError

    # Optionally re-initialize the top `reinit_layers` transformer layers,
    # mirroring each architecture's own init scheme.
    if data_args.reinit_layers > 0:
        if model_args.model_type in ["bert", "roberta", "electra"]:
            # electra has no pooler, so the pooler re-init requirement is waived for it.
            assert data_args.reinit_pooler or model_args.model_type == "electra"

            from transformers.modeling_bert import BertLayerNorm

            encoder_temp = getattr(model, model_args.model_type)
            for layer in encoder_temp.encoder.layer[-data_args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        module.weight.data.normal_(
                            mean=0.0, std=encoder_temp.config.initializer_range)
                    elif isinstance(module, BertLayerNorm):
                        # LayerNorm resets to identity: zero bias, unit scale.
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    if isinstance(module, nn.Linear) and module.bias is not None:
                        module.bias.data.zero_()
        elif model_args.model_type == "xlnet":
            from transformers.modeling_xlnet import XLNetLayerNorm, XLNetRelativeAttention

            for layer in model.transformer.layer[-data_args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        module.weight.data.normal_(
                            mean=0.0, std=model.transformer.config.initializer_range)
                        if isinstance(module, nn.Linear) and module.bias is not None:
                            module.bias.data.zero_()
                    elif isinstance(module, XLNetLayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    elif isinstance(module, XLNetRelativeAttention):
                        # Relative attention holds its parameters directly, not
                        # in nn.Linear modules, so each one is reset explicitly.
                        for param in [
                                module.q,
                                module.k,
                                module.v,
                                module.o,
                                module.r,
                                module.r_r_bias,
                                module.r_s_bias,
                                module.r_w_bias,
                                module.seg_embed,
                        ]:
                            param.data.normal_(
                                mean=0.0, std=model.transformer.config.initializer_range)
        elif model_args.model_type == "bart":
            # BART reuses the model's own _init_weights for the decoder layers.
            for layer in model.model.decoder.layers[-data_args.reinit_layers:]:
                for module in layer.modules():
                    model.model._init_weights(module)
        else:
            raise NotImplementedError

    # Datasets: train/dev, plus three test views (all, 'high' group, 'middle' group).
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_eval else None)
    test_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_predict else None)
    test_dataset_high = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
        group='high',
    ) if training_args.do_predict else None)
    test_dataset_middle = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
        group='middle',
    ) if training_args.do_predict else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Argmax over the per-choice logits gives the predicted choice index.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer (adversarial FreeLB variant when requested).
    if training_args.freelb:
        trainer = FreeLBTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )
    else:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                results.update(result)

    # Prediction: run the three test views and persist everything with torch.save.
    if training_args.do_predict:
        predictions, label_ids, metrics = trainer.predict(test_dataset)
        predictions_high, label_ids_high, metrics_high = trainer.predict(
            test_dataset_high)
        predictions_middle, label_ids_middle, metrics_middle = trainer.predict(
            test_dataset_middle)
        predictions_file = os.path.join(training_args.output_dir,
                                        "test_predictions")
        labels_ids_file = os.path.join(training_args.output_dir,
                                       "test_labels_id")
        predictions_file_high = os.path.join(training_args.output_dir,
                                             "test_predictions_high")
        labels_ids_file_high = os.path.join(training_args.output_dir,
                                            "test_labels_id_high")
        predictions_file_middle = os.path.join(training_args.output_dir,
                                               "test_predictions_middle")
        labels_ids_file_middle = os.path.join(training_args.output_dir,
                                              "test_labels_id_middle")
        torch.save(predictions, predictions_file)
        torch.save(label_ids, labels_ids_file)
        torch.save(predictions_high, predictions_file_high)
        torch.save(label_ids_high, labels_ids_file_high)
        torch.save(predictions_middle, predictions_file_middle)
        torch.save(label_ids_middle, labels_ids_file_middle)

        # Keep the example ids so predictions can be mapped back to inputs.
        examples_ids = []
        for input_feature in test_dataset.features:
            examples_ids.append(input_feature.example_id)
        examples_ids_file = os.path.join(training_args.output_dir,
                                         "examples_ids")
        torch.save(examples_ids, examples_ids_file)

        output_eval_file = os.path.join(training_args.output_dir,
                                        "test_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Test results *****")
                for key, value in metrics.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                for key, value in metrics_high.items():
                    logger.info(" high %s = %s", key, value)
                    writer.write("high %s = %s\n" % (key, value))
                for key, value in metrics_middle.items():
                    logger.info(" middle %s = %s", key, value)
                    writer.write("middle %s = %s\n" % (key, value))

    return results
def setup(argc=None, **kwargs):
    """Parse arguments and load the config, tokenizer and model.

    ``argc`` may be None (falls back to ``sys.argv[1:]``), a one-element list
    naming a ``.json`` file, a dict of argument values, or an argv-style list.
    Optional ``config_kwargs`` / ``tokenizer_kwargs`` / ``model_kwargs`` dicts
    in ``kwargs`` are forwarded to the respective ``from_pretrained`` calls.

    Returns:
        tuple: (all_args dict, processor, config, tokenizer, model).
    """
    if argc is None:
        argc = sys.argv[1:]

    hf_parser = HfArgumentParser((
        ModelArguments,
        DataTrainingArguments,
        DirArguments,
        TrainingArguments,
        WindowArguments
    ))

    # Dispatch on how the arguments were supplied.
    is_json_path = (
        isinstance(argc, list) and len(argc) == 1 and argc[0].endswith('.json')
    )
    if is_json_path:
        parsed = hf_parser.parse_json_file(argc[0])
    elif isinstance(argc, dict):
        parsed = hf_parser.parse_dict(argc)
    else:
        parsed = hf_parser.parse_args_into_dataclasses()
    model_args, data_args, dir_args, training_args, window_args = parsed

    # Refuse to reuse a non-empty output dir (a lone .gitignore is tolerated)
    # unless overwriting was explicitly requested.
    if (
        training_args.do_train
        and not training_args.overwrite_output_dir
        and os.path.exists(training_args.output_dir)
        and [f for f in os.listdir(training_args.output_dir) if f != '.gitignore']
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    arg_names = ('model_args', 'data_args', 'dir_args', 'training_args', 'window_args')
    all_args = dict(zip(arg_names, (model_args, data_args, dir_args, training_args, window_args)))

    # Setup logging
    log_level = logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=log_level,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Resolve the task-specific processor; its labels define num_labels.
    if data_args.task_name not in processors:
        raise ValueError("Task not found: %s" % (data_args.task_name))
    processor = processors[data_args.task_name]()
    num_labels = len(processor.get_labels())

    extra = {
        name: kwargs.pop(name, {})
        for name in ('config_kwargs', 'tokenizer_kwargs', 'model_kwargs')
    }

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        **extra['config_kwargs'],
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        **extra['tokenizer_kwargs'],
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        **extra['model_kwargs'],
    )
    return all_args, processor, config, tokenizer, model
def main():
    """Fine-tune and evaluate a multiple-choice model on SWAG-style data.

    Accepts either a single .json argument file or normal CLI flags, loads the
    dataset (custom CSV/JSON files or the "swag" hub dataset), tokenizes each
    example into 4 context/ending pairs, trains and evaluates with the HF
    Trainer.

    Returns:
        dict: evaluation metrics (empty when --do_eval is not set).
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    # NOTE(review): the two concatenated f-strings below have no separator, so the
    # message reads "...n_gpu: 1distributed training..." — kept as-is here.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # The file extension (csv/json) selects the datasets loading script.
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    else:
        # Downloading and loading the swag dataset from the hub.
        datasets = load_dataset("swag", "regular")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"

    # Preprocessing the datasets.
    def preprocess_function(examples):
        # Repeat each context once per candidate ending, pair it with
        # "<question header> <ending_i>", tokenize flat, then regroup by 4.
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=data_args.max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten
        return {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }

    tokenized_datasets = datasets.map(
        preprocess_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator: dynamic padding unless inputs were already padded to max length.
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(tokenizer=tokenizer))

    # Metric
    def compute_metrics(eval_predictions):
        # Argmax over the per-choice logits gives the predicted choice index.
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_swag.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main():
    """Entry point: parse arguments, set up logging, and prepare training.

    Arguments come either from a single JSON file (when the script is invoked
    with exactly one ``*.json`` argument) or from the command line, split into
    ``ModelArguments``, ``DataTrainingArguments`` and ``TrainingArguments``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )
    # The checkpoint directory is derived from the hyperparameters so that
    # distinct configurations never collide on disk.
    checkpoint_dir = hyperparam_path_for_initializing_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    # Log-file suffix reflects the run mode (train takes precedence over eval).
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    elif training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix)
    # Redirect all Trainer outputs into the hyperparameter-specific directory.
    training_args.output_dir = checkpoint_dir
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log on each process the small summary:
    # NOTE(review): the two f-strings are concatenated without a separator, so
    # "n_gpu: Ndistributed training" runs together in the log line — cosmetic only.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)
    # Set seed before initializing model.
set_seed(training_args.seed) # Get the [datasets]: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). if data_args.dataset not in ['race', 'dream']: raise ValueError("Dataset should be race or dream.") else: if data_args.dataset == 'race': from mcmrc.data_utils.processors import prepare_features_for_reading_evidence if data_args.dataset == 'dream': pass # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. data_files = {} data_files[ 'train'] = data_args.train_file if data_args.train_file is not None else None data_files[ 'validation'] = data_args.validation_file if data_args.validation_file is not None else None data_files[ 'test'] = data_args.test_file if data_args.test_file is not None else None datasets = load_dataset( data_args.dataload_script, data_args.dataload_split, data_files=data_files if data_files['train'] is not None else None, data_dir=data_args.data_dir) # Load pretrained model and tokenizer # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
# (interior of main) Build model/tokenizer once, then sweep over evidence
# lengths: for each evidence_num the dataset is re-tokenized with that many
# evidence sentences and the model is trained/evaluated, with all metrics
# accumulated into per-evidence-length result dicts and written out at the end.
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    use_fast=model_args.use_fast_tokenizer,
)
model = AutoModelForMultipleChoice.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
)

# Column names are taken from whichever split will actually be used.
if training_args.do_train:
    column_names = datasets["train"].column_names
else:
    column_names = datasets["validation"].column_names

# Pseudo labels supply the evidence-ranking scores; the key depends on
# whether option-conditioned probabilities are used.
all_pseudo_label = load_pseudo_label(data_args.pseudo_label_path)
if data_args.run_pseudo_label_with_options:
    pseudo_logit = all_pseudo_label['options_prob_diff']
else:
    pseudo_logit = all_pseudo_label['logit']
acc = all_pseudo_label['acc']

# Data collator
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

# Metric
def compute_metrics(eval_predictions):
    """Multiple-choice accuracy over (logits, label_ids)."""
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {
        "accuracy": (preds == label_ids).astype(np.float32).mean().item()
    }

eval_on_dev = (data_args.eval_dataset == "all" or
               data_args.eval_dataset == "dev") and training_args.do_eval
eval_on_test = (data_args.eval_dataset == "all" or
                data_args.eval_dataset == "test") and training_args.do_eval

train_results = {}
eval_results = {}
test_results = {}

# NOTE(review): `model` is constructed once outside this loop, so each
# evidence length continues fine-tuning the same weights rather than starting
# from the pretrained checkpoint — confirm this cumulative training is intended.
for evidence_num in range(1, data_args.max_evidence_len + 1):
    prepare_features_fn = partial(
        prepare_features_for_reading_evidence,
        run_pseudo_label_with_options=data_args.run_pseudo_label_with_options,
        evidence_logits=pseudo_logit,
        evidence_len=evidence_num,
        tokenizer=tokenizer,
        data_args=data_args)
    tokenized_datasets = datasets.map(
        prepare_features_fn,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
        trainer.state.save_to_json(
            os.path.join(training_args.output_dir,
                         f"evidence_{evidence_num}_trainer_state.json"))
        # FIX: Trainer.train() returns a TrainOutput whose field is `metrics`
        # (plural); the previous `.metric` raised AttributeError at runtime.
        for key in list(train_result.metrics.keys()):
            train_results[
                f'evidence{evidence_num}_{key}'] = train_result.metrics[key]

    if eval_on_dev:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate(
            eval_dataset=tokenized_datasets["validation"])
        for key in list(results.keys()):
            eval_results[f'evidence{evidence_num}_{key}'] = results[key]

    if eval_on_test:
        logger.info("*** Test ***")
        results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
        for key in list(results.keys()):
            test_results[f'evidence{evidence_num}_{key}'] = results[key]

# Persist accumulated metrics. Only the main process writes each file
# (previously only the dev file was guarded; test/train writes raced in
# distributed runs — now consistent).
if eval_on_dev:
    output_eval_file = os.path.join(training_args.output_dir,
                                    "eval_results.txt")
    if trainer.is_world_process_zero():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in sorted(eval_results.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")

if eval_on_test:
    output_test_file = os.path.join(training_args.output_dir,
                                    "test_results.txt")
    if trainer.is_world_process_zero():
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in sorted(test_results.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")

if training_args.do_train:
    output_train_file = os.path.join(training_args.output_dir,
                                     "train_results.txt")
    if trainer.is_world_process_zero():
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_results.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
def main():
    """Fine-tune and/or evaluate an AutoModelForMultipleChoice on one task.

    Parses (ModelArguments, DataTrainingArguments, TrainingArguments) from the
    command line, builds the MultipleChoiceDataset splits, runs Trainer, and
    returns the evaluation metrics dict (empty when --do_eval is not set).
    """
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber an existing non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Resolve the task processor; unknown task names become a ValueError.
    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets (each split is built only when its phase is enabled).
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Accuracy over the argmax of the per-choice logits.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        # Resume from the checkpoint dir when the model path is a local directory.
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # Only the master process re-saves the tokenizer alongside the model.
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        # Only the master process writes the results file (and populates the
        # returned dict — other ranks return {}).
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
            results.update(result)
    return results
second_sentences = sum(second_sentences, []) # Tokenize tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) # Un-flatten return { k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items() } encoded_datasets = datasets.map(preprocess_function, batched=True, num_proc=2) model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint) args = TrainingArguments( "test-race", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, num_train_epochs=3, weight_decay=0.01, ) label_map = {"A": 0, "B": 1, "C": 2, "D": 3} @dataclass
def main():
    """Train/evaluate a RACE multiple-choice model ("global" model variant).

    Returns the evaluation metrics dict (empty when --do_eval is not set or
    this is not the rank-0 process).
    """
    # args
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # data
    processor = processors['race']()
    label_list = processor.get_labels()
    num_labels = len(label_list)
    # load model
    global_config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    global_model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=global_config,
        cache_dir=model_args.cache_dir,
    )
    # local_model = BertForMaskedLM.from_pretrained(
    # )
    # Get datasets (built lazily per enabled phase).
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Accuracy over the argmax of the per-choice logits.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=global_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    # Training
    if training_args.do_train:
        # Resume from checkpoint only when the model path is a local directory.
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)
    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info(" %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
        results.update(result)
    return results
def main() -> None:
    """Train a multiple-choice model on conversation paths from a convokit Corpus.

    Runs `train()` in loops of `args.loop_steps`, checkpointing every other
    loop and tracking the best (lowest) loss in the module-level `best_loss`.
    """
    global best_loss
    step = 0
    args = parser.parse_args()
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        # Deterministic cuDNN trades speed for reproducibility.
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Optionally restrict the corpus to an utterance index window.
    if args.start_index is not None or args.end_index is not None:
        start_index = args.start_index
        end_index = args.end_index
        if start_index is None:
            start_index = 0
        if end_index is None:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index)
        else:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index,
                            utterance_end_index=end_index)
    else:
        corpus = Corpus(filename=download(args.corpus))
    add_title_to_root(corpus)
    conversations = list(corpus.iter_conversations())
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    dataset = ConversationPathDataset(corpus,
                                      tokenizer,
                                      min_len=args.conversation_min,
                                      max_len=args.conversation_max,
                                      n_neighbors=args.num_neighbors,
                                      max_tokenization_len=args.utterance_max)
    # Batches group paths of equal length so tensors stack without padding paths.
    sampler = ConversationPathBatchSampler(args.batch_size, dataset.min_len,
                                           dataset.get_indices_by_len())
    loader = DataLoader(dataset,
                        batch_sampler=sampler,
                        collate_fn=conversation_path_collate_fn,
                        pin_memory=device.type != 'cpu',
                        num_workers=4)
    # utterance_encoder = AutoModel.from_pretrained(args.model_name)
    # conversation_encoder = nn.LSTM(utterance_encoder.config.hidden_size, args.hidden, args.num_layers)
    # model = ConversationClassificationHRNN(utterance_encoder, conversation_encoder, 1)
    # mlm_head = AutoModelForMaskedLM.from_pretrained(args.model_name).predictions
    model = AutoModelForMultipleChoice.from_pretrained(args.model_name)
    model.to(device)
    # mlm_head.to(device)
    criterion = nn.CrossEntropyLoss()
    # optimizer = AdamW(list(model.parameters()) + list(mlm_head.parameters()), args.learning_rate)
    optimizer = AdamW(list(model.parameters()), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * args.training_steps,
        num_training_steps=args.training_steps)
    scaler = GradScaler()
    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path, map_location=device)
            step = checkpoint['step']
            best_loss = checkpoint['best_loss']
            # NOTE(review): only the encoder weights are saved/restored via
            # `model.bert` — assumes a BERT-backboned model; confirm for other
            # architectures (e.g. RoBERTa exposes `.roberta`, not `.bert`).
            model.bert.load_state_dict(checkpoint['state_dict'])
            # mlm_head.load_state_dict(checkpoint['head_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (step {})".format(
                args.resume_path, checkpoint['step']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_path))
    while step < args.training_steps:
        # Last loop may be shorter so we land exactly on args.training_steps.
        loop_steps = args.loop_steps if args.training_steps - step > args.loop_steps else args.training_steps - step
        # loss = train(loader, model, mlm_head, criterion, optimizer, scheduler, scaler,
        #              device, loop_steps, step // args.loop_steps)
        loss = train(loader, model, criterion, optimizer, scheduler, scaler,
                     device, loop_steps, step // args.loop_steps)
        step += loop_steps
        # checkpoint model every k training loops
        k = 2
        if step % (k * args.loop_steps) == 0 or step == args.training_steps:
            # NOTE(review): `best_loss` must be initialized at module level or
            # restored from a checkpoint before this comparison.
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            run_name = '{}.{}.{}.{}.{}'.format(
                args.model_name.split('/')[-1], args.corpus,
                args.conversation_max, args.num_neighbors, args.utterance_max)
            # save_checkpoint({
            #     'step': step,
            #     'model': args.model_name,
            #     'state_dict': model.state_dict(),
            #     'head_state_dict': mlm_head.state_dict(),
            #     'best_loss': best_loss,
            #     'optimizer': optimizer.state_dict(),
            #     'scheduler': scheduler.state_dict()
            # }, is_best, run_name)
            save_checkpoint(
                {
                    'step': step,
                    'model': args.model_name,
                    'state_dict': model.bert.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, is_best, run_name)
def main():
    """Accelerate-based SWAG training entry point: environment + data setup.

    Initializes the Accelerator (with tracking when requested), configures
    per-process logging, optionally prepares a Hub repository, and loads the
    raw datasets from the hub or local CSV/JSON files.
    """
    args = parse_args()
    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
    accelerator = Accelerator(
        log_with="all",
        logging_dir=args.output_dir) if args.with_tracking else Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    # Non-main processes keep only error-level library logging.
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
            # NOTE(review): `"step_*" not in gitignore` tests membership over
            # the file's lines; it only matches a line that equals "step_*"
            # exactly — confirm this is the intended containment check.
            with open(os.path.join(args.output_dir, ".gitignore"),
                      "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        # The file extension selects the datasets loading script (csv/json/...).
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"
    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
# (interior of main) Resolve config, tokenizer and model, each falling back
# from an explicit CLI name to the model path, or to a from-scratch instance.
if args.config_name:
    # FIX: this branch previously loaded from args.model_name_or_path,
    # silently ignoring an explicitly supplied --config_name.
    config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
    config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
    config = CONFIG_MAPPING[args.model_type]()
    logger.warning(
        "You are instantiating a new config instance from scratch.")

if args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
    # A tokenizer cannot be trained from scratch here.
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

if args.model_name_or_path:
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )
else:
    logger.info("Training new model from scratch")
    model = AutoModelForMultipleChoice.from_config(config)

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
# (interior of main) Tokenize the datasets, build dataloaders, optimizer and
# LR scheduler, wrap everything with the accelerator, and set up trackers.
padding = "max_length" if args.pad_to_max_length else False

def preprocess_function(examples):
    """Expand each example into its 4 (context, header+ending) pairs, tokenize
    them flat, then regroup into per-example lists of 4 encodings."""
    first_sentences = [[context] * 4 for context in examples[context_name]]
    question_headers = examples[question_header_name]
    second_sentences = [[
        f"{header} {examples[end][i]}" for end in ending_names
    ] for i, header in enumerate(question_headers)]
    labels = examples[label_column_name]
    # Flatten out
    first_sentences = list(chain(*first_sentences))
    second_sentences = list(chain(*second_sentences))
    # Tokenize
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        max_length=args.max_length,
        padding=padding,
        truncation=True,
    )
    # Un-flatten
    tokenized_inputs = {
        k: [v[i:i + 4] for i in range(0, len(v), 4)]
        for k, v in tokenized_examples.items()
    }
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

with accelerator.main_process_first():
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    logger.info(
        f"Sample {index} of the training set: {train_dataset[index]}.")

# DataLoaders creation:
if args.pad_to_max_length:
    # If padding was already done ot max length, we use the default data collator that will just convert everything
    # to tensors.
    data_collator = default_data_collator
else:
    # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
    # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
    # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    data_collator = DataCollatorForMultipleChoice(
        tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              collate_fn=data_collator,
                              batch_size=args.per_device_train_batch_size)
eval_dataloader = DataLoader(eval_dataset,
                             collate_fn=data_collator,
                             batch_size=args.per_device_eval_batch_size)

# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": args.weight_decay,
    },
    {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

# Use the device given by the `accelerator` object.
device = accelerator.device
model.to(device)

# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
    args.num_train_epochs = math.ceil(args.max_train_steps /
                                      num_update_steps_per_epoch)

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)

# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / args.gradient_accumulation_steps)
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

# Figure out how many steps we should save the Accelerator states
# (`hasattr(..., "isdigit")` is effectively "is a string" — numeric strings
# become an int interval; "epoch" stays a string and is handled per-epoch).
if hasattr(args.checkpointing_steps, "isdigit"):
    checkpointing_steps = args.checkpointing_steps
    if args.checkpointing_steps.isdigit():
        checkpointing_steps = int(args.checkpointing_steps)
else:
    checkpointing_steps = None

# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
    experiment_config = vars(args)
    # TensorBoard cannot log Enums, need the raw value
    experiment_config["lr_scheduler_type"] = experiment_config[
        "lr_scheduler_type"].value
    accelerator.init_trackers("swag_no_trainer", experiment_config)

# Metrics
metric = load_metric("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(
    f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
)
logger.info(
    f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
    f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
# (interior of main) Training/evaluation loop with checkpoint resume,
# per-step/per-epoch state saving, Hub pushes, and final model export.
progress_bar = tqdm(range(args.max_train_steps),
                    disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0

# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
    # FIX: the condition previously used `or`, which is always true and made
    # the "most recent checkpoint" fallback below unreachable dead code.
    if args.resume_from_checkpoint is not None and args.resume_from_checkpoint != "":
        accelerator.print(
            f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        path = os.path.basename(args.resume_from_checkpoint)
    else:
        # Get the most recent checkpoint
        dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
        dirs.sort(key=os.path.getctime)
        # Sorts folders by date modified, most recent checkpoint is the last
        path = dirs[-1]
    # Extract `epoch_{i}` or `step_{i}` to know where to resume from.
    training_difference = os.path.splitext(path)[0]

    if "epoch" in training_difference:
        starting_epoch = int(training_difference.replace("epoch_", "")) + 1
        resume_step = None
    else:
        resume_step = int(training_difference.replace("step_", ""))
        starting_epoch = resume_step // len(train_dataloader)
        # Remaining steps within the partially completed epoch.
        resume_step -= starting_epoch * len(train_dataloader)

for epoch in range(starting_epoch, args.num_train_epochs):
    model.train()
    if args.with_tracking:
        total_loss = 0
    for step, batch in enumerate(train_dataloader):
        # We need to skip steps until we reach the resumed step
        if args.resume_from_checkpoint and epoch == starting_epoch:
            if resume_step is not None and step < resume_step:
                completed_steps += 1
                continue
        outputs = model(**batch)
        loss = outputs.loss
        # We keep track of the loss at each epoch
        if args.with_tracking:
            total_loss += loss.detach().float()
        loss = loss / args.gradient_accumulation_steps
        accelerator.backward(loss)
        # Step the optimizer on accumulation boundaries and at epoch end.
        if step % args.gradient_accumulation_steps == 0 or step == len(
                train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if isinstance(checkpointing_steps, int):
            if completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps}"
                if args.output_dir is not None:
                    output_dir = os.path.join(args.output_dir, output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= args.max_train_steps:
            break

    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = accelerator.gather(
            (predictions, batch["labels"]))
        # If we are in a multiprocess environment, the last batch has duplicates
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                predictions = predictions[:len(eval_dataloader.dataset) -
                                          samples_seen]
                references = references[:len(eval_dataloader.dataset) -
                                        samples_seen]
            else:
                samples_seen += references.shape[0]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    accelerator.print(f"epoch {epoch}: {eval_metric}")

    if args.with_tracking:
        accelerator.log(
            {
                "accuracy": eval_metric,
                "train_loss": total_loss,
                "epoch": epoch,
                "step": completed_steps
            }, )

    # Push intermediate checkpoints (final epoch is pushed after training).
    if args.push_to_hub and epoch < args.num_train_epochs - 1:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}",
                blocking=False,
                auto_lfs_prune=True)

    if args.checkpointing_steps == "epoch":
        output_dir = f"epoch_{epoch}"
        if args.output_dir is not None:
            output_dir = os.path.join(args.output_dir, output_dir)
        accelerator.save_state(output_dir)

# Final save: unwrap the accelerator-prepared model and export it (main
# process also saves the tokenizer, pushes to the Hub and dumps metrics).
if args.output_dir is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        args.output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(args.output_dir)
        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training",
                             auto_lfs_prune=True)
        with open(os.path.join(args.output_dir, "all_results.json"),
                  "w") as f:
            json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
def main():
    """Train/evaluate a multiple-choice model with AdapterFusion over three fixed adapters.

    Parses (ModelArguments, DataTrainingArguments, TrainingArguments, AdapterArguments)
    from the CLI or a single JSON file, loads three pre-trained task adapters
    (mnli, commonsenseqa, conceptnet), adds a dynamic fusion layer over them,
    trains only the fusion parameters, and returns the evaluation results dict.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               TrainingArguments, AdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses(
        )
    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            f" Use --overwrite_output_dir to overcome.")
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    # Set seed
    set_seed(training_args.seed)
    try:
        # Task processor supplies the label set; unknown task names fail fast.
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    # Setup adapters
    from transformers.adapter_config import AdapterType
    # base_model = getattr(model, model.base_model_prefix, model)
    # base_model.set_adapter_config(AdapterType.text_task, adapter_args.adapter_config)
    from transformers.adapter_config import PfeifferConfig
    # NOTE(review): the three adapter paths below are hard-coded, user-specific
    # absolute paths — parameterize before reusing this script elsewhere.
    model.load_adapter("/home/theorist17/projects/adapter/adapters/MNLI/mnli",
                       "text_task",
                       config=PfeifferConfig(),
                       with_head=False)
    model.load_adapter(
        "/home/theorist17/projects/adapter/adapters/commonsenseqa/commonsenseqa",
        "text_task",
        config=PfeifferConfig(),
        with_head=False)
    model.load_adapter(
        "/home/theorist17/projects/adapter/adapters/conceptnet/conceptnet",
        "text_task",
        config=PfeifferConfig(),
        with_head=False)
    # One fusion group spanning all three loaded adapters.
    adapter_names = [["mnli", "commonsenseqa", "conceptnet"]]
    model.add_fusion(adapter_names[0], "dynamic")
    #model.base_model.set_active_adapters(adapter_names)
    #model.train_fusion(adapter_names)
    # Freeze everything except the fusion layer parameters.
    model.train_fusion(adapter_names)
    # inspect parameters of the fusion layer
    for (n, p) in model.named_parameters():
        print(n, p.requires_grad)
    # Get datasets
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def simple_accuracy(preds, labels):
        # Fraction of predictions equal to the gold labels.
        return (preds == labels).mean()

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Argmax over the 4 choice logits -> predicted choice index.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer — save only the adapter fusion, not the full model.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
        adapter_names=adapter_names,
    )
    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)
    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                eval_results.update(result)
    return eval_results
def main():
    """Legacy argparse entry point for SWAG-style multiple-choice fine-tuning.

    Handles CLI parsing, (optionally distributed) device setup, training via the
    module-level `train()` helper, saving/reloading the fine-tuned model, and
    evaluation over one or all checkpoints. Returns the merged results dict.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        # Single-process mode: use all visible GPUs (or CPU with --no_cuda).
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Save the trained model and the tokenizer
        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            logger.info("Saving model checkpoint to %s", args.output_dir)
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

            # Good practice: save your training arguments together with the trained model
            torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

            # Load a trained model and vocabulary that you have fine-tuned
            model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
            tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
            model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            checkpoints = [args.output_dir]
        else:
            # if do_train is False and do_eval is true, load model directly from pretrained.
            checkpoints = [args.model_name_or_path]
        if args.eval_all_checkpoints:
            # Sweep every checkpoint-*/ subdirectory that contains model weights.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

            # Suffix metric keys with the checkpoint step when sweeping several.
            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    return results
def main():
    """Fine-tune and/or evaluate a multiple-choice model on SWAG with the HF Trainer.

    Parses (ModelArguments, DataTrainingArguments, TrainingArguments) from the
    CLI or a single JSON file, loads user CSV/JSON files or the `swag` hub
    dataset, tokenizes 4-choice examples, trains (resuming from the last
    checkpoint when present), evaluates, and pushes to the Hub or writes a
    model card. Raises ValueError on a non-empty, non-overwritable output dir.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    # FIX: the two f-strings were previously concatenated without a separator,
    # gluing "...n_gpu: N" directly onto "distributed training: ..." in the log.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            # Non-empty output dir with no resumable checkpoint: refuse to clobber.
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # BUG FIX: derive the loader type from whichever file was actually
        # supplied. The original read `data_args.train_file` unconditionally,
        # crashing with AttributeError when only --validation_file was passed.
        any_file = (data_args.train_file if data_args.train_file is not None
                    else data_args.validation_file)
        extension = any_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset("swag",
                                    "regular",
                                    cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # SWAG column layout: one context sentence, a question header, 4 endings.
    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            # FIX: added the missing space between "the" and "model" in the
            # original two-part warning string.
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    # Preprocessing the datasets.
    def preprocess_function(examples):
        # Repeat each context 4 times (one per candidate ending) and pair it
        # with "header + ending"; flatten so the tokenizer sees plain pairs.
        first_sentences = [[context] * 4
                           for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten: regroup into lists of 4 choices per example.
        return {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    # Data collator: dynamic per-batch padding unless everything was already
    # padded to max_length above.
    data_collator = (default_data_collator
                     if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(
                         tokenizer=tokenizer,
                         pad_to_multiple_of=8 if training_args.fp16 else None))

    # Metric
    def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            # Explicit checkpoint wins over the auto-detected one.
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    kwargs = dict(
        finetuned_from=model_args.model_name_or_path,
        tasks="multiple-choice",
        dataset_tags="swag",
        dataset_args="regular",
        dataset="SWAG",
        language="en",
    )
    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
def main():
    """Train/evaluate a multiple-choice model with optional task/language adapters.

    Parses (ModelArguments, DataTrainingArguments, TrainingArguments,
    MultiLingAdapterArguments); when --train_adapter is set, loads or adds a
    task adapter (and optionally a language adapter), freezes the base model,
    and saves only the adapters. Returns the evaluation results dict.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               TrainingArguments, MultiLingAdapterArguments))
    model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        # Task processor supplies the label set; unknown task names fail fast.
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Setup adapters
    if adapter_args.train_adapter:
        task_name = data_args.task_name
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters.adapter_list(AdapterType.text_task):
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    AdapterType.text_task,
                    config=adapter_config,
                    load_as=task_name,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, AdapterType.text_task, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                AdapterType.text_lang,
                config=lang_adapter_config,
                load_as=adapter_args.language,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass
        if lang_adapter_name:
            model.set_active_adapters([lang_adapter_name, task_name])
        else:
            model.set_active_adapters([task_name])

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Argmax over choice logits; `simple_accuracy` is a module-level helper.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer — save adapters (not the full model) when training adapters.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                results.update(result)
    return results
def main():
    """Train a multiple-choice model and predict on a (possibly perturbed) test set.

    Trains on perturbation-augmented data when --do_train is set, then always
    runs prediction on the test split, dumping predictions, gold labels,
    example ids, and perturbation metadata as torch files in the output dir.
    Returns the test metrics dict.
    """
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        # Task processor supplies the label set; unknown task names fail fast.
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Training data carries the perturbation/augmentation settings.
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        perturbation_type=data_args.perturbation_type,
        perturbation_num=data_args.perturbation_num_train,
        augment=data_args.augment,
        name_gender_or_race=data_args.name_gender_or_race,
    ) if training_args.do_train else None)
    # NOTE(review): eval_dataset is gated on `do_train`, not `do_eval` — the
    # dev set here only feeds in-training evaluation (this script never calls
    # trainer.evaluate()), but confirm that gating is intentional.
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_train else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Argmax over choice logits; `simple_accuracy` is a module-level helper.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Test — always runs, using the (perturbed) test split.
    test_dataset = MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        perturbation_type=data_args.perturbation_type,
        perturbation_num=data_args.perturbation_num_test,
        augment=data_args.augment,
        name_gender_or_race=data_args.name_gender_or_race,
    )
    predictions, label_ids, metrics = trainer.predict(test_dataset)

    # Persist raw predictions and gold labels for offline analysis.
    predictions_file = os.path.join(training_args.output_dir,
                                    "test_predictions")
    labels_ids_file = os.path.join(training_args.output_dir, "test_labels_id")
    torch.save(predictions, predictions_file)
    torch.save(label_ids, labels_ids_file)

    # Collect per-example metadata (ids, perturbation flags, run ids) so each
    # prediction can be matched back to its source example.
    examples_ids = []
    perturbated = []
    run = []
    for input_feature in test_dataset.features:
        examples_ids.append(input_feature.example_id)
    for examples in test_dataset.examples:
        perturbated.append(examples.perturbated)
        run.append(examples.run)
    examples_ids_file = os.path.join(training_args.output_dir, "examples_ids")
    torch.save(examples_ids, examples_ids_file)
    perturbated_file = os.path.join(training_args.output_dir, "perturbated")
    torch.save(perturbated, perturbated_file)
    run_file = os.path.join(training_args.output_dir, "run")
    torch.save(run, run_file)

    output_eval_file = os.path.join(training_args.output_dir,
                                    "test_results.txt")
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in metrics.items():
                logger.info(" %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
    return metrics
def main() -> None:
    """Driver for the two-stage evidence-selector / answer-verifier pipeline.

    Parses arguments (CLI or a single JSON file), loads the RACE or DREAM
    dataset, builds three models — an evidence selector (sequence
    classification), an answer verifier and an evidence reader (both
    multiple choice) — plus one Trainer per model, then, depending on the
    flags carried by ``training_args``:
      * trains and/or evaluates the evidence selector,
      * generates per-option evidence logits with the selector,
      * prepares verifier features from those logits,
      * trains and/or evaluates the answer verifier (optionally on the
        explanation / adversarial RACE splits).
    All artifacts are written under a hyperparameter-derived checkpoint dir.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, AllTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # All artifacts (logs, checkpoints, result files) live under a directory
    # derived from the chosen hyperparameters.
    checkpoint_dir = hyperparam_path_for_two_stage_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.train_evidence_selector or training_args.train_answer_verifier:
        postfix += "_train"
    else:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    # NOTE(review): the two f-strings below are concatenated without a
    # separator, so "n_gpu: X" runs straight into "distributed training" in
    # the emitted log line.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Model parameters %s", model_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the [datasets]: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    if training_args.eval_on_exp_race and data_args.exp_race_file is None and data_args.dataset == 'race':
        raise ValueError("exp_race_file must be specified")
    if data_args.dataset == 'dream':
        # The explanation-annotated split only exists for RACE, so force the
        # flag off for DREAM.
        training_args.eval_on_exp_race = False

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    if data_args.debug_mode:
        # Debug mode: only the first 5 examples of each split.
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir,
                                split={
                                    'train': ReadInstruction('train', from_=0, to=5, unit='abs'),
                                    'validation': ReadInstruction('validation', from_=0, to=5, unit='abs'),
                                    'test': ReadInstruction('test', from_=0, to=5, unit='abs')
                                })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)
    if training_args.eval_on_exp_race:
        # Extra split with human-annotated explanations.
        datasets['exp'] = Dataset.from_dict(
            load_exp_race_data(data_args.exp_race_file))
    if training_args.eval_on_adv_race:
        # One extra split per adversarial-RACE subdirectory.
        for subset in os.listdir(data_args.adv_race_path):
            datasets[subset] = Dataset.from_dict(
                load_adv_race_data(
                    os.path.join(data_args.adv_race_path, subset,
                                 "test_dis.json")))

    # Load pretrained model and tokenizer
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    # Each component can come from its own checkpoint; otherwise fall back to
    # the shared base model.
    evidence_selector_path = model_args.evidence_selector_path \
        if model_args.evidence_selector_path else model_args.model_name_or_path
    answer_verifier_path = model_args.answer_verifier_path \
        if model_args.answer_verifier_path else model_args.model_name_or_path
    evidence_reader_path = model_args.evidence_reader_path \
        if model_args.evidence_reader_path else model_args.model_name_or_path
    evidence_selector_config = AutoConfig.from_pretrained(
        evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier_config = AutoConfig.from_pretrained(
        answer_verifier_path,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader_config = AutoConfig.from_pretrained(
        evidence_reader_path,
        cache_dir=model_args.cache_dir,
    )
    evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        evidence_selector_path,
        config=evidence_selector_config,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier = AutoModelForMultipleChoice.from_pretrained(
        answer_verifier_path,
        config=answer_verifier_config,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        evidence_reader_path,
        config=evidence_reader_config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.train_evidence_selector:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    # Bind tokenizer/data-args (and per-task options) into the feature
    # functions so `datasets.map` can call them with examples only.
    pprepare_features_for_initializing_evidence_selector = partial(
        prepare_features_for_initializing_evidence_selector,
        evidence_sampling_num=data_args.evidence_sampling_num,
        tokenizer=tokenizer,
        data_args=data_args,
        pseudo_label_path=data_args.pseudo_label_path)
    pprepare_features_for_generating_optionwise_evidence = partial(
        prepare_features_for_generating_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_reading_optionwise_evidence = partial(
        prepare_features_for_reading_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_answer_verifier = partial(
        prepare_features_for_answer_verifier,
        evidence_len=data_args.verifier_evidence_len,
        train_verifier_with_option=data_args.train_verifier_with_option,
        train_verifier_with_non_overlapping_evidence=data_args.train_verifier_with_non_overlapping_evidence,
        tokenizer=tokenizer,
        data_args=data_args)
    pprepare_features_for_multiple_choice = partial(prepare_features,
                                                    tokenizer=tokenizer,
                                                    data_args=data_args)

    # One Trainer per model. `training_args.num_train_epochs` is mutated
    # before each construction so selector and verifier can use different
    # epoch counts while sharing the same TrainingArguments instance.
    training_args.num_train_epochs = training_args.num_train_selector_epochs
    selector_trainer = Trainer(
        model=evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    training_args.num_train_epochs = training_args.num_train_verifier_epochs
    verifier_trainer = Trainer(
        model=answer_verifier,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    if training_args.train_evidence_selector and training_args.train_answer_verifier:
        # Keep the two stages' checkpoints in separate subdirectories.
        selector_trainer.checkpoint_dir = os.path.join(
            training_args.output_dir, "evidence_selector")
        verifier_trainer.checkpoint_dir = os.path.join(
            training_args.output_dir, "answer_verifier")
    # NOTE(review): `mc_trainer` is never referenced again in this function —
    # confirm it is needed (the evidence reader is always passed explicitly
    # to the selector/verifier trainers below).
    mc_trainer = Trainer(
        model=evidence_reader,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    if training_args.eval_answer_verifier:
        # Plain multiple-choice features for every split (used by the
        # verifier evaluation with an explicit reader).
        multiple_choice_datasets = {
            k: datasets[k].map(
                pprepare_features_for_multiple_choice,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
        }
    if training_args.train_evidence_selector or training_args.eval_evidence_selector:
        # Skip the (large) train split unless we actually train the selector.
        train_evidence_selector_datasets = {
            k: datasets[k].map(
                pprepare_features_for_initializing_evidence_selector,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_evidence_selector
        }

    if training_args.train_evidence_selector:
        logger.info("**** Train Evidence Selector ****")
        selector_trainer.train_dataset = train_evidence_selector_datasets["train"]
        selector_trainer.eval_dataset = train_evidence_selector_datasets["validation"]
        train_result = selector_trainer.train()
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Evidence selector train results *****")
            # NOTE(review): header is written without a trailing newline.
            writer.write("***** Evidence selector train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    if training_args.eval_evidence_selector:
        logger.info("**** Evaluate Evidence Selector ****")
        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            # Intrinsic metrics on the selector's own classification task...
            results = selector_trainer.evaluate(
                train_evidence_selector_datasets[split]).metrics
            # ...plus extrinsic metrics: feed the selected evidence to the
            # explicit reader and score the downstream answers.
            fulleval_results, all_evidence_sentences = selector_trainer.evaluate_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                eval_dataset=datasets[split],
                feature_func_for_evidence_reading=pprepare_features_for_reading_optionwise_evidence,
                feature_func_for_evidence_generating=pprepare_features_for_generating_optionwise_evidence)
            metrics = {**results, **fulleval_results}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_selector_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Evidence Selector Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
            output_evidence_file = os.path.join(training_args.output_dir,
                                                f"{split}_evidence.json")
            with open(output_evidence_file, "w") as f:
                json.dump(all_evidence_sentences, f)

    # generate evidence logits
    if training_args.train_answer_verifier:
        evidence_logits = {
            k: selector_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items()
        }
    elif training_args.eval_answer_verifier:
        # Evaluation only: the train split is not needed.
        evidence_logits = {
            k: selector_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items() if k != "train"
        }
    # NOTE(review): if neither verifier flag is set, `evidence_logits` is
    # never bound and the dump below raises NameError — presumably this
    # should sit under the same guard; confirm intended placement.
    output_evidence_logits_file = os.path.join(training_args.output_dir,
                                               f"evidence_logits.json")
    with open(output_evidence_logits_file, "w") as f:
        json.dump(evidence_logits, f)

    # prepare features for answer verifier
    if training_args.train_answer_verifier or training_args.eval_answer_verifier:
        logger.info("**** preparing features for answer verifier ****")
        train_answer_verifier_datasets = {}
        evidence_sentences = {}
        for split in datasets.keys():
            if not training_args.train_answer_verifier and split == 'train':
                continue
            verifier_dataset = datasets[split].map(
                partial(pprepare_features_for_answer_verifier,
                        evidence_logits=evidence_logits[split]),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            # Per example-id: one (logit, sentence) list per answer option.
            # NOTE(review): this comprehension rebinds the outer
            # `evidence_sentences` accumulator, and the assignment a few
            # lines below (`evidence_sentences[split] = evidence_sentences`)
            # then stores the dict inside itself, so the json.dump of it
            # would fail on a circular reference. Looks like the
            # comprehension was meant to use a distinct local name — confirm.
            evidence_sentences = {
                eid: [[(logit, sent)
                       for sent, logit in zip(option_sents, option_logits)]
                      for option_sents, option_logits in zip(
                          evidence_sent, evidence_logit)]
                for eid, evidence_sent, evidence_logit in zip(
                    verifier_dataset['example_ids'],
                    verifier_dataset['evidence_sentence'],
                    verifier_dataset['evidence_logit'])
            }
            # Raw evidence columns are no longer needed for training.
            train_answer_verifier_datasets[split] = verifier_dataset.remove_columns(
                ["evidence_sentence", "evidence_logit"])
            evidence_sentences[split] = evidence_sentences
        output_evidence_file = os.path.join(training_args.output_dir,
                                            f"all_evidence.json")
        with open(output_evidence_file, "w") as f:
            json.dump(evidence_sentences, f)

    if torch.cuda.is_available():
        # The selector is no longer needed; free its GPU memory before
        # training/evaluating the verifier.
        logger.info("**** release evidence selector ****")
        del selector_trainer
        del evidence_selector
        torch.cuda.empty_cache()

    if training_args.train_answer_verifier:
        logger.info("**** Train answer verifier ****")
        verifier_trainer.train_dataset = train_answer_verifier_datasets["train"]
        verifier_trainer.eval_dataset = train_answer_verifier_datasets["validation"]
        train_result = verifier_trainer.train()
        output_train_file = os.path.join(training_args.output_dir,
                                         "verifier_train_results.txt")
        with open(output_train_file, "a+") as writer:
            logger.info("***** Verifier Train results *****")
            writer.write("***** Verifier Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    # Evaluation
    # To use the best checkpoint model at end, use the aruguments
    # load_best_model_at_end, metric_for_best_model, evaluation_strategy steps
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    if training_args.eval_answer_verifier:
        eval_sets = ["validation", "test"]
        if training_args.eval_on_exp_race and data_args.dataset == "race":
            eval_sets.append("exp")
        if training_args.eval_on_adv_race and data_args.dataset == "race":
            eval_sets += ['charSwap', 'AddSent', 'DE', 'DG', 'Orig']
        for split in eval_sets:
            logger.info(f"*** Evaluate Answer Verifier on {split} set ***")
            metrics, predictions = verifier_trainer.evaluate_answer_verifier_with_explicit_reader(
                evidence_reader=evidence_reader,
                multiple_choice_dataset=multiple_choice_datasets[split],
                answer_verifier_dataset=train_answer_verifier_datasets[split])
            output_prediction_file = os.path.join(
                training_args.output_dir, f"{split}_verifier_predictions.json")
            with open(output_prediction_file, "w") as f:
                json.dump(predictions, f)
            if training_args.eval_on_exp_race and split == "exp":
                # Score answers AND evidence against the annotated
                # explanation file for each reader/verifier merge ratio.
                ground_truth_file = json.load(
                    open(data_args.exp_race_file, 'rb'))
                for ratio, merge_prediction in predictions.items():
                    prediction_file = {}
                    for eid, probs in merge_prediction.items():
                        pred_option = np.argmax(probs)
                        # Best-scored evidence sentence for the chosen option.
                        pred_evidence = sorted(
                            evidence_sentences['exp'][eid][pred_option],
                            key=lambda x: x[0],
                            reverse=True)[0][1]
                        prediction_file[eid] = {
                            "answer": chr(pred_option + ord("A")),
                            "evidence": pred_evidence
                        }
                    all_f1, ans_f1, evi_f1, total_count, skip_count = evaluate_multi_choice(
                        ground_truth_file, prediction_file)
                    metrics[f"merge_{ratio}_all_f1"] = all_f1
                    metrics[f"merge_{ratio}_ans_f1"] = ans_f1
                    metrics[f"merge_{ratio}_evi_f1"] = evi_f1
                    metrics[f"merge_{ratio}_total_count"] = total_count
                    metrics[f"merge_{ratio}_skip_count"] = skip_count
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_verifier_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info(f"***** Eval {split} results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
model, config, tokenizer, preprocess_datasets = objects[0], objects[1], objects[2], objects[3] model.to(device) # DataLoaders creation: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. if args.train: total_steps = train(args, model, preprocess_datasets['train'], preprocess_datasets['dev'], device, tokenizer) if (not args.train) and args.eval: # only evaluation without train model = set_model_distributed(args, model) loss, eval_metric = evaluation(args, model, preprocess_datasets['dev'], device) if args.predict: if args.local_rank in [-1, 0]: logger.info("Predict on validation data...") if args.ensemble_models is not None: args.ensemble_models = args.ensemble_models.split(',') all_predictions = [] for model_path in args.ensemble_models: model = AutoModelForMultipleChoice.from_pretrained(model_path) model.to(device) current_predictions, current_logits = predict(args, model, preprocess_datasets['predict'], device, return_logit=True) all_predictions.append(current_logits) # model_number, example_num, 4 predictions = np.mean(all_predictions, 0).argmax(1) else: # get model on single gpu model = model.module if hasattr(model, "module") else model predictions = predict(args, model, preprocess_datasets['predict'], device) write_to_csv(preprocess_datasets['predict'], predictions, args.predict_out)
def run_multiple_choice(self, model_name, task_name, fp16):
    """Fine-tune and evaluate a multiple-choice model (SWAG) through ONNX Runtime.

    Builds model/data/training arguments from attributes on ``self``, loads
    the pretrained model and SWAG datasets, describes the model's inputs and
    outputs for ONNX Runtime, trains via ``ORTTransformerTrainer``, and
    returns the evaluation metrics.

    Args:
        model_name: pretrained model identifier or path; also decides the
            input description ('bert*' models take ``token_type_ids``).
        task_name: multiple-choice task name (used for paths and logging).
        fp16: whether to enable 16-bit training.

    Returns:
        Dict of evaluation metrics (empty if evaluation did not run on
        this rank).
    """
    model_args = ModelArguments(model_name_or_path=model_name,
                                cache_dir=self.cache_dir)
    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=self.data_dir,
                                      max_seq_length=self.max_seq_length)
    training_args = TrainingArguments(
        output_dir=os.path.join(self.output_dir, task_name),
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=self.train_batch_size,
        per_gpu_eval_batch_size=self.eval_batch_size,
        learning_rate=self.learning_rate,
        num_train_epochs=self.num_train_epochs,
        local_rank=self.local_rank,
        overwrite_output_dir=self.overwrite_output_dir,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        fp16=fp16,
        logging_steps=self.logging_steps)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Seed both torch/transformers and the ONNX Runtime training backend.
    set_seed(training_args.seed)
    onnxruntime.set_seed(training_args.seed)

    try:
        processor = SwagProcessor()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        processor=processor,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        processor=processor,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        # Accuracy over the argmax of the per-choice logits.
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # ONNX Runtime needs an explicit description of the model's inputs and
    # outputs. BERT-style models additionally take `token_type_ids`; their
    # description uses fixed (batch, num_choices, seq_len) shapes, while the
    # generic branch uses symbolic batch/sequence dimensions.
    if model_name.startswith('bert'):
        model_desc = ModelDescription([
            IODescription('input_ids', [
                self.train_batch_size, num_labels, data_args.max_seq_length
            ],
                          torch.int64,
                          num_classes=model.config.vocab_size),
            IODescription('attention_mask', [
                self.train_batch_size, num_labels, data_args.max_seq_length
            ],
                          torch.int64,
                          num_classes=2),
            IODescription('token_type_ids', [
                self.train_batch_size, num_labels, data_args.max_seq_length
            ],
                          torch.int64,
                          num_classes=2),
            IODescription('labels', [self.train_batch_size, num_labels],
                          torch.int64,
                          num_classes=num_labels)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('reshaped_logits',
                          [self.train_batch_size, num_labels], torch.float32)
        ])
    else:
        model_desc = ModelDescription([
            IODescription('input_ids',
                          ['batch', num_labels, 'max_seq_len_in_batch'],
                          torch.int64,
                          num_classes=model.config.vocab_size),
            IODescription('attention_mask',
                          ['batch', num_labels, 'max_seq_len_in_batch'],
                          torch.int64,
                          num_classes=2),
            IODescription('labels', ['batch', num_labels],
                          torch.int64,
                          num_classes=num_labels)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('reshaped_logits', ['batch', num_labels],
                          torch.float32)
        ])

    # Initialize the ORTTrainer within ORTTransformerTrainer
    trainer = ORTTransformerTrainer(
        model=model,
        model_desc=model_desc,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()

    # Evaluation (main/world process only)
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        logger.info("***** Eval results {} *****".format(
            data_args.task_name))
        for key, value in result.items():
            logger.info(" %s = %s", key, value)
        results.update(result)
    return results
def main() -> None:
    """Train/evaluate a simple (single-stage) evidence selector.

    Parses arguments (CLI or a single JSON file), loads the RACE or DREAM
    dataset, builds the evidence selector (sequence classification) and an
    evidence reader (multiple choice), maps the feature-preparation
    functions over the dataset splits, then trains the selector and/or
    evaluates it — both intrinsically and by feeding the selected evidence
    to the explicit reader. Results are written under a
    hyperparameter-derived output directory.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    checkpoint_dir = hyperparam_path_for_initializing_evidence_selector(model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    elif training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    # NOTE(review): the two f-strings are concatenated without a separator,
    # so "n_gpu: X" runs straight into "distributed training" in the log.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the [datasets]: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        if data_args.dataset == 'race':
            # Task-specific feature functions are imported lazily so the
            # DREAM path does not require them.
            from mcmrc.data_utils.processors import prepare_features_for_initializing_simple_evidence_selector, \
                prepare_features_for_generating_evidence_using_selector, prepare_features_for_reading_evidence
        if data_args.dataset == 'dream':
            pass

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    data_files = {}
    data_files['train'] = data_args.train_file if data_args.train_file is not None else None
    data_files['validation'] = data_args.validation_file if data_args.validation_file is not None else None
    data_files['test'] = data_args.test_file if data_args.test_file is not None else None
    # Explicit files win over data_dir; the data_files mapping is only passed
    # when a train file was given.
    datasets = load_dataset(data_args.dataload_script,
                            data_args.dataload_split,
                            data_files=data_files if data_files['train'] is not None else None,
                            data_dir=data_args.data_dir)

    # Load pretrained model and tokenizer
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if data_args.train_with_adversarial_examples:
        # Third label class for adversarial examples.
        config.num_labels = 3
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    # NOTE(review): the reader is loaded from `evidence_reader_path` but
    # reuses the selector's `config` (possibly with num_labels=3) and keys
    # `from_tf` on `model_name_or_path` rather than the reader path — looks
    # like a copy-paste; confirm intended.
    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        model_args.evidence_reader_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    # Bind tokenizer/data-args into the feature functions so `map` can call
    # them with examples only.
    pprepare_features_for_initializing_evidence_selector = partial(prepare_features_for_initializing_simple_evidence_selector,
                                                                   evidence_len=data_args.evidence_len,
                                                                   tokenizer=tokenizer,
                                                                   data_args=data_args,
                                                                   pseudo_label_path=data_args.pseudo_label_path)

    initializing_evidence_selector_datasets = datasets.map(
        pprepare_features_for_initializing_evidence_selector,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    pprepare_features_for_generating_evidence_using_selector = partial(prepare_features_for_generating_evidence_using_selector,
                                                                       tokenizer=tokenizer,
                                                                       data_args=data_args)

    # Evidence-generation features are only needed for the eval splits.
    evidence_generating_datasets = {k: datasets[k].map(
        pprepare_features_for_generating_evidence_using_selector,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    ) for k in datasets.keys() if k != "train"}

    pprepare_features_for_reading_evidence = partial(prepare_features_for_reading_evidence,
                                                     pseudo_label_or_not=False,
                                                     tokenizer=tokenizer,
                                                     data_args=data_args)

    # Data collator
    data_collator = DataCollatorForSequenceClassification(tokenizer=tokenizer)

    # Metric
    def compute_metrics(eval_predictions):
        # Plain accuracy over argmax predictions.
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

    # Initialize our Trainer
    trainer = Trainer(
        model=evidence_selector,
        args=training_args,
        train_dataset=initializing_evidence_selector_datasets["train"] if training_args.do_train else None,
        eval_dataset=initializing_evidence_selector_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        train_result = trainer.train()
        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")

    # Evaluation
    # To use the best checkpoint model at end, use the aruguments
    # load_best_model_at_end, metric_for_best_model, evaluation_strategy steps
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    eval_on_dev = (data_args.eval_dataset == "all" or data_args.eval_dataset == "dev") and training_args.do_eval
    eval_on_test = (data_args.eval_dataset == "all" or data_args.eval_dataset == "test") and training_args.do_eval

    if eval_on_dev:
        logger.info("*** Evaluate ***")
        # Intrinsic selector metrics plus downstream metrics obtained by
        # feeding selected evidence to the explicit reader.
        results = trainer.evaluate(initializing_evidence_selector_datasets["validation"]).metrics
        fulleval_results = trainer.evaluate_with_explicit_reader(evidence_reader,
                                                                 datasets["validation"],
                                                                 pprepare_features_for_reading_evidence,
                                                                 evidence_generating_datasets["validation"])
        metrics = {**results, **fulleval_results}
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")

    if eval_on_test:
        logger.info("*** Test ***")
        results = trainer.evaluate(initializing_evidence_selector_datasets["test"]).metrics
        fulleval_results = trainer.evaluate_with_explicit_reader(evidence_reader,
                                                                 datasets["test"],
                                                                 pprepare_features_for_reading_evidence,
                                                                 evidence_generating_datasets["test"])
        metrics = {**results, **fulleval_results}
        output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
def main():
    """Train and/or evaluate a multiple-choice MRC baseline on RACE or DREAM.

    Workflow: parse args (CLI or a single JSON file), set up logging/seed,
    load the dataset (optionally carving a holdout fold out of the train
    split), build config/tokenizer/model, tokenize, then run the HF Trainer
    for training and per-split evaluation, optionally dumping predictions.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We keep distinct sets of args for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (BasicModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single JSON file on the command line carries every argument.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Derive a hyperparameter-encoded checkpoint directory and route all
    # output (logs included) there.
    checkpoint_dir = hyperparam_path_for_baseline(model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    if training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix)
    training_args.output_dir = checkpoint_dir

    # Log a short per-process summary of the distributed setup.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Raise transformers' verbosity to INFO on the main process only.
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Seed everything before the model is instantiated.
    set_seed(training_args.seed)

    # Get the datasets: either your own CSV/JSON/TXT files or a public
    # dataset from https://huggingface.co/datasets/ (downloaded
    # automatically from the datasets Hub).
    if not 0 <= data_args.holdout_set < data_args.n_fold:
        raise ValueError("Test fold must be in [0, n_fold)")
    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        # Deferred project import: only valid once the dataset is known.
        from mcmrc.data_utils.processors import prepare_features

    # In distributed training, load_dataset guarantees only one local
    # process downloads the dataset. See
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    if data_args.debug_mode:
        # Tiny 5-example slices of every split for fast debugging.
        datasets = load_dataset(
            data_args.dataload_script,
            data_args.dataload_split,
            data_dir=data_args.data_dir,
            split={
                'train': ReadInstruction('train', from_=0, to=5, unit='abs'),
                'validation': ReadInstruction('validation', from_=0, to=5, unit='abs'),
                'test': ReadInstruction('test', from_=0, to=5, unit='abs')
            })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    if data_args.shuffle_train_dataset:
        datasets['train'] = datasets['train'].shuffle(seed=training_args.seed)

    if data_args.split_train_dataset:
        # K-fold style split: remove fold `holdout_set` from the (shuffled)
        # train split and expose it as a separate 'holdout_set' split.
        holdout_set_start = int(
            len(datasets['train']) / data_args.n_fold * data_args.holdout_set)
        holdout_set_end = int(
            len(datasets['train']) / data_args.n_fold * (data_args.holdout_set + 1))
        shuffled_train_set = datasets['train'].shuffle(seed=training_args.seed)
        if holdout_set_start == 0:
            new_train_set = Dataset.from_dict(
                shuffled_train_set[holdout_set_end:])
        elif holdout_set_end == len(datasets['train']):
            new_train_set = Dataset.from_dict(
                shuffled_train_set[:holdout_set_start])
        else:
            # Holdout fold sits in the middle: glue the two flanks together.
            new_train_set = concatenate_datasets([
                Dataset.from_dict(shuffled_train_set[:holdout_set_start]),
                Dataset.from_dict(shuffled_train_set[holdout_set_end:])
            ])
        new_holdout_set = Dataset.from_dict(
            shuffled_train_set[holdout_set_start:holdout_set_end])
        # Sanity check: no example lost or duplicated by the fold split.
        assert new_train_set.num_rows + new_holdout_set.num_rows == shuffled_train_set.num_rows
        datasets['train'] = new_train_set
        datasets['holdout_set'] = new_holdout_set

    # Load pretrained model and tokenizer.
    # Distributed training: .from_pretrained guarantees only one local
    # process concurrently downloads model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    pprepare_features = partial(prepare_features, tokenizer=tokenizer, data_args=data_args)
    tokenized_datasets = datasets.map(
        pprepare_features,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator: default collator is enough when features are already
    # padded to max length, otherwise pad dynamically per batch.
    data_collator = (default_data_collator if data_args.pad_to_max_length
                     else DataCollatorForMultipleChoice(tokenizer=tokenizer))

    # Initialize our Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_mc_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train()
        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    # Evaluation
    # To use the best checkpoint model at end, pass the arguments:
    #   --load_best_model_at_end \
    #   --metric_for_best_model accuracy \
    #   --evaluation_strategy steps \
    if training_args.do_eval:
        if training_args.load_best_model_at_end:
            # Reload the best checkpoint saved into output_dir during training.
            best_model = AutoModelForMultipleChoice.from_pretrained(
                training_args.output_dir,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
            best_model = best_model.to(training_args.device)

        # Evaluate every non-train split (validation, test, holdout_set...).
        for split in [k for k in datasets.keys() if k != "train"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = trainer.evaluate(tokenized_datasets[split])
            if training_args.load_best_model_at_end:
                # Temporarily swap the best checkpoint in, evaluate, swap back.
                final_model = trainer.model
                trainer.model = best_model
                best_model_results = trainer.evaluate(tokenized_datasets[split])
                trainer.model = final_model

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                if not training_args.do_train:
                    writer.write(f"eval checkpoint {model_args.model_name_or_path}\n")
                for key, value in sorted(results.metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
                if training_args.load_best_model_at_end:
                    writer.write(f"best model on dev set\n")
                    for key, value in sorted(best_model_results.metrics.items()):
                        logger.info(f"{key} = {value:.3f}")
                        writer.write(f"{key} = {value:.3f}\n")

            if data_args.output_prediction_file or data_args.split_train_dataset:
                # NOTE(review): this 3-way unpack assumes the project's
                # custom Trainer.evaluate returns (predictions, label_ids,
                # example_ids, metrics) — confirm against the Trainer class.
                prediction = {
                    example_id: prediction.tolist()
                    for prediction, label_id, example_id in zip(*results[:-1])
                }
                if split == "holdout_set":
                    output_prediction_file = os.path.join(
                        training_args.output_dir,
                        f"holdout_{data_args.n_fold}_{data_args.holdout_set}_prediction.json"
                    )
                else:
                    output_prediction_file = os.path.join(
                        training_args.output_dir, f"{split}_prediction.json")
                with open(output_prediction_file, "w") as f:
                    json.dump(prediction, f)
def main():
    """Generate pseudo evidence labels for a multiple-choice MRC dataset.

    For every example, each passage sentence is masked out (by zeroing its
    attention-mask span) and the resulting shift in the model's answer
    distribution — KL divergence and per-option probability difference — is
    recorded as that sentence's pseudo label. Results for all splits are
    saved with ``torch.save``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We keep distinct sets of args for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (BasicModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single JSON file on the command line carries every argument.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log a short per-process summary of the distributed setup.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Raise transformers' verbosity to INFO on the main process only.
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Seed everything before the model is instantiated.
    set_seed(training_args.seed)

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        # Deferred project import: only valid once the dataset is known.
        from mcmrc.data_utils.processors import prepare_features_for_generate_pseudo_label

    # In distributed training, load_dataset guarantees only one local
    # process downloads the dataset. See
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    if data_args.debug_mode:
        # Tiny 5-example slices of every split for fast debugging.
        datasets = load_dataset(
            data_args.dataload_script,
            data_args.dataload_split,
            data_dir=data_args.data_dir,
            split={
                'train': ReadInstruction('train', from_=0, to=5, unit='abs'),
                'validation': ReadInstruction('validation', from_=0, to=5, unit='abs'),
                'test': ReadInstruction('test', from_=0, to=5, unit='abs')
            })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    # Load pretrained model and tokenizer.
    # Distributed training: .from_pretrained guarantees only one local
    # process concurrently downloads model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    column_names = datasets["train"].column_names
    pprepare_features_for_generate_pseudo_label = partial(
        prepare_features_for_generate_pseudo_label,
        tokenizer=tokenizer,
        data_args=data_args)
    tokenized_datasets = datasets.map(
        pprepare_features_for_generate_pseudo_label,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator: default collator is enough when features are already
    # padded to max length, otherwise use the evidence-label collator.
    data_collator = (default_data_collator if data_args.pad_to_max_length
                     else DataCollatorForGeneratingEvidenceLabel(tokenizer=tokenizer))

    device = training_args.device
    model.to(device)
    model.eval()
    if training_args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Accumulators keyed by split name, then by example id, then sentence idx.
    pseudo_label = {}
    options_prob_diff = {}
    acc = {}

    for train_test_or_eval, dataset in tokenized_datasets.items():
        # batch_size=1: every batch holds one example's full feature set.
        dataloader = DataLoader(dataset,
                                batch_size=1,
                                sampler=SequentialSampler(dataset),
                                collate_fn=data_collator,
                                num_workers=0)
        pseudo_label_split = {}
        options_prob_diff_split = {}
        acc_split = {}
        print(f'{train_test_or_eval}', len(dataloader))

        for step, batch in enumerate(tqdm.tqdm(dataloader)):
            # Reference (unmasked) logits for this example.
            with torch.no_grad():
                origin_inputs = {
                    "input_ids": batch['input_ids'].to(device),
                    "attention_mask": batch['attention_mask'].to(device),
                    "token_type_ids": batch['token_type_ids'].to(device),
                }
                origin_logits = model(**origin_inputs).logits.detach().cpu()

            example_ids = batch['example_ids']
            sent_bounds = batch['sent_bound_token']
            for i, one_example_sent_bounds in enumerate(sent_bounds):
                # Fetch (or create) this example's per-sentence accumulators;
                # the same example id may recur across batches.
                if example_ids[i] not in pseudo_label_split.keys():
                    kl_div_per_example = {}
                    prob_diff_per_example = {}
                    pseudo_label_split[example_ids[i]] = kl_div_per_example
                    options_prob_diff_split[example_ids[i]] = prob_diff_per_example
                else:
                    kl_div_per_example = pseudo_label_split[example_ids[i]]
                    prob_diff_per_example = options_prob_diff_split[example_ids[i]]

                one_example_logit = origin_logits[i]
                one_example_sent_bounds = torch.tensor(one_example_sent_bounds, device=device)
                one_example_attention_mask = batch['attention_mask'][i]
                one_example_input_ids = batch['input_ids'][i]
                one_example_token_type_ids = batch['token_type_ids'][i]
                one_example_label = batch['labels'][i]
                sent_num = one_example_sent_bounds.size()[0]

                # Mask sentences in chunks of eval_batch_size and score each
                # masked variant in a single forward pass.
                for j in range(0, sent_num, training_args.eval_batch_size):
                    batch_start = j
                    batch_end = j + training_args.eval_batch_size if j < sent_num - training_args.eval_batch_size else sent_num

                    # Columns 1 and 2 of sent bounds appear to be the
                    # start/end token positions of a sentence (column 0 is
                    # the sentence index) — TODO confirm against the
                    # feature-preparation code.
                    batched_sent_bound = torch.stack(
                        (one_example_sent_bounds[batch_start:batch_end, 1],
                         one_example_sent_bounds[batch_start:batch_end, 2])).unsqueeze(1).permute(2, 1, 0)

                    batched_attention_mask = one_example_attention_mask.unsqueeze(0).expand(
                        batch_end - batch_start, -1, -1).clone().to(device)
                    pos_matrix = torch.arange(
                        batched_attention_mask.size()[-1], device=device).view(1, 1, -1)
                    # Positions inside the sentence's [start, end] span get
                    # their attention zeroed — the "masked sentence".
                    if_in_sent = torch.logical_and(
                        batched_sent_bound[:, :, 0].unsqueeze(-1) <= pos_matrix,
                        pos_matrix <= batched_sent_bound[:, :, 1].unsqueeze(-1))
                    batched_attention_mask = torch.where(
                        if_in_sent, torch.tensor(0, device=device), batched_attention_mask)

                    batched_input_ids = one_example_input_ids.expand(
                        batch_end - batch_start, -1, -1).contiguous()
                    batched_token_type_ids = one_example_token_type_ids.expand(
                        batch_end - batch_start, -1, -1).contiguous()

                    with torch.no_grad():
                        masked_inputs = {
                            "input_ids": batched_input_ids.to(device),
                            "attention_mask": batched_attention_mask.to(device),
                            "token_type_ids": batched_token_type_ids.to(device),
                        }
                        masked_logits = model(**masked_inputs).logits.detach().cpu()

                    # KL(original || masked) per masked variant, summed over options.
                    kl_divs = torch.sum(F.kl_div(
                        F.log_softmax(masked_logits, dim=-1),
                        F.softmax(one_example_logit, dim=-1),
                        reduction='none'),
                        dim=-1)
                    prob_diff = F.softmax(masked_logits, dim=-1) - F.softmax(
                        one_example_logit, dim=-1)

                    for k, kl_div in enumerate(kl_divs.detach().cpu().tolist()):
                        sent_idx = one_example_sent_bounds[batch_start + k, 0].item()
                        # Sign encodes direction: +1 when masking the sentence
                        # lowers the gold option's probability (evidence),
                        # -1 when it raises it (noise).
                        evidence_or_noise = 1 if F.softmax(masked_logits[k], dim=-1)[one_example_label].item() \
                            < F.softmax(one_example_logit, dim=-1)[one_example_label].item() else -1
                        # Keep the signed KL with the largest magnitude seen
                        # for this sentence across feature windows.
                        if sent_idx in kl_div_per_example.keys():
                            if kl_div > abs(kl_div_per_example[sent_idx]):
                                kl_div_per_example[sent_idx] = evidence_or_noise * kl_div
                                prob_diff_per_example[sent_idx] = prob_diff[k].detach().cpu().tolist()
                        else:
                            kl_div_per_example[sent_idx] = evidence_or_noise * kl_div
                            prob_diff_per_example[sent_idx] = prob_diff[k].detach().cpu().tolist()

                # 1 iff the unmasked prediction matches the gold label.
                acc_split[example_ids[i]] = 1 if torch.argmax(
                    one_example_logit).item() == one_example_label.item() else 0

        pseudo_label[train_test_or_eval] = pseudo_label_split
        options_prob_diff[train_test_or_eval] = options_prob_diff_split
        acc[train_test_or_eval] = acc_split

    label = {
        'pseudo_label': pseudo_label,
        'acc': acc,
        'options_prob_diff': options_prob_diff
    }
    torch.save(
        label,
        data_args.dataset + f"_pseudo_label_with_options_{config.model_type}_{config.hidden_size}.pt"
    )