def build_training_data_loader(self):
    """Create the DataLoader that serves training batches.

    The dataset comes from ``data.load_and_cache_examples`` called in
    training mode (``evaluate=False``). The task name is read from the
    data config and the sequence-length settings from the hyperparameters.

    Returns:
        A ``DataLoader`` over the training dataset, batched with the
        context's per-slot batch size.
    """
    ctx = self.context
    load_kwargs = dict(
        tokenizer=self.tokenizer,
        task=ctx.get_data_config().get("task"),
        max_seq_length=ctx.get_hparam("max_seq_length"),
        doc_stride=ctx.get_hparam("doc_stride"),
        max_query_length=ctx.get_hparam("max_query_length"),
        evaluate=False,
    )

    # The loader returns three values; only the first (the dataset) is used.
    train_dataset, _second, _third = data.load_and_cache_examples(**load_kwargs)

    per_slot_batch_size = ctx.get_per_slot_batch_size()
    return DataLoader(train_dataset, batch_size=per_slot_batch_size)
def build_training_data_loader(self) -> DataLoader:
    """Create the DataLoader that serves training batches.

    Calls ``self.download_dataset()`` first when ``self.data_downloaded``
    is falsy, then builds the dataset from ``self.download_directory`` via
    ``data.load_and_cache_examples`` in training mode (``evaluate=False``).

    Returns:
        A ``DataLoader`` over the training dataset, batched with the
        context's per-slot batch size.
    """
    if not self.data_downloaded:
        self.download_dataset()

    ctx = self.context
    load_kwargs = {
        "base_data_dir": self.download_directory,
        "config": ctx.get_data_config(),
        "model_type": ctx.get_hparam("model_type"),
        "max_seq_length": ctx.get_hparam("max_seq_length"),
        "evaluate": False,
    }
    train_dataset = data.load_and_cache_examples(**load_kwargs)

    per_slot_batch_size = ctx.get_per_slot_batch_size()
    return DataLoader(train_dataset, batch_size=per_slot_batch_size)
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate a language model and report its perplexity.

    Loads the evaluation set via ``load_and_cache_examples``, runs the model
    over it without gradient tracking (each batch is used as both input and
    labels) and averages the per-batch loss. Perplexity is the exponential
    of that mean loss. The result is logged and written to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: Run configuration; reads ``output_dir``, ``eval_batch_size``
            and ``device``.
        model: Called as ``model(batch, labels=batch)``; the first element
            of its output is taken as the loss.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory (relative to ``args.output_dir``) that
            receives the results file; also used to tag log messages.

    Returns:
        dict with the single key ``"perplexity"`` (a torch tensor).

    Raises:
        ZeroDivisionError: If the evaluation dataloader yields no batches.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    # The results file is written below <output_dir>/<prefix>/, so make sure
    # that whole path exists (not only <output_dir>) before evaluating.
    os.makedirs(os.path.join(eval_output_dir, prefix), exist_ok=True)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = batch.to(args.device)

        with torch.no_grad():
            outputs = model(batch, labels=batch)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def train(args):
    """Fine-tune a question-answering model and save the result.

    Loads config, tokenizer and model from ``args.config_name``, builds the
    training dataloader through the ``data`` helpers, then runs an AdamW +
    linear-warmup training loop with gradient accumulation, periodic metric
    logging, optional evaluation and periodic checkpointing. The final model
    is saved with ``save_progress``.

    Args:
        args: Run configuration. Attributes read here: ``config_name``,
            ``uncased_model``, ``num_gpus``, ``train``,
            ``per_gpu_train_batch_size``, ``grad_acc_steps``, ``epochs``,
            ``wd``, ``lr``, ``adam_epsilon``, ``warmup_steps``,
            ``model_type``, ``has_unanswerable``, ``lang_id``,
            ``max_grad_norm``, ``log_interval``, ``validation``,
            ``checkpoint_interval`` and ``max_steps``.

    Returns:
        None.
    """
    logger.info("Creating config and model")
    config = txf.AutoConfig.from_pretrained(args.config_name)
    tokenizer = txf.AutoTokenizer.from_pretrained(
        args.config_name,
        do_lower_case=args.uncased_model,
    )
    model = txf.AutoModelForQuestionAnswering.from_pretrained(
        args.config_name,
        from_tf=bool(".ckpt" in args.config_name),
        config=config,
    )

    # TODO: Multi-GPU
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.num_gpus else "cpu")
    logger.info("Loading model to %s", device)
    model.to(device)

    logger.info("Creating data loader")
    train_dataset = data.load_and_cache_examples(
        args.train,
        tokenizer,
        args,
        evaluate=False,
        output_examples=False,
    )
    train_dataloader = data.get_dataloader(train_dataset,
                                           args.per_gpu_train_batch_size,
                                           evaluate=False)

    # Total number of optimizer steps handed to the LR scheduler. The
    # max_steps-based alternative below is disabled, so max_steps only
    # affects the early-exit checks in the loop, not the schedule length.
    # if args.max_steps > 0:
    #     t_total = args.max_steps
    #     args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.grad_acc_steps) + 1
    # else:
    t_total = len(train_dataloader) // args.grad_acc_steps * args.epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.wd,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = txf.AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_epsilon)
    scheduler = txf.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    ## Train!
    logger.info("Training model")
    # global_step starts at 1 and is incremented after every optimizer step.
    global_step = 1
    # NOTE(review): the next two locals are never read in this function.
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # tr_loss accumulates the (accumulation-scaled) loss; logging_loss holds
    # its value at the previous log so the difference covers one log window.
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    for epoch in range(args.epochs):
        logger.info(f"[Epoch {epoch}] Starting")
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            # These model types are called without token_type_ids.
            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            # NOTE(review): the nesting of the two inner conditionals below
            # under the xlnet/xlm branch is assumed — confirm against the
            # intended behavior for other model types.
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.has_unanswerable:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device),
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            # Scale so that gradients summed over the accumulation window
            # correspond to the mean loss.
            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps

            loss.backward()

            tr_loss += loss.item()
            # Only every grad_acc_steps-th micro-batch triggers an optimizer step.
            if (step + 1) % args.grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if (args.log_interval > 0
                        and global_step % args.log_interval == 0):
                    logstr = "lr={:.5e}; loss={:.5e};".format(
                        scheduler.get_lr()[0],
                        (tr_loss - logging_loss) / args.log_interval)
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.validation:
                        logger.info(
                            f"[Epoch {epoch} Global Step {global_step}] Starting evaluation..."
                        )
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           device,
                                           prefix=global_step)
                        for key, value in results.items():
                            logstr += f" eval_{key}={value:.5e};"
                    logger.info(
                        f"[Epoch {epoch} Global Step {global_step}] Metrics: {logstr}"
                    )
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.checkpoint_interval > 0 and global_step % args.checkpoint_interval == 0:
                    save_progress(model,
                                  tokenizer,
                                  args,
                                  checkpoint=global_step,
                                  optimizer=optimizer,
                                  scheduler=scheduler)

            # Leave the batch loop once the step budget is used up...
            if args.max_steps > 0 and global_step > args.max_steps:
                break
        # ...and the epoch loop as well.
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    logger.info("Training complete: Saving model")
    save_progress(model, tokenizer, args)
    return
def evaluate(args, model, tokenizer, device, prefix=""):
    """Run question-answering evaluation and return the SQuAD metrics.

    Loads the validation dataset together with its examples and features,
    runs the model over every batch without gradient tracking, converts the
    raw outputs to ``SquadResult`` objects, post-processes them into
    predictions (written to JSON files under ``args.output_data_dir``) and
    scores them with ``squad_metrics.squad_evaluate``.

    Args:
        args: Run configuration. Attributes read here: ``validation``,
            ``per_gpu_eval_batch_size``, ``model_type``, ``lang_id``,
            ``output_data_dir``, ``has_unanswerable``, ``n_best_size``,
            ``max_answer_len``, ``uncased_model`` and
            ``null_score_diff_thresh``.
        model: The model to evaluate; called as ``model(**inputs)``.
        tokenizer: Passed to the data loader and prediction post-processing.
        device: Device every batch tensor is moved to.
        prefix: Tag inserted into the output file names.

    Returns:
        The value returned by ``squad_metrics.squad_evaluate``.
    """
    eval_dataset, examples, features = data.load_and_cache_examples(
        args.validation,
        tokenizer,
        args,
        evaluate=True,
        output_examples=True,
    )
    eval_dataloader = data.get_dataloader(eval_dataset,
                                          args.per_gpu_eval_batch_size,
                                          evaluate=True)

    all_results = []
    start_time = timeit.default_timer()

    # Evaluation mode does not change between batches, so set it once.
    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # These model types are called without token_type_ids.
            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(device)
                    })

            outputs = model(**inputs)

        # `i` is the row within this batch's outputs; `feature_index` is the
        # position of the same item in the full `features` list.
        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions,
            # while the other "simpler" models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    # One result is stored per evaluated item, so this is the exact item
    # count (a partial final batch is not over-counted). Clamp to 1 so an
    # empty dataloader does not raise ZeroDivisionError in the log call.
    num_evaluated = max(len(all_results), 1)
    logger.info(" Evaluation done in total %f secs (%f sec per example)",
                eval_time, eval_time / num_evaluated)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_data_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_data_dir, "nbest_predictions_{}.json".format(prefix))

    if args.has_unanswerable:
        output_null_log_odds_file = os.path.join(
            args.output_data_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # NOTE(review): `logger.level` is 0 (NOTSET) unless a level was set on
    # this logger directly, which makes the verbose flag below True —
    # confirm that this is intended.
    verbose_logging = logger.level < logging.INFO

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        # A wrapped model exposes its config through `.module`.
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = squad_metrics.compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.has_unanswerable,
            tokenizer,
            verbose_logging,
        )
    else:
        predictions = squad_metrics.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_len,
            args.uncased_model,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            verbose_logging,
            args.has_unanswerable,
            args.null_score_diff_thresh,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_metrics.squad_evaluate(examples, predictions)
    return results
def __init__(self, context: det.TrialContext):
    """Set up the tokenizer and validation data for this trial.

    Resolves the config/tokenizer/model classes registered for the
    ``model_type`` hyperparameter, instantiates the tokenizer named by the
    data config's ``pretrained_model_name`` and loads the validation
    dataset, examples and features via ``data.load_and_cache_examples``.

    Args:
        context: Trial context providing hyperparameters and data config.
    """
    self.context = context

    model_type = self.context.get_hparam("model_type")
    registered_classes = constants.MODEL_CLASSES[model_type]
    self.config_class, self.tokenizer_class, self.model_class = registered_classes

    pretrained_name = self.context.get_data_config().get("pretrained_model_name")
    self.tokenizer = self.tokenizer_class.from_pretrained(
        pretrained_name,
        do_lower_case=True,
        cache_dir=None,
    )

    load_kwargs = dict(
        tokenizer=self.tokenizer,
        task=self.context.get_data_config().get("task"),
        max_seq_length=self.context.get_hparam("max_seq_length"),
        doc_stride=self.context.get_hparam("doc_stride"),
        max_query_length=self.context.get_hparam("max_query_length"),
        evaluate=True,
    )
    (
        self.validation_dataset,
        self.validation_examples,
        self.validation_features,
    ) = data.load_and_cache_examples(**load_kwargs)
def build_validation_data_loader(self):
    """Create the DataLoader that serves validation batches.

    Loads the validation dataset, examples and features from
    ``self.download_directory`` via ``data.load_and_cache_examples``
    (``evaluate=True``) and stores all three on the instance.

    Returns:
        A ``DataLoader`` over the validation dataset, batched with the
        context's per-slot batch size.
    """
    ctx = self.context
    load_kwargs = {
        "data_dir": self.download_directory,
        "tokenizer": self.tokenizer,
        "task": ctx.get_data_config().get("task"),
        "max_seq_length": ctx.get_hparam("max_seq_length"),
        "doc_stride": ctx.get_hparam("doc_stride"),
        "max_query_length": ctx.get_hparam("max_query_length"),
        "evaluate": True,
    }
    (
        self.validation_dataset,
        self.validation_examples,
        self.validation_features,
    ) = data.load_and_cache_examples(**load_kwargs)

    per_slot_batch_size = ctx.get_per_slot_batch_size()
    return DataLoader(self.validation_dataset, batch_size=per_slot_batch_size)
def main():
    """Command-line entry point for GPT-2 language-model fine-tuning.

    Parses the command line, validates the argument combination, selects
    the device, loads the GPT-2 config/tokenizer/model, optionally trains
    (``--do_train``) and saves the result to ``--output_dir``, then
    optionally evaluates (``--do_eval``) one or all checkpoints.

    Returns:
        dict of evaluation results; keys are the metric names returned by
        ``evaluate`` suffixed with ``_<global_step>``. Empty when
        ``--do_eval`` is not given.

    Raises:
        ValueError: If ``--do_eval`` is given without ``--eval_data_file``,
            or if training would write into an existing non-empty output
            directory without ``--overwrite_output_dir``.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file)."
    )

    # Available GPT-2 models: gpt2, gpt2-medium, gpt2-large
    parser.add_argument(
        "--model_name_or_path",
        default="gpt2",
        type=str,
        help="The model checkpoint for weights initialization.")

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)"
    )
    # NOTE(review): the adjacent help literals below are concatenated
    # without a separating space, so sentences run together in --help.
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit',
        type=int,
        default=None,
        help=
        'Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default'
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Reject argument combinations that cannot work before doing anything costly.
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    # Refuse to train into a non-empty output directory unless explicitly allowed.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Note: the GPU count is recorded regardless of --no_cuda.
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("device: %s, n_gpu: %s, 16-bits training: %s", device,
                   args.n_gpu, args.fp16)

    # Set seed
    set_seed(args)

    config_class, model_class, tokenizer_class = (GPT2Config, GPT2LMHeadModel,
                                                  GPT2Tokenizer)
    # --config_name / --tokenizer_name fall back to --model_name_or_path when empty.
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    if args.block_size <= 0:
        # Our input block size will be the max possible for the model
        args.block_size = tokenizer.max_len_single_sentence
    print("Block size:", args.block_size)
    # Never exceed what the tokenizer reports as its single-sentence maximum.
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory below output_dir that contains a weights file.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The text after the last '-' in the path is used as the step tag
            # (only when several checkpoints are evaluated).
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            # The last path component is the prefix when it is a checkpoint dir.
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            # Suffix every metric name with the step tag before merging.
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
def main():
    """Command-line entry point for training and testing a speech grader.

    Parses the command line, then:

    * with ``--do_train``: stores the arguments in
      ``<output_dir>/training_args.bin``, builds an LSTM or BERT grader
      (``--model``), loads train/dev data and runs ``trainer.train``;
    * with ``--do_test``: reloads the stored training arguments from
      ``--model_args_dir``, restores the grader from ``--model_dir``, loads
      the test data and runs ``trainer.test``.

    Invalid argument combinations are reported through the module logger
    and make the function return early.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")

    ## Other parameters
    parser.add_argument("--model_path", default="bert-base-uncased", type=str,
                        help="Pre-trained BERT model to extend e.g. bert-base-uncased.")
    parser.add_argument("--model", default=None, type=str,
                        help="Type of speech grader model: ['lstm', 'bert']")
    parser.add_argument("--max_score", default=6, type=float,
                        help="Maximum score that an example can be awarded (the default value used is 6, inline with CEFR levels).")
    parser.add_argument('--special_tokens', default=[], type=str, action='append',
                        help='Special tokens to mask when making auxiliary objective predictions. These are also denoted as special tokens for the BERT tokenizer.')
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Train a model.")
    parser.add_argument("--do_test", action='store_true',
                        help="Evaluate the model at --model_dir on the test set.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument('--save_best_on_evaluate', action='store_true',
                        help="Save best model based on evaluation scoring loss.")
    parser.add_argument("--save_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--output_dir", default=None, type=str,
                        help="The output directory where the model predictions and checkpoints will be written during training.")
    parser.add_argument("--model_dir", default=None, type=str,
                        help="The directory where the model files are stored (used for testing).")
    parser.add_argument("--model_args_dir", default=None, type=str,
                        help="The directory where the model args are stored (used for testing).")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training, validation and testing sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--predictions-file', type=str, default=None)

    # Auxiliary objectives: one on/off switch and one loss weight per objective.
    for obj in aux_objs:
        parser.add_argument('--use_{}_objective'.format(obj), action='store_true',
                            help='Use {} objective during training. --{}_alpha must also be set'.format(obj, obj))
        parser.add_argument('--{}_alpha'.format(obj), type=float, default=0.0,
                            help='Weighting of {} objective in loss score. All alpha values must add to 1'.format(obj))
    parser.add_argument('--score_alpha', type=float, default=1.0,
                        help='Weighting of scoring objective in loss score. All alpha values must add to 1')

    # Parsing args
    args = parser.parse_args()
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup logging. This must happen before the first message below is
    # emitted, otherwise that message is dropped at INFO level.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    if args.save_best_on_evaluate and not args.evaluate_during_training:
        # `args.logger` is only attached further down, so report through the
        # module-level logger here (the attribute does not exist yet).
        logger.info('Cannot save best model if not evaluating')
        return

    # Train a model
    if args.do_train:
        # Store training arguments to facilitate reloading a model.
        # They are saved before the logger is attached to `args`, so the
        # stored namespace holds only the parsed arguments and the device.
        os.makedirs(args.output_dir, exist_ok=True)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        args.logger = logger

        if not args.model:
            args.logger.info("--model must be provided for training (['lstm', 'bert'])")
            return

        if args.model == 'lstm':
            vocab = data.load_and_cache_vocab(args.data_dir, logger)
            training_objectives = get_auxiliary_objectives(args, len(vocab))
            grader = lstm_model.SpeechGraderModel(args, vocab, training_objectives).to(args.device)
            train_data = data.load_and_cache_examples(
                args.model, args.data_dir, args.max_seq_length, args.special_tokens, logger,
                vocab=vocab, reload=args.overwrite_cache)
            dev_data = data.load_and_cache_examples(
                args.model, args.data_dir, args.max_seq_length, args.special_tokens, logger,
                vocab=vocab, evaluate=True, reload=args.overwrite_cache)
            trainer = train.Trainer(args, grader, training_objectives)
        elif args.model == 'bert':
            tokenizer = BertTokenizer.from_pretrained(args.model_path,
                                                      additional_special_tokens=args.special_tokens)
            config = BertConfig.from_pretrained(args.model_path)
            training_objectives = get_auxiliary_objectives(args, tokenizer.vocab_size)
            # The grader reads its objectives and score range from the config.
            config.training_objectives = training_objectives
            config.max_score = args.max_score
            grader = bert_model.SpeechGraderModel(config=config).to(args.device)
            train_data = data.load_and_cache_examples(
                args.model, args.data_dir, args.max_seq_length, args.special_tokens, logger,
                tokenizer=tokenizer, reload=args.overwrite_cache)
            dev_data = data.load_and_cache_examples(
                args.model, args.data_dir, args.max_seq_length, args.special_tokens, logger,
                tokenizer=tokenizer, evaluate=True, reload=args.overwrite_cache)
            trainer = train.Trainer(args, grader, training_objectives, bert_tokenizer=tokenizer)
        else:
            args.logger.info("--model must be either 'lstm' or 'bert'")
            return

        trainer.train(train_data, dev_data)

    # Test a model
    if args.do_test:
        # Retrieve training arguments to facilitate reloading the model.
        args.logger = logger
        # NOTE(review): torch.load unpickles the file; only point
        # --model_args_dir / --model_dir at trusted locations.
        train_args = torch.load(os.path.join(args.model_args_dir, 'training_args.bin'))
        train_args.predictions_file = args.predictions_file
        train_args.logger = logger

        if train_args.model == 'lstm':
            # use the vocabulary from train time
            vocab = data.load_and_cache_vocab(train_args.data_dir, logger)
            training_objectives = get_auxiliary_objectives(train_args, len(vocab))
            # NOTE(review): the model is built from the test-time `args`, not
            # the stored `train_args` — confirm this is intended.
            grader = lstm_model.SpeechGraderModel(args, vocab, training_objectives).to(args.device)
            grader.load_state_dict(torch.load(os.path.join(args.model_dir, 'lstm.model')))
            test_data = data.load_and_cache_examples(
                train_args.model, args.data_dir, train_args.max_seq_length, train_args.special_tokens,
                logger, vocab=vocab, test=True, reload=args.overwrite_cache)
            trainer = train.Trainer(train_args, grader, training_objectives)
        else:
            tokenizer = BertTokenizer.from_pretrained(args.model_dir, do_lower_case=True)
            training_objectives = get_auxiliary_objectives(train_args, tokenizer.vocab_size)
            config = BertConfig.from_pretrained(args.model_dir)
            grader = bert_model.SpeechGraderModel.from_pretrained(args.model_dir,
                                                                  config=config).to(args.device)
            test_data = data.load_and_cache_examples(
                train_args.model, args.data_dir, train_args.max_seq_length, train_args.special_tokens,
                logger, tokenizer=tokenizer, test=True, reload=args.overwrite_cache)
            trainer = train.Trainer(train_args, grader, training_objectives, bert_tokenizer=tokenizer)

        trainer.test(test_data)
def main():
    """Entry point for GLUE-style fine-tuning followed by evaluation.

    Parses arguments with ``parse_args``, sets up the device (single
    process or distributed), logging and seed, loads the pretrained
    config/tokenizer/model for the task, trains, saves the result to
    ``args.model_dir``, reloads it and evaluates every checkpoint found
    under ``args.output_data_dir``. The merged results are printed.

    Training and evaluation are controlled by local flags that are
    hard-coded to ``True`` here; ``args.do_train`` is only used by the
    output-directory check and a debug print.

    Raises:
        ValueError: If the output directory exists, is non-empty,
            ``args.do_train`` is set and ``args.overwrite_output_dir`` is
            not; or if ``args.task_name`` is not a known task.
    """
    args = parse_args()

    if os.path.exists(args.output_data_dir) and os.listdir(
            args.output_data_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_data_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    # Only rank -1 (non-distributed) and rank 0 log at INFO level.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    print(MODEL_CLASSES[args.model_type])
    config = config_class.from_pretrained(args.config_name,
                                          num_labels=num_labels,
                                          finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name,
                                        from_tf=False,
                                        config=config)

    # Rank 0 joins the barrier after loading, releasing the waiting ranks.
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    print(args.do_train)

    # NOTE(review): training is forced on here regardless of args.do_train —
    # confirm that this is intended.
    train_flag = True
    if train_flag:
        train_dataset = data.load_and_cache_examples(args,
                                                     args.task_name,
                                                     tokenizer,
                                                     evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if train_flag and (args.local_rank == -1
                       or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_data_dir) and args.local_rank in [
                -1, 0
        ]:
            os.makedirs(args.output_data_dir)

        logger.info("Saving model checkpoint to %s", args.output_data_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        # Artifacts are written to args.model_dir (not args.output_data_dir,
        # which the log message above names).
        #model_to_save.save_pretrained(args.output_data_dir)
        #tokenizer.save_pretrained(args.output_data_dir)
        model_to_save.save_pretrained(args.model_dir)
        tokenizer.save_pretrained(args.model_dir)

        # Good practice: save your training arguments together with the trained model
        #torch.save(args, os.path.join(args.output_data_dir, 'training_args.bin'))
        torch.save(args, os.path.join(args.model_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.model_dir)
        tokenizer = tokenizer_class.from_pretrained(args.model_dir)
        model.to(args.device)

    # Evaluation
    # NOTE(review): these three switches are hard-coded rather than read
    # from args — confirm that this is intended.
    eval_flag = True
    eval_all_checkpoints = True
    do_lower_case = True
    results = {}
    if eval_flag and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.model_dir, do_lower_case=do_lower_case)
        logger.info("******Output dir folders: %s",
                    os.listdir(args.output_data_dir))
        checkpoints = [args.output_data_dir]
        if eval_all_checkpoints:
            # Every directory below output_data_dir that contains a weights file.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_data_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The text after the last '-' in the path is used as the step tag
            # (only when several checkpoints are evaluated).
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            # The last path component is the prefix when it is a checkpoint dir.
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            # Suffix every metric name with the step tag before merging.
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    print(results)
    print('COOL')
def evaluate(args, model, tokenizer, prefix=""):
    """Run evaluation for ``args.task_name`` and write the metrics to disk.

    For the ``"mnli"`` task two passes are made (``"mnli"`` and ``"mnli-mm"``),
    the second one writing into ``args.output_data_dir + '-MM'``. Every other
    task is evaluated once into ``args.output_data_dir``.

    Args:
        args: Run configuration. Reads ``task_name``, ``output_data_dir``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``, ``model_type``
            and ``output_mode``; sets ``args.eval_batch_size`` as a side effect.
        model: Model called as ``model(**inputs)``; the first two elements of
            its output are used as ``(loss, logits)``.
        tokenizer: Forwarded to ``data.load_and_cache_examples``.
        prefix: Sub-directory (e.g. a checkpoint name) under the output
            directory in which ``eval_results.txt`` is written; also used in
            log messages.

    Returns:
        dict: Metrics returned by ``compute_metrics``, merged over all
        evaluated tasks (later tasks overwrite identical keys).

    Raises:
        ZeroDivisionError: If an evaluation dataset yields no batches.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    if args.task_name == "mnli":
        eval_task_names = ("mnli", "mnli-mm")
        eval_outputs_dirs = (args.output_data_dir, args.output_data_dir + '-MM')
    else:
        eval_task_names = (args.task_name,)
        eval_outputs_dirs = (args.output_data_dir,)

    # Loop-invariant: the effective batch size does not depend on the task.
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # multi-gpu eval. Wrap at most once: the original wrapped inside the task
    # loop, which nested DataParallel on the second MNLI pass.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)
    model.eval()

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = data.load_and_cache_examples(args,
                                                    eval_task,
                                                    tokenizer,
                                                    evaluate=True)

        # Create the directory the results file actually lands in, including
        # the `prefix` sub-directory; exist_ok makes this safe to repeat.
        results_dir = os.path.join(eval_output_dir, prefix)
        os.makedirs(results_dir, exist_ok=True)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        print("***** Running evaluation {} *****".format(prefix))
        # print() does not interpolate extra arguments; format explicitly.
        print(" Num examples = %d" % len(eval_dataset))
        print(" Batch size = %d" % args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        # Collect per-batch arrays and concatenate once at the end instead of
        # re-copying the growing array with np.append on every batch.
        pred_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            pred_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs['labels'].detach().cpu().numpy())

        # NOTE: the averaged loss is computed but is not part of `result`.
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(pred_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)

        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        # NOTE(review): metrics are always requested under the "sst-2" key
        # instead of `eval_task`; this looks deliberate (the eval_task call was
        # commented out) - confirm before switching back.
        result = compute_metrics("sst-2", preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(results_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results