# NOTE(review): this span opens part-way through a dataclass body whose header is
# not visible here; the leading `name: type = field(default=...)` entries
# (evaluation_strategy ... select_labels) are presumably its fields -- confirm
# against the preceding lines.
# The remainder is module-level CLI setup: an HfArgumentParser is built over
# MyTrainingArguments, an optional positional `data_config_name` (default "NER")
# plus `--dataset_path` and `--no_cache` are registered, the command line is
# parsed into `training_args` and `args`, and `output_dir_path` is derived as
# training_args.output_dir / data_config_name.
evaluation_strategy: EvaluationStrategy = field( default=EvaluationStrategy.STEPS) learning_rate: float = field(default=1e-4) per_device_train_batch_size: int = field(default=32) per_device_eval_batch_size: int = field(default=32) num_train_epochs: float = field(default=10.0) save_total_limit: int = field(default=5) masking_probability: float = field(default=None) replacement_probability: float = field(default=None) select_labels: bool = field(default=False) parser = HfArgumentParser((MyTrainingArguments), description="Traing script.") parser.add_argument("data_config_name", nargs="?", default="NER", choices=["NER", "ROLES", "BORING", "PANELIZATION"], help="Name of the dataset configuration to use.") parser.add_argument("--dataset_path", help="The dataset to use for training.") parser.add_argument( "--no_cache", action="store_true", help= "Flag that forces re-donwloading the dataset rather than re-using it from the cacher." ) training_args, args = parser.parse_args_into_dataclasses() no_cache = args.no_cache data_config_name = args.data_config_name dataset_path = args.dataset_path output_dir_path = Path(training_args.output_dir) / data_config_name
def main():
    """Fine-tune and/or evaluate a sequence-classification model on a task.

    Parses the command line, sets up the device and logging, loads the
    config/tokenizer/model for ``args.model_type``, optionally freezes the
    ``bert`` sub-module, then trains (``--do_train``) and evaluates
    (``--do_eval``) according to the parsed flags.

    Returns:
        dict: evaluation results collected over every evaluated checkpoint;
        each metric name is suffixed with ``_<global_step>`` (the suffix is
        empty when only one checkpoint is evaluated). Empty when no
        evaluation is run.

    Raises:
        ValueError: if the output directory exists, is non-empty, training
            is requested and ``--overwrite_output_dir`` is not set, or if
            the requested task is not present in ``processors``.
    """
    parser = HfArgumentParser(
        (ModelArguments, DataProcessingArguments, TrainingArguments))

    # The script-specific flags must be registered BEFORE anything is parsed:
    # previously they were added after `parse_args_into_dataclasses()` had
    # already run, so passing any of them made that first parse fail on
    # "unused" arguments.
    parser.add_argument('--freeze_bert', action='store_true')
    parser.add_argument('--prune_train', type=float, default=0.0)
    parser.add_argument('--prune_eval', type=float, default=0.0)
    parser.add_argument('--prune',
                        type=str,
                        default='random',
                        help="default=random, global, l1")
    parser.add_argument('--prune_layers', type=str, default='')
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    # Still build the dataclasses once so that whatever validation they and
    # the parser perform keeps running; the result itself is not needed
    # because the rest of this function works on the flat namespace below.
    parser.parse_args_into_dataclasses()
    args = parser.parse_args()
    print('Args:', args)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARNING,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError(f"Task not found: {args.task_name}")
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    cache_dir = args.cache_dir if args.cache_dir else None
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=cache_dir)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=cache_dir)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=cache_dir)

    # Report the number of trainable parameters under every (sub-)module.
    print('Model Size:')
    for mod_name, module in model.named_modules():
        size = sum(
            np.prod(p.size()) for p in module.parameters()
            if p.requires_grad)
        print(mod_name, size)

    if args.freeze_bert:
        print('Freezing bert weights')
        for name, param in model.bert.named_parameters():
            if param.requires_grad:
                param.requires_grad = False
                print(name)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can
    # reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed; exist_ok avoids the
        # check-then-create race of a separate os.path.exists() test.
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using
        # `save_pretrained()`. They can then be reloaded using
        # `from_pretrained()`.
        # Take care of distributed/parallel training: unwrap `.module` if
        # the model has been wrapped.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the
        # trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            weight_files = sorted(
                glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                          recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]
            # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARNING)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = (checkpoint.split("-")[-1]
                           if len(checkpoints) > 1 else "")
            prefix = (checkpoint.split("/")[-1]
                      if "checkpoint" in checkpoint else "")

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            results.update({
                f"{key}_{global_step}": value
                for key, value in result.items()
            })

    return results
@dataclass
class MyTrainingArguments(TrainingArguments):
    """TrainingArguments subclass carrying this script's default settings.

    Each field below is declared with a plain default value; any field of the
    same name inherited from ``TrainingArguments`` is overridden by it.
    """

    # Where checkpoints are written unless overridden on the command line.
    output_dir: str = LM_MODEL_PATH
    overwrite_output_dir: bool = True
    logging_steps: int = 2000
    evaluation_strategy: EvaluationStrategy = EvaluationStrategy.STEPS
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    save_total_limit: int = 5


# Command-line interface: the dataclass fields above plus a few extra options.
parser = HfArgumentParser(MyTrainingArguments, description="Traing script.")
parser.add_argument(
    "data_config_name",
    nargs="?",
    default="MLM",
    choices=["MLM", "DET", "VERB", "SMALL"],
    help="Name of the dataset configuration to use.",
)
parser.add_argument(
    "--dataset_path",
    help="The dataset to use for training.",
)
parser.add_argument(
    "--no_cache",
    action="store_true",
    help="Flag that forces re-donwloading the dataset rather than re-using it from the cacher.",
)

# The parser hands back the populated dataclass followed by a namespace that
# holds the extra (non-dataclass) options registered above.
training_args, args = parser.parse_args_into_dataclasses()

data_config_name = args.data_config_name
dataset_path = args.dataset_path
no_cache = args.no_cache
output_dir_path = Path(training_args.output_dir)
else: def compute_metrics_function(eval_pred: EvalPrediction) -> Dict: predictions, labels = eval_pred predictions = predictions[:, 0] return metric.compute(predictions=predictions, references=labels) return compute_metrics_function if __name__ == "__main__": parser = HfArgumentParser(TrainingArguments) parser.add_argument("--task", default="cola", help="name of GLUE task to compute") parser.add_argument("--model_checkpoint", default="distilbert-base-uncased") training_args, args = parser.parse_args_into_dataclasses() transformers.logging.set_verbosity_debug() task: str = args.task.lower() num_labels = num_labels_from_task(task) model = AutoModelForSequenceClassification.from_pretrained( args.model_checkpoint, num_labels=num_labels) tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint,