def main():
    class CalibrationDL():
        def __init__(self):
            path = os.path.abspath(os.path.expanduser('./brats_cal_images_list.txt'))
            with open(path, 'r') as f:
                self.preprocess_files = [line.rstrip() for line in f]
            self.loaded_files = {}
            self.batch_size = 1

        def __getitem__(self, sample_id):
            file_name = self.preprocess_files[sample_id]
            print("Loading file {:}".format(file_name))
            with open(os.path.join('build/calib_preprocess/',
                                   "{:}.pkl".format(file_name)), "rb") as f:
                self.loaded_files[sample_id] = pickle.load(f)[0]
            return torch.from_numpy(
                self.loaded_files[sample_id][np.newaxis, ...]).float(), None

        def __len__(self):
            self.count = len(self.preprocess_files)
            return self.count

    args = get_args()
    assert args.backend == "pytorch"
    model_path = os.path.join(args.model_dir, "plans.pkl")
    assert os.path.isfile(model_path), \
        "Cannot find the model file {:}!".format(model_path)
    trainer, params = load_model_and_checkpoint_files(
        args.model_dir, folds=1, fp16=False,
        checkpoint_name='model_final_checkpoint')
    trainer.load_checkpoint_ram(params[0], False)
    model = trainer.network

    if args.tune:
        quantizer = Quantization('conf.yaml')
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        calib_dl = CalibrationDL()
        quantizer.calib_dataloader = calib_dl
        q_model = quantizer()
        q_model.save('./lpot_workspace')
        exit(0)

    if args.benchmark:
        model.eval()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser('./lpot_workspace')), model)
        else:
            new_model = model
        eval_func(new_model)
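# ---------------------------------------------------------------------------
# All of the snippets in this section drive the same LPOT v1.x flow. The
# sketch below distills it for orientation; it is a minimal illustration, not
# a definitive implementation. `build_fp32_model`, `my_dataset`, and
# `my_eval_func` are hypothetical placeholders, not LPOT APIs.
# ---------------------------------------------------------------------------
from lpot import Quantization, common

quantizer = Quantization('conf.yaml')               # tuning strategy and accuracy criteria
quantizer.model = common.Model(build_fp32_model())  # wrap the FP32 model
quantizer.calib_dataloader = common.DataLoader(my_dataset)  # calibration data
quantizer.eval_func = my_eval_func                  # callable returning a scalar accuracy
q_model = quantizer()                               # calibrate, then tune until criteria are met
if q_model is not None:                             # tuning may fail to meet the accuracy goal
    q_model.save('./saved_results')                 # persist tuning config + weights for later reload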
def test_tensor_dump(self):
    model = copy.deepcopy(self.lpot_model)
    model.model.eval().fuse_model()
    quantizer = Quantization('dump_yaml.yaml')
    dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
    quantizer.model = common.Model(model.model)
    quantizer.calib_dataloader = common.DataLoader(dataset)
    quantizer.eval_func = eval_func
    quantizer()
    self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
    quantizer.eval_dataloader = common.DataLoader(dataset)
    quantizer()
    self.assertTrue(os.path.exists('runs/eval/baseline_acc0.0'))
def auto_tune(self):
    """This is the lpot tuning part, used to generate a quantized pb.

    Returns:
        graph: the quantized pb
    """
    from lpot import Quantization

    infer_graph = load_graph(self.args.input_graph)
    quantizer = Quantization(self.args.config)
    if self.args.calib_data:
        quantizer.model = infer_graph
        quantizer.calib_dataloader = Dataloader(self.args.calib_data,
                                                self.args.batch_size)
        quantizer.eval_func = self.eval_inference
        q_model = quantizer()
        return q_model
    else:
        print("Please provide a calibration dataset!")
def main(_):
    graph = load_graph(FLAGS.input_graph)
    if FLAGS.mode == 'tune':
        from lpot import Quantization, common
        quantizer = Quantization(FLAGS.config)
        ds = Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file)
        quantizer.calib_dataloader = common.DataLoader(ds,
                                                       collate_fn=collate_fn,
                                                       batch_size=FLAGS.batch_size)
        quantizer.model = common.Model(graph)
        quantizer.eval_func = eval_func
        q_model = quantizer()
        try:
            q_model.save(FLAGS.output_model)
        except Exception as e:
            print("Failed to save model due to {}".format(str(e)))
    elif FLAGS.mode == 'benchmark':
        eval_func(graph, FLAGS.iters)
    elif FLAGS.mode == 'accuracy':
        eval_func(graph, -1)
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                             "If not specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument('--version_2_with_negative', action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument("--do_calibration", action='store_true', help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference", action='store_true', help="Whether to run int8 inference.")
    parser.add_argument("--do_fp32_inference", action='store_true', help="Whether to run fp32 inference.")
    parser.add_argument("--mkldnn_eval", action='store_true', help="evaluation with MKLDNN")
    parser.add_argument("--tune", action='store_true',
                        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--task_name", default=None, type=str, required=True, help="SQuAD task")
    parser.add_argument("--warmup", type=int, default=5, help="warmup for performance")
    parser.add_argument('-i', "--iter", default=0, type=int, help='For accuracy measurement only.')
    parser.add_argument('--config', type=str, default='conf.yaml', help="yaml config file")
    parser.add_argument('--benchmark', dest='benchmark', action='store_true', help='run benchmark')
    parser.add_argument('-r', "--accuracy_only", dest='accuracy_only', action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
                        help='path to checkpoint tuned by Low Precision Optimization Tool (default: ./)')
    parser.add_argument('--int8', dest='int8', action='store_true', help='run benchmark')
    args = parser.parse_args()

    args.predict_file = os.path.join(
        args.output_dir, 'predictions_{}_{}.txt'.format(
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution
    # of torch.einsum if args.fp16 is set. Otherwise it'll default to "promote" mode,
    # and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir, force_download=True, mix_qkv=mix_qkv)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            if args.mkldnn_eval or args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint, force_download=True)
                model.to(args.device)

                # Evaluate
                result, _ = evaluate(args, model, tokenizer, prefix=global_step)
                result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:
                def eval_func_for_lpot(model):
                    result, _ = evaluate(args, model, tokenizer)
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                    bert_task_acc_keys = ['best_f1', 'f1', 'mcc', 'spearmanr', 'acc']
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, force_download=True, mix_qkv=True)
                model.to(args.device)
                dataset = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=False)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
                eval_task = "squad"
                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                dataset = quantizer.dataset('bert', dataset=dataset, task=eval_task,
                                            model_type=args.model_type)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = common.DataLoader(dataset, batch_size=args.eval_batch_size)
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=global_step)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, force_download=True, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                # Evaluate
                evaluate(args, model, tokenizer, prefix=global_step, calibration=True)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                result, _ = evaluate(args, model, tokenizer, prefix=global_step)
                result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
                              for k, v in result.items())
                results.update(result)

            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, force_download=True, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.info("Please run calibration first!")
                    return
                model_bin_file = os.path.join(quantized_model_path, "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                print(model)
                with torch.autograd.profiler.profile() as prof:
                    result, _ = evaluate(args, model, tokenizer, prefix=global_step)
                print(prof.key_averages().table(sort_by="cpu_time_total"))
                result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
                              for k, v in result.items())
                results.update(result)

    logger.info("Results: {}".format(results))

    return results
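# ---------------------------------------------------------------------------
# The benchmark/accuracy branches above restore a tuned INT8 model by pairing
# the saved tuning results with a freshly constructed FP32 model. A minimal
# sketch of that reload step in isolation; `fp32_model` is a placeholder for
# the object returned by model_class.from_pretrained(...).
# ---------------------------------------------------------------------------
import os
from lpot.utils.pytorch import load

tuned_checkpoint = './saved_results'  # the directory passed to q_model.save()
int8_model = load(os.path.abspath(os.path.expanduser(tuned_checkpoint)), fp32_model)
int8_model.eval()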
    EM_acc = evaluate(model)
    return EM_acc


if __name__ == '__main__':
    if only_calibration:
        try:
            calibration(net, num_calib_batches, quantized_dtype, calib_mode)
        except AttributeError:
            nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
            warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
    elif not only_predict:
        train()
        evaluate()
    elif args.tune:
        # lpot auto-tuning
        dev_dataloader = gen_dataset()
        from lpot import Quantization, common
        quantizer = Quantization("./bert.yaml")
        quantizer.model = common.Model(net)
        quantizer.calib_dataloader = dev_dataloader
        quantizer.eval_dataloader = dev_dataloader
        quantizer.eval_func = eval_func
        q_model = quantizer()
        q_model.save(args.output_dir)
    elif model_parameters or deploy:
        evaluate()
logger.info("speed is %.2f samples/s" % speed) return acc if __name__ == '__main__': if only_calibration: try: calibration(model, dev_data_list, num_calib_batches, quantized_dtype, calib_mode) except AttributeError: nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx) warnings.warn( 'INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115') elif args.tune: # lpot auto-tuning if only_inference: calib_data = dev_data_list[0][1] from lpot import Quantization, common quantizer = Quantization("./bert.yaml") quantizer.model = common.Model(model) quantizer.calib_dataloader = calib_data quantizer.eval_dataloader = calib_data quantizer.eval_func = test_func q_model = quantizer() q_model.save(args.output_dir) else: train(task.metrics)
                             dummy_dataloader, benchmark=True)
    latency = np.array(results).mean() / args.eval_batch_size
    print('Latency: {:.3f} ms'.format(latency * 1000))
    print('Throughput: {:.3f} items/sec'.format(args.eval_batch_size * 1. / latency))
    print('--------------------------------------------------------------')

if args.tune:
    from onnxruntime.transformers import optimizer
    from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
    opt_options = BertOptimizationOptions('bert')
    opt_options.enable_embed_layer_norm = False
    model_optimizer = optimizer.optimize_model(
        args.model_path,
        'bert',
        num_heads=12,
        hidden_size=768,
        optimization_options=opt_options)
    model = model_optimizer.model

    from lpot import Quantization, common
    quantize = Quantization(args.config)
    quantize.model = common.Model(model)
    quantize.calib_dataloader = eval_dataloader
    quantize.eval_func = eval_func
    q_model = quantize()
    q_model.save(args.output_model)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval", action='store_true', help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train", action='store_true', help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument("--do_fp32_inference", action='store_true', help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration", action='store_true', help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference", action='store_true', help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16", action='store_true', help="run bf16 evaluation / training.")
    parser.add_argument("--tune", action='store_true',
                        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--warmup", type=int, default=2, help="warmup for performance")
    parser.add_argument('-i', "--iter", default=0, type=int, help='For accuracy measurement only.')
    parser.add_argument('--config', default='conf.yaml', type=str, help='yaml config file path')
    parser.add_argument('--benchmark', dest='benchmark', action='store_true', help='run benchmark')
    parser.add_argument('-r', "--accuracy_only", dest='accuracy_only', action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
                        help='path to checkpoint tuned by Low Precision Optimization Tool (default: ./)')
    parser.add_argument('--int8', dest='int8', action='store_true', help='run benchmark')
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            logger.info("Evaluate: " + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
                results.update(result)

            if args.tune:
                def eval_func_for_lpot(model):
                    result, perf = evaluate(args, model, tokenizer, prefix=prefix)
                    bert_task_acc_keys = ['acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc']
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)

                    from lpot import Quantization, common
                    quantizer = Quantization(args.config)
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = quantizer.dataset('bert', dataset=eval_dataset,
                                                     task=eval_task, model_type=args.model_type)
                    quantizer.model = common.Model(model)
                    quantizer.calib_dataloader = common.DataLoader(eval_dataset, batch_size=args.eval_batch_size)
                    quantizer.eval_func = eval_func_for_lpot
                    q_model = quantizer()
                    q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model, layer_name="", exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args, model, tokenizer, prefix=global_step, calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model, layer_name="", exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error("Please run calibration before int8 inference!")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path, "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

    return results
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
                        help="The model checkpoint for weights initialization.")
    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")
    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training dataset will be truncated in blocks of this size for training. "
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument('--save_total_limit', type=int, default=None,
                        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument("--tune", action='store_true',
                        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument('-i', "--iter", default=0, type=int, help='For accuracy measurement only.')
    parser.add_argument('--config', type=str, default='conf.yaml', help="yaml config file")
    parser.add_argument("--do_fp32_inference", action='store_true', help="Whether to run fp32 inference.")
    parser.add_argument('--benchmark', dest='benchmark', action='store_true', help='run benchmark')
    parser.add_argument('-r', "--accuracy_only", dest='accuracy_only', action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
                        help='path to checkpoint tuned by Low Precision Optimization Tool (default: ./)')
    parser.add_argument('--int8', dest='int8', action='store_true', help='run benchmark')
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. "
                         "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training downloads model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case,
                                                cache_dir=args.cache_dir if args.cache_dir else None)
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        from_tf=bool('.ckpt' in args.model_name_or_path),
                                        config=config,
                                        cache_dir=args.cache_dir if args.cache_dir else None)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training downloads model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            # Barrier to make sure only the first process in distributed training
            # processes the dataset; the others will use the cache
            torch.distributed.barrier()

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in
                               sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            if args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v.numpy()) for k, v in result.items())
                results.update(result)

            if args.tune:
                def eval_func_for_lpot(model):
                    result = evaluate(args, model, tokenizer, prefix=prefix)
                    return 100 - result['perplexity'].numpy()

                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                model.eval()
                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                # Calibrate on the evaluation data (the original conditioned on
                # `evaluate`, which is always truthy here, so it always chose
                # args.eval_data_file anyway)
                eval_dataset = WikiDataset(tokenizer, args,
                                           file_path=args.eval_data_file,
                                           block_size=args.block_size)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
                # Note that DistributedSampler samples randomly
                eval_sampler = SequentialSampler(eval_dataset)
                eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = eval_dataloader
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

    return results