示例#1
0
文件: run.py 项目: mbasnet1/lpot
def main():
    class CalibrationDL():
        def __init__(self):
            path = os.path.abspath(
                os.path.expanduser('./brats_cal_images_list.txt'))
            with open(path, 'r') as f:
                self.preprocess_files = [line.rstrip() for line in f]

            self.loaded_files = {}
            self.batch_size = 1

        def __getitem__(self, sample_id):
            file_name = self.preprocess_files[sample_id]
            print("Loading file {:}".format(file_name))
            with open(
                    os.path.join('build/calib_preprocess/',
                                 "{:}.pkl".format(file_name)), "rb") as f:
                self.loaded_files[sample_id] = pickle.load(f)[0]
            return torch.from_numpy(
                self.loaded_files[sample_id][np.newaxis, ...]).float(), None

        def __len__(self):
            self.count = len(self.preprocess_files)
            return self.count

    args = get_args()
    assert args.backend == "pytorch"
    model_path = os.path.join(args.model_dir, "plans.pkl")
    assert os.path.isfile(
        model_path), "Cannot find the model file {:}!".format(model_path)
    trainer, params = load_model_and_checkpoint_files(
        args.model_dir,
        folds=1,
        fp16=False,
        checkpoint_name='model_final_checkpoint')
    trainer.load_checkpoint_ram(params[0], False)
    model = trainer.network

    if args.tune:
        quantizer = Quantization('conf.yaml')
        quantizer.model = common.Model(model)
        quantizer.eval_func = eval_func
        calib_dl = CalibrationDL()
        quantizer.calib_dataloader = calib_dl
        q_model = quantizer()
        q_model.save('./lpot_workspace')
        exit(0)

    if args.benchmark:
        model.eval()
        if args.int8:
            from lpot.utils.pytorch import load
            new_model = load(
                os.path.abspath(os.path.expanduser('./lpot_workspace')), model)
        else:
            new_model = model
        eval_func(new_model)
示例#2
0
 def test_tensor_dump(self):
     model = copy.deepcopy(self.lpot_model)
     model.model.eval().fuse_model()
     quantizer = Quantization('dump_yaml.yaml')
     dataset = quantizer.dataset('dummy', (100, 3, 256, 256), label=True)
     quantizer.model = common.Model(model.model)
     quantizer.calib_dataloader = common.DataLoader(dataset)
     quantizer.eval_func = eval_func
     quantizer()
     self.assertTrue(
         True if os.path.exists('runs/eval/baseline_acc0.0') else False)
     quantizer.eval_dataloader = common.DataLoader(dataset)
     quantizer()
     self.assertTrue(
         True if os.path.exists('runs/eval/baseline_acc0.0') else False)
示例#3
0
 def auto_tune(self):
     """This is lpot tuning part to generate a quantized pb
     Returns:
         graph: it will return a quantized pb
     """
     from lpot import Quantization
     infer_graph = load_graph(self.args.input_graph)
     quantizer = Quantization(self.args.config)
     if self.args.calib_data:
         quantizer.model = infer_graph
         quantizer.calib_dataloader = Dataloader(self.args.calib_data,
                                                 self.args.batch_size)
         quantizer.eval_func = self.eval_inference
         q_model = quantizer()
         return q_model
     else:
         print("Please provide calibration dataset!")
示例#4
0
文件: main.py 项目: mbasnet1/lpot
def main(_):
    graph = load_graph(FLAGS.input_graph)
    if FLAGS.mode == 'tune':
        from lpot import Quantization, common
        quantizer = Quantization(FLAGS.config)
        ds = Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file)
        quantizer.calib_dataloader = common.DataLoader(ds, collate_fn=collate_fn, \
                                                 batch_size=FLAGS.batch_size)
        quantizer.model = common.Model(graph)
        quantizer.eval_func = eval_func
        q_model = quantizer()
        try:
            q_model.save(FLAGS.output_model)
        except Exception as e:
            print("Failed to save model due to {}".format(str(e)))
    elif FLAGS.mode == 'benchmark':
        eval_func(graph, FLAGS.iters)
    elif FLAGS.mode == 'accuracy':
        eval_func(graph, -1)
示例#5
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        '--version_2_with_negative',
        action='store_true',
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="SQuAD task")
    parser.add_argument("--warmup",
                        type=int,
                        default=5,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        type=str,
                        default='conf.yaml',
                        help="yaml config file")
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark')

    args = parser.parse_args()

    args.predict_file = os.path.join(
        args.output_dir, 'predictions_{}_{}.txt'.format(
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True,
                                            mix_qkv=mix_qkv)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            if args.mkldnn_eval or args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True)
                model.to(args.device)

                # Evaluate
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    result, _ = evaluate(args, model, tokenizer)
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                    bert_task_acc_keys = [
                        'best_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                dataset = load_and_cache_examples(args,
                                                  tokenizer,
                                                  evaluate=True,
                                                  output_examples=False)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                    1, args.n_gpu)
                eval_task = "squad"
                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                dataset = quantizer.dataset('bert',
                                            dataset=dataset,
                                            task=eval_task,
                                            model_type=args.model_type)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = common.DataLoader(
                    dataset, batch_size=args.eval_batch_size)
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args,
                                     new_model,
                                     tokenizer,
                                     prefix=global_step)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                # Evaluate
                evaluate(args,
                         model,
                         tokenizer,
                         prefix=global_step,
                         calibration=True)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step)
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint,
                                                    force_download=True,
                                                    mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                propagate_qconfig_(model)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = "squad" + str(
                    global_step) + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.info("Please run calibration first!")
                    return
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                print(model)
                with torch.autograd.profiler.profile() as prof:
                    result, _ = evaluate(args,
                                         model,
                                         tokenizer,
                                         prefix=global_step)
                print(prof.key_averages().table(sort_by="cpu_time_total"))
                result = dict(
                    (k + ('_{}'.format(global_step) if global_step else ''), v)
                    for k, v in result.items())
                results.update(result)
    logger.info("Results: {}".format(results))

    return results
示例#6
0
    EM_acc = evaluate(model)
    return EM_acc

if __name__ == '__main__':
    if only_calibration:
        try:
            calibration(net,
                        num_calib_batches,
                        quantized_dtype,
                        calib_mode)
        except AttributeError:
            nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
            warnings.warn('INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115')
    elif not only_predict:
        train()
        evaluate()
    elif args.tune:
        # lpot auto-tuning
        dev_dataloader = gen_dataset()
        from lpot import Quantization, common
        quantizer = Quantization("./bert.yaml")
        quantizer.model = common.Model(net)
        quantizer.calib_dataloader = dev_dataloader
        quantizer.eval_dataloader = dev_dataloader
        quantizer.eval_func = eval_func
        q_model = quantizer()
        q_model.save(args.output_dir)
    elif model_parameters or deploy:
        evaluate()

示例#7
0
    logger.info("speed is %.2f samples/s" % speed)

    return acc


if __name__ == '__main__':
    if only_calibration:
        try:
            calibration(model, dev_data_list, num_calib_batches,
                        quantized_dtype, calib_mode)
        except AttributeError:
            nlp.utils.version.check_version('1.7.0',
                                            warning_only=True,
                                            library=mx)
            warnings.warn(
                'INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115')
    elif args.tune:
        # lpot auto-tuning
        if only_inference:
            calib_data = dev_data_list[0][1]
            from lpot import Quantization, common
            quantizer = Quantization("./bert.yaml")
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = calib_data
            quantizer.eval_dataloader = calib_data
            quantizer.eval_func = test_func
            q_model = quantizer()
            q_model.save(args.output_dir)
    else:
        train(task.metrics)
示例#8
0
                                      dummy_dataloader,
                                      benchmark=True)
            latency = np.array(results).mean() / args.eval_batch_size

            print('Latency: {:.3f} ms'.format(latency * 1000))
            print('Throughput: {:.3f} items/sec'.format(args.eval_batch_size *
                                                        1. / latency))
        print('--------------------------------------------------------------')

    if args.tune:
        from onnxruntime.transformers import optimizer
        from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
        opt_options = BertOptimizationOptions('bert')
        opt_options.enable_embed_layer_norm = False

        model_optimizer = optimizer.optimize_model(
            args.model_path,
            'bert',
            num_heads=12,
            hidden_size=768,
            optimization_options=opt_options)
        model = model_optimizer.model

        from lpot import Quantization, common
        quantize = Quantization(args.config)
        quantize.model = common.Model(model)
        quantize.calib_dataloader = eval_dataloader
        quantize.eval_func = eval_func
        q_model = quantize()
        q_model.save(args.output_model)
示例#9
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list;"
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--mkldnn_eval",
                        action='store_true',
                        help="evaluation with MKLDNN")
    parser.add_argument("--mkldnn_train",
                        action='store_true',
                        help="training with MKLDNN")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--do_fp32_inference",
                        action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument("--do_calibration",
                        action='store_true',
                        help="Whether to do calibration.")
    parser.add_argument("--do_int8_inference",
                        action='store_true',
                        help="Whether to run int8 inference.")
    parser.add_argument("--do_bf16",
                        action='store_true',
                        help="run bf16 evaluation / training.")
    parser.add_argument(
        "--tune",
        action='store_true',
        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument("--warmup",
                        type=int,
                        default=2,
                        help="warmup for performance")
    parser.add_argument('-i',
                        "--iter",
                        default=0,
                        type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config',
                        default='conf.yaml',
                        type=str,
                        help='yaml config file path')
    parser.add_argument('--benchmark',
                        dest='benchmark',
                        action='store_true',
                        help='run benchmark')
    parser.add_argument('-r',
                        "--accuracy_only",
                        dest='accuracy_only',
                        action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument(
        "--tuned_checkpoint",
        default='./saved_results',
        type=str,
        metavar='PATH',
        help=
        'path to checkpoint tuned by Low Precision Optimization Tool (default: ./)'
    )
    parser.add_argument('--int8',
                        dest='int8',
                        action='store_true',
                        help='run benchmark')

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    mix_qkv = False
    if args.do_calibration or args.do_int8_inference or args.tune:
        mix_qkv = True
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        mix_qkv=mix_qkv,
        bf16=args.do_bf16,
        mkldnn_train=args.mkldnn_train,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            logger.info("Evaluate:" + args.task_name)
            if args.mkldnn_eval or args.do_fp32_inference or args.do_bf16:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v)
                              for k, v in result.items())
                results.update(result)

            if args.tune:

                def eval_func_for_lpot(model):
                    result, perf = evaluate(args,
                                            model,
                                            tokenizer,
                                            prefix=prefix)
                    bert_task_acc_keys = [
                        'acc_and_f1', 'f1', 'mcc', 'spearmanr', 'acc'
                    ]
                    for key in bert_task_acc_keys:
                        if key in result.keys():
                            logger.info("Finally Eval {}:{}".format(
                                key, result[key]))
                            acc = result[key]
                            break
                    return acc

                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                eval_task_names = (
                    "mnli", "mnli-mm") if args.task_name == "mnli" else (
                        args.task_name, )

                for eval_task in eval_task_names:
                    eval_dataset = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

                    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                        1, args.n_gpu)
                    # multi-gpu eval
                    if args.n_gpu > 1:
                        model = torch.nn.DataParallel(model)

                    if args.mkldnn_eval:
                        from torch.utils import mkldnn as mkldnn_utils
                        model = mkldnn_utils.to_mkldnn(model)
                        print(model)
                    from lpot import Quantization, common
                    quantizer = Quantization(args.config)
                    if eval_task != "squad":
                        eval_task = 'classifier'
                    eval_dataset = quantizer.dataset(
                        'bert',
                        dataset=eval_dataset,
                        task=eval_task,
                        model_type=args.model_type)
                    quantizer.model = common.Model(model)
                    quantizer.calib_dataloader = common.DataLoader(
                        eval_dataset, batch_size=args.eval_batch_size)
                    quantizer.eval_func = eval_func_for_lpot
                    q_model = quantizer()
                    q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)

                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(
                            os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result, _ = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)

            if args.do_calibration:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                result, _ = evaluate(args,
                                     model,
                                     tokenizer,
                                     prefix=global_step,
                                     calibration=True)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    os.makedirs(quantized_model_path)
                model.save_pretrained(quantized_model_path)
                print(model)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)
            if args.do_int8_inference:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)
                model.qconfig = default_per_channel_qconfig
                fallback_layers = {}
                if args.model_name_or_path == "bert-base-uncased" and args.task_name == "mrpc":
                    fallback_layers = {"bert.encoder.layer.9.output.dense."}
                propagate_qconfig_(model)
                fallback_layer(model,
                               layer_name="",
                               exculde_layers=fallback_layers)
                add_observer_(model)
                convert(model, inplace=True)
                quantized_model_path = args.task_name + "_quantized_model"
                if not os.path.exists(quantized_model_path):
                    logger.error(
                        "please do calibrantion befor run int8 inference")
                    return
                prepare(model, inplace=True)
                convert(model, inplace=True)
                model_bin_file = os.path.join(quantized_model_path,
                                              "pytorch_model.bin")
                state_dict = torch.load(model_bin_file)
                model.load_state_dict(state_dict)
                result, _ = evaluate(args, model, tokenizer, prefix=prefix)

    return results
示例#10
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")

    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
                        help="The model checkpoint for weights initialization.")

    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument('--save_total_limit', type=int, default=None,
                        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument("--tune", action='store_true',
                        help="run Low Precision Optimization Tool to tune int8 acc.")
    parser.add_argument('-i', "--iter", default=0, type=int,
                        help='For accuracy measurement only.')
    parser.add_argument('--config', type=str, default='conf.yaml', help="yaml config file")
    parser.add_argument("--do_fp32_inference", action='store_true',
                        help="Whether to run fp32 inference.")
    parser.add_argument('--benchmark', dest='benchmark', action='store_true',
                        help='run benchmark')
    parser.add_argument('-r', "--accuracy_only", dest='accuracy_only', action='store_true',
                        help='For accuracy measurement only.')
    parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
                        help='path to checkpoint tuned by Low Precision Optimization Tool (default: ./)')
    parser.add_argument('--int8', dest='int8', action='store_true',
                        help='run benchmark')
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case,
                                                cache_dir=args.cache_dir if args.cache_dir else None)
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        from_tf=bool('.ckpt' in args.model_name_or_path),
                                        config=config,
                                        cache_dir=args.cache_dir if args.cache_dir else None)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            
            if args.do_fp32_inference:
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=prefix)
                result = dict((k + '_{}'.format(global_step), v.numpy()) for k, v in result.items())
                results.update(result)

            if args.tune:
                def eval_func_for_lpot(model):
                    result = evaluate(args, model, tokenizer, prefix=prefix)
                    return 100 - result['perplexity'].numpy()

                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                model.eval()

                from lpot import Quantization, common
                quantizer = Quantization(args.config)
                eval_dataset = WikiDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
                args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
                # Note that DistributedSampler samples randomly
                eval_sampler = SequentialSampler(eval_dataset)
                eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
                quantizer.model = common.Model(model)
                quantizer.calib_dataloader = eval_dataloader
                quantizer.eval_func = eval_func_for_lpot
                q_model = quantizer()
                q_model.save(args.tuned_checkpoint)
                exit(0)

            if args.benchmark or args.accuracy_only:
                model = model_class.from_pretrained(checkpoint, mix_qkv=True)
                model.to(args.device)

                if args.int8:
                    from lpot.utils.pytorch import load
                    new_model = load(
                        os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
                else:
                    new_model = model
                result = evaluate(args, new_model, tokenizer, prefix=prefix)
                exit(0)
    return results