Example #1
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    # Fall back to the default tolerance for the selected precision.
    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    # --output is either a target directory or a .onnx file path.
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    # Validate flag combinations: fp16/int8 require ONNX optimization, fp16
    # needs CUDA, and int8 quantization runs on CPU only.
    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    # Older transformers versions expose return_tuple; the ONNX exporter
    # needs tuple outputs rather than model-output objects.
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    # Export straight to --output when it names a .onnx file; otherwise use
    # the derived paths. Plain fp32 without optimization keeps the raw export
    # as the final output.
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    # Compare ONNX Runtime output against the PyTorch model within tolerance.
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    if args.input_test_file:
        test_inputs = []
        # Each line of the test file is a JSON object with input_ids,
        # position_ids and attention_mask arrays.
        with open(args.input_test_file) as read_f:
            for line in read_f:
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)
                position_ids = torch.from_numpy(
                    numpy.asarray(data["position_ids"],
                                  dtype=numpy.int64)).to(device)
                numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                attention_mask = torch.from_numpy(
                    numpy.asarray(data["attention_mask"],
                                  dtype=numpy_float)).to(device)
                inputs = {
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attention_mask": attention_mask
                }
                test_inputs.append(inputs)
        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
Example #2
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    logger.info(f"Done. Output model: {output_path}")
Example #3
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    # Heuristic for the 2GB protobuf limit: deep models store weights in
    # external data files. TODO: find a way to check model size > 2GB.
    use_external_data_format = config.n_layer > 24
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        new_folder=use_external_data_format)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    # Whether this model class takes position_ids and attention_mask inputs.
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        logger.info(f"Optimizing model to {output_path}")
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=True,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for line in read_f:
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(
                            numpy.asarray(data["attention_mask"],
                                          dtype=numpy_float)).to(device)
                    else:
                        # No mask in the file: treat -1 ids as padding, build
                        # the mask, then replace padding ids with 0.
                        padding = -1
                        attention_mask = (input_ids != padding).type(
                            torch.float16 if args.precision ==
                            Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(
                            numpy.asarray(data["position_ids"],
                                          dtype=numpy.int64)).to(device)
                    else:
                        # Derive position ids as the running count of real
                        # tokens; the clamp keeps padded positions at 0.
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {
                        "input_ids": input_ids,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask
                    }
                else:
                    inputs = {"input_ids": input_ids}

                test_inputs.append(inputs)

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")