Example #1
import argparse
import os

# MODELS, MODEL_CLASSES, Precision, OptimizerInfo, and FusionOptions are
# defined elsewhere in the surrounding module.


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m",
        "--models",
        required=False,
        nargs="+",
        type=str,
        default=["bert-base-cased", "roberta-base", "gpt2"],
        choices=list(MODELS.keys()),
        help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
    )

    parser.add_argument(
        "--model_source",
        required=False,
        type=str,
        default="pt",
        choices=["pt", "tf"],
        help="Export onnx from pt or tf",
    )

    parser.add_argument(
        "--model_class",
        required=False,
        type=str,
        default=None,
        choices=list(MODEL_CLASSES),
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )

    parser.add_argument(
        "-e",
        "--engines",
        required=False,
        nargs="+",
        type=str,
        default=["onnxruntime"],
        choices=["onnxruntime", "torch", "torchscript", "tensorflow"],
        help="Engines to benchmark",
    )

    parser.add_argument(
        "-c",
        "--cache_dir",
        required=False,
        type=str,
        default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )

    parser.add_argument(
        "--onnx_dir",
        required=False,
        type=str,
        default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )

    parser.add_argument("-g",
                        "--use_gpu",
                        required=False,
                        action="store_true",
                        help="Run on gpu device")

    parser.add_argument(
        "--provider",
        required=False,
        type=str,
        default=None,
        help="Execution provider to use",
    )

    parser.add_argument(
        "-p",
        "--precision",
        type=Precision,
        default=Precision.FLOAT32,
        choices=list(Precision),
        help="Precision of model to run: fp32 for full precision, fp16 for half precision, and int8 for quantization.",
    )

    parser.add_argument("--verbose",
                        required=False,
                        action="store_true",
                        help="Print more information")

    parser.add_argument(
        "--overwrite",
        required=False,
        action="store_true",
        help="Overwrite existing models",
    )

    parser.add_argument(
        "-o",
        "--optimizer_info",
        type=OptimizerInfo,
        default=OptimizerInfo.BYSCRIPT,
        choices=list(OptimizerInfo),
        help="Optimizer info: use optimizer.py to optimize the onnx model by default, or choose from by_ort and no_opt.",
    )

    parser.add_argument(
        "-v",
        "--validate_onnx",
        required=False,
        action="store_true",
        help="Validate ONNX model",
    )

    parser.add_argument(
        "-f",
        "--fusion_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results of graph optimization.",
    )

    parser.add_argument(
        "-d",
        "--detail_csv",
        required=False,
        default=None,
        help="CSV file for saving detail results.",
    )

    parser.add_argument(
        "-r",
        "--result_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results.",
    )

    parser.add_argument(
        "-i",
        "--input_counts",
        required=False,
        nargs="+",
        default=[1],
        type=int,
        choices=[1, 2, 3],
        help="Number of ONNX model inputs. Please use 1 for a fair comparison with Torch or TorchScript.",
    )

    parser.add_argument(
        "-t",
        "--test_times",
        required=False,
        default=100,
        type=int,
        help="Number of repeat times to get average inference latency.",
    )

    parser.add_argument("-b",
                        "--batch_sizes",
                        nargs="+",
                        type=int,
                        default=[1])

    parser.add_argument(
        "-s",
        "--sequence_lengths",
        nargs="+",
        type=int,
        default=[4, 8, 16, 32, 64, 128, 256],
    )

    parser.add_argument(
        "--disable_ort_io_binding",
        required=False,
        action="store_true",
        help="Disable running ONNX Runtime with binded inputs and outputs. ",
    )
    parser.set_defaults(disable_ort_io_binding=False)

    parser.add_argument(
        "-n",
        "--num_threads",
        required=False,
        nargs="+",
        type=int,
        default=[0],
        help="Threads to use",
    )

    parser.add_argument(
        "--force_num_layers",
        required=False,
        type=int,
        default=None,
        help="Manually set the model's layer number",
    )

    FusionOptions.add_arguments(parser)

    args = parser.parse_args()
    return args
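
The -p/--precision and -o/--optimizer_info arguments above use an enum-valued pattern: an Enum class is passed as type and its members as choices. Below is a minimal, self-contained sketch of that pattern; this Precision class is a stand-in for illustration, not the class imported by the original module.

import argparse
from enum import Enum


class Precision(str, Enum):
    # Subclassing str lets members compare equal to their string values;
    # __str__ keeps the --help text and error messages readable.
    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    INT8 = "int8"

    def __str__(self):
        return self.value


parser = argparse.ArgumentParser()
# argparse calls Precision("fp16") to convert, then checks membership in choices.
parser.add_argument("-p", "--precision", type=Precision,
                    default=Precision.FLOAT32, choices=list(Precision))
args = parser.parse_args(["-p", "fp16"])
print(args.precision)  # fp16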
Example #2
import argparse

# MODEL_TYPES and FusionOptions are defined elsewhere in the surrounding module.


def _parse_arguments():
    parser = argparse.ArgumentParser(
        description='Graph optimization tool for ONNX Runtime. It transforms an ONNX graph to use optimized operators for Transformer models.'
    )
    parser.add_argument('--input',
                        required=True,
                        type=str,
                        help="input onnx model path")

    parser.add_argument('--output',
                        required=True,
                        type=str,
                        help="optimized onnx model path")

    parser.add_argument('--model_type',
                        required=False,
                        type=str.lower,
                        default="bert",
                        choices=list(MODEL_TYPES.keys()),
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_TYPES.keys()))

    parser.add_argument(
        '--num_heads',
        required=False,
        type=int,
        default=0,
        help="Number of attention heads, e.g. 12 for bert-base and 16 for bert-large. The default 0 lets the tool detect it automatically for BERT; for other model types, this parameter needs to be specified correctly."
    )

    parser.add_argument(
        '--hidden_size',
        required=False,
        type=int,
        default=0,
        help="Hidden size, e.g. 768 for bert-base and 1024 for bert-large. The default 0 lets the tool detect it automatically for BERT; for other model types, this parameter needs to be specified correctly."
    )

    parser.add_argument(
        '--input_int32',
        required=False,
        action='store_true',
        help="Use int32 (instead of int64) inputs. This can avoid an unnecessary data cast when EmbedLayerNormalization is fused for BERT."
    )
    parser.set_defaults(input_int32=False)

    parser.add_argument(
        '--float16',
        required=False,
        action='store_true',
        help="Convert all float32 weights and nodes to float16. This has a potential loss in precision compared to mixed-precision conversion (see convert_float_to_float16)."
    )
    parser.set_defaults(float16=False)

    FusionOptions.add_arguments(parser)

    parser.add_argument('--verbose',
                        required=False,
                        action='store_true',
                        help="show debug information.")
    parser.set_defaults(verbose=False)

    parser.add_argument(
        '--use_gpu',
        required=False,
        action='store_true',
        help="Use GPU for inference. Set this flag if your model is intended for GPU when opt_level > 1."
    )
    parser.set_defaults(use_gpu=False)

    parser.add_argument(
        '--only_onnxruntime',
        required=False,
        action='store_true',
        help="optimized by onnxruntime only, and no graph fusion in Python")
    parser.set_defaults(only_onnxruntime=False)

    parser.add_argument(
        '--opt_level',
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=None,
        help="onnxruntime optimization level. 0 disables onnxruntime graph optimization; the recommended value is 1. When opt_level > 1 is used, a model optimized for GPU might not run on CPU. Levels 2 and 99 are intended for --only_onnxruntime."
    )

    parser.add_argument(
        '--use_external_data_format',
        required=False,
        action='store_true',
        help="use external data format to store large model (>2GB)")
    parser.set_defaults(use_external_data_format=False)

    args = parser.parse_args()

    return args
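
Two small patterns in this parser are worth isolating: type=str.lower normalizes the value before argparse checks it against choices, and set_defaults() pins a flag's default after add_argument(). A minimal, self-contained sketch follows; the choices list here is a stand-in, not MODEL_TYPES from the original module.

import argparse

parser = argparse.ArgumentParser()
# type runs before the choices check, so "--model_type BERT" passes validation.
parser.add_argument('--model_type', type=str.lower, default='bert',
                    choices=['bert', 'gpt2', 'bart'])
parser.add_argument('--float16', action='store_true')
parser.set_defaults(float16=False)  # explicit, though store_true already defaults to False

args = parser.parse_args(['--model_type', 'BERT'])
print(args.model_type)  # 'bert' -- lowercased before validation
print(args.float16)     # False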
Example #3
import argparse

# MODEL_TYPES and FusionOptions are defined elsewhere in the surrounding module.


def _parse_arguments():
    parser = argparse.ArgumentParser(
        description='Graph optimization tool for ONNX Runtime. It transforms an ONNX graph to use optimized operators for Transformer models.'
    )
    parser.add_argument('--input', required=True, type=str, help="input onnx model path")

    parser.add_argument('--output', required=True, type=str, help="optimized onnx model path")

    parser.add_argument('--model_type',
                        required=False,
                        type=str.lower,
                        default="bert",
                        choices=list(MODEL_TYPES.keys()),
                        help="Model type selected in the list: " + ", ".join(MODEL_TYPES.keys()))

    parser.add_argument(
        '--num_heads',
        required=False,
        type=int,
        default=12,
        help="Number of attention heads: 12 for bert-base and 16 for bert-large. For BERT, set it to 0 to detect automatically."
    )

    parser.add_argument(
        '--hidden_size',
        required=False,
        type=int,
        default=768,
        help="BERT model hidden size: 768 for bert-base and 1024 for bert-large. For BERT, set it to 0 to detect automatically."
    )

    parser.add_argument('--input_int32',
                        required=False,
                        action='store_true',
                        help="Use int32 (instead of int64) tensor as input to avoid unnecessary data cast.")
    parser.set_defaults(input_int32=False)

    parser.add_argument(
        '--float16',
        required=False,
        action='store_true',
        help="If your target device is a V100 or T4 GPU, try this to convert float32 to float16 for best performance (with potential loss in precision)."
    )
    parser.set_defaults(float16=False)

    FusionOptions.add_arguments(parser)

    parser.add_argument('--verbose', required=False, action='store_true', help="show debug information.")
    parser.set_defaults(verbose=False)

    parser.add_argument(
        '--use_gpu',
        required=False,
        action='store_true',
        help="use GPU for inference. Set this flag if your model is intended for GPU and opt_level > 1.")
    parser.set_defaults(use_gpu=False)

    parser.add_argument('--only_onnxruntime',
                        required=False,
                        action='store_true',
                        help="optimized by onnxruntime only, and no graph fusion in Python")
    parser.set_defaults(only_onnxruntime=False)

    parser.add_argument(
        '--opt_level',
        required=False,
        type=int,
        choices=[0, 1, 2, 99],
        default=None,
        help="onnxruntime optimization level. 0 disables onnxruntime graph optimization. Graph fusion in Python is not affected by this setting."
    )

    parser.add_argument('--use_external_data_format',
                        required=False,
                        action='store_true',
                        help="use external data format")
    parser.set_defaults(use_external_data_format=False)

    args = parser.parse_args()

    return args
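
For context on what the parsed opt_level controls, here is a hedged sketch of how an integer level like the one above is commonly mapped onto onnxruntime's session-level graph optimization enum. The mapping shown is an assumption for illustration, not this tool's exact internal code.

import onnxruntime as ort

# Assumed illustrative mapping from the CLI's opt_level integers to
# onnxruntime's GraphOptimizationLevel enum values.
LEVEL_MAP = {
    0: ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
    1: ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
    2: ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
    99: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
}

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = LEVEL_MAP[1]  # e.g. level 1, basic optimizations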