def quantize_relay_module(mod, params, qconfig=None):
    """ Quantize the relay module with qconfig options.

    Parameters:
    ------
    mod : tvm.relay.module
        The original module.

    qconfig : tvm.relay.quantize.quantize.QConfig
        The quantization configuration

    Returns:
    ------
    qfunc : vm.relay.expr.Function
        The graph after quantization
    
    """

    # default qconfig
    if not qconfig:
        qconfig = qtz.qconfig()

    with qconfig:
        logging.debug('current quantize config')
        logging.debug(qtz.current_qconfig())
        mod['main'] = qtz.quantize(mod['main'], params=params)
        logging.debug('after quantize')
        logging.debug(mod['main'].astext(show_meta_data=False))
    return mod
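# Hedged usage sketch (not part of the original example): quantize a tiny
# conv2d module end to end. The model below is hypothetical, and the
# relay.Module / relay.analysis calls assume a TVM ~0.6 API, matching the
# mod['main'] indexing used above.
def example_quantize_usage():
    import numpy as np
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz

    data = relay.var("data", shape=(1, 3, 224, 224), dtype="float32")
    weight = relay.var("weight", shape=(16, 3, 3, 3), dtype="float32")
    conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3),
                           padding=(1, 1), channels=16)
    func = relay.Function(relay.analysis.free_vars(conv), conv)
    mod = relay.Module.from_expr(func)
    params = {"weight": tvm.nd.array(
        np.random.uniform(-1, 1, (16, 3, 3, 3)).astype("float32"))}

    # An explicit qconfig; passing None falls back to qtz.qconfig() defaults.
    return quantize_relay_module(mod, params, qtz.qconfig(global_scale=8.0))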
def test_onnx_quantize_acc(cfg, rec_val, batch_size=1, original=False):
    qconfig = qtz.qconfig(
        skip_conv_layers=[0],
        skip_dense_layer=False,
        nbit_input=cfg.nbit_input,
        nbit_weight=cfg.nbit_input,
        dtype_input=cfg.dtype_input,
        dtype_weight=cfg.dtype_input,
        dtype_activation=cfg.dtype_output,
        debug_enabled_ops=None,
        calibrate_mode="percentile",
        calibrate_chunk_by=8,
    )

    dataset = list(calibrate_dataset(cfg.model, rec_val, batch_size, 64))
    model, logfile = get_onnx_model(cfg.model,
                                    batch_size,
                                    qconfig,
                                    tvm.target.cuda(),
                                    original=original,
                                    dataset=dataset)
    val_data, batch_fn = get_val_data(cfg.model,
                                      rec_val=rec_val,
                                      batch_size=batch_size)

    with tvm.autotvm.apply_history_best(logfile):
        acc = eval_acc(model, val_data, batch_fn, log_interval=1000)
    assert acc > cfg.expected_acc
    return acc
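# calibrate_dataset is not shown in this snippet. A plausible minimal sketch
# (the name, input key, and shapes are assumptions): TVM's "percentile"
# calibration consumes an iterable of {input_name: ndarray} batches, so
# random data stands in here for records read from `rec_val`.
def calibrate_dataset_sketch(model_name, rec_val, batch_size, num_batches):
    import numpy as np
    for _ in range(num_batches):
        yield {"data": np.random.uniform(
            size=(batch_size, 3, 224, 224)).astype("float32")}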
Example #3
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    # extract workloads from relay program
    logging.info("Extract tasks...")
    mod, params, input_shape = get_model(cfg.model, cfg.batch_size, qconfig, target)

    tasks = autotvm.task.extract_from_program(mod, target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))
    for i in range(len(tasks)):
        op_name = tasks[i].workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            raise ValueError("Tuning {} is not supported on x86".format(op_name))

        print("[Create Task %2d/%2d (%s, %s)]" % (i + 1, len(tasks),
                                                  tasks[i].name, tasks[i].workload[0]))

        tsk = autotvm.task.create(func_create, args=tasks[i].args,
                                  target=tasks[i].target, template_key='direct')
        tsk.workload = tasks[i].workload
        tasks[i] = tsk

    # run tuning tasks
    logging.info("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with the history best records from tuning
    logging.info("Compile...")
    with autotvm.apply_history_best(log_file):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

    # export library
    tmp = tempdir()
    filename = "net.tar"
    lib.export_library(tmp.relpath(filename))

    # load parameters
    module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
    module.set_input('data', data_tvm)
    module.set_input(**params)

    # evaluate
    logging.info("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
Example #4
def test_quantize_pass():
    def quantize_weight(arr):
        maximum = np.amax(np.abs(arr.asnumpy()))
        scale = 2**math.ceil(math.log(maximum, 2))
        out = np.around(arr.asnumpy() / scale * 128).astype('int8')
        out = np.clip(out, -127, 127)
        return relay.const(out, 'int8')

    n, c, h, w = 1, 3, 224, 224

    def make_graph(data):
        weight = relay.var("conv_weight")
        out = relay.nn.conv2d(data,
                              weight,
                              kernel_size=(3, 3),
                              padding=(1, 1),
                              channels=c)
        out = relay.Function(relay.ir_pass.free_vars(out), out)
        return out

    def make_qgraph(data, weight):
        out = data * relay.const(32.0)
        out = relay.round(out)
        out = relay.clip(out, a_min=-127, a_max=127)
        out = out.astype('int8')

        out = relay.nn.conv2d(out,
                              weight,
                              kernel_size=(3, 3),
                              padding=(1, 1),
                              channels=c,
                              out_dtype='int32')
        out = out.astype('float32')
        # rescale back to float32: 1 / (32 * 128) == 2 ** -12 ~= 0.00024414062
        out = relay.multiply(out, relay.const(0.00024414062))
        out = relay.Function(relay.ir_pass.free_vars(out), out)
        return out

    np.random.seed(42)

    data = relay.var("data", relay.TensorType((n, c, h, w), "float32"))
    graph = make_graph(data)
    dataset, params = make_dataset(graph, 10)

    with qtz.qconfig(skip_k_conv=0,
                     global_scale=4.0,
                     round_for_shift=False,
                     store_lowbit_output=False):
        qgraph0 = qtz.quantize(graph, params)
        qgraph0 = relay.ir_pass.infer_type(qgraph0)

    conv_weight = quantize_weight(params['conv_weight'])
    qgraph1 = make_qgraph(data, conv_weight)
    qgraph1 = relay.ir_pass.infer_type(qgraph1)

    graph = relay.create_executor('graph')
    res0 = graph.evaluate(qgraph0)(dataset[0]['data'])
    res1 = graph.evaluate(qgraph1)(dataset[0]['data'])
    tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy(), rtol=1e-3)
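# make_dataset is not shown here; a hedged reconstruction based on how it is
# used above (details are assumptions): random arrays for the 'data' input
# form the calibration dataset, every other free variable becomes a random
# parameter.
def make_dataset_sketch(graph, size=100):
    import numpy as np
    args = relay.ir_pass.infer_type(graph).params

    def create_arr(var):
        ttype = var.type_annotation
        np_arr = np.random.uniform(
            -1.0, 1.0, size=ttype.concrete_shape).astype(ttype.dtype)
        return tvm.nd.array(np_arr)

    params, dataset = {}, []
    for arg in args:
        if arg.name_hint == "data":
            dataset = [{"data": create_arr(arg)} for _ in range(size)]
        else:
            params[arg.name_hint] = create_arr(arg)
    return dataset, params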
Example #5
def build_model(args, gluon_model):
    """Build with relay."""
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz
    img_size = 299 if args.model == 'inceptionv3' else 224
    data_shape = (args.batch_size, 3, img_size, img_size)
    net, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
    target = args.target

    if args.original:
        # run original model
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        ctx = tvm.nd.context(target, 0)
        return graph, lib, params, ctx

    # constant folding and scale folding.
    print('original')
    print(net.astext(show_meta_data=False))
    with relay.build_config(opt_level=3):
        qgraph = relay.optimize(net, target, params)
        # qgraph = relay.optimize(qgraph)
    print('after optimize')
    print(qgraph.astext(show_meta_data=False))

    with qtz.qconfig(skip_k_conv=0,
                     nbit_input=args.nbit_input,
                     nbit_weight=args.nbit_input,
                     global_scale=args.global_scale,
                     dtype_input=args.dtype_input,
                     dtype_weight=args.dtype_input,
                     dtype_activation=args.dtype_output,
                     store_lowbit_output=False,
                     debug_enabled_ops=None):
        print(qtz.current_qconfig())
        qgraph = qtz.annotate(qgraph)
        print('after annotate')
        print(qgraph.astext(show_meta_data=False))
        qgraph = qtz.calibrate(qgraph)
        print('after calibrate\n')
        print(qgraph.astext(show_meta_data=False))
        if not args.simulated:
            qgraph = qtz.realize(qgraph)
            qgraph = relay.ir_pass.infer_type(qgraph)
            print('after realize\n')
            print(qgraph.astext(show_meta_data=False))

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(qgraph, target)
    ctx = tvm.nd.context(target, 0)
    return graph, lib, params, ctx
def test_quantize_acc(cfg, rec_val):
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    model = get_model(cfg.model, 32, qconfig, tvm.target.cuda())
    val_data, batch_fn = get_val_data(cfg.model,
                                      rec_val=rec_val,
                                      batch_size=32)

    acc = eval_acc(model, val_data, batch_fn)
    assert acc > cfg.expected_acc
    return acc
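# eval_acc, get_model, and get_val_data come from the surrounding test file
# and are not shown. This sketch is modeled on TVM's nightly quantization
# accuracy tests (the mxnet metrics and batch_fn contract are assumptions):
# compile the model, stream validation batches through the graph runtime,
# and accumulate top-1 accuracy.
def eval_acc_sketch(model, dataset, batch_fn, target=tvm.target.cuda(),
                    ctx=tvm.gpu(), log_interval=100):
    import mxnet as mx
    from tvm.contrib import graph_runtime
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(model, target)
    m = graph_runtime.create(graph, lib, ctx)
    m.set_input(**params)
    acc_top1 = mx.metric.Accuracy()
    dataset.reset()
    for i, batch in enumerate(dataset):
        data, label = batch_fn(batch, [mx.cpu(0)])
        m.set_input("data", data[0].asnumpy())
        m.run()
        out = mx.nd.array(m.get_output(0).asnumpy())
        acc_top1.update(label, [out])
        if not (i + 1) % log_interval:
            print("[%d batches] top1 = %f" % (i + 1, acc_top1.get()[1]))
    return acc_top1.get()[1]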
if __name__ == '__main__':

    tf_Inception_v1_path = '/home/terse/code/programming/blog/TVM_quantization/tf/InceptionV1/classify_image_graph_def-with_shapes.pb'
    mod, params, input_shape = get_tf_model_InceptionV1(tf_Inception_v1_path)

    logging.info(mod['main'].astext(show_meta_data=False))

    ctx = tvm.cpu()
    target = 'llvm -mcpu=core-avx2'

    # Configure the quantization behavior
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=8,
                          nbit_weight=8,
                          global_scale=8.0,
                          dtype_input='int8',
                          dtype_weight='int8',
                          dtype_activation='int8',
                          debug_enabled_ops=None)

    # mod['main'] = qtz.prerequisite_optimize(mod['main'],params=params)
    # logging.info(mod['main'].astext(show_meta_data=False))

    mod = quantize_relay_module(mod, params, qconfig)

    # autotvm_tune(mod['main'], params, target)

    # graph,lib,params = build_module(mod, params, target,'tuning_inceptv1.log')
    graph, lib, params = build_module(mod, params, target)

    save_compiled_module(graph, lib, params, "model_inception")
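# build_module and save_compiled_module are not shown in this snippet; a
# hedged sketch of both, with signatures inferred from the calls above:
# compile the quantized module (optionally under autotvm tuning records)
# and persist the three build artifacts.
def build_module_sketch(mod, params, target, log_file=None):
    from tvm import autotvm, relay
    if log_file:
        with autotvm.apply_history_best(log_file):
            with relay.build_config(opt_level=3):
                return relay.build(mod, target=target, params=params)
    with relay.build_config(opt_level=3):
        return relay.build(mod, target=target, params=params)

def save_compiled_module_sketch(graph, lib, params, prefix):
    from tvm import relay
    lib.export_library(prefix + "_lib.tar")
    with open(prefix + "_graph.json", "w") as f:
        f.write(graph)
    with open(prefix + "_params.params", "wb") as f:
        f.write(relay.save_param_dict(params))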
shape = {'data': data.shape}

dtype_dict = {}

# convert nnvm to relay
print("convert nnvm symbols into relay function...")
from nnvm.to_relay import to_relay
func, params = to_relay(sym, shape, 'float32', params=params)
# optimization
print("optimize relay graph...")
with tvm.relay.build_config(opt_level=2):
    func = tvm.relay.optimize(func, target, params)
# quantize
print("apply quantization...")
from tvm.relay import quantize
with quantize.qconfig():
    func = quantize.quantize(func, params)

# Relay build
print("Compiling the model...")
print(func.astext(show_meta_data=False))
with tvm.relay.build_config(opt_level=3):
    graph, lib, params = tvm.relay.build(func, target=target, params=params)

# Save the model
tmp = util.tempdir()
lib_fname = tmp.relpath('model.tar')
lib.export_library(lib_fname)

# NNVM
# with nnvm.compiler.build_config(opt_level=2):
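# Hedged follow-up (not in the original snippet): load the exported .tar back
# and run one inference. It reuses `graph`, `params`, and the input array
# `data` from above; tvm.module.load matches the old-style API used in these
# examples.
from tvm.contrib import graph_runtime
loaded_lib = tvm.module.load(lib_fname)
rt = graph_runtime.create(graph, loaded_lib, tvm.cpu())
rt.set_input(**params)
rt.set_input('data', tvm.nd.array(data.astype('float32')))
rt.run()
print("output shape:", rt.get_output(0).asnumpy().shape)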
Example #9
def quantize_model(args):
    """Build with relay."""
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz
    img_size = 224
    data_shape = (args.batch_size, 3, img_size, img_size)
    mx_sym, mx_args, mx_auxs = mx.model.load_checkpoint(args.model, 0)
    net, params = relay.frontend.from_mxnet(mx_sym, {"data": data_shape},
                                            arg_params=mx_args,
                                            aux_params=mx_auxs)
    target = args.target

    if args.original:
        # run original model
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        ctx = tvm.nd.context(target, 0)
        return graph, lib, params, ctx

    # constant folding and scale folding.
    # print('original')
    # print(net.astext(show_meta_data=False))
    with relay.build_config(opt_level=3):
        qgraph = relay.optimize(net, target, params)
    # print('after optimize')
    # print(qgraph.astext(show_meta_data=False))

    with qtz.qconfig(skip_k_conv=0,
                     nbit_input=args.nbit_input,
                     nbit_weight=args.nbit_input,
                     global_scale=args.global_scale,
                     dtype_input=args.dtype_input,
                     dtype_weight=args.dtype_input,
                     dtype_activation=args.dtype_output,
                     store_lowbit_output=False,
                     debug_enabled_ops=None):
        print(qtz.current_qconfig())
        qgraph = qtz.annotate(qgraph)
        # print('after annotate')
        # print(qgraph.astext(show_meta_data=False))
        qgraph = qtz.calibrate(qgraph)
        # print('after calibrate\n')
        # print(qgraph.astext(show_meta_data=False))
        if not args.simulated:
            qgraph = qtz.realize(qgraph)
            qgraph = relay.ir_pass.infer_type(qgraph)
            # print('after realize\n')
            # print(qgraph.astext(show_meta_data=False))

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(qgraph, target)

    ### save/load the graph, lib and params into separate files
    # save
    lib.export_library(os.path.join(thisdir, "deploy_lib.so"))
    with open(os.path.join(thisdir, "deploy_graph.json"), "w") as fo:
        fo.write(graph)
    with open(os.path.join(thisdir, "deploy_param.params"), "wb") as fo:
        fo.write(relay.save_param_dict(params))
    # load
    graph = open(os.path.join(thisdir, "deploy_graph.json")).read()
    lib = tvm.module.load(os.path.join(thisdir, "deploy_lib.so"))
    params = bytearray(
        open(os.path.join(thisdir, "deploy_param.params"), "rb").read())

    ctx = tvm.nd.context(target, 0)
    return graph, lib, params, ctx
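# Hedged usage sketch (not from the original source): run the returned
# artifacts with the graph runtime. On the quantized path quantize_model
# returns `params` as a serialized bytearray, which load_params consumes
# directly; the `args` fields mirror the snippet above.
def run_quantized_sketch(args):
    import numpy as np
    import tvm
    from tvm.contrib import graph_runtime
    graph, lib, params, ctx = quantize_model(args)
    m = graph_runtime.create(graph, lib, ctx)
    m.load_params(params)
    m.set_input("data", tvm.nd.array(np.random.uniform(
        size=(args.batch_size, 3, 224, 224)).astype("float32")))
    m.run()
    print("top-1 class:", np.argmax(m.get_output(0).asnumpy()[0]))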