Example #1
])

with tvm.transform.PassContext(opt_level=3):
    mod = seq(mod)

tvm_target = get_tvm_target(device, get_device_type(), get_device_arch(),
                            get_device_attributes())

tvm_targets = tvm.target.Target(tvm_target)
cpu_target = "llvm"
target_host = cpu_target

cpudevice = tvm.runtime.cpu()

if logfile is not None:
    with autotvm.apply_history_best(logfile):
        with tvm.transform.PassContext(opt_level=3):
            graph_mod = relay.build(mod,
                                    tvm_targets,
                                    params=params,
                                    target_host=target_host)
else:
    with tvm.transform.PassContext(opt_level=3):
        graph_mod = relay.build(mod,
                                tvm_targets,
                                params=params,
                                target_host=target_host)

lib = graph_mod.get_lib()
params = graph_mod.get_params()
graph = graph_mod.get_json()
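
Note: this snippet is truncated at the top; the opening `])` closes a pass sequence whose contents were cut off. A minimal sketch of what such a prologue might look like (the passes chosen here are illustrative assumptions, not the original ones):

# hypothetical reconstruction of the truncated prologue: a relay pass
# sequence that the stray "])" above would close
seq = tvm.transform.Sequential([
    relay.transform.RemoveUnusedFunctions(),
    relay.transform.FoldConstant(),
])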
Example #2
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape={'data': data_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': data_shape}, params=params, dtype=dtype)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
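
`tune_kernels` is not shown in this snippet. A sketch of the helper as it appears in the autotvm x86 tutorials of the same era (assumed here, not taken from this project):

from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner

def tune_kernels(tasks, measure_option, tuner='gridsearch',
                 early_stopping=None, log_filename='tuning.log'):
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create a tuner of the requested kind
        if tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # run the search, logging every measured config
        n_trial = len(task.config_space)
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(log_filename)])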
Example #3
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
                                            shape={'data': input_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, target_host=target_host,
                shape={'data': input_shape}, params=params, dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number==1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
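
This example tunes over RPC, so it assumes a tracker is already serving `device_key` on port 9190. A sketch of a matching `tuning_opt` dictionary, modeled on the ARM CPU tutorial of the same era (values are illustrative assumptions):

# start the tracker separately, e.g.:
#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
tuning_opt = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 400,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(device_key, host='localhost', port=9190,
                                 number=5, timeout=10),
    ),
}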
Example #4
File: tvm_conv.py Project: shinh/test
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    print(net)
    input_name = 'Input_0' if network == 'onnx' else 'data'
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape={input_name: input_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    #with autotvm.apply_history_best('onnx.log'):
    with autotvm.apply_history_best('gtx-1060.log'):
    #if True:
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={input_name: input_shape}, params=params, dtype=dtype)

    # export library
    # tmp = tempdir()
    # filename = "net.tar"
    # lib.export_library(tmp.relpath(filename))

    # load parameters
    ctx = tvm.context('cuda', 0)
    module = runtime.create(graph, lib, ctx)
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input(input_name, data_tvm)
    module.set_input(**params)

    # evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
Example #5
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(net, target=target,
                                              params=params, ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #6
def compile_model(
    path,
    target,
    dump_code=None,
    target_host=None,
    model_format=None,
    tuning_records=None,
    alter_layout=None,
    shape_dict=None,
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph runtime.

    Parameters
    ----------
    path: str
        Path to a file
    target : str
        The target for which to compile. Can be a plain string or
        a path.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code
        needs to be generated.
    model_format: str, optional
        A string representing a name of a frontend to be used
    tuning_records: str, optional
        Path to the file produced by the tuning to be used during
        compilation.
    alter_layout: str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    shape_dict: dict, optional
        A mapping from input names to their shape. When present,
        the default shapes in the model will be overwritten.

    Returns
    -------
    graph : str
        A JSON-serialized TVM execution graph.
    lib : tvm.module.Module
        A TVM module containing the compiled functions.
    params : dict
        The parameters (weights) for the TVM module.
    dumps : dict
        Dictionary containing the dumps specified.

    """
    dump_code = [x.strip()
                 for x in dump_code.split(",")] if dump_code else None
    mod, params = frontends.load_model(path, model_format, shape_dict)
    config = {}

    if alter_layout:
        mod = common.convert_graph_layout(mod, alter_layout)

    tvm_target, extra_targets = common.target_from_cli(target)
    target_host = tvm_target if not target_host else target_host

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params)
        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(opt_level=3, config=config):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = relay.build(mod,
                                               target=tvm_target,
                                               params=params,
                                               target_host=target_host)
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=3, config=config):
                    logger.debug("building relay graph with tuning records")
                    graph_module = relay.build(mod,
                                               tvm_target,
                                               params=params,
                                               target_host=target_host)
    else:
        with tvm.transform.PassContext(opt_level=3, config=config):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = relay.build(mod,
                                       tvm_target,
                                       params=params,
                                       target_host=target_host)

    # Generate output dump files with sources
    dump_code = dump_code or []
    dumps = {}
    for source_type in dump_code:
        lib = graph_module.get_lib()
        # TODO lib.get_source calls have inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(
            source_type)
        dumps[source_type] = source

    # TODO we need to update this return to use the updated graph module APIs
    #      as these getter functions will be deprecated in the next release (@leandron)
    return (graph_module.get_json(), graph_module.get_lib(),
            graph_module.get_params(), dumps)
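
A hedged usage sketch for this older interface, persisting the three returned artifacts (the model path and output file names are hypothetical):

graph, lib, params, dumps = compile_model("my_model.onnx", "llvm")
lib.export_library("model.tar")
with open("model.json", "w") as f:
    f.write(graph)
with open("model.params", "wb") as f:
    f.write(relay.save_param_dict(params))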
Example #7
# In practice, running 1000 trials can usually find some good kernels
# for this template.

# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# the last layer in resnet
task = autotvm.task.create(conv2d_no_batching,
                           args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
                           target='cuda')
print(task.config_space)

# use the local gpu, measure 10 times for every config to reduce variance
# run 8 parallel threads for compilation
measure_option = autotvm.measure_option(mode='local',
                                        number=10,
                                        parallel_num=8,
                                        timeout=20)

# begin tuning, log records to file `cache.tsv`
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])

# get best config from cache file
dispatch_context = autotvm.apply_history_best("cache.tsv")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)
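
Following the same-era CUDA conv2d tutorial, the best record can then be applied to rebuild the kernel; a sketch reusing the task arguments and log file from above:

# apply the best record and build the tuned kernel
with autotvm.apply_history_best('cache.tsv'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1))
        func = tvm.build(s, arg_bufs)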
Example #8
def tune_and_evaluate(tuning_opt, layer_name='qconv_2', input_layout='nchw'):
    # extract workloads from relay program
    global output_file
    print("Extract tasks...")
    if input_layout == 'nchw':
        mod, params, input_shape = models.get_bitserial_conv2d_nchw(models.vgg16, layer_name,
                                    activation_bits=activation_bits, weight_bits=weight_bits)
    else:
        mod, params, input_shape = models.get_bitserial_conv2d_nhwc(models.vgg16, layer_name,
                                   activation_bits=activation_bits, weight_bits=weight_bits)

    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params,
                                              ops=(relay.op.get("nn.bitserial_conv2d"),))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    log_file = tuning_opt['log_filename']
    print('Extract the best from %s' % log_file)
    specific_layer = log_file.split('.')[0]
    
    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.ones(input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
        
        output_file.write(specific_layer + ',' + str(np.mean(prof_res)) + ',' + str(np.std(prof_res)) + '\n')
Example #9
        last_layer = net.layers[net.n - 1]
        tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes,
                                                     nms_thresh)

        tic = time.time()
        # tvm.relay.testing.yolo_detection.draw_detections(
        #     font_path, img, dets, thresh, names, last_layer.classes)
        img = img.transpose(1, 2, 0)
        img = darwBbox(dets, img, thresh, names)
        img = np.flip(img, 2)
        tac = time.time()
        cv2.imshow('DarkNet', img)
        print(tac - tic, time.time() - tac)
        res, frame = cap.read()
        cv2.waitKey(1)
        cnt += 1
        if cnt % steps == 0:
            end = time.time()
            print(steps * 1. / (end - start))
            start = end
    cv2.destroyAllWindows()
    cap.release()


if __name__ == '__main__':
    if LOG_FILE is None:
        show()
    else:
        with autotvm.apply_history_best(LOG_FILE):
            show()
Example #10
def test_conv2d_nchw():
    # load tophub
    ctx = autotvm.apply_history_best([])
    for device in get_all_backend():
        context = autotvm.tophub.context(device)
        context.__enter__()

    # ResNet18 workloads
    verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
    verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
    verify_conv2d_nchw(1,  64,  56,  64, 1, 1, 0)
    verify_conv2d_nchw(1,  64,  56, 128, 3, 2, 1)
    verify_conv2d_nchw(1,  64,  56, 128, 1, 2, 0)
    verify_conv2d_nchw(1, 128,  28, 128, 3, 1, 1)
    verify_conv2d_nchw(1, 128,  28, 256, 3, 2, 1)
    verify_conv2d_nchw(1, 128,  28, 256, 1, 2, 0)
    verify_conv2d_nchw(1, 256,  14, 256, 3, 1, 1)
    verify_conv2d_nchw(1, 256,  14, 512, 3, 2, 1)
    verify_conv2d_nchw(1, 256,  14, 512, 1, 2, 0)
    verify_conv2d_nchw(1, 512,   7, 512, 3, 1, 1)

    # bias, relu
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)

    # dilation = 2
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)

    # batch size
    verify_conv2d_nchw(4, 64, 56, 64, 3, 1, 1)
    verify_conv2d_nchw(9, 64, 56, 64, 3, 1, 1)

    # weird workloads
    verify_conv2d_nchw(2, 2, 2, 2, 2, 2, 2)
    verify_conv2d_nchw(3, 3, 3, 3, 3, 3, 3)
    verify_conv2d_nchw(4, 4, 4, 4, 4, 4, 4)
    verify_conv2d_nchw(5, 5, 5, 5, 5, 5, 5)
    verify_conv2d_nchw(6, 6, 6, 6, 6, 6, 6)

    # disable these tests due to some bugs of llvm with nvptx
    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
    # verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)

    # inception v3 workloads
    verify_conv2d_nchw(1,    3, 299,  32, 3, 2, 0)
    verify_conv2d_nchw(1,   32, 149,  32, 3, 1, 0)
    verify_conv2d_nchw(1,   32, 147,  64, 3, 1, 1)
    verify_conv2d_nchw(1,   64,  73,  80, 1, 1, 0)
    verify_conv2d_nchw(1,   80,  73, 192, 3, 1, 0)
    verify_conv2d_nchw(1,  192,  35,  64, 1, 1, 0)
    verify_conv2d_nchw(1,  192,  35,  48, 1, 1, 0)
    verify_conv2d_nchw(1,   48,  35,  64, 5, 1, 2)
    verify_conv2d_nchw(1,   64,  35,  96, 3, 1, 1)
    verify_conv2d_nchw(1,   96,  35,  96, 3, 1, 1)
    verify_conv2d_nchw(1,  192,  35,  32, 1, 1, 0)
    verify_conv2d_nchw(1,  256,  35,  64, 1, 1, 0)
    verify_conv2d_nchw(1,  256,  35,  48, 1, 1, 0)
    verify_conv2d_nchw(1,  288,  35,  64, 1, 1, 0)
    verify_conv2d_nchw(1,  288,  35,  48, 1, 1, 0)
    verify_conv2d_nchw(1,  288,  35, 384, 3, 2, 0)
    verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
    verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
    verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
    verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
    verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
    verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
    verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
    verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
    verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
    verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
    verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
    verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
    verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
    verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
    verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
    verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
    verify_conv2d_nchw(1, 1280,   8, 320, 1, 1, 0)
    verify_conv2d_nchw(1, 1280,   8, 384, 1, 1, 0)
    verify_conv2d_nchw(1,  384,   8, 384, 1, 1, 0)
    verify_conv2d_nchw(1,  384,   8, 384, 3, 1, 1)
    verify_conv2d_nchw(1, 1280,   8, 448, 1, 1, 0)
    verify_conv2d_nchw(1,  448,   8, 384, 3, 1, 1)
    verify_conv2d_nchw(1, 1280,   8, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 2048,   8, 320, 1, 1, 0)
    verify_conv2d_nchw(1, 2048,   8, 384, 1, 1, 0)
    verify_conv2d_nchw(1, 2048,   8, 448, 1, 1, 0)
    verify_conv2d_nchw(1, 2048,   8, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 1024,  19,  84, 3, 1, 1)
    verify_conv2d_nchw(1, 2048,  10, 126, 3, 1, 1)
    verify_conv2d_nchw(1,  512,   5, 126, 3, 1, 1)
    verify_conv2d_nchw(1,  256,   3, 126, 3, 1, 1)
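
For reference, in the TOPI test suite of this era `verify_conv2d_nchw` typically takes positional arguments (batch, in_channel, in_size, num_filter, kernel, stride, padding), so the first call above, `verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)`, is ResNet-18's 7x7, stride-2, pad-3 stem convolution.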
Example #11
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    log_filename,
    measure_option,
    tuner,
    early_stopping,
):
    #N, H, W, CO, CI, KH, KW, strides, padding, dilation = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1,1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    # data = deserialize_args( ('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')) )
    # kernel = deserialize_args(('TENSOR',tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')) )
    origin_layout = 'NCHW'
    func_create = 'topi_x86_conv2d_NCHW_test'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2',
                               template_key='direct')
    #task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    tuner.tune(n_trial=trials,
               measure_option=measure_option,
               callbacks=[
                   autotvm.callback.progress_bar(trials),
                   autotvm.callback.log_to_file(log_filename)
               ])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=core-avx2"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s, arg_bufs, "llvm -mcpu=core-avx2", name="fconv")
            print("arg_bufs 0", arg_bufs[0])
            print("arg_bufs 1", arg_bufs[1])
            print("arg_bufs 2", arg_bufs[2])
            # print(func.get_source())

            dump = "%s.ll" % log_filename
            f = open(dump, "a")
            f.write(func.get_source())
            f.close()
    '''    
Example #12
def run(name, N, H, W, factor, CI, KH, KW, strides, padding, dilation):
    # s, arg_bufs = depthwise_conv2d_nchw(N, H, W, factor, CI, KH, KW, strides, padding, dilation)
    task = autotvm.task.create(depthwise_conv2d_nchw,
                               args=(N, H, W, factor, CI, KH, KW, strides,
                                     padding, dilation),
                               target='cuda')
    print(task.config_space)
    logfile = "depthwise_" + name + ".log"

    # Use the local gpu; repeat each measurement 3 times (min_repeat_ms=100) to reduce variance.
    # The timeout for running a program is 4 seconds; compilation uses the LocalBuilder default (10 seconds).
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                            runner=autotvm.LocalRunner(
                                                repeat=3,
                                                min_repeat_ms=100,
                                                timeout=4))

    # Begin tuning, log records to the `depthwise_<name>.log` file built above
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(logfile)])

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create("cuda"):
            s, arg_bufs = depthwise_conv2d_nchw(N, H, W, factor, CI, KH, KW,
                                                strides, padding, dilation)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CI, factor, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    # note: this output-shape arithmetic assumes scalar `strides`/`padding`
    c_tvm = tvm.nd.empty((N, factor * CI, (H + 2 * padding - KH) // strides + 1,
                          (W + 2 * padding - KW) // strides + 1),
                         ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)

    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
    # and the overhead of kernel launch. You can also use nvprof to validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    print('Time cost of this operator: %f' % cost)
    with open("autotvm_conv_nchw.txt", "a") as f:
        f.write("name, {}\n".format(cost))
Example #13
File: yolo.py Project: bytann/tvm-yolov3
    def __init__(self, config):
        cfg_path = config['cfg_path']
        weights_path = config['weights_path']
        device_type = config['device_type']
        autotune = config['autotune']
        log_file = config['log_file']
        self.thresh = config['thresh']
        self.nms_thresh = config['nms_thresh']

        DARKNET_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/lib/libdarknet2.0.so?raw=true'
        lib_path = download_testdata(DARKNET_URL,
                                     'libdarknet2.0.so',
                                     module="darknet")
        DARKNET_LIB = __darknetffi__.dlopen(lib_path)
        self.net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'),
                                            weights_path.encode('utf-8'), 0)

        dtype = 'float32'
        data = np.empty([1, self.net.c, self.net.h, self.net.w], dtype)
        self.shape = {'data': data.shape}

        # convert darknet to relay functions
        mod, params = relay.frontend.from_darknet(self.net,
                                                  dtype=dtype,
                                                  shape=data.shape)

        # select the target and device, then build the graph
        if device_type == 'cpu':
            target = 'llvm'
            ctx = tvm.cpu(0)
        elif device_type == 'cuda-cudnn':
            target = 'cuda -libs=cudnn'
            ctx = tvm.gpu()
        elif device_type == 'cuda':
            target = tvm.target.cuda()
            ctx = tvm.gpu()
        else:
            err = "Device type is not supported on this platform."
            raise NotImplementedError(err)

        if autotune:
            if not os.path.isfile(log_file):
                raise FileNotFoundError("Autotvm log file does not exist.")
            with autotvm.apply_history_best(log_file):
                with relay.build_config(opt_level=3):
                    graph, lib, self.params = relay.build_module.build(
                        mod, target=target, params=params)
        else:
            with relay.build_config(opt_level=3):
                graph, lib, self.params = relay.build_module.build(
                    mod, target=target, params=params)

        self.m = graph_runtime.create(graph, lib, ctx)
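
The constructor only builds `self.m` and stores `self.params`. A hedged sketch of the companion inference method such a class would typically have (method names follow the TVM graph runtime API; preprocessing is elided):

    def run_frame(self, data):
        # `data` is assumed to be a preprocessed float32 NCHW batch
        self.m.set_input('data', tvm.nd.array(data.astype('float32')))
        self.m.set_input(**self.params)
        self.m.run()
        return [self.m.get_output(i) for i in range(self.m.get_num_outputs())]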
Example #14
def tune_kernels(args, M, N, P, K, trials,
                 measure_option, tuner, early_stopping,):

    feature_type = args.feature
    print('Feature:', feature_type)

    count = args.num_iters
    likwid_event = args.likwid_event
    random = args.random
    sa_n_iter = args.sa_num_iters
    save_features = not (args.no_save_features)

    task = autotvm.task.create("template/tc", args=(M,N,P,K,tc_index,'float32'), target='llvm -mcpu=core-avx2')
    print(task.config_space)

    trials = min(trials, len(task.config_space))

    for i in range(count): 
        if args.key_id is not None and count == 1:
            save_ind = int(args.key_id)
        else:
            save_ind = i
        if random:
            log_filename = 'tc%i_%i_%i_%s_%icore_rand.log' % (tc_index, N, save_ind, feature_type, num_threads)
        else:
            log_filename = 'tc%i_%i_%i_%s_%icore.log' % (tc_index, N, save_ind, feature_type, num_threads)

        if likwid_event is not None:
            if random:
                pickle_file = 'data/tc/likwid_rand_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (tc_index, N, feature_type, num_threads, trials,  save_ind)
            else:
                pickle_file = 'data/tc/likwid_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (tc_index, N, feature_type, num_threads, trials,  save_ind)
        else:
            if random:
                pickle_file = 'data/tc/rand_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (tc_index, N, feature_type, num_threads, trials,  save_ind)
            else:
                pickle_file = 'data/tc/tc%i_%i_new_%s_features_%icore_%i_%i.pkl' % (tc_index, N, feature_type, num_threads, trials,  save_ind)
        if os.path.exists(pickle_file):
            print('File exists', pickle_file)
            continue

        tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type, loss_type='rank',
                plan_size=80, sa_n_iter=sa_n_iter)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)],
                   likwid_event=likwid_event, save_features=save_features, random=random)

        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # print(tvm.lower(s, arg_bufs, simple_mode=True))
        if save_features:
            with open(pickle_file , 'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features], output, pickle.HIGHEST_PROTOCOL)
        try:
            os.remove(log_filename)
        except OSError:
            pass
Example #15
File: compiler.py Project: mvermeulen/tvm
def compile_model(
    tvmc_model: TVMCModel,
    target: str,
    opt_level: int = 3,
    executor: Optional[Executor] = Executor("graph"),
    runtime: Optional[Runtime] = Runtime("cpp"),
    tuning_records: Optional[str] = None,
    package_path: Optional[str] = None,
    cross: Optional[Union[str, Callable]] = None,
    cross_options: Optional[str] = None,
    output_format: str = "so",
    dump_code: Optional[List[str]] = None,
    target_host: Optional[str] = None,
    desired_layout: Optional[str] = None,
    disabled_pass: Optional[str] = None,
    pass_context_configs: Optional[List[str]] = None,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
    use_vm: bool = False,
    mod_name: Optional[str] = "default",
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph executor.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model object that should be compiled.
    target : str
        The target for which to compile. Can be a plain string or
        a path.
    opt_level : int
        The option that controls various sorts of optimizations.
    tuning_records : str
        A path to tuning records produced using tvmc.tune. When provided,
        compilation will use more optimized kernels leading to better results.
    package_path : str, optional
        The path to export the compiled model to. If not provided it will
        be saved in a temporary directory.
    cross : str or callable object, optional
        Function that performs the actual compilation
    cross_options : str, optional
        Command line options to be passed to the cross compiler.
    output_format : str
        What format to use when saving the function library. Must be one of "so" or "tar".
        When compiling for a remote device without a cross compiler, "tar" will likely work better.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code
        needs to be generated.
    desired_layout: str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    disabled_pass: str, optional
        Comma-separated list of passes which needs to be disabled
        during compilation
    pass_context_configs: list[str], optional
        List of strings containing a set of configurations to be passed to the
        PassContext.
    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments
    use_vm: bool
        Whether to use the VM to compile the model as opposed to the graph executor
    mod_name: str, optional
        The module name

    Returns
    -------
    compiled_model : TVMCPackage
        The compiled TVMCModel ready to be run.

    """
    mod, params = tvmc_model.mod, tvmc_model.params

    config = parse_configs(pass_context_configs)

    if desired_layout:
        mod = convert_graph_layout(mod, desired_layout)

    tvm_target, extra_targets = target_from_cli(target,
                                                additional_target_options)
    tvm_target, target_host = Target.check_and_update_host_consist(
        tvm_target, target_host)

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]

        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]
        with tvm.transform.PassContext(config=config):
            mod = partition_function(mod,
                                     params,
                                     mod_name=mod_name,
                                     **codegen_from_cli["opts"])

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(opt_level=opt_level,
                                               config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=opt_level,
                                               config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with tuning records")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
    else:
        with tvm.transform.PassContext(opt_level=opt_level,
                                       config=config,
                                       disabled_pass=disabled_pass):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = build(
                mod,
                tvm_target=tvm_target,
                executor=executor,
                runtime=runtime,
                params=params,
                use_vm=use_vm,
                mod_name=mod_name,
            )

    # Generate output dump files with sources
    if dump_code is None:
        dump_code = []
    if not isinstance(dump_code, list):
        dump_code = [dump_code]
    dumps = {}
    for source_type in dump_code:
        if use_vm:
            lib = graph_module.lib
        else:
            lib = graph_module.get_lib()
        # TODO lib.get_source calls have inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(
            source_type)
        dumps[source_type] = source

    # Create a new tvmc model package object from the graph definition.
    package_path = tvmc_model.export_package(graph_module, package_path, cross,
                                             cross_options, output_format)

    # Write dumps to file.
    if dumps:
        save_dumps(package_path, dumps)

    return TVMCPackage(package_path)
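
For comparison, the public TVMC Python API wraps this function; a hedged usage sketch (the model path is hypothetical; `tvmc.load`/`tvmc.compile`/`tvmc.run` follow the TVM 0.8+ driver interface):

from tvm.driver import tvmc

model = tvmc.load("my_model.onnx")            # -> TVMCModel
package = tvmc.compile(model, target="llvm")  # -> TVMCPackage
result = tvmc.run(package, device="cpu")      # -> TVMCResult
print(result)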
Example #16
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    key,
    measure_option,
    tuner,
    early_stopping,
):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    origin_layout = 'NCHW'

    if len(sys.argv) > 2:
        feature_type = sys.argv[2]
    else:
        #feature_type = 'datavol'
        feature_type = 'itervar'
        #feature_type = 'datavol_itervar'
    print('Feature:', feature_type)

    if len(sys.argv) > 3:
        if 'small' == sys.argv[3]:
            func_create = 'conv2d_NCHW_small.x86'
        elif 'wide' == sys.argv[3]:
            func_create = 'conv2d_NCHW_wide.x86'
        else:
            func_create = 'conv2d_NCHWc.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    if len(sys.argv) > 4:
        count = int(sys.argv[4])
    else:
        count = 1

    if len(sys.argv) > 5:
        likwid_event = sys.argv[5]
    else:
        likwid_event = None

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = True
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        log_filename = '%s_%i_%s_%s_%icore_rand.log' % (
            key, i, feature_type, sys.argv[3], num_threads)
        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=32)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event)
        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("llvm -mcpu=core-avx2"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], KH, KW)).transpose(
                     (0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape,
                                        c_tvm.asnumpy(),
                                        rtol=1e-2)
        except AssertionError:
            print('WARNING: Not equal!')
        evaluator = func.time_evaluator(func.entry_name,
                                        ctx,
                                        repeat=3,
                                        number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))
        os.remove(log_filename)

    print(tvm.lower(s, arg_bufs, simple_mode=True))
    if likwid_event is not None:
        with open(
                'data/likwid_rand_%s_%s_features_%icore_%i_%s.pkl' %
            (key, feature_type, num_threads, trials, sys.argv[3]),
                'wb') as output:
            pickle.dump([best_config, task, tuner.cost_model.saved_features],
                        output, pickle.HIGHEST_PROTOCOL)
    else:
        with open(
                'data/%s_%s_features_%icore_%i_%s.pkl' %
            (key, feature_type, num_threads, trials, sys.argv[3]),
                'wb') as output:
            pickle.dump([best_config, task, tuner.cost_model.saved_features],
                        output, pickle.HIGHEST_PROTOCOL)
Example #17
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        task = autotvm.task.create(schedule_conv2d_nhwc_auto,
                                   args=(batch, in_channel, in_size,
                                         num_filter, kernel, stride),
                                   target="cuda")
        print(task.config_space)

        # logging config (for printing tuning log to the screen)
        logging.getLogger('autotvm').setLevel(logging.DEBUG)
        logging.getLogger('autotvm').addHandler(
            logging.StreamHandler(sys.stdout))

        # There are two steps for measuring a config: build and run.
        # By default, we use all CPU cores to compile the program, then measure candidates sequentially.
        # We measure 10 times and take the average to reduce variance.
        measure_option = autotvm.measure_option(
            builder='local', runner=autotvm.LocalRunner(number=10))

        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(n_trial=25,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.log_to_file(
                           'conv2d_nhwc_{}.log'.format(in_size))
                   ])

        with autotvm.apply_history_best('conv2d_nhwc_{}.log'.format(in_size)):
            with tvm.target.create(device):
                s, [A, W,
                    B] = schedule_conv2d_nhwc_auto(batch, in_channel, in_size,
                                                   num_filter, kernel, stride)
                func = tvm.build(s, [A, W, B],
                                 device,
                                 name=("ddd%dddd" % in_size))

        @memoize("verify_nhwc")
        def get_ref_data():
            a_np = np.random.uniform(size=a_shape).astype(dtype)
            w_np = np.random.uniform(size=w_shape).astype(dtype)
            b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding)
            return a_np, w_np, b_np

        a_np, w_np, b_np = get_ref_data()

        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(b_np.shape), dtype=dtype),
                         ctx)

        func(a, w, b)
        timer_1 = func.time_evaluator(func.entry_name, ctx, number=10)
        tcost_1 = timer_1(a, w, b).mean
        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        print("1x1 convolution: average running time is {:.2f} us.".format(
            tcost_1 * 1e6))
Example #18
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)

        task = autotvm.task.create(schedule_depthwise_conv2d_nhwc_reuse_auto,
                                   args=(batch, in_channel, in_size,
                                         channel_multiplier, kernel, stride),
                                   target="cuda")
        print(task)
        print(task.config_space)

        # logging config (for printing tuning log to the screen)
        logging.getLogger('autotvm').setLevel(logging.DEBUG)
        logging.getLogger('autotvm').addHandler(
            logging.StreamHandler(sys.stdout))

        # There are two steps for measuring a config: build and run.
        # By default, we use all CPU cores to compile the program, then measure candidates sequentially.
        # We measure 10 times and take the average to reduce variance.
        measure_option = autotvm.measure_option(
            builder='local', runner=autotvm.LocalRunner(number=10))

        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(n_trial=25,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.log_to_file(
                           'depthwise_conv2d_nhwc_{}.log'.format(in_size))
                   ])

        with autotvm.apply_history_best(
                'depthwise_conv2d_nhwc_{}.log'.format(in_size)):
            with tvm.target.create(device):
                s1, [Input, Filter, DepthwiseConv2d
                     ] = schedule_depthwise_conv2d_nhwc_reuse_auto(
                         batch, in_channel, in_size, channel_multiplier,
                         kernel, stride)
                # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu)
                # build the kernels
                f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d],
                               device,
                               name="ddd%dddd" % in_size)
                # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
                # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Prepare pod type for test data closure
        dtype = Input.dtype
        input_shape = get_const_tuple(Input.shape)
        filter_shape = get_const_tuple(Filter.shape)
        # scale_shape = get_const_tuple(Scale.shape)
        # shift_shape = get_const_tuple(Shift.shape)
        # scale_shift_shape = get_const_tuple(ScaleShift.shape)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            # scale_np = np.random.uniform(size=scale_shape).astype(dtype)
            # shift_np = np.random.uniform(size=shift_shape).astype(dtype)
            # correctness with scipy
            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc(
                input_np,
                filter_np,
                stride=[stride_h, stride_w],
                padding=padding)
            # scale_shift_scipy = np.zeros(shape=scale_shift_shape)
            # for c in range(in_channel * channel_multiplier):
            #     scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c]
            #     relu_scipy = np.maximum(scale_shift_scipy, 0)
            # return (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy)
            return (input_np, filter_np, depthwise_conv2d_scipy)

        # Get the test data
        # (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data()
        (input_np, filter_np, depthwise_conv2d_scipy) = get_ref_data()

        # prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        # scale_tvm = tvm.nd.array(scale_np, ctx)
        # shift_tvm = tvm.nd.array(shift_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx)
        # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=10)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # launch kernel 2 (depthwise_conv2d + scale_shift)
        # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10)
        # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean
        # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
        # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10)
        # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
        # relu_scipy = np.maximum(scale_shift_scipy, 0)
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   depthwise_conv2d_scipy,
                                   rtol=1e-5)
        # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
        # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print(
            "Depthwise convolution: average running time is {:.2f} us.".format(
                tcost_1 * 1e6))
Example #19
def main():
    with tvm.target.cuda():
        with autotvm.apply_history_best(args.log_file):
            for batch in [1, 16]:
                for name in ['vgg-19', 'resnet-50', 'resnext-50', 'inception_v3', 'drn-c-26', 'dcn-resnet-101']:
                    bench(name, batch)
Example #20
def tvm_generic(N,
                H,
                W,
                C,
                kernel_size,
                K,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                number=100,
                dev=0,
                timeout=4,
                target="llvm",
                trials=100):
    data_shape = (N, C, H, W)
    data = relay.var("data", shape=data_shape, dtype="float32")
    kernel_size = (kernel_size, kernel_size)
    stride = (stride, stride)
    padding = (padding, padding)
    body = layers.conv2d(data=data,
                         channels=K,
                         kernel_size=kernel_size,
                         strides=stride,
                         padding=padding,
                         name="conv2d")
    op = relay.Function(relay.ir_pass.free_vars(body), body)
    sym, params = create_workload(op)
    tasks = autotvm.task.extract_from_program(op,
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))
    tuning_option = {
        "log_filename": "tvm_baseline_{}.log".format(
            (N, C, H, W, K, kernel_size, stride, padding, dilation, groups)),
        "tuner": "xgb",
        "early_stopping": 30,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=timeout),
            runner=autotvm.LocalRunner(number=number,
                                       repeat=1,
                                       timeout=timeout,
                                       min_repeat_ms=150),
            # runner=autotvm.RPCRunner(
            #     '1080ti',  # change the device key to your key
            #     '0.0.0.0', 9190,
            #     number=20, repeat=3, timeout=4, min_repeat_ms=150)
        ),
    }
    log_filename = tuning_option["log_filename"]
    tuner = tuning_option["tuner"]
    early_stopping = tuning_option["early_stopping"]
    measure_option = tuning_option["measure_option"]

    # only support one task
    assert len(tasks) == 1

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = trials
        length = len(task.config_space)
        print("config space length=", length)
        # tuner_obj.tune(n_trial=min(n_trial, length),
        #                early_stopping=early_stopping,
        #                measure_option=measure_option,
        #                callbacks=[
        #                    autotvm.callback.progress_bar(n_trial, prefix=prefix),
        #                    autotvm.callback.log_to_file(log_filename)])

    if not os.path.exists(log_filename):
        raise RuntimeError(
            "the log file {} does not exist".format(log_filename))
    with autotvm.apply_history_best(log_filename):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(op,
                                                          target=target,
                                                          params=params)

        ctx = tvm.context(str(target), 0)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=data_shape)).astype("float32"))
        module = runtime.create(graph, lib, ctx)
        module.set_input("data", data_tvm)
        module.set_input(**params)

        # evaluate
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=number,
                                              repeat=1)
        prof_res = np.array(ftimer().results) * 1e3
        return prof_res
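#########################################################################
# A usage sketch for ``tvm_generic`` above. The layer shape is an illustrative
# assumption, and because the ``tuner_obj.tune`` call in the function body is
# commented out, this only succeeds once a matching ``tvm_baseline_*.log``
# already exists on disk.

prof_res = tvm_generic(1, 56, 56, 64, kernel_size=3, K=64,
                       stride=1, padding=1, target="llvm", trials=100)
print("conv2d mean latency: %.3f ms" % np.mean(prof_res))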
Example #21
measure_option = autotvm.measure_option(builder='local',
                                        runner=autotvm.LocalRunner(number=5))

# begin tuning, log records to file `matmul.log`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
Example #22
def run(name,
        N,
        H,
        W,
        CO,
        CI,
        KH,
        KW,
        stride,
        pad,
        dilation,
        trials=100,
        timeout=4,
        number=10,
        target="llvm",
        dev=0,
        tune=True):
    strides, padding = (stride, stride), (pad, pad)
    task = autotvm.task.create(conv2d_nchw,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding,
                                     dilation),
                               target=target)
    print("config_space length:", len(task.config_space))
    logfile = "conv2d_" + name + "_{}".format(
        (N, CI, H, W, CO, KH, KW, stride, pad, dilation)) + ".log"

    # Use local gpu, measure 10 times for every config to reduce variance
    # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                            runner=autotvm.LocalRunner(
                                                number=number,
                                                repeat=1,
                                                min_repeat_ms=150,
                                                timeout=timeout))

    # Begin tuning; log records go to the per-shape ``logfile`` defined above.
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    beg = time.time()
    print("Tune: ", tune)
    if tune:
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.log_to_file(logfile)])
    end = time.time()

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("Optimize use ", end - beg, "s")
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create(target):
            s, arg_bufs = conv2d_nchw(N, H, W, CO, CI, KH, KW, strides,
                                      padding, dilation)
            # print(tvm.lower(s, arg_bufs, simple_mode=True))
            # build once for CUDA to dump the generated kernel source, then
            # rebuild under the enclosing target context for the timing below
            func = tvm.build(s, arg_bufs, "cuda")
            print(func.imported_modules[0].get_source())
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.context(str(target), dev)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(
        (N, CO, (H + 2 * pad - dilation * (KH - 1) - 1) // stride + 1,
         (W + 2 * pad - dilation * (KW - 1) - 1) // stride + 1),
        ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)

    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
    # and the overhead of kernel launch. You can also use nvprof to validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=number)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    return cost
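#########################################################################
# A usage sketch for ``run`` above; the ResNet-style 3x3 layer is an
# illustrative assumption. Note the kernel is compiled for CUDA inside
# ``run``, so a local GPU is required.

cost = run("resnet_conv", 1, 14, 14, 256, 256, 3, 3,
           stride=1, pad=1, dilation=1, trials=100, target="cuda")
print("conv2d time: %.3f ms" % cost)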
Example #23
target = tvm.target.cuda()

size = 320

x, img = data.transforms.presets.ssd.load_test(im_fname, short=size)

board = None

device = 'gpu'

#log_filename = '{}-{}.{}.{}.log'.format(model, size, board, device)
log_filename = '{}.{}.{}.log'.format(model, board, device)


with autotvm.apply_history_best(log_filename):

    loaded_lib = tvm.module.load("lib/{}.tvm.so".format(model))

    loaded_json = open(("graph/{}.tvm.json".format(model))).read()
    # parameters in binary
    loaded_params = bytearray(open("params/{}.tvm.params".format(model), "rb").read())

    #nnvm.compiler.load_param_dict(loaded_params)

    fcreate = tvm.get_global_func("tvm.graph_runtime.create")

    ctx = tvm.gpu(0)
    
    #module = runtime.create(loaded_json, loaded_lib, ctx)
Example #24
measure_option = autotvm.measure_option(mode='local', number=5)

# begin tuning, log records to file `cache.tsv`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best('cache.tsv'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
def tune_and_evaluate(M, N, L, dtype, layout):
  task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm", args=(N, L, M, dtype, layout),
                             target='cuda')
  print(task.config_space)

  logging.getLogger('autotvm').setLevel(logging.DEBUG)
  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

  measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))

  tuner = autotvm.tuner.XGBTuner(task)
  tuner.tune(n_trial=1000,
             measure_option=measure_option,
             callbacks=[autotvm.callback.log_to_file('matmul.log')])

  dispatch_context = autotvm.apply_history_best("matmul.log")
  best_config = dispatch_context.query(task.target, task.workload)
  print("\nBest config:")
  print(best_config)
  with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create("cuda"):
      s, arg_bufs = test_gemm(N, L, M, dtype, layout)
      print(tvm.lower(s, arg_bufs, simple_mode=True))
      func = tvm.build(s, arg_bufs)
  dev_module = func.imported_modules[0]
  print(dev_module.get_source())

  # check correctness
  if (layout == "NN"):
    shape_a = (N, L)
    shape_b = (L, M)
  elif (layout == "NT"):
    shape_a = (L, N)
    shape_b = (L, M)
  elif (layout == "TN"):
    shape_a = (N, L)
    shape_b = (M, L)
  elif (layout == "TT"):
    shape_a = (L, N)
    shape_b = (M, L)

  a_np = None
  b_np = None
  c_np = None
  c_np_type = None
  if dtype == 'float16':
    c_np_type = np.float32
    a_np = np.random.uniform(size=shape_a).astype(np.float16)
    b_np = np.random.uniform(size=shape_b).astype(np.float16)
    if (layout == "NN"):
      c_np = np.dot(a_np, b_np)
    elif (layout == "NT"):
      c_np = np.dot(a_np.T, b_np)
    elif (layout == "TN"):
      c_np = np.dot(a_np, b_np.T)
    elif (layout == "TT"):
      c_np = np.dot(a_np.T, b_np.T)
  elif dtype == 'int8':
    c_np_type = np.int32
    a_np = np.random.randint(low=-128, high=128, size=shape_a).astype(np.int8)
    b_np = np.random.randint(low=-128, high=128, size=shape_b).astype(np.int8)
    if (layout == "NN"):
      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
    elif (layout == "NT"):
      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
    elif (layout == "TN"):
      c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
    elif (layout == "TT"):
      c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
  elif dtype == 'int4':
    c_np_type = np.int32
    a_np_int = np.random.randint(low=-8, high=8, size=shape_a).astype(np.int32)
    b_np_int = np.random.randint(low=-8, high=8, size=shape_b).astype(np.int32)
    # "TN"
    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
    a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
    b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
    # a_np --> col_major
    for i in range(N):
      for j in range(int(L/8)):
        for k in range(8):
          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))

    # b_np --> row_major
    for i in range(M):
      for j in range(int(L/8)):
        for k in range(8):
          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
  elif dtype == 'int1':
    c_np_type = np.int32
    # randint's ``high`` bound is exclusive, so high=2 yields values in {0, 1}
    a_np_int = np.random.randint(low=0, high=2, size=shape_a).astype(np.int32)
    b_np_int = np.random.randint(low=0, high=2, size=shape_b).astype(np.int32)
    # "TN"
    c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
    a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
    b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
    for i in range(N):
      for j in range(int(L/32)):
        for k in range(32):
          a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0x1) << (31 - k))

    for i in range(M):
      for j in range(int(L/32)):
        for k in range(32):
          b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0x1) << (31 - k))

  c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
  a_tvm = tvm.nd.array(a_np, ctx=ctx)
  b_tvm = tvm.nd.array(b_np, ctx=ctx)
  func(a_tvm, b_tvm, c_tvm)

  tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)

  evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
  print('Time cost of this operator: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)
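#########################################################################
# The int4/int1 branches above pack sub-word values into int32 words with
# explicit triple loops. A vectorized NumPy sketch of the same int4 packing
# (MSB-first, eight 4-bit values per 32-bit word); ``pack_int4_rows`` is a
# helper name introduced here, not part of the original:

def pack_int4_rows(vals):
    # vals: (rows, L) int32 array of signed 4-bit values in [-8, 7]
    rows, L = vals.shape
    packed = np.zeros((rows, L // 8), dtype=np.int32)
    for k in range(8):
        # column slice k::8 picks element j * 8 + k for every output word j
        packed |= (vals[:, k::8] & 0xF) << ((7 - k) * 4)
    return packed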
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    log_filename,
    so_file,
    measure_option,
    tuner,
    early_stopping,
):
    # N, H, W, CO, CI, KH, KW, strides, padding, dilation = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1,1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    # data = deserialize_args( ('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')) )
    # kernel = deserialize_args(('TENSOR',tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')) )
    origin_layout = 'NCHW'
    func_create = 'topi_x86_conv2d_NCHW_test'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, 'float32'),
                               target='llvm -mcpu=skylake-avx512',
                               template_key='direct')
    # task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    # tuner.tune(n_trial=trials,
    #           measure_option=measure_option,
    #           callbacks=[
    #               autotvm.callback.progress_bar(trials),
    #               autotvm.callback.log_to_file(log_filename)])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=skylake-avx512"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s,
                             arg_bufs,
                             "llvm -mcpu=skylake-avx512",
                             name="fconv")
            print("arg_bufs 0", arg_bufs[0])
            print("arg_bufs 1", arg_bufs[1])
            print("arg_bufs 2", arg_bufs[2])
            # print(func.get_source())
            # Optionally dump the generated LLVM IR to a file:
            # dump = "%s.ll" % log_filename
            # with open(dump, "a") as f:
            #     f.write(func.get_source())
    # path_dso = "/home/yufan/openmp-8.0.1.src/build/runtime/src/libomp.so"
    # m = tvm.module.load(path_dso)
    path_dso = "...your so file path" % so_file
    m = tvm.module.load(path_dso)
    fconv = m['fconv']
    iteration = 50
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    a_tvm = tvm.nd.array(a_np)
    w_tvm = tvm.nd.array(w_np)
    c_tvm = tvm.nd.empty(c_np.shape)

    print("\n============= Conti  ====================\n")
    for x in range(0, iteration):
        fconv(a_tvm, w_tvm, c_tvm)
        tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
    print("\n============= Conti DONE====================\n")

    outH = arg_bufs[2].shape[2]
    outW = arg_bufs[2].shape[3]
    ctx = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, ctx, number=500)
    time = evaluator(a_tvm, w_tvm, c_tvm).mean
    total_flop = 2 * N * outH * outW * CO * CI * KH * KW
    print('\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print('total_flop : ', total_flop)
    print('Time cost of this operator: %f' % time)
    print('GFLOPS : %f' % (total_flop / time / 1e9))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n')
Example #27
    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
    task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
    )

    log_name = 'gemm_int8.log'
    if DO_TUNING:
        tuner = autotvm.tuner.XGBTuner(task)
        tuner.tune(n_trial=1000, measure_option=measure_option,
                   callbacks=[autotvm.callback.log_to_file(log_name)])

        dispatch_context = autotvm.apply_history_best(log_name)
        best_config = dispatch_context.query(task.target, task.workload)
        print('\nBest config:')
        print(best_config)
    else:
        config = task.config_space.get(PRETUNED_INDEX)
        dispatch_context = autotvm.task.ApplyConfig(config)
        print("Using pretuned config:")
        print(config)

    with dispatch_context:
        with tvm.target.create('cuda'):
            s, arg_bufs = gemm_int8(n, m, l)
            f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8')

    ctx = tvm.context('cuda', 0)
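    #########################################################################
    # The example is truncated here; a hedged sketch of how the built
    # ``gemm_int8`` kernel is typically checked and timed (the (n, l) x (m, l)
    # int8 layout is an assumption based on TVM's gemm_int8 recipe):
    # a_np = np.random.randint(low=-128, high=128, size=(n, l)).astype(np.int8)
    # b_np = np.random.randint(low=-128, high=128, size=(m, l)).astype(np.int8)
    # a = tvm.nd.array(a_np, ctx)
    # b = tvm.nd.array(b_np, ctx)
    # c = tvm.nd.empty((n, m), dtype='int32', ctx=ctx)
    # f(a, b, c)
    # evaluator = f.time_evaluator(f.entry_name, ctx, number=100)
    # print('gemm_int8 time: %f ms' % (evaluator(a, b, c).mean * 1e3))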
Example #28
def tune(mod, params, X_ex):
    number = 10
    repeat = 1
    min_repeat_ms = 0  # since we're tuning on a CPU, can be set to 0
    timeout = 10  # in seconds

    # create a TVM runner
    runner = autotvm.LocalRunner(
        number=number,
        repeat=repeat,
        timeout=timeout,
        min_repeat_ms=min_repeat_ms,
    )

    # Create a simple structure for holding tuning options. We use an XGBoost
    # algorithm for guiding the search. For a production job, you will want to
    # set the number of trials to be larger than the value of 10 used here; for
    # CPU we recommend 1500, for GPU 3000-4000. The number of trials required
    # can depend on the particular model and processor, so it's worth spending
    # some time evaluating performance across a range of values to find the
    # best balance between tuning time and model optimization. Because tuning
    # is time-intensive we set the number of trials to 10, but we do not
    # recommend a value this small. The ``early_stopping`` parameter is the
    # minimum number of trials to run before a condition that stops the search
    # early can be applied. The measure option indicates where trial code will
    # be built and where it will be run. In this case, we're using the
    # ``LocalRunner`` we just created and a ``LocalBuilder``. The
    # ``tuning_records`` option specifies a file to write the tuning data to.

    tuning_option = {
        "tuner": "xgb",
        "trials": 10,
        "early_stopping": 100,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(build_func="default"), runner=runner
        ),
        "tuning_records": "resnet-50-v2-autotuning.json",
    }
    
    tasks = autotvm.task.extract_from_program(mod["main"], target=TARGET, params=params)

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner_obj = XGBTuner(task, loss_type="rank")
        tuner_obj.tune(
            n_trial=min(tuning_option["trials"], len(task.config_space)),
            early_stopping=tuning_option["early_stopping"],
            measure_option=tuning_option["measure_option"],
            callbacks=[
                autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
                autotvm.callback.log_to_file(tuning_option["tuning_records"]),
            ],
        )

    with autotvm.apply_history_best(tuning_option["tuning_records"]):
        with tvm.transform.PassContext(opt_level=3, config={}):
            lib = relay.build(mod, target=TARGET, params=params)

    dev = tvm.device(str(TARGET), 0)
    optimized_module = graph_executor.GraphModule(lib["default"](dev))

    optimized_module.set_input("input0", X_ex)
    optimized_module.run()  # dry run test

    return optimized_module
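#########################################################################
# A usage sketch for ``tune`` above; ``mod``, ``params``, and the sample input
# ``X_ex`` are assumed to come from an earlier relay frontend import step in
# the same script.

optimized_module = tune(mod, params, X_ex)
print("output shape:", optimized_module.get_output(0).asnumpy().shape)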
Example #29
#   # [Task 22/24]  Current/Best:   13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done.
#   # [Task 23/24]  Current/Best:   53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done.
#   # [Task 24/24]  Current/Best:   25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done.

################################################################################
# Compiling an Optimized Model with Tuning Data
# ----------------------------------------------
#
# As an output of the tuning process above, we obtained the tuning records
# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to
# generate high performance code for the model on your specified target.
#
# Now that tuning data for the model has been collected, we can re-compile the
# model using optimized operators to speed up our computations.

with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

################################################################################
# Verify that the optimized model runs and produces the same results:

dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).asnumpy()
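################################################################################
# A sketch of the numerical check itself, assuming ``unoptimized_output`` was
# captured from the pre-tuning build of the same model on the same ``img_data``
# (that variable is an assumption and is not defined above):

np.testing.assert_allclose(tvm_output, unoptimized_output, rtol=1e-4, atol=1e-4)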
Example #30
measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))

# begin tuning, log records to file `matmul.log`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
Example #31
# You can use alternatives like XGBTuner.
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("matmul.log")],
)

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best("matmul.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul(N, L, M, "float32")
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
Example #32
# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(
    n_trial=20,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("conv2d.log")],
)

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.

# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best("conv2d.log"):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides,
                                         padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
Example #33
    lat = (end - beg) * 1e3
    if lat >= min_repeat_ms:
        break
    number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618))
print('mxnet mean lat: %.2f ms' % (lat / number))
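#########################################################################
# The MXNet timing snippet above begins mid-loop. A sketch of the adaptive
# loop it is drawn from (the initial ``number = 1`` and the exact forward call
# are assumptions; the input names mirror the ``ex.set_input`` call below):
# grow the number of runs until one timed window spans at least
# ``min_repeat_ms``, then report the mean latency.

# number = 1
# while True:
#     beg = time.time()
#     for _ in range(number):
#         mx_net(inputs, token_types, valid_length)
#     end = time.time()
#     lat = (end - beg) * 1e3
#     if lat >= min_repeat_ms:
#         break
#     number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618))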

mod, params = relay.frontend.from_mxnet(mx_net, shape_dict)
ctx = tvm.cpu()
if args.arm:
    target = "llvm -device=arm_cpu -target=aarch64-linux-gnu"
else:
    target = "llvm -mcpu=skylake-avx512"

log_path = "autotvm_logs"
logs = [os.path.join(log_path, f) for f in os.listdir(log_path)]
autotvm_ctx = autotvm.apply_history_best(None)
for log_file in logs:
    autotvm_ctx.load(log_file)

# apply logs
print("Compile...")
with autotvm_ctx:
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod[mod.entry_func], target, params=params)

# benchmark
print("Check correctness...")
ex = runtime.create(graph, lib, ctx)
ex.set_input(data0=inputs, data1=token_types, data2=valid_length, **params)
ex.run()
out = ex.get_output(0)
Example #34
print("Compiling the model...")

out = 'yolov3.tx2.gpu'

ins = 'yolov3.x86.gpu'

graph = load_tvm_graph('graph/{}'.format(ins))

params = load_tvm_params('params/{}'.format(ins))

symbol = graph.symbol

[neth, netw] = shape['data'][2:] # Current image shape is 608x608

with autotvm.apply_history_best('yolov3-darknet.tx2.gpu.log'):
    with nnvm.compiler.build_config(opt_level=2):
        graph, lib, params = nnvm.compiler.build(symbol, target, shape, dtype=dtype_dict, params=params)

######################################################################
# Load a test image
# --------------------------------------------------------------------
test_image = 'dog.jpg'
print("Loading the test image...")
img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
          test_image + '?raw=true'
Example #35
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net,
                                            target=target,
                                            target_host=target_host,
                                            shape={'data': input_shape},
                                            dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,
                                                     nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net,
                target=target,
                target_host=target_host,
                shape={'data': input_shape},
                params=params,
                dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key,
                                                'localhost',
                                                9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Example #36
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4),
)

# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('conv2d.log')])

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.

# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
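#########################################################################
# Example #36 is truncated here; following the full conv2d tutorial, the check
# would typically continue as below (a sketch, assuming a local CUDA device):

# ctx = tvm.gpu()
# a_tvm = tvm.nd.array(a_np, ctx=ctx)
# w_tvm = tvm.nd.array(w_np, ctx=ctx)
# c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
# func(a_tvm, w_tvm, c_tvm)
# tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)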