def verify_convolution(input_dim, filter, padding):
    dtype = 'float32'
    N, C, H, W = input_dim
    OC, _, KH, KW = filter
    a_np = np.random.uniform(size=input_dim).astype(dtype)
    w_np = np.random.uniform(size=(OC, C, KH, KW)).astype(dtype)
    w_np_cm = np.transpose(w_np, axes=(2, 3, 1, 0))
    b_np = conv2d_nchw_python(a_np, w_np, [1, 1], padding)
    inputs = [('input1', datatypes.Array(C, H, W))]
    output = [('output', datatypes.Array(*b_np.shape))]
    builder = NeuralNetworkBuilder(inputs, output)
    builder.add_convolution(name='conv',
                            kernel_channels=3,
                            output_channels=OC,
                            height=KH,
                            width=KW,
                            stride_height=1,
                            stride_width=1,
                            border_mode=padding.lower(),
                            groups=1,
                            W=w_np_cm,
                            b=None,
                            has_bias=False,
                            is_deconv=False,
                            input_name='input1',
                            output_name='output')
    model = cm.models.MLModel(builder.spec)
    for target, ctx in ctx_list():
        out = run_tvm_graph(model,
                            target,
                            ctx, [a_np], ['input1'],
                            output_shape=None)
        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
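# All of the examples in this listing validate results against
# conv2d_nchw_python from tvm.topi.testing. As a rough sketch of the
# arithmetic that reference performs (explicit zero padding, unit dilation;
# an illustration only, not TVM's actual implementation):
import numpy as np

def conv2d_nchw_ref(data, kernel, stride, pad):
    n, c, h, w = data.shape
    oc, _, kh, kw = kernel.shape
    sh, sw = stride
    ph, pw = pad
    padded = np.pad(data, ((0, 0), (0, 0), (ph, ph), (pw, pw)))
    oh = (h + 2 * ph - kh) // sh + 1
    ow = (w + 2 * pw - kw) // sw + 1
    out = np.zeros((n, oc, oh, ow), dtype=data.dtype)
    for b in range(n):
        for o in range(oc):
            for y in range(oh):
                for x in range(ow):
                    window = padded[b, :, y * sh:y * sh + kh, x * sw:x * sw + kw]
                    out[b, o, y, x] = np.sum(window * kernel[o])
    return out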
Example #2
def check_conv2d_output(
        data_tensor: LabelledTensor, kernel_tensor: LabelledTensor,
        micro_output_tensor: LabelledTensor, strides, padding):
    data_nchw_np = data_tensor.with_layout('NCHW').data
    kernel_oihw_np = kernel_tensor.with_layout('OIHW').data
    micro_output_nchw_np = micro_output_tensor.with_layout('NCHW').data

    topi_output_np = conv2d_nchw_python(data_nchw_np, kernel_oihw_np, strides, padding)
    tvm.testing.assert_allclose(micro_output_nchw_np.shape, topi_output_np.shape)
    for i in range(micro_output_nchw_np.shape[0]):
        tvm.testing.assert_allclose(micro_output_nchw_np[i], topi_output_np[i])
        print('ok', micro_output_nchw_np[i])
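# LabelledTensor is defined elsewhere in the source project. A minimal sketch
# of the interface assumed above (a numpy array tagged with a layout string
# that can be permuted into another layout); names and details here are
# assumptions, not the project's actual class:
from dataclasses import dataclass

import numpy as np

@dataclass
class LabelledTensor:
    data: np.ndarray
    layout: str  # e.g. 'NCHW', 'NHWC', 'OIHW'

    def with_layout(self, new_layout: str) -> 'LabelledTensor':
        perm = [self.layout.index(axis) for axis in new_layout]
        return LabelledTensor(np.transpose(self.data, perm), new_layout)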
Example #3
kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT, env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN, env.BLOCK_IN,
                                  kernel_h, kernel_w).transpose(
                                      (0, 2, 4, 5, 1, 3))
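# The reshape/transpose above packs the OIHW kernel into VTA's blocked layout
# (O // BLOCK_OUT, I // BLOCK_IN, KH, KW, BLOCK_OUT, BLOCK_IN); data_packed is
# assumed to have been blocked the same way earlier in the tutorial.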

# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                             kernel_np.astype(env.acc_dtype),
                             (stride_h, stride_w),
                             (pad_h, pad_w)).astype(env.acc_dtype)
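# Requantize the wide accumulator result back to the narrow activation type:
# arithmetic right shift by the input bit width, clip to the valid range, cast.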
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape(
    (batch_size // env.BATCH, env.BATCH, out_channels // env.BLOCK_OUT,
     env.BLOCK_OUT, fout_height, fout_width)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
print("Successful 2D convolution test!")

######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# lower 2D convolution onto hardware accelerator intrinsics, making
Example #4
# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides,
                                         padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.gpu()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
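# As in the int8 example later in this listing, the measured mean time can be
# converted into an effective GFLOPS figure using the direct-convolution FLOP
# count (2 operations per multiply-accumulate):
mean_s = evaluator(a_tvm, w_tvm, c_tvm).mean
num_flops = 2 * N * c_np.shape[2] * c_np.shape[3] * CO * CI * KH * KW
print('Effective GFLOPS: %.2f' % (num_flops / mean_s / 1e9))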
Example #5
        print(tvm.lower(s, arg_bufs, simple_mode=True))
        func = tvm.build(s, arg_bufs)
        print(func.imported_modules[0].get_source())

# check correctness
a_np = np.random.randint(size=(N, CI // BI, H, W, BI),
                         low=-128,
                         high=127,
                         dtype='int8')
w_np = np.random.randint(size=(CO // BO, CI // BI, KH, KW, BO, BI),
                         low=-128,
                         high=127,
                         dtype='int8')
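# Unpack the blocked int8 tensors back into plain NCHW / OIHW so they can be
# fed to the conv2d_nchw_python reference.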
a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W)
w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW)
c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8')
c_np = c_np.reshape(N, CO // BO, BO, *c_np.shape[2:]).transpose(0, 1, 3, 4, 2)

ctx = tvm.gpu()
a_tvm = tvm.nd.empty(a_np.shape, dtype='int8', ctx=ctx).copyfrom(a_np)
w_tvm = tvm.nd.empty(w_np.shape, dtype='int8', ctx=ctx).copyfrom(w_np)
c_tvm = tvm.nd.empty(c_np.shape, dtype='int8', ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1000)
t = evaluator(a_tvm, w_tvm, c_tvm).mean
num_flops = N * c_np.shape[-2] * c_np.shape[-3] * CO * CI * KH * KW * 2  # 2 FLOPs (mul + add) per MAC
GFLOPS = num_flops / (t * 1e9)  # t is in seconds
print('Time cost of this operator: %f, %g GFLOPS' % (t, GFLOPS))
Example #6
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    log_filename,
    so_file,
    measure_option,
    tuner,
    early_stopping,
):
    # N, H, W, CO, CI, KH, KW, strides, padding, dilation = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1,1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    # data = deserialize_args( ('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')) )
    # kernel = deserialize_args(('TENSOR',tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')) )
    origin_layout = 'NCHW'
    func_create = 'topi_x86_conv2d_NCHW_test'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, 'float32'),
                               target='llvm -mcpu=skylake-avx512',
                               template_key='direct')
    # task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    # tuner.tune(n_trial=trials,
    #           measure_option=measure_option,
    #           callbacks=[
    #               autotvm.callback.progress_bar(trials),
    #               autotvm.callback.log_to_file(log_filename)])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)
    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=skylake-avx512"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s,
                             arg_bufs,
                             "llvm -mcpu=skylake-avx512",
                             name="fconv")
            print("arg_bufs 0", arg_bufs[0])
            print("arg_bufs 1", arg_bufs[1])
            print("arg_bufs 2", arg_bufs[2])
            # print(func.get_source())
            ''' 
            dump = "%s.ll" % log_filename
            f = open(dump, "a")
            f.write(func.get_source())
            f.close()
	    '''
    path_dso = "...your so file path/%s" % so_file  # placeholder: substitute the real path of the prebuilt .so
    m = tvm.module.load(path_dso)
    fconv = m['fconv']
    iteration = 50
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    a_tvm = tvm.nd.array(a_np)
    w_tvm = tvm.nd.array(w_np)
    c_tvm = tvm.nd.empty(c_np.shape)

    print("\n============= Conti  ====================\n")
    for x in range(0, iteration):
        fconv(a_tvm, w_tvm, c_tvm)
        tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
    print("\n============= Conti DONE====================\n")

    outH = arg_bufs[2].shape[2]
    outW = arg_bufs[2].shape[3]
    ctx = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, ctx, number=500)
    time = evaluator(a_tvm, w_tvm, c_tvm).mean
    total_flop = 2 * N * outH * outW * CO * CI * KH * KW
    print('\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print('total_flop : ', total_flop)
    print('Time cost of this operator: %f' % time)
    print('GFLOPs : %f' % (total_flop / time / 1e9))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n')
Example #7
def tune_kernels(
    args,
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    key,
    measure_option,
    tuner,
    early_stopping,
):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    origin_layout = 'NCHW'

    feature_type = args.feature
    print('Feature:', feature_type)

    if 'small' == args.search_size:
        func_create = 'conv2d_NCHWc_small.x86'
    elif 'mid' == args.search_size:
        func_create = 'conv2d_NCHWc_mid.x86'
    elif 'wide' == args.search_size:
        func_create = 'conv2d_NCHWc_wide.x86'
    elif 'huge' == args.search_size:
        func_create = 'conv2d_NCHWc_huge.x86'
    #elif 'nchw_small' == args.search_size:
    #    func_create = 'conv2d_NCHW_small.x86'
    #elif 'nchw_mid' == args.search_size:
    #    func_create = 'conv2d_NCHW_mid.x86'
    #elif 'nchw_wide' == args.search_size:
    #    func_create = 'conv2d_NCHW_wide.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    count = args.num_iters
    likwid_event = args.likwid_event
    random = args.random
    sa_n_iter = args.sa_num_iters
    save_features = not (args.no_save_features)

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='c')
    if 'NCHWc' in func_create:
        using_NCHWc = True
    else:
        using_NCHWc = False
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        if random:
            log_filename = '%s_%i_%s_%s_%icore_rand_gcc.log' % (
                key, i, feature_type, args.search_size, num_threads)
        else:
            log_filename = '%s_%i_%s_%s_%icore_gcc.log' % (
                key, i, feature_type, args.search_size, num_threads)

        if args.key_id is not None and count == 1:
            save_ind = int(args.key_id)
        else:
            save_ind = i
        if likwid_event is not None:
            if random:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_likwid_rand_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size,
                    save_ind)
            else:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_likwid_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size,
                    save_ind)
        else:
            if random:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_rand_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size,
                    save_ind)
            else:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_sa_%i_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    sa_n_iter, key, feature_type, num_threads, trials,
                    args.search_size, save_ind)
        if os.path.exists(pickle_file):
            print('File exists', pickle_file)
            continue

        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=32,
                                       sa_n_iter=sa_n_iter)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event,
                   save_features=save_features,
                   random=random)
        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("c"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], KH, KW)).transpose(
                     (0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
        else:
            # fall back to the plain NCHW arrays when the template is not NCHWc
            a_np_reshape, w_np_reshape, c_np_reshape = a_np, w_np, c_np
        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape,
                                        c_tvm.asnumpy(),
                                        rtol=1e-2)
        except Exception:
            print('WARNING: Not equal!')
        evaluator = func.time_evaluator(func.entry_name,
                                        ctx,
                                        repeat=3,
                                        number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))
        os.remove(log_filename)

        #print(tvm.lower(s, arg_bufs, simple_mode=True))
        if save_features:
            with open(pickle_file, 'wb') as output:
                pickle.dump(
                    [best_config, task, tuner.cost_model.saved_features],
                    output, pickle.HIGHEST_PROTOCOL)
Example #8
kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
                                  env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN,
                                  env.BLOCK_IN,
                                  kernel_h,
                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                            kernel_np.astype(env.acc_dtype),
                            (stride_h, stride_w),
                            (pad_h, pad_w)).astype(env.acc_dtype)
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape((batch_size // env.BATCH,
                           env.BATCH,
                           out_channels // env.BLOCK_OUT,
                           env.BLOCK_OUT,
                           fout_height,
                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
print("Successful 2D convolution test!")

######################################################################
# Summary
Example #9
def build_conv2d_module(opts):
    batch = 1
    in_channel = 3
    out_channel = 16
    in_size = 8
    kernel = 3
    pad = 1
    stride = 1

    A = relay.var('A', shape=(batch, in_channel, in_size, in_size))
    W = relay.var('W', shape=(out_channel, in_channel, kernel, kernel))
    B = relay.nn.conv2d(A,
                        W,
                        strides=(stride, stride),
                        padding=(pad, pad),
                        kernel_size=kernel,
                        data_layout='NCHW',
                        kernel_layout='OIHW',
                        out_layout='',
                        out_dtype='')

    a_data = np.random.uniform(size=(batch, in_channel, in_size,
                                     in_size)).astype('float32')
    w_data = np.random.uniform(size=(out_channel, in_channel, kernel,
                                     kernel)).astype('float32')
    func = relay.Function([A, W], B)
    params = {"W": w_data}
    graph, lib, params = relay.build_module.build(tvm.IRModule.from_expr(func),
                                                  target=TARGET,
                                                  params=params)

    build_dir = os.path.abspath(opts.out_dir)
    if not os.path.isdir(build_dir):
        os.makedirs(build_dir)

    lib.save(os.path.join(build_dir, 'conv2d_model.o'))
    with open(os.path.join(build_dir, 'conv2d_graph.json'),
              'w') as f_graph_json:
        f_graph_json.write(graph)
    with open(os.path.join(build_dir, 'conv2d_params.bin'), 'wb') as f_params:
        f_params.write(relay.save_param_dict(params))
    with open(os.path.join(build_dir, "conv2d_data.bin"), "wb") as fp:
        fp.write(a_data.astype(np.float32).tobytes())

    ## get TVM result on local machine
    params = {"W": w_data}
    local_target = 'llvm --system-lib'
    graph, lib, params = relay.build_module.build(tvm.IRModule.from_expr(func),
                                                  target=local_target,
                                                  params=params)
    tvm_out = run_conv2d_module(a_data,
                                graph,
                                lib,
                                params,
                                target=local_target)
    b_np = conv2d_nchw_python(a_data, w_data, (stride, stride), (pad, pad))
    print("TVM Output: " + str(tvm_out.shape))
    print("Numpy Output: " + str(b_np.shape))
    np.testing.assert_allclose(b_np, tvm_out, rtol=1e-2)
    with open(os.path.join(build_dir, "conv2d_output.bin"), "wb") as fp:
        fp.write(tvm_out.astype(np.float32).tobytes())
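# run_conv2d_module is defined elsewhere in the source project. A plausible
# sketch, assuming the legacy graph_runtime API and that the Relay input is
# the variable named 'A' above (names and defaults here are assumptions):
from tvm.contrib import graph_runtime

def run_conv2d_module(input_data, graph, lib, params, target='llvm --system-lib'):
    ctx = tvm.cpu(0)
    module = graph_runtime.create(graph, lib, ctx)
    module.set_input('A', input_data)      # activation tensor
    module.set_input(**params)             # pre-built parameters (the weights)
    module.run()
    return module.get_output(0).asnumpy()  # NCHW float32 output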
Example #10
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    key,
    measure_option,
    tuner,
    early_stopping,
):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    origin_layout = 'NCHW'

    if len(sys.argv) > 2:
        feature_type = sys.argv[2]
    else:
        #feature_type = 'datavol'
        feature_type = 'itervar'
        #feature_type = 'datavol_itervar'
    print('Feature:', feature_type)

    if len(sys.argv) > 3:
        if 'small' == sys.argv[3]:
            func_create = 'conv2d_NCHW_small.x86'
        elif 'wide' == sys.argv[3]:
            func_create = 'conv2d_NCHW_wide.x86'
        else:
            func_create = 'conv2d_NCHWc.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    if len(sys.argv) > 4:
        count = int(sys.argv[4])
    else:
        count = 1

    if len(sys.argv) > 5:
        likwid_event = sys.argv[5]
    else:
        likwid_event = None

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = True
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        log_filename = '%s_%i_%s_%s_%icore_rand.log' % (
            key, i, feature_type, sys.argv[3], num_threads)
        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=32)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event)
        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("llvm -mcpu=core-avx2"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], KH, KW)).transpose(
                     (0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose(
                     (0, 1, 3, 4, 2))
        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape,
                                        c_tvm.asnumpy(),
                                        rtol=1e-2)
        except Exception:
            print('WARNING: Not equal!')
        evaluator = func.time_evaluator(func.entry_name,
                                        ctx,
                                        repeat=3,
                                        number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))
        os.remove(log_filename)

    print(tvm.lower(s, arg_bufs, simple_mode=True))
    if likwid_event is not None:
        with open(
                'data/likwid_rand_%s_%s_features_%icore_%i_%s.pkl' %
            (key, feature_type, num_threads, trials, sys.argv[3]),
                'wb') as output:
            pickle.dump([best_config, task, tuner.cost_model.saved_features],
                        output, pickle.HIGHEST_PROTOCOL)
    else:
        with open(
                'data/%s_%s_features_%icore_%i_%s.pkl' %
            (key, feature_type, num_threads, trials, sys.argv[3]),
                'wb') as output:
            pickle.dump([best_config, task, tuner.cost_model.saved_features],
                        output, pickle.HIGHEST_PROTOCOL)
Example #11
def main_body(feature_type, source, target, history_file, n_trial=1000):
    assert source in conv_configs, "Source '{}' is not a convolution config.".format(
        source)
    assert target in conv_configs, "Target '{}' is not a convolution config.".format(
        target)
    assert os.path.exists(
        history_file), "History file '{}' does not exist.".format(history_file)

    # filename = 'conv2d_{}'.format(feature_type)
    filename = 'conv2d_transfer_{}_{}_{}'.format(feature_type, source, target)
    # NOTE: Important to use `fresh(filename, ...)` to prevent file overwriting.
    log_file = fresh(filename, 'log')
    dump_file = fresh(filename, 'txt')

    # NOTE: Dump file will contain info from later iterations.
    # logging.getLogger('autotvm').addHandler(logging.FileHandler(dump_file))

    # Target for transfer learning.
    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[target]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides,
                                     padding),
                               target='cuda')
    print(task.config_space)
    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)

    # Load source history log file for transfer learning.
    print("Load history for transfer learning.")
    history_data = autotvm.record.load_from_file(history_file)
    # Note: `load_from_file` returns a generator.
    # Convert to list to prevent accidental consumption.
    history_data = list(history_data)
    tuner.load_history(history_data)

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                            runner=autotvm.LocalRunner(
                                                repeat=3,
                                                min_repeat_ms=100,
                                                timeout=run_timeout))

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from config file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides,
                                             padding)
            # ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time.
    # Choose a large repeat number to reduce noise.
    repeat_number = 1000
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' %
          evaluator(a_tvm, w_tvm, c_tvm).mean)
Example #12
def tune_kernels(
    N,
    H,
    W,
    CO,
    CI,
    KH,
    KW,
    strides,
    padding,
    dilation,
    trials,
    log_filename,
    measure_option,
    tuner,
    early_stopping,
):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')

    origin_layout = 'NCHW'

    func_create = 'conv2d_NCHW_dv.x86'
    #func_create = 'conv2d_nchw_spatial_pack.dv.x86'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = False

    # Uncomment to run x86 script.
    #func_create = 'conv2d_NCHWc.x86'
    #task = autotvm.task.create(func_create,
    #                           args=(data, kernel, strides, padding, 1, origin_layout, origin_layout, 'float32'),
    #                           target='llvm -mcpu=core-avx2')
    #using_NCHWc = True

    #task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    #print(len(task.config_space))
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    if len(sys.argv) > 1:
        feature_type = sys.argv[1]
    else:
        feature_type = 'datavol'
        #feature_type = 'itervar'
        #feature_type = 'datavol_itervar'
    print('Feature:', feature_type)
    for i in range(1):
        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=8)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event='CACHE')
    #with open('data/%s_features_1core_%i_n%i_%i.pkl' % (feature_type, H, N, trials) , 'wb') as output:
    #with open('data/likwid_test.pkl' , 'wb') as output:
    #    pickle.dump([task, tuner.cost_model.saved_features], output, pickle.HIGHEST_PROTOCOL)

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=core-avx2"):
            s, arg_bufs = task.func(*task.args)
            func = tvm.build(s, arg_bufs)
            #print(tvm.lower(s, arg_bufs, simple_mode=True))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    if using_NCHWc:
        # repack the activations into NCHW8c: (N, CI//8, 8, H, W) -> (N, CI//8, H, W, 8)
        a_np = a_np.reshape((N, CI // 8, 8, H, W)).transpose((0, 1, 3, 4, 2))
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=np.float32), ctx=ctx)
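    # Note: arg_bufs from this custom template evidently orders the buffers as
    # (output, kernel, data), so the output array is passed first below.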
    func(c_tvm, w_tvm, a_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print(evaluator(c_tvm, w_tvm, a_tvm))
Example #13
def main_body(feature_type, conv_config, n_trial):
    filename = 'conv2d_{}_{}_n{}'.format(conv_config, feature_type, n_trial)
    # log_file = '{}.log'.format(filename)
    # dump_file = '{}.txt'.format(filename)
    # NOTE: Important to use `fresh(filename, ...)` to prevent file overwriting.
    log_file = fresh(filename, 'log')
    dump_file = fresh(filename, 'txt')

    # NOTE: Result directory experiments.
    # results_dir = fresh_dir('results')
    # log_file = fresh(os.path.join(results_dir, filename), 'log')
    # dump_file = fresh(os.path.join(results_dir, filename), 'txt')
    # # log_file = os.path.join(results_dir, log_file)
    # # dump_file = os.path.join(results_dir, dump_file)

    # NOTE: Dump file will contain info from later iterations.
    # logging.getLogger('autotvm').addHandler(logging.FileHandler(dump_file))

    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[conv_config]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides,
                                     padding),
                               target='cuda')
    print(task.config_space)

    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                            runner=autotvm.LocalRunner(
                                                repeat=3,
                                                min_repeat_ms=100,
                                                timeout=run_timeout))

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from config file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides,
                                             padding)
            # ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time.
    # Choose a large repeat number to reduce noise.
    repeat_number = 1000
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' %
          evaluator(a_tvm, w_tvm, c_tvm).mean)
def main(feature_type):
    conv_config = 'c12'
    filename = 'conv2d_{}'.format(feature_type)
    log_file = '{}.log'.format(filename)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    # C12: the last conv layer in resnet.
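    # conv_configs is defined elsewhere in the source script. A plausible
    # 'c12' entry, matching the ResNet workload used in TVM's conv2d tuning
    # tutorial (an assumption, not the script's actual table), would be
    # {'c12': (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1))}.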
    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[conv_config]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding),
                               target='cuda')
    print(task.config_space)

    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)
    n_trial = 1000
    repeat_number = 1000

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=run_timeout)
    )

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time.
    # Choose a large repeat number to reduce noise.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
Example #15
# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.gpu()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)