def verify_convolution(input_dim, filter, padding):
    dtype = 'float32'
    N, C, H, W = input_dim
    OC, _, KH, KW = filter
    a_np = np.random.uniform(size=input_dim).astype(dtype)
    w_np = np.random.uniform(size=(OC, C, KH, KW)).astype(dtype)
    # CoreML expects weights in (H, W, I, O) order, so transpose from OIHW.
    w_np_cm = np.transpose(w_np, axes=(2, 3, 1, 0))
    b_np = conv2d_nchw_python(a_np, w_np, [1, 1], padding)
    inputs = [('input1', datatypes.Array(C, H, W))]
    output = [('output', datatypes.Array(*b_np.shape))]
    builder = NeuralNetworkBuilder(inputs, output)
    builder.add_convolution(name='conv',
                            kernel_channels=C,  # was hardcoded to 3; must match the input channel count
                            output_channels=OC,
                            height=KH,
                            width=KW,
                            stride_height=1,
                            stride_width=1,
                            border_mode=padding.lower(),
                            groups=1,
                            W=w_np_cm,
                            b=None,
                            has_bias=False,
                            is_deconv=False,
                            input_name='input1',
                            output_name='output')
    model = cm.models.MLModel(builder.spec)
    for target, ctx in ctx_list():
        out = run_tvm_graph(model, target, ctx, [a_np], ['input1'], output_shape=None)
        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
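# The (2, 3, 1, 0) transpose above converts weights from TVM's OIHW layout to
# the (H, W, I, O) layout CoreML's NeuralNetworkBuilder expects. A minimal,
# self-contained NumPy sketch of that conversion (all names here are
# illustrative, not taken from the snippet above):
import numpy as np

oc, ic, kh, kw = 16, 3, 3, 3
w_oihw = np.random.uniform(size=(oc, ic, kh, kw)).astype('float32')
w_hwio = np.transpose(w_oihw, axes=(2, 3, 1, 0))   # OIHW -> HWIO
assert w_hwio.shape == (kh, kw, ic, oc)
# The transpose is a pure permutation, so the inverse permutation recovers it:
assert np.array_equal(np.transpose(w_hwio, axes=(3, 2, 0, 1)), w_oihw)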
def check_conv2d_output(data_tensor: LabelledTensor,
                        kernel_tensor: LabelledTensor,
                        micro_output_tensor: LabelledTensor,
                        strides, padding):
    data_nchw_np = data_tensor.with_layout('NCHW').data
    kernel_oihw_np = kernel_tensor.with_layout('OIHW').data
    micro_output_nchw_np = micro_output_tensor.with_layout('NCHW').data

    topi_output_np = conv2d_nchw_python(data_nchw_np, kernel_oihw_np, strides, padding)
    tvm.testing.assert_allclose(micro_output_nchw_np.shape, topi_output_np.shape)
    for i in range(micro_output_nchw_np.shape[0]):
        tvm.testing.assert_allclose(micro_output_nchw_np[i], topi_output_np[i])
        print('ok', micro_output_nchw_np[i])
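# `LabelledTensor` is not defined in this snippet. A minimal sketch of the
# interface the checker relies on (hypothetical implementation; the real class
# may differ): a NumPy array tagged with a layout string, where `with_layout`
# permutes axes so their labels match the requested layout.
import numpy as np

class LabelledTensor:
    def __init__(self, data, layout):
        self.data = data        # NumPy array
        self.layout = layout    # e.g. 'NCHW', 'NHWC', 'OIHW', 'HWIO'

    def with_layout(self, new_layout):
        # Find each requested axis label in the current layout, then permute.
        perm = [self.layout.index(axis) for axis in new_layout]
        return LabelledTensor(self.data.transpose(perm), new_layout)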
kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
                                  env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN,
                                  env.BLOCK_IN,
                                  kernel_h,
                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                             kernel_np.astype(env.acc_dtype),
                             (stride_h, stride_w),
                             (pad_h, pad_w)).astype(env.acc_dtype)
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape((batch_size // env.BATCH,
                           env.BATCH,
                           out_channels // env.BLOCK_OUT,
                           env.BLOCK_OUT,
                           fout_height,
                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
print("Successful 2D convolution test!")

######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# lower 2D convolution onto hardware accelerator intrinsics, making
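# The reference path above requantizes the int32 accumulator with an
# arithmetic right shift followed by a clip. A tiny NumPy illustration of
# that fixed-point step (the constants here are made up for the example):
import numpy as np

acc = np.array([-1000, -5, 0, 300, 70000], dtype=np.int32)  # conv accumulators
shift = 8          # stands in for env.INP_WIDTH
inp_max = 127      # stands in for the saturation bound

out = acc >> shift                 # arithmetic shift drops low-order bits
out = np.clip(out, 0, inp_max)     # saturate into the activation range
print(out)                         # [  0   0   0   1 127]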
# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.gpu()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
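# The evaluator reports the mean kernel time in seconds. Converting that into
# a throughput figure uses the standard direct-convolution FLOP count,
# 2 * N * OH * OW * CO * CI * KH * KW (one multiply plus one add per MAC).
# A short sketch of that arithmetic, assuming `strides` and `padding` are
# (h, w) tuples as in this snippet:
OH = (H + 2 * padding[0] - KH) // strides[0] + 1
OW = (W + 2 * padding[1] - KW) // strides[1] + 1
num_flops = 2 * N * OH * OW * CO * CI * KH * KW
mean_time = evaluator(a_tvm, w_tvm, c_tvm).mean
print('%.2f GFLOPS' % (num_flops / mean_time / 1e9))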
print(tvm.lower(s, arg_bufs, simple_mode=True))
func = tvm.build(s, arg_bufs)
print(func.imported_modules[0].get_source())

# check correctness
a_np = np.random.randint(size=(N, CI // BI, H, W, BI),
                         low=-128, high=127, dtype='int8')
w_np = np.random.randint(size=(CO // BO, CI // BI, KH, KW, BO, BI),
                         low=-128, high=127, dtype='int8')
a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W)
w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW)
c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8')
c_np = c_np.reshape(N, CO // BO, BO, *c_np.shape[2:]).transpose(0, 1, 3, 4, 2)

ctx = tvm.gpu()
a_tvm = tvm.nd.empty(a_np.shape, dtype='int8', ctx=ctx).copyfrom(a_np)
w_tvm = tvm.nd.empty(w_np.shape, dtype='int8', ctx=ctx).copyfrom(w_np)
c_tvm = tvm.nd.empty(c_np.shape, dtype='int8', ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1000)
t = evaluator(a_tvm, w_tvm, c_tvm).mean

num_flops = N * c_np.shape[-2] * c_np.shape[-3] * CO * CI * KH * KW * 2
GFLOPS = num_flops / (t * 1e3) / 1e6
print('Time cost of this operator: %f, %g GFLOPS' % (t, GFLOPS))
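# The transpose/ravel/reshape pair above unpacks the blocked int8 layouts
# (NCHW[x]c for data, OIHW[x]o[x]i for weights) back to plain NCHW / OIHW so
# that conv2d_nchw_python can serve as the reference. A standalone NumPy check
# that this unpacking is lossless (shapes and block sizes are illustrative):
import numpy as np

n, ci, h, w, bi = 1, 8, 4, 4, 4
a = np.random.randint(-128, 127, size=(n, ci // bi, h, w, bi), dtype=np.int8)
a_unpacked = a.transpose((0, 1, 4, 2, 3)).ravel().reshape(n, ci, h, w)
# Re-pack and confirm we recover the original blocked tensor:
a_repacked = a_unpacked.reshape(n, ci // bi, bi, h, w).transpose((0, 1, 3, 4, 2))
assert np.array_equal(a, a_repacked)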
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, log_filename, so_file,
                 measure_option, tuner, early_stopping):
    # N, H, W, CO, CI, KH, KW, strides, padding, dilation = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    # data = deserialize_args(('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')))
    # kernel = deserialize_args(('TENSOR', tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')))
    origin_layout = 'NCHW'

    func_create = 'topi_x86_conv2d_NCHW_test'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1, origin_layout, 'float32'),
                               target='llvm -mcpu=skylake-avx512',
                               template_key='direct')
    # task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    # tuner.tune(n_trial=trials,
    #            measure_option=measure_option,
    #            callbacks=[
    #                autotvm.callback.progress_bar(trials),
    #                autotvm.callback.log_to_file(log_filename)])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=skylake-avx512"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s, arg_bufs, "llvm -mcpu=skylake-avx512", name="fconv")
            print("arg_bufs 0", arg_bufs[0])
            print("arg_bufs 1", arg_bufs[1])
            print("arg_bufs 2", arg_bufs[2])
            # print(func.get_source())
            '''
            dump = "%s.ll" % log_filename
            f = open(dump, "a")
            f.write(func.get_source())
            f.close()
            '''

    path_dso = "...your so file path/%s" % so_file  # placeholder: fill in the real .so path
    m = tvm.module.load(path_dso)
    fconv = m['fconv']

    iteration = 50
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    a_tvm = tvm.nd.array(a_np)
    w_tvm = tvm.nd.array(w_np)
    c_tvm = tvm.nd.empty(c_np.shape)

    print("\n============= Conti ====================\n")
    for x in range(0, iteration):
        fconv(a_tvm, w_tvm, c_tvm)
    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
    print("\n============= Conti DONE ====================\n")

    outH = arg_bufs[2].shape[2]
    outW = arg_bufs[2].shape[3]
    ctx = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, ctx, number=500)
    time = evaluator(a_tvm, w_tvm, c_tvm).mean
    total_flop = 2 * N * outH * outW * CO * CI * KH * KW
    print('\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print('total_flop : ', total_flop)
    print('Time cost of this operator: %f' % time)
    print('GFLOPS : %f' % (total_flop / time / 1e9))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n')
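# The script above loads a prebuilt shared object with tvm.module.load but
# never shows how it was produced. A minimal sketch, assuming the .so was
# exported from the same `func` built above ("conv2d.so" is a placeholder
# filename):
func.export_library("conv2d.so")   # compile + link the built module into a shared object
m = tvm.module.load("conv2d.so")   # reload it later, or on another machine
fconv = m['fconv']                 # entry name passed to tvm.build(..., name="fconv")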
def tune_kernels(args, N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, key, measure_option, tuner, early_stopping):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    origin_layout = 'NCHW'

    feature_type = args.feature
    print('Feature:', feature_type)

    if 'small' == args.search_size:
        func_create = 'conv2d_NCHWc_small.x86'
    elif 'mid' == args.search_size:
        func_create = 'conv2d_NCHWc_mid.x86'
    elif 'wide' == args.search_size:
        func_create = 'conv2d_NCHWc_wide.x86'
    elif 'huge' == args.search_size:
        func_create = 'conv2d_NCHWc_huge.x86'
    #elif 'nchw_small' == args.search_size:
    #    func_create = 'conv2d_NCHW_small.x86'
    #elif 'nchw_mid' == args.search_size:
    #    func_create = 'conv2d_NCHW_mid.x86'
    #elif 'nchw_wide' == args.search_size:
    #    func_create = 'conv2d_NCHW_wide.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    count = args.num_iters
    likwid_event = args.likwid_event
    random = args.random
    sa_n_iter = args.sa_num_iters
    save_features = not args.no_save_features

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='c')
    using_NCHWc = 'NCHWc' in func_create
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        if random:
            log_filename = '%s_%i_%s_%s_%icore_rand_gcc.log' % (
                key, i, feature_type, args.search_size, num_threads)
        else:
            log_filename = '%s_%i_%s_%s_%icore_gcc.log' % (
                key, i, feature_type, args.search_size, num_threads)

        if args.key_id is not None and count == 1:
            save_ind = int(args.key_id)
        else:
            save_ind = i

        if likwid_event is not None:
            if random:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_likwid_rand_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size, save_ind)
            else:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_likwid_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size, save_ind)
        else:
            if random:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_rand_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    key, feature_type, num_threads, trials, args.search_size, save_ind)
            else:
                pickle_file = '/media/frost/DATA/tvm_data/fix_gcc_sa_%i_%s_%s_features_%icore_%i_%s_%i.pkl' % (
                    sa_n_iter, key, feature_type, num_threads, trials, args.search_size, save_ind)

        if os.path.exists(pickle_file):
            print('File exists', pickle_file)
            continue

        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=32,
                                       sa_n_iter=sa_n_iter)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event,
                   save_features=save_features,
                   random=random)

        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("c"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], KH, KW)).transpose((0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))

        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)

        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape, c_tvm.asnumpy(), rtol=1e-2)
        except AssertionError:
            print('WARNING: Not equal!')

        evaluator = func.time_evaluator(func.entry_name, ctx, repeat=3, number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))

        os.remove(log_filename)
        #print(tvm.lower(s, arg_bufs, simple_mode=True))

        if save_features:
            with open(pickle_file, 'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
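# The pickled artifacts above can be reloaded later for offline analysis. A
# short sketch of the read side ('features.pkl' is a placeholder for one of
# the pickle_file paths constructed above):
import pickle

with open('features.pkl', 'rb') as f:
    best_config, task, saved_features = pickle.load(f)
print(best_config)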
def build_conv2d_module(opts):
    batch = 1
    in_channel = 3
    out_channel = 16
    in_size = 8
    kernel = 3
    pad = 1
    stride = 1

    A = relay.var('A', shape=(batch, in_channel, in_size, in_size))
    W = relay.var('W', shape=(out_channel, in_channel, kernel, kernel))
    B = relay.nn.conv2d(A, W,
                        strides=(stride, stride),
                        padding=(pad, pad),
                        kernel_size=(kernel, kernel),
                        data_layout='NCHW',
                        kernel_layout='OIHW',
                        out_layout='',
                        out_dtype='')

    a_data = np.random.uniform(size=(batch, in_channel, in_size, in_size)).astype('float32')
    w_data = np.random.uniform(size=(out_channel, in_channel, kernel, kernel)).astype('float32')

    func = relay.Function([A, W], B)
    params = {"W": w_data}
    graph, lib, params = relay.build_module.build(tvm.IRModule.from_expr(func),
                                                  target=TARGET, params=params)

    build_dir = os.path.abspath(opts.out_dir)
    if not os.path.isdir(build_dir):
        os.makedirs(build_dir)

    lib.save(os.path.join(build_dir, 'conv2d_model.o'))
    with open(os.path.join(build_dir, 'conv2d_graph.json'), 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open(os.path.join(build_dir, 'conv2d_params.bin'), 'wb') as f_params:
        f_params.write(relay.save_param_dict(params))
    with open(os.path.join(build_dir, "conv2d_data.bin"), "wb") as fp:
        fp.write(a_data.astype(np.float32).tobytes())

    ## get TVM result on local machine
    params = {"W": w_data}
    local_target = 'llvm --system-lib'
    graph, lib, params = relay.build_module.build(tvm.IRModule.from_expr(func),
                                                  target=local_target, params=params)
    tvm_out = run_conv2d_module(a_data, graph, lib, params, target=local_target)
    b_np = conv2d_nchw_python(a_data, w_data, (stride, stride), (pad, pad))
    print("TVM Output: " + str(tvm_out.shape))
    print("Numpy Output: " + str(b_np.shape))
    np.testing.assert_allclose(b_np, tvm_out, rtol=1e-2)
    with open(os.path.join(build_dir, "conv2d_output.bin"), "wb") as fp:
        fp.write(tvm_out.astype(np.float32).tobytes())
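# `run_conv2d_module` is defined elsewhere. A plausible sketch of such a
# helper using the (pre-0.7) graph runtime API, under the assumption that it
# simply executes the built graph once and returns the output as NumPy:
import tvm
from tvm.contrib import graph_runtime

def run_conv2d_module(a_data, graph, lib, params, target=None):
    ctx = tvm.cpu()
    module = graph_runtime.create(graph, lib, ctx)
    module.set_input('A', tvm.nd.array(a_data.astype('float32')))
    module.set_input(**params)   # bind the 'W' parameter
    module.run()
    return module.get_output(0).asnumpy()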
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, key, measure_option, tuner, early_stopping):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    origin_layout = 'NCHW'

    if len(sys.argv) > 2:
        feature_type = sys.argv[2]
    else:
        #feature_type = 'datavol'
        feature_type = 'itervar'
        #feature_type = 'datavol_itervar'
    print('Feature:', feature_type)

    if len(sys.argv) > 3:
        if 'small' == sys.argv[3]:
            func_create = 'conv2d_NCHW_small.x86'
        elif 'wide' == sys.argv[3]:
            func_create = 'conv2d_NCHW_wide.x86'
        else:
            func_create = 'conv2d_NCHWc.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    if len(sys.argv) > 4:
        count = int(sys.argv[4])
    else:
        count = 1

    if len(sys.argv) > 5:
        likwid_event = sys.argv[5]
    else:
        likwid_event = None

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = True
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        log_filename = '%s_%i_%s_%s_%icore_rand.log' % (
            key, i, feature_type, sys.argv[3], num_threads)
        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=32)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event)

        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("llvm -mcpu=core-avx2"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], KH, KW)).transpose((0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))

        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)

        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape, c_tvm.asnumpy(), rtol=1e-2)
        except AssertionError:
            print('WARNING: Not equal!')

        evaluator = func.time_evaluator(func.entry_name, ctx, repeat=3, number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))

        os.remove(log_filename)
        print(tvm.lower(s, arg_bufs, simple_mode=True))

        if likwid_event is not None:
            with open('data/likwid_rand_%s_%s_features_%icore_%i_%s.pkl' %
                      (key, feature_type, num_threads, trials, sys.argv[3]),
                      'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
        else:
            with open('data/%s_%s_features_%icore_%i_%s.pkl' %
                      (key, feature_type, num_threads, trials, sys.argv[3]),
                      'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
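# The raw sys.argv indexing above is fragile (e.g. sys.argv[3] is read inside
# the loop even when fewer arguments were given). A sketch of the same options
# expressed with argparse (flag names and defaults are assumptions, not taken
# from the original script):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--feature-type', default='itervar')
parser.add_argument('--search-size', default='')
parser.add_argument('--count', type=int, default=1)
parser.add_argument('--likwid-event', default=None)
args = parser.parse_args()
print(args.feature_type, args.search_size, args.count, args.likwid_event)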
def main_body(feature_type, source, target, history_file, n_trial=1000):
    assert source in conv_configs, "Source '{}' is not a convolution config.".format(source)
    assert target in conv_configs, "Target '{}' is not a convolution config.".format(target)
    assert os.path.exists(history_file), "History file '{}' does not exist.".format(history_file)

    # filename = 'conv2d_{}'.format(feature_type)
    filename = 'conv2d_transfer_{}_{}_{}'.format(feature_type, source, target)
    # NOTE: Important to use `fresh(filename, ...)` to prevent file overwriting.
    log_file = fresh(filename, 'log')
    dump_file = fresh(filename, 'txt')
    # NOTE: Dump file will contain info from later iterations.
    # logging.getLogger('autotvm').addHandler(logging.FileHandler(dump_file))

    # Target for transfer learning.
    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[target]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding),
                               target='cuda')
    print(task.config_space)

    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)

    # Load source history log file for transfer learning.
    print("Load history for transfer learning.")
    history_data = autotvm.record.load_from_file(history_file)
    # NOTE: `load_from_file` returns a generator.
    # Convert to a list to prevent accidental consumption.
    history_data = list(history_data)
    tuner.load_history(history_data)

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=run_timeout))

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            # ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Choose a large repeat number to reduce noise.
    repeat_number = 1000
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
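# `fresh` is referenced but not defined here. The comments say its job is to
# avoid clobbering earlier runs, so a minimal sketch might append an
# increasing counter until the name is unused (hypothetical implementation):
import os

def fresh(stem, ext):
    candidate = '{}.{}'.format(stem, ext)
    i = 1
    while os.path.exists(candidate):
        candidate = '{}_{}.{}'.format(stem, i, ext)
        i += 1
    return candidate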
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, log_filename, measure_option, tuner, early_stopping):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    origin_layout = 'NCHW'

    func_create = 'conv2d_NCHW_dv.x86'
    #func_create = 'conv2d_nchw_spatial_pack.dv.x86'
    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1, 'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = False

    # Uncomment to run the x86 NCHWc script instead.
    #func_create = 'conv2d_NCHWc.x86'
    #task = autotvm.task.create(func_create,
    #                           args=(data, kernel, strides, padding, 1,
    #                                 origin_layout, origin_layout, 'float32'),
    #                           target='llvm -mcpu=core-avx2')
    #using_NCHWc = True

    #task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    #print(len(task.config_space))
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    if len(sys.argv) > 1:
        feature_type = sys.argv[1]
    else:
        feature_type = 'datavol'
        #feature_type = 'itervar'
        #feature_type = 'datavol_itervar'
    print('Feature:', feature_type)

    for i in range(1):
        tuner = autotvm.tuner.XGBTuner(task,
                                       feature_type=feature_type,
                                       loss_type='rank',
                                       plan_size=8)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event='CACHE')
        #with open('data/%s_features_1core_%i_n%i_%i.pkl' % (feature_type, H, N, trials), 'wb') as output:
        #with open('data/likwid_test.pkl', 'wb') as output:
        #    pickle.dump([task, tuner.cost_model.saved_features], output, pickle.HIGHEST_PROTOCOL)

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=core-avx2"):
            s, arg_bufs = task.func(*task.args)
            func = tvm.build(s, arg_bufs)
            #print(tvm.lower(s, arg_bufs, simple_mode=True))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    if using_NCHWc:
        a_np = a_np.reshape((N, 8, H, W, CI // 8))

    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=np.float32), ctx=ctx)
    func(c_tvm, w_tvm, a_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print(evaluator(c_tvm, w_tvm, a_tvm))
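# conv2d_nchw_python produces an output whose spatial size follows the usual
# convolution arithmetic, OH = (H + 2*pad - KH) // stride + 1 (likewise for
# OW), which is what makes `c_np.shape` the right shape for `c_tvm` above.
# A tiny worked check with illustrative numbers:
h, kh, pad, stride = 7, 3, 1, 1
oh = (h + 2 * pad - kh) // stride + 1
assert oh == 7   # "same" padding: 3x3 kernel, pad 1, stride 1 keeps the height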
def main_body(feature_type, conv_config, n_trial):
    filename = 'conv2d_{}_{}_n{}'.format(conv_config, feature_type, n_trial)
    # log_file = '{}.log'.format(filename)
    # dump_file = '{}.txt'.format(filename)
    # NOTE: Important to use `fresh(filename, ...)` to prevent file overwriting.
    log_file = fresh(filename, 'log')
    dump_file = fresh(filename, 'txt')
    # NOTE: Result directory experiments.
    # results_dir = fresh_dir('results')
    # log_file = fresh(os.path.join(results_dir, filename), 'log')
    # dump_file = fresh(os.path.join(results_dir, filename), 'txt')
    # # log_file = os.path.join(results_dir, log_file)
    # # dump_file = os.path.join(results_dir, dump_file)
    # NOTE: Dump file will contain info from later iterations.
    # logging.getLogger('autotvm').addHandler(logging.FileHandler(dump_file))

    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[conv_config]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding),
                               target='cuda')
    print(task.config_space)

    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=run_timeout))

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    #########################################################################
    # Finally we can inspect the best config from the log file, check
    # correctness, and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            # ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Choose a large repeat number to reduce noise.
    repeat_number = 1000
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
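# `time_evaluator` runs the kernel `number` times per measurement and returns
# a result carrying both the mean and the individual timings. A short sketch
# reading both, using the same objects as above (field names as in mainline
# TVM's ProfileResult; older versions may differ):
prof = evaluator(a_tvm, w_tvm, c_tvm)
print(prof.mean)      # average seconds per kernel invocation
print(prof.results)   # per-repeat measurements, also in seconds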
def main(feature_type):
    conv_config = 'c12'
    filename = 'conv2d_{}'.format(feature_type)
    log_file = '{}.log'.format(filename)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    # C12: the last conv layer in resnet.
    N, H, W, CO, CI, KH, KW, strides, padding = conv_configs[conv_config]
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides, padding),
                               target='cuda')
    print(task.config_space)

    tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type)
    n_trial = 1000
    repeat_number = 1000

    # Specify operator measuring options.
    run_timeout = 30
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=run_timeout))

    # Begin tuning. Log records to `log_file`.
    tuner.tune(n_trial=n_trial,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(log_file):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            ir = tvm.lower(s, arg_bufs, simple_mode=True)
            # print(ir)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
    func(a_tvm, w_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Choose a large repeat number to reduce noise.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=repeat_number)
    print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
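# `conv_configs` is defined elsewhere. Based on the comment above ("C12: the
# last conv layer in resnet") and the common autotvm tutorial workload, a
# plausible entry looks like this (the values are an assumption, not taken
# from the actual table):
conv_configs = {
    # (N, H, W, CO, CI, KH, KW, strides, padding)
    'c12': (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
}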