def test_copy_grad():
    """Run ``check_grad`` on a function that copies a (10, 4) float64 tensor."""
    input_type = relay.TensorType((10, 4), "float64")
    inp = relay.var("data", input_type)
    copied = relay.copy(inp)
    check_grad(relay.Function([inp], copied))
def test_wildcard_match_solo():
    """A match with a single wildcard clause returning the scrutinee keeps the value.

    The function is applied to ``s(s(s(z())))`` and the evaluated result
    must have a ``count`` of 3.
    """
    scrutinee = relay.Var("x", nat())
    passthrough_clause = relay.Clause(relay.PatternWildcard(), scrutinee)
    match_body = relay.Match(scrutinee, [passthrough_clause])
    identity = relay.Function([scrutinee], match_body, nat())

    three = s(s(s(z())))
    result = intrp.evaluate(identity(three))
    assert count(result) == 3
def test_iterate():
    """Evaluate ``iterate(double, 2)`` applied to ``make_nat_expr(3)`` and expect a count of 12."""
    iterated = iterate(double, relay.const(2))
    applied = relay.Call(iterated, [make_nat_expr(3)])
    # Wrap the expression in a zero-argument function and call it to evaluate.
    thunk = relay.Function([], applied)
    result = intrp.evaluate(thunk())
    assert count(result) == 12
def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug):
    """Test AutoTune for microTVM Zephyr.

    Builds a single conv2d Relay model, tunes every extracted AutoTVM task
    with a GA tuner, then builds the model twice (once untuned, once with the
    tuning log applied) and checks that both builds produce matching outputs
    for the same random input.

    Boards other than ``qemu_x86`` are marked as expected failures.
    """
    # Only qemu_x86 is expected to pass; any other board is xfail-ed up front.
    if board != "qemu_x86":
        pytest.xfail(f"Autotune fails on {board}.")

    runtime = Runtime("crt", {"system-lib": True})
    model = test_utils.ZEPHYR_BOARDS[board]
    build_config = {"debug": tvm_debug}

    # Create a Relay model
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    # Random float32 samples matching the declared data / weight shapes.
    data_sample = np.random.rand(data_shape[0], data_shape[1], data_shape[2], data_shape[3]).astype("float32")
    weight_sample = np.random.rand(weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3]).astype("float32")
    # The second parameter of main (the weight) is supplied as a build-time param.
    params = {mod["main"].params[1].name_hint: weight_sample}

    target = tvm.target.target.micro(model)
    pass_context = tvm.transform.PassContext(
        opt_level=3, config={"tir.disable_vectorize": True})
    with pass_context:
        tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target)
    assert len(tasks) > 0

    # QEMU boards get an explicit main stack size; other boards leave it unset.
    config_main_stack_size = None
    if test_utils.qemu_boards(board):
        config_main_stack_size = 1536

    project_options = {
        "zephyr_board": board,
        "west_cmd": west_cmd,
        "verbose": 1,
        "project_type": "host_driven",
    }
    if config_main_stack_size is not None:
        project_options["config_main_stack_size"] = config_main_stack_size

    module_loader = tvm.micro.AutoTvmModuleLoader(
        template_project_dir=test_utils.TEMPLATE_PROJECT_DIR,
        project_options=project_options,
    )

    # The same timeout is used for both the builder and the runner.
    timeout = 200
    builder = tvm.autotvm.LocalBuilder(
        timeout=timeout,
        n_parallel=1,
        build_kwargs={"build_option": {
            "tir.disable_vectorize": True
        }},
        do_fork=True,
        build_func=tvm.micro.autotvm_build_func,
        runtime=runtime,
    )
    runner = tvm.autotvm.LocalRunner(number=1, repeat=1,
                                     timeout=timeout, module_loader=module_loader)

    measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

    # Start from a clean log so only records from this run are present.
    log_path = pathlib.Path("zephyr_autotune.log")
    if log_path.exists():
        log_path.unlink()

    n_trial = 10
    for task in tasks:
        tuner = tvm.autotvm.tuner.GATuner(task)
        tuner.tune(
            n_trial=n_trial,
            measure_option=measure_option,
            callbacks=[
                tvm.autotvm.callback.log_to_file(str(log_path)),
                tvm.autotvm.callback.progress_bar(n_trial, si_prefix="M"),
            ],
            si_prefix="M",
        )
        assert tuner.best_flops > 0

    check_tune_log(log_path)

    # Build without tuning
    with pass_context:
        lowered = tvm.relay.build(mod, target=target, runtime=runtime, params=params)

    # NOTE(review): this rebinds the `temp_dir` argument, so the value passed
    # in by the caller is never used -- confirm that is intended.
    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered, build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered.get_graph_json(), session.get_system_lib(), session.device)
        graph_mod.set_input(**lowered.get_params())
        graph_mod.run(data=data_sample)
        expected_output = graph_mod.get_output(0).numpy()
        del graph_mod

    # Build using autotune logs
    with tvm.autotvm.apply_history_best(str(log_path)):
        with pass_context:
            lowered_tuned = tvm.relay.build(mod, target=target, runtime=runtime, params=params)

    # A fresh temporary directory is used for the tuned build's session.
    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered_tuned, build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered_tuned.get_graph_json(), session.get_system_lib(), session.device)
        graph_mod.set_input(**lowered_tuned.get_params())
        graph_mod.run(data=data_sample)
        output = graph_mod.get_output(0).numpy()
        del graph_mod

    # Tuned and untuned builds must agree on the same input.
    tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5)
def expected():
    """Return the reference function: conv2d with padding (1, 1) followed by a reshape to (32, 16, 16)."""
    data = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
    kernel = relay.var("w", shape=(32, 16, 3, 3), dtype="float32")
    conv = relay.nn.conv2d(data, kernel, padding=(1, 1))
    reshaped = relay.reshape(conv, newshape=(32, 16, 16))
    return relay.Function([data, kernel], reshaped)
def fuse_partitions(pre_mod, mid_mod, post_mod):
    """Merge prefix, middle, and suffix modules into one module.

    Besides the three partition functions, the result gets a new `main`
    that calls them in sequence.

    Parameters
    ----------
    pre_mod : tvm.IRModule
        Module whose `main` quantizes the inputs

    mid_mod : tvm.IRModule
        Module whose `main` is the core quantized inference function

    post_mod : tvm.IRModule
        Module whose `main` dequantizes the outputs

    Returns
    -------
    fused_mod : tvm.IRModule
        Module holding `quantize_inputs`, `quantized_main`,
        `dequantize_outputs`, and a `main` chaining all three
    """
    quantize_fn = pre_mod["main"]
    core_fn = mid_mod["main"]
    dequantize_fn = post_mod["main"]

    # Register each partition under its own global name.
    fused_mod = tvm.IRModule(
        functions={
            relay.GlobalVar("quantize_inputs"): quantize_fn,
            relay.GlobalVar("quantized_main"): core_fn,
            relay.GlobalVar("dequantize_outputs"): dequantize_fn,
        }
    )

    # Build a `main` that strings the partitions together so that it behaves
    # like `main` of an *unpartitioned* module.
    builder = relay.ScopeBuilder()
    main_params = [relay.Var(p.name_hint) for p in quantize_fn.params]

    quantize_call = relay.Call(fused_mod.get_global_var("quantize_inputs"), main_params)
    quantized_inputs = builder.let("quantized_inputs", quantize_call)

    core_gv = fused_mod.get_global_var("quantized_main")
    # The prefix returns a tuple; pass each field to the core as its own argument.
    field_count = len(quantize_fn.ret_type.fields)
    core_args = [relay.TupleGetItem(quantized_inputs, idx) for idx in range(field_count)]
    quantized_outputs = builder.let("quantized_outputs", relay.Call(core_gv, core_args))

    dequantize_call = relay.Call(
        fused_mod.get_global_var("dequantize_outputs"), [quantized_outputs]
    )
    dequantized_outputs = builder.let("dequantized_outputs", dequantize_call)

    builder.ret(dequantized_outputs)
    fused_mod["main"] = relay.Function(main_params, builder.get())
    return fused_mod
def get_network(name, batch_size, dtype='float32'):
    """Build a benchmark network together with its parameters and I/O shapes.

    Parameters
    ----------
    name: str
        Network name, e.g. 'resnet-18', 'resnet-50', 'vgg-16',
        'inception_v3', 'mobilenet', 'densenet-121', 'squeezenet_v1.1'
        or 'mxnet'. For the dash-separated names the part after the dash
        is parsed as an integer; for squeezenet the part after '_v' is the
        version string.
    batch_size: int
        Batch size
    dtype: str
        Data type

    Returns
    -------
    net:
        The network definition returned by the selected workload builder
    params: dict
        The parameters returned alongside the network
    input_shape: tuple
        (batch_size, 3, 224, 224), or (batch_size, 3, 299, 299) for
        'inception_v3'
    output_shape: tuple
        (batch_size, 1000)

    Raises
    ------
    ValueError
        If `name` matches none of the supported networks.
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)
    shared = {"batch_size": batch_size, "dtype": dtype}

    if name == 'mobilenet':
        net, params = testing.mobilenet.get_workload(**shared)
    elif name == 'inception_v3':
        input_shape = (batch_size, 3, 299, 299)
        net, params = testing.inception_v3.get_workload(**shared)
    elif "resnet" in name:
        depth = int(name.split('-')[1])
        net, params = testing.resnet.get_workload(num_layers=depth, **shared)
    elif "vgg" in name:
        depth = int(name.split('-')[1])
        net, params = testing.vgg.get_workload(num_layers=depth, **shared)
    elif "densenet" in name:
        depth = int(name.split('-')[1])
        net, params = testing.densenet.get_workload(densenet_size=depth, **shared)
    elif "squeezenet" in name:
        squeezenet_version = name.split("_v")[1]
        net, params = testing.squeezenet.get_workload(version=squeezenet_version, **shared)
    elif name == 'mxnet':
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        gluon_block = get_model('resnet18_v1', pretrained=True)
        net, params = relay.frontend.from_mxnet(
            gluon_block, shape={'data': input_shape}, dtype=dtype
        )
        # Re-wrap main so that a softmax is applied to the original body.
        main_fn = net["main"]
        with_softmax = relay.Function(
            main_fn.params,
            relay.nn.softmax(main_fn.body),
            None,
            main_fn.type_params,
            main_fn.attrs,
        )
        net = tvm.IRModule.from_expr(with_softmax)
    else:
        raise ValueError("Unsupported network: " + name)

    return net, params, input_shape, output_shape
def test_match_func_attr():
    """A wildcard constrained to ``Composite == "add"`` matches a function carrying that attribute."""
    composite_add = wildcard().has_attr({"Composite": "add"})

    lhs = relay.var("x")
    rhs = relay.var("y")
    add_fn = relay.Function([lhs, rhs], lhs + rhs)
    tagged_fn = add_fn.with_attr("Composite", "add")

    assert composite_add.match(tagged_fn)
def verify_sum_grad(d_shape, axis=None, keepdims=False, exclude=False):
    """Run ``check_grad`` on ``relay.sum`` over a float32 tensor of shape ``d_shape``.

    ``axis``, ``keepdims`` and ``exclude`` are forwarded unchanged to
    ``relay.sum``.
    """
    inp = relay.var("data", relay.TensorType(d_shape, "float32"))
    summed = relay.sum(inp, axis=axis, keepdims=keepdims, exclude=exclude)
    check_grad(relay.Function([inp], summed))
def test_quadruple_partition_dominator():
    """Partition four chained diamond-like subgraphs with one dominator pattern.

    Four builders (classic diamond, deeper branch, single branch, nested
    diamond) are chained on the same weight. The partitioned result must be
    structurally equal to four chained functions, each tagged with the
    expected ``PartitionedFromPattern`` name.
    """
    # Dominator pattern: conv2d parent, elementwise-or-add path, add reduction.
    conv_pattern = is_op("nn.conv2d")(wildcard(), wildcard())
    elemwise_call = wildcard().has_attr({"TOpPattern": K_ELEMWISE})(wildcard())
    path_pattern = elemwise_call | is_op("add")(wildcard(), wildcard())
    reduction_pattern = is_op("add")(wildcard(), wildcard())
    diamond = dominates(conv_pattern, path_pattern, reduction_pattern)

    inp = relay.var("input")
    weight = relay.var("weight")

    def classic_diamond(inp, weight):
        """conv2d -> relu -> relu on one side, leaky_relu on the other, then add."""
        conv = relay.op.nn.conv2d(inp, weight)
        left = relay.op.nn.relu(relay.op.nn.relu(conv))
        right = relay.op.nn.leaky_relu(conv, alpha=0)
        return left + right

    def deeper_diamond(inp, weight):
        """Like the classic diamond, with an extra tanh on the relu branch."""
        conv = relay.op.nn.conv2d(inp, weight)
        left = relay.op.nn.relu(relay.op.nn.relu(conv))
        left = relay.op.tanh(left)
        right = relay.op.nn.leaky_relu(conv, alpha=0)
        return left + right

    def single_branch(inp, weight):
        """One chain whose tail (tanh) is added back to its own input."""
        conv = relay.op.nn.conv2d(inp, weight)
        chain = relay.op.nn.relu(relay.op.nn.relu(conv))
        tail = relay.op.tanh(chain)
        return chain + tail

    def nested_diamond(inp, weight):
        """A branch that contains its own add before joining leaky_relu."""
        conv = relay.op.nn.conv2d(inp, weight)
        activated = relay.op.nn.relu(conv)
        doubled = activated + activated
        left = relay.op.tanh(doubled)
        right = relay.op.nn.leaky_relu(conv, alpha=0)
        return left + right

    builders = [classic_diamond, deeper_diamond, single_branch, nested_diamond]
    partition_names = [
        "nn.conv2d_nn.relu_nn.relu_nn.leaky_relu_add_",
        "nn.conv2d_nn.relu_nn.relu_tanh_nn.leaky_relu_add_",
        "nn.conv2d_nn.relu_nn.relu_tanh_add_",
        "nn.conv2d_nn.relu_add_tanh_nn.leaky_relu_add_",
    ]

    # Chain the raw builders: each one consumes the previous output.
    chained = inp
    for build in builders:
        chained = build(chained, weight)
    partitioned = diamond.partition(chained)

    # Expected result: one tagged function per builder, over fresh variables.
    functions = []
    for build, tag in zip(builders, partition_names):
        fn_inp = relay.var("input")
        fn_weight = relay.var("weight")
        fn = relay.Function([fn_inp, fn_weight], build(fn_inp, fn_weight))
        functions.append(fn.with_attr("PartitionedFromPattern", tag))

    reference = inp
    for fn in functions:
        reference = fn(reference, weight)

    assert tvm.ir.structural_equal(partitioned, reference)
def test_partition_constant_embedding():
    """Check when a matched weight is lifted to a parameter or embedded in the partition.

    Two graphs are partitioned with a conv2d -> bias_add -> relu pattern whose
    weight sub-pattern varies: ``relu`` uses a variable weight, ``reluc`` a
    constant weight. Each case states the expected result for both graphs:

    * ``lifted_func``: the weight is a parameter of the partitioned function.
    * ``embedded_func``: the constant weight stays inside the partitioned
      function, so it takes only data and bias.
    * the unpartitioned graph: the pattern did not match.
    """
    partition_attr = ("PartitionedFromPattern", "nn.conv2d_nn.bias_add_nn.relu_")

    x = relay.var("x")
    w = relay.var("w")
    wc = relay.const(1)
    b = relay.var("b")

    # Expected function when the constant weight is embedded. It has no weight
    # parameter, so no weight variable is created for it.
    xf = relay.var("x")
    bf = relay.var("b")
    embedded_func = relay.Function([xf, bf], conv_bias_relu(xf, wc, bf)).with_attr(
        *partition_attr
    )

    # Expected function when the weight is lifted to a parameter.
    xf = relay.var("x")
    wf = relay.var("w")
    bf = relay.var("b")
    lifted_func = relay.Function([xf, wf, bf], conv_bias_relu(xf, wf, bf)).with_attr(
        *partition_attr
    )

    relu = conv_bias_relu(x, w, b)
    reluc = conv_bias_relu(x, wc, b)

    def make_pattern(weight_pattern):
        """Build the conv2d -> bias_add -> relu pattern with the given weight sub-pattern."""
        conv = is_op("nn.conv2d")(wildcard(), weight_pattern)
        return is_op("nn.relu")(is_op("nn.bias_add")(conv, wildcard()))

    def assert_partitions(pattern, expected_for_var, expected_for_const):
        """Partition both graphs with ``pattern`` and compare against the expectations."""
        assert tvm.ir.structural_equal(expected_for_var, pattern.partition(relu))
        assert tvm.ir.structural_equal(expected_for_const, pattern.partition(reluc))

    # Check lifting of wildcard matches
    assert_partitions(make_pattern(wildcard()), lifted_func(x, w, b), lifted_func(x, wc, b))

    # Check lifting of input matches; constants are not inputs, so reluc is untouched
    assert_partitions(make_pattern(is_var()), lifted_func(x, w, b), reluc)

    # Check embedding of constant matches
    assert_partitions(make_pattern(is_constant()), relu, embedded_func(x, b))

    # Check embedding of constant ExprPatterns
    assert_partitions(make_pattern(is_expr(wc)), relu, embedded_func(x, b))

    # Check lifting/embedding of Alt matches
    assert_partitions(
        make_pattern(is_var() | is_constant()), lifted_func(x, w, b), embedded_func(x, b)
    )

    # Check lifting/embedding of Alt matches with the other ordering
    assert_partitions(
        make_pattern(is_constant() | is_var()), lifted_func(x, w, b), embedded_func(x, b)
    )
def get_net(include_bn=True, include_sigmoid=False):
    """Stack two ``get_blocks`` blocks on a (1, 3, 224, 224) float32 input.

    ``include_bn`` only reaches the first block; the second block always
    receives ``False`` in that position. ``include_sigmoid`` is passed to
    both. The returned function is parameterised over the free variables of
    the second block.
    """
    input_type = relay.TensorType((1, 3, 224, 224), "float32")
    inp = relay.var("data", input_type)
    first = get_blocks("block1_", inp, 3, 8, include_bn, include_sigmoid)
    # The second block is always conv + relu, to make it more interesting
    second = get_blocks("block2_", first, 8, 8, False, include_sigmoid)
    params = relay.analysis.free_vars(second)
    return relay.Function(params, second)
def make_ethosn_composite(ethosn_expr, name):
    """Wrap ``ethosn_expr`` in a function tagged ``Composite=name`` and call it.

    The call's arguments are the free variables of ``ethosn_expr``.

    NOTE(review): the wrapping function declares a single fresh parameter
    ``Var("a")`` rather than the free variables themselves -- confirm this is
    what downstream consumers expect.
    """
    call_args = relay.analysis.free_vars(ethosn_expr)
    composite_fn = relay.Function([relay.Var("a")], ethosn_expr).with_attr("Composite", name)
    return relay.Call(composite_fn, call_args)
def make_module(func, params):
    """Turn an expression into an IRModule, optionally binding parameters by name.

    Parameters
    ----------
    func : relay.Expr
        The expression to wrap. Its free variables become the parameters of
        the generated function.
    params : dict of str to array, or None
        Values to bind to same-named function parameters. Falsy values
        (``None`` or an empty dict) skip binding.

    Returns
    -------
    tvm.IRModule
        Module built from the (possibly parameter-bound) function.
    """
    func = relay.Function(relay.analysis.free_vars(func), func)
    if params:
        # bind_params_by_name returns a new function rather than mutating its
        # argument, so the result has to be kept or the binding is lost.
        func = relay.build_module.bind_params_by_name(func, params)
    return tvm.IRModule.from_expr(func)
def create_model():
    """Return a module whose main applies a "CLZ" unary elementwise op to an int32 ``ifm`` input."""
    inp = relay.var("ifm", shape=ifm_shape, dtype="int32")
    clz_op = infra.make_ethosu_unary_elementwise(inp, 4, "CLZ")
    main_fn = relay.Function([inp], clz_op)
    return tvm.IRModule.from_expr(main_fn)
def before():
    """Return a function wrapping a conv2d with ``groups=32`` on a (1, 32, 56, 56) input."""
    data = relay.var("x", shape=(1, 32, 56, 56))
    kernel = relay.var("w", shape=(32, 1, 3, 3))
    conv = relay.nn.conv2d(
        data,
        kernel,
        padding=(1, 1),
        channels=32,
        kernel_size=(3, 3),
        groups=32,
    )
    return relay.Function(analysis.free_vars(conv), conv)
channels=1, padding=(0, 0)) simple_net = relay.nn.relu(simple_net) simple_net = relay.nn.avg_pool2d(simple_net, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) simple_net = relay.nn.relu(simple_net) simple_net = relay.nn.conv2d(simple_net, weight=conv2_weight, kernel_size=(2, 2), channels=1, padding=(0, 0)) print("----------TEST2----------") node = relay.analysis.free_vars(simple_net) simple_net = relay.Function(node, simple_net) print("----------TEST3----------") net, params = testing.create_workload(simple_net) print("-----NET.ASTEXT----------") print(net.astext(show_meta_data=False)) print("----------TEST4----------") opt_level = 0 target = tvm.target.cuda() with relay.build_config(opt_level=opt_level): graph, lib, params = relay.build_module.build(net, target, params=params) print("----------TEST5----------") ctx = tvm.gpu() #data = np.array([[[(1,2,3,4),(2,3,4,1),(3,4,1,2),(4,1,2,3)]]]).astype("float32") data = np.array([[[(1, 2, 3, 4, 5, 6, 7, 8), (2, 3, 4, 5, 6, 7, 8, 9),
def before():
    """Return a function applying ``global_max_pool2d`` to a (1, 64, 56, 56) input."""
    data = relay.var("x", shape=(1, 64, 56, 56))
    pooled = relay.nn.global_max_pool2d(data)
    return relay.Function([data], pooled)
def test_byoc_microtvm(board, arduino_cli_cmd, tvm_debug, workspace_dir):
    """This is a simple test case to check BYOC capabilities of microTVM.

    The graph has three branches over a shared (10, 10) input ``x``: two
    add -> subtract -> multiply chains and one add -> subtract chain,
    concatenated along axis 0. The function is annotated with
    ``byoc.CcompilerAnnotator``, partitioned, and handed to ``check_result``
    with a NumPy reference of the same computation.
    """
    model = test_utils.ARDUINO_BOARDS[board]
    build_config = {"debug": tvm_debug}

    num_weights = 8
    x = relay.var("x", shape=(10, 10))
    weights = [relay.var("w{}".format(i), shape=(10, 10)) for i in range(num_weights)]

    # C compiler
    branch0 = relay.multiply(relay.subtract(relay.add(x, weights[0]), weights[1]), weights[2])
    branch1 = relay.multiply(relay.subtract(relay.add(x, weights[3]), weights[4]), weights[5])

    # Other parts on TVM
    branch2 = relay.subtract(relay.add(x, weights[6]), weights[7])

    joined = relay.concatenate((branch0, branch1, branch2), axis=0)
    func = relay.Function([x] + weights, joined)

    mod = tvm.IRModule()
    annotator = byoc.CcompilerAnnotator()
    mod["main"] = annotator.visit(func)
    mod = tvm.relay.transform.PartitionGraph()(mod)
    mod = tvm.relay.transform.InferType()(mod)

    # Draw x first, then the weights in index order.
    x_data = np.random.rand(10, 10).astype("float32")
    w_data = [np.random.rand(10, 10).astype("float32") for _ in range(num_weights)]

    map_inputs = {"w{}".format(i): sample for i, sample in enumerate(w_data)}
    map_inputs["x"] = x_data

    reference = np.concatenate(
        (
            ((x_data + w_data[0]) - w_data[1]) * w_data[2],
            ((x_data + w_data[3]) - w_data[4]) * w_data[5],
            x_data + w_data[6] - w_data[7],
        ),
        axis=0,
    )

    check_result(
        relay_mod=mod,
        map_inputs=map_inputs,
        out_shape=(30, 10),
        result=reference,
        model=model,
        build_config=build_config,
        arduino_board=board,
        arduino_cli_cmd=arduino_cli_cmd,
        workspace_dir=workspace_dir,
    )
def expected():
    """Return the reference function ``device_copy(x + y) - z``.

    The ``device_copy`` call receives ``ctx2`` and ``ctx1``, in that order,
    after the value to copy.
    """
    total = relay.add(x, y)
    copied_total = relay.device_copy(total, ctx2, ctx1)
    difference = relay.subtract(copied_total, z)
    return relay.Function([x, y, z], difference)
def to_relay(graph, shape_dict, dtype_dict, params):
    """Convert an NNVM graph into the corresponding Relay expression.

    Parameters
    ----------
    graph : Graph
       The input graph. A ``Symbol`` is accepted too and is converted with
       ``graph_create`` first.

    shape_dict : dict of str to shape
       The input shapes. The caller's dict is not modified; a copy is
       extended with the shapes of ``params``.

    dtype_dict : dict of str to str/dtype
       The input dtypes.

    params : dict of str to array
       The parameters.

    Returns
    -------
    (expr, params) : Tuple[relay.Expr, dict of str to array]
       The corresponding Relay function and the parameters, which are
       returned unchanged.

    Raises
    ------
    Exception
        If a node uses an operator missing from ``NNVM_OP_2_RELAY_OP``.
    """
    if isinstance(graph, Symbol):
        graph = graph_create(graph)

    # Parameter shapes take precedence over caller-supplied shapes.
    param_shapes = {key: params[key].shape for key in params}
    shape_dict = shape_dict.copy()
    shape_dict.update(param_shapes)

    graph = graph_attr.set_shape_inputs(graph, shape_dict)
    graph = graph_attr.set_dtype_inputs(graph, dtype_dict)
    graph = graph.apply(["InferShape", "InferType"])
    shape = graph.json_attr("shape")
    dtype = [graph_attr.TCODE_TO_DTYPE[code] for code in graph.json_attr("dtype")]

    gidx = graph.index
    relay_map = {}
    fn_params = []

    for nid, node in enumerate(gidx.nodes):
        # Resolve each input entry to the Relay expression that produces it.
        children = []
        for entry in node['inputs']:
            producer = relay_map[entry[0]]
            children.append(
                producer[entry[1]] if isinstance(producer, expr.TupleWrapper) else producer
            )

        first_output = gidx.entry_id(nid, 0)
        oshape = shape[first_output]
        odtype = dtype[gidx.entry_id(nid, 0)]
        attrs = node.get("attrs", {})
        node_name = node["name"]
        op_name = node["op"]

        if op_name == "null":
            # "null" nodes become the parameters of the resulting function.
            placeholder = var(node_name, shape=oshape, dtype=odtype)
            fn_params.append(placeholder)
            relay_map[nid] = placeholder
            continue

        if op_name not in NNVM_OP_2_RELAY_OP:
            raise Exception(
                "nnvm.to_relay: unsupported operator: {0}".format(op_name))

        str_attrs = StrAttrsDict(attrs)
        relay_map[nid] = NNVM_OP_2_RELAY_OP[op_name](children, str_attrs, odtype)

    outputs = []
    for nid, idx, _ in gidx.output_entries:
        produced = relay_map[nid]
        outputs.append(produced[idx] if isinstance(produced, expr.TupleWrapper) else produced)

    # A single output is returned bare; several outputs are packed in a tuple.
    body = outputs[0] if len(outputs) == 1 else expr.Tuple(outputs)

    return relay.Function(fn_params, body), params
def expected():
    """Return the reference function computing ``(x + y) - z``."""
    total = relay.add(x, y)
    difference = relay.subtract(total, z)
    return relay.Function([x, y, z], difference)
def resnet(
    units,
    num_stages,
    filter_list,
    num_classes,
    data_shape,
    bottle_neck=True,
    layout="NCHW",
    dtype="float32",
):
    """Return ResNet Program.

    Parameters
    ----------
    units : list
        Number of units in each stage

    num_stages : int
        Number of stages; must equal ``len(units)``

    filter_list : list
        Channel size of each stage. Entry 0 is used by the first
        convolution, entry ``i + 1`` by stage ``i``.

    num_classes : int
        Output size of symbol

    data_shape : tuple of int.
        The shape of input data; must have four entries.

    bottle_neck : bool
        Whether apply bottleneck transformation.

    layout: str
        The data layout for conv2d

    dtype : str
        The global data type.
    """
    data_layout = layout
    kernel_layout = "OIHW" if layout == "NCHW" else "HWIO"
    bn_axis = data_layout.index("C")
    layout_kwargs = {"data_layout": data_layout, "kernel_layout": kernel_layout}

    num_unit = len(units)
    assert num_unit == num_stages

    data = relay.var("data", shape=data_shape, dtype=dtype)
    data = layers.batch_norm_infer(
        data=data, epsilon=2e-5, axis=bn_axis, scale=False, name="bn_data"
    )

    # Height sits at index 1 for NHWC and at index 2 otherwise.
    if layout == "NHWC":
        _, height, _, _ = data_shape
    else:
        _, _, height, _ = data_shape

    if height <= 32:
        # Small inputs such as cifar10: a single 3x3 convolution.
        body = layers.conv2d(
            data=data,
            channels=filter_list[0],
            kernel_size=(3, 3),
            strides=(1, 1),
            padding=(1, 1),
            name="conv0",
            **layout_kwargs,
        )
    else:
        # Larger inputs (often 224, such as imagenet): 7x7 stride-2
        # convolution followed by batch norm, relu and max pooling.
        body = layers.conv2d(
            data=data,
            channels=filter_list[0],
            kernel_size=(7, 7),
            strides=(2, 2),
            padding=(3, 3),
            name="conv0",
            **layout_kwargs,
        )
        body = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name="bn0")
        body = relay.nn.relu(data=body)
        body = relay.nn.max_pool2d(
            data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1), layout=data_layout
        )

    for i in range(num_stages):
        stage = i + 1
        channels = filter_list[stage]
        # Only the very first stage keeps stride 1 in its leading unit.
        lead_stride = 1 if i == 0 else 2
        body = residual_unit(
            body,
            channels,
            (lead_stride, lead_stride),
            False,
            name="stage%d_unit%d" % (stage, 1),
            bottle_neck=bottle_neck,
            **layout_kwargs,
        )
        # Remaining units of the stage are numbered from 2 and use stride 1.
        for unit in range(2, units[i] + 1):
            body = residual_unit(
                body,
                channels,
                (1, 1),
                True,
                name="stage%d_unit%d" % (stage, unit),
                bottle_neck=bottle_neck,
                **layout_kwargs,
            )

    # Classifier head: bn -> relu -> global average pool -> flatten -> dense -> softmax.
    bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, axis=bn_axis, name="bn1")
    relu1 = relay.nn.relu(data=bn1)
    pool1 = relay.nn.global_avg_pool2d(data=relu1, layout=data_layout)
    flat = relay.nn.batch_flatten(data=pool1)
    fc1 = layers.dense_add_bias(data=flat, units=num_classes, name="fc1")
    net = relay.nn.softmax(data=fc1)
    return relay.Function(relay.analysis.free_vars(net), net)
def test_saturation():
    """Check that uint8 ``qnn.add`` outputs match golden values that reach 255.

    Four configurations of input and output scales are canonicalized, run on
    the llvm graph executor and compared element-wise with golden outputs.
    All zero points are 0.
    """
    data_dtype = "uint8"
    x = relay.var("x", shape=(1, 4), dtype=data_dtype)
    y = relay.var("y", shape=(1, 4), dtype=data_dtype)

    def qnn_add(lhs_scale, rhs_scale, output_scale):
        """Build ``qnn.add(x, y)`` with the given float32 scales."""
        return relay.qnn.op.add(
            lhs=x,
            rhs=y,
            lhs_scale=relay.const(lhs_scale, "float32"),
            lhs_zero_point=relay.const(0, "int32"),
            rhs_scale=relay.const(rhs_scale, "float32"),
            rhs_zero_point=relay.const(0, "int32"),
            output_scale=relay.const(output_scale, "float32"),
            output_zero_point=relay.const(0, "int32"),
        )

    def canonicalize(expr):
        """Wrap ``expr`` in a module, infer types and canonicalize the QNN ops."""
        module = tvm.IRModule.from_expr(relay.Function([x, y], expr))
        module = relay.transform.InferType()(module)
        return relay.qnn.transform.CanonicalizeOps()(module)

    def run_and_compare(func, lhs_values, rhs_values, golden_values):
        """Evaluate ``func`` on (1, 4) inputs and compare with the golden output."""
        x_data = np.array(lhs_values).reshape((1, 4))
        y_data = np.array(rhs_values).reshape((1, 4))
        golden_output = np.array(golden_values).reshape((1, 4))
        executor = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
        op_res = executor.evaluate(func)(x_data, y_data)
        np.testing.assert_equal(op_res.numpy(), golden_output)

    # Same params
    mod = canonicalize(qnn_add(0.125, 0.125, 0.125))
    func = mod["main"]
    mod = relay.transform.InferType()(mod)
    run_and_compare(func, (255, 1, 1, 0), (255, 255, 128, 0), (255, 255, 129, 0))

    # Same params, different scale
    func = canonicalize(qnn_add(0.125, 0.125, 0.25))["main"]
    run_and_compare(func, (255, 1, 1, 0), (255, 255, 127, 0), (255, 129, 65, 0))

    # Same io params, different output scale
    func = canonicalize(qnn_add(0.125, 0.125, 0.25))["main"]
    run_and_compare(func, (255, 1, 1, 0), (255, 255, 127, 0), (255, 129, 65, 0))

    # All params different
    func = canonicalize(qnn_add(0.5, 0.25, 0.125))["main"]
    run_and_compare(func, (255, 0, 1, 0), (0, 128, 64, 0), (255, 255, 132, 0))
def test_list_constructor():
    """Register a function returning ``cons(z(), nil())`` and check its body type is ``l(nat())``."""
    global_var = relay.GlobalVar("test_consz")
    singleton_list = cons(z(), nil())
    mod[global_var] = relay.Function([], singleton_list)

    registered = mod[global_var]
    assert registered.body.checked_type == l(nat())
def create_model():
    """Return a module whose main computes ``left_shift(ifm, ifm2)``."""
    lhs = relay.var("ifm", shape=ifm_shape, dtype=dtype)
    rhs = relay.var("ifm2", shape=ifm2_shape, dtype=dtype)
    shifted = relay.left_shift(lhs, rhs)
    main_fn = relay.Function([lhs, rhs], shifted)
    return tvm.IRModule.from_expr(main_fn)
def test_compose():
    """Evaluate ``compose(inc, double)`` applied to ``s(s(z()))`` and expect a count of 5.

    ``inc`` is a one-argument function that wraps its input in ``s``.
    """
    n = relay.Var("n")
    inc = relay.Function([n], s(n))
    # The previously created, never-read `x` variable has been dropped.
    composed = compose(inc, double)
    res = intrp.evaluate(relay.Call(composed, [s(s(z()))]))
    assert count(res) == 5
def create_model():
    """Return a module whose main reshapes an int8 ``ifm`` input to ``new_shape``."""
    inp = relay.var("ifm", shape=ifm_shape, dtype="int8")
    reshaped = relay.op.reshape(inp, newshape=new_shape)
    main_fn = relay.Function([inp], reshaped)
    return tvm.IRModule.from_expr(main_fn)
def tune_cutlass_function(
    func,
    use_3xtf32,
    split_k_slices,
    profile_all_alignments,
    find_first_valid,
    use_multiprocessing,
    gemm_profiler,
    conv2d_profiler,
):
    """Given a function intended to be offloaded to CUTLASS, profile each workload to select which
    kernels to emit.

    Parameters
    ----------
    func : Function
        The Relay Function to tune for.

    use_3xtf32 : bool
        Whether or not to use the slower but very accurate (compared to tf32) 3xtf32 mode for
        fp32 inputs on tensorcore.

    split_k_slices : list of int
        Split factor candidates for split-K GEMM. If split-K > 1, the GEMM K-loop is computed in
        parallel across split-K blocks, and a separate global reduction kernel is launched to
        accumulate partial reductions. The profiler will pick the best split-k factor from the
        given candidate list. Note that the larger split-K factor requires a larger workspace.
        Currently, parallel split-k has been tested only for wgrad. For GEMM and other conv2d
        kinds, split_k_slices is ignored.

    profile_all_alignments : bool
        When True, profile all kernel variants with smaller alignments than the largest possible.

    find_first_valid : bool
        Whether or not profile all candidate kernels, or stop profiling after the first applicable
        kernel is found.

    use_multiprocessing : bool
        Whether or not compile profiler executables for different kernels in parallel.

    gemm_profiler : CutlassGemmProfiler
        Profiler for dense operators. May cache results between tuned functions.

    conv2d_profiler : CutlassConv2DProfiler
        Profiler for conv2d operators. May cache results between tuned functions.

    Returns
    -------
    annot_func : Function
        The input function with attributes capturing the best CUTLASS kernel found by tuning.

    Raises
    ------
    ValueError
        If the op type is not a conv2d, batch_matmul or dense variant.
    """
    annotator = OpAnnotator()
    annotator.visit(func)
    signature = annotator.signature
    out_shape = signature["ret_shape"]
    out_dtype = signature["ret_dtype"]
    op_type = signature["op_type"]

    # Later updates win: the function's own attrs override the signature.
    new_attrs = {"op_type": op_type}
    new_attrs.update(signature)
    new_attrs.update(func.attrs)

    arg0_shape = new_attrs["arg0_shape"]
    arg1_shape = new_attrs["arg1_shape"]
    arg0_dtype = new_attrs["arg0_dtype"]
    arg1_dtype = new_attrs["arg1_dtype"]

    if "conv2d" in op_type:
        conv_attrs = annotator.op_attrs
        new_attrs["padding"] = conv_attrs.padding
        new_attrs["strides"] = conv_attrs.strides
        new_attrs["dilation"] = conv_attrs.dilation

        # Which tensors act as data and weight depends on the conv2d kind.
        if "conv2d_transpose" in op_type:
            d_shape, w_shape = out_shape, arg1_shape
        elif "conv2d_backward_weight" in op_type:
            d_shape, w_shape = arg1_shape, out_shape
        else:
            d_shape, w_shape = arg0_shape, arg1_shape

        tuned = handle_conv2d(
            conv2d_profiler,
            op_type,
            d_shape,
            w_shape,
            annotator.op_attrs.padding,
            annotator.op_attrs.strides,
            annotator.op_attrs.dilation,
            out_dtype,
            arg0_dtype,
            arg1_dtype,
            use_3xtf32,
            split_k_slices,
            profile_all_alignments,
            find_first_valid,
            use_multiprocessing,
        )
    elif "batch_matmul" in op_type or "dense" in op_type:
        # Both GEMM kinds take the same arguments; batch_matmul is checked first.
        gemm_handler = handle_batch_matmul if "batch_matmul" in op_type else handle_dense
        tuned = gemm_handler(
            gemm_profiler,
            op_type,
            arg0_shape,
            arg1_shape,
            out_dtype,
            arg0_dtype,
            arg1_dtype,
            use_3xtf32,
            find_first_valid,
            use_multiprocessing,
        )
    else:
        raise ValueError("%s unsupported composite" % op_type)

    new_attrs.update(tuned)
    dict_attrs = tvm.ir.make_node("DictAttrs", **new_attrs)
    return relay.Function(
        func.params,
        func.body,
        ret_type=func.ret_type,
        type_params=func.type_params,
        attrs=dict_attrs,
    )
def test_negative_grad():
    """Run ``check_grad`` on a function that negates a (10, 4) float32 tensor."""
    input_type = relay.TensorType((10, 4), "float32")
    inp = relay.var("data", input_type)
    negated = relay.negative(inp)
    check_grad(relay.Function([inp], negated))