import math
import tempfile

import tvm
from tvm import te, topi, auto_scheduler

# `matmul_auto_scheduler_test` is the standard matmul workload helper used by
# the auto-scheduler test suite; its import path here is assumed.
from tvm.testing.auto_scheduler import matmul_auto_scheduler_test


def record_common(dag, s):
    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    # Test in-memory record processing.
    record_str = auto_scheduler.measure_record.dump_record_to_string(inp, res)
    r_inp, r_res = auto_scheduler.measure_record.load_record_from_string(record_str)
    # Only check the workload_key for simplification.
    assert inp.task.workload_key == r_inp.task.workload_key
    assert str(res) == str(r_res)

    # Test file-based record processing.
    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)
        assert s1 == s2
        assert not (s1 == dag.get_init_state())
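# A minimal sketch (not part of the original suite) of how `record_common`
# can be driven with a trivial DAG; the test name and the relu workload are
# illustrative only.
def test_record_round_trip_sketch():
    A = te.placeholder((64, 64), name="A")
    B = topi.nn.relu(A)
    dag = auto_scheduler.ComputeDAG([A, B])
    s = dag.get_init_state()
    # Apply one transform step so the recorded state differs from the
    # initial state, which `record_common` asserts.
    s.parallel(B, s[B].iters[0])
    record_common(dag, s)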
def test_measure_target_host():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test,
        args=(512, 512, 512),
        target=tvm.target.Target("llvm", host="llvm -mtriple=aarch64-linux-gnu"),
    )

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]
        recovered_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(recovered_inp.task.target.host) == str(inp.task.target.host)
def test_recover_measure_input():
    task = auto_scheduler.SearchTask(
        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
    )

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp)
        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)

        correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp, rebuild_state=True)
        assert str(correct_inp.state) == str(inp.state)
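# A minimal sketch (not part of the original tests) showing the same record
# APIs used to recover every input from a log file; `recover_all_inputs` and
# `log_file` are hypothetical names.
def recover_all_inputs(log_file):
    inputs, _ = auto_scheduler.RecordReader(log_file).read_lines()
    return [
        auto_scheduler.measure.recover_measure_input(inp, rebuild_state=True)
        for inp in inputs
    ]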
def fequal(a, b):
    # Approximate float comparison; this helper is assumed by the asserts
    # below (it is not defined elsewhere in this excerpt).
    return math.fabs(a - b) < 1e-6


def test_gpu_feature():
    # Use records to build a complicated GPU program
    json_records = "\n".join(
        (
            """{"i": [["[\\"matmul_auto_scheduler_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
        )
    )

    # Load the state from the record.
    with tempfile.NamedTemporaryFile(mode="w") as f:
        f.write(json_records)
        f.flush()
        inputs, _ = auto_scheduler.RecordReader(f.name).read_lines()

        inp = inputs[0]
        task = auto_scheduler.SearchTask(
            workload_key=inp.task.workload_key,
            target=inp.task.target,
            hardware_params=auto_scheduler.HardwareParams(
                100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30
            ),
        )

        state = task.compute_dag.infer_bound_from_state(inputs[0].state)
        fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
        names = auto_scheduler.feature.get_per_store_feature_names()

        # Build a feature dict for each buffer store.
        fea_dicts = []
        for i in range(len(fea)):
            tmp_dict = {}
            for j in range(len(names)):
                tmp_dict[names[j]] = fea[i][j]
            fea_dicts.append(tmp_dict)

        """
        lowered IR:

        Placeholder: A, B
        blockIdx.x i.0@j.0@ (0,8)
          vthread i.1@j.1@ (0,4)
            threadIdx.x i.2@j.2@ (0,16)
              C.local auto_unroll: 1024
              for k.0 (0,256)
                for ax0@ax1.0 (0,8)
                  threadIdx.x ax0@ax1.1 (0,16)
                    B.shared = ...
                for ax0@ax1.0 (0,64)
                  threadIdx.x ax0@ax1.1 (0,16)
                    A.shared = ...
                for i_c.3 (0,32)
                  for k.2 (0,2)
                    for j_c.4 (0,16)
                      C.local = ...
              for i.3 (0,32)
                for j.3 (0,16)
                  C = ...
        """

        # Check GPU-related features.
        assert fequal(fea_dicts[0]["blockIdx_x_len"], math.log2(8 + 1))
        assert fequal(fea_dicts[0]["vthread_len"], math.log2(4 + 1))
        assert fequal(fea_dicts[1]["threadIdx_x_len"], math.log2(16 + 1))
        assert fequal(fea_dicts[0]["threadIdx_y_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[2]["blockIdx_z_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[0]["is_gpu"], 1.0)
def test_record():
    if not tvm.runtime.enabled("llvm"):
        return

    A = te.placeholder((512, 512), name="A")
    B = te.placeholder((512, 512), name="B")
    k = te.reduce_axis((0, 512), name="k")
    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
    D = topi.nn.relu(C)
    k = te.reduce_axis((0, 512), name="k")
    E = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * D[k][j], axis=[k]), name="E")
    F = topi.nn.relu(E)
    k = te.reduce_axis((0, 512), name="k")
    G = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * F[k][j], axis=[k]), name="G")
    H = topi.nn.relu(G)
    I = topi.nn.relu(H)

    dag = auto_scheduler.ComputeDAG([A, B, I])
    s = dag.get_init_state()

    # Split
    its0 = s.split(C, s[C].iters[0], [4, 8, 8])
    its1 = s.split(C, s[C].iters[4], [8, 4, 4])
    # Reorder
    s.reorder(
        C,
        [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], its0[3], s[C].iters[8], its1[3]],
    )
    # Fuse
    s.fuse(C, [s[C].iters[0], s[C].iters[1], s[C].iters[2]])
    # Compute at
    s.split(F, s[F].iters[0], [2])
    s.compute_at(E, F, s[F].iters[0])
    # Compute inline
    s.compute_inline(D)
    # Compute root
    s.compute_root(D)
    # Parallel
    s.parallel(C, s[C].iters[0])
    # Thread bind (blockIdx/threadIdx/vthread are GPU concepts; they are used
    # on an LLVM target here only to exercise record serialization)
    s.bind(C, s[C].iters[1], "blockIdx.x")
    s.bind(C, s[C].iters[2], "threadIdx.z")
    s.bind(C, s[C].iters[3], "vthread")
    # Unroll
    s.unroll(C, s[C].iters[4])
    # Vectorize
    s.vectorize(C, s[C].iters[6])
    # Cache read
    D_global = s.cache_read(D, "global", [E])
    s.compute_at(D_global, E, s[E].iters[2])
    # Cache write
    s.cache_write(D, "shared")
    # Follow split
    its2 = s.split(G, s[G].iters[0], [4, 2, 8, 4], True)
    split_step0 = len(s.transform_steps) - 1
    s.follow_split(G, s[G].iters[5], split_step0, 4)
    # Follow fused split
    its2 = s.split(H, s[H].iters[0], [4, 2, 8, 4], True)
    split_step1 = len(s.transform_steps) - 1
    its3 = s.split(H, s[H].iters[5], [2, 4, 2, 4], True)
    split_step2 = len(s.transform_steps) - 1
    its = []
    for i0, i1 in zip(its2, its3):
        its.append(i0)
        its.append(i1)
    for i in range(0, 5):
        s.fuse(H, [s[H].iters[i], s[H].iters[i + 1]])
    s.follow_fused_split(I, s[I].iters[0], [split_step1, split_step2], 0, False)

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target)

    inp = auto_scheduler.measure.MeasureInput(task, s)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, _ = log_reader.read_lines()
        assert len(inputs) == 1

        s1 = dag.infer_bound_from_state(s)
        s2 = dag.infer_bound_from_state(inputs[0].state)
        assert s1 == s2
        assert not (s1 == dag.get_init_state())
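# Entry point following the usual TVM unit-test convention, so the file can
# be run directly; it simply invokes each test defined above.
if __name__ == "__main__":
    test_record_round_trip_sketch()
    test_measure_target_host()
    test_recover_measure_input()
    test_gpu_feature()
    test_record()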