def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
    # Autotune the forward and backward TC entry points before building the
    # Caffe2 operator.
    tuner = tc.Tuner(MATMUL_LANG)
    tuner_config = (tc.TunerConfig()
                    .generations(3)
                    .threads(32)
                    .pop_size(2)
                    .tuner_min_launch_total_threads(1))
    matmul_top1 = tuner.tune(
        'matmul',
        (torch.randn(n, k, device='cuda'), torch.randn(k, m, device='cuda')),
        tc.MappingOptions('naive'),
        tuner_config)
    matmul_grad_top1 = tuner.tune(
        'matmul_grad',
        (torch.randn(n, k, device='cuda'),
         torch.randn(k, m, device='cuda'),
         torch.randn(n, m, device='cuda')),
        tc.MappingOptions('naive'),
        tuner_config)

    X = np.random.rand(m, k).astype(np.float32)
    W = np.random.rand(k, n).astype(np.float32)

    def ref(X, W):
        return [np.dot(X, W)]

    op = core.CreateOperator(
        "TcOp", ["X", "Y"], "out",
        tc_def=MATMUL_LANG,
        tc_name="matmul",
        tc_grad_def=MATMUL_LANG,
        tc_grad_name="matmul_grad",
        inputs_used_by_gradient=[0, 1],
        output_gradients_used_by_gradient=[0],
        inputs_to_compute_gradients_of=[0, 1],
        mapping_options=matmul_top1.serialize(),
        grad_mapping_options=matmul_grad_top1.serialize(),
    )

    # Reference-check the forward pass against numpy, then gradient-check
    # each of the two inputs.
    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, W],
        reference=ref,
    )
    for i in range(2):
        self.assertGradientChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            outputs_to_check=i,
            outputs_with_grads=[0],
        )
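# Hedged sketch (not part of the original test): the "TcOp" operator built
# above can also be run once through the plain Caffe2 workspace API, which can
# be handier when debugging than the hypothesis helpers. This assumes the op
# carries a CUDA device_option (assertReferenceChecks injects `gc` itself; here
# it would have to be passed to core.CreateOperator explicitly):
#
#   from caffe2.python import workspace
#   workspace.FeedBlob("X", X, device_option=gc)
#   workspace.FeedBlob("Y", W, device_option=gc)
#   workspace.RunOperatorOnce(op)
#   out = workspace.FetchBlob("out")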
def test_mapping_options(self):
    options = (tc.MappingOptions('naive')
               .useSharedMemory(True)
               .unrollCopyShared(False)
               .mapToBlocks([256, 8])
               .mapToThreads([4, 16, 4])
               .tile([2, 8, 64, 128])
               .unroll(128)
               .fixParametersBeforeScheduling(False)
               .scheduleFusionStrategy("Max")
               .outerScheduleFusionStrategy("Preserve3Coincident"))
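# Hedged sketch (not part of the original test): options built with the chained
# API above are passed to the compiler exactly like the 'naive' options in the
# pybind tests below, e.g. with mm_str, A and B as in test_matmul_pybind:
#
#   import tensor_comprehensions.tclib as tclib
#   executor = tclib.compile(mm_str, "matmul", (A, B), options)
#   C = executor.run((A, B), ())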
def test_tensordot_autotune_pybind(self):
    tensordot_str = """
    def tensordot(float(N, C1, C2, H, W) I0, float(N, C2, C3, H, W) I1) -> (O) {
        O(n, c1, c3, h, w) +=! I0(n, c1, c2, h, w) * I1(n, c2, c3, h, w)
    }
    """
    entry_point = "tensordot"

    N, C1, C2, C3, H, W = 40, 16, 8, 20, 13, 15
    with tempfile.NamedTemporaryFile() as cache_file:
        I0 = torch.randn(N, C1, C2, H, W, device='cuda')
        I1 = torch.randn(N, C2, C3, H, W, device='cuda')

        # Tune and persist the best options to the temporary cache file
        # (tuner_config is assumed to be defined at module scope in this
        # test file).
        tuner = tc.Tuner(tensordot_str, cache_file.name)
        top1 = tuner.tune(entry_point, (I0, I1), tc.MappingOptions('naive'),
                          tuner_config)

        import tensor_comprehensions.tclib as tclib
        executor = tclib.compile(tensordot_str, entry_point, (I0, I1), top1)
        O = executor.run((I0, I1), ())

        # The best options loaded back from the cache must match the tuner's
        # top-1 result.
        cache = tc.MappingOptionsCache(cache_file.name)
        best_options, = cache.load(tensordot_str, entry_point, (I0, I1), 10)
        assert str(top1) == str(best_options), (
            "Expected the same but found {}\nand\n{}".format(
                top1, best_options))

        executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                 best_options)
        O = executor.run((I0, I1), ())

        # No simple torch baseline, compare against naive
        executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                 tc.MappingOptions('naive'))
        ref = executor.run((I0, I1), ())

        tc.assert_almost_equal(ref, O, I0, I1, operations=C2)
def test_matmul_pybind(self):
    mm_str = """
    def matmul(float(M,N) A, float(N,K) B) -> (C) {
        C(m, k) +=! A(m, r_n) * B(r_n, k)
    }
    """

    A, B = (torch.randn(3, 4, device='cuda'),
            torch.randn(4, 5, device='cuda'))

    import tensor_comprehensions.tclib as tclib
    executor = tclib.compile(mm_str, "matmul", (A, B),
                             tc.MappingOptions('naive'))
    C = executor.run((A, B), ())
    torch.cuda.synchronize()
    expected = torch.mm(A, B)
    torch.cuda.synchronize()
    tc.assert_almost_equal(C, expected, A, B, operations=4)

    # Run again, this time reusing C as the preallocated output tensor.
    C = executor.run((A, B), (C, ))
    tc.assert_almost_equal(C, torch.mm(A, B), A, B, operations=4)
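# Hedged sketch (not part of the original test): the same matmul TC can also be
# driven through the higher-level tc.define API (used by the benchmark driver
# in main() below), which compiles on first call and exposes each entry point
# as an attribute:
#
#   matmul = tc.define(mm_str, tc.make_naive_options_factory())
#   A, B = torch.randn(3, 4, device='cuda'), torch.randn(4, 5, device='cuda')
#   C = matmul.matmul(A, B)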
def build(args: argparse.Namespace, tc_str: str, entry_point: str,
          *inputs: torch.Tensor) -> tc.Executor:
    tuner_config = (tc.TunerConfig()
                    .threads(args.tuner_threads)
                    .generations(args.tuner_generations)
                    .pop_size(args.tuner_pop_size)
                    .number_elites(args.tuner_number_elites)
                    .devices(args.tuner_devices))

    if args.autotuner:
        if args.debug:
            print("Running autotuner.")
        # When loading from the cache, let the tuner seed itself from the
        # cached options; otherwise start from naive options.
        if args.load_from_cache:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options=None,
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)
        else:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options='naive',
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)
    elif args.load_from_cache:
        if args.debug:
            print("Loading autotuned mapping options from cache.")
        mapping_options = tc.make_load_from_cache_options_factory(
            args.tuner_cache_file)(tc_str, entry_point, *inputs)
        return tc.compile(tc_str, entry_point, mapping_options, *inputs)
    else:
        if args.debug:
            print("Building mapping options.")
        # Start from naive options and apply whichever overrides were given
        # on the command line.
        options = tc.MappingOptions("naive")
        if args.mapToBlocks is not None:
            options.mapToBlocks(args.mapToBlocks)
        if args.mapToThreads is not None:
            options.mapToThreads(args.mapToThreads)
        if args.tile is not None:
            options.tile(args.tile)
        if args.useSharedMemory is not None:
            options.useSharedMemory(args.useSharedMemory)
        if args.maxSharedMemory is not None:
            options.maxSharedMemory(args.maxSharedMemory)
        if args.unroll is not None:
            options.unroll(args.unroll)
        if args.unrollCopyShared is not None:
            options.unrollCopyShared(args.unrollCopyShared)
        if args.useReadOnlyCache is not None:
            options.useReadOnlyCache(args.useReadOnlyCache)
        if args.matchLibraryCalls is not None:
            options.matchLibraryCalls(args.matchLibraryCalls)
        if args.fixParametersBeforeScheduling is not None:
            options.fixParametersBeforeScheduling(
                args.fixParametersBeforeScheduling)
        if args.outerScheduleFusionStrategy is not None:
            options.outerScheduleFusionStrategy(
                args.outerScheduleFusionStrategy)
        if args.intraTileScheduleFusionStrategy is not None:
            options.intraTileScheduleFusionStrategy(
                args.intraTileScheduleFusionStrategy)
        return tc.compile(tc_str, entry_point, options, *inputs)
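# Hedged usage sketch (not part of the original script): build() only needs an
# argparse.Namespace carrying the flags it reads, so a minimal, non-tuning
# invocation can be constructed without a parser. All values below are
# illustrative, and the helper name is hypothetical.
def build_naive_example(tc_str: str, entry_point: str,
                        *inputs: torch.Tensor) -> tc.Executor:
    args = argparse.Namespace(
        # Tuner settings are still read when constructing tuner_config,
        # even though the tuner is not run on this path.
        tuner_threads=8, tuner_generations=3, tuner_pop_size=10,
        tuner_number_elites=1, tuner_devices="0",
        tuner_cache_file="", store_to_cache=False,
        autotuner=False, load_from_cache=False, debug=True,
        # No manual MappingOptions overrides: fall back to plain 'naive'.
        mapToBlocks=None, mapToThreads=None, tile=None,
        useSharedMemory=None, maxSharedMemory=None, unroll=None,
        unrollCopyShared=None, useReadOnlyCache=None,
        matchLibraryCalls=None, fixParametersBeforeScheduling=None,
        outerScheduleFusionStrategy=None,
        intraTileScheduleFusionStrategy=None)
    return build(args, tc_str, entry_point, *inputs)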
def main():
    parser = argparse.ArgumentParser(
        "compile + tune + test tensor comp kernels...")
    parser.add_argument("--kernel_name", default=r"kernel_*")
    parser.add_argument("--list", const=True, action="store_const",
                        default=False)
    parser.add_argument("--tune", const=True, action="store_const",
                        default=False)
    parser.add_argument("--exact", const=True, action="store_const",
                        default=False)
    parser.add_argument("--float32", const=True, action="store_const",
                        default=False)
    parser.add_argument("--load_cache", const=True, action="store_const",
                        default=False)
    parser.add_argument("--generations", default=10, type=int)
    parser.add_argument("--cache_filename", default="tc_cache", type=str)
    parser.add_argument("--init", default="naive", type=str)
    parser.add_argument("--threads", default=16, type=int)
    parser.add_argument("--pop_size", default=100, type=int)
    parser.add_argument("--crossover_rate", default=80, type=int)
    parser.add_argument("--mutation_rate", default=7, type=int)
    parser.add_argument("--number_elites", default=10, type=int)
    parser.add_argument("--height", default=32, type=int)
    parser.add_argument("--width", default=32, type=int)
    parser.add_argument("--N", default=8, type=int)
    parser.add_argument("--channels", default=3, type=int)
    parser.add_argument("--num_gpus", default=1, type=int)
    args = parser.parse_args()

    matched_kernels = []
    gpus = ",".join([str(x) for x in range(args.num_gpus)])
    print("devices: ", gpus)

    tuner_config = (tc.TunerConfig()
                    .threads(args.threads)
                    .generations(args.generations)
                    .pop_size(args.pop_size)
                    .crossover_rate(args.crossover_rate)
                    .mutation_rate(args.mutation_rate)
                    .number_elites(args.number_elites))

    # Select kernels either by regex match or by exact name.
    for k in all_kernel_strs:
        if not args.exact:
            if re.match(re.compile(args.kernel_name), k):
                matched_kernels.append(k)
        else:
            if k == args.kernel_name:
                matched_kernels.append(k)

    if args.list:
        print("Kernels available:")
        for k in matched_kernels:
            print("\t" + k)

    if args.init not in ["naive", "pointwise", "mlp"]:
        assert False, "unsupported --init value: {}".format(args.init)
    start_options = tc.MappingOptions(args.init)

    # Pick a mapping-options factory: autotune (optionally seeded from the
    # cache), load previously tuned options, or fall back to naive options.
    if args.tune:
        if not args.load_cache:
            opts = tc.make_autotuned_options_factory(
                starting_options=start_options,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
        else:
            print("loading from cache...")
            opts = tc.make_autotuned_options_factory(
                load_from_cache=True,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
    else:
        if not args.load_cache:
            opts = tc.make_naive_options_factory()
        else:
            opts = tc.make_load_from_cache_options_factory(
                cache_filename=args.cache_filename)

    kernel_fn_map = {}
    N = args.N
    H = args.height
    W = args.width

    torch.manual_seed(0)
    x = torch.randn(N, H, W, args.channels).double().cuda()
    k_input = torch.randn(N, N, H, W, H, W).double().cuda()
    z = torch.tensor(1.0).double().cuda()
    y = x
    if args.float32:
        x = x.float()
        y = y.float()
        k_input = k_input.float()
        z = z.float()

    # Define each matched kernel with the chosen options factory and run it
    # once on the generated inputs.
    for k in matched_kernels:
        print(f"Tuning {k}")
        kernel_fn = tc.define(all_kernel_strs[k], opts)
        kernel_fn_map[k] = kernel_fn
        if "float" in k:
            k_call = getattr(kernel_fn, k.replace("kernel_float_", ""))
        else:
            k_call = getattr(kernel_fn, k.replace("kernel_", ""))
        if "input" in k:
            kxy = k_call(x, y)
            print("output: ", kxy)
        else:
            if "exponential_shifted" in k:
                print("calling exponential shifted")
                kxy = k_call(k_input, k_input, k_input, z)
            else:
                kxy = k_call(k_input, k_input, k_input)
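# Hedged usage note (not part of the original listing; the script name below is
# assumed): with the flags defined in main(), a typical tuning run could look
# like
#
#   python tc_kernel_bench.py --kernel_name "kernel_float_.*" --tune \
#       --generations 5 --threads 16 --cache_filename /tmp/tc_cache
#
# and a later run can reuse the tuned options by passing --load_cache instead
# of --tune.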