Example #1
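Tunes the matmul forward and backward TC definitions, wires the winning options into a Caffe2 `TcOp`, and verifies the operator with reference and gradient checks. The snippet assumes `torch`, `numpy as np`, `tensor_comprehensions as tc`, Caffe2's `core`, and a module-level `MATMUL_LANG` TC string.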
    def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
        # Tune the forward and backward TC definitions separately; each
        # tune() call returns the best MappingOptions found.
        tuner = tc.Tuner(MATMUL_LANG)
        tuner_config = (tc.TunerConfig()
                        .generations(3)
                        .threads(32)
                        .pop_size(2)
                        .tuner_min_launch_total_threads(1))
        matmul_top1 = tuner.tune(
            'matmul',
            (torch.randn(n, k, device='cuda'),
             torch.randn(k, m, device='cuda')),
            tc.MappingOptions('naive'), tuner_config)
        matmul_grad_top1 = tuner.tune(
            'matmul_grad',
            (torch.randn(n, k, device='cuda'),
             torch.randn(k, m, device='cuda'),
             torch.randn(n, m, device='cuda')),
            tc.MappingOptions('naive'), tuner_config)

        # NumPy inputs for the Caffe2 reference check below.
        X = np.random.rand(m, k).astype(np.float32)
        W = np.random.rand(k, n).astype(np.float32)

        def ref(X, W):
            return [np.dot(X, W)]

        # Wire the tuned mapping options into a Caffe2 TcOp.
        op = core.CreateOperator(
            "TcOp",
            ["X", "Y"],
            "out",
            tc_def=MATMUL_LANG,
            tc_name="matmul",
            tc_grad_def=MATMUL_LANG,
            tc_grad_name="matmul_grad",
            inputs_used_by_gradient=[0, 1],
            output_gradients_used_by_gradient=[0],
            inputs_to_compute_gradients_of=[0, 1],
            mapping_options=matmul_top1.serialize(),
            grad_mapping_options=matmul_grad_top1.serialize(),
        )

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            reference=ref,
        )

        # Check gradients with respect to each of the two inputs.
        for i in range(2):
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=[X, W],
                outputs_to_check=i,
                outputs_with_grads=[0],
            )
Example #2
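Builds a `tc.MappingOptions` object by chaining its setters, starting from the 'naive' preset.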
    def test_mapping_options(self):
        options = (tc.MappingOptions('naive')
                   .useSharedMemory(True)
                   .unrollCopyShared(False)
                   .mapToBlocks([256, 8])
                   .mapToThreads([4, 16, 4])
                   .tile([2, 8, 64, 128])
                   .unroll(128)
                   .fixParametersBeforeScheduling(False)
                   .scheduleFusionStrategy("Max")
                   .outerScheduleFusionStrategy("Preserve3Coincident"))
Example #3
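Autotunes a tensordot kernel via the low-level `tensor_comprehensions.tclib` bindings, stores the result in a `MappingOptionsCache` file, reloads it, and checks the tuned output against a naive compilation. Assumes `torch`, `tempfile`, and `tensor_comprehensions as tc`.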
    def test_tensordot_autotune_pybind(self):
        tensordot_str = """
        def tensordot(float(N, C1, C2, H, W) I0, float(N, C2, C3, H, W) I1)
            -> (O)
        {
            O(n, c1, c3, h, w) +=! I0(n, c1, c2, h, w) * I1(n, c2, c3, h, w)
        }
        """
        entry_point = "tensordot"

        N, C1, C2, C3, H, W = 40, 16, 8, 20, 13, 15
        with tempfile.NamedTemporaryFile() as cache_file:
            I0 = torch.randn(N, C1, C2, H, W, device='cuda')
            I1 = torch.randn(N, C2, C3, H, W, device='cuda')

            tuner = tc.Tuner(tensordot_str, cache_file.name)
            # tuner_config is not defined in this snippet; a small
            # configuration along these lines works:
            tuner_config = tc.TunerConfig().generations(3).threads(32).pop_size(2)
            top1 = tuner.tune(entry_point, (I0, I1),
                              tc.MappingOptions('naive'), tuner_config)

            import tensor_comprehensions.tclib as tclib
            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     top1)
            O = executor.run((I0, I1), ())

            cache = tc.MappingOptionsCache(cache_file.name)
            best_options, = cache.load(tensordot_str, entry_point, (I0, I1),
                                       10)
            assert str(top1) == str(best_options), (
                "Expected the same but found {}\nand\n{}".format(
                    top1, best_options))

            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     best_options)
            O = executor.run((I0, I1), ())

            # No simple torch baseline, compare against naive
            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     tc.MappingOptions('naive'))
            ref = executor.run((I0, I1), ())

            tc.assert_almost_equal(ref, O, I0, I1, operations=C2)
Example #4
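Compiles and runs a small matmul through `tclib.compile`, comparing against `torch.mm` both with a freshly allocated output and with a preallocated one.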
    def test_matmul_pybind(self):
        mm_str = """
        def matmul(float(M,N) A, float(N,K) B) -> (C) {
            C(m, k) +=! A(m, r_n) * B(r_n, k)
        }
        """

        A = torch.randn(3, 4, device='cuda')
        B = torch.randn(4, 5, device='cuda')

        import tensor_comprehensions.tclib as tclib
        executor = tclib.compile(mm_str, "matmul", (A, B),
                                 tc.MappingOptions('naive'))
        C = executor.run((A, B), ())
        torch.cuda.synchronize()
        expected = torch.mm(A, B)
        torch.cuda.synchronize()
        tc.assert_almost_equal(C, expected, A, B, operations=4)

        # Run again, this time writing into the preallocated output C.
        C = executor.run((A, B), (C, ))
        tc.assert_almost_equal(C, torch.mm(A, B), A, B, operations=4)
Example #5
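A command-line driver: `build` produces an executor by autotuning, loading cached options, or assembling `MappingOptions` from flags, and `main` matches kernels from `all_kernel_strs` by name and runs each one. Assumes `argparse`, `re`, `torch`, and `tensor_comprehensions as tc`.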
def build(args: argparse.Namespace, tc_str: str, entry_point: str,
          *inputs: torch.Tensor) -> tc.Executor:
    tuner_config = (tc.TunerConfig()
                    .threads(args.tuner_threads)
                    .generations(args.tuner_generations)
                    .pop_size(args.tuner_pop_size)
                    .number_elites(args.tuner_number_elites)
                    .devices(args.tuner_devices))

    if args.autotuner:
        if args.debug:
            print("Running autotuner.")

        # The two calls differed only in starting_options: resume from the
        # cache when requested, otherwise start from naive options.
        return tc.autotune_and_compile(
            tc_str,
            entry_point,
            *inputs,
            starting_options=None if args.load_from_cache else 'naive',
            tuner_config=tuner_config,
            cache_filename=args.tuner_cache_file,
            load_from_cache=args.load_from_cache,
            store_to_cache=args.store_to_cache)

    elif args.load_from_cache:
        if args.debug: print("Loading autotuned mapping options from cache.")

        mapping_options = tc.make_load_from_cache_options_factory(
            args.tuner_cache_file)(tc_str, entry_point, *inputs)
        return tc.compile(tc_str, entry_point, mapping_options, *inputs)
    else:
        if args.debug: print("Building mapping options.")

        options = tc.MappingOptions("naive")

        if args.mapToBlocks is not None:
            options.mapToBlocks(args.mapToBlocks)
        if args.mapToThreads is not None:
            options.mapToThreads(args.mapToThreads)
        if args.tile is not None:
            options.tile(args.tile)
        if args.useSharedMemory is not None:
            options.useSharedMemory(args.useSharedMemory)
        if args.maxSharedMemory is not None:
            options.maxSharedMemory(args.maxSharedMemory)
        if args.unroll is not None:
            options.unroll(args.unroll)
        if args.unrollCopyShared is not None:
            options.unrollCopyShared(args.unrollCopyShared)
        if args.useReadOnlyCache is not None:
            options.useReadOnlyCache(args.useReadOnlyCache)
        if args.matchLibraryCalls is not None:
            options.matchLibraryCalls(args.matchLibraryCalls)
        if args.fixParametersBeforeScheduling is not None:
            options.fixParametersBeforeScheduling(
                args.fixParametersBeforeScheduling)
        if args.outerScheduleFusionStrategy is not None:
            options.outerScheduleFusionStrategy(
                args.outerScheduleFusionStrategy)
        if args.intraTileScheduleFusionStrategy is not None:
            options.intraTileScheduleFusionStrategy(
                args.intraTileScheduleFusionStrategy)

        return tc.compile(tc_str, entry_point, options, *inputs)


def main():
    parser = argparse.ArgumentParser(
        description="Compile, tune, and test tensor comprehension kernels.")
    parser.add_argument("--kernel_name", default=r"kernel_*")
    parser.add_argument("--list",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--tune",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--exact",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--float32",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--load_cache",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--generations", default=10, type=int)
    parser.add_argument("--cache_filename", default="tc_cache", type=str)
    parser.add_argument("--init", default="naive", type=str)
    parser.add_argument("--threads", default=16, type=int)
    parser.add_argument("--pop_size", default=100, type=int)
    parser.add_argument("--crossover_rate", default=80, type=int)
    parser.add_argument("--mutation_rate", default=7, type=int)
    parser.add_argument("--number_elites", default=10, type=int)
    parser.add_argument("--height", default=32, type=int)
    parser.add_argument("--width", default=32, type=int)
    parser.add_argument("--N", default=8, type=int)
    parser.add_argument("--channels", default=3, type=int)
    parser.add_argument("--num_gpus", default=1, type=int)
    args = parser.parse_args()
    matched_kernels = []
    gpus = ",".join([str(x) for x in range(args.num_gpus)])
    print("devices: ", gpus)
    tuner_config = tc.TunerConfig().threads(args.threads).generations(
        args.generations).pop_size(args.pop_size).crossover_rate(
            args.crossover_rate).mutation_rate(
                args.mutation_rate).number_elites(args.number_elites)

    # Select kernels by exact name or by regular-expression match.
    for k in all_kernel_strs:
        if args.exact:
            if k == args.kernel_name:
                matched_kernels.append(k)
        elif re.match(args.kernel_name, k):
            matched_kernels.append(k)

    if args.list:
        print("Kernels available:")
        for k in matched_kernels:
            print("\t" + k)

    if args.init not in ["naive", "pointwise", "mlp"]:
        raise ValueError("unsupported --init value: {}".format(args.init))

    start_options = tc.MappingOptions(args.init)

    if args.tune:
        if not args.load_cache:
            opts = tc.make_autotuned_options_factory(
                starting_options=start_options,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
        else:
            print("loading from cache...")
            opts = tc.make_autotuned_options_factory(
                load_from_cache=True,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
    else:
        if not args.load_cache:
            opts = tc.make_naive_options_factory()
        else:
            opts = tc.make_load_from_cache_options_factory(
                cache_filename=args.cache_filename)
    kernel_fn_map = {}
    N = args.N
    H = args.height
    W = args.width
    torch.manual_seed(0)
    x = torch.randn(N, H, W, args.channels).double().cuda()
    k_input = torch.randn(N, N, H, W, H, W).double().cuda()
    z = torch.tensor(1.0).double().cuda()
    y = x
    if args.float32:
        x = x.float()
        y = y.float()
        k_input = k_input.float()
        z = z.float()

    for k in matched_kernels:
        print(f"Tuning {k}")
        kernel_fn = tc.define(all_kernel_strs[k], opts)
        kernel_fn_map[k] = kernel_fn
        if "float" in k:
            k_call = getattr(kernel_fn, k.replace("kernel_float_", ""))
        else:
            k_call = getattr(kernel_fn, k.replace("kernel_", ""))

        if "input" in k:
            kxy = k_call(x, y)
            print("output: ", kxy)
        else:
            if "exponential_shifted" in k:
                print("calling exponential shifted")
                kxy = k_call(k_input, k_input, k_input, z)
            else:
                kxy = k_call(k_input, k_input, k_input)