Example #1
    def test_tc_autotune_reinforce(self):
        with tempfile.NamedTemporaryFile() as cache_file:
            group_normalization = """
            def moments(float(N, K) I) -> (mean, var) {
                # var = E(x^2) - mean^2.
                mean(n) +=! I(n, r_k)
                 var(n) +=! I(n, r_k) * I(n, r_k)
                mean(n)  = mean(n) / (K)
                 var(n)  =  var(n) / (K) - mean(n) * mean(n)
            }

            def group_normalization(
                float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta,
                float(N, G) mean, float(N, G) var) -> (O)
            {
                O(n, g, d, h, w) = gamma(g, d)
                    * ( I(n, g, d, h, w) - mean(n, g) )
                    * rsqrt( var(n, g) + 1e-5 )
                    + beta(g, d)
            }
            """

            N, G, D, H, W = 32, 32, 4, 56, 56
            I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                              torch.randn(G, D, device='cuda').fill_(1.0),
                              torch.randn(G, D, device='cuda').zero_())

            T = tc.define(
                group_normalization,
                tc.make_autotuned_options_factory(
                    starting_options='naive',
                    tuner_config=tuner_config,
                    cache_filename=cache_file.name,
                    store_to_cache=True))
            # First occurrence triggers tuning
            mean, var = T.moments(I.view((N * G, -1)))
            out = T.group_normalization(I, gamma, beta, mean.view((N, G)),
                                        var.view((N, G)))

            # Create a new TC object to retrigger tuning, this time starting
            # from the options stored in the cache file by the previous run
            T = tc.define(
                group_normalization,
                tc.make_autotuned_options_factory(
                    tuner_config=tuner_config,
                    cache_filename=cache_file.name,
                    load_from_cache=True,
                    store_to_cache=True))
            mean, var = T.moments(I.view((N * G, -1)))
            out = T.group_normalization(I, gamma, beta, mean.view((N, G)),
                                        var.view((N, G)))

            from torch.nn.modules.normalization import GroupNorm
            GN = GroupNorm(G, G * D).cuda()
            ref = GN.forward(I.view((N, G * D, H, W)))

            tc.assert_almost_equal(ref,
                                   out.view((N, G * D, H, W)),
                                   I,
                                   operations=D * H * W)
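
Once the tuner has stored options in cache_file, a later run can reuse them without re-tuning by building the TC object from a load-from-cache factory. A minimal standalone sketch, assuming the same group_normalization string, cache file and input tensors as in the test above:

# Reuse the tuned options without re-running the tuner
# (assumes group_normalization, cache_file, I, gamma, beta, N, G from above).
T_cached = tc.define(
    group_normalization,
    tc.make_load_from_cache_options_factory(cache_file.name))
mean, var = T_cached.moments(I.view((N * G, -1)))
out = T_cached.group_normalization(I, gamma, beta, mean.view((N, G)),
                                   var.view((N, G)))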
Example #2
    def test_conv_with_backward_2kernels(self):
        conv = """
        def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(M) Bias)
        -> (O)
        {
            O(n, m, h, w) +=!
                I(n, r_c, h + r_kh, w + r_kw) * W1(m, r_c, r_kh, r_kw)
            O(n, m, h, w)  = O(n, m, h, w) + Bias(m)
        }
        def convolution_igrad(float(M,C,KH,KW) W1, float(N,M,H,W) d_O)
            -> (d_I)
        {
            d_I(n, c, h, w) +=!
                d_O(  n, r_m, h - r_kh, w - r_kw) * W1(r_m, c, r_kh, r_kw)
        }
        def convolution_wgrad(float(N,C,H,W) I, float(N,M,H,W) d_O) -> (d_W1)
        {
            d_W1(m, c, kh, kw) +=!
                d_O(r_n,   m, r_h - kh, r_w - kw) *  I(r_n, c,  r_h,  r_w)
        }
        def convolution_biasgrad(float(M) Bias) -> (d_Bias)
        {
            # TODO: this bias gradient is a placeholder and is incorrect;
            # the real d_Bias is the reduction of d_O over n, h and w
            d_Bias(m) = Bias(m)
        }
        """

        N, C, H, W, O, kH, kW = 32, 4, 56, 56, 16, 1, 1
        T = tc.define(
            conv,
            tc.make_autotuned_options_factory(starting_options='naive',
                                              tuner_config=tuner_config))
        I = torch.randn(N, C, H, W, device='cuda', requires_grad=True)

        # Reference
        from torch.nn.modules.conv import Conv2d
        Conv = Conv2d(C, O, 1, stride=1).cuda()
        ref = Conv.forward(I)

        W = Conv.weight.clone()
        Bias = Conv.bias.clone()

        def convolution_backward(I, W, Bias, d_O):
            d_I = T.convolution_igrad(W, d_O)
            d_W1 = T.convolution_wgrad(I, d_O)
            d_Bias = T.convolution_biasgrad(Bias)
            return (d_I, d_W1, d_Bias)

        convolution_function = tc.make_autograd(T.convolution,
                                                convolution_backward)

        # First occurrence triggers tuning
        out = convolution_function(I, W, Bias)
        out.sum().backward()

        # Subsequent occurrences do not
        out = convolution_function(I, W, Bias)
        out.sum().backward()

        tc.assert_almost_equal(ref, out, I, operations=C * kH * kW)
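
The assertion above only checks the forward result against Conv2d; the hand-written backward is not verified. A rough sanity check of the input gradient, assuming the tensors defined in the test and an illustrative tolerance:

# Compare the TC input gradient against PyTorch autograd
# (assumes I, W, Bias, Conv, convolution_function from above; the tolerance is illustrative).
I.grad.zero_()
convolution_function(I, W, Bias).sum().backward()
tc_grad = I.grad.clone()
I.grad.zero_()
Conv(I).sum().backward()
assert (tc_grad - I.grad).abs().max() < 1e-4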
Example #3
    def test_group_norm_fused(self):
        group_normalization = """
            def group_normalization(
                float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta)
            -> (Sum, SumSq, O)
            {
                Sum(n, g) +=! I(n, g, r_d, r_h, r_w)
              SumSq(n, g) +=! I(n, g, r_d, r_h, r_w) * I(n, g, r_d, r_h, r_w)
                O(n, g, d, h, w) =  gamma(g, d)
                    * ( I(n, g, d, h, w) - Sum(n, g) / (D * H * W))
                    * rsqrt( (SumSq(n, g) - Sum(n, g) * Sum(n, g) / (D * H * W))
                           / (D * H * W)
                           + 1e-5)
                    + beta(g, d)
            }
        """

        N, G, D, H, W = 32, 32, 4, 56, 56
        T = tc.define(
            group_normalization,
            tc.make_autotuned_options_factory(starting_options='naive',
                                              tuner_config=tuner_config))
        I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                          torch.randn(G, D, device='cuda').fill_(1.0),
                          torch.randn(G, D, device='cuda').zero_())
        Sum, SumSq, O = T.group_normalization(I, gamma, beta)

        from torch.nn.modules.normalization import GroupNorm
        GN = GroupNorm(G, G * D).cuda()
        ref = GN.forward(I.view((N, G * D, H, W)))

        tc.assert_almost_equal(ref,
                               O.view((N, G * D, H, W)),
                               I,
                               operations=D * H * W)
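
The fused kernel returns the raw reductions rather than the statistics themselves; if the per-group mean and variance are needed, they can be recovered afterwards. A short sketch, assuming Sum, SumSq and the sizes defined above:

# Recover the per-group statistics from the fused reductions
# (assumes Sum, SumSq, D, H, W from above).
count = D * H * W
mean = Sum / count
var = SumSq / count - mean * mean   # E[x^2] - E[x]^2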
Example #4
def generate_options(tc_str: str, entry_point: str,
                     *inputs: torch.Tensor) -> tc.MappingOptions:
    global reinforce

    # TODO: comment out the early return below; it exists only to keep CI time
    # from blowing up, and it makes everything after it unreachable
    return tc.make_naive_options_factory()(tc_str, entry_point, *inputs)

    if entry_point == 'make_idx':
        return tc.make_naive_options_factory()(tc_str, entry_point, *inputs)

    loaded = tc.make_load_from_cache_options_factory(args.tuner_cache_file)(
        tc_str, entry_point, *inputs)

    if loaded is None or entry_point in reinforce_list or '*' in reinforce_list:
        start = loaded if loaded is not None else 'naive'
        return tc.make_autotuned_options_factory(
            starting_options=start,
            tuner_config=tuner_config,
            cache_filename=args.tuner_cache_file,
            store_to_cache=True,
        )(tc_str, entry_point, *inputs)

    assert loaded is not None, 'None found'

    return loaded
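
generate_options has the (tc_str, entry_point, *inputs) signature that tc.define expects from an options factory, so it can be passed in directly. A hypothetical usage sketch (my_tc, kernel and x are placeholders, not names from this code):

# Hypothetical usage of the factory above; my_tc, kernel and x are placeholders.
T = tc.define(my_tc, generate_options)
y = T.kernel(x)   # options are loaded, tuned or defaulted by generate_options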
Example #5
    def __init__(self, I, C, K, groups=1, padding=0, bias=False,
                 from_cache=False, cache_file='tc_group3d.pt',
                 tuner_config=None):
        '''
        Module providing grouped 3d convolution using tensor comprehensions

        :param I: Number of input channels
        :type I: int
        :param C: Number of output channels
        :type C: int
        :param K: Kernel size
        :type K: tuple or int
        :param groups: Number of groups
        :type groups: int
        :param from_cache: If True, load options from the specified cache file; if False, perform autotuning
        :type from_cache: bool
        :param cache_file: Path and name of cache file
        :type cache_file: string
        :param padding: Amount of input padding
        :type padding: tuple or int
        :param bias: Not implemented
        :type bias: bool
        :param tuner_config: Tuner config object to use for auto-tuning
        :type tuner_config: tensor_comprehensions.TunerConfig
        '''
        import torch.nn.functional as F
        super().__init__()

        K = self.int_to_tuple(K)
        padding = self.int_to_tuple(padding)

        group_convolution = self.tc_string()
        if not from_cache:
            if tuner_config is None:
                tuner_config = tc.TunerConfig().generations(25).pop_size(100).number_elites(15)
            conv_option = tc.tclib.MappingOptions('naive').tile([1,1]).mapToThreads([4,16,4]).mapToBlocks([256,256]).unroll(1)
            TC = tc.define(group_convolution, tc.make_autotuned_options_factory(
                    starting_options=conv_option,
                    tuner_config=tuner_config,
                    cache_filename=cache_file,
                    store_to_cache=True,
                    load_from_cache=False
                    ))
        else:
            TC = tc.define(group_convolution, tc.make_load_from_cache_options_factory(cache_file))

        self.convolution_grouped = tc.make_autograd(TC.group_convolution, TC.convolution_grad)
        self.W = torch.nn.Parameter(
            torch.rand(groups, C // groups, I // groups, K[0], K[1], K[2]))
        self.pad = F.pad
        self.groups = groups
        self.padding = padding
        self.K = K
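
A hypothetical construction of the module, assuming the surrounding class is called GroupConv3d and that its forward pass (not shown in this excerpt) applies the grouped convolution; the names and shapes are illustrative only:

# Illustrative usage; GroupConv3d is an assumed class name and the shapes are made up.
conv = GroupConv3d(I=16, C=32, K=3, groups=4, padding=1,
                   from_cache=True, cache_file='tc_group3d.pt').cuda()
x = torch.randn(2, 16, 8, 32, 32, device='cuda')   # (N, C_in, D, H, W)
y = conv(x)   # forward is defined elsewhere in the class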
    cache = MappingOptionsCache(cache_file.name)
    top10 = cache.load(mm, "matmul", (A, B), 10)
    assert str(top1) == str(top10[0])

    # Compile and run with the new options
    compilation_cache.compile("matmul", (A, B), top1)
    time_tc(100, "raw unchecked_run tuned options\t",
            lambda name, ins: compilation_cache.unchecked_run(name, ins),
            "matmul", (A, B))

################################################################################
# 4. Simple torch.autograd.Function
################################################################################
T = tc.define(
    mm,
    tc.make_autotuned_options_factory(starting_options='naive',
                                      tuner_config=tuner_config))


def backward(A, B, d_C):
    d_A = T.matmul_agrad(B, d_C, unchecked=True)
    d_B = T.matmul_bgrad(A, d_C, unchecked=True)
    return d_A, d_B


matmul_function = tc.make_autograd(lambda A, B: T.matmul(A, B, unchecked=True),
                                   backward)

# Run once to trigger automatic tuning and compilation, then time the kernel.
# For the purposes of this example, retain_graph=True is used: it prevents the
# buffers from being freed when performing backward, so backward can be called
# more than once; see e.g.
# https://stackoverflow.com/questions/46774641/what-does-the-parameter-retain-graph-mean-in-the-variables-backward-method
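
The timed forward/backward run that the comment above refers to is not included in this excerpt; a minimal sketch of what it describes, reusing A and B from the earlier matmul example and an assumed gradient tensor:

# Sketch only: A and B are the matrices used earlier; grad is an assumed tensor.
C = matmul_function(A, B)             # first call triggers tuning + compilation
grad = torch.randn_like(C)
C.backward(grad, retain_graph=True)   # retain_graph keeps buffers alive so
C.backward(grad, retain_graph=True)   # backward can be called a second time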
def main():
    parser = argparse.ArgumentParser(
        "compile + tune + test tensor comp kernels...")
    parser.add_argument("--kernel_name", default=r"kernel_*")
    parser.add_argument("--list",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--tune",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--exact",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--float32",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--load_cache",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--generations", default=10, type=int)
    parser.add_argument("--cache_filename", default="tc_cache", type=str)
    parser.add_argument("--init", default="naive", type=str)
    parser.add_argument("--threads", default=16, type=int)
    parser.add_argument("--pop_size", default=100, type=int)
    parser.add_argument("--crossover_rate", default=80, type=int)
    parser.add_argument("--mutation_rate", default=7, type=int)
    parser.add_argument("--number_elites", default=10, type=int)
    parser.add_argument("--height", default=32, type=int)
    parser.add_argument("--width", default=32, type=int)
    parser.add_argument("--N", default=8, type=int)
    parser.add_argument("--channels", default=3, type=int)
    parser.add_argument("--num_gpus", default=1, type=int)
    args = parser.parse_args()
    matched_kernels = []
    gpus = ",".join([str(x) for x in range(args.num_gpus)])
    print("devices: ", gpus)
    tuner_config = tc.TunerConfig().threads(args.threads).generations(
        args.generations).pop_size(args.pop_size).crossover_rate(
            args.crossover_rate).mutation_rate(
                args.mutation_rate).number_elites(args.number_elites)

    for k in all_kernel_strs:
        if not args.exact:
            if re.match(re.compile(args.kernel_name), k):
                matched_kernels.append(k)
        else:
            if k == args.kernel_name:
                matched_kernels.append(k)

    if args.list:
        print("Kernels available:")
        for k in matched_kernels:
            print("\t" + k)

    assert args.init in ["naive", "pointwise", "mlp"], (
        f"unsupported --init value: {args.init}")

    start_options = tc.MappingOptions(args.init)

    if args.tune:
        if not args.load_cache:
            opts = tc.make_autotuned_options_factory(
                starting_options=start_options,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
        else:
            print("loading from cache...")
            opts = tc.make_autotuned_options_factory(
                load_from_cache=True,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
    else:
        if not args.load_cache:
            opts = tc.make_naive_options_factory()
        else:
            opts = tc.make_load_from_cache_options_factory(
                cache_filename=args.cache_filename)
    kernel_fn_map = {}
    N = args.N
    H = args.height
    W = args.width
    torch.manual_seed(0)
    x = torch.randn(N, H, W, args.channels).double().cuda()
    k_input = torch.randn(N, N, H, W, H, W).double().cuda()
    z = torch.tensor(1.0).double().cuda()
    y = x
    if args.float32:
        x = x.float()
        y = y.float()
        k_input = k_input.float()
        z = z.float()

    for k in matched_kernels:
        print(f"Tuning {k}")
        kernel_fn = tc.define(all_kernel_strs[k], opts)
        kernel_fn_map[k] = kernel_fn
        if "float" in k:
            k_call = getattr(kernel_fn, k.replace("kernel_float_", ""))
        else:
            k_call = getattr(kernel_fn, k.replace("kernel_", ""))

        if "input" in k:
            kxy = k_call(x, y)
            print("output: ", kxy)
        else:
            if "exponential_shifted" in k:
                print("calling exponential shifted")
                kxy = k_call(k_input, k_input, k_input, z)
            else:
                kxy = k_call(k_input, k_input, k_input)