Example #1
    def __init__(self, I, C, K, groups=1, padding=0, bias=False, from_cache=False, cache_file='tc_group3d.pt', tuner_config=None):
        '''
        Module providing grouped 3D convolution using Tensor Comprehensions

        :param I: Number of input channels
        :type I: int
        :param C: Number of output channels
        :type C: int
        :param K: Kernel size
        :type K: tuple or int
        :param groups: Number of groups
        :type groups: int
        :param from_cache: If True, load tuned options from the specified cache file; if False, run autotuning
        :type from_cache: bool
        :param cache_file: Path and name of cache file
        :type cache_file: string
        :param padding: Amount of input padding
        :type padding: tuple or int
        :param bias: Not implemented
        :type bias: bool
        :param tuner_config: Tuner config object to use for auto-tuning
        :type tuner_config: tensor_comprehensions.TunerConfig
        '''
        import torch.nn.functional as F
        super().__init__()

        K = self.int_to_tuple(K)
        padding = self.int_to_tuple(padding)

        group_convolution = self.tc_string()
        if not from_cache:
            if tuner_config is None:
                tuner_config = tc.TunerConfig().generations(25).pop_size(100).number_elites(15)
            conv_option = (tc.tclib.MappingOptions('naive')
                           .tile([1, 1])
                           .mapToThreads([4, 16, 4])
                           .mapToBlocks([256, 256])
                           .unroll(1))
            TC = tc.define(group_convolution, tc.make_autotuned_options_factory(
                    starting_options=conv_option,
                    tuner_config=tuner_config,
                    cache_filename=cache_file,
                    store_to_cache=True,
                    load_from_cache=False
                    ))
        else:
            TC = tc.define(group_convolution, tc.make_load_from_cache_options_factory(cache_file))

        self.convolution_grouped = tc.make_autograd(TC.group_convolution, TC.convolution_grad)
        self.W = torch.nn.Parameter(torch.rand(groups, C // groups, I // groups, K[0], K[1], K[2]))
        self.pad = F.pad
        self.groups = groups
        self.padding = padding
        self.K = K
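
# A hypothetical usage sketch of the module above. The class name
# GroupedConv3dTC and the keyword values are assumptions; only __init__ is
# shown in the snippet, so the forward pass is omitted here.
conv = GroupedConv3dTC(I=16, C=32, K=3, groups=4, padding=1,
                       from_cache=False, cache_file='tc_group3d.pt')
# Autotuning runs once and stores the best mapping options in the cache file;
# a second instance can then reuse them without re-tuning.
conv_cached = GroupedConv3dTC(I=16, C=32, K=3, groups=4, padding=1,
                              from_cache=True, cache_file='tc_group3d.pt')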
Example #2
    def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
        tuner = tc.Tuner(MATMUL_LANG)
        tuner_config = (tc.TunerConfig()
                        .generations(3)
                        .threads(32)
                        .pop_size(2)
                        .tuner_min_launch_total_threads(1))
        matmul_top1 = tuner.tune(
            'matmul',
            (torch.randn(n, k, device='cuda'),
             torch.randn(k, m, device='cuda')),
            tc.MappingOptions('naive'), tuner_config)
        matmul_grad_top1 = tuner.tune(
            'matmul_grad',
            (torch.randn(n, k, device='cuda'),
             torch.randn(k, m, device='cuda'),
             torch.randn(n, m, device='cuda')),
            tc.MappingOptions('naive'), tuner_config)

        X = np.random.rand(m, k).astype(np.float32)
        W = np.random.rand(k, n).astype(np.float32)

        def ref(X, W):
            return [np.dot(X, W)]

        op = core.CreateOperator(
            "TcOp",
            ["X", "Y"],
            "out",
            tc_def=MATMUL_LANG,
            tc_name="matmul",
            tc_grad_def=MATMUL_LANG,
            tc_grad_name="matmul_grad",
            inputs_used_by_gradient=[0, 1],
            output_gradients_used_by_gradient=[0],
            inputs_to_compute_gradients_of=[0, 1],
            mapping_options=matmul_top1.serialize(),
            grad_mapping_options=matmul_grad_top1.serialize(),
        )

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            reference=ref,
        )

        for i in range(2):
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=[X, W],
                outputs_to_check=i,
                outputs_with_grads=[0],
            )
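
# MATMUL_LANG itself is not shown in this snippet. A TC definition with the
# matmul/matmul_grad entry points used above typically looks like the sketch
# below (the exact upstream string may differ slightly):
MATMUL_LANG_SKETCH = """
def matmul(float(M, N) A, float(N, K) B) -> (output) {
    output(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_grad(float(M, N) A, float(N, K) B, float(M, K) d_output) -> (d_A, d_B) {
    d_A(m, n) +=! d_output(m, r_k) * B(n, r_k)
    d_B(n, k) +=! d_output(r_m, k) * A(r_m, n)
}
"""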
Example #3
import unittest

import tempfile

import torch
import torch.cuda

import tensor_comprehensions as tc

tc.SILENT = True
tuner_config = tc.TunerConfig().threads(5).generations(3).pop_size(5)


class TestTC(unittest.TestCase):
    #
    # Self-explanatory
    #
    def test_imports(self):
        from tensor_comprehensions.tclib import logtostderr
        from tensor_comprehensions.tclib import debug_lang
        from tensor_comprehensions.tclib import debug_halide
        from tensor_comprehensions.tclib import debug_tc_mapper
        from tensor_comprehensions.tclib import debug_tuner
        from tensor_comprehensions.tclib import dump_cuda

        from tensor_comprehensions.tclib import CompilationCache
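
    #
    # A hypothetical follow-up test sketch (not from the original file): it
    # shows how the small module-level tuner_config keeps autotuning in unit
    # tests fast. Requires a CUDA device.
    #
    def test_add_autotuned_sketch(self):
        lang = """
        def add(float(N) A, float(N) B) -> (C) {
            C(n) = A(n) + B(n)
        }
        """
        T = tc.define(lang, tc.make_autotuned_options_factory(
            starting_options=tc.MappingOptions('naive'),
            tuner_config=tuner_config))
        A = torch.randn(100, device='cuda')
        B = torch.randn(100, device='cuda')
        self.assertTrue(torch.allclose(T.add(A, B), A + B))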
Example #4
def build(args: argparse.Namespace, tc_str: str, entry_point: str,
          *inputs: torch.Tensor) -> tc.Executor:
    tuner_config = (tc.TunerConfig()
                    .threads(args.tuner_threads)
                    .generations(args.tuner_generations)
                    .pop_size(args.tuner_pop_size)
                    .number_elites(args.tuner_number_elites)
                    .devices(args.tuner_devices))

    if args.autotuner:
        if args.debug: print("Running autotuner.")

        if args.load_from_cache:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options=None,
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)
        else:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options='naive',
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)

    elif args.load_from_cache:
        if args.debug: print("Loading autotuned mapping options from cache.")

        mapping_options = tc.make_load_from_cache_options_factory(
            args.tuner_cache_file)(tc_str, entry_point, *inputs)
        return tc.compile(tc_str, entry_point, mapping_options, *inputs)
    else:
        if args.debug: print("Building mapping options.")

        options = tc.MappingOptions("naive")

        if args.mapToBlocks is not None:
            options.mapToBlocks(args.mapToBlocks)
        if args.mapToThreads is not None:
            options.mapToThreads(args.mapToThreads)
        if args.tile is not None:
            options.tile(args.tile)
        if args.useSharedMemory is not None:
            options.useSharedMemory(args.useSharedMemory)
        if args.maxSharedMemory is not None:
            options.maxSharedMemory(args.maxSharedMemory)
        if args.unroll is not None:
            options.unroll(args.unroll)
        if args.unrollCopyShared is not None:
            options.unrollCopyShared(args.unrollCopyShared)
        if args.useReadOnlyCache is not None:
            options.useReadOnlyCache(args.useReadOnlyCache)
        if args.matchLibraryCalls is not None:
            options.matchLibraryCalls(args.matchLibraryCalls)
        if args.fixParametersBeforeScheduling is not None:
            options.fixParametersBeforeScheduling(
                args.fixParametersBeforeScheduling)
        if args.outerScheduleFusionStrategy is not None:
            options.outerScheduleFusionStrategy(
                args.outerScheduleFusionStrategy)
        if args.intraTileScheduleFusionStrategy is not None:
            options.intraTileScheduleFusionStrategy(
                args.intraTileScheduleFusionStrategy)

        return tc.compile(tc_str, entry_point, options, *inputs)
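
# A hypothetical driver for build() above (not part of the original snippet).
# It compiles a tiny copy kernel with naive mapping options; the Namespace
# fields mirror the attributes build() reads, and the unset tuning knobs are
# left as None so none of the option setters fire.
copy_tc = """
def copy(float(N) I) -> (O) {
    O(n) = I(n)
}
"""
args = argparse.Namespace(
    autotuner=False, load_from_cache=False, store_to_cache=False, debug=True,
    tuner_threads=8, tuner_generations=2, tuner_pop_size=10,
    tuner_number_elites=1, tuner_devices="0", tuner_cache_file="/tmp/tc_cache",
    mapToBlocks=None, mapToThreads=None, tile=None, useSharedMemory=None,
    maxSharedMemory=None, unroll=None, unrollCopyShared=None,
    useReadOnlyCache=None, matchLibraryCalls=None,
    fixParametersBeforeScheduling=None, outerScheduleFusionStrategy=None,
    intraTileScheduleFusionStrategy=None)
executor = build(args, copy_tc, "copy", torch.randn(1024, device='cuda'))
# The returned tc.Executor can then be used to run the compiled kernel.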
Example #5
File: tum.py  Project: LoopTactics/tc-cim
def result(float(K, MAX_L, MAX_L) Beta, float(K, MAX_L, MAX_L) Dots) -> (O, tmpO) {
    # Triangular compute
    # tmp is necessary because we don't yet support 2-D reductions
    # But in practice, tmp also gives strictly more parallelism and allows
    # exploiting blocks without cross-block reductions
    tmpO(k, max_l_1) +=! (max_l_1 >= r_max_l_2) ? 0.0 :
        Dots(k, max_l_1, r_max_l_2) * Beta(k, max_l_1, r_max_l_2)
    O(k) +=! tmpO(k, max_l_1)
}
'''
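
# A hypothetical PyTorch reference for the TC entry point above (useful for
# checking the kernel): the ternary keeps only terms with r_max_l_2 > max_l_1,
# i.e. the strict upper triangle of the last index; tmpO reduces over that
# index and O then reduces tmpO over max_l_1.
def result_reference(Beta, Dots):
    K, MAX_L, _ = Dots.shape
    mask = torch.triu(torch.ones(MAX_L, MAX_L, dtype=Dots.dtype,
                                 device=Dots.device), diagonal=1)
    tmpO = (Dots * Beta * mask).sum(dim=2)  # shape (K, MAX_L)
    O = tmpO.sum(dim=1)                     # shape (K,)
    return O, tmpO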

###############################################################################
# Implicit compilation and tuning behavior
###############################################################################
tuner_config = (tc.TunerConfig()
                .threads(args.tuner_threads)
                .generations(args.tuner_generations)
                .pop_size(args.tuner_pop_size)
                .number_elites(args.tuner_number_elites)
                .devices(args.tuner_devices))

# This function is used for reinforcing tuning
# 1. make_idx is small and does not get tuned or saved, just using naive
#    options on it is fine;
# 2. if we find an option in the cache, use it either as is or as starting
#    point for reinforcement, depending on whether the entry_point is in the
#    reinforcement list;
# 3. dots will benefit from being reinforced a few times (reaching 90us on P100)
reinforce_list = ['']


def generate_options(tc_str: str, entry_point: str,
                     *inputs: torch.Tensor) -> tc.MappingOptions:
    global reinforce
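
# The body of generate_options is cut off in this snippet. A minimal sketch of
# the behaviour described in the comments above might look like the function
# below; it is an assumption (notably the args.tuner_cache_file field), not
# the original implementation.
def generate_options_sketch(tc_str: str, entry_point: str,
                            *inputs: torch.Tensor) -> tc.MappingOptions:
    # 1. make_idx is small: naive options, no tuning, no caching
    if entry_point == 'make_idx':
        return tc.MappingOptions('naive')
    # 2./3. entry points in reinforce_list are re-tuned starting from the
    #       cached options; everything else is served straight from the cache
    if entry_point in reinforce_list:
        factory = tc.make_autotuned_options_factory(
            tuner_config=tuner_config,
            cache_filename=args.tuner_cache_file,
            load_from_cache=True,
            store_to_cache=True)
    else:
        factory = tc.make_load_from_cache_options_factory(
            cache_filename=args.tuner_cache_file)
    return factory(tc_str, entry_point, *inputs)
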
def main():
    parser = argparse.ArgumentParser(
        "compile + tune + test tensor comp kernels...")
    parser.add_argument("--kernel_name", default=r"kernel_*")
    parser.add_argument("--list",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--tune",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--exact",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--float32",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--load_cache",
                        const=True,
                        action="store_const",
                        default=False)
    parser.add_argument("--generations", default=10, type=int)
    parser.add_argument("--cache_filename", default="tc_cache", type=str)
    parser.add_argument("--init", default="naive", type=str)
    parser.add_argument("--threads", default=16, type=int)
    parser.add_argument("--pop_size", default=100, type=int)
    parser.add_argument("--crossover_rate", default=80, type=int)
    parser.add_argument("--mutation_rate", default=7, type=int)
    parser.add_argument("--number_elites", default=10, type=int)
    parser.add_argument("--height", default=32, type=int)
    parser.add_argument("--width", default=32, type=int)
    parser.add_argument("--N", default=8, type=int)
    parser.add_argument("--channels", default=3, type=int)
    parser.add_argument("--num_gpus", default=1, type=int)
    args = parser.parse_args()
    matched_kernels = []
    gpus = ",".join([str(x) for x in range(args.num_gpus)])
    print("devices: ", gpus)
    tuner_config = (tc.TunerConfig()
                    .threads(args.threads)
                    .generations(args.generations)
                    .pop_size(args.pop_size)
                    .crossover_rate(args.crossover_rate)
                    .mutation_rate(args.mutation_rate)
                    .number_elites(args.number_elites))

    for k in all_kernel_strs:
        if not args.exact:
            if re.match(re.compile(args.kernel_name), k):
                matched_kernels.append(k)
        else:
            if k == args.kernel_name:
                matched_kernels.append(k)

    if args.list:
        print("Kernels available:")
        for k in matched_kernels:
            print("\t" + k)

    if args.init not in ["naive", "pointwise", "mlp"]:
        assert False, f"unsupported --init value: {args.init}"

    start_options = tc.MappingOptions(args.init)

    if args.tune:
        if not args.load_cache:
            opts = tc.make_autotuned_options_factory(
                starting_options=start_options,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
        else:
            print("loading from cache...")
            opts = tc.make_autotuned_options_factory(
                load_from_cache=True,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
    else:
        if not args.load_cache:
            opts = tc.make_naive_options_factory()
        else:
            opts = tc.make_load_from_cache_options_factory(
                cache_filename=args.cache_filename)
    kernel_fn_map = {}
    N = args.N
    H = args.height
    W = args.width
    torch.manual_seed(0)
    x = torch.randn(N, H, W, args.channels).double().cuda()
    k_input = torch.randn(N, N, H, W, H, W).double().cuda()
    z = torch.tensor(1.0).double().cuda()
    y = x
    if args.float32:
        x = x.float()
        y = y.float()
        k_input = k_input.float()
        z = z.float()

    for k in matched_kernels:
        print(f"Tuning {k}")
        kernel_fn = tc.define(all_kernel_strs[k], opts)
        kernel_fn_map[k] = kernel_fn
        if "float" in k:
            k_call = getattr(kernel_fn, k.replace("kernel_float_", ""))
        else:
            k_call = getattr(kernel_fn, k.replace("kernel_", ""))

        if "input" in k:
            kxy = k_call(x, y)
            print("output: ", kxy)
        else:
            if "exponential_shifted" in k:
                print("calling exponential shifted")
                kxy = k_call(k_input, k_input, k_input, z)
            else:
                kxy = k_call(k_input, k_input, k_input)