def __init__(self, I, C, K, groups=1, padding=0, bias=False, from_cache=False,
             cache_file='tc_group3d.pt', tuner_config=None):
    '''
    Module providing grouped 3d convolution using tensor comprehensions

    :param I: Number of input channels
    :type I: int
    :param C: Number of output channels
    :type C: int
    :param K: Kernel size
    :type K: tuple or int
    :param groups: Number of groups
    :type groups: int
    :param padding: Amount of input padding
    :type padding: tuple or int
    :param bias: Not implemented
    :type bias: bool
    :param from_cache: If True, load from the specified cache file; if False, perform autotuning
    :type from_cache: bool
    :param cache_file: Path and name of cache file
    :type cache_file: string
    :param tuner_config: Tuner config object to use for auto-tuning
    :type tuner_config: tensor_comprehensions.TunerConfig
    '''
    import torch.nn.functional as F
    super().__init__()
    K = self.int_to_tuple(K)
    padding = self.int_to_tuple(padding)
    group_convolution = self.tc_string()
    if not from_cache:
        if tuner_config is None:
            tuner_config = (tc.TunerConfig().generations(25)
                            .pop_size(100).number_elites(15))
        conv_option = (tc.tclib.MappingOptions('naive')
                       .tile([1, 1])
                       .mapToThreads([4, 16, 4])
                       .mapToBlocks([256, 256])
                       .unroll(1))
        TC = tc.define(group_convolution, tc.make_autotuned_options_factory(
            starting_options=conv_option,
            tuner_config=tuner_config,
            cache_filename=cache_file,
            store_to_cache=True,
            load_from_cache=False))
    else:
        TC = tc.define(group_convolution,
                       tc.make_load_from_cache_options_factory(cache_file))
    self.convolution_grouped = tc.make_autograd(TC.group_convolution,
                                                TC.convolution_grad)
    # Integer division is required here: torch.rand needs integer sizes,
    # and true division (/) would pass floats.
    self.W = torch.nn.Parameter(
        torch.rand(groups, C // groups, I // groups, K[0], K[1], K[2]))
    self.pad = F.pad
    self.groups = groups
    self.padding = padding
    self.K = K
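# The constructor above references two helpers (int_to_tuple, tc_string) and a
# forward pass that are not part of this excerpt. Below is a minimal sketch of
# what they might look like; the padding order and the exact tensor layout
# expected by the TC kernel are assumptions, since tc_string() is not shown.

def int_to_tuple(self, value):
    # Broadcast a scalar to a 3-tuple; pass sequences through unchanged.
    if isinstance(value, int):
        return (value, value, value)
    return tuple(value)

def forward(self, x):
    # Zero-pad the three spatial dimensions (F.pad consumes pairs for the
    # last dimensions first), then invoke the autotuned, autograd-wrapped
    # TC kernel with the grouped weight tensor.
    p = self.padding
    x = self.pad(x, (p[2], p[2], p[1], p[1], p[0], p[0]))
    return self.convolution_grouped(x, self.W)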
def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
    tuner = tc.Tuner(MATMUL_LANG)
    tuner_config = (tc.TunerConfig().generations(3).threads(32)
                    .pop_size(2).tuner_min_launch_total_threads(1))
    matmul_top1 = tuner.tune(
        'matmul',
        (torch.randn(n, k, device='cuda'), torch.randn(k, m, device='cuda')),
        tc.MappingOptions('naive'),
        tuner_config)
    matmul_grad_top1 = tuner.tune(
        'matmul_grad',
        (torch.randn(n, k, device='cuda'),
         torch.randn(k, m, device='cuda'),
         torch.randn(n, m, device='cuda')),
        tc.MappingOptions('naive'),
        tuner_config)

    X = np.random.rand(m, k).astype(np.float32)
    W = np.random.rand(k, n).astype(np.float32)

    def ref(X, W):
        return [np.dot(X, W)]

    op = core.CreateOperator(
        "TcOp", ["X", "Y"], "out",
        tc_def=MATMUL_LANG,
        tc_name="matmul",
        tc_grad_def=MATMUL_LANG,
        tc_grad_name="matmul_grad",
        inputs_used_by_gradient=[0, 1],
        output_gradients_used_by_gradient=[0],
        inputs_to_compute_gradients_of=[0, 1],
        mapping_options=matmul_top1.serialize(),
        grad_mapping_options=matmul_grad_top1.serialize(),
    )
    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, W],
        reference=ref,
    )
    for i in range(2):
        self.assertGradientChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            outputs_to_check=i,
            outputs_with_grads=[0],
        )
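# MATMUL_LANG is referenced but not defined in this excerpt. The TC it must
# contain looks roughly like the following sketch (exact index names in the
# original file may differ): a forward matmul plus the gradient definitions
# that the Caffe2 TcOp above needs.
MATMUL_LANG = """
def matmul(float(M, N) A, float(N, K) B) -> (output) {
    output(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_grad(float(M, N) A, float(N, K) B, float(M, K) d_O) -> (d_A, d_B) {
    d_A(m, n) +=! d_O(m, r_k) * B(n, r_k)
    d_B(n, k) +=! d_O(r_m, k) * A(r_m, n)
}
"""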
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

import unittest
import tempfile

import torch
import torch.cuda

import tensor_comprehensions as tc

tc.SILENT = True

tuner_config = tc.TunerConfig().threads(5).generations(3).pop_size(5)


class TestTC(unittest.TestCase):
    #
    # Self-explanatory
    #
    def test_imports(self):
        from tensor_comprehensions.tclib import logtostderr
        from tensor_comprehensions.tclib import debug_lang
        from tensor_comprehensions.tclib import debug_halide
        from tensor_comprehensions.tclib import debug_tc_mapper
        from tensor_comprehensions.tclib import debug_tuner
        from tensor_comprehensions.tclib import dump_cuda
        from tensor_comprehensions.tclib import CompilationCache
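# For context, the high-level API these tests exercise can be driven end to
# end in a few lines. A minimal sketch using naive mapping options, with
# arbitrary illustrative shapes:
add = tc.define(
    """
    def add(float(N) A, float(N) B) -> (C) {
        C(n) = A(n) + B(n)
    }
    """,
    tc.make_naive_options_factory())
A = torch.randn(100, device='cuda')
B = torch.randn(100, device='cuda')
C = add.add(A, B)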
def build(args: argparse.Namespace,
          tc_str: str,
          entry_point: str,
          *inputs: torch.Tensor) -> tc.Executor:
    tuner_config = (tc.TunerConfig()
                    .threads(args.tuner_threads)
                    .generations(args.tuner_generations)
                    .pop_size(args.tuner_pop_size)
                    .number_elites(args.tuner_number_elites)
                    .devices(args.tuner_devices))
    if args.autotuner:
        if args.debug:
            print("Running autotuner.")
        if args.load_from_cache:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options=None,
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)
        else:
            return tc.autotune_and_compile(
                tc_str,
                entry_point,
                *inputs,
                starting_options='naive',
                tuner_config=tuner_config,
                cache_filename=args.tuner_cache_file,
                load_from_cache=args.load_from_cache,
                store_to_cache=args.store_to_cache)
    elif args.load_from_cache:
        if args.debug:
            print("Loading autotuned mapping options from cache.")
        mapping_options = tc.make_load_from_cache_options_factory(
            args.tuner_cache_file)(tc_str, entry_point, *inputs)
        return tc.compile(tc_str, entry_point, mapping_options, *inputs)
    else:
        if args.debug:
            print("Building mapping options.")
        options = tc.MappingOptions("naive")
        if args.mapToBlocks is not None:
            options.mapToBlocks(args.mapToBlocks)
        if args.mapToThreads is not None:
            options.mapToThreads(args.mapToThreads)
        if args.tile is not None:
            options.tile(args.tile)
        if args.useSharedMemory is not None:
            options.useSharedMemory(args.useSharedMemory)
        if args.maxSharedMemory is not None:
            options.maxSharedMemory(args.maxSharedMemory)
        if args.unroll is not None:
            options.unroll(args.unroll)
        if args.unrollCopyShared is not None:
            options.unrollCopyShared(args.unrollCopyShared)
        if args.useReadOnlyCache is not None:
            options.useReadOnlyCache(args.useReadOnlyCache)
        if args.matchLibraryCalls is not None:
            options.matchLibraryCalls(args.matchLibraryCalls)
        if args.fixParametersBeforeScheduling is not None:
            options.fixParametersBeforeScheduling(
                args.fixParametersBeforeScheduling)
        if args.outerScheduleFusionStrategy is not None:
            options.outerScheduleFusionStrategy(
                args.outerScheduleFusionStrategy)
        if args.intraTileScheduleFusionStrategy is not None:
            options.intraTileScheduleFusionStrategy(
                args.intraTileScheduleFusionStrategy)
        return tc.compile(tc_str, entry_point, options, *inputs)
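# Hypothetical usage of build(). It assumes the surrounding script defines an
# argparse parser supplying the flags referenced above (--autotuner,
# --tuner_threads, --tuner_cache_file, ...); the TC string and input sizes are
# illustrative only.
TC_ADD = """
def add(float(N) A, float(N) B) -> (C) {
    C(n) = A(n) + B(n)
}
"""
A = torch.randn(10 ** 6, device='cuda')
B = torch.randn(10 ** 6, device='cuda')
executor = build(args, TC_ADD, "add", A, B)
C = executor(A, B)  # the returned tc.Executor is callable on same-shaped inputs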
def result(float(K, MAX_L, MAX_L) Beta, float(K, MAX_L, MAX_L) Dots) -> (O, tmpO)
{
    # Triangular compute
    # tmp is necessary because we don't yet support 2-D reductions.
    # But in practice, tmp also gives strictly more parallelism and allows
    # exploiting blocks without cross-block reductions.
    tmpO(k, max_l_1) +=! (max_l_1 >= r_max_l_2) ? 0.0 : Dots(k, max_l_1, r_max_l_2) * Beta(k, max_l_1, r_max_l_2)
    O(k) +=! tmpO(k, max_l_1)
}
'''

###############################################################################
# Implicit compilation and tuning behavior
###############################################################################
tuner_config = (tc.TunerConfig()
                .threads(args.tuner_threads)
                .generations(args.tuner_generations)
                .pop_size(args.tuner_pop_size)
                .number_elites(args.tuner_number_elites)
                .devices(args.tuner_devices))

# This function is used for reinforcing tuning:
# 1. make_idx is small and does not get tuned or saved; just using naive
#    options on it is fine;
# 2. if we find an option in the cache, use it either as is or as a starting
#    point for reinforcement, depending on whether the entry_point is in the
#    reinforcement list;
# 3. dots will benefit from being reinforced a few times (reaching 90us on P100).
reinforce_list = ['']

def generate_options(tc_str: str,
                     entry_point: str,
                     *inputs: torch.Tensor) -> tc.MappingOptions:
    global reinforce
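# The body of generate_options is cut off above. Given the three rules in the
# preceding comment, a factory along these lines would implement them; this is
# a sketch only, and the cache/reinforcement plumbing is an assumption:
def generate_options_sketch(tc_str: str,
                            entry_point: str,
                            *inputs: torch.Tensor) -> tc.MappingOptions:
    # Rule 1: make_idx is tiny, so naive options are good enough and nothing
    # is tuned or cached for it.
    if entry_point == 'make_idx':
        return tc.MappingOptions('naive')
    # Rules 2 and 3: entry points on the reinforcement list are re-tuned
    # starting from the cached options; everything else loads from the cache
    # as-is.
    if entry_point in reinforce_list:
        factory = tc.make_autotuned_options_factory(
            tuner_config=tuner_config,
            cache_filename=args.tuner_cache_file,
            load_from_cache=True,
            store_to_cache=True)
    else:
        factory = tc.make_load_from_cache_options_factory(
            args.tuner_cache_file)
    return factory(tc_str, entry_point, *inputs)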
def main():
    parser = argparse.ArgumentParser(
        "compile + tune + test tensor comp kernels...")
    parser.add_argument("--kernel_name", default=r"kernel_*")
    parser.add_argument("--list", const=True, action="store_const", default=False)
    parser.add_argument("--tune", const=True, action="store_const", default=False)
    parser.add_argument("--exact", const=True, action="store_const", default=False)
    parser.add_argument("--float32", const=True, action="store_const", default=False)
    parser.add_argument("--load_cache", const=True, action="store_const", default=False)
    parser.add_argument("--generations", default=10, type=int)
    parser.add_argument("--cache_filename", default="tc_cache", type=str)
    parser.add_argument("--init", default="naive", type=str)
    parser.add_argument("--threads", default=16, type=int)
    parser.add_argument("--pop_size", default=100, type=int)
    parser.add_argument("--crossover_rate", default=80, type=int)
    parser.add_argument("--mutation_rate", default=7, type=int)
    parser.add_argument("--number_elites", default=10, type=int)
    parser.add_argument("--height", default=32, type=int)
    parser.add_argument("--width", default=32, type=int)
    parser.add_argument("--N", default=8, type=int)
    parser.add_argument("--channels", default=3, type=int)
    parser.add_argument("--num_gpus", default=1, type=int)
    args = parser.parse_args()

    matched_kernels = []
    gpus = ",".join([str(x) for x in range(args.num_gpus)])
    print("devices: ", gpus)

    tuner_config = (tc.TunerConfig()
                    .threads(args.threads)
                    .generations(args.generations)
                    .pop_size(args.pop_size)
                    .crossover_rate(args.crossover_rate)
                    .mutation_rate(args.mutation_rate)
                    .number_elites(args.number_elites))

    for k in all_kernel_strs:
        if not args.exact:
            if re.match(re.compile(args.kernel_name), k):
                matched_kernels.append(k)
        elif k == args.kernel_name:
            matched_kernels.append(k)

    if args.list:
        print("Kernels available:")
        for k in matched_kernels:
            print("\t" + k)

    assert args.init in ["naive", "pointwise", "mlp"], \
        "unknown --init option: {}".format(args.init)
    start_options = tc.MappingOptions(args.init)

    if args.tune:
        if not args.load_cache:
            opts = tc.make_autotuned_options_factory(
                starting_options=start_options,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
        else:
            print("loading from cache...")
            opts = tc.make_autotuned_options_factory(
                load_from_cache=True,
                cache_filename=args.cache_filename,
                store_to_cache=True,
                tuner_config=tuner_config)
    else:
        if not args.load_cache:
            opts = tc.make_naive_options_factory()
        else:
            opts = tc.make_load_from_cache_options_factory(
                cache_filename=args.cache_filename)

    kernel_fn_map = {}
    N = args.N
    H = args.height
    W = args.width

    torch.manual_seed(0)
    x = torch.randn(N, H, W, args.channels).double().cuda()
    k_input = torch.randn(N, N, H, W, H, W).double().cuda()
    z = torch.tensor(1.0).double().cuda()
    y = x
    if args.float32:
        x = x.float()
        y = y.float()
        k_input = k_input.float()
        z = z.float()

    for k in matched_kernels:
        print(f"Tuning {k}")
        kernel_fn = tc.define(all_kernel_strs[k], opts)
        kernel_fn_map[k] = kernel_fn
        if "float" in k:
            k_call = getattr(kernel_fn, k.replace("kernel_float_", ""))
        else:
            k_call = getattr(kernel_fn, k.replace("kernel_", ""))
        if "input" in k:
            kxy = k_call(x, y)
            print("output: ", kxy)
        else:
            if "exponential_shifted" in k:
                print("calling exponential shifted")
                kxy = k_call(k_input, k_input, k_input, z)
            else:
                kxy = k_call(k_input, k_input, k_input)
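# The getattr() calls in main() rely on a naming convention that is only
# implicit above: a registry key such as "kernel_float_scale" must map to a TC
# entry point named "scale". A hypothetical registry entry illustrating the
# convention (the kernel and its name are made up for this example):
example_kernel_strs = {
    "kernel_float_scale": """
    def scale(float(N) X) -> (O) {
        O(n) = 2.0 * X(n)
    }
    """,
}
example_fn = tc.define(example_kernel_strs["kernel_float_scale"],
                       tc.make_naive_options_factory())
scale = getattr(example_fn, "kernel_float_scale".replace("kernel_float_", ""))
out = scale(torch.randn(128, device='cuda'))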