def main() -> None:
    parser = argparse.ArgumentParser(description='Generate ATen source files')
    parser.add_argument(
        '-s', '--source-path',
        help='path to source directory for ATen',
        default='aten/src/ATen')
    parser.add_argument(
        '-o', '--output-dependencies',
        help='output a list of dependencies into the given file and exit')
    parser.add_argument(
        '-d', '--install_dir',
        help='output directory',
        default='build/aten/src/ATen')
    parser.add_argument(
        '--rocm',
        action='store_true',
        help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly')
    # TODO: --op_registration_whitelist will be removed when all call-sites
    # for gen.py are moved over to using the operator YAML file for mobile
    # custom build.
    parser.add_argument(
        '--op_registration_whitelist',
        nargs='*',
        help='filter op registrations by the whitelist (if set); '
             'each item is `namespace`::`operator name` without overload name; '
             'e.g.: aten::empty aten::conv2d ...')
    parser.add_argument(
        '--op_selection_yaml_path',
        help='Provide a path to the operator selection (for custom build) YAML '
             'that contains the information about the set of selected operators '
             'and their categories (training, ...). Each operator is either a '
             'full operator name with overload or just a bare operator name. '
             'The operator names also contain the namespace prefix (e.g. aten::)')
    parser.add_argument(
        '--backend_whitelist',
        nargs='*',
        help='filter dispatch backend by the whitelist (if set), '
             'e.g.: CPU CUDA QuantizedCPU ...')
    parser.add_argument(
        '--static_dispatch_backend',
        help='generate static dispatch code for the specific backend (if set)')
    parser.add_argument(
        '--force_schema_registration',
        action='store_true',
        help='force it to generate schema-only registrations for all ops, including '
             'those that are not listed on --op_registration_whitelist')
    options = parser.parse_args()

    selector = get_custom_build_selector(
        options.op_registration_whitelist,
        options.op_selection_yaml_path,
    )

    native_functions = parse_native_yaml(
        os.path.join(options.source_path, 'native/native_functions.yaml'))

    pre_grouped_native_functions: Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]]
    pre_grouped_native_functions = defaultdict(dict)
    for f in native_functions:
        d = pre_grouped_native_functions[f.func.signature()]
        assert f.func.kind() not in d
        d[f.func.kind()] = f

    def flatten_pre_group(
            d: Dict[SchemaKind, NativeFunction]
    ) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]:
        r = NativeFunctionsGroup.from_dict(d)
        if r is None:
            return list(d.values())
        else:
            return [r]

    # TODO: how come ValuesView isn't a Sequence lol
    grouped_native_functions = list(concatMap(
        flatten_pre_group, list(pre_grouped_native_functions.values())))
    structured_native_functions = [
        g for g in grouped_native_functions
        if isinstance(g, NativeFunctionsGroup)
    ]

    template_dir = os.path.join(options.source_path, "templates")

    # NB: It is mandatory to NOT use os.path.join here, as the install directory
    # will eventually be ingested by cmake, which does not respect Windows style
    # path slashes. If you switch this to use os.path.join, you'll get an error
    # like:
    #
    #   Syntax error in cmake code when parsing string
    #
    #     C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h
    #
    #   Invalid character escape '\c'.
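    # For example, on Windows os.path.join(options.install_dir, 'core') would
    # join the components with a backslash ('...\core'), which cmake rejects;
    # the f-string below always emits a forward slash.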
    core_install_dir = f'{options.install_dir}/core'
    pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True)

    def make_file_manager(install_dir: str) -> FileManager:
        return FileManager(
            install_dir=install_dir,
            template_dir=template_dir,
            dry_run=options.output_dependencies)

    core_fm = make_file_manager(core_install_dir)
    cpu_fm = make_file_manager(options.install_dir)
    cuda_fm = make_file_manager(options.install_dir)

    extra_cuda_headers = '''\
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDADevice.h>
#include <ATen/cuda/CUDAContext.h>'''
    if options.rocm:
        extra_cuda_headers = '''\
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <ATen/hip/ATenHIPGeneral.h>
#include <ATen/hip/HIPDevice.h>
#include <ATen/hip/HIPContext.h>'''

    dispatch_keys = [
        DispatchKey.CPU,
        DispatchKey.SparseCPU,
        DispatchKey.SparseCsrCPU,
        DispatchKey.MkldnnCPU,
        DispatchKey.CUDA,
        DispatchKey.SparseCUDA,
        DispatchKey.SparseCsrCUDA,
        DispatchKey.QuantizedCPU,
        DispatchKey.QuantizedCUDA,
        DispatchKey.CompositeImplicitAutograd,
        DispatchKey.CompositeExplicitAutograd,
        # Meta is a magic key: it is automatically generated for structured
        # kernels
        DispatchKey.Meta,
    ]
    # Only a limited set of dispatch keys get CPUFunctions.h headers generated
    # for them; this is the set
    functions_keys = {
        DispatchKey.CPU,
        DispatchKey.CUDA,
        DispatchKey.CompositeImplicitAutograd,
        DispatchKey.CompositeExplicitAutograd,
    }
    if options.backend_whitelist:
        dispatch_keys = [
            k for k in dispatch_keys
            if is_generic_dispatch_key(k) or str(k) in options.backend_whitelist
        ]

    static_dispatch_backend: Optional[DispatchKey] = None
    if options.static_dispatch_backend:
        static_dispatch_backend = DispatchKey.parse(options.static_dispatch_backend)

    for dispatch_key in dispatch_keys:
        fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm

        fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
            'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '',
            'legacy_th_headers':
                '#include <ATen/LegacyTHFunctionsCPU.h>' if dispatch_key == DispatchKey.CPU else
                '#include <ATen/LegacyTHFunctionsCUDA.h>' if dispatch_key == DispatchKey.CUDA else
                '',
            'DispatchKey': dispatch_key,
            'dispatch_namespace': dispatch_key.lower(),
            'dispatch_namespaced_definitions': list(concatMap(
                dest.RegisterDispatchKey(
                    dispatch_key, Target.NAMESPACED_DEFINITION, selector, rocm=options.rocm),
                grouped_native_functions)),
            'dispatch_anonymous_definitions': list(concatMap(
                dest.RegisterDispatchKey(
                    dispatch_key, Target.ANONYMOUS_DEFINITION, selector, rocm=options.rocm),
                grouped_native_functions)),
            'dispatch_registrations': list(concatMap(
                dest.RegisterDispatchKey(
                    dispatch_key, Target.REGISTRATION, selector, rocm=options.rocm),
                grouped_native_functions)),
        })

        if dispatch_key in functions_keys:
            fm.write_with_template(f'{dispatch_key}Functions.h', 'DispatchKeyFunctions.h', lambda: {
                'dispatch_namespace': dispatch_key.lower(),
                'dispatch_namespaced_declarations': list(concatMap(
                    dest.RegisterDispatchKey(
                        dispatch_key, Target.NAMESPACED_DECLARATION, selector, rocm=options.rocm),
                    grouped_native_functions)),
            })

        # Make sure the per-key FileManager doesn't leak past the loop body.
        del fm

    # BackendSelect is generated specially
    cpu_fm.write('RegisterBackendSelect.cpp', lambda: {
        'backend_select_method_definitions':
            list(mapMaybe(ComputeBackendSelect(Target.DEFINITION), native_functions)),
        'backend_select_function_registrations':
            list(mapMaybe(ComputeBackendSelect(Target.REGISTRATION), native_functions)),
    })
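    # MetaFunctions.h holds the at::meta:: declarations backing structured
    # kernels, which is why the write below is driven by
    # structured_native_functions rather than by the full native_functions list.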
    cpu_fm.write('MetaFunctions.h', lambda: {
        'declarations': list(mapMaybe(
            compute_meta_function_declaration, structured_native_functions)),
    })

    schema_selector = selector
    if options.force_schema_registration:
        schema_selector = SelectiveBuilder.get_nop_selector()
    cpu_fm.write('RegisterSchema.cpp', lambda: {
        'schema_registrations': list(mapMaybe(RegisterSchema(schema_selector), native_functions)),
    })

    cpu_fm.write('Functions.h', lambda: {
        'function_declarations': list(mapMaybe(ComputeFunction(
            Target.DECLARATION,
            static_dispatch_backend=static_dispatch_backend,
            is_redispatching_fn=False), native_functions)),
    })
    cpu_fm.write('Functions.cpp', lambda: {
        'static_dispatch_extra_headers': static_dispatch_extra_headers(static_dispatch_backend),
        'function_definitions': list(mapMaybe(ComputeFunction(
            Target.DEFINITION,
            static_dispatch_backend=static_dispatch_backend,
            is_redispatching_fn=False), native_functions)),
    })
    cpu_fm.write('RedispatchFunctions.h', lambda: {
        'function_redispatch_declarations': list(mapMaybe(ComputeFunction(
            Target.DECLARATION,
            static_dispatch_backend=static_dispatch_backend,
            is_redispatching_fn=True), native_functions)),
    })
    cpu_fm.write('RedispatchFunctions.cpp', lambda: {
        'static_dispatch_extra_headers': static_dispatch_extra_headers(static_dispatch_backend),
        'function_redispatch_definitions': list(mapMaybe(ComputeFunction(
            Target.DEFINITION,
            static_dispatch_backend=static_dispatch_backend,
            is_redispatching_fn=True), native_functions)),
    })
    core_fm.write('TensorBody.h', lambda: {
        'tensor_method_declarations': list(mapMaybe(ComputeTensorMethod(
            Target.DECLARATION,
            static_dispatch_backend=static_dispatch_backend), native_functions)),
    })
    core_fm.write('TensorMethods.cpp', lambda: {
        'static_dispatch_extra_headers': static_dispatch_extra_headers(static_dispatch_backend),
        'tensor_method_definitions': list(mapMaybe(ComputeTensorMethod(
            Target.DEFINITION,
            static_dispatch_backend=static_dispatch_backend), native_functions)),
    })
    core_fm.write('ATenOpList.cpp', lambda: {
        'aten_ops': list(mapMaybe(compute_aten_op, native_functions)),
    })
    cpu_fm.write('NativeFunctions.h', lambda: {
        'native_function_declarations': list(concatMap(
            dest.compute_native_function_declaration, grouped_native_functions)),
    })

    cpu_fm.write('Declarations.yaml', lambda: format_yaml(
        [compute_declaration_yaml(f) for f in native_functions]))
    cpu_fm.write('RegistrationDeclarations.h', lambda: {
        'registration_declarations':
            [compute_registration_declarations(f) for f in native_functions],
    })

    if options.output_dependencies:
        cpu_fm.write_outputs(options.output_dependencies)
        core_fm.write_outputs(f"{options.output_dependencies}-core")
        cuda_fm.write_outputs(f"{options.output_dependencies}-cuda")
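
# Example invocation, assuming this module doubles as a script named gen.py
# (the flags correspond to the parser defined in main() above):
#
#   python gen.py -s aten/src/ATen -d build/aten/src/ATen --backend_whitelist CPU CUDA
#
if __name__ == '__main__':
    main()
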
def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]:
    return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out))
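
# For reference, the two combinators used throughout this file have the
# following semantics -- a minimal sketch; the real helpers live in the
# codegen's utility module:
#
#   def mapMaybe(func, xs):    # map `func` over `xs`, dropping None results
#       for x in xs:
#           r = func(x)
#           if r is not None:
#               yield r
#
#   def concatMap(func, xs):   # map `func` over `xs`, flattening one level
#       for x in xs:
#           for r in func(x):
#               yield r
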
def emit_inplace_functionalization_body(
        f: NativeFunction, functional_op: Optional[NativeFunction]) -> str:
    # mutation case
    assert modifies_arguments(f)

    dispatcher_sig = DispatcherSignature.from_schema(f.func)
    keyset = 'dispatchKeySet & c10::after_func_keyset'
    return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type()

    unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args(dispatcher_sig)

    maybe_return = '' if len(f.func.returns) == 0 else 'return '
    sync_tensor_args = '\n      '.join(mapMaybe(
        lambda arg: f'at::functionalization::impl::sync({arg.name});'
                    if arg.type.is_tensor_like() else None,
        f.func.arguments.flat_all))

    # Note [functionalizing copy_() and not preserving strides]
    # copy_() can't be functionalized, since there is no out-of-place variant.
    # We could add one, but that would be sub-optimal for functorch: copy() would need to allocate a fresh tensor.
    # This may seem like a large hack for one optimization, but copy_() is one of the most common inplace operators.
    # Instead, we can replace `self.copy_(src)` with `src.to(self).expand_as(self)`.
    # This maintains the exact same semantics, EXCEPT that we don't preserve the strides from `self`.
    # This seems like a reasonable tradeoff, for a few reasons:
    # - mutation removal is only used by functorch, and not by Vulkan or XLA. Functorch already doesn't preserve strides.
    # - There are actually a few other places where the functionalization pass currently doesn't support strides:
    #   calls to slice/diagonal_scatter don't currently preserve the strides of their inputs (but maybe we should fix this).
    if str(f.func.name) == 'copy_':
        exprs = [keyset] + [a.name for a in unwrapped_args_ctx]
        functional_call_str = f"""\
            auto tmp_intermediate = at::_ops::to_other::redispatch({keyset}, src_, self_, non_blocking, false, c10::nullopt);
            tmp_output = at::_ops::expand_as::redispatch({keyset}, tmp_intermediate, self_);"""
    elif functional_op is None:
        # We can't functionalize this inplace op, since we don't know what the corresponding functional op is.
        inplace_exprs = [keyset] + [
            e.expr for e in translate(unwrapped_args_ctx, dispatcher_sig.arguments(), method=False)
        ]
        warn_str = (
            "Note: the functionalization pass encountered an operator ({}) that it could not functionalize, "
            "because it couldn't find an out-of-place equivalent of the operator to call. "
            "Instead, it's calling the inplace/view operator directly. "
            "If this causes problems in your program, consider upstreaming the out-of-place op to PyTorch."
        ).format(str(f.func.name))

        return f"""
      if (c10::impl::tls_local_dispatch_key_set().included_.has(c10::DispatchKey::Functionalize)) {{
          TORCH_WARN("{warn_str}");
      }}
      {sync_tensor_args}
      {unwrap_tensor_args_str}
      at::AutoDispatchSkipFunctionalize guard;
      // Redispatch as normal otherwise, since XLA has its own lowerings for special inplace ops.
      {maybe_return}at::_ops::{f.func.name.unambiguous_name()}::redispatch({', '.join(inplace_exprs)});
"""
    else:
        # Call the out-of-place variant of the op.
        functional_sig = DispatcherSignature.from_schema(functional_op.func)
        functional_exprs = [keyset] + [
            e.expr for e in translate(unwrapped_args_ctx, functional_sig.arguments(), method=False)
        ]
        functional_call_str = (
            f"tmp_output = at::_ops::{functional_op.func.name.unambiguous_name()}"
            f"::redispatch({', '.join(functional_exprs)});")

    mutable_input_post_processing = '\n'.join([
        f"""
      auto {a.name}_functional = at::functionalization::impl::unsafeGetFunctionalWrapper({a.name});
      {a.name}_functional->replace_(tmp_output);
      {a.name}_functional->commit_update();"""
        for a in f.func.arguments.flat_non_out
        if a.annotation and a.annotation.is_write and a.type.is_tensor_like()
    ])
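
    # Shared epilogue -- a minimal sketch, assuming the usual functionalization
    # pattern (and assuming, as is conventional for ATen inplace ops, that the
    # op returns its mutated argument, so returning tmp_output is well-typed):
    # sync and unwrap the inputs, run the functional op with functionalization
    # disabled, then commit the result back into each mutated input's wrapper.
    return f"""
      {sync_tensor_args}
      {unwrap_tensor_args_str}
      {return_type} tmp_output;
      {{
          at::AutoDispatchSkipFunctionalize guard;
          {functional_call_str}
      }}
      {mutable_input_post_processing}
      {maybe_return}tmp_output;"""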