def maybe_partition_graph(gm: GraphModule):
    """Partition ``gm`` for nvFuser if any of its call_function nodes is unsupported.

    Returns a ``(graph_module, any_unsupported)`` pair. When every
    call_function node is supported by nvFuser the input module is returned
    untouched; otherwise a deep copy is partitioned and fused (the
    partitioner mutates the graph in place, hence the copy).
    """
    supported_ops = NvfuserPrimOperatorSupport()
    # The graph is partitioned only if at least one node is not supported by nvFuser.
    any_unsupported = any(
        not supported_ops.is_node_supported(None, node)
        for node in gm.graph.nodes
        if node.op == "call_function"
    )
    if not any_unsupported:
        return gm, any_unsupported

    # CapabilityBasedPartitioner modifies the graph in-place, so work on a copy.
    gm = deepcopy(gm)
    partitioner = CapabilityBasedPartitioner(
        gm, supported_ops, allows_single_node_partition=True
    )
    partitions = partitioner.propose_partitions()
    if not partitions:
        warn(
            "No partition found for the graph. "
            "This is likely because the graph is not supported by nvFuser. "
            "Please use the eager ATen mode to execute the graph.",
            category=RuntimeWarning,
        )
    partitioned_graph = partitioner.fuse_partitions(partitions)
    return partitioned_graph, any_unsupported
def compile(self, graph_module: GraphModule) -> GraphModule:
    """Entry function for the nvFuser backend.

    Partitions ``graph_module`` into nvFuser-supported subgraphs (with a
    per-module cache) and reroutes each fused submodule's call through
    ``self.lower_to_prims_and_execute``.
    """
    # BUG FIX: the original passed graph_module.code as a stray positional
    # argument with no placeholder in the format string, so logging raised
    # an internal formatting error and never rendered the code. Use lazy
    # %-formatting instead.
    logging.debug("Compiling graph_module: %s", graph_module.code)

    # FX graph based partitioning based on nvfuser supported ops.
    if graph_module in self.partitioner_cache:
        logging.debug("partitioner_cache hit!")
        fused_graph_module = self.partitioner_cache[graph_module]
    else:
        partitioner = CapabilityBasedPartitioner(
            graph_module, self.supported_ops, allows_single_node_partition=False)
        fused_graph_module = partitioner.partition_and_fuse()
        self.partitioner_cache[graph_module] = fused_graph_module

    # Overriding fused_module's __call__() function with lower_to_prims_and_execute().
    for node in fused_graph_module.graph.nodes:
        # TODO: use a better way to identify fused submodule
        if node.op == "call_module" and "fused_" in node.name:
            fused_module = getattr(fused_graph_module, node.name)
            fused_module._wrapped_call = self.lower_to_prims_and_execute

    return fused_graph_module
def test_partitioner_xfail(self, fn, expected_partition):
    """Expected-failure case: the proposed partition count must NOT match."""
    graph = symbolic_trace(fn)
    partitioner = CapabilityBasedPartitioner(
        graph, MockOperatorSupport(), allows_single_node_partition=True
    )
    proposed = partitioner.propose_partitions()
    proposed_names = [[n.name for n in p.nodes] for p in proposed]
    # The mismatch is the expected outcome, so the assert must raise.
    with self.assertRaises(Exception):
        assert len(proposed_names) == len(expected_partition)
def partition_cudagraphs(gm, inputs):
    """Partition an FX graph into sub-GraphModules that can be validly run
    under CUDA graphs.

    For a subgraph to be runnable under CUDA, all of the operations must
    involve CUDA tensors only.
    """
    # Propagate fake tensors first so the support check can see metadata.
    FakeTensorProp(gm).propagate(*inputs)
    # TODO: single node partition may be wrong due to the pessimization
    # from copying in and out the data. Check in benchmarks, perhaps
    partitioner = CapabilityBasedPartitioner(
        gm, CudaGraphsSupport(), allows_single_node_partition=True
    )
    return partitioner.fuse_partitions(partitioner.propose_partitions())
def test_partitioner(self, fn, expected_partition):
    """Partitions must match the expectation and the fused graph must be
    numerically equivalent to the original function."""
    traced = symbolic_trace(fn)
    partitioner = CapabilityBasedPartitioner(
        traced, MockOperatorSupport(), allows_single_node_partition=True
    )
    partitions = partitioner.propose_partitions()

    # Compare as sets of node names, partition by partition.
    actual = [{n.name for n in p.nodes} for p in partitions]
    assert len(actual) == len(expected_partition)
    for got, want in zip(actual, expected_partition):
        assert got == set(want)

    # The fused graph must compute exactly what the original function does.
    fused = partitioner.fuse_partitions(partitions)
    args = [torch.rand(4) for _ in range(3)]
    torch.testing.assert_close(fn(*args), fused(*args))