Example #1
def test_rr_interleave():
    '''
        Tests RR interleaving of containers to memory banks
    '''
    @dace.program
    def rr_interleave(A: dace.float32[8], B: dace.float32[8],
                      C: dace.float32[8]):
        return A + B + C

    A = np.random.rand(8).astype(np.float32)
    B = np.random.rand(8).astype(np.float32)
    C = np.random.rand(8).astype(np.float32)

    sdfg = rr_interleave.to_sdfg()
    sdfg.apply_transformations([FPGATransformSDFG])

    # Specifically run the interleave transformation
    allocated = fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

    # Five arrays (one is a temporary containing A + B) are interleaved across the four default banks
    assert allocated == [2, 1, 1, 1]

    R = sdfg(A=A, B=B, C=C)
    assert np.allclose(A + B + C, R)

    return sdfg
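
The returned list holds one entry per off-chip bank, counting how many containers were assigned to it. Example #2 below passes the bank count explicitly; a minimal sketch of the same call on this program, reusing the program and imports from the test above, could look like this:

# Sketch only: reuses rr_interleave, FPGATransformSDFG and fpga_auto_opt from the test above.
sdfg = rr_interleave.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG])
# Round-robin the five containers over two banks instead of the default four.
counts = fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg, num_banks=2)
print(counts)  # one allocation count per bank, e.g. [3, 2]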
Example #2
def test_mem_buffer_bicg():

    A = np.random.rand(N, M).astype(np.float32)
    p = np.random.rand(M).astype(np.float32)
    r = np.random.rand(M).astype(np.float32)

    # Parse the SDFG and apply FPGA-friendly optimizations
    sdfg = bicg.to_sdfg(strict=True)
    applied = sdfg.apply_transformations([FPGATransformSDFG])
    assert applied == 1

    fpga_rr_interleave_containers_to_banks(sdfg, num_banks=4)

    # Use the FPGA expansion for library nodes and expand them to enable further optimizations
    from dace.libraries.blas import Gemv
    Gemv.default_implementation = "FPGA_Accumulate"
    sdfg.expand_library_nodes()
    sm_applied = sdfg.apply_transformations_repeated(
        [InlineSDFG, sm.StreamingMemory], [{}, {
            'storage': dace.StorageType.FPGA_Local,
            'use_memory_buffering': True
        }],
        print_report=True)
    assert sm_applied == 7  # 3 inlines and 4 Streaming memories

    sm_applied = sdfg.apply_transformations_repeated(
        [InlineSDFG, sm.StreamingMemory], [{}, {
            'storage': dace.StorageType.FPGA_Local,
            'use_memory_buffering': False
        }],
        print_report=True)

    assert sm_applied == 1  # 1 Streaming memory

    # Specialize the SDFG (needed by the GEMV expansion)
    sdfg.specialize(dict(M=M, N=N))

    res0, res1 = sdfg(A=A, p=p, r=r)

    # Compute the ground truth and validate the result
    res0_ref, res1_ref = bicg.f(A, p, r)

    assert np.allclose(res0, res0_ref)
    assert np.allclose(res1, res1_ref)

    return sdfg
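
The Gemv expansion is selected above through the class-level default before expand_library_nodes() runs. If only some nodes should use the FPGA expansion, the implementation can be chosen per node instead; a minimal sketch under that assumption:

# Sketch only: per-node implementation choice, assumed equivalent (for the nodes it touches)
# to setting Gemv.default_implementation as in the test above.
from dace.libraries.blas import Gemv

for node, _ in sdfg.all_nodes_recursive():
    if isinstance(node, Gemv):
        node.implementation = "FPGA_Accumulate"  # pick the accumulating FPGA expansion for this node
sdfg.expand_library_nodes()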
Example #3
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Strict transformations
        * Strict auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    # Strict transformations and loop parallelization
    transformed = True
    while transformed:
        sdfg.apply_strict_transformations(validate=False,
                                          validate_all=validate_all)

        xfh.split_interstate_edges(sdfg)

        # Try to parallelize loops
        l2ms = sdfg.apply_transformations_repeated(LoopToMap,
                                                   strict=True,
                                                   validate=False,
                                                   validate_all=validate_all)
        transformed = l2ms > 0

    # Map fusion
    greedy_fuse(sdfg, validate_all)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_aopt.fpga_global_to_local(sdfg)
        fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        strict=True,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry):
            node.map.collapse = len(node.map.range)

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections
    # TODO(later): Set on a per-SDFG basis
    config.Config.set('compiler', 'cpu', 'openmp_sections', value=False)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
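
The loop-to-map step listed in the docstring is what parallelizes plain counted loops written in the frontend; an illustrative call site (the program and sizes are made up for this sketch):

# Illustrative only: a hypothetical program whose loop LoopToMap can convert into a map.
@dace.program
def add_one(x: dace.float64[128], y: dace.float64[128]):
    for i in range(128):  # independent iterations
        y[i] = x[i] + 1.0

sdfg = add_one.to_sdfg()
auto_optimize(sdfg, dtypes.DeviceType.CPU)  # operates in place and returns the same SDFG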
Example #4
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to int/float
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination,
                                        validate=validate,
                                        validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated(
            (LoopToMap, RefineNestedAccess),
            validate=False,
            validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)

    # Apply GPU transformations
    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # fuse subgraphs greedily
    sdfg.simplify()

    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # fuse stencils greedily
    greedy_fuse(sdfg,
                device=device,
                validate_all=validate_all,
                recursive=False,
                stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        # Set OMP collapse property to map length
        if isinstance(node, nodes.MapEntry):
            # FORNOW: Leave out
            # node.map.collapse = len(node.map.range)
            pass

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols that are still free in the SDFG
        known_symbols = {}
        for (s, v) in symbols.items():
            if s in sdfg.free_symbols:
                if isinstance(v, (int, float)):
                    known_symbols[s] = v
                elif isinstance(v, sympy.core.numbers.Integer):
                    try:
                        known_symbols[s] = int(v)
                    except TypeError:
                        pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Fix storage and allocation properties, e.g., for benchmarking purposes
    # FORNOW: Leave out
    # make_transients_persistent(sdfg, device)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
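
Compared to Example #3, this version additionally accepts a symbols dictionary so that known symbolic sizes are specialized during optimization. A minimal sketch of a call site (the program and the value of N are illustrative):

# Illustrative only: specializing a hypothetical program for a known size.
N = dace.symbol('N')

@dace.program
def scale(x: dace.float64[N]):
    x[:] = 2.0 * x

sdfg = scale.to_sdfg()
auto_optimize(sdfg, dtypes.DeviceType.CPU, symbols={'N': 4096})  # N is substituted via sdfg.specialize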