예제 #1
0
def _mutate_with_block_callee(blocks, blk_start, blk_end, inputs, outputs):
    """Install callee-side entry and exit blocks for a with-context.

    *blocks* is modified in place: the entry at *blk_start* is replaced by
    the block returned from ``ir_utils.fill_callee_prologue`` and the entry
    at *blk_end* by the block returned from
    ``ir_utils.fill_callee_epilogue``.  Both new blocks borrow the scope and
    location of the lowest-labelled block in *blocks*.

    Parameters
    ----------
    blocks : dict[ir.Block]
        Blocks of the with-context body, keyed by label.
    blk_start, blk_end : int
        Labels of the starting and ending block of the context-manager.
    inputs : sequence[str]
        Input variable names.
    outputs : sequence[str]
        Output variable names.

    Raises
    ------
    errors.NumbaValueError
        If *blocks* is empty.
    """
    if not blocks:
        raise errors.NumbaValueError("No blocks in with-context block")

    first_label = min(blocks)
    first_block = blocks[first_label]
    scope, loc = first_block.scope, first_block.loc

    # Entry block: handed the inputs and the label of the first body block.
    prologue = ir_utils.fill_callee_prologue(
        block=ir.Block(scope=scope, loc=loc),
        inputs=inputs,
        label_next=first_label,
    )
    blocks[blk_start] = prologue

    # Exit block: handed the outputs.
    epilogue = ir_utils.fill_callee_epilogue(
        block=ir.Block(scope=scope, loc=loc),
        outputs=outputs,
    )
    blocks[blk_end] = epilogue
예제 #2
0
def _mutate_with_block_caller(dispatcher, blocks, blk_start, blk_end,
                              inputs, outputs):
    """Build a new block that calls into the lifted with-context.

    The new block takes its scope and location from ``blocks[blk_start]``
    and is populated by ``ir_utils.fill_block_with_call``.  *blocks* itself
    is only read.

    Parameters
    ----------
    dispatcher : Dispatcher
        The callee passed to ``fill_block_with_call``.
    blocks : dict[ir.Block]
    blk_start, blk_end : int
        Labels of the starting and ending block of the context-manager.
    inputs : sequence[str]
        Input variable names.
    outputs : sequence[str]
        Output variable names.

    Returns
    -------
    ir.Block
        The freshly created calling block.
    """
    start_block = blocks[blk_start]
    call_block = ir.Block(scope=start_block.scope, loc=start_block.loc)
    ir_utils.fill_block_with_call(
        newblock=call_block,
        callee=dispatcher,
        label_next=blk_end,
        inputs=inputs,
        outputs=outputs,
    )
    return call_block
예제 #3
0
파일: utils.py 프로젝트: kozlov-alexey/hpat
def include_new_blocks(blocks,
                       new_blocks,
                       label,
                       new_body,
                       remove_non_return=True,
                       work_list=None,
                       func_ir=None):
    """Splice *new_blocks* into *blocks* after the block labelled *label*.

    The new blocks are relabelled past ``ir_utils._max_label`` and merged
    into *blocks*.  The block at *label* gets *new_body* (extended with a
    jump to the first spliced block) as its body, and a fresh empty
    continuation block is created.

    Parameters
    ----------
    blocks : dict
        Label -> block mapping of the host function; updated in place.
    new_blocks : dict
        Label -> block mapping to splice in.
    label : int
        Label of the host block whose body is replaced.
    new_body : list
        Statements for the host block; a jump is appended to it in place.
    remove_non_return : bool
        When true, the last spliced block (in topological order) is passed
        to ``remove_return_from_block`` and then given a jump to the
        continuation block.
    work_list : list or None
        If given, ``(label, block)`` pairs for every spliced block are
        appended to it, after the block's scope is set to the host scope
        and its definitions are registered on *func_ir*.
    func_ir : FunctionIR or None
        Only used when *work_list* is given.

    Returns
    -------
    int
        Label of the newly created continuation block.
    """
    spliced = add_offset_to_labels(new_blocks, ir_utils._max_label + 1)
    blocks.update(spliced)
    ir_utils._max_label = max(blocks)

    host_block = blocks[label]
    scope, loc = host_block.scope, host_block.loc

    order = find_topo_order(spliced)
    entry_label, exit_label = order[0], order[-1]
    if remove_non_return:
        remove_return_from_block(spliced[exit_label])

    # Host block now ends by jumping into the spliced region.
    new_body.append(ir.Jump(entry_label, loc))
    host_block.body = new_body

    # Empty block that execution continues in after the spliced region.
    continuation = ir_utils.next_label()
    blocks[continuation] = ir.Block(scope, loc)
    if remove_non_return:
        spliced[exit_label].body.append(ir.Jump(continuation, loc))

    if work_list is not None:
        for inner_label in find_topo_order(spliced):
            inner_block = spliced[inner_label]
            inner_block.scope = scope
            numba.core.inline_closurecall._add_definitions(func_ir,
                                                           inner_block)
            work_list.append((inner_label, inner_block))
    return continuation
예제 #4
0
 def insert_block(self, offset, scope=None, loc=None):
     """Create an empty block at *offset* and make it the current block.

     A falsy *scope* falls back to ``self.current_scope`` and a falsy *loc*
     to ``self.loc``.  The new block is stored in ``self.blocks[offset]``,
     recorded as ``self.current_block`` / ``self.current_block_offset``,
     and returned.
     """
     if not scope:
         scope = self.current_scope
     if not loc:
         loc = self.loc
     fresh = ir.Block(scope=scope, loc=loc)
     self.blocks[offset] = fresh
     self.current_block = fresh
     self.current_block_offset = offset
     return fresh
예제 #5
0
def _loop_lift_prepare_loop_func(loopinfo, blocks):
    """
    Transform the loop blocks in place so they can serve as a lifted loop.

    A prologue block (from ``ir_utils.fill_callee_prologue``) is stored under
    a label one below the current minimum, and the block at
    ``loopinfo.returnto`` is replaced by an epilogue block (from
    ``ir_utils.fill_callee_epilogue``).  Both new blocks take their scope
    and location from the block at ``loopinfo.callfrom``.
    """
    call_block = blocks[loopinfo.callfrom]
    scope, loc = call_block.scope, call_block.loc

    # Lowering assumes the first block to be the one with the smallest offset
    prologue_label = min(blocks) - 1
    prologue = ir_utils.fill_callee_prologue(
        block=ir.Block(scope=scope, loc=loc),
        inputs=loopinfo.inputs,
        label_next=loopinfo.callfrom,
    )
    blocks[prologue_label] = prologue

    epilogue = ir_utils.fill_callee_epilogue(
        block=ir.Block(scope=scope, loc=loc),
        outputs=loopinfo.outputs,
    )
    blocks[loopinfo.returnto] = epilogue
예제 #6
0
파일: test_ir.py 프로젝트: zsoltc89/numba
 def gen_block():
     """Return a new block holding three ``ir.Assign`` statements.

     The block lives in a parentless scope; the variables and locations
     come from the enclosing test instance (``self``).
     """
     root_scope = ir.Scope(None, self.loc1)
     block = ir.Block(root_scope, self.loc2)
     statements = (
         ir.Assign(self.var_a, self.var_b, self.loc3),
         ir.Assign(self.var_a, self.var_c, self.loc3),
         ir.Assign(self.var_c, self.var_b, self.loc3),
     )
     for stmt in statements:
         block.append(stmt)
     return block
예제 #7
0
def _run_block_rewrite(blocks, states, handler):
    """Run the SSA block pass over every block and return rewritten copies.

    For each ``(label, block)`` in *blocks*, ``states["label"]`` and
    ``states["block"]`` are updated, ``_run_ssa_block_pass`` is invoked with
    *handler*, and the statements it yields become the body of a new block
    with the same scope and location.  The input blocks are not replaced.

    Returns
    -------
    dict
        Label -> newly built block.
    """
    rewritten = {}
    for label, source_blk in blocks.items():
        _logger.debug("==== SSA block rewrite pass on %s", label)
        target_blk = ir.Block(scope=source_blk.scope, loc=source_blk.loc)

        states["label"] = label
        states["block"] = source_blk

        collected = []
        for stmt in _run_ssa_block_pass(states, source_blk, handler):
            assert stmt is not None
            collected.append(stmt)

        target_blk.body = collected
        rewritten[label] = target_blk
    return rewritten
예제 #8
0
def _loop_lift_modify_call_block(liftedloop, block, inputs, outputs, returnto):
    """
    Build the replacement for the top-level calling block so that it calls
    the lifted loop.

    A new block with the scope and location of *block* is populated by
    ``ir_utils.fill_block_with_call`` (callee *liftedloop*, continuing at
    *returnto*) and returned; *block* itself is left untouched.
    """
    call_block = ir.Block(scope=block.scope, loc=block.loc)
    ir_utils.fill_block_with_call(
        newblock=call_block,
        callee=liftedloop,
        label_next=returnto,
        inputs=inputs,
        outputs=outputs,
    )
    return call_block
예제 #9
0
def _bypass_with_context(blocks, blk_start, blk_end, forwardvars):
    """Replace the head block of a with-context by one that skips to its end.

    The replacement block first emits one ``ir.Assign`` per item of
    *forwardvars* (value looked up from the key, target from the mapped
    name, both via ``scope.get_exact``) and then jumps to *blk_end*.

    *blocks* is modified inplace.
    """
    head = blocks[blk_start]
    scope, loc = head.scope, head.loc

    bypass = ir.Block(scope=scope, loc=loc)
    for src_name, dst_name in forwardvars.items():
        src_var = scope.get_exact(src_name)
        dst_var = scope.get_exact(dst_name)
        bypass.append(ir.Assign(value=src_var, target=dst_var, loc=loc))
    bypass.append(ir.Jump(target=blk_end, loc=loc))

    blocks[blk_start] = bypass
예제 #10
0
파일: utils.py 프로젝트: kozlov-alexey/hpat
def inline_new_blocks(func_ir, block, i, callee_blocks, work_list=None):
    """Inline *callee_blocks* in place of statement *i* of *block*.

    Adopted from ``inline_closure_call``.  The caller block is split around
    statement *i*: the head keeps the statements before it and jumps into
    the (relabelled, CFG-simplified) callee blocks; the statements after it
    move to a new tail block.  Callee returns are rewritten by
    ``_replace_returns`` using the statement's target and the tail label.

    Parameters
    ----------
    func_ir : FunctionIR
        Host IR; its ``blocks`` and ``_definitions`` are updated in place.
    block : ir.Block
        Caller block containing the statement to replace.
    i : int
        Index of that statement in ``block.body``.
    callee_blocks : dict
        Label -> block mapping to inline.
    work_list : list or None
        If given, ``(label, block)`` pairs for the tail block and every
        inlined block are appended to it.

    Returns
    -------
    dict
        The relabelled and simplified callee blocks.
    """
    caller_scope = block.scope
    call_stmt = block.body[i]

    # 1. relabel the callee blocks by adding an offset, then simplify them
    shifted = add_offset_to_labels(callee_blocks, ir_utils._max_label + 1)
    callee_blocks = ir_utils.simplify_CFG(shifted)
    #    reset globals in ir_utils before we use it
    ir_utils._max_label = max(callee_blocks.keys())
    callee_order = find_topo_order(callee_blocks)

    # 5. split caller block into two
    tail_block = ir.Block(caller_scope, block.loc)
    tail_block.body = block.body[i + 1:]
    tail_label = ir_utils.next_label()
    func_ir.blocks[tail_label] = tail_block
    queued = [(tail_label, tail_block)]
    block.body = block.body[:i]
    block.body.append(ir.Jump(callee_order[0], call_stmt.loc))

    # 6. replace Return with assignment to LHS
    numba.core.inline_closurecall._replace_returns(callee_blocks,
                                                   call_stmt.target,
                                                   tail_label)
    #    remove the old definition of the call target too
    target_name = call_stmt.target.name
    if target_name in func_ir._definitions:
        func_ir._definitions[target_name] = []

    # 7. insert all new blocks, and add back definitions
    for callee_label in callee_order:
        callee_block = callee_blocks[callee_label]
        # block scope must point to parent's
        callee_block.scope = caller_scope
        numba.core.inline_closurecall._add_definitions(func_ir, callee_block)
        func_ir.blocks[callee_label] = callee_block
        queued.append((callee_label, callee_block))

    if work_list is not None:
        for entry in queued:
            work_list.append(entry)
    return callee_blocks
예제 #11
0
 def apply(self):
     """
     Return a copy of ``self.block`` in which every setitem recorded in
     ``self.setitems`` is replaced by a static_setitem whose index is the
     recorded literal value.

     The call type of each replaced instruction is copied to its
     replacement in ``self.calltypes``.  All other instructions are carried
     over unchanged.
     """
     rewritten = ir.Block(self.block.scope, self.block.loc)
     for stmt in self.block.body:
         if isinstance(stmt, ir.SetItem) and stmt in self.setitems:
             _, literal = self.setitems[stmt]
             static_stmt = ir.StaticSetItem(target=stmt.target,
                                            index=literal,
                                            index_var=stmt.index,
                                            value=stmt.value,
                                            loc=stmt.loc)
             self.calltypes[static_stmt] = self.calltypes[stmt]
             stmt = static_stmt
         rewritten.append(stmt)
     return rewritten
예제 #12
0
    def _gen_rebalances(self, rebalance_arrs, blocks):
        """Route assignments to the named arrays through ``rebalance_array``.

        Every ``ir.Assign`` in *blocks* whose target name is in
        *rebalance_arrs* is redirected to a fresh temporary variable, and IR
        compiled from a small stub calling
        ``sdc.distributed_api.rebalance_array`` is appended after it, with
        the last retained stub statement retargeted to the original
        variable.  ``Parfor`` nodes are descended into recursively (init
        block and loop body).

        Parameters
        ----------
        rebalance_arrs : container of str
            Names of the variables to rebalance (only membership is tested).
        blocks : dict
            Label -> block mapping; each block's ``body`` is replaced.
        """
        for block in blocks.values():
            new_body = []
            for inst in block.body:
                # TODO: handle hiframes filter etc.
                if isinstance(inst, Parfor):
                    # Recurse into the parfor; the parfor node itself is still
                    # appended to new_body by the else-branch below.
                    self._gen_rebalances(rebalance_arrs, {0: inst.init_block})
                    self._gen_rebalances(rebalance_arrs, inst.loop_body)
                if isinstance(
                        inst,
                        ir.Assign) and inst.target.name in rebalance_arrs:
                    out_arr = inst.target
                    # Drop the old definition; definitions are rebuilt below
                    # from the replacement statements.
                    self.func_ir._definitions[out_arr.name].remove(inst.value)
                    # hold inst results in tmp array
                    tmp_arr = ir.Var(out_arr.scope,
                                     mk_unique_var("rebalance_tmp"),
                                     out_arr.loc)
                    self.typemap[tmp_arr.name] = self.typemap[out_arr.name]
                    inst.target = tmp_arr
                    nodes = [inst]

                    # Stub that is compiled to IR only; `out_a` exists so the
                    # call result is assigned to a variable in that IR.
                    def f(in_arr):  # pragma: no cover
                        out_a = sdc.distributed_api.rebalance_array(in_arr)

                    # Compile the stub and take one block of the result
                    # (presumably its only block -- TODO confirm).
                    f_block = compile_to_numba_ir(
                        f, {
                            'sdc': sdc
                        }, self.typingctx, (self.typemap[tmp_arr.name], ),
                        self.typemap, self.calltypes).blocks.popitem()[1]
                    replace_arg_nodes(f_block, [tmp_arr])
                    nodes += f_block.body[:-3]  # remove none return
                    # The last remaining stub statement now writes to the
                    # original output variable.
                    nodes[-1].target = out_arr
                    # update definitions
                    dumm_block = ir.Block(out_arr.scope, out_arr.loc)
                    dumm_block.body = nodes
                    build_definitions({0: dumm_block},
                                      self.func_ir._definitions)
                    new_body += nodes
                else:
                    new_body.append(inst)

            block.body = new_body
예제 #13
0
 def rewrite_single_backedge(loop):
     """
     Funnel every backedge of *loop* through one new tail block.

     Each block in the loop body whose terminator targets the loop header
     is replaced in ``newblocks`` by a copy whose terminator targets the
     tail block instead; the tail block then jumps to the header.
     """
     loop_header = loop.header
     tail_label = new_block_id()
     for label in loop.body:
         original = newblocks[label]
         if loop_header not in original.terminator.get_targets():
             continue
         # rewrite backedge into jumps to new tail block
         patched = original.copy()
         patched.body[-1] = replace_target(original.terminator, loop_header,
                                           tail_label)
         newblocks[label] = patched
     # create new tail block
     header_block = newblocks[loop_header]
     tail_block = ir.Block(scope=header_block.scope, loc=header_block.loc)
     # add backedge
     tail_block.append(ir.Jump(target=loop_header, loc=tail_block.loc))
     newblocks[tail_label] = tail_block
예제 #14
0
 def apply(self):
     """
     Return a copy of ``self.block`` in which every getitem expression
     recorded in ``self.getitems`` is replaced by a static_getitem whose
     index is the recorded literal value.

     The call type of each replaced expression is copied to its replacement
     in ``self.calltypes``; the enclosing assignment is rebuilt around the
     new expression.  All other instructions are carried over unchanged.
     """
     rewritten = ir.Block(self.block.scope, self.block.loc)
     for stmt in self.block.body:
         if isinstance(stmt, ir.Assign) and stmt.value in self.getitems:
             old_expr = stmt.value
             _, literal = self.getitems[old_expr]
             static_expr = ir.Expr.static_getitem(value=old_expr.value,
                                                  index=literal,
                                                  index_var=old_expr.index,
                                                  loc=old_expr.loc)
             self.calltypes[static_expr] = self.calltypes[old_expr]
             stmt = ir.Assign(value=static_expr,
                              target=stmt.target,
                              loc=stmt.loc)
         rewritten.append(stmt)
     return rewritten
예제 #15
0
def copy_block(block):
    """Return a new block whose statements are copies of those in *block*.

    The new block shares *block*'s scope and location.  Each statement is
    copied with ``relatively_deep_copy``, using one memo dict for the whole
    block.
    """
    duplicate = ir.Block(block.scope, block.loc)
    shared_memo = {}
    copied = []
    for stmt in block.body:
        copied.append(relatively_deep_copy(stmt, shared_memo))
    duplicate.body = copied
    return duplicate
예제 #16
0
def _rewrite_return(func_ir, target_block_label):
    """Rewrite a return block inside a with statement.

    Arguments
    ---------

    func_ir: Function IR
      the CFG to transform
    target_block_label: int
      the block index/label of the block containing the POP_BLOCK statement


    This implements a CFG transformation to insert a block between two other
    blocks.

    The input situation is:

    ┌───────────────┐
    │   top         │
    │   POP_BLOCK   │
    │   bottom      │
    └───────┬───────┘
            │
    ┌───────▼───────┐
    │               │
    │    RETURN     │
    │               │
    └───────────────┘

    If such a pattern is detected in IR, it means there is a `return` statement
    within a `with` context. The basic idea is to rewrite the CFG as follows:

    ┌───────────────┐
    │   top         │
    │   POP_BLOCK   │
    │               │
    └───────┬───────┘
            │
    ┌───────▼───────┐
    │               │
    │     bottom    │
    │               │
    └───────┬───────┘
            │
    ┌───────▼───────┐
    │               │
    │    RETURN     │
    │               │
    └───────────────┘

    We split the block that contains the `POP_BLOCK` statement into two blocks.
    Everything from the beginning of the block up to and including the
    `POP_BLOCK` statement is considered the 'top' and everything below is
    considered 'bottom'. Finally the jump statements are re-wired to make sure
    the CFG remains valid.

    NOTE(review): as implemented below, the slices place `POP_BLOCK` as the
    first statement of 'bottom' (``body[pb_is:-1]``) rather than the last
    statement of 'top' (``body[:pb_is]``) as drawn above -- confirm which is
    intended.

    Returns
    -------
    The same *func_ir* object, mutated in place: one block is added under a
    new label and ``_definitions`` is rebuilt.

    """
    # the block itself from the index
    target_block = func_ir.blocks[target_block_label]
    # get the index of the block containing the return
    target_block_successor_label = target_block.terminator.get_targets()[0]
    # the return block
    target_block_successor = func_ir.blocks[target_block_successor_label]

    # create the new return block with an appropriate label
    max_label = ir_utils.find_max_label(func_ir.blocks)
    new_label = max_label + 1
    # create the new return block
    new_block_loc = target_block_successor.loc
    # The new block gets a fresh, parentless scope rather than reusing the
    # scope of an existing block.
    new_block_scope = ir.Scope(None, loc=new_block_loc)
    new_block = ir.Block(new_block_scope, loc=new_block_loc)

    # Split the block containing the POP_BLOCK into top and bottom
    # Block must be of the form:
    # -----------------
    # <some stmts>
    # POP_BLOCK
    # <some more stmts>
    # JUMP
    # -----------------
    top_body, bottom_body = [], []
    pop_blocks = [*target_block.find_insts(ir.PopBlock)]
    assert len(pop_blocks) == 1
    assert len([*target_block.find_insts(ir.Jump)]) == 1
    assert isinstance(target_block.body[-1], ir.Jump)
    pb_marker = pop_blocks[0]
    pb_is = target_block.body.index(pb_marker)
    # top: statements before POP_BLOCK, then a jump to the successor label
    # (whose block is refilled with 'bottom' below).
    top_body.extend(target_block.body[:pb_is])
    top_body.append(ir.Jump(target_block_successor_label, target_block.loc))
    # bottom: POP_BLOCK and everything after it except the original final
    # jump, then a jump to the new return block.
    bottom_body.extend(target_block.body[pb_is:-1])
    bottom_body.append(ir.Jump(new_label, target_block.loc))

    # get the contents of the return block
    return_body = func_ir.blocks[target_block_successor_label].body
    # finally, re-assign all blocks
    # `return_body` is the successor's own body list, so its statements must
    # be copied into the new block before that list is cleared.
    new_block.body.extend(return_body)
    target_block_successor.body.clear()
    target_block_successor.body.extend(bottom_body)
    target_block.body.clear()
    target_block.body.extend(top_body)

    # finally, append the new return block and rebuild the IR properties
    func_ir.blocks[new_label] = new_block
    func_ir._definitions = ir_utils.build_definitions(func_ir.blocks)
    return func_ir
예제 #17
0
def _fix_multi_exit_blocks(func_ir, exit_nodes, *, split_condition=None):
    """Modify the FunctionIR to create a single common exit node given the
    original exit nodes.

    Each exit block is truncated, made to store its index in the
    control-point variable ``$cp`` and to jump to a new common block.  The
    common block jumps to a new post block, which starts a chain of
    compare-and-branch blocks that dispatch on ``$cp`` to the jump target
    each exit block originally had.

    Parameters
    ----------
    func_ir :
        The FunctionIR. Mutated inplace.
    exit_nodes :
        The original exit nodes. A sequence of block keys.
        NOTE(review): the code reads loop variables after the loop over
        *exit_nodes*, so it appears to require at least one exit node.
    split_condition : callable or None
        If not None, it is a callable with the signature
        `split_condition(statement)` that determines if the `statement` is the
        splitting point (e.g. `POP_BLOCK`) in an exit node.
        If it's None, the exit node is not split.

    Returns
    -------
    tuple
        ``(func_ir, common_label)``: the mutated IR and the label of the new
        common block.
    """

    # Convert the following:
    #
    #     |           |
    # +-------+   +-------+
    # | exit0 |   | exit1 |
    # +-------+   +-------+
    #     |           |
    # +-------+   +-------+
    # | after0|   | after1|
    # +-------+   +-------+
    #     |           |
    #
    # To roughly:
    #
    #     |           |
    # +-------+   +-------+
    # | exit0 |   | exit1 |
    # +-------+   +-------+
    #     |           |
    #     +-----+-----+
    #           |
    #      +---------+
    #      | common  |
    #      +---------+
    #           |
    #       +-------+
    #       | post  |
    #       +-------+
    #           |
    #     +-----+-----+
    #     |           |
    # +-------+   +-------+
    # | after0|   | after1|
    # +-------+   +-------+

    blocks = func_ir.blocks
    # Getting the scope
    # NOTE(review): min() over block objects relies on them being orderable;
    # only the chosen block's scope is used afterwards -- confirm.
    any_blk = min(func_ir.blocks.values())
    scope = any_blk.scope
    # Getting the maximum block label
    # (`max_label` is used from here on as the next unused label)
    max_label = max(func_ir.blocks) + 1
    # Define the new common block for the new exit.
    common_block = ir.Block(any_blk.scope, loc=ir.unknown_loc)
    common_label = max_label
    max_label += 1
    blocks[common_label] = common_block
    # Define the new block after the exit.
    post_block = ir.Block(any_blk.scope, loc=ir.unknown_loc)
    post_label = max_label
    max_label += 1
    blocks[post_label] = post_block

    # Adjust each exit node
    # remainings[i] holds the statements cut off from the i-th exit block.
    remainings = []
    for i, k in enumerate(exit_nodes):
        blk = blocks[k]

        # split the block if needed
        if split_condition is not None:
            # `pt` ends up as the index of the first matching statement, or
            # the last index if none matches.
            for pt, stmt in enumerate(blk.body):
                if split_condition(stmt):
                    break
        else:
            # no splitting
            # (only the final statement is cut off)
            pt = -1

        before = blk.body[:pt]
        after = blk.body[pt:]
        remainings.append(after)

        # Add control-point variable to mark which exit block this is.
        blk.body = before
        loc = blk.loc
        blk.body.append(
            ir.Assign(value=ir.Const(i, loc=loc),
                      target=scope.get_or_define("$cp", loc=loc),
                      loc=loc))
        # Replace terminator with a jump to the common block
        assert not blk.is_terminated
        blk.body.append(ir.Jump(common_label, loc=ir.unknown_loc))

    if split_condition is not None:
        # Move the splitting statement to the common block
        # (taken from the first exit node's remainder only)
        common_block.body.append(remainings[0][0])
    assert not common_block.is_terminated
    # Append jump from common block to post block
    # NOTE: `loc` here is whatever the loop above assigned last, i.e. the
    # location of the last exit block.
    common_block.body.append(ir.Jump(post_label, loc=loc))

    # Make if-else tree to jump to target
    # Reserve one fresh label per remainder for the "no match" branch blocks.
    remain_blocks = []
    for remain in remainings:
        remain_blocks.append(max_label)
        max_label += 1

    switch_block = post_block
    loc = ir.unknown_loc
    for i, remain in enumerate(remainings):
        match_expr = scope.redefine("$cp_check", loc=loc)
        match_rhs = scope.redefine("$cp_rhs", loc=loc)

        # Do comparison to match control-point variable to the exit block
        switch_block.body.append(
            ir.Assign(value=ir.Const(i, loc=loc), target=match_rhs, loc=loc), )

        # Add assignment for the comparison
        switch_block.body.append(
            ir.Assign(value=ir.Expr.binop(
                fn=operator.eq,
                lhs=scope.get("$cp"),
                rhs=match_rhs,
                loc=loc,
            ),
                      target=match_expr,
                      loc=loc), )

        # Insert jump to the next case
        # The remainder's last statement must have exactly one jump target.
        [jump_target] = remain[-1].get_targets()
        switch_block.body.append(
            ir.Branch(match_expr, jump_target, remain_blocks[i], loc=loc), )
        # Subsequent statements go into the block taken when the comparison
        # fails.
        switch_block = ir.Block(scope=scope, loc=loc)
        blocks[remain_blocks[i]] = switch_block

    # Add the final jump
    # The last fall-through block jumps to the last remainder's target.
    switch_block.body.append(ir.Jump(jump_target, loc=loc))

    return func_ir, common_label
예제 #18
0
파일: stencil.py 프로젝트: zsoltc89/numba
    def _stencil_wrapper(self, result, sigret, return_type, typemap, calltypes,
                         *args):
        """Generate and compile the function that runs this stencil kernel.

        Parameters
        ----------
        result :
            If None, the generated function allocates the output array
            itself; otherwise it takes an extra output-array keyword
            argument.
        sigret :
            If not None, its ``pysig`` attribute is set to the Python
            signature of the generated function.
        return_type :
            Type whose ``dtype`` gives the element type of the output array.
        typemap, calltypes :
            Typing information for the kernel IR; *calltypes* is copied
            together with the kernel IR before the copy is modified.
        *args :
            Argument types of the call.  ``args[0]`` is the type of the
            first input array and supplies the number of dimensions.

        Returns
        -------
        The result of ``compiler.compile_ir`` for the combined stencil IR.

        Raises
        ------
        ValueError
            If the kernel uses the reserved name ``out``, if standard
            indexing is requested for the first argument or for a name that
            is not a kernel argument, or if the ``cval`` option does not
            match the return type.

        Notes
        -----
        Sets ``self.neighborhood`` to the computed kernel size when it was
        None.
        """
        # Overall approach:
        # 1) Construct a string containing a function definition for the stencil function
        #    that will execute the stencil kernel.  This function definition includes a
        #    unique stencil function name, the parameters to the stencil kernel, loop
        #    nests across the dimensions of the input array.  Those loop nests use the
        #    computed stencil kernel size so as not to try to compute elements where
        #    elements outside the bounds of the input array would be needed.
        # 2) The body of the loop nest in this new function is a special sentinel
        #    assignment.
        # 3) Get the IR of this new function.
        # 4) Split the block containing the sentinel assignment and remove the sentinel
        #    assignment.  Insert the stencil kernel IR into the stencil function IR
        #    after label and variable renaming of the stencil kernel IR to prevent
        #    conflicts with the stencil function IR.
        # 5) Compile the combined stencil function IR + stencil kernel IR into existence.

        # Copy the kernel so that our changes for this callsite
        # won't affect other callsites.
        (kernel_copy,
         copy_calltypes) = self.copy_ir_with_calltypes(self.kernel_ir,
                                                       calltypes)
        # The stencil kernel body becomes the body of a loop, for which args aren't needed.
        ir_utils.remove_args(kernel_copy.blocks)
        first_arg = kernel_copy.arg_names[0]

        in_cps, out_cps = ir_utils.copy_propagate(kernel_copy.blocks, typemap)
        name_var_table = ir_utils.get_name_var_table(kernel_copy.blocks)
        ir_utils.apply_copy_propagate(kernel_copy.blocks, in_cps,
                                      name_var_table, typemap, copy_calltypes)

        if "out" in name_var_table:
            raise ValueError(
                "Cannot use the reserved word 'out' in stencil kernels.")

        sentinel_name = ir_utils.get_unused_var_name("__sentinel__",
                                                     name_var_table)
        if config.DEBUG_ARRAY_OPT >= 1:
            print("name_var_table", name_var_table, sentinel_name)

        the_array = args[0]

        if config.DEBUG_ARRAY_OPT >= 1:
            print("_stencil_wrapper", return_type, return_type.dtype,
                  type(return_type.dtype), args)
            ir_utils.dump_blocks(kernel_copy.blocks)

        # We generate a Numba function to execute this stencil and here
        # create the unique name of this function.
        stencil_func_name = "__numba_stencil_%s_%s" % (hex(
            id(the_array)).replace("-", "_"), self.id)

        # We will put a loop nest in the generated function for each
        # dimension in the input array.  Here we create the name for
        # the index variable for each dimension.  index0, index1, ...
        index_vars = []
        for i in range(the_array.ndim):
            index_var_name = ir_utils.get_unused_var_name(
                "index" + str(i), name_var_table)
            index_vars += [index_var_name]

        # Create extra signature for out and neighborhood.
        out_name = ir_utils.get_unused_var_name("out", name_var_table)
        neighborhood_name = ir_utils.get_unused_var_name(
            "neighborhood", name_var_table)
        sig_extra = ""
        if result is not None:
            sig_extra += ", {}=None".format(out_name)
        if "neighborhood" in dict(self.kws):
            sig_extra += ", {}=None".format(neighborhood_name)

        # Get a list of the standard indexed array names.
        standard_indexed = self.options.get("standard_indexing", [])

        if first_arg in standard_indexed:
            raise ValueError("The first argument to a stencil kernel must "
                             "use relative indexing, not standard indexing.")

        if len(set(standard_indexed) - set(kernel_copy.arg_names)) != 0:
            raise ValueError("Standard indexing requested for an array name "
                             "not present in the stencil kernel definition.")

        # Add index variables to getitems in the IR to transition the accesses
        # in the kernel from relative to regular Python indexing.  Returns the
        # computed size of the stencil kernel and a list of the relatively indexed
        # arrays.
        kernel_size, relatively_indexed = self.add_indices_to_kernel(
            kernel_copy, index_vars, the_array.ndim, self.neighborhood,
            standard_indexed, typemap, copy_calltypes)
        if self.neighborhood is None:
            self.neighborhood = kernel_size

        if config.DEBUG_ARRAY_OPT >= 1:
            print("After add_indices_to_kernel")
            ir_utils.dump_blocks(kernel_copy.blocks)

        # The return in the stencil kernel becomes a setitem for that
        # particular point in the iteration space.
        ret_blocks = self.replace_return_with_setitem(kernel_copy.blocks,
                                                      index_vars, out_name)

        if config.DEBUG_ARRAY_OPT >= 1:
            print("After replace_return_with_setitem", ret_blocks)
            ir_utils.dump_blocks(kernel_copy.blocks)

        # Start to form the new function to execute the stencil kernel.
        func_text = "def {}({}{}):\n".format(stencil_func_name,
                                             ",".join(kernel_copy.arg_names),
                                             sig_extra)

        # Get loop ranges for each dimension, which could be either int
        # or variable. In the latter case we'll use the extra neighborhood
        # argument to the function.
        ranges = []
        for i in range(the_array.ndim):
            if isinstance(kernel_size[i][0], int):
                lo = kernel_size[i][0]
                hi = kernel_size[i][1]
            else:
                lo = "{}[{}][0]".format(neighborhood_name, i)
                hi = "{}[{}][1]".format(neighborhood_name, i)
            ranges.append((lo, hi))

        # If there are more than one relatively indexed arrays, add a call to
        # a function that will raise an error if any of the relatively indexed
        # arrays are of different size than the first input array.
        if len(relatively_indexed) > 1:
            func_text += "    raise_if_incompatible_array_sizes(" + first_arg
            for other_array in relatively_indexed:
                if other_array != first_arg:
                    func_text += "," + other_array
            func_text += ")\n"

        # Get the shape of the first input array.
        shape_name = ir_utils.get_unused_var_name("full_shape", name_var_table)
        func_text += "    {} = {}.shape\n".format(shape_name, first_arg)

        # If we have to allocate the output array (the out argument was not used)
        # then use numpy.full if the user specified a cval stencil decorator option
        # or np.zeros if they didn't to allocate the array.
        if result is None:
            return_type_name = numpy_support.as_dtype(
                return_type.dtype).type.__name__
            if "cval" in self.options:
                cval = self.options["cval"]
                if return_type.dtype != typing.typeof.typeof(cval):
                    raise ValueError(
                        "cval type does not match stencil return type.")
                out_init = "{} = np.full({}, {}, dtype=np.{})\n".format(
                    out_name, shape_name, cval, return_type_name)
            else:
                out_init = "{} = np.zeros({}, dtype=np.{})\n".format(
                    out_name, shape_name, return_type_name)
            func_text += "    " + out_init
        else:  # result is present, if cval is set then use it
            if "cval" in self.options:
                cval = self.options["cval"]
                cval_ty = typing.typeof.typeof(cval)
                if not self._typingctx.can_convert(cval_ty, return_type.dtype):
                    msg = "cval type does not match stencil return type."
                    raise ValueError(msg)
                out_init = "{}[:] = {}\n".format(out_name, cval)
                func_text += "    " + out_init

        # `offset` is the current nesting depth, in units of four spaces.
        offset = 1
        # Add the loop nests to the new function.
        for i in range(the_array.ndim):
            func_text += "    " * offset
            # ranges[i][0] is the minimum index used in the i'th dimension
            # but minimum's greater than 0 don't preclude any entry in the array.
            # So, take the minimum of 0 and the minimum index found in the kernel
            # and this will be a negative number (potentially -0).  Then, we do
            # unary - on that to get the positive offset in this dimension whose
            # use is precluded.
            # ranges[i][1] is the maximum of 0 and the observed maximum index
            # in this dimension because negative maximums would not cause us to
            # preclude any entry in the array from being used.
            func_text += ("for {} in range(-min(0,{}),"
                          "{}[{}]-max(0,{})):\n").format(
                              index_vars[i], ranges[i][0], shape_name, i,
                              ranges[i][1])
            offset += 1

        func_text += "    " * offset
        # Put a sentinel in the code so we can locate it in the IR.  We will
        # remove this sentinel assignment and replace it with the IR for the
        # stencil kernel body.
        func_text += "{} = 0\n".format(sentinel_name)
        func_text += "    return {}\n".format(out_name)

        if config.DEBUG_ARRAY_OPT >= 1:
            print("new stencil func text")
            print(func_text)

        # Force the new stencil function into existence.  The definition is
        # executed into an explicit namespace: exec() cannot reliably create
        # names in a function's local scope, and on Python 3.13+ each
        # locals() call returns an independent snapshot, so looking the name
        # up afterwards with eval() would fail.  Module globals are still
        # passed so the generated function resolves global names as before.
        exec_namespace = {}
        exec(func_text, globals(), exec_namespace)
        stencil_func = exec_namespace[stencil_func_name]
        if sigret is not None:
            pysig = utils.pysignature(stencil_func)
            sigret.pysig = pysig
        # Get the IR for the newly created stencil function.
        from numba.core import compiler
        stencil_ir = compiler.run_frontend(stencil_func)
        ir_utils.remove_dels(stencil_ir.blocks)

        # rename all variables in stencil_ir afresh
        var_table = ir_utils.get_name_var_table(stencil_ir.blocks)
        new_var_dict = {}
        reserved_names = (
            [sentinel_name, out_name, neighborhood_name, shape_name] +
            kernel_copy.arg_names + index_vars)
        for name in var_table:
            if name not in reserved_names:
                new_var_dict[name] = ir_utils.mk_unique_var(name)
        ir_utils.replace_var_names(stencil_ir.blocks, new_var_dict)

        stencil_stub_last_label = max(stencil_ir.blocks.keys()) + 1

        # Shift labels in the kernel copy so they are guaranteed unique
        # and don't conflict with any labels in the stencil_ir.
        kernel_copy.blocks = ir_utils.add_offset_to_labels(
            kernel_copy.blocks, stencil_stub_last_label)
        new_label = max(kernel_copy.blocks.keys()) + 1
        # Adjust ret_blocks to account for addition of the offset.
        ret_blocks = [x + stencil_stub_last_label for x in ret_blocks]

        if config.DEBUG_ARRAY_OPT >= 1:
            print("ret_blocks w/ offsets", ret_blocks, stencil_stub_last_label)
            print("before replace sentinel stencil_ir")
            ir_utils.dump_blocks(stencil_ir.blocks)
            print("before replace sentinel kernel_copy")
            ir_utils.dump_blocks(kernel_copy.blocks)

        # Search all the block in the stencil outline for the sentinel.
        for label, block in stencil_ir.blocks.items():
            for i, inst in enumerate(block.body):
                if (isinstance(inst, ir.Assign)
                        and inst.target.name == sentinel_name):
                    # We found the sentinel assignment.
                    loc = inst.loc
                    scope = block.scope
                    # split block across __sentinel__
                    # A new block is allocated for the statements prior to the
                    # sentinel but the new block maintains the current block
                    # label.
                    prev_block = ir.Block(scope, loc)
                    prev_block.body = block.body[:i]
                    # The current block is used for statements after sentinel.
                    block.body = block.body[i + 1:]
                    # But the current block gets a new label.
                    body_first_label = min(kernel_copy.blocks.keys())

                    # The previous block jumps to the minimum labelled block of
                    # the stencil kernel body.
                    prev_block.append(ir.Jump(body_first_label, loc))
                    # Add all the stencil kernel body blocks to the stencil
                    # function's IR.
                    for (l, b) in kernel_copy.blocks.items():
                        stencil_ir.blocks[l] = b

                    stencil_ir.blocks[new_label] = block
                    stencil_ir.blocks[label] = prev_block
                    # Add a jump from all the blocks that previously contained
                    # a return in the stencil kernel to the block
                    # containing statements after the sentinel.
                    for ret_block in ret_blocks:
                        stencil_ir.blocks[ret_block].append(
                            ir.Jump(new_label, loc))
                    break
            else:
                # Sentinel not in this block; keep searching.
                continue
            # Sentinel handled; stop iterating (the dict was just modified).
            break

        stencil_ir.blocks = ir_utils.rename_labels(stencil_ir.blocks)
        ir_utils.remove_dels(stencil_ir.blocks)

        assert (isinstance(the_array, types.Type))
        array_types = args

        new_stencil_param_types = list(array_types)

        if config.DEBUG_ARRAY_OPT >= 1:
            print("new_stencil_param_types", new_stencil_param_types)
            ir_utils.dump_blocks(stencil_ir.blocks)

        # Compile the combined stencil function with the replaced loop
        # body in it.
        new_func = compiler.compile_ir(self._typingctx, self._targetctx,
                                       stencil_ir, new_stencil_param_types,
                                       None, compiler.DEFAULT_FLAGS, {})
        return new_func
예제 #19
0
def _create_gufunc_for_parfor_body(
    lowerer,
    parfor,
    typemap,
    typingctx,
    targetctx,
    flags,
    loop_ranges,
    locals,
    has_aliases,
    index_var_typ,
    races,
):
    """
    Takes a parfor and creates a gufunc function for its body. There
    are two parts to this function:

        1) Code to iterate across the iteration space as defined by
           the schedule.
        2) The parfor body that does the work for a single point in
           the iteration space.

    Part 1 is created as Python text for simplicity with a sentinel
    assignment to mark the point in the IR where the parfor body
    should be added. This Python text is 'exec'ed into existence and its
    IR retrieved with run_frontend. The IR is scanned for the sentinel
    assignment where that basic block is split and the IR for the parfor
    body inserted.

    Parameters
    ----------
    lowerer
        Provides ``fndesc.typemap``, ``fndesc.calltypes``, ``func_ir``,
        ``context`` and ``metadata["parfor_diagnostics"]``.
    parfor
        The parfor node. Its ``params`` collection is extended in place
        with the names of any ``ir.Var`` start/stop bounds found in
        ``loop_ranges``.
    typemap
        Not read: the name is immediately rebound to
        ``lowerer.fndesc.typemap``.
    typingctx, targetctx, locals, index_var_typ
        Accepted but not used by this function.
    flags
        Compilation flags. ``flags.noalias`` is temporarily forced to
        ``True`` when ``has_aliases`` is false and is always restored
        before returning or propagating an exception; ``flags.debuginfo``
        is forwarded to the kernel compiler.
    loop_ranges
        Iterable of ``(start, stop, step)`` triples, one per loop nest.
    has_aliases : bool
        Whether aliasing was detected among the parfor arrays.
    races
        Names of variables with potential write races; a
        ``NumbaParallelSafetyWarning`` is issued for each one.

    Returns
    -------
    tuple
        ``(kernel_func, parfor_args, kernel_sig, func_arg_types, setitems)``
        where ``parfor_args`` are the parameter names before legalization
        (inputs first, then outputs), ``kernel_sig`` is a ``types.none``
        signature over the parameter types, ``func_arg_types`` are the
        typemap entries of those parameters and ``setitems`` is the set
        filled in by ``find_setitems_body``.
    """

    loc = parfor.init_block.loc

    # The parfor body and the main function body share ir.Var nodes.
    # We have to do some replacements of Var names in the parfor body
    # to make them legal parameter names. If we don't copy then the
    # Vars in the main function also would incorrectly change their name.
    loop_body = copy.copy(parfor.loop_body)
    remove_dels(loop_body)

    parfor_dim = len(parfor.loop_nests)
    loop_indices = [nest.index_variable.name for nest in parfor.loop_nests]

    # Get all the parfor params. NOTE: this is the parfor's own collection,
    # so the additions below are visible on ``parfor.params`` afterwards.
    parfor_params = parfor.params

    # Variable loop bounds must also be passed to the kernel.
    for start, stop, _step in loop_ranges:
        if isinstance(start, ir.Var):
            parfor_params.add(start.name)
        if isinstance(stop, ir.Var):
            parfor_params.add(stop.name)

    # Get just the outputs of the parfor.
    parfor_outputs = numba.parfors.parfor.get_parfor_outputs(
        parfor, parfor_params)

    # The typemap of the function being lowered replaces the argument.
    typemap = lowerer.fndesc.typemap

    # Get all parfor reduction vars, and operators.
    parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions(
        lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes)

    if len(parfor_redvars) > 0:
        _create_gufunc_for_reduction_parfor()

    # Compute just the parfor inputs as a set difference.
    parfor_inputs = sorted(set(parfor_params) - set(parfor_outputs))

    for race in races:
        msg = ("Variable %s used in parallel loop may be written "
               "to simultaneously by multiple workers and may result "
               "in non-deterministic or unintended results." % race)
        warnings.warn(NumbaParallelSafetyWarning(msg, loc))
    replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes)

    if config.DEBUG_ARRAY_OPT >= 1:
        print("parfor_params = ", parfor_params, type(parfor_params))
        print("parfor_outputs = ", parfor_outputs, type(parfor_outputs))
        print("parfor_inputs = ", parfor_inputs, type(parfor_inputs))

    # Reorder all the params so that inputs go first then outputs.
    parfor_params = parfor_inputs + parfor_outputs

    # Array parameters get the global address space, everything else None.
    global_addrspace = address_space.GLOBAL
    addrspaces = [
        global_addrspace
        if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array)
        else None
        for p in parfor_params
    ]

    if config.DEBUG_ARRAY_OPT >= 1:
        print("parfor_params = ", parfor_params, type(parfor_params))
        print("loop_indices = ", loop_indices, type(loop_indices))
        print("loop_body = ", loop_body, type(loop_body))
        _print_body(loop_body)

    # Some Var are not legal parameter names so create a dict of
    # potentially illegal param name to guaranteed legal name.
    param_dict = legalize_names_with_typemap(parfor_params, typemap)
    if config.DEBUG_ARRAY_OPT >= 1:
        print("param_dict = ", sorted(param_dict.items()), type(param_dict))

    # Some loop_indices are not legal parameter names so create a dict
    # of potentially illegal loop index to guaranteed legal name.
    ind_dict = legalize_names_with_typemap(loop_indices, typemap)
    # Compute a new list of legal loop index names.
    legal_loop_indices = [ind_dict[v] for v in loop_indices]

    if config.DEBUG_ARRAY_OPT >= 1:
        print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict))
        print(
            "legal_loop_indices = ",
            legal_loop_indices,
            type(legal_loop_indices),
        )

        for pd in parfor_params:
            print("pd = ", pd)
            print("pd type = ", typemap[pd], type(typemap[pd]))

    # Get the types of each parameter.
    param_types = [to_scalar_from_0d(typemap[v]) for v in parfor_params]

    param_types_addrspaces = copy.copy(param_types)

    # Calculate types of args passed to gufunc.
    func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)]
    assert len(param_types_addrspaces) == len(addrspaces)
    for i, addrspace in enumerate(addrspaces):
        if addrspace is not None:
            # Convert Numba's npytype.Array to DPPYArray data type. DPPYArray
            # allows us to specify an address space for the data and other
            # pointer arguments for the array.
            param_types_addrspaces[i] = npytypes_array_to_dppy_array(
                param_types_addrspaces[i], addrspace)

    def print_arg_with_addrspaces(args):
        for a in args:
            print(a, type(a))
            if isinstance(a, types.npytypes.Array):
                # NOTE(review): the unconverted array types passed here may
                # not define ``addrspace``; fall back to None so the debug
                # path cannot raise AttributeError.
                print("addrspace:", getattr(a, "addrspace", None))

    if config.DEBUG_ARRAY_OPT >= 1:
        print_arg_with_addrspaces(param_types)
        print("func_arg_types = ", func_arg_types, type(func_arg_types))

    # Replace illegal parameter names in the loop body with legal ones.
    replace_var_names(loop_body, param_dict)
    # remember the name before legalizing as the actual arguments
    parfor_args = parfor_params
    # Change parfor_params to be legal names.
    parfor_params = [param_dict[v] for v in parfor_args]

    # Change parfor body to replace illegal loop index vars with legal ones.
    replace_var_names(loop_body, ind_dict)
    loop_body_var_table = get_name_var_table(loop_body)
    sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table)

    if config.DEBUG_ARRAY_OPT >= 1:
        print("legal parfor_params = ", parfor_params, type(parfor_params))

    # Determine the unique name of the gufunc function.
    gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id)

    if config.DEBUG_ARRAY_OPT:
        print("gufunc_name ", type(gufunc_name), gufunc_name)

    # Create the gufunc function text: signature, scheduling loop, the
    # sentinel assignment (so that we can find the loop body position in
    # the IR) and a final "return None" since the gufunc returns nothing.
    gufunc_txt = "".join([
        "def " + gufunc_name,
        "(" + (", ".join(parfor_params)) + "):\n",
        _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges,
                       param_dict),
        "    ",
        sentinel_name + " = 0\n",
        "    return None\n",
    ])

    if config.DEBUG_ARRAY_OPT:
        print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt)
        sys.stdout.flush()
    # Force gufunc outline into existence. The executed text is generated
    # above from legalized names only.
    globls = {"np": np, "numba": numba, "dppy": dppy}
    locls = {}
    exec(gufunc_txt, globls, locls)
    gufunc_func = locls[gufunc_name]

    if config.DEBUG_ARRAY_OPT:
        print("gufunc_func = ", type(gufunc_func), "\n", gufunc_func)
    # Get the IR for the gufunc outline.
    gufunc_ir = compiler.run_frontend(gufunc_func)

    if config.DEBUG_ARRAY_OPT:
        print("gufunc_ir dump ", type(gufunc_ir))
        gufunc_ir.dump()
        print("loop_body dump ", type(loop_body))
        _print_body(loop_body)

    # rename all variables in gufunc_ir afresh, except the sentinel, the
    # kernel parameters and the loop indices.
    var_table = get_name_var_table(gufunc_ir.blocks)
    reserved_names = set([sentinel_name] + list(param_dict.values()) +
                         legal_loop_indices)
    new_var_dict = {}
    for name in var_table:
        if name not in reserved_names:
            new_var_dict[name] = mk_unique_var(name)
    replace_var_names(gufunc_ir.blocks, new_var_dict)
    if config.DEBUG_ARRAY_OPT:
        print("gufunc_ir dump after renaming ")
        gufunc_ir.dump()

    gufunc_param_types = param_types

    if config.DEBUG_ARRAY_OPT:
        print(
            "gufunc_param_types = ",
            type(gufunc_param_types),
            "\n",
            gufunc_param_types,
        )

    gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1

    # Add gufunc stub last label to each parfor.loop_body label to prevent
    # label conflicts.
    loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label)
    # new label for splitting sentinel block
    new_label = max(loop_body.keys()) + 1

    # If enabled, add a print statement after every assignment.
    if config.DEBUG_ARRAY_OPT_RUNTIME:
        _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap)

    if config.DEBUG_ARRAY_OPT:
        print("parfor loop body")
        _print_body(loop_body)

    # NOTE: hoisting is currently disabled, so both lists stay empty; the
    # body is still wrapped/unwrapped around the setitem discovery.
    wrap_loop_body(loop_body)
    setitems = set()
    find_setitems_body(setitems, loop_body, typemap)

    hoisted = []
    not_hoisted = []

    # Hoisted statements go just before the terminator of the first block.
    start_block = gufunc_ir.blocks[min(gufunc_ir.blocks.keys())]
    start_block.body = start_block.body[:-1] + hoisted + [start_block.body[-1]]
    unwrap_loop_body(loop_body)

    # store hoisted into diagnostics
    diagnostics = lowerer.metadata["parfor_diagnostics"]
    diagnostics.hoist_info[parfor.id] = {
        "hoisted": hoisted,
        "not_hoisted": not_hoisted,
    }

    # Record the name of the device of the current queue for this parfor.
    diagnostics.extra_info[str(parfor.id)] = str(
        dpctl.get_current_queue().get_sycl_device().name)

    if config.DEBUG_ARRAY_OPT:
        print("After hoisting")
        _print_body(loop_body)

    # Search all the block in the gufunc outline for the sentinel assignment.
    # Only the first match is processed; the for/else + break pair leaves
    # both loops as soon as the block has been split.
    for label, block in gufunc_ir.blocks.items():
        for i, inst in enumerate(block.body):
            if (isinstance(inst, ir.Assign)
                    and inst.target.name == sentinel_name):
                # We found the sentinel assignment.
                sentinel_loc = inst.loc
                scope = block.scope
                # split block across __sentinel__
                # A new block is allocated for the statements prior to the
                # sentinel but the new block maintains the current block label.
                prev_block = ir.Block(scope, sentinel_loc)
                prev_block.body = block.body[:i]

                # The current block is used for statements after the sentinel.
                block.body = block.body[i + 1:]
                # But the current block gets a new label.
                body_first_label = min(loop_body.keys())

                # The previous block jumps to the minimum labelled block of the
                # parfor body.
                prev_block.append(ir.Jump(body_first_label, sentinel_loc))
                # Add all the parfor loop body blocks to the gufunc function's
                # IR.
                for body_label, body_block in loop_body.items():
                    gufunc_ir.blocks[body_label] = body_block
                body_last_label = max(loop_body.keys())
                gufunc_ir.blocks[new_label] = block
                gufunc_ir.blocks[label] = prev_block
                # Add a jump from the last parfor body block to the block
                # containing statements after the sentinel.
                gufunc_ir.blocks[body_last_label].append(
                    ir.Jump(new_label, sentinel_loc))
                break
        else:
            continue
        break

    if config.DEBUG_ARRAY_OPT:
        print("gufunc_ir last dump before renaming")
        gufunc_ir.dump()

    gufunc_ir.blocks = rename_labels(gufunc_ir.blocks)
    remove_dels(gufunc_ir.blocks)

    if config.DEBUG_ARRAY_OPT:
        sys.stdout.flush()
        print("gufunc_ir last dump")
        gufunc_ir.dump()
        print("flags", flags)
        print("typemap", typemap)

    old_alias = flags.noalias
    if not has_aliases:
        if config.DEBUG_ARRAY_OPT:
            print("No aliases found so adding noalias flag.")
        flags.noalias = True

    # ``flags`` belongs to the caller: restore ``noalias`` even when one of
    # the passes below raises, so a failed compilation does not leak the
    # temporary override.
    try:
        remove_dead(gufunc_ir.blocks, gufunc_ir.arg_names, gufunc_ir, typemap)

        if config.DEBUG_ARRAY_OPT:
            print("gufunc_ir after remove dead")
            gufunc_ir.dump()

        kernel_sig = signature(types.none, *gufunc_param_types)

        if config.DEBUG_ARRAY_OPT:
            sys.stdout.flush()
            print("before DUFunc inlining".center(80, "-"))
            gufunc_ir.dump()

        # Inlining all DUFuncs
        dufunc_inliner(
            gufunc_ir,
            lowerer.fndesc.calltypes,
            typemap,
            lowerer.context.typing_context,
            lowerer.context,
        )

        if config.DEBUG_ARRAY_OPT:
            print("after DUFunc inline".center(80, "-"))
            gufunc_ir.dump()

        kernel_func = dppy.compiler.compile_kernel_parfor(
            dpctl.get_current_queue(),
            gufunc_ir,
            gufunc_param_types,
            param_types_addrspaces,
            debug=flags.debuginfo,
        )
    finally:
        flags.noalias = old_alias

    if config.DEBUG_ARRAY_OPT:
        print("kernel_sig = ", kernel_sig)

    return kernel_func, parfor_args, kernel_sig, func_arg_types, setitems
예제 #20
0
    def _mk_stencil_parfor(self, label, in_args, out_arr, stencil_ir,
                           index_offsets, target, return_type, stencil_func,
                           arg_to_arr_dict):
        """Turn the blocks of a stencil kernel into a parfor.

        The kernel blocks in ``stencil_ir`` are cleaned up (copy
        propagation, dead code removal), their relative accesses are
        rewritten in terms of freshly created parfor index variables and
        every return is redirected to a single exit block that stores the
        result into the output array. When ``out_arr`` is None an output
        array is allocated with ``np.full`` in the parfor init block;
        otherwise, if a ``cval`` option is set, ``out_arr`` is filled with
        it there.

        Returns the list of generated IR nodes: whatever the start/last
        index helpers emitted, then the parfor itself, then the assignment
        of the output array to ``target``.
        """

        def dump_if_debug(title, blocks_to_dump):
            # Shared shape of the debug dumps sprinkled through this method.
            if config.DEBUG_ARRAY_OPT >= 1:
                print(title)
                ir_utils.dump_blocks(blocks_to_dump)

        nodes = []
        blocks = stencil_ir.blocks

        if config.DEBUG_ARRAY_OPT >= 1:
            print("_mk_stencil_parfor", label, in_args, out_arr, index_offsets,
                  return_type, stencil_func, blocks)
            ir_utils.dump_blocks(blocks)

        src_arr = in_args[0]
        src_arr_type = self.typemap[src_arr.name]
        # Copy propagation gets rid of copies of the input args (e.g. a = A).
        copies_in, _copies_out = ir_utils.copy_propagate(blocks, self.typemap)
        var_table = ir_utils.get_name_var_table(blocks)
        ir_utils.apply_copy_propagate(blocks, copies_in, var_table,
                                      self.typemap, self.calltypes)
        dump_if_debug("stencil_blocks after copy_propagate", blocks)

        ir_utils.remove_dead(blocks, self.func_ir.arg_names, stencil_ir,
                             self.typemap)
        dump_if_debug("stencil_blocks after removing dead code", blocks)

        # One intp index variable per dimension of the first input array.
        ndims = self.typemap[src_arr.name].ndim
        scope, loc = src_arr.scope, src_arr.loc
        index_vars = []
        for _ in range(ndims):
            idx_var = ir.Var(scope, mk_unique_var("$parfor_index_var"), loc)
            self.typemap[idx_var.name] = types.intp
            index_vars.append(idx_var)

        start_lengths, end_lengths = self._replace_stencil_accesses(
            stencil_ir, index_vars, in_args, index_offsets, stencil_func,
            arg_to_arr_dict)
        dump_if_debug("stencil_blocks after replace stencil accesses", blocks)

        # Loop nests: start from the stencil size to avoid invalid accesses.
        equiv_set = self.array_analysis.get_equiv_set(label)
        dim_sizes = equiv_set.get_shape(src_arr)
        assert ndims == len(dim_sizes)

        loop_nests = []
        for dim in range(ndims):
            stop_ind = self._get_stencil_last_ind(dim_sizes[dim],
                                                  end_lengths[dim], nodes,
                                                  scope, loc)
            begin_ind = self._get_stencil_start_ind(start_lengths[dim], nodes,
                                                    scope, loc)
            nest = numba.parfors.parfor.LoopNest(index_vars[dim], begin_ind,
                                                 stop_ind, 1)
            loop_nests.append(nest)

        # The parfor body needs a single exit block carrying the maximum
        # label, so every return is later turned into a jump to this block.
        exit_label = max(blocks.keys()) + 1
        blocks[exit_label] = ir.Block(scope, loc)
        exit_value = ir.Var(scope, mk_unique_var("$parfor_exit_value"), loc)
        self.typemap[exit_value.name] = return_type.dtype

        # The index used for the final store: the lone index variable in
        # 1D, otherwise a tuple built in the exit block.
        exit_prelude = []
        if ndims == 1:
            index_var = index_vars[0]
        else:
            index_var = ir.Var(scope,
                               mk_unique_var("$parfor_index_tuple_var"), loc)
            self.typemap[index_var.name] = types.containers.UniTuple(
                types.intp, ndims)
            build_index = ir.Expr.build_tuple(index_vars, loc)
            exit_prelude.append(ir.Assign(build_index, index_var, loc))
        dump_if_debug("stencil_blocks after creating parfor index var", blocks)

        init_block = ir.Block(scope, loc)
        if out_arr is None:
            # No output supplied: allocate one with
            # np.full(in_arr.shape, fill, dtype).
            src_arr_type = self.typemap[src_arr.name]

            shape_name = ir_utils.mk_unique_var("in_arr_shape")
            shape_var = ir.Var(scope, shape_name, loc)
            shape_expr = ir.Expr.getattr(src_arr, "shape", loc)
            self.typemap[shape_name] = types.containers.UniTuple(
                types.intp, src_arr_type.ndim)
            init_block.body.append(ir.Assign(shape_expr, shape_var, loc))

            fill_name = ir_utils.mk_unique_var("zero_val")
            fill_var = ir.Var(scope, fill_name, loc)
            if "cval" in stencil_func.options:
                cval = stencil_func.options["cval"]
                # TODO: Loosen this restriction to adhere to casting rules.
                if return_type.dtype != typing.typeof.typeof(cval):
                    raise ValueError(
                        "cval type does not match stencil return type.")
                fill_value = return_type.dtype(cval)
            else:
                fill_value = return_type.dtype(0)
            fill_const = ir.Const(fill_value, loc)
            self.typemap[fill_name] = return_type.dtype
            init_block.body.append(ir.Assign(fill_const, fill_var, loc))

            out_name = ir_utils.mk_unique_var("stencil_output")
            out_arr = ir.Var(scope, out_name, loc)
            self.typemap[out_arr.name] = numba.core.types.npytypes.Array(
                return_type.dtype, src_arr_type.ndim, src_arr_type.layout)

            # Load the np module, then the dtype class from it.
            np_mod_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc)
            self.typemap[np_mod_var.name] = types.misc.Module(np)
            init_block.body.append(
                ir.Assign(ir.Global('np', np, loc), np_mod_var, loc))

            dtype_expr = ir.Expr.getattr(np_mod_var, return_type.dtype.name,
                                         loc)
            dtype_var = ir.Var(scope, mk_unique_var("$np_attr_attr"), loc)
            self.typemap[dtype_var.name] = types.functions.NumberClass(
                return_type.dtype)
            init_block.body.append(ir.Assign(dtype_expr, dtype_var, loc))

            full_stmts = ir_utils.gen_np_call(
                "full", np.full, out_arr, [shape_var, fill_var, dtype_var],
                self.typingctx, self.typemap, self.calltypes)
            equiv_set.insert_equiv(out_arr, dim_sizes)
            init_block.body.extend(full_stmts)
        elif "cval" in stencil_func.options:
            # Output supplied together with a cval: emit out[:] = cval.
            cval = stencil_func.options["cval"]
            # TODO: Loosen this restriction to adhere to casting rules.
            cval_ty = typing.typeof.typeof(cval)
            if not self.typingctx.can_convert(cval_ty, return_type.dtype):
                raise ValueError(
                    "cval type does not match stencil return type.")

            # Reference to the builtin slice.
            slice_fn_var = ir.Var(scope, mk_unique_var("$py_g_var"), loc)
            slice_fn_type = self.typingctx.resolve_value_type(slice)
            self.typemap[slice_fn_var.name] = slice_fn_type
            init_block.body.append(
                ir.Assign(ir.Global('slice', slice, loc), slice_fn_var, loc))

            slice_sig = self.typingctx.resolve_function_type(
                slice_fn_type, (types.none, ) * 2, {})
            slice_call = ir.Expr.call(func=slice_fn_var, args=(), kws=(),
                                      loc=loc)
            self.calltypes[slice_call] = slice_sig

            full_slice_var = ir.Var(scope, mk_unique_var("$slice_inst"), loc)
            self.typemap[full_slice_var.name] = types.slice2_type
            init_block.body.append(ir.Assign(slice_call, full_slice_var, loc))

            # Constant holding cval converted to the return dtype.
            cval_const = ir.Const(return_type.dtype(cval), loc)
            cval_var = ir.Var(scope, mk_unique_var("$cval_const"), loc)
            self.typemap[cval_var.name] = return_type.dtype
            init_block.body.append(ir.Assign(cval_const, cval_var, loc))

            # The static setitem on the `out` array.
            fill_out = ir.StaticSetItem(out_arr, slice(None, None),
                                        full_slice_var, cval_var, loc)
            init_block.body.append(fill_out)
            self.calltypes[fill_out] = signature(
                types.none, self.typemap[out_arr.name],
                self.typemap[full_slice_var.name],
                self.typemap[out_arr.name].dtype)

        self.replace_return_with_setitem(blocks, exit_value, exit_label)
        dump_if_debug("stencil_blocks after replacing return", blocks)

        # The exit block stores the kernel result at the current index.
        store = ir.SetItem(out_arr, index_var, exit_value, loc)
        out_type = self.typemap[out_arr.name]
        self.calltypes[store] = signature(types.none, out_type,
                                          self.typemap[index_var.name],
                                          out_type.dtype)
        exit_block = blocks[exit_label]
        exit_block.body.extend(exit_prelude)
        exit_block.body.append(store)

        # CFG simplification needs a terminator, so a dummy return is added
        # to the exit block and popped again once the CFG is simplified.
        dummy_loc = ir.Loc("stencilparfor_dummy", -1)
        dummy_ret_var = ir.Var(scope, mk_unique_var("$cval_const"), dummy_loc)
        exit_block.body.append(
            ir.Assign(ir.Const(0, loc=dummy_loc), dummy_ret_var, dummy_loc))
        exit_block.body.append(ir.Return(dummy_ret_var, dummy_loc))
        blocks = ir_utils.simplify_CFG(blocks)
        blocks[max(blocks.keys())].body.pop()
        dump_if_debug("stencil_blocks after adding SetItem", blocks)

        pattern = ('stencil', [start_lengths, end_lengths])
        parfor = numba.parfors.parfor.Parfor(loop_nests, init_block, blocks,
                                             loc, index_var, equiv_set,
                                             pattern, self.flags)
        nodes.append(parfor)
        nodes.append(ir.Assign(out_arr, target, loc))
        return nodes