예제 #1
0
    def emit_zero_array(self, op, arglocs, regalloc):
        """Emit code that clears a memory region with the MVCLE instruction.

        ``arglocs`` unpacks to five locations: destination base, start
        index, length, offset and item size.  The offset and the start
        index are added onto the destination register (which is therefore
        modified) before the fill is emitted.  The item size location is
        unpacked but not used in this method.
        """
        dst_loc, start_loc, len_loc, offset_loc, itemsize_loc = arglocs
        mc = self.mc

        # dst += offset (register form unless the offset is an immediate)
        if not offset_loc.is_imm():
            mc.AGR(dst_loc, offset_loc)
        else:
            assert check_imm_value(offset_loc.value)
            mc.AGHI(dst_loc, offset_loc)

        # dst += start index, same scheme as above
        if not start_loc.is_imm():
            mc.AGR(dst_loc, start_loc)
        else:
            assert check_imm_value(start_loc.value)
            mc.AGHI(dst_loc, start_loc)

        assert not len_loc.is_imm()
        # r1 is the source length: clearing it means nothing is copied
        # from the source, so whatever r0 contains is irrelevant
        mc.XGR(r.r1, r.r1)

        # MVCLE works on register pairs: the destination must sit in an
        # even register with its length in the next one (the register
        # allocator is expected to guarantee this)
        assert dst_loc.is_even()
        assert len_loc.value == dst_loc.value + 1

        # the hardware offers this memset-like operation directly:
        #   dst = rX, dst len = rX+1, src = r0, src len = r1
        mc.MVCLE(dst_loc, r.r0, l.addr(0))
        # the cpu is free to stop the move early; branch back onto the
        # MVCLE for as long as the condition code reports unfinished work
        mc.BRC(c.OF, l.imm(-mc.MVCLE_byte_count))
예제 #2
0
    def emit_zero_array(self, op, arglocs, regalloc):
        """Emit code that clears a memory region with the MVCLE instruction.

        ``arglocs`` unpacks to five locations: destination base, start
        index, length, offset and item size.  The offset and the start
        index are added onto the destination register (which is therefore
        modified) before the fill is emitted.  The item size location is
        unpacked but not used in this method.
        """
        dst_loc, start_loc, len_loc, offset_loc, itemsize_loc = arglocs
        mc = self.mc

        # dst += offset (register form unless the offset is an immediate)
        if not offset_loc.is_imm():
            mc.AGR(dst_loc, offset_loc)
        else:
            assert check_imm_value(offset_loc.value)
            mc.AGHI(dst_loc, offset_loc)

        # dst += start index, same scheme as above
        if not start_loc.is_imm():
            mc.AGR(dst_loc, start_loc)
        else:
            assert check_imm_value(start_loc.value)
            mc.AGHI(dst_loc, start_loc)

        assert not len_loc.is_imm()
        # r1 is the source length: clearing it means nothing is copied
        # from the source, so whatever r0 contains is irrelevant
        mc.XGR(r.r1, r.r1)

        # MVCLE works on register pairs: the destination must sit in an
        # even register with its length in the next one (the register
        # allocator is expected to guarantee this)
        assert dst_loc.is_even()
        assert len_loc.value == dst_loc.value + 1

        # the hardware offers this memset-like operation directly:
        #   dst = rX, dst len = rX+1, src = r0, src len = r1
        mc.MVCLE(dst_loc, r.r0, l.addr(0))
        # the cpu is free to stop the move early; branch back onto the
        # MVCLE for as long as the condition code reports unfinished work
        mc.BRC(c.OF, l.imm(-mc.MVCLE_byte_count))
예제 #3
0
파일: assembler.py 프로젝트: sota/pypy-old
 def emit(self, op, arglocs, regalloc):
     """Emit a two-operand instruction, picking the variant that matches
     where the second operand lives.

     The method names ``rp_func``, ``rh_func``, ``ri_func`` and
     ``rr_func`` are free variables here (presumably bound by an
     enclosing factory function -- not visible from this snippet).
     """
     dst, src = arglocs
     mc = self.mc
     # operand lives in the literal pool
     if src.is_in_pool():
         getattr(mc, rp_func)(dst, src)
         return
     # neither pool nor immediate: fall back to the plain variant
     if not src.is_imm():
         getattr(mc, rr_func)(dst, src)
         return
     # immediate operand: use the rh variant when check_imm_value()
     # accepts the value, the ri variant otherwise
     if check_imm_value(src.value):
         getattr(mc, rh_func)(dst, src)
     else:
         getattr(mc, ri_func)(dst, src)
예제 #4
0
 def _call_assembler_check_descr(self, value, tmploc):
     """Emit a comparison of the frame's 'jf_descr' field against
     ``value`` followed by a placeholder to be patched by the caller.

     The field is read relative to r2.  Returns the code position of
     the placeholder.  ``tmploc`` is accepted but not used here.
     """
     mc = self.mc
     descr_ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
     mc.LG(r.SCRATCH, l.addr(descr_ofs, r.r2))
     if not check_imm_value(value):
         # too large for an immediate compare: materialize it first
         mc.load_imm(r.SCRATCH2, value)
         mc.cmp_op(r.SCRATCH, r.SCRATCH2, imm=False)
     else:
         mc.cmp_op(r.SCRATCH, l.imm(value), imm=True)
     patch_pos = mc.currpos()
     # reserve room for the jump; both pieces get overwritten later
     mc.trap()
     mc.write('\x00' * 4)
     return patch_pos
예제 #5
0
 def _call_assembler_check_descr(self, value, tmploc):
     """Emit a comparison of the frame's 'jf_descr' field against
     ``value`` followed by a placeholder to be patched by the caller.

     The field is read relative to r2.  Returns the code position of
     the placeholder.  ``tmploc`` is accepted but not used here.
     """
     mc = self.mc
     descr_ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
     mc.LG(r.SCRATCH, l.addr(descr_ofs, r.r2))
     if not check_imm_value(value):
         # too large for an immediate compare: materialize it first
         mc.load_imm(r.SCRATCH2, value)
         mc.cmp_op(r.SCRATCH, r.SCRATCH2, imm=False)
     else:
         mc.cmp_op(r.SCRATCH, l.imm(value), imm=True)
     patch_pos = mc.currpos()
     # reserve room for the jump; both pieces get overwritten later
     mc.trap()
     mc.write('\x00' * 4)
     return patch_pos
예제 #6
0
 def _emit_load_for_copycontent(self, dst, src_ptr, src_ofs, scale):
     """Emit ``dst = src_ptr + (src_ofs << scale)``.

     ``src_ofs`` may be an immediate or a register location; ``scale``
     is the shift count applied to it.  r.SCRATCH may be overwritten.
     """
     mc = self.mc
     if src_ofs.is_imm():
         byte_ofs = src_ofs.value << scale
         if check_imm_value(byte_ofs):
             mc.AGHIK(dst, src_ptr, l.imm(byte_ofs))
             return
         # using the scratch register is fine: it only has to hold
         # the value until the memory copy is invoked
         mc.load_imm(r.SCRATCH, byte_ofs)
         mc.AGRK(dst, src_ptr, r.SCRATCH)
         return
     # register offset: shift it into the scratch register when scaled
     addend = src_ofs
     if scale != 0:
         mc.SLLG(r.SCRATCH, src_ofs, l.addr(scale))
         addend = r.SCRATCH
     mc.AGRK(dst, src_ptr, addend)
예제 #7
0
 def _emit_load_for_copycontent(self, dst, src_ptr, src_ofs, scale):
     """Emit ``dst = src_ptr + (src_ofs << scale)``.

     ``src_ofs`` may be an immediate or a register location; ``scale``
     is the shift count applied to it.  r.SCRATCH may be overwritten.
     """
     mc = self.mc
     if src_ofs.is_imm():
         byte_ofs = src_ofs.value << scale
         if check_imm_value(byte_ofs):
             mc.AGHIK(dst, src_ptr, l.imm(byte_ofs))
             return
         # using the scratch register is fine: it only has to hold
         # the value until the memory copy is invoked
         mc.load_imm(r.SCRATCH, byte_ofs)
         mc.AGRK(dst, src_ptr, r.SCRATCH)
         return
     # register offset: shift it into the scratch register when scaled
     addend = src_ofs
     if scale != 0:
         mc.SLLG(r.SCRATCH, src_ofs, l.addr(scale))
         addend = r.SCRATCH
     mc.AGRK(dst, src_ptr, addend)
예제 #8
0
 def _emit_threadlocalref_get(self, op, arglocs, regalloc):
     """Emit a read of one thread-local field into the result register.

     The thread-local base is fetched from the stack slot at
     THREADLOCAL_ADDR_OFFSET, then the field at the offset given by
     ``op.getarg(1)`` is read with the size and signedness taken from
     the operation's descr.
     """
     (result_loc,) = arglocs
     # argument 0 is 'threadlocalref_get' itself; argument 1 the offset
     field_ofs = op.getarg(1).getint()
     descr = op.getdescr()
     nbytes = descr.get_result_size()
     is_signed = descr.is_result_signed()
     # Only supported when 'translate_support_code' is true: otherwise
     # execute_token() was run with a dummy value in the stack slot
     # that is read below.
     assert self.cpu.translate_support_code
     assert result_loc.is_reg()
     assert check_imm_value(field_ofs)
     # the result register first holds the base address, then the value
     base_slot = l.addr(THREADLOCAL_ADDR_OFFSET, r.SP)
     self.mc.LG(result_loc, base_slot)
     field_addr = l.addr(field_ofs, result_loc)
     self._memory_read(result_loc, field_addr, nbytes, is_signed)
예제 #9
0
 def _emit_threadlocalref_get(self, op, arglocs, regalloc):
     """Emit a read of one thread-local field into the result register.

     The thread-local base is fetched from the stack slot at
     THREADLOCAL_ADDR_OFFSET, then the field at the offset given by
     ``op.getarg(1)`` is read with the size and signedness taken from
     the operation's descr.
     """
     (result_loc,) = arglocs
     # argument 0 is 'threadlocalref_get' itself; argument 1 the offset
     field_ofs = op.getarg(1).getint()
     descr = op.getdescr()
     nbytes = descr.get_result_size()
     is_signed = descr.is_result_signed()
     # Only supported when 'translate_support_code' is true: otherwise
     # execute_token() was run with a dummy value in the stack slot
     # that is read below.
     assert self.cpu.translate_support_code
     assert result_loc.is_reg()
     assert check_imm_value(field_ofs)
     # the result register first holds the base address, then the value
     base_slot = l.addr(THREADLOCAL_ADDR_OFFSET, r.SP)
     self.mc.LG(result_loc, base_slot)
     field_addr = l.addr(field_ofs, result_loc)
     self._memory_read(result_loc, field_addr, nbytes, is_signed)
예제 #10
0
    def emit_guard_exception(self, op, arglocs, regalloc):
        """Emit a guard that the pending exception equals ``arglocs[0]``.

        ``arglocs[1]`` is the optional result location; when set, the
        word at ``pos_exc_value()`` is loaded into it.  The remaining
        entries are the fail arguments.  After the guard both the
        exception slot and the exception-value slot are zeroed.
        """
        expected_loc, result_loc = arglocs[:2]
        fail_locs = arglocs[2:]
        cpu = self.cpu
        mc = self.mc

        # SCRATCH = address of the exception-value slot; the exception
        # slot is then addressed relative to it
        mc.load_imm(r.SCRATCH, cpu.pos_exc_value())
        exc_delta = cpu.pos_exception() - cpu.pos_exc_value()
        assert check_imm_value(exc_delta)

        # compare the currently stored exception with the expected one
        mc.LG(r.SCRATCH2, l.addr(exc_delta, r.SCRATCH))
        mc.cmp_op(r.SCRATCH2, expected_loc)
        self.guard_success_cc = c.EQ
        self._emit_guard(op, fail_locs)

        if result_loc:
            mc.load(result_loc, r.SCRATCH, 0)
        # clear both slots
        mc.LGHI(r.SCRATCH2, l.imm(0))
        value_slot = l.addr(0, r.SCRATCH)
        mc.STG(r.SCRATCH2, value_slot)
        exc_slot = l.addr(exc_delta, r.SCRATCH)
        mc.STG(r.SCRATCH2, exc_slot)
예제 #11
0
    def emit_guard_exception(self, op, arglocs, regalloc):
        """Emit a guard that the pending exception equals ``arglocs[0]``.

        ``arglocs[1]`` is the optional result location; when set, the
        word at ``pos_exc_value()`` is loaded into it.  The remaining
        entries are the fail arguments.  After the guard both the
        exception slot and the exception-value slot are zeroed.
        """
        expected_loc, result_loc = arglocs[:2]
        fail_locs = arglocs[2:]
        cpu = self.cpu
        mc = self.mc

        # SCRATCH = address of the exception-value slot; the exception
        # slot is then addressed relative to it
        mc.load_imm(r.SCRATCH, cpu.pos_exc_value())
        exc_delta = cpu.pos_exception() - cpu.pos_exc_value()
        assert check_imm_value(exc_delta)

        # compare the currently stored exception with the expected one
        mc.LG(r.SCRATCH2, l.addr(exc_delta, r.SCRATCH))
        mc.cmp_op(r.SCRATCH2, expected_loc)
        self.guard_success_cc = c.EQ
        self._emit_guard(op, fail_locs)

        if result_loc:
            mc.load(result_loc, r.SCRATCH, 0)
        # clear both slots
        mc.LGHI(r.SCRATCH2, l.imm(0))
        value_slot = l.addr(0, r.SCRATCH)
        mc.STG(r.SCRATCH2, value_slot)
        exc_slot = l.addr(exc_delta, r.SCRATCH)
        mc.STG(r.SCRATCH2, exc_slot)
예제 #12
0
 def emit_guard_subclass(self, op, arglocs, regalloc):
     """Emit a guard that the object in ``arglocs[0]`` is an instance of
     the class given by ``arglocs[1]`` or of one of its subclasses.

     The object's ``subclassrange_min`` is loaded into SCRATCH2 and
     range-checked against the bounds read from the expected class at
     code-generation time.  ``arglocs[2:]`` are the fail arguments.
     """
     cpu = self.cpu
     assert cpu.supports_guard_gc_type
     mc = self.mc
     obj_loc = arglocs[0]
     class_loc = arglocs[1]
     vtable_ofs = cpu.vtable_offset
     range_min_ofs = cpu.subclassrange_min_offset
     if vtable_ofs is None:
         # no vtable pointer in the object: go through the typeid
         self._read_typeid(r.SCRATCH, obj_loc)
         # fold every constant part of the address into one immediate
         base_type_info, shift_by, sizeof_ti = (
             cpu.gc_ll_descr.get_translated_info_for_typeinfo())
         mc.load_imm(r.SCRATCH2,
                     base_type_info + sizeof_ti + range_min_ofs)
         assert shift_by == 0
         # the index is added by hand because r0 cannot be used
         # inside l.addr(...)
         mc.AGR(r.SCRATCH, r.SCRATCH2)
         mc.load(r.SCRATCH2, r.SCRATCH, 0)
     else:
         # fetch the vtable pointer from the object ...
         mc.LG(r.SCRATCH, l.addr(vtable_ofs, obj_loc))
         # ... and subclassrange_min from the vtable
         assert check_imm_value(range_min_ofs)
         mc.load(r.SCRATCH2, r.SCRATCH, range_min_ofs)
     # bounds of the expected class, known while generating the code
     expected_cls = rffi.cast(rclass.CLASSTYPE, class_loc.getint())
     lower = expected_cls.subclassrange_min
     upper = expected_cls.subclassrange_max
     assert upper > lower
     span = upper - lower - 1
     # a full PyPy currently needs fewer than 6000 numbers, so these
     # limits are simply asserted: 15 bits for the lower bound and
     # 16 bits for the span
     assert 0 <= lower <= 0x7fff
     assert 0 <= span <= 0xffff
     # unsigned test (value - lower) <= span, which is the same as
     # (value - lower) < (upper - lower)
     mc.AGHI(r.SCRATCH2, l.imm(-lower))
     mc.cmp_op(r.SCRATCH2, l.imm(span), imm=True, signed=False)
     # "below or equal" means the guard succeeds
     self.guard_success_cc = c.LE
     self._emit_guard(op, arglocs[2:])
예제 #13
0
 def emit_guard_subclass(self, op, arglocs, regalloc):
     """Emit a guard that the object in ``arglocs[0]`` is an instance of
     the class given by ``arglocs[1]`` or of one of its subclasses.

     The object's ``subclassrange_min`` is loaded into SCRATCH2 and
     range-checked against the bounds read from the expected class at
     code-generation time.  ``arglocs[2:]`` are the fail arguments.
     """
     cpu = self.cpu
     assert cpu.supports_guard_gc_type
     mc = self.mc
     obj_loc = arglocs[0]
     class_loc = arglocs[1]
     vtable_ofs = cpu.vtable_offset
     range_min_ofs = cpu.subclassrange_min_offset
     if vtable_ofs is None:
         # no vtable pointer in the object: go through the typeid
         self._read_typeid(r.SCRATCH, obj_loc)
         # fold every constant part of the address into one immediate
         base_type_info, shift_by, sizeof_ti = (
             cpu.gc_ll_descr.get_translated_info_for_typeinfo())
         mc.load_imm(r.SCRATCH2,
                     base_type_info + sizeof_ti + range_min_ofs)
         assert shift_by == 0
         # the index is added by hand because r0 cannot be used
         # inside l.addr(...)
         mc.AGR(r.SCRATCH, r.SCRATCH2)
         mc.load(r.SCRATCH2, r.SCRATCH, 0)
     else:
         # fetch the vtable pointer from the object ...
         mc.LG(r.SCRATCH, l.addr(vtable_ofs, obj_loc))
         # ... and subclassrange_min from the vtable
         assert check_imm_value(range_min_ofs)
         mc.load(r.SCRATCH2, r.SCRATCH, range_min_ofs)
     # bounds of the expected class, known while generating the code
     expected_cls = rffi.cast(rclass.CLASSTYPE, class_loc.getint())
     lower = expected_cls.subclassrange_min
     upper = expected_cls.subclassrange_max
     assert upper > lower
     span = upper - lower - 1
     # a full PyPy currently needs fewer than 6000 numbers, so these
     # limits are simply asserted: 15 bits for the lower bound and
     # 16 bits for the span
     assert 0 <= lower <= 0x7fff
     assert 0 <= span <= 0xffff
     # unsigned test (value - lower) <= span, which is the same as
     # (value - lower) < (upper - lower)
     mc.AGHI(r.SCRATCH2, l.imm(-lower))
     mc.cmp_op(r.SCRATCH2, l.imm(span), imm=True, signed=False)
     # "below or equal" means the guard succeeds
     self.guard_success_cc = c.LE
     self._emit_guard(op, arglocs[2:])
예제 #14
0
    def _write_barrier_fastpath(self,
                                mc,
                                descr,
                                arglocs,
                                regalloc,
                                array=False,
                                is_frame=False):
        """Emit the inline fast path of the GC write barrier into ``mc``.

        mc        -- code builder that receives the instructions
        descr     -- write barrier descr; supplies the flag byte offset,
                     the flag masks and the card page shift
        arglocs   -- arglocs[0] is the register holding the object being
                     written into; with card marking, arglocs[1] is the
                     array index (register or immediate) and arglocs[2]
                     a temporary register, used only when the index is
                     in a register
        regalloc  -- consulted for ``fprm.reg_bindings`` to choose which
                     slow-path helper to call
        array     -- enables the card-marking fast path when the descr
                     has ``jit_wb_cards_set`` set
        is_frame  -- the object is the frame; arglocs[0] must be r.SPP
                     and helper number 4 is used

        SCRATCH, SCRATCH2, r0 (when not is_frame) and r14 are
        overwritten by the emitted code.
        """
        # Write code equivalent to write_barrier() in the GC: it checks
        # a flag in the object at arglocs[0], and if set, it calls a
        # helper piece of assembler.  The latter saves registers as needed
        # and call the function remember_young_pointer() from the GC.
        if we_are_translated():
            cls = self.cpu.gc_ll_descr.has_write_barrier_class()
            assert cls is not None and isinstance(descr, cls)
        #
        card_marking_mask = 0
        mask = descr.jit_wb_if_flag_singlebyte
        if array and descr.jit_wb_cards_set != 0:
            # assumptions the rest of the function depends on:
            assert (
                descr.jit_wb_cards_set_byteofs == descr.jit_wb_if_flag_byteofs)
            card_marking_mask = descr.jit_wb_cards_set_singlebyte
        #
        loc_base = arglocs[0]
        assert loc_base.is_reg()
        if is_frame:
            assert loc_base is r.SPP
        assert check_imm_value(descr.jit_wb_if_flag_byteofs)
        # load the flag byte into SCRATCH2 and test the write barrier
        # flag on a copy, so that SCRATCH2 keeps the unmodified byte
        mc.LLGC(r.SCRATCH2, l.addr(descr.jit_wb_if_flag_byteofs, loc_base))
        mc.LGR(r.SCRATCH, r.SCRATCH2)
        mc.NILL(r.SCRATCH, l.imm(mask & 0xFF))

        jz_location = mc.get_relative_pos()
        mc.reserve_cond_jump(short=True)  # patched later with 'EQ'

        # for cond_call_gc_wb_array, also add another fast path:
        # if GCFLAG_CARDS_SET, then we can just set one bit and be done
        if card_marking_mask:
            # GCFLAG_CARDS_SET is in the same byte, which is still
            # available in SCRATCH2 from the load above
            mc.LGR(r.SCRATCH, r.SCRATCH2)
            mc.NILL(r.SCRATCH, l.imm(card_marking_mask & 0xFF))
            js_location = mc.get_relative_pos()
            mc.reserve_cond_jump()  # patched later with 'NE'
        else:
            js_location = 0

        # Write only a CALL to the helper prepared in advance, passing it as
        # argument the address of the structure we are writing into
        # (the first argument to COND_CALL_GC_WB).
        # helper index: +1 with card marking, +2 when float registers
        # are bound, and the fixed value 4 for the frame
        helper_num = (card_marking_mask != 0)
        if is_frame:
            helper_num = 4
        elif regalloc.fprm.reg_bindings:
            helper_num += 2
        if self.wb_slowpath[helper_num] == 0:  # tests only
            assert not we_are_translated()
            assert not is_frame
            self.cpu.gc_ll_descr.write_barrier_descr = descr
            self._build_wb_slowpath(card_marking_mask != 0,
                                    bool(regalloc.fprm.reg_bindings))
            assert self.wb_slowpath[helper_num] != 0
        #
        if not is_frame:
            mc.LGR(r.r0, loc_base)  # unusual argument location

        mc.load_imm(r.r14, self.wb_slowpath[helper_num])
        mc.BASR(r.r14, r.r14)

        if card_marking_mask:
            # The helper ends again with a check of the flag in the object.
            # So here, we can simply write again a beq, which will be
            # taken if GCFLAG_CARDS_SET is still not set.
            jns_location = mc.get_relative_pos()
            mc.reserve_cond_jump(short=True)
            #
            # patch the 'NE' above
            currpos = mc.currpos()
            pmc = OverwritingBuilder(mc, js_location, 1)
            pmc.BRCL(c.NE, l.imm(currpos - js_location))
            pmc.overwrite()
            #
            # case GCFLAG_CARDS_SET: emit a few instructions to do
            # directly the card flag setting
            loc_index = arglocs[1]
            if loc_index.is_reg():
                tmp_loc = arglocs[2]
                n = descr.jit_wb_card_page_shift

                assert tmp_loc is not loc_index

                # compute in tmp_loc the byte offset:
                #   tmp_loc = ~(index >> (card_page_shift + 3))
                # the shift is emitted here, the bit inversion follows
                # after the RISBG below
                mc.SRLG(tmp_loc, loc_index, l.addr(n + 3))

                # compute in SCRATCH the index of the bit inside the byte:
                #    scratch = (index >> card_page_shift) & 7
                # 0x80 sets zero flag. will store 0 into all not selected bits
                mc.RISBG(r.SCRATCH, loc_index, l.imm(61), l.imm(0x80 | 63),
                         l.imm(64 - n))
                # invert the bits of tmp_loc by XOR-ing with all ones
                mc.LGHI(r.SCRATCH2, l.imm(-1))
                mc.XGR(tmp_loc, r.SCRATCH2)

                # set SCRATCH2 to 1 << SCRATCH (the bit index from above)
                mc.LGHI(r.SCRATCH2, l.imm(1))
                mc.SLLG(r.SCRATCH2, r.SCRATCH2, l.addr(0, r.SCRATCH))

                # set this bit inside the byte of interest
                addr = l.addr(0, loc_base, tmp_loc)
                mc.LLGC(r.SCRATCH, addr)
                mc.OGRK(r.SCRATCH, r.SCRATCH, r.SCRATCH2)
                mc.STCY(r.SCRATCH, addr)
                # done
            else:
                # constant index: byte offset and bit value are computed
                # here, while generating the code
                byte_index = loc_index.value >> descr.jit_wb_card_page_shift
                byte_ofs = ~(byte_index >> 3)
                byte_val = 1 << (byte_index & 7)
                assert check_imm_value(byte_ofs,
                                       lower_bound=-2**19,
                                       upper_bound=2**19 - 1)

                addr = l.addr(byte_ofs, loc_base)
                mc.LLGC(r.SCRATCH, addr)
                mc.OILL(r.SCRATCH, l.imm(byte_val))
                mc.STCY(r.SCRATCH, addr)
            #
            # patch the beq just above
            currpos = mc.currpos()
            pmc = OverwritingBuilder(mc, jns_location, 1)
            pmc.BRC(c.EQ, l.imm(currpos - jns_location))
            pmc.overwrite()

        # patch the JZ above
        currpos = mc.currpos()
        pmc = OverwritingBuilder(mc, jz_location, 1)
        pmc.BRC(c.EQ, l.imm(currpos - jz_location))
        pmc.overwrite()
예제 #15
0
    def _write_barrier_fastpath(self, mc, descr, arglocs, regalloc, array=False,
                                is_frame=False):
        """Emit the inline fast path of the GC write barrier into ``mc``.

        mc        -- code builder that receives the instructions
        descr     -- write barrier descr; supplies the flag byte offset,
                     the flag masks and the card page shift
        arglocs   -- arglocs[0] is the register holding the object being
                     written into; with card marking, arglocs[1] is the
                     array index (register or immediate) and arglocs[2]
                     a temporary register, used only when the index is
                     in a register
        regalloc  -- consulted for ``fprm.reg_bindings`` to choose which
                     slow-path helper to call
        array     -- enables the card-marking fast path when the descr
                     has ``jit_wb_cards_set`` set
        is_frame  -- the object is the frame; arglocs[0] must be r.SPP
                     and helper number 4 is used

        SCRATCH, SCRATCH2, r0 (when not is_frame) and r14 are
        overwritten by the emitted code.
        """
        # Write code equivalent to write_barrier() in the GC: it checks
        # a flag in the object at arglocs[0], and if set, it calls a
        # helper piece of assembler.  The latter saves registers as needed
        # and call the function remember_young_pointer() from the GC.
        if we_are_translated():
            cls = self.cpu.gc_ll_descr.has_write_barrier_class()
            assert cls is not None and isinstance(descr, cls)
        #
        card_marking_mask = 0
        mask = descr.jit_wb_if_flag_singlebyte
        if array and descr.jit_wb_cards_set != 0:
            # assumptions the rest of the function depends on:
            assert (descr.jit_wb_cards_set_byteofs ==
                    descr.jit_wb_if_flag_byteofs)
            card_marking_mask = descr.jit_wb_cards_set_singlebyte
        #
        loc_base = arglocs[0]
        assert loc_base.is_reg()
        if is_frame:
            assert loc_base is r.SPP
        assert check_imm_value(descr.jit_wb_if_flag_byteofs)
        # load the flag byte into SCRATCH2 and test the write barrier
        # flag on a copy, so that SCRATCH2 keeps the unmodified byte
        mc.LLGC(r.SCRATCH2, l.addr(descr.jit_wb_if_flag_byteofs, loc_base))
        mc.LGR(r.SCRATCH, r.SCRATCH2)
        mc.NILL(r.SCRATCH, l.imm(mask & 0xFF))

        jz_location = mc.get_relative_pos()
        mc.reserve_cond_jump(short=True)  # patched later with 'EQ'

        # for cond_call_gc_wb_array, also add another fast path:
        # if GCFLAG_CARDS_SET, then we can just set one bit and be done
        if card_marking_mask:
            # GCFLAG_CARDS_SET is in the same byte, which is still
            # available in SCRATCH2 from the load above
            mc.LGR(r.SCRATCH, r.SCRATCH2)
            mc.NILL(r.SCRATCH, l.imm(card_marking_mask & 0xFF))
            js_location = mc.get_relative_pos()
            mc.reserve_cond_jump()  # patched later with 'NE'
        else:
            js_location = 0

        # Write only a CALL to the helper prepared in advance, passing it as
        # argument the address of the structure we are writing into
        # (the first argument to COND_CALL_GC_WB).
        # helper index: +1 with card marking, +2 when float registers
        # are bound, and the fixed value 4 for the frame
        helper_num = (card_marking_mask != 0)
        if is_frame:
            helper_num = 4
        elif regalloc.fprm.reg_bindings:
            helper_num += 2
        if self.wb_slowpath[helper_num] == 0:    # tests only
            assert not we_are_translated()
            assert not is_frame
            self.cpu.gc_ll_descr.write_barrier_descr = descr
            self._build_wb_slowpath(card_marking_mask != 0,
                                    bool(regalloc.fprm.reg_bindings))
            assert self.wb_slowpath[helper_num] != 0
        #
        if not is_frame:
            mc.LGR(r.r0, loc_base)    # unusual argument location

        mc.load_imm(r.r14, self.wb_slowpath[helper_num])
        mc.BASR(r.r14, r.r14)

        if card_marking_mask:
            # The helper ends again with a check of the flag in the object.
            # So here, we can simply write again a beq, which will be
            # taken if GCFLAG_CARDS_SET is still not set.
            jns_location = mc.get_relative_pos()
            mc.reserve_cond_jump(short=True)
            #
            # patch the 'NE' above
            currpos = mc.currpos()
            pmc = OverwritingBuilder(mc, js_location, 1)
            pmc.BRCL(c.NE, l.imm(currpos - js_location))
            pmc.overwrite()
            #
            # case GCFLAG_CARDS_SET: emit a few instructions to do
            # directly the card flag setting
            loc_index = arglocs[1]
            if loc_index.is_reg():
                tmp_loc = arglocs[2]
                n = descr.jit_wb_card_page_shift

                assert tmp_loc is not loc_index

                # compute in tmp_loc the byte offset:
                #   tmp_loc = ~(index >> (card_page_shift + 3))
                # the shift is emitted here, the bit inversion follows
                # after the RISBG below
                mc.SRLG(tmp_loc, loc_index, l.addr(n+3))

                # compute in SCRATCH the index of the bit inside the byte:
                #    scratch = (index >> card_page_shift) & 7
                # 0x80 sets zero flag. will store 0 into all not selected bits
                mc.RISBG(r.SCRATCH, loc_index, l.imm(61), l.imm(0x80 | 63), l.imm(64-n))
                # invert the bits of tmp_loc by XOR-ing with all ones
                mc.LGHI(r.SCRATCH2, l.imm(-1))
                mc.XGR(tmp_loc, r.SCRATCH2)

                # set SCRATCH2 to 1 << SCRATCH (the bit index from above)
                mc.LGHI(r.SCRATCH2, l.imm(1))
                mc.SLLG(r.SCRATCH2, r.SCRATCH2, l.addr(0,r.SCRATCH))

                # set this bit inside the byte of interest
                addr = l.addr(0, loc_base, tmp_loc)
                mc.LLGC(r.SCRATCH, addr)
                mc.OGRK(r.SCRATCH, r.SCRATCH, r.SCRATCH2)
                mc.STCY(r.SCRATCH, addr)
                # done
            else:
                # constant index: byte offset and bit value are computed
                # here, while generating the code
                byte_index = loc_index.value >> descr.jit_wb_card_page_shift
                byte_ofs = ~(byte_index >> 3)
                byte_val = 1 << (byte_index & 7)
                assert check_imm_value(byte_ofs, lower_bound=-2**19, upper_bound=2**19-1)

                addr = l.addr(byte_ofs, loc_base)
                mc.LLGC(r.SCRATCH, addr)
                mc.OILL(r.SCRATCH, l.imm(byte_val))
                mc.STCY(r.SCRATCH, addr)
            #
            # patch the beq just above
            currpos = mc.currpos()
            pmc = OverwritingBuilder(mc, jns_location, 1)
            pmc.BRC(c.EQ, l.imm(currpos - jns_location))
            pmc.overwrite()

        # patch the JZ above
        currpos = mc.currpos()
        pmc = OverwritingBuilder(mc, jz_location, 1)
        pmc.BRC(c.EQ, l.imm(currpos - jz_location))
        pmc.overwrite()