示例#1
0
文件: x86.py 项目: shreyas42/slingen
    def loadMatrix(self, mParams):
        """Emit scalar moves that fill the destination for structured matrices.

        Two structure tests are applied independently, so both may contribute
        instructions to the result:

        * ``IdentityMatrix``: the constant 1 is moved into each diagonal
          entry ``(i, i)`` for ``i < M``.  Off-diagonal entries are not
          written by this method.
        * ``AllEntriesConstantMatrix``: the structure's ``_const_value`` is
          moved into every one of the ``M x N`` entries.

        :param mParams: dict providing ``nuM``, ``nuML``, ``nuMR``, ``M``,
            ``N``, ``struct`` and ``access``.
        :return: list of ``Comment``/``Mov`` instructions; empty when neither
            structure test matches.
        """
        dst = mParams['nuM']
        dL, dR = mParams['nuML'], mParams['nuMR']
        M, N = mParams['M'], mParams['N']
        mStruct, mAccess = mParams['struct'], mParams['access']
        instructions = []

        if IdentityMatrix.test(mStruct, mAccess, M, N):
            instructions.append(Comment('%dx%d - %s' % (M, N, 'Identity')))
            instructions.extend(
                Mov(V(1), ScaLoad(dst[dL.of(i), dR.of(i)])) for i in range(M))

        if AllEntriesConstantMatrix.test(mStruct, mAccess, M, N):
            # ``keys()[0]`` only works where keys() returns a list (Python 2);
            # next(iter(...)) yields the same first key on Python 2 and 3.
            mat_type = next(iter(mStruct.keys()))
            instructions.append(
                Comment('%dx%d - %s' % (M, N, str(mat_type))))
            for i in range(M):
                for j in range(N):
                    instructions.append(
                        Mov(V(mat_type._const_value),
                            ScaLoad(dst[dL.of(i), dR.of(j)])))

        return instructions
示例#2
0
    def loadMatrix(self, mParams):
        """Build the load sequence for a 1x1 or non-compact 2x1 source tile.

        * 1x1: the element is loaded with ``mmLoadSd`` and stored with
          ``mmStoreuPd``.
        * 2x1, not compact: both elements are loaded separately, combined with
          ``mmShufflePd(..., (0, 0))`` and stored with ``mmStoreuPd``.

        Every other combination of ``M``, ``N`` and ``compact`` yields an
        empty list.

        :param mParams: dict providing ``m``, ``nuM``, ``mL``, ``mR``,
            ``nuML``, ``nuMR``, ``M``, ``N`` and ``compact``.
        :return: list of instructions (possibly empty).
        """
        source = mParams['m']
        target = mParams['nuM']
        srcRow, srcCol = mParams['mL'], mParams['mR']
        dstRow, dstCol = mParams['nuML'], mParams['nuMR']
        rows, cols = mParams['M'], mParams['N']
        compact = mParams['compact']

        # Only single-column tiles are handled here.
        if cols != 1:
            return []

        if rows == 1:
            out = Pointer(target[dstRow.of(0), dstCol.of(0)])
            scalar = mmLoadSd(
                AddressOf(sa(source[srcRow.of(0), srcCol.of(0)])))
            store = mmStoreuPd(scalar, out)
            return [Comment("1x1 -> 1x2"), store]

        if rows == 2 and not compact:
            first = mmLoadSd(Pointer(source[srcRow.of(0), srcCol.of(0)]))
            second = mmLoadSd(Pointer(source[srcRow.of(1), srcCol.of(0)]))
            out = Pointer(target[dstRow.of(0), dstCol.of(0)])
            store = mmStoreuPd(mmShufflePd(first, second, (0, 0)), out)
            return [Comment("2x1 -> 2x1 - incompact"), store]

        return []
示例#3
0
    def storeMatrix(self, mParams):
        """Build the store sequence for a 1x1 or non-compact 2x1 tile.

        * 1x1: the vector is loaded with ``mmLoaduPd`` and written back with a
          single ``mmStoreSd``.
        * 2x1, not compact: the vector is loaded once; it and a
          ``mmShufflePd(v, v, (1, 1))`` copy are each written with
          ``mmStoreSd`` to the two destination rows.

        Every other combination of ``M``, ``N`` and ``compact`` yields an
        empty list.

        :param mParams: dict providing ``nuM``, ``m``, ``nuML``, ``nuMR``,
            ``mL``, ``mR``, ``M``, ``N`` and ``compact``.
        :return: list of instructions (possibly empty).
        """
        source = mParams['nuM']
        target = mParams['m']
        srcRow, srcCol = mParams['nuML'], mParams['nuMR']
        dstRow, dstCol = mParams['mL'], mParams['mR']
        rows, cols = mParams['M'], mParams['N']
        compact = mParams['compact']

        # Only single-column tiles are handled here.
        if cols != 1:
            return []

        if rows == 1:
            vec = mmLoaduPd(Pointer(source[srcRow.of(0), srcCol.of(0)]))
            out = AddressOf(sa(target[dstRow.of(0), dstCol.of(0)]))
            return [Comment("1x2 -> 1x1"), mmStoreSd(vec, out)]

        if rows == 2 and not compact:
            vec = mmLoaduPd(Pointer(source[srcRow.of(0), srcCol.of(0)]))
            upper = mmShufflePd(vec, vec, (1, 1))
            outTop = Pointer(target[dstRow.of(0), dstCol.of(0)])
            outBottom = Pointer(target[dstRow.of(1), dstCol.of(0)])
            storeTop = mmStoreSd(vec, outTop)
            storeBottom = mmStoreSd(upper, outBottom)
            return [
                Comment("2x1 -> 2x1 - (Store) Incompact"), storeTop,
                storeBottom
            ]

        return []
示例#4
0
文件: x86.py 项目: shreyas42/slingen
    def Kro(self, s0Params, s1Params, dParams, opts):
        """Emit scalar instructions for the Kronecker product of two tiles.

        For an ``M x K`` left operand and an ``N x P`` right operand, the
        product ``src0[i, k] * src1[j, p]`` is moved to destination entry
        ``(i * N + j, k * P + p)``.

        The destination index previously was ``(i + j, k + p)``, which makes
        distinct products overwrite each other as soon as both operands are
        larger than 1x1.  When either operand is 1x1 (scalar times matrix)
        both formulas give the same index, so that case is unchanged.

        :param s0Params: dict for the left operand (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param s1Params: dict for the right operand (same keys).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            product.
        """
        src0, src1, dst = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        s0L, s0R = s0Params['nuML'], s0Params['nuMR']
        s1L, s1R = s1Params['nuML'], s1Params['nuMR']
        dL, dR = dParams['nuML'], dParams['nuMR']
        M, K, N, P = s0Params['nuMM'], s0Params['nuMN'], s1Params[
            'nuMM'], s1Params['nuMN']
        instructions = []

        instructions += [
            Comment("1-BLAC: " + str(M) + "x" + str(K) + " Kro " + str(N) +
                    "x" + str(P))
        ]
        for i in range(M):
            for k in range(K):
                for j in range(N):
                    for p in range(P):
                        # Block (i, k) of the result holds src0[i, k] * src1,
                        # so its entries start at row i*N and column k*P.
                        instr = Mov(
                            ScaMul(ScaLoad(src0[s0L.of(i),
                                                s0R.of(k)]),
                                   ScaLoad(src1[s1L.of(j),
                                                s1R.of(p)])),
                            ScaLoad(dst[dL.of(i * N + j),
                                        dR.of(k * P + p)]))
                        instructions.append(instr)

        return instructions
示例#5
0
文件: x86.py 项目: shreyas42/slingen
    def Mul(self, s0Params, s1Params, dParams, opts):
        """Emit scalar instructions for an ``M x K`` by ``K x N`` product.

        The first pass writes ``src0[i, 0] * src1[0, j]`` into every
        destination entry; each later pass ``k = 1 .. K-1`` adds
        ``src0[i, k] * src1[k, j]`` to the destination entry.

        :param s0Params: dict for the left operand (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param s1Params: dict for the right operand (``nuM``, ``nuML``,
            ``nuMR``, ``nuMN``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by ``Mov``
            instructions.
        """
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        rows, inner, cols = (s0Params['nuMM'], s0Params['nuMN'],
                             s1Params['nuMN'])

        def term(r, k, c):
            # One partial product lhs[r, k] * rhs[k, c].
            return ScaMul(ScaLoad(lhs[lhsRow.of(r), lhsCol.of(k)]),
                          ScaLoad(rhs[rhsRow.of(k), rhsCol.of(c)]))

        def cell(r, c):
            # A fresh reference to destination entry (r, c).
            return ScaLoad(out[outRow.of(r), outCol.of(c)])

        title = ("1-BLAC: " + "x".join(map(str, (rows, inner))) + " * " +
                 "x".join(map(str, (inner, cols))))
        result = [Comment(title)]

        # k = 0 initialises every destination entry.
        for r in range(rows):
            for c in range(cols):
                result.append(Mov(term(r, 0, c), cell(r, c)))

        # Remaining k accumulate onto the destination.
        for k in range(1, inner):
            for r in range(rows):
                for c in range(cols):
                    partial = term(r, k, c)
                    result.append(
                        Mov(ScaAdd(cell(r, c), partial), cell(r, c)))

        return result
示例#6
0
    def Kro(self, s0Params, s1Params, dParams, opts):
        """Emit packed-float instructions for a scalar-by-tile product.

        Three cases are distinguished using the operands' ``M``/``N`` sizes:

        * all four sizes multiply to 1: one load of each operand, one
          ``mmMulPs`` and one store;
        * left operand is 1x1: its first lane is broadcast with
          ``mmShufflePs(v, v, (0, 0, 0, 0))`` and multiplied with one row of
          the right operand (when ``nuMM * nuMN`` of the right operand equals
          ``nu``) or with ``nu`` rows;
        * otherwise the right operand is broadcast and multiplied with one or
          ``nu`` rows of the left operand, by the analogous test.

        Finally every instruction's ``bounds`` is updated with
        ``dParams['bounds']``.

        :param s0Params: dict for the left operand.
        :param s1Params: dict for the right operand.
        :param dParams: dict for the destination, including ``bounds``.
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by the stores.
        """
        nu = 4
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        oM, oK, oN, oP = (s0Params['M'], s0Params['N'], s1Params['M'],
                          s1Params['N'])
        M, K, N, P = (s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMM'],
                      s1Params['nuMN'])

        title = (str(nu) + "-BLAC: " + str(M) + "x" + str(K) + " Kro " +
                 str(N) + "x" + str(P))
        result = [Comment(title)]

        if oM * oK * oN * oP == 1:
            target = Pointer(out[outRow.of(0), outCol.of(0)])
            left = mmLoaduPs(Pointer(lhs[lhsRow.of(0), lhsCol.of(0)]))
            right = mmLoaduPs(Pointer(rhs[rhsRow.of(0), rhsCol.of(0)]))
            result.append(mmStoreuPs(mmMulPs(left, right), target))
        else:
            scalarOnLeft = oM * oK == 1
            if scalarOnLeft:
                rowCount = 1 if N * P == nu else nu
                scalar = mmLoaduPs(Pointer(lhs[lhsRow.of(0), lhsCol.of(0)]))
            else:
                rowCount = 1 if M * K == nu else nu
                scalar = mmLoaduPs(Pointer(rhs[rhsRow.of(0), rhsCol.of(0)]))
            splat = mmShufflePs(scalar, scalar, (0, 0, 0, 0))

            for r in range(rowCount):
                if scalarOnLeft:
                    vec = mmLoaduPs(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))
                    target = Pointer(out[outRow.of(r), outCol.of(0)])
                    product = mmMulPs(splat, vec)
                else:
                    vec = mmLoaduPs(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))
                    target = Pointer(out[outRow.of(r), outCol.of(0)])
                    product = mmMulPs(vec, splat)
                result.append(mmStoreuPs(product, target))

        for emitted in result:
            emitted.bounds.update(dParams['bounds'])
        return result
示例#7
0
    def Add(self, s0Params, s1Params, dParams, opts):
        """Emit packed-double instructions adding two tiles row by row.

        When ``M * N == nu`` a single row (index 0) is processed; when both
        ``M`` and ``N`` equal ``nu`` every row ``0 .. M-1`` is processed.  For
        any other shape only the leading ``Comment`` is returned.

        :param s0Params: dict for the first operand (``nuM``, ``nuML``,
            ``nuMR``).
        :param s1Params: dict for the second operand (same keys).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by the stores.
        """
        nu = 2
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']

        shape = str(M) + "x" + str(N)
        result = [Comment(str(nu) + "-BLAC: " + shape + " + " + shape)]

        # Pick which destination rows get a vector add.
        if M * N == nu:
            rowIndices = [0]
        elif M == nu and N == nu:
            rowIndices = range(M)
        else:
            rowIndices = []

        for r in rowIndices:
            left = mmLoaduPd(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))
            right = mmLoaduPd(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))
            target = Pointer(out[outRow.of(r), outCol.of(0)])
            result.append(mmStoreuPd(mmAddPd(left, right), target))

        return result
示例#8
0
    def T(self, sParams, dParams, opts):
        """Emit packed-double instructions transposing a tile.

        When ``M * N == nu`` the single source vector is stored unchanged.
        Otherwise two source rows are loaded and recombined with
        ``mmUnpackloPd`` / ``mmUnpackhiPd`` before being stored to destination
        rows 0 and 1.

        :param sParams: dict for the source (``nuM``, ``nuML``, ``nuMR``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by the stores.
        """
        nu = 2
        source, target = sParams['nuM'], dParams['nuM']
        srcRow, srcCol = sParams['nuML'], sParams['nuMR']
        dstRow, dstCol = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']

        result = [
            Comment(str(nu) + "-BLAC: (" + str(N) + "x" + str(M) + ")^T")
        ]

        if M * N == nu:
            vec = mmLoaduPd(Pointer(source[srcRow.of(0), srcCol.of(0)]))
            out = Pointer(target[dstRow.of(0), dstCol.of(0)])
            result.append(mmStoreuPd(vec, out))
            return result

        loaded = [
            mmLoaduPd(Pointer(source[srcRow.of(r), srcCol.of(0)]))
            for r in range(2)
        ]
        outs = [
            Pointer(target[dstRow.of(r), dstCol.of(0)]) for r in range(2)
        ]
        low = mmUnpackloPd(loaded[0], loaded[1])
        high = mmUnpackhiPd(loaded[0], loaded[1])
        result.append(mmStoreuPd(low, outs[0]))
        result.append(mmStoreuPd(high, outs[1]))
        return result
示例#9
0
文件: x86.py 项目: shreyas42/slingen
    def Neg(self, sParams, dParams, opts):
        """Emit scalar instructions negating every entry of a tile.

        Each destination entry ``(i, j)`` receives ``-1 * src[i, j]``.

        Previously only entry ``(0, 0)`` was negated even though ``M`` and
        ``N`` were read; the sibling scalar kernels (``Zero``, ``Copy``,
        ``Sub``, ``T``) all iterate over the full ``M x N`` tile, so this one
        now does as well.  For a 1x1 tile the output is unchanged.

        :param sParams: dict for the source (``nuM``, ``nuML``, ``nuMR``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            entry.
        """
        src, dst = sParams['nuM'], dParams['nuM']
        sL, sR = sParams['nuML'], sParams['nuMR']
        dL, dR = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']
        instructions = []

        instructions += [Comment("1-BLAC: -(" + str(N) + "x" + str(M) + ")")]
        for i in range(M):
            for j in range(N):
                instr = Mov(ScaMul(V(-1), ScaLoad(src[sL.of(i), sR.of(j)])),
                            ScaLoad(dst[dL.of(i), dR.of(j)]))
                instructions.append(instr)

        return instructions
示例#10
0
文件: x86.py 项目: shreyas42/slingen
    def Zero(self, dParams, opts):
        """Emit scalar instructions that move the constant 0 into every entry.

        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            entry, in row-major order.
        """
        target = dParams['nuM']
        rowMap, colMap = dParams['nuML'], dParams['nuMR']
        rows, cols = dParams['nuMM'], dParams['nuMN']

        result = [Comment("1-BLAC: Zero " + str(rows) + "x" + str(cols))]
        result.extend(
            Mov(V(0), ScaLoad(target[rowMap.of(r), colMap.of(c)]))
            for r in range(rows) for c in range(cols))
        return result
示例#11
0
文件: x86.py 项目: shreyas42/slingen
    def T(self, sParams, dParams, opts):
        """Emit scalar instructions transposing a tile.

        Destination entry ``(i, j)`` receives source entry ``(j, i)``.

        :param sParams: dict for the source (``nuM``, ``nuML``, ``nuMR``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            destination entry, in row-major order.
        """
        source, target = sParams['nuM'], dParams['nuM']
        srcRow, srcCol = sParams['nuML'], sParams['nuMR']
        dstRow, dstCol = dParams['nuML'], dParams['nuMR']
        rows, cols = dParams['nuMM'], dParams['nuMN']

        result = [
            Comment("1-BLAC: (" + str(cols) + "x" + str(rows) + ")^T")
        ]
        result.extend(
            Mov(ScaLoad(source[srcRow.of(c), srcCol.of(r)]),
                ScaLoad(target[dstRow.of(r), dstCol.of(c)]))
            for r in range(rows) for c in range(cols))
        return result
示例#12
0
文件: x86.py 项目: shreyas42/slingen
    def Copy(self, sParams, dParams, opts):
        """Emit scalar instructions copying a tile entry by entry.

        The iteration bounds come from the source's ``nuMM`` / ``nuMN``.

        :param sParams: dict for the source (``nuM``, ``nuML``, ``nuMR``,
            ``nuMM``, ``nuMN``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            entry, in row-major order.
        """
        source, target = sParams['nuM'], dParams['nuM']
        srcRow, srcCol = sParams['nuML'], sParams['nuMR']
        dstRow, dstCol = dParams['nuML'], dParams['nuMR']
        rows, cols = sParams['nuMM'], sParams['nuMN']

        result = [Comment("1-BLAC: Copy " + str(rows) + "x" + str(cols))]
        result.extend(
            Mov(ScaLoad(source[srcRow.of(r), srcCol.of(c)]),
                ScaLoad(target[dstRow.of(r), dstCol.of(c)]))
            for r in range(rows) for c in range(cols))
        return result
示例#13
0
文件: x86.py 项目: shreyas42/slingen
    def Sub(self, s0Params, s1Params, dParams, opts):
        """Emit scalar instructions for the entry-wise difference of two tiles.

        Destination entry ``(i, j)`` receives ``src0[i, j] - src1[i, j]``.

        :param s0Params: dict for the minuend (``nuM``, ``nuML``, ``nuMR``).
        :param s1Params: dict for the subtrahend (same keys).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            entry, in row-major order.
        """
        src0, src1, dst = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        s0L, s0R = s0Params['nuML'], s0Params['nuMR']
        s1L, s1R = s1Params['nuML'], s1Params['nuMR']
        dL, dR = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']
        instructions = []

        instructions += [
            Comment("1-BLAC: " + str(M) + "x" + str(N) + " - " + str(M) + "x" +
                    str(N))
        ]
        for i in range(M):
            for j in range(N):
                instr = Mov(
                    ScaSub(ScaLoad(src0[s0L.of(i), s0R.of(j)]),
                           ScaLoad(src1[s1L.of(i), s1R.of(j)])),
                    ScaLoad(dst[dL.of(i), dR.of(j)]))
                instructions.append(instr)

        return instructions
示例#14
0
    def T(self, sParams, dParams, opts):
        """Emit packed-float instructions transposing a tile.

        When ``M * N == nu`` the single source vector is stored unchanged.
        Otherwise four source rows are loaded and recombined with the
        unpack-lo/hi and movelh/movehl sequence (the same order of operations
        as ``_MM_TRANSPOSE4_PS``) before being stored to destination rows
        0..3.  Finally every instruction's ``bounds`` is updated with
        ``dParams['bounds']``.

        :param sParams: dict for the source (``nuM``, ``nuML``, ``nuMR``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``, ``nuMM``, ``nuMN``, ``bounds``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by the stores.
        """
        nu = 4
        source, target = sParams['nuM'], dParams['nuM']
        srcRow, srcCol = sParams['nuML'], sParams['nuMR']
        dstRow, dstCol = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']

        result = [
            Comment(str(nu) + "-BLAC: (" + str(N) + "x" + str(M) + ")^T")
        ]

        if M * N == nu:
            vec = mmLoaduPs(Pointer(source[srcRow.of(0), srcCol.of(0)]))
            out = Pointer(target[dstRow.of(0), dstCol.of(0)])
            result.append(mmStoreuPs(vec, out))
        else:
            r0, r1, r2, r3 = [
                mmLoaduPs(Pointer(source[srcRow.of(r), srcCol.of(0)]))
                for r in range(4)
            ]
            # Interleave row pairs, then merge halves into columns.
            lo01 = mmUnpackloPs(r0, r1)
            lo23 = mmUnpackloPs(r2, r3)
            hi01 = mmUnpackhiPs(r0, r1)
            hi23 = mmUnpackhiPs(r2, r3)
            columns = [
                mmMovelhPs(lo01, lo23),
                mmMovehlPs(lo23, lo01),
                mmMovelhPs(hi01, hi23),
                mmMovehlPs(hi23, hi01),
            ]
            outs = [
                Pointer(target[dstRow.of(r), dstCol.of(0)]) for r in range(4)
            ]
            result.extend(
                [mmStoreuPs(columns[r], outs[r]) for r in range(4)])

        for emitted in result:
            emitted.bounds.update(dParams['bounds'])
        return result
示例#15
0
文件: x86.py 项目: shreyas42/slingen
    def Div(self, s0Params, s1Params, dParams, opts):
        """Emit scalar instructions dividing a column by a single entry.

        For each row ``i < M`` the destination entry ``(i, 0)`` receives
        ``src0[i, 0] / src1[0, 0]``.

        :param s0Params: dict for the dividend (``nuM``, ``nuML``, ``nuMR``,
            ``nuMM``).
        :param s1Params: dict for the divisor (``nuM``, ``nuML``, ``nuMR``,
            ``nuMN``).
        :param dParams: dict for the destination (``nuM``, ``nuML``,
            ``nuMR``).
        :param opts: unused here.
        :return: list starting with a ``Comment`` followed by one ``Mov`` per
            row.
        """
        dividend, divisor, out = (s0Params['nuM'], s1Params['nuM'],
                                  dParams['nuM'])
        numRow, numCol = s0Params['nuML'], s0Params['nuMR']
        denRow, denCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        rows, cols = s0Params['nuMM'], s1Params['nuMN']

        title = ("1-BLAC: " + str(rows) + "x" + str(cols) + " / " +
                 str(cols) + "x" + str(cols))
        result = [Comment(title)]
        result.extend(
            Mov(ScaDiv(ScaLoad(dividend[numRow.of(r), numCol.of(0)]),
                       ScaLoad(divisor[denRow.of(0), denCol.of(0)])),
                ScaLoad(out[outRow.of(r), outCol.of(0)]))
            for r in range(rows))
        return result
示例#16
0
    def loadMatrix(self, mParams):
        """Build the instructions that load an M x N source tile into ``nuM``.

        The method dispatches on the tile shape ``(M, N)`` and, for some
        shapes, on the ``compact`` and ``corner`` flags.  Each handled case
        appends a ``Comment`` naming the case followed by one ``mmStoreuPs``
        per destination row that is written; several cases also store
        ``mmSetzeroPs()`` into the remaining destination rows.

        Shapes with no matching branch (for example ``M == 1, N == 4``,
        ``M == 4`` with ``N == 1`` and ``compact`` set, ``M == 4, N == 4``, or
        any ``M`` outside 1..4) produce an empty list.

        The ``mm*`` helper names mirror SSE intrinsics; their exact semantics
        are defined elsewhere -- NOTE(review): confirm against their
        definitions before relying on lane-level descriptions below.

        :param mParams: dict providing ``m``, ``nuM``, ``mL``, ``mR``,
            ``nuML``, ``nuMR``, ``M``, ``N``, ``compact``, ``corner`` and
            (when any instruction is produced) ``bounds``.
        :return: list of instructions; every element has had its ``bounds``
            updated with ``mParams['bounds']``.
        """
        src, dst = mParams['m'], mParams['nuM']
        sL, sR = mParams['mL'], mParams['mR']
        dL, dR = mParams['nuML'], mParams['nuMR']
        M, N = mParams['M'], mParams['N']
        isCompact, isCorner = mParams['compact'], mParams['corner']
        instructions = []

        # ---- Single source row: one destination row is written. ----
        if M == 1:
            if N == 1:
                # One element, loaded with mmLoadSs via AddressOf(sa(...)).
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(
                    mmLoadSs(AddressOf(sa(src[sL.of(0), sR.of(0)]))), pc)
                instructions += [Comment("1x1 -> 1x4"), instr]
            elif N == 2:
                # Two elements loaded as one __m64 on top of a zero vector.
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(v0_1, pc)
                instructions += [Comment("1x2 -> 1x4 - Corner")]
                instructions += [instr]
            elif N == 3:
                # First two elements via __m64 load, third via mmLoadSs, then
                # combined with a shuffle.
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                e2 = mmLoadSs(Pointer(src[sL.of(0), sR.of(2)]))
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(mmShufflePs(v0_1, e2, (1, 0, 1, 0)), pc)
                instructions += [Comment("1x3 -> 1x4 - Corner")]
                instructions += [instr]
        # ---- Two source rows. ----
        elif M == 2:
            if N == 1:
                if isCompact:
                    # Both elements fetched with a single __m64 load starting
                    # at (0, 0).
                    v0_1 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(0),
                                                         sR.of(0)])))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(v0_1, pc)
                    instructions += [Comment("2x1 -> 4x1 - Compact")]
                    instructions += [instr]
                else:
                    # Elements fetched individually and interleaved.
                    e0 = mmLoadSs(Pointer(src[sL.of(0), sR.of(0)]))
                    e1 = mmLoadSs(Pointer(src[sL.of(1), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(mmUnpackloPs(e0, e1), pc)
                    instructions += [Comment("2x1 -> 4x1 - incompact")]
                    instructions += [instr]
            elif N == 2:
                # One __m64 load per source row; destination rows 2 and 3 are
                # stored as zero.
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                v2_3 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(1),
                                                     sR.of(0)])))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_1, pcs[0])
                instr1 = mmStoreuPs(v2_3, pcs[1])
                instructions += [Comment("2x2 -> 4x4")]
                instructions += [
                    instr0, instr1,
                    mmStoreuPs(mmSetzeroPs(), pcs[2]),
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
            elif N == 3:
                if isCompact:
                    # Full-width load from (0, 0) plus an __m64 load from
                    # (1, 1); row 1 is rebuilt from both with mmShiftPs.
                    # Variable names suggest flat element indices of a
                    # contiguous layout -- TODO confirm.
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_5 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(1),
                                                         sR.of(1)])))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(mmShiftPs(v4_5, v0_3, 3), pcs[1])
                    instructions += [Comment("2x3 -> 4x4 - Compact")]
                    instructions += [
                        instr0, instr1,
                        mmStoreuPs(mmSetzeroPs(), pcs[2]),
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
                else:
                    # Row 0 via a full-width load blended with zero; row 1 via
                    # an __m64 load plus a single-element load.
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_4 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(1),
                                                         sR.of(0)])))
                    e5 = mmLoadSs(Pointer(src[sL.of(1), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(mmShufflePs(v3_4, e5, (1, 0, 1, 0)),
                                        pcs[1])
                    instructions += [Comment("2x3 -> 4x4 - Incompact")]
                    instructions += [
                        instr0, instr1,
                        mmStoreuPs(mmSetzeroPs(), pcs[2]),
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
            elif N == 4:
                # Two full-width row loads; destination rows 2 and 3 are
                # stored as zero.
                v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_3, pcs[0])
                instr1 = mmStoreuPs(v4_7, pcs[1])
                instructions += [Comment("2x4 -> 4x4")]
                instructions += [
                    instr0, instr1,
                    mmStoreuPs(mmSetzeroPs(), pcs[2]),
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
        # ---- Three source rows. ----
        elif M == 3:
            if N == 1:
                if isCompact:
                    # First two elements with one __m64 load, third loaded
                    # separately from (2, 0).
                    v0_1 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(0),
                                                         sR.of(0)])))
                    e2 = mmLoadSs(Pointer(src[sL.of(2), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(mmShufflePs(v0_1, e2, (1, 0, 1, 0)), pc)
                    instructions += [Comment("3x1 -> 4x1 - Compact")]
                    instructions += [instr]
                else:
                    # Three individual element loads combined by unpack and
                    # shuffle.
                    e0 = mmLoadSs(Pointer(src[sL.of(0), sR.of(0)]))
                    e1 = mmLoadSs(Pointer(src[sL.of(1), sR.of(0)]))
                    e2 = mmLoadSs(Pointer(src[sL.of(2), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(
                        mmShufflePs(mmUnpackloPs(e0, e1), e2, (1, 0, 1, 0)),
                        pc)
                    instructions += [Comment("3x1 -> 4x1 - incompact")]
                    instructions += [instr]
            elif N == 2:
                # One __m64 load per source row; destination row 3 is stored
                # as zero.
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                v2_3 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(1),
                                                     sR.of(0)])))
                v4_5 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(2),
                                                     sR.of(0)])))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_1, pcs[0])
                instr1 = mmStoreuPs(v2_3, pcs[1])
                instr2 = mmStoreuPs(v4_5, pcs[2])
                instructions += [Comment("3x2 -> 4x4")]
                instructions += [
                    instr0, instr1, instr2,
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
            elif N == 3:
                if isCompact:
                    # Loads start at (0, 0), (1, 1) and (2, 2); rows 1 and 2
                    # are rebuilt from neighbouring loads with mmShiftPs /
                    # mmShufflePs.  Destination row 3 is stored as zero.
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(1)]))
                    e8 = mmLoadSs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v4_7, v0_3, 3), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(mmShufflePs(v4_7, e8, (1, 0, 3, 2)),
                                        pcs[2])
                    instructions += [Comment("3x3 -> 4x4 - Compact")]
                    instructions += [
                        instr0, instr1, instr2,
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
                else:
                    # Rows 0 and 1 via full-width loads blended with zero; the
                    # last row via an __m64 load plus a single-element load.
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_5 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                    v6_7 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(2),
                                                         sR.of(0)])))
                    e8 = mmLoadSs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(v3_5, mmSetzeroPs(), (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(mmShufflePs(v6_7, e8, (1, 0, 1, 0)),
                                        pcs[2])
                    instructions += [Comment("3x3 -> 4x4 - Incompact")]
                    instructions += [
                        instr0, instr1, instr2,
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
            elif N == 4:
                # Three full-width row loads; destination row 3 is stored as
                # zero.
                rows = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]))
                    for i in range(3)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [mmStoreuPs(rows[i], pcs[i]) for i in range(3)]
                instructions += [Comment("3x4 -> 4x4")] + instrs + [
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
        # ---- Four source rows (N == 4 is not handled here). ----
        elif M == 4:
            if N == 1:
                # Only the non-compact layout is handled; the compact 4x1 case
                # emits nothing.
                if not isCompact:
                    es = [
                        mmLoadSs(Pointer(src[sL.of(i), sR.of(0)]))
                        for i in range(4)
                    ]
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(
                        mmShufflePs(mmUnpackloPs(es[0], es[1]),
                                    mmUnpackloPs(es[2], es[3]), (1, 0, 1, 0)),
                        pc)
                    instructions += [Comment("4x1 -> 4x1 - incompact"), instr]
            elif N == 2:
                # One __m64 load and one store per row.
                rows = [
                    mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(i),
                                                         sR.of(0)])))
                    for i in range(4)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [mmStoreuPs(rows[i], pcs[i]) for i in range(4)]
                instructions += [Comment("4x2 -> 4x4")] + instrs
            elif N == 3:
                if isCompact:
                    # Three full-width loads starting at (0, 0), (1, 1) and
                    # (2, 2); each destination row is rebuilt from them with
                    # mmShiftPs and a blend against zero.
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(1)]))
                    v8_11 = mmLoaduPs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v4_7, v0_3, 3), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v8_11, v4_7, 2), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[2])
                    instr3 = mmStoreuPs(mmShiftPs(mmSetzeroPs(), v8_11, 1),
                                        pcs[3])
                    instructions += [Comment("4x3 -> 4x4 - Compact")]
                    instructions += [instr0, instr1, instr2, instr3]
                else:
                    # Rows 0..2 via full-width loads blended with zero.
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_5 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                    v6_8 = mmLoaduPs(Pointer(src[sL.of(2), sR.of(0)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(v3_5, mmSetzeroPs(), (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(
                        mmBlendPs(v6_8, mmSetzeroPs(), (1, 0, 0, 0)), pcs[2])
                    if isCorner:
                        # Corner tile: the last row uses an __m64 load plus a
                        # single-element load instead of a full-width load --
                        # presumably to avoid reading past the matrix; verify
                        # against the caller.
                        v9_10 = mmLoadlPi(
                            mmSetzeroPs(),
                            PointerCast("__m64",
                                        Pointer(src[sL.of(3),
                                                    sR.of(0)])))
                        e11 = mmLoadSs(Pointer(src[sL.of(3), sR.of(2)]))
                        instr3 = mmStoreuPs(
                            mmShufflePs(v9_10, e11, (1, 0, 1, 0)), pcs[3])
                        instructions += [
                            Comment("4x3 -> 4x4 - Incompact Corner")
                        ]
                        instructions += [instr0, instr1, instr2, instr3]
                    else:
                        # Non-corner: the last row is loaded full-width like
                        # the others.
                        v9_11 = mmLoaduPs(Pointer(src[sL.of(3), sR.of(0)]))
                        instr3 = mmStoreuPs(
                            mmBlendPs(v9_11, mmSetzeroPs(), (1, 0, 0, 0)),
                            pcs[3])
                        instructions += [Comment("4x3 -> 4x4 - Incompact")]
                        instructions += [instr0, instr1, instr2, instr3]

        # Attach the tile's bounds to everything emitted (comments included).
        for i in instructions:
            i.bounds.update(mParams['bounds'])
        return instructions
示例#17
0
    def Mul(self, s0Params, s1Params, dParams, opts):
        """Build the instruction list for one nu-BLAC product with nu = 2.

        The two source tiles and the destination tile are taken from the
        'nuM' entries of ``s0Params``, ``s1Params`` and ``dParams``; their
        'nuML' / 'nuMR' entries supply the row / column index mappings used
        to address them.  The logical shape is MxK * KxN, with M and K read
        from ``s0Params`` ('nuMM', 'nuMN') and N from ``s1Params`` ('nuMN').
        ``opts`` is accepted for interface compatibility and is not read.

        Returns a list whose first element is a ``Comment`` naming the shape,
        followed by the ``mmStoreuPd`` instruction(s) that write the result.
        """
        nu = 2
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        s0L, s0R = s0Params['nuML'], s0Params['nuMR']
        s1L, s1R = s1Params['nuML'], s1Params['nuMR']
        dL, dR = dParams['nuML'], dParams['nuMR']
        M, K, N = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

        def lhs_row(i):
            # Vector load (mmLoaduPd) from the left tile at (row i, column 0).
            return mmLoaduPd(Pointer(lhs[s0L.of(i), s0R.of(0)]))

        def rhs_row(i):
            # Vector load (mmLoaduPd) from the right tile at (row i, column 0).
            return mmLoaduPd(Pointer(rhs[s1L.of(i), s1R.of(0)]))

        def out_row(i):
            # Pointer to the destination tile at (row i, column 0).
            return Pointer(out[dL.of(i), dR.of(0)])

        def rhs_columns():
            # Load both rows of the right tile and interleave them with
            # unpack-lo / unpack-hi (presumably yielding its two columns).
            b0 = rhs_row(0)
            b1 = rhs_row(1)
            return mmUnpackloPd(b0, b1), mmUnpackhiPd(b0, b1)

        def row_times_columns(row, cols):
            # Horizontal add of the two element-wise products row * column.
            return mmHaddPd(mmMulPd(row, cols[0]), mmMulPd(row, cols[1]))

        header = "".join([
            str(nu), "-BLAC: ", str(M), "x", str(K), " * ", str(K), "x",
            str(N)
        ])
        instructions = [Comment(header)]

        if M == 1:
            if N == 1:
                # 1xK * Kx1: multiply, then horizontal add against zero.
                row = lhs_row(0)
                col = rhs_row(0)
                target = out_row(0)
                instructions.append(
                    mmStoreuPd(mmHaddPd(mmMulPd(row, col), mmSetzeroPd()),
                               target))
            else:
                # 1xK * KxN: a single output row.
                row = lhs_row(0)
                cols = rhs_columns()
                target = out_row(0)
                instructions.append(
                    mmStoreuPd(row_times_columns(row, cols), target))
        elif K == 1:
            # Mx1 * 1xN: each left element is loaded with mmLoaddupPd and
            # multiplied with the single right row.
            scalars = [
                mmLoaddupPd(Pointer(lhs[s0L.of(i), s0R.of(0)]))
                for i in range(nu)
            ]
            b_vec = rhs_row(0)
            targets = [out_row(i) for i in range(nu)]
            instructions.extend([
                mmStoreuPd(mmMulPd(scalar, b_vec), target)
                for scalar, target in zip(scalars, targets)
            ])
        elif N == 1:
            # MxK * Kx1: both products are folded into one stored vector.
            rows = [lhs_row(i) for i in range(nu)]
            b_vec = rhs_row(0)
            target = out_row(0)
            instructions.append(
                mmStoreuPd(
                    mmHaddPd(mmMulPd(rows[0], b_vec),
                             mmMulPd(rows[1], b_vec)), target))
        else:
            # General case: one stored vector per output row.
            rows = [lhs_row(i) for i in range(nu)]
            cols = rhs_columns()
            targets = [out_row(i) for i in range(nu)]
            instructions.extend([
                mmStoreuPd(row_times_columns(row, cols), target)
                for row, target in zip(rows, targets)
            ])

        return instructions
示例#18
0
    def Mul(self, s0Params, s1Params, dParams, opts):
        """Build the instruction list for one nu-BLAC product with nu = 4.

        The two source tiles and the destination tile are taken from the
        'nuM' entries of ``s0Params``, ``s1Params`` and ``dParams``; their
        'nuML' / 'nuMR' entries supply the row / column index mappings used
        to address them.  The logical shape is MxK * KxN, with M and K read
        from ``s0Params`` ('nuMM', 'nuMN') and N from ``s1Params`` ('nuMN').
        ``opts`` is accepted for interface compatibility and is not read.

        Returns a list whose first element is a ``Comment`` naming the shape,
        followed by the ``mmStoreuPs`` instruction(s) that write the result.
        Every returned instruction has its ``bounds`` updated with
        ``dParams['bounds']``.
        """
        nu = 4
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        s0L, s0R = s0Params['nuML'], s0Params['nuMR']
        s1L, s1R = s1Params['nuML'], s1Params['nuMR']
        dL, dR = dParams['nuML'], dParams['nuMR']
        M, K, N = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

        def lhs_row(i):
            # Vector load (mmLoaduPs) from the left tile at (row i, column 0).
            return mmLoaduPs(Pointer(lhs[s0L.of(i), s0R.of(0)]))

        def lhs_elem(i, k):
            # mmLoad1Ps load of the left-tile element at (row i, column k).
            return mmLoad1Ps(Pointer(lhs[s0L.of(i), s0R.of(k)]))

        def rhs_row(i):
            # Vector load (mmLoaduPs) from the right tile at (row i, column 0).
            return mmLoaduPs(Pointer(rhs[s1L.of(i), s1R.of(0)]))

        def out_row(i):
            # Pointer to the destination tile at (row i, column 0).
            return Pointer(out[dL.of(i), dR.of(0)])

        def row_times_matrix(i, b_rows):
            # Output row i as the sum over k of lhs[i, k] * rhs row k, added
            # pairwise: (k=0 + k=1) + (k=2 + k=3).
            a = [lhs_elem(i, k) for k in range(nu)]
            low = mmAddPs(mmMulPs(a[0], b_rows[0]), mmMulPs(a[1], b_rows[1]))
            high = mmAddPs(mmMulPs(a[2], b_rows[2]), mmMulPs(a[3], b_rows[3]))
            target = out_row(i)
            return mmStoreuPs(mmAddPs(low, high), target)

        header = "".join([
            str(nu), "-BLAC: ", str(M), "x", str(K), " * ", str(K), "x",
            str(N)
        ])
        instructions = [Comment(header)]

        if M == 1:
            if N == 1:
                # 1xK * Kx1: one mmDpPs with the mask list given below.
                row = lhs_row(0)
                col = rhs_row(0)
                target = out_row(0)
                instructions.append(
                    mmStoreuPs(mmDpPs(row, col, [1, 1, 1, 1, 0, 0, 0, 1]),
                               target))
            else:
                # 1xK * KxN: a single output row.
                b_rows = [rhs_row(k) for k in range(nu)]
                instructions.append(row_times_matrix(0, b_rows))
        elif K == 1:
            # Mx1 * 1xN: each left element (mmLoad1Ps) times the single
            # right row, one store per output row.
            scalars = [lhs_elem(i, 0) for i in range(nu)]
            b_vec = rhs_row(0)
            targets = [out_row(i) for i in range(nu)]
            instructions.extend([
                mmStoreuPs(mmMulPs(scalar, b_vec), target)
                for scalar, target in zip(scalars, targets)
            ])
        elif N == 1:
            # MxK * Kx1: four element-wise products reduced with two levels
            # of horizontal adds into one stored vector.
            rows = [lhs_row(i) for i in range(nu)]
            b_vec = rhs_row(0)
            products = [mmMulPs(row, b_vec) for row in rows]
            low = mmHaddPs(products[0], products[1])
            high = mmHaddPs(products[2], products[3])
            target = out_row(0)
            instructions.append(mmStoreuPs(mmHaddPs(low, high), target))
        else:
            # General case: one stored vector per output row.
            b_rows = [rhs_row(k) for k in range(nu)]
            for i in range(nu):
                instructions.append(row_times_matrix(i, b_rows))

        for instr in instructions:
            instr.bounds.update(dParams['bounds'])
        return instructions
示例#19
0
    def storeMatrix(self, mParams):
        """Build the instructions that copy an MxN block out of a nu-tile.

        The source is ``mParams['nuM']`` (addressed through 'nuML' / 'nuMR')
        and the destination is ``mParams['m']`` (addressed through 'mL' /
        'mR').  ``mParams['M']`` and ``mParams['N']`` select the shape and
        ``mParams['compact']`` selects between the "Compact" and "Incompact"
        store sequences where both exist.

        Returns a list consisting of a ``Comment`` naming the conversion
        followed by the store instructions.  Shapes that no branch below
        covers (for example M == 4 with N == 4, or M == 4, N == 1 when
        compact) yield an empty list.  Every returned instruction has its
        ``bounds`` updated with ``mParams['bounds']``.

        NOTE(review): the list passed as second argument to ``mmLoaduPs``
        ([1, 2, 3], [2, 3], [3]) looks like the lane positions that carry no
        data for the given shape - confirm against ``mmLoaduPs``.
        """
        src, dst = mParams['nuM'], mParams['m']
        sL, sR = mParams['nuML'], mParams['nuMR']
        dL, dR = mParams['mL'], mParams['mR']
        M, N = mParams['M'], mParams['N']
        isCompact = mParams['compact']
        instructions = []

        if M == 1:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [1, 2, 3])
                pc = AddressOf(sa(dst[dL.of(0), dR.of(0)]))
                instr = mmStoreSs(nuv, pc)
                instructions += [Comment("1x4 -> 1x1"), instr]
            elif N == 2:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [2, 3])
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStorelPi(nuv, PointerCast("__m64", pc))
                instructions += [Comment("1x4 -> 1x2 - Corner"), instr]
            elif N == 3:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [3])
                # Low pair goes out through mmStorelPi; the third element is
                # shuffled into position for a separate mmStoreSs.
                e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                pc2 = Pointer(dst[dL.of(0), dR.of(2)])
                instr0 = mmStorelPi(nuv, PointerCast("__m64", pc0))
                instr1 = mmStoreSs(e2, pc2)
                instructions += [Comment("1x4 -> 1x3 - Corner")]
                instructions += [instr0, instr1]
        elif M == 2:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [2, 3])
                if isCompact:
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStorelPi(nuv, PointerCast("__m64", pc))
                    instructions += [Comment("4x1 -> 2x1 - Compact"), instr]
                else:
                    # Non-compact destination: one mmStoreSs per row.
                    e1 = mmShufflePs(nuv, nuv, (2, 2, 2, 1))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(0)])
                    instr0 = mmStoreSs(nuv, pc0)
                    instr1 = mmStoreSs(e1, pc1)
                    instructions += [Comment("4x1 -> 2x1 - Incompact")]
                    instructions += [instr0, instr1]
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(2)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(2)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(2)
                ]
                instructions += [Comment("4x4 -> 2x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(2)
                ]
                if isCompact:
                    # Row 0 plus the first element of row 1 are merged into
                    # one full store at (0, 0); the rest of row 1 follows
                    # from (1, 1).
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStorelPi(
                        mmShufflePs(nuvs[1], nuvs[1], (3, 3, 2, 1)),
                        PointerCast("__m64", pc1))
                    instructions += [Comment("4x4 -> 2x3 - Compact")]
                    instructions += [instr0, instr1]
                else:
                    # This is the non-compact path; the label used to read
                    # "Compact", which mislabelled the generated code.
                    instructions += [Comment("4x4 -> 2x3 - Incompact")]
                    e2 = mmShufflePs(nuvs[0], nuvs[0], (3, 3, 3, 2))
                    instrRow0 = [
                        mmStorelPi(
                            nuvs[0],
                            PointerCast("__m64",
                                        Pointer(dst[dL.of(0), dR.of(0)]))),
                        mmStoreSs(e2, Pointer(dst[dL.of(0), dR.of(2)]))
                    ]
                    e5 = mmShufflePs(nuvs[1], nuvs[1], (3, 3, 3, 2))
                    instrRow1 = [
                        mmStorelPi(
                            nuvs[1],
                            PointerCast("__m64",
                                        Pointer(dst[dL.of(1), dR.of(0)]))),
                        mmStoreSs(e5, Pointer(dst[dL.of(1), dR.of(2)]))
                    ]
                    instructions += instrRow0 + instrRow1
            elif N == 4:
                v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(2)]
                instr0 = mmStoreuPs(v0_3, pcs[0])
                instr1 = mmStoreuPs(v4_7, pcs[1])
                instructions += [Comment("4x4 -> 2x4")]
                instructions += [instr0, instr1]
        elif M == 3:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [3])
                if isCompact:
                    e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(0)])
                    instr0 = mmStorelPi(nuv, PointerCast("__m64", pc0))
                    instr1 = mmStoreSs(e2, pc2)
                    instructions += [Comment("4x1 -> 3x1 - Compact")]
                    instructions += [instr0, instr1]
                else:
                    # Non-compact destination: one mmStoreSs per row.
                    e1 = mmShufflePs(nuv, nuv, (3, 3, 3, 1))
                    e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(0)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(0)])
                    instr0 = mmStoreSs(nuv, pc0)
                    instr1 = mmStoreSs(e1, pc1)
                    instr2 = mmStoreSs(e2, pc2)
                    instructions += [Comment("4x1 -> 3x1 - Incompact")]
                    instructions += [instr0, instr1, instr2]
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(3)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(3)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(3)
                ]
                instructions += [Comment("4x4 -> 3x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(3)
                ]
                if isCompact:
                    # Stores start at (0, 0), (1, 1) and (2, 2): two full
                    # vectors followed by a single trailing element.
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(2)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStoreuPs(
                        mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                    instr2 = mmStoreSs(
                        mmShufflePs(nuvs[2], nuvs[2], (3, 3, 3, 2)), pc2)
                    instructions += [
                        Comment("4x4 -> 3x3 - Compact"), instr0, instr1, instr2
                    ]
                else:
                    instructions += [Comment("4x4 -> 3x3 - Incompact")]
                    # Per row: low pair via mmStorelPi, third element via
                    # mmStoreSs.
                    for i in range(3):
                        e = mmShufflePs(nuvs[i], nuvs[i], (3, 3, 3, 2))
                        instructions += [
                            mmStorelPi(
                                nuvs[i],
                                PointerCast("__m64",
                                            Pointer(dst[dL.of(i),
                                                        dR.of(0)]))),
                            mmStoreSs(e, Pointer(dst[dL.of(i), dR.of(2)]))
                        ]
            elif N == 4:
                instrs = [
                    mmStoreuPs(mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)])),
                               Pointer(dst[dL.of(i), dR.of(0)]))
                    for i in range(3)
                ]
                # Three rows are stored here; the label used to read
                # "4x4 -> 2x4", copied from the M == 2 branch.
                instructions += [Comment("4x4 -> 3x4")] + instrs
        elif M == 4:
            if N == 1:
                # Only the non-compact layout emits instructions here.
                if not isCompact:
                    nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    es = [
                        mmShufflePs(nuv, nuv, (3, 3, 3, i))
                        for i in range(1, 4)
                    ]
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreSs(nuv, pcs[0])
                    instrs = [
                        mmStoreSs(es[i - 1], pcs[i]) for i in range(1, 4)
                    ]
                    instructions += [
                        Comment("4x1 -> 4x1 - (Store) Incompact"), instr0
                    ] + instrs
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(4)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(4)
                ]
                instructions += [Comment("4x4 -> 4x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(4)
                ]
                if isCompact:
                    # Three full-vector stores starting at (0, 0), (1, 1)
                    # and (2, 2).
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(2)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStoreuPs(
                        mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                    instr2 = mmStoreuPs(
                        mmShiftPs(nuvs[3],
                                  mmShufflePs(nuvs[2], nuvs[2], (2, 3, 3, 3)),
                                  3), pc2)
                    instructions += [
                        Comment("4x4 -> 4x3 - Compact"), instr0, instr1, instr2
                    ]
                else:
                    instructions += [Comment("4x4 -> 4x3 - Incompact")]
                    # Per row: low pair via mmStorelPi, third element via
                    # mmStoreSs.
                    for i in range(4):
                        e = mmShufflePs(nuvs[i], nuvs[i], (3, 3, 3, 2))
                        instructions += [
                            mmStorelPi(
                                nuvs[i],
                                PointerCast("__m64",
                                            Pointer(dst[dL.of(i),
                                                        dR.of(0)]))),
                            mmStoreSs(e, Pointer(dst[dL.of(i), dR.of(2)]))
                        ]

        # Attach the bounds recorded for this matrix to every emitted
        # instruction, including the leading Comment.
        for i in instructions:
            i.bounds.update(mParams['bounds'])
        return instructions