Exemplo n.º 1
0
    def Kro(self, s0Params, s1Params, dParams, opts):
        """Generate the nu-BLAC instructions for a Kronecker product tile.

        Operand descriptions are read from ``s0Params`` (left), ``s1Params``
        (right) and ``dParams`` (destination). Three cases are handled:

        * all four original sizes multiply to 1: the two tiles are loaded and
          multiplied lane by lane;
        * only the left operand's original sizes multiply to 1: lane 0 of the
          left tile is replicated with a shuffle and multiplied with the right
          operand;
        * otherwise: lane 0 of the right tile is replicated and multiplied with
          the left operand (presumably the right operand is the scalar one
          here -- confirm against the caller).

        In the last two cases one vector is processed when the non-replicated
        tile holds exactly ``nu`` elements, and ``nu`` rows otherwise.

        ``opts`` is not used by this method.

        Returns the list of generated instructions (a leading ``Comment``
        followed by the stores); every instruction has its ``bounds`` updated
        with ``dParams['bounds']``.
        """
        nu = 4
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        origM, origK = s0Params['M'], s0Params['N']
        origN, origP = s1Params['M'], s1Params['N']
        M, K = s0Params['nuMM'], s0Params['nuMN']
        N, P = s1Params['nuMM'], s1Params['nuMN']

        title = "".join([
            str(nu), "-BLAC: ",
            str(M), "x", str(K), " Kro ",
            str(N), "x", str(P)
        ])
        instructions = [Comment(title)]

        def loadRow(mat, rowIdx, colIdx, r):
            # Unaligned load of the vector starting at (row r, column 0).
            return mmLoaduPs(Pointer(mat[rowIdx.of(r), colIdx.of(0)]))

        if origM * origK * origN * origP == 1:
            # Scalar times scalar: a single packed multiply is enough.
            target = Pointer(out[outRow.of(0), outCol.of(0)])
            left = loadRow(lhs, lhsRow, lhsCol, 0)
            right = loadRow(rhs, rhsRow, rhsCol, 0)
            instructions.append(mmStoreuPs(mmMulPs(left, right), target))
        elif origM * origK == 1:
            # Left operand is the scalar: replicate its lane 0 once and reuse
            # it for every row of the right operand.
            rowCount = 1 if N * P == nu else nu
            scalar = loadRow(lhs, lhsRow, lhsCol, 0)
            splat = mmShufflePs(scalar, scalar, (0, 0, 0, 0))
            for r in range(rowCount):
                right = loadRow(rhs, rhsRow, rhsCol, r)
                target = Pointer(out[outRow.of(r), outCol.of(0)])
                instructions.append(mmStoreuPs(mmMulPs(splat, right), target))
        else:
            # Mirror image of the previous case: lane 0 of the right operand
            # is replicated and applied to the rows of the left operand.
            rowCount = 1 if M * K == nu else nu
            scalar = loadRow(rhs, rhsRow, rhsCol, 0)
            splat = mmShufflePs(scalar, scalar, (0, 0, 0, 0))
            for r in range(rowCount):
                left = loadRow(lhs, lhsRow, lhsCol, r)
                target = Pointer(out[outRow.of(r), outCol.of(0)])
                instructions.append(mmStoreuPs(mmMulPs(left, splat), target))

        for instr in instructions:
            instr.bounds.update(dParams['bounds'])
        return instructions
Exemplo n.º 2
0
    def Add(self, s0Params, s1Params, dParams, opts):
        """Generate the nu-BLAC instructions for an element-wise addition.

        The two source tiles (``s0Params``/``s1Params``) are loaded row by
        row, added with ``mmAddPs`` and stored to the destination tile
        described by ``dParams``. One row is processed when the destination
        tile holds exactly ``nu`` elements, ``nu`` rows when it is nu x nu, and
        no arithmetic is emitted for any other shape.

        ``opts`` is not used by this method.

        Returns the list of generated instructions (a leading ``Comment``
        followed by the stores); every instruction has its ``bounds`` updated
        with ``dParams['bounds']``.
        """
        nu = 4
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']

        shape = str(M) + "x" + str(N)
        instructions = [Comment(str(nu) + "-BLAC: " + shape + " + " + shape)]

        # Decide how many rows of the tile have to be processed.
        if M * N == nu:
            rowCount = 1
        elif M == nu and N == nu:
            rowCount = M
        else:
            rowCount = 0

        for r in range(rowCount):
            left = mmLoaduPs(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))
            right = mmLoaduPs(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))
            target = Pointer(out[outRow.of(r), outCol.of(0)])
            instructions.append(mmStoreuPs(mmAddPs(left, right), target))

        for instr in instructions:
            instr.bounds.update(dParams['bounds'])
        return instructions
Exemplo n.º 3
0
 def mmLoad1Ps(self, src, repList, bounds):
     """Build a replacement for the broadcast load ``src``.

     ``repList`` holds ``(line, dst)`` tuples; the entry with the highest
     first component is used (the first one wins on ties). If that store
     has ``reglen == 4`` and ``mrmap == [0, 1, 2, 3]``, the replacement
     loads the stored vector and replicates one of its lanes with a
     shuffle. The lane is taken from the position of ``src`` inside its
     matrix, modulo 4.

     Raises ``Exception('Cannot load-replace!')`` for any other store.
     """
     ranked = sorted(repList, key=lambda entry: entry[0], reverse=True)
     store = ranked[0][1]
     if not (store.reglen == 4 and store.mrmap == [0, 1, 2, 3]):
         raise Exception('Cannot load-replace!')

     position = src.pointer.getAt()
     # Temp solution (carried over from the original code): index the
     # position by column when the matrix has more than one column under
     # the given bounds, by row otherwise.
     if src.pointer.getMat().size[1].subs(bounds) > 1:
         axis = 1
     else:
         axis = 0
     lane = position[axis] % 4
     return mmShufflePs(mmLoaduPs(store.pointer), mmLoaduPs(store.pointer),
                        (lane, ) * 4)
Exemplo n.º 4
0
 def ScaLoad(self, src, repList):
     """Build a replacement for the scalar load ``src``.

     ``repList`` is a list of tuples ``(line, dst)``; only the ``dst`` of
     the first tuple is used. The stored vector is reloaded through
     ``dst.pointer`` and converted to a scalar with ``mmCvtssf32`` when
     ``self.opts['precision']`` is ``'float'`` or ``mmCvtsdf64`` otherwise.
     When the second component of ``src.pointer.at`` is non-zero, the
     vector is first shuffled with that component as the last selector.
     """
     singlePrecision = self.opts['precision'] == 'float'
     lane = src.pointer.at[1]
     store = repList[0][1]

     if lane == 0:
         # No shuffle needed: convert the reloaded vector directly.
         if singlePrecision:
             return mmCvtssf32(mmLoaduPs(store.pointer))
         return mmCvtsdf64(mmLoaduPd(store.pointer))

     if singlePrecision:
         shuffled = mmShufflePs(mmLoaduPs(store.pointer),
                                mmLoaduPs(store.pointer), (0, 0, 0, lane))
         return mmCvtssf32(shuffled)

     shuffled = mmShufflePd(mmLoaduPd(store.pointer),
                            mmLoaduPd(store.pointer), (0, lane))
     return mmCvtsdf64(shuffled)
Exemplo n.º 5
0
    def T(self, sParams, dParams, opts):
        """Generate the nu-BLAC instructions for a transposition.

        When the destination tile holds exactly ``nu`` elements the source
        vector is copied unchanged. Otherwise four rows are loaded, transposed
        with the unpack/move sequence equivalent to ``_MM_TRANSPOSE4_PS`` and
        stored to the four destination rows.

        ``opts`` is not used by this method.

        Returns the list of generated instructions (a leading ``Comment``
        followed by the stores); every instruction has its ``bounds`` updated
        with ``dParams['bounds']``.
        """
        nu = 4
        inp, out = sParams['nuM'], dParams['nuM']
        inRow, inCol = sParams['nuML'], sParams['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        M, N = dParams['nuMM'], dParams['nuMN']

        title = "".join([str(nu), "-BLAC: (", str(N), "x", str(M), ")^T"])
        instructions = [Comment(title)]

        if M * N == nu:
            vec = mmLoaduPs(Pointer(inp[inRow.of(0), inCol.of(0)]))
            target = Pointer(out[outRow.of(0), outCol.of(0)])
            instructions.append(mmStoreuPs(vec, target))
        else:
            rows = [
                mmLoaduPs(Pointer(inp[inRow.of(r), inCol.of(0)]))
                for r in range(nu)
            ]
            # Equivalent of _MM_TRANSPOSE4_PS: interleave row pairs, then
            # recombine the low/high halves into columns.
            lo01 = mmUnpackloPs(rows[0], rows[1])
            lo23 = mmUnpackloPs(rows[2], rows[3])
            hi01 = mmUnpackhiPs(rows[0], rows[1])
            hi23 = mmUnpackhiPs(rows[2], rows[3])
            columns = [
                mmMovelhPs(lo01, lo23),
                mmMovehlPs(lo23, lo01),
                mmMovelhPs(hi01, hi23),
                mmMovehlPs(hi23, hi01),
            ]
            targets = [
                Pointer(out[outRow.of(r), outCol.of(0)]) for r in range(nu)
            ]
            instructions.extend(
                [mmStoreuPs(col, tgt) for col, tgt in zip(columns, targets)])

        for instr in instructions:
            instr.bounds.update(dParams['bounds'])
        return instructions
Exemplo n.º 6
0
 def ScaLoad(self, src, repList, bounds):
     '''
     Build a replacement for the scalar load src.

     src is the ScaLoad object we want to replace.
     repList is a list of tuples (line, dst). The dst elements of the tuples
     are store commands; the one with the highest line is used (the first
     one wins on ties). bounds is not used by this method.

     The stored vector is reloaded through dst.pointer and converted with
     mmCvtssf32 when self.opts['precision'] is 'float', mmCvtsdf64 otherwise.
     When the second component of src.pointer.at is non-zero the vector is
     first shuffled with that component as the last selector.
     '''
     ranked = sorted(repList, key=lambda entry: entry[0], reverse=True)
     singlePrecision = self.opts['precision'] == 'float'
     lane = src.pointer.at[1]
     store = ranked[0][1]

     if lane == 0:
         # No shuffle needed: convert the reloaded vector directly.
         if singlePrecision:
             return mmCvtssf32(mmLoaduPs(store.pointer))
         return mmCvtsdf64(mmLoaduPd(store.pointer))

     if singlePrecision:
         shuffled = mmShufflePs(mmLoaduPs(store.pointer),
                                mmLoaduPs(store.pointer), (0, 0, 0, lane))
         return mmCvtssf32(shuffled)

     shuffled = mmShufflePd(mmLoaduPd(store.pointer),
                            mmLoaduPd(store.pointer), (0, lane))
     return mmCvtsdf64(shuffled)
Exemplo n.º 7
0
    def mmLoaduPs(self, src, repList, bounds):
        """Build a replacement for the unaligned vector load ``src``.

        ``repList`` holds ``(line, dst)`` tuples. They are ranked by
        decreasing ``line``; the first two, three and four stores are then
        matched, in that order, against known store patterns (each candidate
        group is ordered from the highest to the lowest ``pointer``):

        * two stores with ``mrmap`` ``[0]`` and ``[0, 1]``;
        * three stores with ``mrmap`` ``[0, 1, 2, 3]``, ``[0]`` and ``[0, 1]``;
        * four stores with ``mrmap`` ``[0]``.

        The first matching pattern yields a shuffle that reassembles the
        vector from the individual stores. If none matches but the top-ranked
        store has ``mrmap == [0, 1, 2, 3]``, ``src`` is returned unchanged.
        ``bounds`` is not used by this method.

        Raises ``Exception('Cannot load-replace!')`` when nothing applies.
        """
        ranked = sorted(repList, key=lambda entry: entry[0], reverse=True)

        def byAddress(count):
            # Stores of the `count` top-ranked entries, higher pointer first.
            return sorted([entry[1] for entry in ranked[:count]],
                          key=lambda store: store.pointer,
                          reverse=True)

        if len(ranked) >= 2:
            upper, lower = byAddress(2)
            if (lower.reglen == 4 and upper.reglen == 4
                    and lower.mrmap == [0, 1] and upper.mrmap == [0]):
                # NOTE(review): unlike the three-store pattern below, the
                # pointer is passed to mmLoadlPi without a PointerCast to
                # "__m64" -- confirm whether that is intended.
                return mmShufflePs(mmLoadlPi(mmSetzeroPs(), lower.pointer),
                                   mmLoadSs(upper.pointer), (1, 0, 1, 0))

        if len(ranked) >= 3:
            stores = byAddress(3)
            expected = [[0, 1, 2, 3], [0], [0, 1]]
            if all(s.reglen == 4 and s.mrmap == m
                   for s, m in zip(stores, expected)):
                top, middle, bottom = stores
                lowHalf = mmLoadlPi(mmSetzeroPs(),
                                    PointerCast("__m64", bottom.pointer))
                highHalf = mmUnpackloPs(mmLoadSs(middle.pointer),
                                        mmLoaduPs(top.pointer))
                return mmShufflePs(lowHalf, highHalf, (1, 0, 1, 0))

        if len(ranked) >= 4:
            stores = byAddress(4)
            if all(s.reglen == 4 and s.mrmap == [0] for s in stores):
                lowPair = mmShufflePs(mmLoadSs(stores[3].pointer),
                                      mmLoadSs(stores[2].pointer),
                                      (1, 0, 1, 0))
                highPair = mmShufflePs(mmLoadSs(stores[1].pointer),
                                       mmLoadSs(stores[0].pointer),
                                       (1, 0, 1, 0))
                return mmShufflePs(lowPair, highPair, (1, 0, 1, 0))

        if ranked and ranked[0][1].mrmap == [0, 1, 2, 3]:
            return src

        raise Exception('Cannot load-replace!')
Exemplo n.º 8
0
    def storeMatrix(self, mParams):
        """Generate the instructions that copy a nu-tile back to a matrix.

        ``mParams`` is a dict providing:

        * ``'nuM'``, ``'nuML'``, ``'nuMR'``: the source tile and its row and
          column index objects;
        * ``'m'``, ``'mL'``, ``'mR'``: the destination matrix and its row and
          column index objects;
        * ``'M'``, ``'N'``: the number of rows and columns to store;
        * ``'compact'``: selects the store sequence used when several rows or
          elements can be written with a single store (presumably because
          they are contiguous in memory -- confirm against the caller);
        * ``'bounds'``: merged into the ``bounds`` of every instruction.

        The method dispatches on ``(M, N)``. Shapes without a branch (for
        example M == 4 with N == 4, or a compact 4x1) produce no instructions.
        The second argument given to some ``mmLoaduPs`` calls is a list of
        lane indices; presumably the lanes that are not needed by the store
        -- confirm against the ``mmLoaduPs`` definition.

        Returns the list of generated instructions, each starting with a
        ``Comment`` naming the shape conversion.
        """
        src, dst = mParams['nuM'], mParams['m']
        sL, sR = mParams['nuML'], mParams['nuMR']
        dL, dR = mParams['mL'], mParams['mR']
        M, N = mParams['M'], mParams['N']
        isCompact = mParams['compact']
        instructions = []

        if M == 1:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [1, 2, 3])
                pc = AddressOf(sa(dst[dL.of(0), dR.of(0)]))
                instr = mmStoreSs(nuv, pc)
                instructions += [Comment("1x4 -> 1x1"), instr]
            elif N == 2:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [2, 3])
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStorelPi(nuv, PointerCast("__m64", pc))
                instructions += [Comment("1x4 -> 1x2 - Corner"), instr]
            elif N == 3:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [3])
                # Bring lane 2 down to lane 0 so it can be stored as a scalar.
                e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                pc2 = Pointer(dst[dL.of(0), dR.of(2)])
                instr0 = mmStorelPi(nuv, PointerCast("__m64", pc0))
                instr1 = mmStoreSs(e2, pc2)
                instructions += [Comment("1x4 -> 1x3 - Corner")]
                instructions += [instr0, instr1]
        elif M == 2:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [2, 3])
                if isCompact:
                    # Both elements are written with one 64-bit store.
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStorelPi(nuv, PointerCast("__m64", pc))
                    instructions += [Comment("4x1 -> 2x1 - Compact"), instr]
                else:
                    # One scalar store per destination row.
                    e1 = mmShufflePs(nuv, nuv, (2, 2, 2, 1))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(0)])
                    instr0 = mmStoreSs(nuv, pc0)
                    instr1 = mmStoreSs(e1, pc1)
                    instructions += [Comment("4x1 -> 2x1 - Incompact")]
                    instructions += [instr0, instr1]
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(2)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(2)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(2)
                ]
                instructions += [Comment("4x4 -> 2x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(2)
                ]
                if isCompact:
                    # Row 0 plus the first element of row 1 go out in one
                    # full store; the rest of row 1 follows as a 64-bit store.
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStorelPi(
                        mmShufflePs(nuvs[1], nuvs[1], (3, 3, 2, 1)),
                        PointerCast("__m64", pc1))
                    instructions += [Comment("4x4 -> 2x3 - Compact")]
                    instructions += [instr0, instr1]
                else:
                    # Label fixed: this is the non-compact branch (the
                    # original emitted "Compact" here as well).
                    instructions += [Comment("4x4 -> 2x3 - Incompact")]
                    e2 = mmShufflePs(nuvs[0], nuvs[0], (3, 3, 3, 2))
                    instrRow0 = [
                        mmStorelPi(
                            nuvs[0],
                            PointerCast("__m64",
                                        Pointer(dst[dL.of(0),
                                                    dR.of(0)]))),
                        mmStoreSs(e2, Pointer(dst[dL.of(0), dR.of(2)]))
                    ]
                    e5 = mmShufflePs(nuvs[1], nuvs[1], (3, 3, 3, 2))
                    instrRow1 = [
                        mmStorelPi(
                            nuvs[1],
                            PointerCast("__m64",
                                        Pointer(dst[dL.of(1),
                                                    dR.of(0)]))),
                        mmStoreSs(e5, Pointer(dst[dL.of(1), dR.of(2)]))
                    ]
                    instructions += instrRow0 + instrRow1
            elif N == 4:
                v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(2)]
                instr0 = mmStoreuPs(v0_3, pcs[0])
                instr1 = mmStoreuPs(v4_7, pcs[1])
                instructions += [Comment("4x4 -> 2x4")]
                instructions += [instr0, instr1]
        elif M == 3:
            if N == 1:
                nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]), [3])
                if isCompact:
                    # First two elements in one 64-bit store, third as scalar.
                    e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(0)])
                    instr0 = mmStorelPi(nuv, PointerCast("__m64", pc0))
                    instr1 = mmStoreSs(e2, pc2)
                    instructions += [Comment("4x1 -> 3x1 - Compact")]
                    instructions += [instr0, instr1]
                else:
                    # One scalar store per destination row.
                    e1 = mmShufflePs(nuv, nuv, (3, 3, 3, 1))
                    e2 = mmShufflePs(nuv, nuv, (3, 3, 3, 2))
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(0)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(0)])
                    instr0 = mmStoreSs(nuv, pc0)
                    instr1 = mmStoreSs(e1, pc1)
                    instr2 = mmStoreSs(e2, pc2)
                    instructions += [Comment("4x1 -> 3x1 - Incompact")]
                    instructions += [instr0, instr1, instr2]
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(3)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(3)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(3)
                ]
                instructions += [Comment("4x4 -> 3x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(3)
                ]
                if isCompact:
                    # The nine elements are packed into two full stores and
                    # one scalar store.
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(2)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStoreuPs(
                        mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                    instr2 = mmStoreSs(
                        mmShufflePs(nuvs[2], nuvs[2], (3, 3, 3, 2)), pc2)
                    instructions += [
                        Comment("4x4 -> 3x3 - Compact"), instr0, instr1, instr2
                    ]
                else:
                    instructions += [Comment("4x4 -> 3x3 - Incompact")]
                    for i in range(3):
                        # Per row: 64-bit store for columns 0-1, scalar store
                        # for column 2.
                        e = mmShufflePs(nuvs[i], nuvs[i], (3, 3, 3, 2))
                        instructions += [
                            mmStorelPi(
                                nuvs[i],
                                PointerCast("__m64",
                                            Pointer(dst[dL.of(i),
                                                        dR.of(0)]))),
                            mmStoreSs(e, Pointer(dst[dL.of(i),
                                                     dR.of(2)]))
                        ]
            elif N == 4:
                instrs = [
                    mmStoreuPs(mmLoaduPs(Pointer(src[sL.of(i),
                                                     sR.of(0)])),
                               Pointer(dst[dL.of(i), dR.of(0)]))
                    for i in range(3)
                ]
                # Label fixed: three rows are stored here (the original
                # emitted "4x4 -> 2x4").
                instructions += [Comment("4x4 -> 3x4")] + instrs
        elif M == 4:
            if N == 1:
                # Nothing is emitted for the compact 4x1 case.
                if not isCompact:
                    nuv = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    es = [
                        mmShufflePs(nuv, nuv, (3, 3, 3, i))
                        for i in range(1, 4)
                    ]
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreSs(nuv, pcs[0])
                    instrs = [
                        mmStoreSs(es[i - 1], pcs[i]) for i in range(1, 4)
                    ]
                    instructions += [
                        Comment("4x1 -> 4x1 - (Store) Incompact"), instr0
                    ] + instrs
            elif N == 2:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [2, 3])
                    for i in range(4)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [
                    mmStorelPi(nuvs[i], PointerCast("__m64", pcs[i]))
                    for i in range(4)
                ]
                instructions += [Comment("4x4 -> 4x2")] + instrs
            elif N == 3:
                nuvs = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), [3])
                    for i in range(4)
                ]
                if isCompact:
                    # The twelve elements are packed into three full stores.
                    pc0 = Pointer(dst[dL.of(0), dR.of(0)])
                    pc1 = Pointer(dst[dL.of(1), dR.of(1)])
                    pc2 = Pointer(dst[dL.of(2), dR.of(2)])
                    instr0 = mmStoreuPs(
                        mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]),
                        pc0)
                    instr1 = mmStoreuPs(
                        mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                    instr2 = mmStoreuPs(
                        mmShiftPs(nuvs[3],
                                  mmShufflePs(nuvs[2], nuvs[2], (2, 3, 3, 3)),
                                  3), pc2)
                    instructions += [
                        Comment("4x4 -> 4x3 - Compact"), instr0, instr1, instr2
                    ]
                else:
                    instructions += [Comment("4x4 -> 4x3 - Incompact")]
                    for i in range(4):
                        # Per row: 64-bit store for columns 0-1, scalar store
                        # for column 2.
                        e = mmShufflePs(nuvs[i], nuvs[i], (3, 3, 3, 2))
                        instructions += [
                            mmStorelPi(
                                nuvs[i],
                                PointerCast("__m64",
                                            Pointer(dst[dL.of(i),
                                                        dR.of(0)]))),
                            mmStoreSs(e, Pointer(dst[dL.of(i),
                                                     dR.of(2)]))
                        ]

        for i in instructions:
            i.bounds.update(mParams['bounds'])
        return instructions
Exemplo n.º 9
0
    def Mul(self, s0Params, s1Params, dParams, opts):
        """Generate the nu-BLAC instructions for a matrix product tile.

        With ``M x K`` the left tile and ``K x N`` the right tile, five shapes
        are distinguished:

        * M == 1 and N == 1: dot product computed with ``mmDpPs``;
        * M == 1: one row of the left tile times the right tile;
        * K == 1: each left element is broadcast and multiplied with the
          single right row, giving ``nu`` result rows;
        * N == 1: every left row is multiplied with the right vector and the
          products are reduced with horizontal adds into one result vector;
        * otherwise: the row-times-matrix sequence is repeated for ``nu`` rows.

        ``opts`` is not used by this method.

        Returns the list of generated instructions (a leading ``Comment``
        followed by the stores); every instruction has its ``bounds`` updated
        with ``dParams['bounds']``.
        """
        nu = 4
        lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
        lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
        rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
        outRow, outCol = dParams['nuML'], dParams['nuMR']
        M, K, N = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

        title = "".join([
            str(nu), "-BLAC: ",
            str(M), "x", str(K), " * ",
            str(K), "x", str(N)
        ])
        instructions = [Comment(title)]

        def lhsVector(r):
            return mmLoaduPs(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))

        def rhsVector(r):
            return mmLoaduPs(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))

        def outPointer(r):
            return Pointer(out[outRow.of(r), outCol.of(0)])

        def rowTimesMatrix(r, rhsRows):
            # Row r of the result: sum over k of broadcast(lhs[r, k]) * rhs
            # row k, accumulated pairwise.
            splats = [
                mmLoad1Ps(Pointer(lhs[lhsRow.of(r), lhsCol.of(k)]))
                for k in range(nu)
            ]
            firstPair = mmAddPs(mmMulPs(splats[0], rhsRows[0]),
                                mmMulPs(splats[1], rhsRows[1]))
            secondPair = mmAddPs(mmMulPs(splats[2], rhsRows[2]),
                                 mmMulPs(splats[3], rhsRows[3]))
            target = outPointer(r)
            return mmStoreuPs(mmAddPs(firstPair, secondPair), target)

        if M == 1 and N == 1:
            left = lhsVector(0)
            right = rhsVector(0)
            target = outPointer(0)
            instructions.append(
                mmStoreuPs(mmDpPs(left, right, [1, 1, 1, 1, 0, 0, 0, 1]),
                           target))
        elif M == 1:
            rhsRows = [rhsVector(k) for k in range(nu)]
            instructions.append(rowTimesMatrix(0, rhsRows))
        elif K == 1:
            splats = [
                mmLoad1Ps(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))
                for r in range(nu)
            ]
            right = rhsVector(0)
            targets = [outPointer(r) for r in range(nu)]
            instructions.extend([
                mmStoreuPs(mmMulPs(splat, right), target)
                for splat, target in zip(splats, targets)
            ])
        elif N == 1:
            leftRows = [lhsVector(r) for r in range(nu)]
            right = rhsVector(0)
            products = [mmMulPs(row, right) for row in leftRows]
            # Two levels of horizontal adds reduce the four product vectors
            # to one vector of row sums.
            sums01 = mmHaddPs(products[0], products[1])
            sums23 = mmHaddPs(products[2], products[3])
            target = outPointer(0)
            instructions.append(mmStoreuPs(mmHaddPs(sums01, sums23), target))
        else:
            rhsRows = [rhsVector(k) for k in range(nu)]
            for r in range(nu):
                instructions.append(rowTimesMatrix(r, rhsRows))

        for instr in instructions:
            instr.bounds.update(dParams['bounds'])
        return instructions
Exemplo n.º 10
0
    def loadMatrix(self, mParams):
        src, dst = mParams['m'], mParams['nuM']
        sL, sR = mParams['mL'], mParams['mR']
        dL, dR = mParams['nuML'], mParams['nuMR']
        M, N = mParams['M'], mParams['N']
        isCompact, isCorner = mParams['compact'], mParams['corner']
        instructions = []

        if M == 1:
            if N == 1:
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(
                    mmLoadSs(AddressOf(sa(src[sL.of(0), sR.of(0)]))), pc)
                instructions += [Comment("1x1 -> 1x4"), instr]
            elif N == 2:
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(v0_1, pc)
                instructions += [Comment("1x2 -> 1x4 - Corner")]
                instructions += [instr]
            elif N == 3:
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                e2 = mmLoadSs(Pointer(src[sL.of(0), sR.of(2)]))
                pc = Pointer(dst[dL.of(0), dR.of(0)])
                instr = mmStoreuPs(mmShufflePs(v0_1, e2, (1, 0, 1, 0)), pc)
                instructions += [Comment("1x3 -> 1x4 - Corner")]
                instructions += [instr]
        elif M == 2:
            if N == 1:
                if isCompact:
                    v0_1 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(0),
                                                         sR.of(0)])))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(v0_1, pc)
                    instructions += [Comment("2x1 -> 4x1 - Compact")]
                    instructions += [instr]
                else:
                    e0 = mmLoadSs(Pointer(src[sL.of(0), sR.of(0)]))
                    e1 = mmLoadSs(Pointer(src[sL.of(1), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(mmUnpackloPs(e0, e1), pc)
                    instructions += [Comment("2x1 -> 4x1 - incompact")]
                    instructions += [instr]
            elif N == 2:
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                v2_3 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(1),
                                                     sR.of(0)])))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_1, pcs[0])
                instr1 = mmStoreuPs(v2_3, pcs[1])
                instructions += [Comment("2x2 -> 4x4")]
                instructions += [
                    instr0, instr1,
                    mmStoreuPs(mmSetzeroPs(), pcs[2]),
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
            elif N == 3:
                if isCompact:
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_5 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(1),
                                                         sR.of(1)])))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(mmShiftPs(v4_5, v0_3, 3), pcs[1])
                    instructions += [Comment("2x3 -> 4x4 - Compact")]
                    instructions += [
                        instr0, instr1,
                        mmStoreuPs(mmSetzeroPs(), pcs[2]),
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
                else:
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_4 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(1),
                                                         sR.of(0)])))
                    e5 = mmLoadSs(Pointer(src[sL.of(1), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(mmShufflePs(v3_4, e5, (1, 0, 1, 0)),
                                        pcs[1])
                    instructions += [Comment("2x3 -> 4x4 - Incompact")]
                    instructions += [
                        instr0, instr1,
                        mmStoreuPs(mmSetzeroPs(), pcs[2]),
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
            elif N == 4:
                v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_3, pcs[0])
                instr1 = mmStoreuPs(v4_7, pcs[1])
                instructions += [Comment("2x4 -> 4x4")]
                instructions += [
                    instr0, instr1,
                    mmStoreuPs(mmSetzeroPs(), pcs[2]),
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
        elif M == 3:
            if N == 1:
                if isCompact:
                    v0_1 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(0),
                                                         sR.of(0)])))
                    e2 = mmLoadSs(Pointer(src[sL.of(2), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(mmShufflePs(v0_1, e2, (1, 0, 1, 0)), pc)
                    instructions += [Comment("3x1 -> 4x1 - Compact")]
                    instructions += [instr]
                else:
                    e0 = mmLoadSs(Pointer(src[sL.of(0), sR.of(0)]))
                    e1 = mmLoadSs(Pointer(src[sL.of(1), sR.of(0)]))
                    e2 = mmLoadSs(Pointer(src[sL.of(2), sR.of(0)]))
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(
                        mmShufflePs(mmUnpackloPs(e0, e1), e2, (1, 0, 1, 0)),
                        pc)
                    instructions += [Comment("3x1 -> 4x1 - incompact")]
                    instructions += [instr]
            elif N == 2:
                v0_1 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(0),
                                                     sR.of(0)])))
                v2_3 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(1),
                                                     sR.of(0)])))
                v4_5 = mmLoadlPi(
                    mmSetzeroPs(),
                    PointerCast("__m64", Pointer(src[sL.of(2),
                                                     sR.of(0)])))
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instr0 = mmStoreuPs(v0_1, pcs[0])
                instr1 = mmStoreuPs(v2_3, pcs[1])
                instr2 = mmStoreuPs(v4_5, pcs[2])
                instructions += [Comment("3x2 -> 4x4")]
                instructions += [
                    instr0, instr1, instr2,
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
            elif N == 3:
                if isCompact:
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(1)]))
                    e8 = mmLoadSs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v4_7, v0_3, 3), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(mmShufflePs(v4_7, e8, (1, 0, 3, 2)),
                                        pcs[2])
                    instructions += [Comment("3x3 -> 4x4 - Compact")]
                    instructions += [
                        instr0, instr1, instr2,
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
                else:
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_5 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                    v6_7 = mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(2),
                                                         sR.of(0)])))
                    e8 = mmLoadSs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(v3_5, mmSetzeroPs(), (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(mmShufflePs(v6_7, e8, (1, 0, 1, 0)),
                                        pcs[2])
                    instructions += [Comment("3x3 -> 4x4 - Incompact")]
                    instructions += [
                        instr0, instr1, instr2,
                        mmStoreuPs(mmSetzeroPs(), pcs[3])
                    ]
            elif N == 4:
                rows = [
                    mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]))
                    for i in range(3)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [mmStoreuPs(rows[i], pcs[i]) for i in range(3)]
                instructions += [Comment("3x4 -> 4x4")] + instrs + [
                    mmStoreuPs(mmSetzeroPs(), pcs[3])
                ]
        elif M == 4:
            if N == 1:
                if not isCompact:
                    es = [
                        mmLoadSs(Pointer(src[sL.of(i), sR.of(0)]))
                        for i in range(4)
                    ]
                    pc = Pointer(dst[dL.of(0), dR.of(0)])
                    instr = mmStoreuPs(
                        mmShufflePs(mmUnpackloPs(es[0], es[1]),
                                    mmUnpackloPs(es[2], es[3]), (1, 0, 1, 0)),
                        pc)
                    instructions += [Comment("4x1 -> 4x1 - incompact"), instr]
            elif N == 2:
                rows = [
                    mmLoadlPi(
                        mmSetzeroPs(),
                        PointerCast("__m64", Pointer(src[sL.of(i),
                                                         sR.of(0)])))
                    for i in range(4)
                ]
                pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                instrs = [mmStoreuPs(rows[i], pcs[i]) for i in range(4)]
                instructions += [Comment("4x2 -> 4x4")] + instrs
            elif N == 3:
                if isCompact:
                    v0_3 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v4_7 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(1)]))
                    v8_11 = mmLoaduPs(Pointer(src[sL.of(2), sR.of(2)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_3, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v4_7, v0_3, 3), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(
                        mmBlendPs(mmShiftPs(v8_11, v4_7, 2), mmSetzeroPs(),
                                  (1, 0, 0, 0)), pcs[2])
                    instr3 = mmStoreuPs(mmShiftPs(mmSetzeroPs(), v8_11, 1),
                                        pcs[3])
                    instructions += [Comment("4x3 -> 4x4 - Compact")]
                    instructions += [instr0, instr1, instr2, instr3]
                else:
                    v0_2 = mmLoaduPs(Pointer(src[sL.of(0), sR.of(0)]))
                    v3_5 = mmLoaduPs(Pointer(src[sL.of(1), sR.of(0)]))
                    v6_8 = mmLoaduPs(Pointer(src[sL.of(2), sR.of(0)]))
                    pcs = [Pointer(dst[dL.of(i), dR.of(0)]) for i in range(4)]
                    instr0 = mmStoreuPs(
                        mmBlendPs(v0_2, mmSetzeroPs(), (1, 0, 0, 0)), pcs[0])
                    instr1 = mmStoreuPs(
                        mmBlendPs(v3_5, mmSetzeroPs(), (1, 0, 0, 0)), pcs[1])
                    instr2 = mmStoreuPs(
                        mmBlendPs(v6_8, mmSetzeroPs(), (1, 0, 0, 0)), pcs[2])
                    if isCorner:
                        v9_10 = mmLoadlPi(
                            mmSetzeroPs(),
                            PointerCast("__m64",
                                        Pointer(src[sL.of(3),
                                                    sR.of(0)])))
                        e11 = mmLoadSs(Pointer(src[sL.of(3), sR.of(2)]))
                        instr3 = mmStoreuPs(
                            mmShufflePs(v9_10, e11, (1, 0, 1, 0)), pcs[3])
                        instructions += [
                            Comment("4x3 -> 4x4 - Incompact Corner")
                        ]
                        instructions += [instr0, instr1, instr2, instr3]
                    else:
                        v9_11 = mmLoaduPs(Pointer(src[sL.of(3), sR.of(0)]))
                        instr3 = mmStoreuPs(
                            mmBlendPs(v9_11, mmSetzeroPs(), (1, 0, 0, 0)),
                            pcs[3])
                        instructions += [Comment("4x3 -> 4x4 - Incompact")]
                        instructions += [instr0, instr1, instr2, instr3]

        for i in instructions:
            i.bounds.update(mParams['bounds'])
        return instructions