def loadMatrix(self, mParams):
    """Emit scalar moves that fill a nu-matrix for structurally known matrices.

    Two structure tests are applied independently (both may fire):

    * ``IdentityMatrix.test`` -- the constant 1 is moved to the entries
      ``(i, i)`` for ``i`` in ``range(M)``.
    * ``AllEntriesConstantMatrix.test`` -- the ``_const_value`` of the first
      key of ``mParams['struct']`` is moved to every entry of the M x N block.

    Args:
        mParams: dict read for the keys ``'nuM'`` (destination), ``'nuML'``
            and ``'nuMR'`` (objects used via ``.of(i)`` to index the
            destination's first / second dimension), ``'M'``, ``'N'``,
            ``'struct'`` and ``'access'``.

    Returns:
        list: for each test that matched, a ``Comment`` followed by the
        ``Mov`` instructions; empty when neither test matches.
    """
    dst = mParams['nuM']
    dL, dR = mParams['nuML'], mParams['nuMR']
    M, N = mParams['M'], mParams['N']
    mStruct, mAccess = mParams['struct'], mParams['access']
    instructions = []
    if IdentityMatrix.test(mStruct, mAccess, M, N):
        comm = Comment('%dx%d - %s' % (M, N, 'Identity'))
        instrs = [
            Mov(V(1), ScaLoad(dst[dL.of(i), dR.of(i)])) for i in range(M)
        ]
        instructions.extend([comm] + instrs)
    if AllEntriesConstantMatrix.test(mStruct, mAccess, M, N):
        # dict.keys() is a non-subscriptable view on Python 3; going through
        # list() yields the same first key on Python 2 and works on both.
        mat_type = list(mStruct.keys())[0]
        comm = Comment('%dx%d - %s' % (M, N, str(mat_type)))
        instrs = [
            Mov(V(mat_type._const_value), ScaLoad(dst[dL.of(i), dR.of(j)]))
            for i in range(M) for j in range(N)
        ]
        instructions.extend([comm] + instrs)
    return instructions
def loadMatrix(self, mParams):
    """Build the packed-double load sequence for a 1x1 or non-compact 2x1 block.

    Only two shapes produce instructions (labels as emitted):

    * 1x1 ("1x1 -> 1x2"): ``mmLoadSd`` of the single source element, stored
      with ``mmStoreuPd`` to nu-matrix entry (0, 0).
    * 2x1 with ``mParams['compact']`` false ("2x1 -> 2x1 - incompact"): the
      two source elements are loaded with ``mmLoadSd``, combined by
      ``mmShufflePd(..., (0, 0))`` and stored to nu-matrix entry (0, 0).

    Args:
        mParams: dict read for ``'m'``, ``'nuM'``, ``'mL'``, ``'mR'``,
            ``'nuML'``, ``'nuMR'``, ``'M'``, ``'N'`` and ``'compact'``.

    Returns:
        list: ``[Comment, instruction]`` for a handled shape, otherwise ``[]``.
    """
    mem, tile = mParams['m'], mParams['nuM']
    memRow, memCol = mParams['mL'], mParams['mR']
    tileRow, tileCol = mParams['nuML'], mParams['nuMR']
    rows, cols = mParams['M'], mParams['N']
    compact = mParams['compact']
    code = []

    if rows == 1 and cols == 1:
        target = Pointer(tile[tileRow.of(0), tileCol.of(0)])
        scalar = mmLoadSd(AddressOf(sa(mem[memRow.of(0), memCol.of(0)])))
        store = mmStoreuPd(scalar, target)
        code.extend([Comment("1x1 -> 1x2"), store])
        return code

    if rows == 2 and cols == 1 and not compact:
        elems = []
        for r in range(2):
            elems.append(mmLoadSd(Pointer(mem[memRow.of(r), memCol.of(0)])))
        target = Pointer(tile[tileRow.of(0), tileCol.of(0)])
        store = mmStoreuPd(mmShufflePd(elems[0], elems[1], (0, 0)), target)
        code.extend([Comment("2x1 -> 2x1 - incompact"), store])

    return code
def storeMatrix(self, mParams):
    """Build the store sequence writing a 2-wide nu-vector back to memory.

    Only two shapes produce instructions (labels as emitted):

    * 1x1 ("1x2 -> 1x1"): the nu-vector at (0, 0) is loaded with
      ``mmLoaduPd`` and written with ``mmStoreSd``.
    * 2x1 with ``mParams['compact']`` false
      ("2x1 -> 2x1 - (Store) Incompact"): the vector and its
      ``mmShufflePd(v, v, (1, 1))`` variant are each written with
      ``mmStoreSd`` to destination rows 0 and 1.

    Args:
        mParams: dict read for ``'nuM'``, ``'m'``, ``'nuML'``, ``'nuMR'``,
            ``'mL'``, ``'mR'``, ``'M'``, ``'N'`` and ``'compact'``.

    Returns:
        list: a ``Comment`` followed by the store instruction(s) for a
        handled shape, otherwise ``[]``.
    """
    tile, mem = mParams['nuM'], mParams['m']
    tileRow, tileCol = mParams['nuML'], mParams['nuMR']
    memRow, memCol = mParams['mL'], mParams['mR']
    rows, cols = mParams['M'], mParams['N']
    compact = mParams['compact']
    code = []

    if rows == 1 and cols == 1:
        vec = mmLoaduPd(Pointer(tile[tileRow.of(0), tileCol.of(0)]))
        target = AddressOf(sa(mem[memRow.of(0), memCol.of(0)]))
        store = mmStoreSd(vec, target)
        code.extend([Comment("1x2 -> 1x1"), store])
        return code

    if rows == 2 and cols == 1 and not compact:
        vec = mmLoaduPd(Pointer(tile[tileRow.of(0), tileCol.of(0)]))
        upper = mmShufflePd(vec, vec, (1, 1))
        targets = []
        for r in range(2):
            targets.append(Pointer(mem[memRow.of(r), memCol.of(0)]))
        first = mmStoreSd(vec, targets[0])
        second = mmStoreSd(upper, targets[1])
        code.extend(
            [Comment("2x1 -> 2x1 - (Store) Incompact"), first, second])

    return code
def Kro(self, s0Params, s1Params, dParams, opts):
    """Emit scalar instructions for the Kronecker product of two nu-matrices.

    For an M x K left operand and an N x P right operand, the product of
    left entry (i, k) and right entry (j, p) is moved to destination entry
    ``(i * N + j, k * P + p)``.  When either operand is 1 x 1 this is the
    same entry the previous ``(i + j, k + p)`` indexing selected; for two
    non-scalar operands the old indexing made distinct products overwrite
    each other.

    Args:
        s0Params: dict of the left operand; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` (M) and ``'nuMN'`` (K).
        s1Params: dict of the right operand; same keys (N and P).
        dParams: dict of the destination; read for ``'nuM'``, ``'nuML'``
            and ``'nuMR'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by one ``Mov`` per (i, k, j, p).
    """
    src0, src1, dst = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    s0L, s0R = s0Params['nuML'], s0Params['nuMR']
    s1L, s1R = s1Params['nuML'], s1Params['nuMR']
    dL, dR = dParams['nuML'], dParams['nuMR']
    M, K, N, P = (s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMM'],
                  s1Params['nuMN'])
    instructions = [
        Comment("1-BLAC: " + str(M) + "x" + str(K) + " Kro " + str(N) + "x" +
                str(P))
    ]
    for i in range(M):
        for k in range(K):
            for j in range(N):
                for p in range(P):
                    product = ScaMul(
                        ScaLoad(src0[s0L.of(i), s0R.of(k)]),
                        ScaLoad(src1[s1L.of(j), s1R.of(p)]))
                    # Block (i, k) of the result holds a[i, k] * B, so the
                    # row/column offsets scale with the right operand's size.
                    target = ScaLoad(dst[dL.of(i * N + j), dR.of(k * P + p)])
                    instructions.append(Mov(product, target))
    return instructions
def Mul(self, s0Params, s1Params, dParams, opts):
    """Emit scalar instructions for an (M x K) * (K x N) nu-matrix product.

    The k = 0 products are moved straight into every destination entry;
    each further k adds its product onto the destination entry
    (``dst = dst + a[i, k] * b[k, j]``).

    Args:
        s0Params: dict of the left operand; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` (M) and ``'nuMN'`` (K).
        s1Params: dict of the right operand; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'nuMN'`` (N).
        dParams: dict of the destination; read for ``'nuM'``, ``'nuML'``
            and ``'nuMR'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N * max(K, 1)`` ``Mov``
        instructions.
    """
    lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
    rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    rows, inner, cols = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

    def product(r, k, c):
        # a[r, k] * b[k, c]
        return ScaMul(ScaLoad(lhs[lhsRow.of(r), lhsCol.of(k)]),
                      ScaLoad(rhs[rhsRow.of(k), rhsCol.of(c)]))

    def cell(r, c):
        return ScaLoad(out[outRow.of(r), outCol.of(c)])

    code = [Comment("1-BLAC: %sx%s * %sx%s" % (rows, inner, inner, cols))]

    # First term initialises each output entry (emitted even if inner < 1).
    for r in range(rows):
        for c in range(cols):
            code.append(Mov(product(r, 0, c), cell(r, c)))

    # Remaining terms accumulate onto the output entry.
    for k in range(1, inner):
        for r in range(rows):
            for c in range(cols):
                term = product(r, k, c)
                code.append(Mov(ScaAdd(cell(r, c), term), cell(r, c)))
    return code
def Kro(self, s0Params, s1Params, dParams, opts):
    """Emit 4-wide packed-float instructions for a scalar-times-block product.

    Three cases, selected on the logical sizes (``'M'``/``'N'``) of the
    operands:

    * all four sizes multiply to 1: one ``mmMulPs`` of the two row-0 vectors;
    * left operand is 1x1: its row-0 vector is shuffled with
      ``(0, 0, 0, 0)`` and multiplied with one right-operand row (when
      ``nuMM * nuMN`` of the right operand equals 4) or with rows 0..3;
    * otherwise: the mirrored case, with the right operand shuffled and the
      left operand supplying the row(s).

    Afterwards every instruction's ``bounds`` is updated with
    ``dParams['bounds']``.

    Args:
        s0Params, s1Params: operand dicts; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'M'``, ``'N'``, ``'nuMM'`` and ``'nuMN'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'bounds'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by the store instruction(s).
    """
    nu = 4
    lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
    rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    oM, oK, oN, oP = (s0Params['M'], s0Params['N'], s1Params['M'],
                      s1Params['N'])
    M, K, N, P = (s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMM'],
                  s1Params['nuMN'])

    def loadLhs(r):
        return mmLoaduPs(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))

    def loadRhs(r):
        return mmLoaduPs(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))

    def outPtr(r):
        return Pointer(out[outRow.of(r), outCol.of(0)])

    def splat(vec):
        return mmShufflePs(vec, vec, (0, 0, 0, 0))

    code = [Comment("%s-BLAC: %sx%s Kro %sx%s" % (nu, M, K, N, P))]

    if oM * oK * oN * oP == 1:
        target = outPtr(0)
        left = loadLhs(0)
        right = loadRhs(0)
        code.append(mmStoreuPs(mmMulPs(left, right), target))
    elif oM * oK == 1:
        scalar = splat(loadLhs(0))
        rowIds = [0] if N * P == nu else range(nu)
        for r in rowIds:
            vec = loadRhs(r)
            target = outPtr(r)
            code.append(mmStoreuPs(mmMulPs(scalar, vec), target))
    else:
        scalar = splat(loadRhs(0))
        rowIds = [0] if M * K == nu else range(nu)
        for r in rowIds:
            vec = loadLhs(r)
            target = outPtr(r)
            code.append(mmStoreuPs(mmMulPs(vec, scalar), target))

    for instr in code:
        instr.bounds.update(dParams['bounds'])
    return code
def Add(self, s0Params, s1Params, dParams, opts):
    """Emit 2-wide packed-double instructions adding two nu-matrices.

    With M, N taken from the destination's ``'nuMM'`` / ``'nuMN'``:

    * ``M * N == 2``: a single ``mmAddPd`` on the row-0 vectors;
    * ``M == 2 and N == 2``: one ``mmAddPd`` per row;
    * any other shape: only the ``Comment`` is produced.

    Args:
        s0Params, s1Params: operand dicts; read for ``'nuM'``, ``'nuML'``
            and ``'nuMR'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by zero, one or two store instructions.
    """
    nu = 2
    lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
    rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    M, N = dParams['nuMM'], dParams['nuMN']

    code = [Comment("%s-BLAC: %sx%s + %sx%s" % (nu, M, N, M, N))]

    # Which nu-rows need an add: one vector, a full 2x2 tile, or nothing.
    if M * N == nu:
        rowIds = [0]
    elif M == nu and N == nu:
        rowIds = range(M)
    else:
        rowIds = []

    for r in rowIds:
        left = mmLoaduPd(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))
        right = mmLoaduPd(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))
        target = Pointer(out[outRow.of(r), outCol.of(0)])
        code.append(mmStoreuPd(mmAddPd(left, right), target))
    return code
def T(self, sParams, dParams, opts):
    """Emit 2-wide packed-double instructions transposing a nu-matrix.

    With M, N taken from the destination's ``'nuMM'`` / ``'nuMN'``:

    * ``M * N == 2``: the single source vector is copied unchanged;
    * otherwise: source rows 0 and 1 are interleaved with
      ``mmUnpackloPd`` / ``mmUnpackhiPd`` and stored to destination rows
      0 and 1.

    Args:
        sParams: source dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by one or two store instructions.
    """
    nu = 2
    src, dst = sParams['nuM'], dParams['nuM']
    srcRow, srcCol = sParams['nuML'], sParams['nuMR']
    dstRow, dstCol = dParams['nuML'], dParams['nuMR']
    M, N = dParams['nuMM'], dParams['nuMN']

    def loadRow(r):
        return mmLoaduPd(Pointer(src[srcRow.of(r), srcCol.of(0)]))

    def outPtr(r):
        return Pointer(dst[dstRow.of(r), dstCol.of(0)])

    code = [Comment("%s-BLAC: (%sx%s)^T" % (nu, N, M))]

    if M * N == nu:
        vec = loadRow(0)
        target = outPtr(0)
        code.append(mmStoreuPd(vec, target))
        return code

    row0 = loadRow(0)
    row1 = loadRow(1)
    target0 = outPtr(0)
    target1 = outPtr(1)
    col0 = mmUnpackloPd(row0, row1)
    col1 = mmUnpackhiPd(row0, row1)
    code.extend([mmStoreuPd(col0, target0), mmStoreuPd(col1, target1)])
    return code
def Neg(self, sParams, dParams, opts):
    """Emit scalar instructions negating every entry of a nu-matrix.

    Each source entry (i, j) is multiplied by ``V(-1)`` and moved to the
    same destination entry, for all i < M and j < N.  For the 1 x 1 case
    this is exactly the single instruction the method used to produce;
    larger shapes previously had only entry (0, 0) negated, unlike the
    sibling element-wise methods (``Sub``, ``Copy``, ``Zero``) which visit
    every entry.

    Args:
        sParams: source dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` (M) and ``'nuMN'`` (N).
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N`` ``Mov`` instructions.
    """
    src, dst = sParams['nuM'], dParams['nuM']
    sL, sR = sParams['nuML'], sParams['nuMR']
    dL, dR = dParams['nuML'], dParams['nuMR']
    M, N = dParams['nuMM'], dParams['nuMN']
    # Negation keeps the operand shape, so the label is rows x columns
    # (the earlier N x M order was carried over from the transpose label).
    instructions = [Comment("1-BLAC: -(" + str(M) + "x" + str(N) + ")")]
    for i in range(M):
        for j in range(N):
            instructions.append(
                Mov(ScaMul(V(-1), ScaLoad(src[sL.of(i), sR.of(j)])),
                    ScaLoad(dst[dL.of(i), dR.of(j)])))
    return instructions
def Zero(self, dParams, opts):
    """Emit scalar instructions moving the constant 0 into every entry.

    Args:
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` (M) and ``'nuMN'`` (N).
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N`` ``Mov`` instructions, in
        row-major order.
    """
    target = dParams['nuM']
    rowOf, colOf = dParams['nuML'], dParams['nuMR']
    rows, cols = dParams['nuMM'], dParams['nuMN']

    code = [Comment("1-BLAC: Zero %sx%s" % (rows, cols))]
    code.extend(
        Mov(V(0), ScaLoad(target[rowOf.of(r), colOf.of(c)]))
        for r in range(rows) for c in range(cols))
    return code
def T(self, sParams, dParams, opts):
    """Emit scalar instructions transposing a nu-matrix.

    Destination entry (i, j) receives source entry (j, i) for all i < M and
    j < N, where M and N are the destination's ``'nuMM'`` / ``'nuMN'``.

    Args:
        sParams: source dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N`` ``Mov`` instructions.
    """
    source, target = sParams['nuM'], dParams['nuM']
    srcRow, srcCol = sParams['nuML'], sParams['nuMR']
    dstRow, dstCol = dParams['nuML'], dParams['nuMR']
    rows, cols = dParams['nuMM'], dParams['nuMN']

    code = [Comment("1-BLAC: (%sx%s)^T" % (cols, rows))]
    code.extend(
        Mov(ScaLoad(source[srcRow.of(c), srcCol.of(r)]),
            ScaLoad(target[dstRow.of(r), dstCol.of(c)]))
        for r in range(rows) for c in range(cols))
    return code
def Copy(self, sParams, dParams, opts):
    """Emit scalar instructions copying a nu-matrix entry by entry.

    Source entry (i, j) is moved to destination entry (i, j) for all
    i < M and j < N, where M and N come from the *source*'s ``'nuMM'`` /
    ``'nuMN'``.

    Args:
        sParams: source dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``,
            ``'nuMM'`` and ``'nuMN'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N`` ``Mov`` instructions.
    """
    source, target = sParams['nuM'], dParams['nuM']
    srcRow, srcCol = sParams['nuML'], sParams['nuMR']
    dstRow, dstCol = dParams['nuML'], dParams['nuMR']
    rows, cols = sParams['nuMM'], sParams['nuMN']

    code = [Comment("1-BLAC: Copy %sx%s" % (rows, cols))]
    code.extend(
        Mov(ScaLoad(source[srcRow.of(r), srcCol.of(c)]),
            ScaLoad(target[dstRow.of(r), dstCol.of(c)]))
        for r in range(rows) for c in range(cols))
    return code
def Sub(self, s0Params, s1Params, dParams, opts):
    """Emit scalar instructions for the element-wise difference of two nu-matrices.

    Destination entry (i, j) receives ``src0[i, j] - src1[i, j]`` for all
    i < M and j < N, where M and N are the destination's ``'nuMM'`` /
    ``'nuMN'``.

    Args:
        s0Params: minuend dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``.
        s1Params: subtrahend dict; same keys.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M * N`` ``Mov`` instructions.
    """
    src0, src1, dst = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    s0L, s0R = s0Params['nuML'], s0Params['nuMR']
    s1L, s1R = s1Params['nuML'], s1Params['nuMR']
    dL, dR = dParams['nuML'], dParams['nuMR']
    M, N = dParams['nuMM'], dParams['nuMN']
    instructions = [
        Comment("1-BLAC: " + str(M) + "x" + str(N) + " - " + str(M) + "x" +
                str(N))
    ]
    for i in range(M):
        for j in range(N):
            instructions.append(
                Mov(
                    ScaSub(ScaLoad(src0[s0L.of(i), s0R.of(j)]),
                           ScaLoad(src1[s1L.of(i), s1R.of(j)])),
                    ScaLoad(dst[dL.of(i), dR.of(j)])))
    return instructions
def T(self, sParams, dParams, opts):
    """Emit 4-wide packed-float instructions transposing a nu-matrix.

    With M, N taken from the destination's ``'nuMM'`` / ``'nuMN'``:

    * ``M * N == 4``: the single source vector is copied unchanged;
    * otherwise: four source rows are loaded and rearranged with the
      unpack / movelh / movehl sequence equivalent to
      ``_MM_TRANSPOSE4_PS``, then stored to destination rows 0..3.

    Afterwards every instruction's ``bounds`` is updated with
    ``dParams['bounds']``.

    Args:
        sParams: source dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'``, ``'nuMN'`` and ``'bounds'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by one or four store instructions.
    """
    nu = 4
    src, dst = sParams['nuM'], dParams['nuM']
    srcRow, srcCol = sParams['nuML'], sParams['nuMR']
    dstRow, dstCol = dParams['nuML'], dParams['nuMR']
    M, N = dParams['nuMM'], dParams['nuMN']

    def loadRow(r):
        return mmLoaduPs(Pointer(src[srcRow.of(r), srcCol.of(0)]))

    def outPtr(r):
        return Pointer(dst[dstRow.of(r), dstCol.of(0)])

    code = [Comment("%s-BLAC: (%sx%s)^T" % (nu, N, M))]

    if M * N == nu:
        vec = loadRow(0)
        target = outPtr(0)
        code.append(mmStoreuPs(vec, target))
    else:
        rowsIn = [loadRow(r) for r in range(4)]

        # Equivalent of _MM_TRANSPOSE4_PS
        lo01 = mmUnpackloPs(rowsIn[0], rowsIn[1])
        lo23 = mmUnpackloPs(rowsIn[2], rowsIn[3])
        hi01 = mmUnpackhiPs(rowsIn[0], rowsIn[1])
        hi23 = mmUnpackhiPs(rowsIn[2], rowsIn[3])
        colsOut = [
            mmMovelhPs(lo01, lo23),
            mmMovehlPs(lo23, lo01),
            mmMovelhPs(hi01, hi23),
            mmMovehlPs(hi23, hi01),
        ]

        targets = [outPtr(r) for r in range(4)]
        code.extend(
            mmStoreuPs(col, ptr) for col, ptr in zip(colsOut, targets))

    for instr in code:
        instr.bounds.update(dParams['bounds'])
    return code
def Div(self, s0Params, s1Params, dParams, opts):
    """Emit scalar instructions dividing column 0 of a nu-matrix by one entry.

    For each i < M the quotient ``src0[i, 0] / src1[0, 0]`` is moved to
    destination entry (i, 0).  M is the left operand's ``'nuMM'``; N (the
    right operand's ``'nuMN'``) only appears in the emitted label.

    Args:
        s0Params: dividend dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'nuMM'``.
        s1Params: divisor dict; read for ``'nuM'``, ``'nuML'``, ``'nuMR'``
            and ``'nuMN'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by ``M`` ``Mov`` instructions.
    """
    num, den, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    numRow, numCol = s0Params['nuML'], s0Params['nuMR']
    denRow, denCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    rows, width = s0Params['nuMM'], s1Params['nuMN']

    code = [Comment("1-BLAC: %sx%s / %sx%s" % (rows, width, width, width))]
    code.extend(
        Mov(
            ScaDiv(ScaLoad(num[numRow.of(r), numCol.of(0)]),
                   ScaLoad(den[denRow.of(0), denCol.of(0)])),
            ScaLoad(out[outRow.of(r), outCol.of(0)]))
        for r in range(rows))
    return code
def loadMatrix(self, mParams):
    """Build the packed-float sequence loading an M x N block into a 4-wide nu-matrix.

    The shape ``(M, N)`` and the ``'compact'`` / ``'corner'`` flags select
    one hand-written sequence; each sequence is a ``Comment`` naming the
    conversion (e.g. "2x3 -> 4x4 - Compact") followed by ``mmStoreuPs``
    instructions into nu-matrix rows.  For multi-row shapes with fewer than
    four rows the remaining nu-matrix rows are stored from
    ``mmSetzeroPs()``.  Shapes without a sequence (for instance 1x4, 4x4 or
    compact 4x1) yield an empty list.

    Every produced instruction finally has its ``bounds`` updated with
    ``mParams['bounds']`` (the key is only read when something was
    produced).

    Args:
        mParams: dict read for ``'m'``, ``'nuM'``, ``'mL'``, ``'mR'``,
            ``'nuML'``, ``'nuMR'``, ``'M'``, ``'N'``, ``'compact'``,
            ``'corner'`` and ``'bounds'``.

    Returns:
        list: the generated instructions, possibly empty.
    """
    mem, tile = mParams['m'], mParams['nuM']
    memRow, memCol = mParams['mL'], mParams['mR']
    tileRow, tileCol = mParams['nuML'], mParams['nuMR']
    rows, cols = mParams['M'], mParams['N']
    compact, corner = mParams['compact'], mParams['corner']
    code = []

    pairMask = (1, 0, 1, 0)

    def at(r, c):
        # Pointer to source element (r, c).
        return Pointer(mem[memRow.of(r), memCol.of(c)])

    def out(r):
        # Pointer to the start of nu-matrix row r.
        return Pointer(tile[tileRow.of(r), tileCol.of(0)])

    def outs():
        return [out(r) for r in range(4)]

    def one(r, c):
        return mmLoadSs(at(r, c))

    def pair(r, c):
        # Two floats loaded into a zeroed vector via an __m64 pointer.
        return mmLoadlPi(mmSetzeroPs(), PointerCast("__m64", at(r, c)))

    def quad(r, c):
        return mmLoaduPs(at(r, c))

    def trimmed(vec):
        # NOTE(review): presumably replaces the 4th lane with zero so a
        # 3-wide row does not carry its neighbour's element -- confirm the
        # mask convention of mmBlendPs.
        return mmBlendPs(vec, mmSetzeroPs(), (1, 0, 0, 0))

    def emit(label, stores, padRows=()):
        # Label first, then the data stores, then zero stores for the rows
        # in padRows.
        code.append(Comment(label))
        code.extend(stores)
        code.extend(mmStoreuPs(mmSetzeroPs(), ptr) for ptr in padRows)

    if rows == 1:
        if cols == 1:
            target = out(0)
            scalar = mmLoadSs(AddressOf(sa(mem[memRow.of(0), memCol.of(0)])))
            emit("1x1 -> 1x4", [mmStoreuPs(scalar, target)])
        elif cols == 2:
            lo = pair(0, 0)
            target = out(0)
            emit("1x2 -> 1x4 - Corner", [mmStoreuPs(lo, target)])
        elif cols == 3:
            lo = pair(0, 0)
            last = one(0, 2)
            target = out(0)
            emit("1x3 -> 1x4 - Corner",
                 [mmStoreuPs(mmShufflePs(lo, last, pairMask), target)])
    elif rows == 2:
        if cols == 1:
            if compact:
                lo = pair(0, 0)
                target = out(0)
                emit("2x1 -> 4x1 - Compact", [mmStoreuPs(lo, target)])
            else:
                e0 = one(0, 0)
                e1 = one(1, 0)
                target = out(0)
                emit("2x1 -> 4x1 - incompact",
                     [mmStoreuPs(mmUnpackloPs(e0, e1), target)])
        elif cols == 2:
            r0 = pair(0, 0)
            r1 = pair(1, 0)
            ptrs = outs()
            stores = [mmStoreuPs(r0, ptrs[0]), mmStoreuPs(r1, ptrs[1])]
            emit("2x2 -> 4x4", stores, ptrs[2:])
        elif cols == 3:
            if compact:
                head = quad(0, 0)
                tail = pair(1, 1)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(head), ptrs[0]),
                    mmStoreuPs(mmShiftPs(tail, head, 3), ptrs[1]),
                ]
                emit("2x3 -> 4x4 - Compact", stores, ptrs[2:])
            else:
                head = quad(0, 0)
                lo = pair(1, 0)
                last = one(1, 2)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(head), ptrs[0]),
                    mmStoreuPs(mmShufflePs(lo, last, pairMask), ptrs[1]),
                ]
                emit("2x3 -> 4x4 - Incompact", stores, ptrs[2:])
        elif cols == 4:
            r0 = quad(0, 0)
            r1 = quad(1, 0)
            ptrs = outs()
            stores = [mmStoreuPs(r0, ptrs[0]), mmStoreuPs(r1, ptrs[1])]
            emit("2x4 -> 4x4", stores, ptrs[2:])
    elif rows == 3:
        if cols == 1:
            if compact:
                lo = pair(0, 0)
                last = one(2, 0)
                target = out(0)
                emit("3x1 -> 4x1 - Compact",
                     [mmStoreuPs(mmShufflePs(lo, last, pairMask), target)])
            else:
                e0 = one(0, 0)
                e1 = one(1, 0)
                e2 = one(2, 0)
                target = out(0)
                packed = mmShufflePs(mmUnpackloPs(e0, e1), e2, pairMask)
                emit("3x1 -> 4x1 - incompact", [mmStoreuPs(packed, target)])
        elif cols == 2:
            loaded = [pair(r, 0) for r in range(3)]
            ptrs = outs()
            stores = [mmStoreuPs(loaded[r], ptrs[r]) for r in range(3)]
            emit("3x2 -> 4x4", stores, ptrs[3:])
        elif cols == 3:
            if compact:
                v0_3 = quad(0, 0)
                v4_7 = quad(1, 1)
                e8 = one(2, 2)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(v0_3), ptrs[0]),
                    mmStoreuPs(trimmed(mmShiftPs(v4_7, v0_3, 3)), ptrs[1]),
                    mmStoreuPs(mmShufflePs(v4_7, e8, (1, 0, 3, 2)), ptrs[2]),
                ]
                emit("3x3 -> 4x4 - Compact", stores, ptrs[3:])
            else:
                v0_2 = quad(0, 0)
                v3_5 = quad(1, 0)
                v6_7 = pair(2, 0)
                e8 = one(2, 2)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(v0_2), ptrs[0]),
                    mmStoreuPs(trimmed(v3_5), ptrs[1]),
                    mmStoreuPs(mmShufflePs(v6_7, e8, pairMask), ptrs[2]),
                ]
                emit("3x3 -> 4x4 - Incompact", stores, ptrs[3:])
        elif cols == 4:
            loaded = [quad(r, 0) for r in range(3)]
            ptrs = outs()
            stores = [mmStoreuPs(loaded[r], ptrs[r]) for r in range(3)]
            emit("3x4 -> 4x4", stores, ptrs[3:])
    elif rows == 4:
        if cols == 1:
            if not compact:
                elems = [one(r, 0) for r in range(4)]
                target = out(0)
                packed = mmShufflePs(mmUnpackloPs(elems[0], elems[1]),
                                     mmUnpackloPs(elems[2], elems[3]),
                                     pairMask)
                emit("4x1 -> 4x1 - incompact", [mmStoreuPs(packed, target)])
        elif cols == 2:
            loaded = [pair(r, 0) for r in range(4)]
            ptrs = outs()
            stores = [mmStoreuPs(loaded[r], ptrs[r]) for r in range(4)]
            emit("4x2 -> 4x4", stores)
        elif cols == 3:
            if compact:
                v0_3 = quad(0, 0)
                v4_7 = quad(1, 1)
                v8_11 = quad(2, 2)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(v0_3), ptrs[0]),
                    mmStoreuPs(trimmed(mmShiftPs(v4_7, v0_3, 3)), ptrs[1]),
                    mmStoreuPs(trimmed(mmShiftPs(v8_11, v4_7, 2)), ptrs[2]),
                    mmStoreuPs(mmShiftPs(mmSetzeroPs(), v8_11, 1), ptrs[3]),
                ]
                emit("4x3 -> 4x4 - Compact", stores)
            else:
                v0_2 = quad(0, 0)
                v3_5 = quad(1, 0)
                v6_8 = quad(2, 0)
                ptrs = outs()
                stores = [
                    mmStoreuPs(trimmed(v0_2), ptrs[0]),
                    mmStoreuPs(trimmed(v3_5), ptrs[1]),
                    mmStoreuPs(trimmed(v6_8), ptrs[2]),
                ]
                if corner:
                    # Last row is assembled from a 2-float and a 1-float
                    # load instead of one 4-float load.
                    lo = pair(3, 0)
                    last = one(3, 2)
                    stores.append(
                        mmStoreuPs(mmShufflePs(lo, last, pairMask), ptrs[3]))
                    emit("4x3 -> 4x4 - Incompact Corner", stores)
                else:
                    v9_11 = quad(3, 0)
                    stores.append(mmStoreuPs(trimmed(v9_11), ptrs[3]))
                    emit("4x3 -> 4x4 - Incompact", stores)

    for instr in code:
        instr.bounds.update(mParams['bounds'])
    return code
def Mul(self, s0Params, s1Params, dParams, opts):
    """Emit 2-wide packed-double instructions for an (M x K) * (K x N) product.

    M, K, N come from the operands' ``'nuMM'`` / ``'nuMN'``.  Five
    sequences exist:

    * ``M == 1, N == 1``: multiply the two row-0 vectors and horizontally
      add against a zero vector;
    * ``M == 1`` otherwise: the two right-operand rows are interleaved with
      unpacklo/unpackhi and each result is multiplied with the left row,
      then horizontally added;
    * ``K == 1``: each left element is loaded with ``mmLoaddupPd`` and
      multiplied with right row 0, giving two output rows;
    * ``N == 1``: both left rows are multiplied with right row 0 and
      horizontally added into one output vector;
    * otherwise: the interleaved right rows are combined with each of the
      two left rows, giving two output rows.

    Args:
        s0Params: left operand dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        s1Params: right operand dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'nuMN'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by one or two store instructions.
    """
    nu = 2
    lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
    rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    M, K, N = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

    def loadLhs(r):
        return mmLoaduPd(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))

    def dupLhs(r):
        return mmLoaddupPd(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))

    def loadRhs(r):
        return mmLoaduPd(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))

    def outPtr(r):
        return Pointer(out[outRow.of(r), outCol.of(0)])

    def dotPair(a, b0, b1):
        # hadd of (a * b0) and (a * b1)
        return mmHaddPd(mmMulPd(a, b0), mmMulPd(a, b1))

    code = [Comment("%s-BLAC: %sx%s * %sx%s" % (nu, M, K, K, N))]

    if M == 1:
        if N == 1:
            a = loadLhs(0)
            b = loadRhs(0)
            target = outPtr(0)
            code.append(
                mmStoreuPd(mmHaddPd(mmMulPd(a, b), mmSetzeroPd()), target))
        else:
            a = loadLhs(0)
            b0 = loadRhs(0)
            b1 = loadRhs(1)
            bt0 = mmUnpackloPd(b0, b1)
            bt1 = mmUnpackhiPd(b0, b1)
            target = outPtr(0)
            code.append(mmStoreuPd(dotPair(a, bt0, bt1), target))
    elif K == 1:
        a0 = dupLhs(0)
        a1 = dupLhs(1)
        b = loadRhs(0)
        target0 = outPtr(0)
        target1 = outPtr(1)
        first = mmStoreuPd(mmMulPd(a0, b), target0)
        second = mmStoreuPd(mmMulPd(a1, b), target1)
        code.extend([first, second])
    elif N == 1:
        a0 = loadLhs(0)
        a1 = loadLhs(1)
        b = loadRhs(0)
        target = outPtr(0)
        code.append(
            mmStoreuPd(mmHaddPd(mmMulPd(a0, b), mmMulPd(a1, b)), target))
    else:
        a0 = loadLhs(0)
        a1 = loadLhs(1)
        b0 = loadRhs(0)
        b1 = loadRhs(1)
        bt0 = mmUnpackloPd(b0, b1)
        bt1 = mmUnpackhiPd(b0, b1)
        target0 = outPtr(0)
        target1 = outPtr(1)
        first = mmStoreuPd(dotPair(a0, bt0, bt1), target0)
        second = mmStoreuPd(dotPair(a1, bt0, bt1), target1)
        code.extend([first, second])
    return code
def Mul(self, s0Params, s1Params, dParams, opts):
    """Emit 4-wide packed-float instructions for an (M x K) * (K x N) product.

    M, K, N come from the operands' ``'nuMM'`` / ``'nuMN'``.  Sequences:

    * ``M == 1, N == 1``: one ``mmDpPs`` of the two row-0 vectors;
    * ``M == 1`` otherwise: the four elements of left row 0 are each
      loaded with ``mmLoad1Ps``, multiplied with right rows 0..3 and summed;
    * ``K == 1``: left elements (r, 0) are loaded with ``mmLoad1Ps`` and
      multiplied with right row 0, one output row per r;
    * ``N == 1``: left rows 0..3 are multiplied with right row 0 and
      reduced with three ``mmHaddPs`` into one output vector;
    * otherwise: the ``M == 1`` row sequence repeated for rows 0..3.

    Afterwards every instruction's ``bounds`` is updated with
    ``dParams['bounds']``.

    Args:
        s0Params: left operand dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'``, ``'nuMM'`` and ``'nuMN'``.
        s1Params: right operand dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'nuMN'``.
        dParams: destination dict; read for ``'nuM'``, ``'nuML'``,
            ``'nuMR'`` and ``'bounds'``.
        opts: unused here.

    Returns:
        list: a ``Comment`` followed by one or four store instructions.
    """
    nu = 4
    lhs, rhs, out = s0Params['nuM'], s1Params['nuM'], dParams['nuM']
    lhsRow, lhsCol = s0Params['nuML'], s0Params['nuMR']
    rhsRow, rhsCol = s1Params['nuML'], s1Params['nuMR']
    outRow, outCol = dParams['nuML'], dParams['nuMR']
    M, K, N = s0Params['nuMM'], s0Params['nuMN'], s1Params['nuMN']

    def loadLhs(r):
        return mmLoaduPs(Pointer(lhs[lhsRow.of(r), lhsCol.of(0)]))

    def splatLhs(r, c):
        return mmLoad1Ps(Pointer(lhs[lhsRow.of(r), lhsCol.of(c)]))

    def loadRhs(r):
        return mmLoaduPs(Pointer(rhs[rhsRow.of(r), rhsCol.of(0)]))

    def outPtr(r):
        return Pointer(out[outRow.of(r), outCol.of(0)])

    def rowTimesMatrix(r, rhsRows):
        # Output row r as the sum over k of splat(a[r, k]) * b[k, :].
        elems = [splatLhs(r, c) for c in range(4)]
        mul0 = mmMulPs(elems[0], rhsRows[0])
        mul1 = mmMulPs(elems[1], rhsRows[1])
        add0 = mmAddPs(mul0, mul1)
        mul2 = mmMulPs(elems[2], rhsRows[2])
        mul3 = mmMulPs(elems[3], rhsRows[3])
        add1 = mmAddPs(mul2, mul3)
        target = outPtr(r)
        return mmStoreuPs(mmAddPs(add0, add1), target)

    code = [Comment("%s-BLAC: %sx%s * %sx%s" % (nu, M, K, K, N))]

    if M == 1:
        if N == 1:
            a = loadLhs(0)
            b = loadRhs(0)
            target = outPtr(0)
            code.append(
                mmStoreuPs(mmDpPs(a, b, [1, 1, 1, 1, 0, 0, 0, 1]), target))
        else:
            rhsRows = [loadRhs(r) for r in range(4)]
            code.append(rowTimesMatrix(0, rhsRows))
    elif K == 1:
        splats = [splatLhs(r, 0) for r in range(4)]
        b = loadRhs(0)
        targets = [outPtr(r) for r in range(4)]
        code.extend(
            mmStoreuPs(mmMulPs(splats[r], b), targets[r]) for r in range(4))
    elif N == 1:
        lhsRows = [loadLhs(r) for r in range(4)]
        b = loadRhs(0)
        prods = [mmMulPs(row, b) for row in lhsRows]
        hadd0 = mmHaddPs(prods[0], prods[1])
        hadd1 = mmHaddPs(prods[2], prods[3])
        target = outPtr(0)
        code.append(mmStoreuPs(mmHaddPs(hadd0, hadd1), target))
    else:
        rhsRows = [loadRhs(r) for r in range(4)]
        for r in range(nu):
            code.append(rowTimesMatrix(r, rhsRows))

    for instr in code:
        instr.bounds.update(dParams['bounds'])
    return code
def storeMatrix(self, mParams):
    """Build the packed-float sequence storing a 4-wide nu-matrix as an M x N block.

    The shape ``(M, N)`` and the ``'compact'`` flag select one hand-written
    sequence; each sequence is a ``Comment`` naming the conversion (e.g.
    "4x4 -> 3x3 - Incompact") plus ``mmStoreuPs`` / ``mmStorelPi`` /
    ``mmStoreSs`` instructions writing to ``mParams['m']``.  Shapes without
    a sequence (for instance 1x4, 4x4 or compact 4x1) yield an empty list.

    Two emitted labels are corrected relative to earlier output: the
    non-compact 2x3 sequence is now labelled "Incompact" (it was labelled
    "Compact", identical to the compact branch), and the 3x4 sequence is
    labelled "4x4 -> 3x4" (it was "4x4 -> 2x4", duplicating the 2x4 label).
    The instructions themselves are unchanged.

    Every produced instruction finally has its ``bounds`` updated with
    ``mParams['bounds']`` (the key is only read when something was
    produced).

    Args:
        mParams: dict read for ``'nuM'``, ``'m'``, ``'nuML'``, ``'nuMR'``,
            ``'mL'``, ``'mR'``, ``'M'``, ``'N'``, ``'compact'`` and
            ``'bounds'``.

    Returns:
        list: the generated instructions, possibly empty.
    """
    src, dst = mParams['nuM'], mParams['m']
    sL, sR = mParams['nuML'], mParams['nuMR']
    dL, dR = mParams['mL'], mParams['mR']
    M, N = mParams['M'], mParams['N']
    isCompact = mParams['compact']
    instructions = []

    def nuRow(i, *extra):
        # NOTE(review): the optional second argument of mmLoaduPs
        # presumably lists the lanes that carry no data for this shape --
        # confirm against the mmLoaduPs definition.
        return mmLoaduPs(Pointer(src[sL.of(i), sR.of(0)]), *extra)

    def memPtr(i, j):
        return Pointer(dst[dL.of(i), dR.of(j)])

    def m64(ptr):
        return PointerCast("__m64", ptr)

    def lane(vec, selector):
        return mmShufflePs(vec, vec, selector)

    def storePairs(count):
        # Low two floats of nu rows 0..count-1 go to memory rows 0..count-1.
        nuvs = [nuRow(i, [2, 3]) for i in range(count)]
        pcs = [memPtr(i, 0) for i in range(count)]
        return [mmStorelPi(nuvs[i], m64(pcs[i])) for i in range(count)]

    def storeTriples(nuvs):
        # Per row: two floats via mmStorelPi, the third via mmStoreSs.
        stores = []
        for i, nuv in enumerate(nuvs):
            third = lane(nuv, (3, 3, 3, 2))
            stores += [
                mmStorelPi(nuv, m64(memPtr(i, 0))),
                mmStoreSs(third, memPtr(i, 2)),
            ]
        return stores

    def packedHead(nuvs, pc0):
        # First compact store: row 0 with an element of row 1 inserted.
        return mmStoreuPs(
            mmInsertPs(nuvs[0], nuvs[1], [0, 0, 1, 1, 0, 0, 0, 0]), pc0)

    if M == 1:
        if N == 1:
            nuv = nuRow(0, [1, 2, 3])
            pc = AddressOf(sa(dst[dL.of(0), dR.of(0)]))
            instr = mmStoreSs(nuv, pc)
            instructions += [Comment("1x4 -> 1x1"), instr]
        elif N == 2:
            nuv = nuRow(0, [2, 3])
            pc = memPtr(0, 0)
            instr = mmStorelPi(nuv, m64(pc))
            instructions += [Comment("1x4 -> 1x2 - Corner"), instr]
        elif N == 3:
            nuv = nuRow(0, [3])
            e2 = lane(nuv, (3, 3, 3, 2))
            pc0 = memPtr(0, 0)
            pc2 = memPtr(0, 2)
            instr0 = mmStorelPi(nuv, m64(pc0))
            instr1 = mmStoreSs(e2, pc2)
            instructions += [Comment("1x4 -> 1x3 - Corner"), instr0, instr1]
    elif M == 2:
        if N == 1:
            nuv = nuRow(0, [2, 3])
            if isCompact:
                pc = memPtr(0, 0)
                instr = mmStorelPi(nuv, m64(pc))
                instructions += [Comment("4x1 -> 2x1 - Compact"), instr]
            else:
                e1 = lane(nuv, (2, 2, 2, 1))
                pc0 = memPtr(0, 0)
                pc1 = memPtr(1, 0)
                instr0 = mmStoreSs(nuv, pc0)
                instr1 = mmStoreSs(e1, pc1)
                instructions += [
                    Comment("4x1 -> 2x1 - Incompact"), instr0, instr1
                ]
        elif N == 2:
            instrs = storePairs(2)
            instructions += [Comment("4x4 -> 2x2")] + instrs
        elif N == 3:
            nuvs = [nuRow(i, [3]) for i in range(2)]
            if isCompact:
                pc0 = memPtr(0, 0)
                pc1 = memPtr(1, 1)
                instr0 = packedHead(nuvs, pc0)
                instr1 = mmStorelPi(lane(nuvs[1], (3, 3, 2, 1)), m64(pc1))
                instructions += [
                    Comment("4x4 -> 2x3 - Compact"), instr0, instr1
                ]
            else:
                # Label fixed: this is the non-compact branch.
                instructions += [Comment("4x4 -> 2x3 - Incompact")]
                instructions += storeTriples(nuvs)
        elif N == 4:
            v0_3 = nuRow(0)
            v4_7 = nuRow(1)
            pcs = [memPtr(i, 0) for i in range(2)]
            instr0 = mmStoreuPs(v0_3, pcs[0])
            instr1 = mmStoreuPs(v4_7, pcs[1])
            instructions += [Comment("4x4 -> 2x4"), instr0, instr1]
    elif M == 3:
        if N == 1:
            nuv = nuRow(0, [3])
            if isCompact:
                e2 = lane(nuv, (3, 3, 3, 2))
                pc0 = memPtr(0, 0)
                pc2 = memPtr(2, 0)
                instr0 = mmStorelPi(nuv, m64(pc0))
                instr1 = mmStoreSs(e2, pc2)
                instructions += [
                    Comment("4x1 -> 3x1 - Compact"), instr0, instr1
                ]
            else:
                e1 = lane(nuv, (3, 3, 3, 1))
                e2 = lane(nuv, (3, 3, 3, 2))
                pc0 = memPtr(0, 0)
                pc1 = memPtr(1, 0)
                pc2 = memPtr(2, 0)
                instr0 = mmStoreSs(nuv, pc0)
                instr1 = mmStoreSs(e1, pc1)
                instr2 = mmStoreSs(e2, pc2)
                instructions += [
                    Comment("4x1 -> 3x1 - Incompact"), instr0, instr1, instr2
                ]
        elif N == 2:
            instrs = storePairs(3)
            instructions += [Comment("4x4 -> 3x2")] + instrs
        elif N == 3:
            nuvs = [nuRow(i, [3]) for i in range(3)]
            if isCompact:
                pc0 = memPtr(0, 0)
                pc1 = memPtr(1, 1)
                pc2 = memPtr(2, 2)
                instr0 = packedHead(nuvs, pc0)
                instr1 = mmStoreuPs(
                    mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                instr2 = mmStoreSs(lane(nuvs[2], (3, 3, 3, 2)), pc2)
                instructions += [
                    Comment("4x4 -> 3x3 - Compact"), instr0, instr1, instr2
                ]
            else:
                instructions += [Comment("4x4 -> 3x3 - Incompact")]
                instructions += storeTriples(nuvs)
        elif N == 4:
            instrs = [mmStoreuPs(nuRow(i), memPtr(i, 0)) for i in range(3)]
            # Label fixed: three rows are stored here, not two.
            instructions += [Comment("4x4 -> 3x4")] + instrs
    elif M == 4:
        if N == 1:
            if not isCompact:
                nuv = nuRow(0)
                es = [lane(nuv, (3, 3, 3, i)) for i in range(1, 4)]
                pcs = [memPtr(i, 0) for i in range(4)]
                instr0 = mmStoreSs(nuv, pcs[0])
                instrs = [mmStoreSs(es[i - 1], pcs[i]) for i in range(1, 4)]
                instructions += [
                    Comment("4x1 -> 4x1 - (Store) Incompact"), instr0
                ] + instrs
        elif N == 2:
            instrs = storePairs(4)
            instructions += [Comment("4x4 -> 4x2")] + instrs
        elif N == 3:
            nuvs = [nuRow(i, [3]) for i in range(4)]
            if isCompact:
                pc0 = memPtr(0, 0)
                pc1 = memPtr(1, 1)
                pc2 = memPtr(2, 2)
                instr0 = packedHead(nuvs, pc0)
                instr1 = mmStoreuPs(
                    mmShufflePs(nuvs[1], nuvs[2], (1, 0, 2, 1)), pc1)
                instr2 = mmStoreuPs(
                    mmShiftPs(nuvs[3], lane(nuvs[2], (2, 3, 3, 3)), 3), pc2)
                instructions += [
                    Comment("4x4 -> 4x3 - Compact"), instr0, instr1, instr2
                ]
            else:
                instructions += [Comment("4x4 -> 4x3 - Incompact")]
                instructions += storeTriples(nuvs)

    for instr in instructions:
        instr.bounds.update(mParams['bounds'])
    return instructions