def conv_3x64_7x32_acc_i (i) :
    """Print the assembly text for the three-limb accumulate step of slot i.

    Output order:
      1. a `//` comment line listing read_V(acc_r(i, j)) for j = 0..2;
      2. print_ldr calls for r10, r12, r14 with acc_r(i, 0..2)
         (presumably these emit the loads -- confirm against print_ldr);
      3. three `add` lines folding r10/r12/r14 into r2/r3/r4;
      4. an `asr` line that sets r8 from r11 shifted right by 6.

    i is passed straight through to acc_r; nothing is returned.
    """
    # Header comment: same text the print statement would produce
    # (literal, one space, str() of the list).
    targets = [read_V(acc_r(i, j)) for j in range(3)]
    print(" ".join([" // accumulate r2-r4 to", str(targets)]))

    # Scratch register paired with the limb index it is loaded for.
    for scratch, limb in (("r10", 0), ("r12", 1), ("r14", 2)) :
        print_ldr(scratch, acc_r(i, limb), "limb %d" % limb)

    for asm_line in (
        " add r2, r2, r10",
        " add r3, r3, r12",
        " add r4, r4, r14",
        " asr r8, r11, #6",
    ) :
        print(asm_line)
def conv_3x64_7x32_store_end (i) :
    """Print the assembly text for the final store step of slot i.

    The original note on this function reads "store 4 accumulators at end
    of thread".  What the code visibly does:
      1. prints a `//` comment listing read_V(acc_r(i, j)) for j = 4..6;
      2. calls print_str for r6, r7, r8 with acc_r(i, 4..6);
      3. prints a `//` comment, then calls print_ldr for r6 <- "hh",
         r7 <- "q", r8 <- "q32";
      4. prints two `br_32x2` lines (on r2,r3 and r4,r5) and two
         post-incremented `str` lines through r6;
      5. calls print_str for r6 -> "hh".

    br_32x2 is not defined in this block; presumably it is an assembler
    macro supplied elsewhere -- confirm.  Nothing is returned.
    """
    upper = [read_V(acc_r(i, j)) for j in range(4, 7)]
    print(" ".join([" // store r6-r8 to", str(upper)]))

    # Limb j lives in register r(j+2): limbs 4..6 <-> r6..r8.
    for reg_no, limb in zip(range(6, 9), range(4, 7)) :
        print_str("r%d" % reg_no, acc_r(i, limb), "limb %d" % limb)

    print(" // compress and store r2-r5")

    # r6-r8 are reused here, after print_str has been called for them above.
    for reg, name, note in (
        ("r6", "hh", "reload cursor"),
        ("r7", "q", "load q"),
        ("r8", "q32", "load round(-2^32/q)"),
    ) :
        print_ldr(reg, name, note)

    for asm_line in (
        " br_32x2 r2, r3, r7, r8, r9",
        " br_32x2 r4, r5, r7, r8, r9",
        " str r2, [r6], #4",
        " str r4, [r6], #4",
    ) :
        print(asm_line)

    print_str("r6", "hh", "store cursor")
def conv_3x64_7x32_acc (i) :
    """Print the assembly text for the full seven-limb accumulate step of slot i.

    Output order:
      1. a `//` comment line listing read_V(acc_r(i, j)) for j = 0..6;
      2. print_ldr calls for r10, r12, r14, r9 with acc_r(i, 0..3), then
         four `add` lines folding them into r2..r5;
      3. print_ldr calls for r10, r12, r8 with acc_r(i, 4..6), then three
         `add` lines updating r6, r7 and r8 (the last one adds
         `r11, asr #6` to r8).

    The work is split in two halves, each with its own print_ldr calls,
    and r10/r12 appear in both.  Nothing is returned.
    """
    everything = [read_V(acc_r(i, j)) for j in range(7)]
    print(" ".join([" // accumulate to", str(everything)]))

    # Each half: (register, limb) pairs passed to print_ldr, then the
    # literal add lines that follow them.
    halves = (
        (
            (("r10", 0), ("r12", 1), ("r14", 2), ("r9", 3)),
            (
                " add r2, r2, r10",
                " add r3, r3, r12",
                " add r4, r4, r14",
                " add r5, r5, r9",
            ),
        ),
        (
            (("r10", 4), ("r12", 5), ("r8", 6)),
            (
                " add r6, r6, r10",
                " add r7, r7, r12",
                " add r8, r8, r11, asr #6",
            ),
        ),
    )

    for loads, adds in halves :
        for reg, limb in loads :
            print_ldr(reg, acc_r(i, limb), "limb %d" % limb)
        for asm_line in adds :
            print(asm_line)
def v (s) :
    """Short alias for read_V: pass s through and hand back whatever it returns."""
    looked_up = read_V(s)
    return looked_up
def conv_3x64_7x32_store (i) :
    """Print the assembly text that stores all seven limbs of slot i.

    Prints a `//` comment line listing read_V(acc_r(i, j)) for j = 0..6,
    then calls print_str once per limb, pairing limb j with register
    r(j+2), i.e. r2..r8 (presumably each call emits one store -- confirm
    against print_str).  Nothing is returned.
    """
    destinations = [read_V(acc_r(i, j)) for j in range(7)]
    print(" ".join([" // store r2-r8 to", str(destinations)]))

    registers = ("r2", "r3", "r4", "r5", "r6", "r7", "r8")
    for limb, reg in enumerate(registers) :
        print_str(reg, acc_r(i, limb), "limb %d" % limb)