# Generator for the routine poly_mod_3_Phi_n: every p(...) call emits one line
# of AT&T-syntax x86 assembly. The emitted routine reads 16-bit words through
# the pointer in %rdi and stores its results back to the same offsets.
# NOTE(review): p, mod3, NTRU_N and NTRU_N32 are defined outside this view;
# p is presumably print -- confirm.
p(".text")
p(".hidden poly_mod_3_Phi_n")
p(".global poly_mod_3_Phi_n")
p(".att_syntax prefix")

p("poly_mod_3_Phi_n:")
# rdi holds r
# The integers below are register numbers; they are substituted into the
# %ymm{} / %xmm{} operands of the emitted instructions.
N_min_1 = 0
t = 1
# NTRU_N is in 509th element; 13th word of 32nd register
# (offset 31*32 selects that 32nd 32-byte vector; the constant is hard-coded,
# so this presumably assumes NTRU_N == 509 -- TODO confirm for other sizes)
p("vmovdqa {}(%rdi), %ymm{}".format(31*32, N_min_1))
# Selector 0b00000011 moves the highest quadword (words 12..15 of the vector)
# into the lowest quadword, so the 13th word becomes the lowest word.
p("vpermq ${}, %ymm{}, %ymm{}".format(int('00000011', 2), N_min_1, N_min_1))
# move into high 16 in doubleword (to clear high 16) and multiply by two
p("vpslld $17, %ymm{}, %ymm{}".format(N_min_1, N_min_1))
# clone into bottom 16
p("vpsrld $16, %ymm{}, %ymm{}".format(N_min_1, t))
p("vpor %ymm{}, %ymm{}, %ymm{}".format(N_min_1, t, N_min_1))
# and now it's everywhere in N_min_1
# (vbroadcastss copies the lowest doubleword to every doubleword)
p("vbroadcastss %xmm{}, %ymm{}".format(N_min_1, N_min_1))

retval = 2
# One iteration per 32-byte vector (16 words): add the broadcast value to the
# vector at offset i*32, hand the sum to mod3, then store %ymm{retval} back to
# the same offset.
# NOTE(review): mod3 is defined elsewhere; presumably it emits a reduction
# mod 3 of register t into register retval -- confirm against its definition.
for i in range(NTRU_N32//16):
    p("vpaddw {}(%rdi), %ymm{}, %ymm{}".format(i*32, N_min_1, t))
    mod3(t, retval)
    p("vmovdqa %ymm{}, {}(%rdi)".format(retval, i*32))

# Write zero to the 16-bit words with index NTRU_N .. NTRU_N32 - 1
# (byte offset 2*i), i.e. the words past the first NTRU_N.
for i in range(NTRU_N, NTRU_N32):
    p("movw $0, {}(%rdi)".format(2*i))

p("ret")
# Fragment of an assembly generator: every p(...) call emits one line of
# AT&T-syntax x86 assembly.
# NOTE(review): this snippet is cut at both ends. r is used as a register
# number but assigned outside this view, as are p, mod3, NTRU_N32 and LOGQ;
# the routine label and the end of the loop body are not visible either.

# Register numbers substituted into the %ymm{} / %xmm{} operands below.
a = 1
threes = 3
last = 4
retval = 5
modq = 6
# Load two constants by RIP-relative address; their contents are defined
# elsewhere (presumably a vector of 3s and a mod-q bit mask -- confirm).
p("vmovdqa const_3_repeating(%rip), %ymm{}".format(threes))
p("vmovdqa mask_modq(%rip), %ymm{}".format(modq))
# Load the final 32-byte vector of the input at %rsi and AND it with the mask.
p("vmovdqa {}(%rsi), %ymm{}".format((NTRU_N32 // 16 - 1) * 32, last))
p("vpand %ymm{}, %ymm{}, %ymm{}".format(modq, last, last))
# Per 16-bit word: r = ((last >> (LOGQ - 1)) ^ threes) << LOGQ; last += r.
p("vpsrlw ${}, %ymm{}, %ymm{}".format(LOGQ - 1, last, r))
p("vpxor %ymm{}, %ymm{}, %ymm{}".format(threes, r, r))
p("vpsllw ${}, %ymm{}, %ymm{}".format(LOGQ, r, r))
p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(last, r, last))
# NOTE(review): mod3 is defined elsewhere; presumably it emits a reduction
# mod 3 of register last into register retval -- confirm.
mod3(last, retval)
# last = retval * 2 (per word)
p("vpsllw $1, %ymm{}, %ymm{}".format(retval, last))
# Take the upper 128-bit half of last, shuffle its bytes with the table
# shuf_b8_to_low_doubleword (defined elsewhere), then copy the resulting low
# half into the upper half as well.
p("vextracti128 $1, %ymm{}, %xmm{}".format(last, last))
p("vpshufb shuf_b8_to_low_doubleword(%rip), %ymm{}, %ymm{}".format(
    last, last))
p("vinserti128 $1, %xmm{}, %ymm{}, %ymm{}".format(last, last, last))
# One iteration per 32-byte input vector: apply the same mask / shift / xor /
# shift / add sequence as above to the vector at offset i*32, add last, and
# hand the sum to mod3.
for i in range(NTRU_N32 // 16):
    p("vmovdqa {}(%rsi), %ymm{}".format(i * 32, a))
    p("vpand %ymm{}, %ymm{}, %ymm{}".format(modq, a, a))
    p("vpsrlw ${}, %ymm{}, %ymm{}".format(LOGQ - 1, a, r))
    p("vpxor %ymm{}, %ymm{}, %ymm{}".format(threes, r, r))
    p("vpsllw ${}, %ymm{}, %ymm{}".format(LOGQ, r, r))
    p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(a, r, r))
    p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(last, r, r))
    # NOTE(review): the snippet ends here; no store of retval is visible.
    mod3(r, retval)
import mod3

# Names exported by a star-import of this module. go() is deliberately left
# out, so it will not be imported in lesson.py.
__all__ = ["mod1"]


def mod1():
    """Print a line identifying this function as mod1.py's."""
    print("Func from mod1.py")


def go():
    """Print a line identifying this function; it is not listed in __all__."""
    print("Go from mod1.py")


# Load-time output: announce the module, show the name it is running under,
# then call into the mod3 module.
print("This is mod1.py")
print(__name__)
mod3.mod3()
p("vpsrlq ${}, %ymm{}, %ymm{}".format(16 * word_of_low, last, last)) # Copy the last coefficient to high 16 of low double word p("vpslld $16, %ymm{}, %ymm{}".format(last, r)) p("vpsrld $16, %ymm{}, %ymm{}".format(r, last)) p("vpor %ymm{}, %ymm{}, %ymm{}".format(last, r, last)) # Broadcast the last coefficient to all 16 words p("vbroadcastss %xmm{}, %ymm{}".format(last, last)) # Add (-q) mod 3 to last if last is >= q/2. # Note (-q) mod 3=(-2^k) mod 3=1<<(1-(k&1)) p("vpsrlw ${}, %ymm{}, %ymm{}".format(LOGQ - 1, last, r)) if LOGQ % 2 == 0: p("vpsllw ${}, %ymm{}, %ymm{}".format(1, r, r)) p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(last, r, last)) # map last to -last mod 3 mod3(last, r) p("vpsllw $1, %ymm{}, %ymm{}".format(r, last)) for i in range(NTRU_N32 // 16): p("vmovdqa {}(%rsi), %ymm{}".format(i * 32, a)) p("vpand %ymm{}, %ymm{}, %ymm{}".format(modq, a, a)) p("vpsrlw ${}, %ymm{}, %ymm{}".format(LOGQ - 1, a, r)) if LOGQ % 2 == 0: p("vpsllw ${}, %ymm{}, %ymm{}".format(1, r, r)) p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(a, r, a)) p("vpaddw %ymm{}, %ymm{}, %ymm{}".format(a, last, a)) mod3(a, r) p("vmovdqa %ymm{}, {}(%rdi)".format(r, i * 32)) p("ret")
import mod3

# Call the mod3 function of the mod3 module with the arguments 10 and 12;
# the return value, if any, is discarded.
mod3.mod3(10, 12)