# Compute the address of row 8 of the input tile: row 8 sits
# (8 - row_start) strides past reg_t0. When row_start >= 8 the tile data
# starts in the second half, so CMOVAE makes reg_t8 fall back to reg_t0.
MOV(reg_t8_offset.as_dword, 8)
SUB(reg_t8_offset.as_dword, reg_row_start)
IMUL(reg_t8_offset, reg_t_stride)

reg_t8 = GeneralPurposeRegister64()
LEA(reg_t8, [reg_t0 + reg_t8_offset * 1])
CMP(reg_row_start, 8)
CMOVAE(reg_t8, reg_t0)

# Columns 8-15 start one YMM register (8 floats) past columns 0-7.
reg_t0_column_8, reg_t8_column_8 = GeneralPurposeRegister64(), GeneralPurposeRegister64()
LEA(reg_t0_column_8, [reg_t0 + YMMRegister.size])
LEA(reg_t8_column_8, [reg_t8 + YMMRegister.size])

# Outputs of the two column halves: the first half is spilled to the stack;
# the first four rows of the second half stay in YMM registers.
vfft_columns_0_to_8 = [LocalVariable(YMMRegister.size) for _ in range(16)]
vfft_columns_8_to_16 = [YMMRegister() if i < 4 else LocalVariable(YMMRegister.size) for i in range(16)]

# Vertical 16-point real FFT over columns 0-7 ...
fft16x16.forward_vfft(reg_t0, reg_t8, reg_t_stride,
    data_out=vfft_columns_0_to_8,
    reg_row_start=reg_row_start, reg_row_end=reg_row_end,
    ymm_load_mask=ymm_load_mask_columns_0_to_8)

# ... and over columns 8-15, with its own load mask.
ymm_load_mask_columns_8_to_16 = YMMRegister()
VMOVDQA(ymm_load_mask_columns_8_to_16, load_mask_columns_8_to_16)

fft16x16.forward_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride,
    data_out=vfft_columns_8_to_16,
    reg_row_start=reg_row_start, reg_row_end=reg_row_end,
    ymm_load_mask=ymm_load_mask_columns_8_to_16)

# Process the output rows in three batches to bound YMM register pressure.
for row_batch_start, row_batch_end in [(0, 2), (2, 5), (5, 8)]:
    ymm_wr_list = [(YMMRegister(), YMMRegister()) for _ in range(row_batch_start, row_batch_end)]
    ymm_wi_list = [(YMMRegister(), YMMRegister()) for _ in range(row_batch_start, row_batch_end)]
    for row_offset, (ymm_wr, ymm_wi) in enumerate(zip(ymm_wr_list, ymm_wi_list)):
        row = row_batch_start + row_offset
        # Real parts of FFT row `row` for the two column halves.
        VMOVAPS(ymm_wr[0], vfft_columns_0_to_8[row*2+0])
        VMOVAPS(ymm_wr[1], vfft_columns_8_to_16[row*2+0])
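# A pure-Python model of the reg_t8 address computation above -- a sketch
# for sanity-checking only, not part of the generated kernel; the helper
# name is hypothetical. It mirrors the MOV/SUB/IMUL/LEA sequence and the
# CMP/CMOVAE fallback.
def _model_row8_address(t0, row_start, t_stride):
    t8 = t0 + (8 - row_start) * t_stride    # LEA after MOV/SUB/IMUL
    return t0 if row_start >= 8 else t8     # CMP reg_row_start, 8; CMOVAE

assert _model_row8_address(t0=0, row_start=0, t_stride=32) == 256
assert _model_row8_address(t0=0, row_start=3, t_stride=32) == 160
assert _model_row8_address(t0=0, row_start=8, t_stride=32) == 0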
# Store the 16 transformed rows to the output pointer.
for i, ymm_i in enumerate(ymm_data):
    VMOVUPS([reg_f + i * YMMRegister.size], ymm_i)

RETURN()


import fft16x16

with Function("nnp_fft16_8real__fma3", (arg_t, arg_f), target=uarch.default + isa.fma3):
    reg_t0 = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_t0, arg_t)

    reg_f = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_f, arg_f)

    # Rows of the 16x8 input tile are densely packed, one YMM register
    # (8 floats) apart.
    reg_stride = GeneralPurposeRegister64()
    MOV(reg_stride, YMMRegister.size)

    # Pointer to row 8: the vertical FFT consumes the tile as two 8-row halves.
    reg_t8 = GeneralPurposeRegister64()
    LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size])

    # 16-point real-to-complex FFT down each of the 8 columns, with the
    # results written directly to the output buffer.
    fft16x16.forward_vfft(reg_t0, reg_t8, reg_stride,
        data_out=[yword[reg_f + YMMRegister.size * i] for i in range(16)])

    RETURN()
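# A hedged usage sketch: NNPACK runs these generators offline, but PeachPy can
# also JIT-load a function for testing via its documented finalize/encode/load
# path. This assumes the Function block above is bound to a name, e.g.
# `with Function(...) as fft16_8real:` -- that binding is NOT in the original
# source, which is why the sketch is left commented out.
#
#     import numpy as np
#
#     kernel = fft16_8real.finalize(abi.detect()).encode().load()
#
#     t = np.random.rand(16, 8).astype(np.float32)  # 16 real rows x 8 columns
#     f = np.empty((16, 8), np.float32)             # packed FFT output
#     kernel(t.ctypes.data, f.ctypes.data)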