# Body fragment of an 8x8 FFT kernel written with the PeachPy DSL. The enclosing
# function header and the definitions of reg_t, reg_inct, reg_f, reg_incf, the
# arg_* arguments and post_operation are outside this excerpt.

# Load the four tile-geometry arguments, each into its own freshly allocated
# 32-bit general-purpose register (allocate, then load, one at a time).
reg_row_cnt = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_cnt, arg_row_count)

reg_col_cnt = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_col_cnt, arg_column_count)

reg_row_off = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_off, arg_row_offset)

reg_col_off = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_col_off, arg_column_offset)

# Eight explicitly numbered YMM registers; the even-indexed ones are referred
# to as ymm_real and the odd-indexed ones as ymm_imag from here on.
ymm_data = [YMMRegister(i) for i in range(8)]
ymm_real = ymm_data[0::2]
ymm_imag = ymm_data[1::2]

# Fill the registers from reg_t (stride reg_inct), restricted by the row/column
# offsets and counts loaded above.
block8x8.load_with_padding(
    ymm_data,
    reg_t,
    reg_inct,
    reg_row_off,
    reg_row_cnt,
    reg_col_off,
    reg_col_cnt,
)

# Transform passes: across rows, within rows, then the within-row post-process
# applied only to the first real/imaginary register pair.
fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data)
fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag)
fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(
    ymm_real[0], ymm_imag[0]
)

# post_operation selects the store instruction: aligned store (VMOVAPS) or
# streaming store (VMOVNTPS). Any other value raises KeyError.
store_instructions = {"store": VMOVAPS, "stream": VMOVNTPS}
VSTOREPS = store_instructions[post_operation]

# Write each real register at [reg_f] and its imaginary partner one YMM
# register size further on, then advance reg_f by reg_incf. The advance is
# skipped after the final pair.
last_real = ymm_real[-1]
for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
    VSTOREPS([reg_f], ymm_re)
    VSTOREPS([reg_f + YMMRegister.size], ymm_im)
    if ymm_re is last_real:
        continue
    ADD(reg_f, reg_incf)

RETURN()
# Body fragment of an 8x8 FFT kernel written with the PeachPy DSL. The enclosing
# function header and the definitions of reg_t, reg_inct, reg_f, reg_incf, the
# arg_* arguments and post_operation are outside this excerpt.

# Load the four tile-geometry arguments, each into its own freshly allocated
# 32-bit general-purpose register (allocate, then load, one at a time).
reg_row_cnt = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_cnt, arg_row_count)

reg_col_cnt = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_col_cnt, arg_column_count)

reg_row_off = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_off, arg_row_offset)

reg_col_off = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_col_off, arg_column_offset)

# Eight explicitly numbered YMM registers; the even-indexed ones are referred
# to as ymm_real and the odd-indexed ones as ymm_imag from here on.
ymm_data = [YMMRegister(i) for i in range(8)]
ymm_real = ymm_data[0::2]
ymm_imag = ymm_data[1::2]

# Fill the registers from reg_t (stride reg_inct), restricted by the row/column
# offsets and counts loaded above.
block8x8.load_with_padding(
    ymm_data,
    reg_t,
    reg_inct,
    reg_row_off,
    reg_row_cnt,
    reg_col_off,
    reg_col_cnt,
)

# Transform passes: across rows, within rows, then the within-row post-process
# applied only to the first real/imaginary register pair.
fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data)
fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag)
fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(
    ymm_real[0], ymm_imag[0]
)

# post_operation selects the store instruction: aligned store (VMOVAPS) or
# streaming store (VMOVNTPS). Any other value raises KeyError.
store_instructions = {"store": VMOVAPS, "stream": VMOVNTPS}
VSTOREPS = store_instructions[post_operation]

# Write each real register at [reg_f] and its imaginary partner one YMM
# register size further on, then advance reg_f by reg_incf. The advance is
# skipped after the final pair.
last_real = ymm_real[-1]
for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
    VSTOREPS([reg_f], ymm_re)
    VSTOREPS([reg_f + YMMRegister.size], ymm_im)
    if ymm_re is last_real:
        continue
    ADD(reg_f, reg_incf)

RETURN()
# Body fragment of a Winograd (o6x6k3x3) input-transform kernel written with the
# PeachPy DSL. NOTE(review): the statements below are collapsed onto one line;
# the original line breaks and indentation are not recoverable from here, and
# the enclosing function header (plus reg_d, reg_stride_d, reg_wd,
# reg_stride_wd, the arg_* arguments and post_operation) is outside this excerpt.
#
# In order, the statements:
#   1. Allocate four 32-bit general-purpose registers and load arg_row_count,
#      arg_column_count, arg_row_offset and arg_column_offset into them.
#   2. Allocate eight YMM registers and fill them via block8x8.load_with_padding
#      from reg_d with stride reg_stride_d, using the offsets/counts from step 1.
#   3. Apply winograd.o6x6k3x3.input_transform, then transpose8x8, then
#      input_transform again (presumably one pass per tile dimension -- TODO
#      confirm against the winograd module).
#   4. Select VMOVAPS ("store") or VMOVNTPS ("stream") from post_operation; any
#      other value raises KeyError.
#   5. Store every register in ymm_data at [reg_wd], adding reg_stride_wd to
#      reg_wd after each store except the last, then emit RETURN().
#
# The trailing `for reverse_kernel in [False, True]:` opens a loop whose body is
# not visible in this excerpt; it appears to begin the next kernel definition.
reg_row_cnt = GeneralPurposeRegister32() LOAD.ARGUMENT(reg_row_cnt, arg_row_count) reg_col_cnt = GeneralPurposeRegister32() LOAD.ARGUMENT(reg_col_cnt, arg_column_count) reg_row_off = GeneralPurposeRegister32() LOAD.ARGUMENT(reg_row_off, arg_row_offset) reg_col_off = GeneralPurposeRegister32() LOAD.ARGUMENT(reg_col_off, arg_column_offset) ymm_data = [YMMRegister() for _ in range(8)] block8x8.load_with_padding(ymm_data, reg_d, reg_stride_d, reg_row_off, reg_row_cnt, reg_col_off, reg_col_cnt) ymm_data = winograd.o6x6k3x3.input_transform(ymm_data) winograd.o6x6k3x3.transpose8x8(ymm_data) ymm_data = winograd.o6x6k3x3.input_transform(ymm_data) VSTOREPS = {"store": VMOVAPS, "stream": VMOVNTPS}[post_operation] for ymm_row in ymm_data: VSTOREPS([reg_wd], ymm_row) if ymm_row is not ymm_data[-1]: ADD(reg_wd, reg_stride_wd) RETURN() for reverse_kernel in [False, True]: