# Tail of a kernel body: loads the remaining arguments, reads eight YMM
# registers from memory at [reg_m], applies the Winograd output transform
# twice with a transpose in between, and stores the result via
# block8x8.store_packed.
# NOTE(review): reg_m, reg_m_stride, reg_s, xmm_bias, with_bias and with_relu
# are defined outside this fragment.
reg_s_stride = GeneralPurposeRegister64()
LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

reg_row_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_count, arg_row_count)

reg_column_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_column_count, arg_column_count)

ymm_m = [YMMRegister() for _ in range(8)]
last_index = len(ymm_m) - 1
for index, ymm_row in enumerate(ymm_m):
    if with_bias and index == 1:
        # Only the second register has the bias added as part of its load.
        VADDPS(ymm_row, xmm_bias.as_ymm, [reg_m])
    else:
        VMOVAPS(ymm_row, [reg_m])
    # Advance the source pointer between loads, but not after the final one.
    if index != last_index:
        ADD(reg_m, reg_m_stride)

# output transform -> transpose -> output transform, evaluated inside-out.
transform = winograd.o6x6k3x3
ymm_s = transform.output_transform(
    transform.transpose6x8(transform.output_transform(ymm_m)))

block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count, None, None, with_relu)
RETURN()
# Tail of a kernel body: loads four (real, imaginary) YMM register pairs from
# memory at [reg_f], optionally folds a bias into the first real register,
# runs the inverse 8-point FFT helpers and stores the result via
# block8x8.store_packed.
# NOTE(review): reg_f, reg_f_stride, reg_bias, reg_t, reg_t_stride,
# reg_row_count, reg_column_end, reg_row_start, with_bias and with_relu are
# defined outside this fragment.
reg_column_start = None

ymm_data = [YMMRegister(index) for index in range(8)]
# Even-indexed registers are used as real parts, odd-indexed as imaginary.
ymm_real = ymm_data[0::2]
ymm_imag = ymm_data[1::2]

if with_bias:
    ymm_bias = YMMRegister()
    # VMOVSS loads a single float from [reg_bias].
    VMOVSS(ymm_bias.as_xmm, [reg_bias])

final_pair = len(ymm_imag) - 1
for pair, (ymm_re, ymm_im) in enumerate(zip(ymm_real, ymm_imag)):
    VMOVAPS(ymm_re, [reg_f])
    VMOVAPS(ymm_im, [reg_f + YMMRegister.size])
    if with_bias and pair == 0:
        # ymm_re += ymm_bias * 64.0, applied to the first real register only.
        VFMADD231PS(ymm_re, ymm_bias, Constant.float32x8(64.0))
    # Advance the source pointer between pairs, but not after the final one.
    if pair != final_pair:
        ADD(reg_f, reg_f_stride)

fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(
    ymm_real[0], ymm_imag[0])
fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse")
fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data)

block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start, with_relu)
RETURN()
# Tail of a kernel body: loads the remaining arguments, reads eight YMM
# registers from memory at [reg_m], applies the Winograd output transform
# twice with a transpose in between, and stores the result via
# block8x8.store_packed (called here with five positional arguments only).
# NOTE(review): reg_m, reg_m_stride, reg_s, xmm_bias and with_bias are
# defined outside this fragment.
LOAD.ARGUMENT(reg_m_stride, arg_m_stride)

reg_s_stride = GeneralPurposeRegister64()
LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

reg_row_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_count, arg_row_count)

reg_column_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_column_count, arg_column_count)

ymm_m = [YMMRegister() for _ in range(8)]
last_index = len(ymm_m) - 1
for index, ymm_row in enumerate(ymm_m):
    if with_bias and index == 1:
        # Only the second register has the bias added as part of its load.
        VADDPS(ymm_row, xmm_bias.as_ymm, [reg_m])
    else:
        VMOVAPS(ymm_row, [reg_m])
    # Advance the source pointer between loads, but not after the final one.
    if index != last_index:
        ADD(reg_m, reg_m_stride)

# output transform -> transpose -> output transform, evaluated inside-out.
transform = winograd.o6x6k3x3
ymm_s = transform.output_transform(
    transform.transpose6x8(transform.output_transform(ymm_m)))

block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count)
RETURN()
# NOTE(review): this fragment opens inside a conditional branch whose `if`
# header lies outside the visible source, so the code line below is left
# exactly as found. What the visible statements do:
#   * Branch tail: loads arg_column_offset into a new 32-bit register
#     (reg_column_start) and adds it to reg_column_end. The `else` arm
#     presumably covers only the two `= None` assignments (reg_row_start and
#     reg_column_start) -- TODO confirm against the unmangled file.
#   * Creates YMMRegister(0)..YMMRegister(7); even-indexed entries are used
#     as ymm_real and odd-indexed entries as ymm_imag.
#   * When with_bias is set, VMOVSS loads one float from [reg_bias]; it is
#     multiplied by 64.0 and accumulated into the first real register with
#     VFMADD231PS.
#   * Each (real, imag) pair is loaded from [reg_f] and
#     [reg_f + YMMRegister.size]; reg_f is advanced by reg_f_stride between
#     pairs but not after the last pair.
#   * Calls ifft8_within_rows_preprocess on the first pair, then
#     fft8_within_rows(transformation="inverse") and ifft8_across_rows, and
#     finally block8x8.store_packed(...) followed by RETURN().
reg_column_start = GeneralPurposeRegister32() LOAD.ARGUMENT(reg_column_start, arg_column_offset) ADD(reg_column_end, reg_column_start) else: reg_row_start = None reg_column_start = None ymm_data = [YMMRegister(i) for i in range(8)] ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2] if with_bias: ymm_bias = YMMRegister() VMOVSS(ymm_bias.as_xmm, [reg_bias]) for ymm_re, ymm_im in zip(ymm_real, ymm_imag): VMOVAPS(ymm_re, [reg_f]) VMOVAPS(ymm_im, [reg_f + YMMRegister.size]) if with_bias and ymm_re is ymm_real[0]: VFMADD231PS(ymm_re, ymm_bias, Constant.float32x8(64.0)) if ymm_im is not ymm_imag[-1]: ADD(reg_f, reg_f_stride) fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(ymm_real[0], ymm_imag[0]) fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse") fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data) block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start, with_relu) RETURN()
# Tail of a kernel body: loads the remaining arguments, reads eight YMM
# registers from memory at [reg_m], applies the Winograd output transform
# twice with a transpose in between, and stores the result via
# block8x8.store_packed.
# NOTE(review): reg_m, reg_m_stride, reg_s, xmm_bias, with_bias and with_relu
# are defined outside this fragment.
LOAD.ARGUMENT(reg_m_stride, arg_m_stride)

reg_s_stride = GeneralPurposeRegister64()
LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

reg_row_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_row_count, arg_row_count)

reg_column_count = GeneralPurposeRegister32()
LOAD.ARGUMENT(reg_column_count, arg_column_count)

ymm_m = [YMMRegister() for _ in range(8)]
last_index = len(ymm_m) - 1
for index, ymm_row in enumerate(ymm_m):
    if with_bias and index == 1:
        # Only the second register has the bias added as part of its load.
        VADDPS(ymm_row, xmm_bias.as_ymm, [reg_m])
    else:
        VMOVAPS(ymm_row, [reg_m])
    # Advance the source pointer between loads, but not after the final one.
    if index != last_index:
        ADD(reg_m, reg_m_stride)

# output transform -> transpose -> output transform, evaluated inside-out.
transform = winograd.o6x6k3x3
ymm_s = transform.output_transform(
    transform.transpose6x8(transform.output_transform(ymm_m)))

block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count, None, None, with_relu)
RETURN()